pass arbitrary number of vertices to the shader execution cycle

author: Zack Rusin <zack@tungstengraphics.com> 2008-04-12 15:45:28 -0400
committer: Keith Whitwell <keith@tungstengraphics.com> 2008-04-14 11:00:35 +0100
commit: aadbb1d7fbbaada6e378cb60194e5861cadf98d1 (patch)
tree: 922c7904806495502b7e78d6147715bc680b19b8 /src
parent: 4f550ab821f9aef9f19d9f1e10785f8c1f511ad4 (diff)
4 files changed, 146 insertions, 143 deletions
diff --git a/src/gallium/auxiliary/draw/draw_private.h b/src/gallium/auxiliary/draw/draw_private.h
index 4d056f6dba..f9aea9f355 100644
--- a/src/gallium/auxiliary/draw/draw_private.h
+++ b/src/gallium/auxiliary/draw/draw_private.h
@@ -56,6 +56,8 @@ struct gallivm_cpu_engine;
 struct draw_pt_middle_end;
 struct draw_pt_front_end;
 
+#define MAX_SHADER_VERTICES 128
+
 /**
  * Basic vertex info.
  * Carry some useful information around with the vertices in the prim pipe.  
diff --git a/src/gallium/auxiliary/draw/draw_vertex_shader.c b/src/gallium/auxiliary/draw/draw_vertex_shader.c
index d5f37bca21..726921d77b 100644
--- a/src/gallium/auxiliary/draw/draw_vertex_shader.c
+++ b/src/gallium/auxiliary/draw/draw_vertex_shader.c
@@ -37,8 +37,6 @@
 #include "draw_context.h"
 #include "draw_vs.h"
 
-#define MAX_SHADER_VERTICES 4
-
 /**
  * Run the vertex shader on all vertices in the vertex queue.
  * Called by the draw module when the vertx cache needs to be flushed.
@@ -61,7 +59,7 @@ draw_vertex_shader_queue_flush(struct draw_context *draw)
    for (i = 0; i < draw->vs.queue_nr; i += MAX_SHADER_VERTICES) {
       struct vertex_header *dests[MAX_SHADER_VERTICES];
       unsigned elts[MAX_SHADER_VERTICES];
-      int j, n = MIN2(MAX_SHADER_VERTICES,  - i);
+      int j, n = MIN2(MAX_SHADER_VERTICES, draw->vs.queue_nr  - i);
 
       for (j = 0; j < n; j++) {
          elts[j] = draw->vs.queue[i + j].elt;
diff --git a/src/gallium/auxiliary/draw/draw_vs_exec.c b/src/gallium/auxiliary/draw/draw_vs_exec.c
index 9629410abb..df0051d693 100644
--- a/src/gallium/auxiliary/draw/draw_vs_exec.c
+++ b/src/gallium/auxiliary/draw/draw_vs_exec.c
@@ -40,6 +40,7 @@
 
 #include "tgsi/util/tgsi_parse.h"
 
+#define MAX_TGSI_VERTICES 4
 
 static void
 vs_exec_prepare( struct draw_vertex_shader *shader,
@@ -71,14 +72,13 @@ vs_exec_run( struct draw_vertex_shader *shader,
 	     struct vertex_header *vOut[] )
 {
    struct tgsi_exec_machine *machine = &draw->machine;
-   unsigned int j;
+   unsigned int i, j;
 
    ALIGN16_DECL(struct tgsi_exec_vector, inputs, PIPE_MAX_ATTRIBS);
    ALIGN16_DECL(struct tgsi_exec_vector, outputs, PIPE_MAX_ATTRIBS);
    const float *scale = draw->viewport.scale;
    const float *trans = draw->viewport.translate;
 
-   assert(count <= 4);
    assert(draw->vertex_shader->info.output_semantic_name[0]
           == TGSI_SEMANTIC_POSITION);
 
@@ -92,80 +92,82 @@ vs_exec_run( struct draw_vertex_shader *shader,
       machine->Outputs = ALIGN16_ASSIGN(outputs);
    }
 
-   draw->vertex_fetch.fetch_func( draw, machine, elts, count );
+   for (i = 0; i < count; i += MAX_TGSI_VERTICES) {
+      unsigned int max_vertices = MIN2(MAX_TGSI_VERTICES, count - i);
+      draw->vertex_fetch.fetch_func( draw, machine, &elts[i], max_vertices );
 
-   if (!draw->rasterizer->bypass_vs) {
-      /* run interpreter */
-      tgsi_exec_machine_run( machine );
-   }
-
-   /* store machine results */
-   for (j = 0; j < count; j++) {
-      unsigned slot;
-      float x, y, z, w;
-
-      /* Handle attr[0] (position) specially:
-       *
-       * XXX: Computing the clipmask should be done in the vertex
-       * program as a set of DP4 instructions appended to the
-       * user-provided code.
-       */
-      x = vOut[j]->clip[0] = machine->Outputs[0].xyzw[0].f[j];
-      y = vOut[j]->clip[1] = machine->Outputs[0].xyzw[1].f[j];
-      z = vOut[j]->clip[2] = machine->Outputs[0].xyzw[2].f[j];
-      w = vOut[j]->clip[3] = machine->Outputs[0].xyzw[3].f[j];
-
-      if (!draw->rasterizer->bypass_clipping) {
-         vOut[j]->clipmask = compute_clipmask(vOut[j]->clip, draw->plane, draw->nr_planes);
-
-         /* divide by w */
-         w = 1.0f / w;
-         x *= w;
-         y *= w;
-         z *= w;         
-      }
-      else {
-         vOut[j]->clipmask = 0;
-      }
-      vOut[j]->edgeflag = 1;
-
-      if (!draw->identity_viewport) {
-         /* Viewport mapping */
-         vOut[j]->data[0][0] = x * scale[0] + trans[0];
-         vOut[j]->data[0][1] = y * scale[1] + trans[1];
-         vOut[j]->data[0][2] = z * scale[2] + trans[2];
-         vOut[j]->data[0][3] = w;
-      }
-      else {
-         vOut[j]->data[0][0] = x;
-         vOut[j]->data[0][1] = y;
-         vOut[j]->data[0][2] = z;
-         vOut[j]->data[0][3] = w;
+      if (!draw->rasterizer->bypass_vs) {
+         /* run interpreter */
+         tgsi_exec_machine_run( machine );
       }
 
-      /* Remaining attributes are packed into sequential post-transform
-       * vertex attrib slots.
-       */
-      for (slot = 1; slot < draw->num_vs_outputs; slot++) {
-         vOut[j]->data[slot][0] = machine->Outputs[slot].xyzw[0].f[j];
-         vOut[j]->data[slot][1] = machine->Outputs[slot].xyzw[1].f[j];
-         vOut[j]->data[slot][2] = machine->Outputs[slot].xyzw[2].f[j];
-         vOut[j]->data[slot][3] = machine->Outputs[slot].xyzw[3].f[j];
-      }
+      /* store machine results */
+      for (j = 0; j < max_vertices; j++) {
+         unsigned slot;
+         float x, y, z, w;
+
+         /* Handle attr[0] (position) specially:
+          *
+          * XXX: Computing the clipmask should be done in the vertex
+          * program as a set of DP4 instructions appended to the
+          * user-provided code.
+          */
+         x = vOut[i + j]->clip[0] = machine->Outputs[0].xyzw[0].f[j];
+         y = vOut[i + j]->clip[1] = machine->Outputs[0].xyzw[1].f[j];
+         z = vOut[i + j]->clip[2] = machine->Outputs[0].xyzw[2].f[j];
+         w = vOut[i + j]->clip[3] = machine->Outputs[0].xyzw[3].f[j];
+
+         if (!draw->rasterizer->bypass_clipping) {
+            vOut[i + j]->clipmask = compute_clipmask(vOut[i + j]->clip, draw->plane,
+                                                     draw->nr_planes);
+
+            /* divide by w */
+            w = 1.0f / w;
+            x *= w;
+            y *= w;
+            z *= w;
+         }
+         else {
+            vOut[i + j]->clipmask = 0;
+         }
+         vOut[i + j]->edgeflag = 1;
+
+         if (!draw->identity_viewport) {
+            /* Viewport mapping */
+            vOut[i + j]->data[0][0] = x * scale[0] + trans[0];
+            vOut[i + j]->data[0][1] = y * scale[1] + trans[1];
+            vOut[i + j]->data[0][2] = z * scale[2] + trans[2];
+            vOut[i + j]->data[0][3] = w;
+         }
+         else {
+            vOut[i + j]->data[0][0] = x;
+            vOut[i + j]->data[0][1] = y;
+            vOut[i + j]->data[0][2] = z;
+            vOut[i + j]->data[0][3] = w;
+         }
+
+         /* Remaining attributes are packed into sequential post-transform
+          * vertex attrib slots.
+          */
+         for (slot = 1; slot < draw->num_vs_outputs; slot++) {
+            vOut[i + j]->data[slot][0] = machine->Outputs[slot].xyzw[0].f[j];
+            vOut[i + j]->data[slot][1] = machine->Outputs[slot].xyzw[1].f[j];
+            vOut[i + j]->data[slot][2] = machine->Outputs[slot].xyzw[2].f[j];
+            vOut[i + j]->data[slot][3] = machine->Outputs[slot].xyzw[3].f[j];
+         }
 
 #if 0 /*DEBUG*/
-      printf("Post xform vert:\n");
-      for (slot = 0; slot < draw->num_vs_outputs; slot++) {
-         printf("%d: %f %f %f %f\n", slot,
-                vOut[j]->data[slot][0],
-                vOut[j]->data[slot][1],
-                vOut[j]->data[slot][2],
-                vOut[j]->data[slot][3]);
-      }
-#endif      
-
-
-   } /* loop over vertices */
+         printf("%d) Post xform vert:\n", i + j);
+         for (slot = 0; slot < draw->num_vs_outputs; slot++) {
+            printf("\t%d: %f %f %f %f\n", slot,
+                   vOut[i + j]->data[slot][0],
+                   vOut[i + j]->data[slot][1],
+                   vOut[i + j]->data[slot][2],
+                   vOut[i + j]->data[slot][3]);
+         }
+#endif
+      } /* loop over vertices */
+   }
 }
 
 
diff --git a/src/gallium/auxiliary/draw/draw_vs_sse.c b/src/gallium/auxiliary/draw/draw_vs_sse.c
index 0ee991d764..bfec89254e 100644
--- a/src/gallium/auxiliary/draw/draw_vs_sse.c
+++ b/src/gallium/auxiliary/draw/draw_vs_sse.c
@@ -45,6 +45,7 @@
 #include "tgsi/exec/tgsi_sse2.h"
 #include "tgsi/util/tgsi_parse.h"
 
+#define SSE_MAX_VERTICES 4
 
 typedef void (XSTDCALL *codegen_function) (
    const struct tgsi_exec_vector *input,
@@ -86,14 +87,13 @@ vs_sse_run( struct draw_vertex_shader *base,
 {
    struct draw_sse_vertex_shader *shader = (struct draw_sse_vertex_shader *)base;
    struct tgsi_exec_machine *machine = &draw->machine;
-   unsigned int j;
+   unsigned int i, j;
 
    ALIGN16_DECL(struct tgsi_exec_vector, inputs, PIPE_MAX_ATTRIBS);
    ALIGN16_DECL(struct tgsi_exec_vector, outputs, PIPE_MAX_ATTRIBS);
    const float *scale = draw->viewport.scale;
    const float *trans = draw->viewport.translate;
 
-   assert(count <= 4);
    assert(draw->vertex_shader->info.output_semantic_name[0]
           == TGSI_SEMANTIC_POSITION);
 
@@ -108,77 +108,78 @@ vs_sse_run( struct draw_vertex_shader *base,
       machine->Outputs = ALIGN16_ASSIGN(outputs);
    }
 
-
-   /* Fetch vertices.  This may at some point be integrated into the
-    * compiled shader -- that would require a reorganization where
-    * multiple versions of the compiled shader might exist,
-    * specialized for each fetch state.
-    */
-   draw->vertex_fetch.fetch_func( draw, machine, elts, count );
-
-
-   if (!draw->rasterizer->bypass_vs) {
-      /* run compiled shader
-       */   
-      shader->func(machine->Inputs,
-                   machine->Outputs,
-                   machine->Consts,
-                   machine->Temps,
-                   shader->immediates);
-   }
-
-
-   /* XXX: Computing the clipmask and emitting results should be done
-    *      in the vertex program as a set of instructions appended to
-    *      the user-provided code.
-    */
-   for (j = 0; j < count; j++) {
-      unsigned slot;
-      float x, y, z, w;
-
-      x = vOut[j]->clip[0] = machine->Outputs[0].xyzw[0].f[j];
-      y = vOut[j]->clip[1] = machine->Outputs[0].xyzw[1].f[j];
-      z = vOut[j]->clip[2] = machine->Outputs[0].xyzw[2].f[j];
-      w = vOut[j]->clip[3] = machine->Outputs[0].xyzw[3].f[j];
-
-      if (!draw->rasterizer->bypass_clipping) {
-         vOut[j]->clipmask = compute_clipmask(vOut[j]->clip, draw->plane, draw->nr_planes);
-
-         /* divide by w */
-         w = 1.0f / w;
-         x *= w;
-         y *= w;
-         z *= w;
-      }
-      else {
-         vOut[j]->clipmask = 0;
-      }
-      vOut[j]->edgeflag = 1;
-
-      if (!draw->identity_viewport) {
-         /* Viewport mapping */
-         vOut[j]->data[0][0] = x * scale[0] + trans[0];
-         vOut[j]->data[0][1] = y * scale[1] + trans[1];
-         vOut[j]->data[0][2] = z * scale[2] + trans[2];
-         vOut[j]->data[0][3] = w;
-      }
-      else {
-         vOut[j]->data[0][0] = x;
-         vOut[j]->data[0][1] = y;
-         vOut[j]->data[0][2] = z;
-         vOut[j]->data[0][3] = w;
+   for (i = 0; i < count; i += SSE_MAX_VERTICES) {
+      unsigned int max_vertices = MIN2(SSE_MAX_VERTICES, count - i);
+      /* Fetch vertices.  This may at some point be integrated into the
+       * compiled shader -- that would require a reorganization where
+       * multiple versions of the compiled shader might exist,
+       * specialized for each fetch state.
+       */
+      draw->vertex_fetch.fetch_func(draw, machine, &elts[i], max_vertices);
+
+      if (!draw->rasterizer->bypass_vs) {
+         /* run compiled shader
+          */
+         shader->func(machine->Inputs,
+                      machine->Outputs,
+                      machine->Consts,
+                      machine->Temps,
+                      shader->immediates);
       }
 
-      /* Remaining attributes are packed into sequential post-transform
-       * vertex attrib slots.
+      /* XXX: Computing the clipmask and emitting results should be done
+       *      in the vertex program as a set of instructions appended to
+       *      the user-provided code.
        */
-      for (slot = 1; slot < draw->num_vs_outputs; slot++) {
-         vOut[j]->data[slot][0] = machine->Outputs[slot].xyzw[0].f[j];
-         vOut[j]->data[slot][1] = machine->Outputs[slot].xyzw[1].f[j];
-         vOut[j]->data[slot][2] = machine->Outputs[slot].xyzw[2].f[j];
-         vOut[j]->data[slot][3] = machine->Outputs[slot].xyzw[3].f[j];
+      for (j = 0; j < max_vertices; j++) {
+         unsigned slot;
+         float x, y, z, w;
+
+         x = vOut[i + j]->clip[0] = machine->Outputs[0].xyzw[0].f[j];
+         y = vOut[i + j]->clip[1] = machine->Outputs[0].xyzw[1].f[j];
+         z = vOut[i + j]->clip[2] = machine->Outputs[0].xyzw[2].f[j];
+         w = vOut[i + j]->clip[3] = machine->Outputs[0].xyzw[3].f[j];
+
+         if (!draw->rasterizer->bypass_clipping) {
+            vOut[i + j]->clipmask = compute_clipmask(vOut[i + j]->clip, draw->plane,
+                                                     draw->nr_planes);
+
+            /* divide by w */
+            w = 1.0f / w;
+            x *= w;
+            y *= w;
+            z *= w;
+         }
+         else {
+            vOut[i + j]->clipmask = 0;
+         }
+         vOut[i + j]->edgeflag = 1; /* i + j: slot in the caller's vOut[], not j */
+
+         if (!draw->identity_viewport) {
+            /* Viewport mapping */
+            vOut[i + j]->data[0][0] = x * scale[0] + trans[0];
+            vOut[i + j]->data[0][1] = y * scale[1] + trans[1];
+            vOut[i + j]->data[0][2] = z * scale[2] + trans[2];
+            vOut[i + j]->data[0][3] = w;
+         }
+         else {
+            vOut[i + j]->data[0][0] = x;
+            vOut[i + j]->data[0][1] = y;
+            vOut[i + j]->data[0][2] = z;
+            vOut[i + j]->data[0][3] = w;
+         }
+
+         /* Remaining attributes are packed into sequential post-transform
+          * vertex attrib slots.
+          */
+         for (slot = 1; slot < draw->num_vs_outputs; slot++) {
+            vOut[i + j]->data[slot][0] = machine->Outputs[slot].xyzw[0].f[j];
+            vOut[i + j]->data[slot][1] = machine->Outputs[slot].xyzw[1].f[j];
+            vOut[i + j]->data[slot][2] = machine->Outputs[slot].xyzw[2].f[j];
+            vOut[i + j]->data[slot][3] = machine->Outputs[slot].xyzw[3].f[j];
+         }
       }
-   } 
+   }
 }
author	Zack Rusin <zack@tungstengraphics.com>	2008-04-12 15:45:28 -0400
committer	Keith Whitwell <keith@tungstengraphics.com>	2008-04-14 11:00:35 +0100
commit	aadbb1d7fbbaada6e378cb60194e5861cadf98d1 (patch)
tree	922c7904806495502b7e78d6147715bc680b19b8 /src
parent	4f550ab821f9aef9f19d9f1e10785f8c1f511ad4 (diff)