gallium: restructure vertex fetch code slightly

author: Keith Whitwell <keith@tungstengraphics.com> 2008-01-24 11:19:06 +0000
committer: José Fonseca <jrfonseca@tungstengraphics.com> 2008-01-26 10:33:18 +0900
commit: 027983f5850afea753381be454122166c6d56777 (patch)
tree: 1c8eb9dc82c6f4241e73641dbfeb24913c1a4f8d /src
parent: c3e4e9260d7527bb0369650b861cba43834f7106 (diff)
3 files changed, 149 insertions, 65 deletions
diff --git a/src/mesa/pipe/draw/draw_private.h b/src/mesa/pipe/draw/draw_private.h
index 1c2e88264f..1e59f5bd8d 100644
--- a/src/mesa/pipe/draw/draw_private.h
+++ b/src/mesa/pipe/draw/draw_private.h
@@ -137,6 +137,13 @@ struct draw_vertex_shader {
 #endif
 };
 
+
+/* Internal function for vertex fetch.
+ */
+typedef void (*fetch_func)(const void *ptr, float *attrib);
+
+
+
 /**
  * Private context for the drawing module.
  */
@@ -196,6 +203,15 @@ struct draw_context
    /** TGSI program interpreter runtime state */
    struct tgsi_exec_machine machine;
 
+   /* Vertex fetch internal state
+    */
+   struct {
+      const ubyte *src_ptr[PIPE_ATTRIB_MAX];
+      unsigned pitch[PIPE_ATTRIB_MAX];
+      fetch_func fetch[PIPE_ATTRIB_MAX];
+      unsigned nr_attrs;
+   } vertex_fetch;
+
    /* Post-tnl vertex cache:
     */
    struct {
@@ -270,6 +286,7 @@ extern void draw_vertex_shader_queue_flush_llvm( struct draw_context *draw );
 
 struct tgsi_exec_machine;
 
+extern void draw_update_vertex_fetch( struct draw_context *draw );
 extern void draw_vertex_fetch( struct draw_context *draw,
 			       struct tgsi_exec_machine *machine,
 			       const unsigned *elts,
diff --git a/src/mesa/pipe/draw/draw_vertex_fetch.c b/src/mesa/pipe/draw/draw_vertex_fetch.c
index 4d64d3d4f2..23e8187899 100644
--- a/src/mesa/pipe/draw/draw_vertex_fetch.c
+++ b/src/mesa/pipe/draw/draw_vertex_fetch.c
@@ -42,53 +42,108 @@
 /**
  * Fetch a float[4] vertex attribute from memory, doing format/type
  * conversion as needed.
- * XXX this might be a temporary thing.
+ *
+ * This is probably needed/dupliocated elsewhere, eg format
+ * conversion, texture sampling etc.
  */
-static void
-fetch_attrib4(const void *ptr, enum pipe_format format, float attrib[4])
+#define FETCH_ATTRIB( NAME, SZ, CVT )			\
+static void						\
+fetch_##NAME(const void *ptr, float *attrib)		\
+{							\
+   static const float defaults[4] = { 0,0,0,1 };	\
+   int i;						\
+							\
+   for (i = 0; i < SZ; i++) {				\
+      attrib[i] = CVT;					\
+   }							\
+							\
+   for (; i < 4; i++) {					\
+      attrib[i] = defaults[i];				\
+   }							\
+}
+
+#define CVT_32_FLOAT   ((float *) ptr)[i]
+#define CVT_32_SSCALED (float) ((int *) ptr)[i]
+#define CVT_8_UNORM    (float) ((unsigned char *) ptr)[i] / 255.0f
+
+FETCH_ATTRIB( R32G32B32A32_FLOAT,   4, CVT_32_FLOAT )
+FETCH_ATTRIB( R32G32B32_FLOAT,      3, CVT_32_FLOAT )
+FETCH_ATTRIB( R32G32_FLOAT,         2, CVT_32_FLOAT )
+FETCH_ATTRIB( R32_FLOAT,            1, CVT_32_FLOAT )
+FETCH_ATTRIB( R32G32B32A32_SSCALED, 4, CVT_32_SSCALED )
+FETCH_ATTRIB( R32G32B32_SSCALED,    3, CVT_32_SSCALED )
+FETCH_ATTRIB( R32G32_SSCALED,       2, CVT_32_SSCALED )
+FETCH_ATTRIB( R32_SSCALED,          1, CVT_32_SSCALED )
+FETCH_ATTRIB( A8R8G8B8_UNORM,       4, CVT_8_UNORM )
+FETCH_ATTRIB( R8G8B8A8_UNORM,       4, CVT_8_UNORM )
+
+
+
+static fetch_func get_fetch_func( unsigned format )
 {
-   /* defaults */
-   attrib[1] = 0.0;
-   attrib[2] = 0.0;
-   attrib[3] = 1.0;
    switch (format) {
    case PIPE_FORMAT_R32G32B32A32_FLOAT:
-      attrib[3] = ((float *) ptr)[3];
-      /* fall-through */
+      return fetch_R32G32B32A32_FLOAT;
    case PIPE_FORMAT_R32G32B32_FLOAT:
-      attrib[2] = ((float *) ptr)[2];
-      /* fall-through */
+      return fetch_R32G32B32_FLOAT;
    case PIPE_FORMAT_R32G32_FLOAT:
-      attrib[1] = ((float *) ptr)[1];
-      /* fall-through */
+      return fetch_R32G32_FLOAT;
    case PIPE_FORMAT_R32_FLOAT:
-      attrib[0] = ((float *) ptr)[0];
-      break;
-
+      return fetch_R32_FLOAT;
    case PIPE_FORMAT_R32G32B32A32_SSCALED:
-      attrib[3] = (float) ((int *) ptr)[3];
-      /* fall-through */
+      return fetch_R32G32B32A32_SSCALED;
    case PIPE_FORMAT_R32G32B32_SSCALED:
-      attrib[2] = (float) ((int *) ptr)[2];
-      /* fall-through */
+      return fetch_R32G32B32_SSCALED;
    case PIPE_FORMAT_R32G32_SSCALED:
-      attrib[1] = (float) ((int *) ptr)[1];
-      /* fall-through */
+      return fetch_R32G32_SSCALED;
    case PIPE_FORMAT_R32_SSCALED:
-      attrib[0] = (float) ((int *) ptr)[0];
-      break;
-
+      return fetch_R32_SSCALED;
    case PIPE_FORMAT_A8R8G8B8_UNORM:
+      return fetch_A8R8G8B8_UNORM;
    case PIPE_FORMAT_R8G8B8A8_UNORM:
-      attrib[0] = (float) ((unsigned char *) ptr)[2] / 255.0f;
-      attrib[1] = (float) ((unsigned char *) ptr)[1] / 255.0f;
-      attrib[2] = (float) ((unsigned char *) ptr)[0] / 255.0f;
-      attrib[3] = (float) ((unsigned char *) ptr)[3] / 255.0f;
-      break;
-
+      return fetch_R8G8B8A8_UNORM;
    default:
+      /* Lots of missing cases! */
       assert(0);
+      return NULL;
+   }
+}
+
+
+static void 
+transpose_4x4( float *out, const float *in )
+{
+   /* This can be achieved in 12 sse instructions, plus the final
+    * stores I guess.  This is probably a bit more than that - maybe
+    * 32 or so?
+    */
+   out[0] = in[0];  out[1] = in[4];  out[2] = in[8];   out[3] = in[12];
+   out[4] = in[1];  out[5] = in[5];  out[6] = in[9];   out[7] = in[13];
+   out[8] = in[2];  out[9] = in[6];  out[10] = in[10]; out[11] = in[14];
+   out[12] = in[3]; out[13] = in[7]; out[14] = in[11]; out[15] = in[15];
+}
+
+
+			       
+void draw_update_vertex_fetch( struct draw_context *draw )
+{
+   //unsigned nr_attrs = draw->vertex_element_count;
+   unsigned nr_attrs = draw->vertex_shader->state->num_inputs;
+   unsigned i;
+
+   for (i = 0; i < nr_attrs; i++) {
+      unsigned buf = draw->vertex_element[i].vertex_buffer_index;
+      unsigned format  = draw->vertex_element[i].src_format;
+
+      draw->vertex_fetch.src_ptr[i] = (const ubyte *) (draw->user.vbuffer[buf] + 
+						       draw->vertex_buffer[buf].buffer_offset + 
+						       draw->vertex_element[i].src_offset );
+
+      draw->vertex_fetch.pitch[i] = draw->vertex_buffer[buf].pitch;
+      draw->vertex_fetch.fetch[i] = get_fetch_func( format );
    }
+
+   draw->vertex_fetch.nr_attrs = nr_attrs;
 }
 
 
@@ -100,40 +155,48 @@ void draw_vertex_fetch( struct draw_context *draw,
 			const unsigned *elts,
 			unsigned count )
 {
-   unsigned j;
-
-   /* loop over vertices */
-   for (j = 0; j < count; j++) {
-      uint attr;
-
-#if DRAW_DBG
-      printf("fetch vertex %u: \n", j);
-#endif
-
-      /* loop over vertex attributes (vertex shader inputs) */
-      for (attr = 0; attr < draw->vertex_shader->state->num_inputs; attr++) {
-
-         unsigned buf = draw->vertex_element[attr].vertex_buffer_index;
-         const void *src
-            = (const void *) ((const ubyte *) draw->user.vbuffer[buf]
-                              + draw->vertex_buffer[buf].buffer_offset
-                              + draw->vertex_element[attr].src_offset
-                              + elts[j] * draw->vertex_buffer[buf].pitch);
-         float p[4];
-
-         fetch_attrib4(src, draw->vertex_element[attr].src_format, p);
-
-#if DRAW_DBG
-         printf("  %u: %f %f %f %f\n", attr, p[0], p[1], p[2], p[3]);
-#endif
-
-         /* Transform to AoS xxxx/yyyy/zzzz/wwww representation:
-          */
-         machine->Inputs[attr].xyzw[0].f[j] = p[0]; /*X*/
-         machine->Inputs[attr].xyzw[1].f[j] = p[1]; /*Y*/
-         machine->Inputs[attr].xyzw[2].f[j] = p[2]; /*Z*/
-         machine->Inputs[attr].xyzw[3].f[j] = p[3]; /*W*/
-      }
+   unsigned nr_attrs = draw->vertex_fetch.nr_attrs;
+   unsigned attr;
+
+   assert(count <= 4);
+
+//   _mesa_printf("%s %d\n", __FUNCTION__, count);
+
+   /* loop over vertex attributes (vertex shader inputs)
+    */
+   for (attr = 0; attr < nr_attrs; attr++) {
+
+      const unsigned pitch   = draw->vertex_fetch.pitch[attr];
+      const ubyte *src = draw->vertex_fetch.src_ptr[attr];
+      const fetch_func fetch = draw->vertex_fetch.fetch[attr];
+      unsigned i;
+      float p[4][4];
+
+
+      /* Fetch four attributes for four vertices.  
+       * 
+       * Could fetch directly into AOS format, but this is meant to be
+       * a prototype for an sse implementation, which would have
+       * difficulties doing that.
+       */
+      for (i = 0; i < count; i++) 
+	 fetch( src + elts[i] * pitch, p[i] );
+
+      /* Be nice and zero out any missing vertices: 
+       */
+      for ( ; i < 4; i++) 
+	 p[i][0] = p[i][1] = p[i][2] = p[i][3] = 0;
+      
+      /* Transpose/swizzle into sse-friendly format.  Currently
+       * assuming that all vertex shader inputs are float[4], but this
+       * isn't true -- if the vertex shader only wants tex0.xy, we
+       * could optimize for that.
+       *
+       * To do so fully without codegen would probably require an
+       * excessive number of fetch functions, but we could at least
+       * minimize the transpose step:
+       */
+      transpose_4x4( (float *)&machine->Inputs[attr].xyzw[0].f[0], (float *)p );
    }
 }
 
diff --git a/src/mesa/pipe/draw/draw_vertex_shader.c b/src/mesa/pipe/draw/draw_vertex_shader.c
index d19b60198d..3041974b9a 100644
--- a/src/mesa/pipe/draw/draw_vertex_shader.c
+++ b/src/mesa/pipe/draw/draw_vertex_shader.c
@@ -201,6 +201,10 @@ draw_vertex_shader_queue_flush(struct draw_context *draw)
 {
    unsigned i, j;
 
+   /* XXX: do this on statechange: 
+    */
+   draw_update_vertex_fetch( draw );
+
 //   fprintf(stderr, " q(%d) ", draw->vs.queue_nr );
 #ifdef MESA_LLVM
    if (draw->vertex_shader->llvm_prog) {
author	Keith Whitwell <keith@tungstengraphics.com>	2008-01-24 11:19:06 +0000
committer	José Fonseca <jrfonseca@tungstengraphics.com>	2008-01-26 10:33:18 +0900
commit	027983f5850afea753381be454122166c6d56777 (patch)
tree	1c8eb9dc82c6f4241e73641dbfeb24913c1a4f8d /src
parent	c3e4e9260d7527bb0369650b861cba43834f7106 (diff)