From 524bba17a75cee597f588da9c19f25d758aa237b Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Tue, 29 Jan 2008 10:37:18 -0800
Subject: Initial pass at vertex shader on SPU using TGSI VM

All of the code is wired in on the SPU side, but it is not called from
the PPU yet.  Instruction / declaration fetch still needs to be
implemented in spu_exec.c.
---
 src/mesa/pipe/cell/spu/spu_vertex_fetch.c | 493 ++++++++++++++++++++++++++++++
 1 file changed, 493 insertions(+)
 create mode 100644 src/mesa/pipe/cell/spu/spu_vertex_fetch.c

(limited to 'src/mesa/pipe/cell/spu/spu_vertex_fetch.c')

diff --git a/src/mesa/pipe/cell/spu/spu_vertex_fetch.c b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
new file mode 100644
index 0000000000..b8f8c52eed
--- /dev/null
+++ b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
@@ -0,0 +1,493 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+#include "pipe/p_util.h"
+#include "pipe/p_state.h"
+#include "pipe/p_shader_tokens.h"
+#include "spu_exec.h"
+#include "spu_vertex_shader.h"
+
+
+#define DRAW_DBG 0
+
+
+/**
+ * Fetch a float[4] vertex attribute from memory, doing format/type
+ * conversion as needed.
+ *
+ * This is probably needed/dupliocated elsewhere, eg format
+ * conversion, texture sampling etc.
+ */
+#define FETCH_ATTRIB( NAME, SZ, CVT )			\
+static void						\
+fetch_##NAME(const void *ptr, float *attrib)		\
+{							\
+   static const float defaults[4] = { 0,0,0,1 };	\
+   int i;						\
+							\
+   for (i = 0; i < SZ; i++) {				\
+      attrib[i] = CVT;					\
+   }							\
+							\
+   for (; i < 4; i++) {					\
+      attrib[i] = defaults[i];				\
+   }							\
+}
+
+#define CVT_64_FLOAT   (float) ((double *) ptr)[i]
+#define CVT_32_FLOAT   ((float *) ptr)[i]
+
+#define CVT_8_USCALED  (float) ((unsigned char *) ptr)[i]
+#define CVT_16_USCALED (float) ((unsigned short *) ptr)[i]
+#define CVT_32_USCALED (float) ((unsigned int *) ptr)[i]
+
+#define CVT_8_SSCALED  (float) ((char *) ptr)[i]
+#define CVT_16_SSCALED (float) ((short *) ptr)[i]
+#define CVT_32_SSCALED (float) ((int *) ptr)[i]
+
+#define CVT_8_UNORM    (float) ((unsigned char *) ptr)[i] / 255.0f
+#define CVT_16_UNORM   (float) ((unsigned short *) ptr)[i] / 65535.0f
+#define CVT_32_UNORM   (float) ((unsigned int *) ptr)[i] / 4294967295.0f
+
+#define CVT_8_SNORM    (float) ((char *) ptr)[i] / 127.0f
+#define CVT_16_SNORM   (float) ((short *) ptr)[i] / 32767.0f
+#define CVT_32_SNORM   (float) ((int *) ptr)[i] / 2147483647.0f
+
+FETCH_ATTRIB( R64G64B64A64_FLOAT,   4, CVT_64_FLOAT )
+FETCH_ATTRIB( R64G64B64_FLOAT,      3, CVT_64_FLOAT )
+FETCH_ATTRIB( R64G64_FLOAT,         2, CVT_64_FLOAT )
+FETCH_ATTRIB( R64_FLOAT,            1, CVT_64_FLOAT )
+
+FETCH_ATTRIB( R32G32B32A32_FLOAT,   4, CVT_32_FLOAT )
+FETCH_ATTRIB( R32G32B32_FLOAT,      3, CVT_32_FLOAT )
+FETCH_ATTRIB( R32G32_FLOAT,         2, CVT_32_FLOAT )
+FETCH_ATTRIB( R32_FLOAT,            1, CVT_32_FLOAT )
+
+FETCH_ATTRIB( R32G32B32A32_USCALED, 4, CVT_32_USCALED )
+FETCH_ATTRIB( R32G32B32_USCALED,    3, CVT_32_USCALED )
+FETCH_ATTRIB( R32G32_USCALED,       2, CVT_32_USCALED )
+FETCH_ATTRIB( R32_USCALED,          1, CVT_32_USCALED )
+
+FETCH_ATTRIB( R32G32B32A32_SSCALED, 4, CVT_32_SSCALED )
+FETCH_ATTRIB( R32G32B32_SSCALED,    3, CVT_32_SSCALED )
+FETCH_ATTRIB( R32G32_SSCALED,       2, CVT_32_SSCALED )
+FETCH_ATTRIB( R32_SSCALED,          1, CVT_32_SSCALED )
+
+FETCH_ATTRIB( R32G32B32A32_UNORM, 4, CVT_32_UNORM )
+FETCH_ATTRIB( R32G32B32_UNORM,    3, CVT_32_UNORM )
+FETCH_ATTRIB( R32G32_UNORM,       2, CVT_32_UNORM )
+FETCH_ATTRIB( R32_UNORM,          1, CVT_32_UNORM )
+
+FETCH_ATTRIB( R32G32B32A32_SNORM, 4, CVT_32_SNORM )
+FETCH_ATTRIB( R32G32B32_SNORM,    3, CVT_32_SNORM )
+FETCH_ATTRIB( R32G32_SNORM,       2, CVT_32_SNORM )
+FETCH_ATTRIB( R32_SNORM,          1, CVT_32_SNORM )
+
+FETCH_ATTRIB( R16G16B16A16_USCALED, 4, CVT_16_USCALED )
+FETCH_ATTRIB( R16G16B16_USCALED,    3, CVT_16_USCALED )
+FETCH_ATTRIB( R16G16_USCALED,       2, CVT_16_USCALED )
+FETCH_ATTRIB( R16_USCALED,          1, CVT_16_USCALED )
+
+FETCH_ATTRIB( R16G16B16A16_SSCALED, 4, CVT_16_SSCALED )
+FETCH_ATTRIB( R16G16B16_SSCALED,    3, CVT_16_SSCALED )
+FETCH_ATTRIB( R16G16_SSCALED,       2, CVT_16_SSCALED )
+FETCH_ATTRIB( R16_SSCALED,          1, CVT_16_SSCALED )
+
+FETCH_ATTRIB( R16G16B16A16_UNORM, 4, CVT_16_UNORM )
+FETCH_ATTRIB( R16G16B16_UNORM,    3, CVT_16_UNORM )
+FETCH_ATTRIB( R16G16_UNORM,       2, CVT_16_UNORM )
+FETCH_ATTRIB( R16_UNORM,          1, CVT_16_UNORM )
+
+FETCH_ATTRIB( R16G16B16A16_SNORM, 4, CVT_16_SNORM )
+FETCH_ATTRIB( R16G16B16_SNORM,    3, CVT_16_SNORM )
+FETCH_ATTRIB( R16G16_SNORM,       2, CVT_16_SNORM )
+FETCH_ATTRIB( R16_SNORM,          1, CVT_16_SNORM )
+
+FETCH_ATTRIB( R8G8B8A8_USCALED,   4, CVT_8_USCALED )
+FETCH_ATTRIB( R8G8B8_USCALED,     3, CVT_8_USCALED )
+FETCH_ATTRIB( R8G8_USCALED,       2, CVT_8_USCALED )
+FETCH_ATTRIB( R8_USCALED,         1, CVT_8_USCALED )
+
+FETCH_ATTRIB( R8G8B8A8_SSCALED,  4, CVT_8_SSCALED )
+FETCH_ATTRIB( R8G8B8_SSCALED,    3, CVT_8_SSCALED )
+FETCH_ATTRIB( R8G8_SSCALED,      2, CVT_8_SSCALED )
+FETCH_ATTRIB( R8_SSCALED,        1, CVT_8_SSCALED )
+
+FETCH_ATTRIB( R8G8B8A8_UNORM,  4, CVT_8_UNORM )
+FETCH_ATTRIB( R8G8B8_UNORM,    3, CVT_8_UNORM )
+FETCH_ATTRIB( R8G8_UNORM,      2, CVT_8_UNORM )
+FETCH_ATTRIB( R8_UNORM,        1, CVT_8_UNORM )
+
+FETCH_ATTRIB( R8G8B8A8_SNORM,  4, CVT_8_SNORM )
+FETCH_ATTRIB( R8G8B8_SNORM,    3, CVT_8_SNORM )
+FETCH_ATTRIB( R8G8_SNORM,      2, CVT_8_SNORM )
+FETCH_ATTRIB( R8_SNORM,        1, CVT_8_SNORM )
+
+FETCH_ATTRIB( A8R8G8B8_UNORM,       4, CVT_8_UNORM )
+//FETCH_ATTRIB( R8G8B8A8_UNORM,       4, CVT_8_UNORM )
+
+
+
+static spu_fetch_func get_fetch_func( enum pipe_format format )
+{
+#if 0
+   {
+      char tmp[80];
+      pf_sprint_name(tmp, format);
+      _mesa_printf("%s: %s\n", __FUNCTION__, tmp);
+   }
+#endif
+
+   switch (format) {
+   case PIPE_FORMAT_R64_FLOAT:
+      return fetch_R64_FLOAT;
+   case PIPE_FORMAT_R64G64_FLOAT:
+      return fetch_R64G64_FLOAT;
+   case PIPE_FORMAT_R64G64B64_FLOAT:
+      return fetch_R64G64B64_FLOAT;
+   case PIPE_FORMAT_R64G64B64A64_FLOAT:
+      return fetch_R64G64B64A64_FLOAT;
+
+   case PIPE_FORMAT_R32_FLOAT:
+      return fetch_R32_FLOAT;
+   case PIPE_FORMAT_R32G32_FLOAT:
+      return fetch_R32G32_FLOAT;
+   case PIPE_FORMAT_R32G32B32_FLOAT:
+      return fetch_R32G32B32_FLOAT;
+   case PIPE_FORMAT_R32G32B32A32_FLOAT:
+      return fetch_R32G32B32A32_FLOAT;
+
+   case PIPE_FORMAT_R32_UNORM:
+      return fetch_R32_UNORM;
+   case PIPE_FORMAT_R32G32_UNORM:
+      return fetch_R32G32_UNORM;
+   case PIPE_FORMAT_R32G32B32_UNORM:
+      return fetch_R32G32B32_UNORM;
+   case PIPE_FORMAT_R32G32B32A32_UNORM:
+      return fetch_R32G32B32A32_UNORM;
+
+   case PIPE_FORMAT_R32_USCALED:
+      return fetch_R32_USCALED;
+   case PIPE_FORMAT_R32G32_USCALED:
+      return fetch_R32G32_USCALED;
+   case PIPE_FORMAT_R32G32B32_USCALED:
+      return fetch_R32G32B32_USCALED;
+   case PIPE_FORMAT_R32G32B32A32_USCALED:
+      return fetch_R32G32B32A32_USCALED;
+
+   case PIPE_FORMAT_R32_SNORM:
+      return fetch_R32_SNORM;
+   case PIPE_FORMAT_R32G32_SNORM:
+      return fetch_R32G32_SNORM;
+   case PIPE_FORMAT_R32G32B32_SNORM:
+      return fetch_R32G32B32_SNORM;
+   case PIPE_FORMAT_R32G32B32A32_SNORM:
+      return fetch_R32G32B32A32_SNORM;
+
+   case PIPE_FORMAT_R32_SSCALED:
+      return fetch_R32_SSCALED;
+   case PIPE_FORMAT_R32G32_SSCALED:
+      return fetch_R32G32_SSCALED;
+   case PIPE_FORMAT_R32G32B32_SSCALED:
+      return fetch_R32G32B32_SSCALED;
+   case PIPE_FORMAT_R32G32B32A32_SSCALED:
+      return fetch_R32G32B32A32_SSCALED;
+
+   case PIPE_FORMAT_R16_UNORM:
+      return fetch_R16_UNORM;
+   case PIPE_FORMAT_R16G16_UNORM:
+      return fetch_R16G16_UNORM;
+   case PIPE_FORMAT_R16G16B16_UNORM:
+      return fetch_R16G16B16_UNORM;
+   case PIPE_FORMAT_R16G16B16A16_UNORM:
+      return fetch_R16G16B16A16_UNORM;
+
+   case PIPE_FORMAT_R16_USCALED:
+      return fetch_R16_USCALED;
+   case PIPE_FORMAT_R16G16_USCALED:
+      return fetch_R16G16_USCALED;
+   case PIPE_FORMAT_R16G16B16_USCALED:
+      return fetch_R16G16B16_USCALED;
+   case PIPE_FORMAT_R16G16B16A16_USCALED:
+      return fetch_R16G16B16A16_USCALED;
+
+   case PIPE_FORMAT_R16_SNORM:
+      return fetch_R16_SNORM;
+   case PIPE_FORMAT_R16G16_SNORM:
+      return fetch_R16G16_SNORM;
+   case PIPE_FORMAT_R16G16B16_SNORM:
+      return fetch_R16G16B16_SNORM;
+   case PIPE_FORMAT_R16G16B16A16_SNORM:
+      return fetch_R16G16B16A16_SNORM;
+
+   case PIPE_FORMAT_R16_SSCALED:
+      return fetch_R16_SSCALED;
+   case PIPE_FORMAT_R16G16_SSCALED:
+      return fetch_R16G16_SSCALED;
+   case PIPE_FORMAT_R16G16B16_SSCALED:
+      return fetch_R16G16B16_SSCALED;
+   case PIPE_FORMAT_R16G16B16A16_SSCALED:
+      return fetch_R16G16B16A16_SSCALED;
+
+   case PIPE_FORMAT_R8_UNORM:
+      return fetch_R8_UNORM;
+   case PIPE_FORMAT_R8G8_UNORM:
+      return fetch_R8G8_UNORM;
+   case PIPE_FORMAT_R8G8B8_UNORM:
+      return fetch_R8G8B8_UNORM;
+   case PIPE_FORMAT_R8G8B8A8_UNORM:
+      return fetch_R8G8B8A8_UNORM;
+
+   case PIPE_FORMAT_R8_USCALED:
+      return fetch_R8_USCALED;
+   case PIPE_FORMAT_R8G8_USCALED:
+      return fetch_R8G8_USCALED;
+   case PIPE_FORMAT_R8G8B8_USCALED:
+      return fetch_R8G8B8_USCALED;
+   case PIPE_FORMAT_R8G8B8A8_USCALED:
+      return fetch_R8G8B8A8_USCALED;
+
+   case PIPE_FORMAT_R8_SNORM:
+      return fetch_R8_SNORM;
+   case PIPE_FORMAT_R8G8_SNORM:
+      return fetch_R8G8_SNORM;
+   case PIPE_FORMAT_R8G8B8_SNORM:
+      return fetch_R8G8B8_SNORM;
+   case PIPE_FORMAT_R8G8B8A8_SNORM:
+      return fetch_R8G8B8A8_SNORM;
+
+   case PIPE_FORMAT_R8_SSCALED:
+      return fetch_R8_SSCALED;
+   case PIPE_FORMAT_R8G8_SSCALED:
+      return fetch_R8G8_SSCALED;
+   case PIPE_FORMAT_R8G8B8_SSCALED:
+      return fetch_R8G8B8_SSCALED;
+   case PIPE_FORMAT_R8G8B8A8_SSCALED:
+      return fetch_R8G8B8A8_SSCALED;
+
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
+      return fetch_A8R8G8B8_UNORM;
+
+   case 0:
+      return NULL;		/* not sure why this is needed */
+
+   default:
+      assert(0);
+      return NULL;
+   }
+}
+
+
+static void 
+transpose_4x4( float *out, const float *in )
+{
+   /* This can be achieved in 12 sse instructions, plus the final
+    * stores I guess.  This is probably a bit more than that - maybe
+    * 32 or so?
+    */
+   out[0] = in[0];  out[1] = in[4];  out[2] = in[8];   out[3] = in[12];
+   out[4] = in[1];  out[5] = in[5];  out[6] = in[9];   out[7] = in[13];
+   out[8] = in[2];  out[9] = in[6];  out[10] = in[10]; out[11] = in[14];
+   out[12] = in[3]; out[13] = in[7]; out[14] = in[11]; out[15] = in[15];
+}
+
+
+
+static void fetch_xyz_rgb( struct spu_vs_context *draw,
+			   struct spu_exec_machine *machine,
+			   const unsigned *elts,
+			   unsigned count )
+{
+   assert(count <= 4);
+
+//   _mesa_printf("%s\n", __FUNCTION__);
+
+   /* loop over vertex attributes (vertex shader inputs)
+    */
+
+   const unsigned *pitch   = draw->vertex_fetch.pitch;
+   const ubyte **src       = draw->vertex_fetch.src_ptr;
+   int i;
+
+   for (i = 0; i < 4; i++) {
+      {
+	 const float *in = (const float *)(src[0] + elts[i] * pitch[0]);
+	 float *out = &machine->Inputs[0].xyzw[0].f[i];
+	 out[0] = in[0];
+	 out[4] = in[1];
+	 out[8] = in[2];
+ 	 out[12] = 1.0f;
+      }
+
+      {
+	 const float *in = (const float *)(src[1] + elts[i] * pitch[1]);
+	 float *out = &machine->Inputs[1].xyzw[0].f[i];
+	 out[0] = in[0];
+	 out[4] = in[1];
+	 out[8] = in[2];
+ 	 out[12] = 1.0f;
+      }
+   }
+}
+
+
+
+
+static void fetch_xyz_rgb_st( struct spu_vs_context *draw,
+			      struct spu_exec_machine *machine,
+			      const unsigned *elts,
+			      unsigned count )
+{
+   assert(count <= 4);
+
+   /* loop over vertex attributes (vertex shader inputs)
+    */
+
+   const unsigned *pitch   = draw->vertex_fetch.pitch;
+   const ubyte **src       = draw->vertex_fetch.src_ptr;
+   int i;
+
+   for (i = 0; i < 4; i++) {
+      {
+	 const float *in = (const float *)(src[0] + elts[i] * pitch[0]);
+	 float *out = &machine->Inputs[0].xyzw[0].f[i];
+	 out[0] = in[0];
+	 out[4] = in[1];
+	 out[8] = in[2];
+ 	 out[12] = 1.0f;
+      }
+
+      {
+	 const float *in = (const float *)(src[1] + elts[i] * pitch[1]);
+	 float *out = &machine->Inputs[1].xyzw[0].f[i];
+	 out[0] = in[0];
+	 out[4] = in[1];
+	 out[8] = in[2];
+ 	 out[12] = 1.0f;
+      }
+
+      {
+	 const float *in = (const float *)(src[2] + elts[i] * pitch[2]);
+	 float *out = &machine->Inputs[1].xyzw[0].f[i];
+	 out[0] = in[0];
+	 out[4] = in[1];
+	 out[8] = 0.0f;
+ 	 out[12] = 1.0f;
+      }
+   }
+}
+
+
+
+
+/**
+ * Fetch vertex attributes for 'count' vertices.
+ */
+static void generic_vertex_fetch( struct spu_vs_context *draw,
+				  struct spu_exec_machine *machine,
+				  const unsigned *elts,
+				  unsigned count )
+{
+   unsigned nr_attrs = draw->vertex_fetch.nr_attrs;
+   unsigned attr;
+
+   assert(count <= 4);
+
+//   _mesa_printf("%s %d\n", __FUNCTION__, count);
+
+   /* loop over vertex attributes (vertex shader inputs)
+    */
+   for (attr = 0; attr < nr_attrs; attr++) {
+
+      const unsigned pitch   = draw->vertex_fetch.pitch[attr];
+      const ubyte *src = draw->vertex_fetch.src_ptr[attr];
+      const spu_fetch_func fetch = draw->vertex_fetch.fetch[attr];
+      unsigned i;
+      float p[4][4];
+
+
+      /* Fetch four attributes for four vertices.  
+       * 
+       * Could fetch directly into AOS format, but this is meant to be
+       * a prototype for an sse implementation, which would have
+       * difficulties doing that.
+       */
+      for (i = 0; i < count; i++) 
+	 fetch( src + elts[i] * pitch, p[i] );
+
+      /* Be nice and zero out any missing vertices: 
+       */
+      for (/* empty */; i < 4; i++) 
+	 p[i][0] = p[i][1] = p[i][2] = p[i][3] = 0;
+      
+      /* Transpose/swizzle into sse-friendly format.  Currently
+       * assuming that all vertex shader inputs are float[4], but this
+       * isn't true -- if the vertex shader only wants tex0.xy, we
+       * could optimize for that.
+       *
+       * To do so fully without codegen would probably require an
+       * excessive number of fetch functions, but we could at least
+       * minimize the transpose step:
+       */
+      transpose_4x4( (float *)&machine->Inputs[attr].xyzw[0].f[0], (float *)p );
+   }
+}
+
+
+void spu_update_vertex_fetch( struct spu_vs_context *draw )
+{
+   unsigned i;
+
+   
+   for (i = 0; i < draw->vertex_fetch.nr_attrs; i++) {
+      draw->vertex_fetch.fetch[i] =
+          get_fetch_func(draw->vertex_fetch.format[i]);
+   }
+
+   draw->vertex_fetch.fetch_func = generic_vertex_fetch;
+
+   switch (draw->vertex_fetch.nr_attrs) {
+   case 2:
+      if (draw->vertex_fetch.format[0] == PIPE_FORMAT_R32G32B32_FLOAT &&
+          draw->vertex_fetch.format[1] == PIPE_FORMAT_R32G32B32_FLOAT)
+          draw->vertex_fetch.fetch_func = fetch_xyz_rgb;
+      break;
+   case 3:
+      if (draw->vertex_fetch.format[0] == PIPE_FORMAT_R32G32B32_FLOAT &&
+          draw->vertex_fetch.format[1] == PIPE_FORMAT_R32G32B32_FLOAT &&
+          draw->vertex_fetch.format[2] == PIPE_FORMAT_R32G32_FLOAT)
+          draw->vertex_fetch.fetch_func = fetch_xyz_rgb_st;
+      break;
+   default:
+      break;
+   }
+}
-- 
cgit v1.2.3


From 13eec106881b846538bef13d694c9d2d9cf1ae6b Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Tue, 29 Jan 2008 11:28:06 -0800
Subject: Implement vertex fetch / vertex shader output write-back

---
 src/mesa/pipe/cell/spu/spu_vertex_fetch.c  | 32 +++++++++++----
 src/mesa/pipe/cell/spu/spu_vertex_shader.c | 62 +++++++++++++++---------------
 src/mesa/pipe/draw/draw_context.c          |  5 ++-
 3 files changed, 58 insertions(+), 41 deletions(-)

(limited to 'src/mesa/pipe/cell/spu/spu_vertex_fetch.c')

diff --git a/src/mesa/pipe/cell/spu/spu_vertex_fetch.c b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
index b8f8c52eed..0192227d57 100644
--- a/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
+++ b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
@@ -30,11 +30,13 @@
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
 
+#include <spu_mfcio.h>
 #include "pipe/p_util.h"
 #include "pipe/p_state.h"
 #include "pipe/p_shader_tokens.h"
 #include "spu_exec.h"
 #include "spu_vertex_shader.h"
+#include "spu_main.h"
 
 
 #define DRAW_DBG 0
@@ -412,16 +414,18 @@ static void fetch_xyz_rgb_st( struct spu_vs_context *draw,
 /**
  * Fetch vertex attributes for 'count' vertices.
  */
-static void generic_vertex_fetch( struct spu_vs_context *draw,
-				  struct spu_exec_machine *machine,
-				  const unsigned *elts,
-				  unsigned count )
+static void generic_vertex_fetch(struct spu_vs_context *draw,
+                                 struct spu_exec_machine *machine,
+                                 const unsigned *elts,
+                                 unsigned count)
 {
    unsigned nr_attrs = draw->vertex_fetch.nr_attrs;
    unsigned attr;
 
    assert(count <= 4);
 
+   wait_on_mask(1 << TAG_VERTEX_BUFFER);
+
 //   _mesa_printf("%s %d\n", __FUNCTION__, count);
 
    /* loop over vertex attributes (vertex shader inputs)
@@ -441,13 +445,23 @@ static void generic_vertex_fetch( struct spu_vs_context *draw,
        * a prototype for an sse implementation, which would have
        * difficulties doing that.
        */
-      for (i = 0; i < count; i++) 
-	 fetch( src + elts[i] * pitch, p[i] );
+      for (i = 0; i < count; i++) {
+         uint8_t buffer[32 + (sizeof(float) * 4)] ALIGN16_ATTRIB;
+         const unsigned long addr = src + elts[i] * pitch;
+         const unsigned size = (sizeof(float) * 4) + (addr & 0x0f);
+
+         mfc_get(buffer, addr & ~0x0f, size, TAG_VERTEX_BUFFER, 0, 0);
+         wait_on_mask(1 << TAG_VERTEX_BUFFER);
+
+         memcpy(& buffer, buffer + (addr & 0x0f), sizeof(float) * 4);
+
+         fetch(buffer, p[i]);
+      }
 
       /* Be nice and zero out any missing vertices: 
        */
       for (/* empty */; i < 4; i++) 
-	 p[i][0] = p[i][1] = p[i][2] = p[i][3] = 0;
+          p[i][0] = p[i][1] = p[i][2] = p[i][3] = 0;
       
       /* Transpose/swizzle into sse-friendly format.  Currently
        * assuming that all vertex shader inputs are float[4], but this
@@ -475,6 +489,9 @@ void spu_update_vertex_fetch( struct spu_vs_context *draw )
 
    draw->vertex_fetch.fetch_func = generic_vertex_fetch;
 
+   /* Disable the fast path because they don't use mfc_get yet.
+    */
+#if 0
    switch (draw->vertex_fetch.nr_attrs) {
    case 2:
       if (draw->vertex_fetch.format[0] == PIPE_FORMAT_R32G32B32_FLOAT &&
@@ -490,4 +507,5 @@ void spu_update_vertex_fetch( struct spu_vs_context *draw )
    default:
       break;
    }
+#endif
 }
diff --git a/src/mesa/pipe/cell/spu/spu_vertex_shader.c b/src/mesa/pipe/cell/spu/spu_vertex_shader.c
index e694ff729f..595f54b0eb 100644
--- a/src/mesa/pipe/cell/spu/spu_vertex_shader.c
+++ b/src/mesa/pipe/cell/spu/spu_vertex_shader.c
@@ -32,6 +32,8 @@
   *   Ian Romanick <idr@us.ibm.com>
   */
 
+#include <spu_mfcio.h>
+
 #include "pipe/p_util.h"
 #include "pipe/p_state.h"
 #include "pipe/p_shader_tokens.h"
@@ -40,9 +42,7 @@
 #include "pipe/draw/draw_private.h"
 #include "pipe/draw/draw_context.h"
 #include "pipe/cell/common.h"
-
-#define DBG_VS 0
-
+#include "spu_main.h"
 
 static INLINE unsigned
 compute_clipmask(const float *clip, /*const*/ float plane[][4], unsigned nr)
@@ -110,6 +110,12 @@ run_vertex_program(struct spu_vs_context *draw,
    for (j = 0; j < count; j++) {
       unsigned slot;
       float x, y, z, w;
+      unsigned char buffer[sizeof(struct vertex_header)
+			   + MAX_VERTEX_SIZE] ALIGN16_ATTRIB;
+      struct vertex_header *const tmpOut =
+	  (struct vertex_header *) buffer;
+      const unsigned vert_size = sizeof(struct vertex_header)
+	  + (sizeof(float) * 4 * draw->num_vs_outputs);
 
       /* Handle attr[0] (position) specially:
        *
@@ -117,14 +123,14 @@ run_vertex_program(struct spu_vs_context *draw,
        * program as a set of DP4 instructions appended to the
        * user-provided code.
        */
-      x = vOut[j]->clip[0] = machine->Outputs[0].xyzw[0].f[j];
-      y = vOut[j]->clip[1] = machine->Outputs[0].xyzw[1].f[j];
-      z = vOut[j]->clip[2] = machine->Outputs[0].xyzw[2].f[j];
-      w = vOut[j]->clip[3] = machine->Outputs[0].xyzw[3].f[j];
+      x = tmpOut->clip[0] = machine->Outputs[0].xyzw[0].f[j];
+      y = tmpOut->clip[1] = machine->Outputs[0].xyzw[1].f[j];
+      z = tmpOut->clip[2] = machine->Outputs[0].xyzw[2].f[j];
+      w = tmpOut->clip[3] = machine->Outputs[0].xyzw[3].f[j];
 
-      vOut[j]->clipmask = compute_clipmask(vOut[j]->clip, draw->plane,
+      tmpOut->clipmask = compute_clipmask(tmpOut->clip, draw->plane,
 					   draw->nr_planes);
-      vOut[j]->edgeflag = 1;
+      tmpOut->edgeflag = 1;
 
       /* divide by w */
       w = 1.0f / w;
@@ -133,35 +139,27 @@ run_vertex_program(struct spu_vs_context *draw,
       z *= w;
 
       /* Viewport mapping */
-      vOut[j]->data[0][0] = x * scale[0] + trans[0];
-      vOut[j]->data[0][1] = y * scale[1] + trans[1];
-      vOut[j]->data[0][2] = z * scale[2] + trans[2];
-      vOut[j]->data[0][3] = w;
-
-#if DBG_VS
-      printf("output[%d]win: %f %f %f %f\n", j,
-             vOut[j]->data[0][0],
-             vOut[j]->data[0][1],
-             vOut[j]->data[0][2],
-             vOut[j]->data[0][3]);
-#endif
+      tmpOut->data[0][0] = x * scale[0] + trans[0];
+      tmpOut->data[0][1] = y * scale[1] + trans[1];
+      tmpOut->data[0][2] = z * scale[2] + trans[2];
+      tmpOut->data[0][3] = w;
+
       /* Remaining attributes are packed into sequential post-transform
        * vertex attrib slots.
        */
       for (slot = 1; slot < draw->num_vs_outputs; slot++) {
-         vOut[j]->data[slot][0] = machine->Outputs[slot].xyzw[0].f[j];
-         vOut[j]->data[slot][1] = machine->Outputs[slot].xyzw[1].f[j];
-         vOut[j]->data[slot][2] = machine->Outputs[slot].xyzw[2].f[j];
-         vOut[j]->data[slot][3] = machine->Outputs[slot].xyzw[3].f[j];
-#if DBG_VS
-         printf("output[%d][%d]: %f %f %f %f\n", j, slot,
-                vOut[j]->data[slot][0],
-                vOut[j]->data[slot][1],
-                vOut[j]->data[slot][2],
-                vOut[j]->data[slot][3]);
-#endif
+         tmpOut->data[slot][0] = machine->Outputs[slot].xyzw[0].f[j];
+         tmpOut->data[slot][1] = machine->Outputs[slot].xyzw[1].f[j];
+         tmpOut->data[slot][2] = machine->Outputs[slot].xyzw[2].f[j];
+         tmpOut->data[slot][3] = machine->Outputs[slot].xyzw[3].f[j];
       }
+
+      wait_on_mask(1 << TAG_VERTEX_BUFFER);
+      mfc_put(tmpOut, vOut[j], vert_size, TAG_VERTEX_BUFFER, 0, 0);
+
    } /* loop over vertices */
+
+   wait_on_mask(1 << TAG_VERTEX_BUFFER);
 }
 
 
diff --git a/src/mesa/pipe/draw/draw_context.c b/src/mesa/pipe/draw/draw_context.c
index e8ca1f035b..711bcd02f6 100644
--- a/src/mesa/pipe/draw/draw_context.c
+++ b/src/mesa/pipe/draw/draw_context.c
@@ -71,10 +71,11 @@ struct draw_context *draw_create( void )
     */
    {
       uint i;
-      char *tmp = (char*) MALLOC( Elements(draw->vcache.vertex) * MAX_VERTEX_SIZE );
+      const unsigned size = (MAX_VERTEX_SIZE + 0x0f) & ~0x0f;
+      char *tmp = align_malloc(Elements(draw->vcache.vertex) * size, 16);
 
       for (i = 0; i < Elements(draw->vcache.vertex); i++)
-	 draw->vcache.vertex[i] = (struct vertex_header *)(tmp + i * MAX_VERTEX_SIZE);
+	 draw->vcache.vertex[i] = (struct vertex_header *)(tmp + i * size);
    }
 
    draw->convert_wide_points = TRUE;
-- 
cgit v1.2.3


From 7b27d9fd660c122fb2ec50007129d67e78814587 Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Wed, 30 Jan 2008 17:26:22 -0800
Subject: Fix size calculation in attribute fetch.

---
 src/mesa/pipe/cell/spu/spu_vertex_fetch.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'src/mesa/pipe/cell/spu/spu_vertex_fetch.c')

diff --git a/src/mesa/pipe/cell/spu/spu_vertex_fetch.c b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
index 0192227d57..1e846868e3 100644
--- a/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
+++ b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
@@ -446,14 +446,14 @@ static void generic_vertex_fetch(struct spu_vs_context *draw,
        * difficulties doing that.
        */
       for (i = 0; i < count; i++) {
-         uint8_t buffer[32 + (sizeof(float) * 4)] ALIGN16_ATTRIB;
-         const unsigned long addr = src + elts[i] * pitch;
-         const unsigned size = (sizeof(float) * 4) + (addr & 0x0f);
+         uint8_t buffer[32] ALIGN16_ATTRIB;
+         const unsigned long addr = src + (elts[i] * pitch);
+         const unsigned size = ((addr & 0x0f) == 0) ? 16 : 32;
 
          mfc_get(buffer, addr & ~0x0f, size, TAG_VERTEX_BUFFER, 0, 0);
          wait_on_mask(1 << TAG_VERTEX_BUFFER);
 
-         memcpy(& buffer, buffer + (addr & 0x0f), sizeof(float) * 4);
+         memmove(& buffer, buffer + (addr & 0x0f), 16);
 
          fetch(buffer, p[i]);
       }
-- 
cgit v1.2.3


From 5db1593c78192b764ad2ef7bdc5182d8ec4aed7c Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 4 Feb 2008 18:05:37 -0700
Subject: Cell: fix some alignment issues by aligning commands to 8-byte
 boundaries

Contributed by Ian Romanick.
Also, temporarily disable inlined vertex buffers.  They need to be 16-byte
aligned...
---
 src/mesa/pipe/cell/common.h                 | 16 ++++----
 src/mesa/pipe/cell/ppu/cell_batch.c         |  4 +-
 src/mesa/pipe/cell/ppu/cell_flush.c         |  2 +-
 src/mesa/pipe/cell/ppu/cell_state_emit.c    |  3 +-
 src/mesa/pipe/cell/ppu/cell_vbuf.c          |  4 +-
 src/mesa/pipe/cell/ppu/cell_vertex_shader.c | 22 ++++++-----
 src/mesa/pipe/cell/spu/spu_main.c           | 58 +++++++++++++----------------
 src/mesa/pipe/cell/spu/spu_vertex_fetch.c   |  7 ++--
 src/mesa/pipe/cell/spu/spu_vertex_shader.h  |  2 +-
 9 files changed, 57 insertions(+), 61 deletions(-)

(limited to 'src/mesa/pipe/cell/spu/spu_vertex_fetch.c')

diff --git a/src/mesa/pipe/cell/common.h b/src/mesa/pipe/cell/common.h
index d861e82d33..cf8fc94ebf 100644
--- a/src/mesa/pipe/cell/common.h
+++ b/src/mesa/pipe/cell/common.h
@@ -57,6 +57,9 @@
 /** round up value to next multiple of 4 */
 #define ROUNDUP4(k)  (((k) + 0x3) & ~0x3)
 
+/** round up value to next multiple of 8 */
+#define ROUNDUP8(k)  (((k) + 0x7) & ~0x7)
+
 /** round up value to next multiple of 16 */
 #define ROUNDUP16(k)  (((k) + 0xf) & ~0xf)
 
@@ -102,7 +105,7 @@
  */
 struct cell_command_framebuffer
 {
-   uint opcode;
+   uint64_t opcode;
    int width, height;
    void *color_start, *depth_start;
    enum pipe_format color_format, depth_format;
@@ -114,7 +117,7 @@ struct cell_command_framebuffer
  */
 struct cell_command_clear_surface
 {
-   uint opcode;
+   uint64_t opcode;
    uint surface; /**< Temporary: 0=color, 1=Z */
    uint value;
 };
@@ -125,8 +128,7 @@ struct cell_command_clear_surface
  */
 struct cell_array_info
 {
-    uint opcode;
-    uint base;          /**< Base address of the 0th element. */
+    uint64_t base;          /**< Base address of the 0th element. */
     uint attr;          /**< Attribute that this state if for. */
     uint pitch;         /**< Byte pitch from one entry to the next. */
     uint format;        /**< Pipe format of each entry. */
@@ -150,7 +152,7 @@ struct cell_shader_info
 #define SPU_VERTS_PER_BATCH 64
 struct cell_command_vs
 {
-   uint opcode;       /**< CELL_CMD_VS_EXECUTE */
+   uint64_t opcode;       /**< CELL_CMD_VS_EXECUTE */
    struct cell_shader_info   shader;
    unsigned num_elts;
    unsigned elts[SPU_VERTS_PER_BATCH];
@@ -163,7 +165,7 @@ struct cell_command_vs
 
 struct cell_command_render
 {
-   uint opcode;       /**< CELL_CMD_RENDER */
+   uint64_t opcode;   /**< CELL_CMD_RENDER */
    uint prim_type;    /**< PIPE_PRIM_x */
    uint num_verts;
    uint vertex_size;  /**< bytes per vertex */
@@ -179,7 +181,7 @@ struct cell_command_render
 
 struct cell_command_release_verts
 {
-   int opcode;         /**< CELL_CMD_RELEASE_VERTS */
+   uint64_t opcode;         /**< CELL_CMD_RELEASE_VERTS */
    uint vertex_buf;    /**< in [0, CELL_NUM_BUFFERS-1] */
 };
 
diff --git a/src/mesa/pipe/cell/ppu/cell_batch.c b/src/mesa/pipe/cell/ppu/cell_batch.c
index 2d032fc902..2fb49711b2 100644
--- a/src/mesa/pipe/cell/ppu/cell_batch.c
+++ b/src/mesa/pipe/cell/ppu/cell_batch.c
@@ -136,7 +136,7 @@ cell_batch_append(struct cell_context *cell, const void *data, uint bytes)
 {
    uint size;
 
-   ASSERT(bytes % 4 == 0);
+   ASSERT(bytes % 8 == 0);
    ASSERT(bytes <= CELL_BUFFER_SIZE);
    ASSERT(cell->cur_batch >= 0);
 
@@ -171,7 +171,7 @@ cell_batch_alloc(struct cell_context *cell, uint bytes)
    void *pos;
    uint size;
 
-   ASSERT(bytes % 4 == 0);
+   ASSERT(bytes % 8 == 0);
    ASSERT(bytes <= CELL_BUFFER_SIZE);
 
    assert(cell->cur_batch >= 0);
diff --git a/src/mesa/pipe/cell/ppu/cell_flush.c b/src/mesa/pipe/cell/ppu/cell_flush.c
index cf4e676645..f62bc4650c 100644
--- a/src/mesa/pipe/cell/ppu/cell_flush.c
+++ b/src/mesa/pipe/cell/ppu/cell_flush.c
@@ -59,7 +59,7 @@ cell_flush_int(struct pipe_context *pipe, unsigned flags)
    flushing = TRUE;
 
    if (flags & PIPE_FLUSH_WAIT) {
-      uint *cmd = (uint *) cell_batch_alloc(cell, sizeof(uint));
+      uint64_t *cmd = (uint64_t *) cell_batch_alloc(cell, sizeof(uint64_t));
       *cmd = CELL_CMD_FINISH;
    }
 
diff --git a/src/mesa/pipe/cell/ppu/cell_state_emit.c b/src/mesa/pipe/cell/ppu/cell_state_emit.c
index 3b2670f786..5d2a786449 100644
--- a/src/mesa/pipe/cell/ppu/cell_state_emit.c
+++ b/src/mesa/pipe/cell/ppu/cell_state_emit.c
@@ -37,7 +37,8 @@ static void
 emit_state_cmd(struct cell_context *cell, uint cmd,
                const void *state, uint state_size)
 {
-   uint *dst = (uint *) cell_batch_alloc(cell, sizeof(uint) + state_size);
+   uint64_t *dst = (uint64_t *) 
+       cell_batch_alloc(cell, ROUNDUP8(sizeof(uint64_t) + state_size));
    *dst = cmd;
    memcpy(dst + 1, state, state_size);
 }
diff --git a/src/mesa/pipe/cell/ppu/cell_vbuf.c b/src/mesa/pipe/cell/ppu/cell_vbuf.c
index e63b34cf52..0fee61821a 100644
--- a/src/mesa/pipe/cell/ppu/cell_vbuf.c
+++ b/src/mesa/pipe/cell/ppu/cell_vbuf.c
@@ -40,7 +40,7 @@
 
 
 /** Allow vertex data to be inlined after RENDER command */
-#define ALLOW_INLINE_VERTS 1
+#define ALLOW_INLINE_VERTS 0
 
 
 /**
@@ -197,7 +197,7 @@ cell_vbuf_draw(struct vbuf_render *vbr,
 
    /* build/insert batch RENDER command */
    {
-      const uint index_bytes = ROUNDUP4(nr_indices * 2);
+      const uint index_bytes = ROUNDUP8(nr_indices * 2);
       const uint vertex_bytes = nr_vertices * 4 * cell->vertex_info.size;
 
       const uint batch_size = sizeof(struct cell_command_render)
diff --git a/src/mesa/pipe/cell/ppu/cell_vertex_shader.c b/src/mesa/pipe/cell/ppu/cell_vertex_shader.c
index aef329a902..80dd500b34 100644
--- a/src/mesa/pipe/cell/ppu/cell_vertex_shader.c
+++ b/src/mesa/pipe/cell/ppu/cell_vertex_shader.c
@@ -52,8 +52,8 @@ cell_vertex_shader_queue_flush(struct draw_context *draw)
    struct cell_context *const cell =
        (struct cell_context *) draw->driver_private;
    struct cell_command_vs *const vs = &cell_global.command[0].vs;
-   unsigned *batch;
-   struct cell_array_info array_info;
+   uint64_t *batch;
+   struct cell_array_info *array_info;
    unsigned i, j;
 
    assert(draw->vs.queue_nr != 0);
@@ -63,17 +63,19 @@ cell_vertex_shader_queue_flush(struct draw_context *draw)
    draw_update_vertex_fetch(draw);
 
    for (i = 0; i < draw->vertex_fetch.nr_attrs; i++) {
-      array_info.opcode = CELL_CMD_STATE_VS_ARRAY_INFO;
-      assert(draw->vertex_fetch.src_ptr[i] != NULL);
-      array_info.base = (uintptr_t) draw->vertex_fetch.src_ptr[i];
-      array_info.attr = i;
-      array_info.pitch = draw->vertex_fetch.pitch[i];
-      array_info.format = draw->vertex_element[i].src_format;
+      batch = cell_batch_alloc(cell, sizeof(batch[0]) + sizeof(*array_info));
+
+      batch[0] = CELL_CMD_STATE_VS_ARRAY_INFO;
 
-      cell_batch_append(cell, & array_info, sizeof(array_info));
+      array_info = (struct cell_array_info *) &batch[1];
+      assert(draw->vertex_fetch.src_ptr[i] != NULL);
+      array_info->base = (uintptr_t) draw->vertex_fetch.src_ptr[i];
+      array_info->attr = i;
+      array_info->pitch = draw->vertex_fetch.pitch[i];
+      array_info->format = draw->vertex_element[i].src_format;
    }
 
-   batch = cell_batch_alloc(cell, sizeof(unsigned)
+   batch = cell_batch_alloc(cell, sizeof(batch[0])
                             + sizeof(struct pipe_viewport_state));
    batch[0] = CELL_CMD_STATE_VIEWPORT;
    (void) memcpy(&batch[1], &draw->viewport,
diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index b0311db1aa..4f126d5e5b 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -31,7 +31,6 @@
 
 #include <stdio.h>
 #include <libmisc.h>
-#include <vec_literal.h>
 
 #include "spu_main.h"
 #include "spu_render.h"
@@ -220,13 +219,13 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
       spu.fb.zsize = 0;
 
    if (spu.fb.color_format == PIPE_FORMAT_A8R8G8B8_UNORM)
-      spu.color_shuffle = VEC_LITERAL(vector unsigned char,
-                                      12, 0, 4, 8, 0, 0, 0, 0, 
-                                      0, 0, 0, 0, 0, 0, 0, 0);
+      spu.color_shuffle = ((vector unsigned char) {
+                              12, 0, 4, 8, 0, 0, 0, 0, 
+                              0, 0, 0, 0, 0, 0, 0, 0});
    else if (spu.fb.color_format == PIPE_FORMAT_B8G8R8A8_UNORM)
-      spu.color_shuffle = VEC_LITERAL(vector unsigned char,
-                                      8, 4, 0, 12, 0, 0, 0, 0, 
-                                      0, 0, 0, 0, 0, 0, 0, 0);
+      spu.color_shuffle = ((vector unsigned char) {
+                              8, 4, 0, 12, 0, 0, 0, 0, 
+                              0, 0, 0, 0, 0, 0, 0, 0});
    else
       ASSERT(0);
 }
@@ -279,16 +278,10 @@ cmd_state_texture(const struct cell_command_texture *texture)
              spu.init.id, texture->start, texture->width, texture->height);
 
    memcpy(&spu.texture, texture, sizeof(*texture));
-   spu.tex_size = VEC_LITERAL(vector float,
-                              spu.texture.width,
-                              spu.texture.height,
-                              0.0,
-                              0.0);
-   spu.tex_size_mask = VEC_LITERAL(vector unsigned int,
-                                   spu.texture.width - 1,
-                                   spu.texture.height - 1,
-                                   0,
-                                   0);
+   spu.tex_size = (vector float)
+      { spu.texture.width, spu.texture.height, 0.0, 0.0};
+   spu.tex_size_mask = (vector unsigned int)
+      { spu.texture.width - 1, spu.texture.height - 1, 0, 0 };
 }
 
 
@@ -341,8 +334,8 @@ cmd_batch(uint opcode)
 {
    const uint buf = (opcode >> 8) & 0xff;
    uint size = (opcode >> 16);
-   uint buffer[CELL_BUFFER_SIZE / 4] ALIGN16_ATTRIB;
-   const uint usize = size / sizeof(uint);
+   uint64_t buffer[CELL_BUFFER_SIZE / 8] ALIGN16_ATTRIB;
+   const unsigned usize = size / sizeof(buffer[0]);
    uint pos;
 
    if (Debug)
@@ -377,7 +370,7 @@ cmd_batch(uint opcode)
             struct cell_command_framebuffer *fb
                = (struct cell_command_framebuffer *) &buffer[pos];
             cmd_state_framebuffer(fb);
-            pos += sizeof(*fb) / 4;
+            pos += sizeof(*fb) / 8;
          }
          break;
       case CELL_CMD_CLEAR_SURFACE:
@@ -385,7 +378,7 @@ cmd_batch(uint opcode)
             struct cell_command_clear_surface *clr
                = (struct cell_command_clear_surface *) &buffer[pos];
             cmd_clear_surface(clr);
-            pos += sizeof(*clr) / 4;
+            pos += sizeof(*clr) / 8;
          }
          break;
       case CELL_CMD_RENDER:
@@ -394,7 +387,7 @@ cmd_batch(uint opcode)
                = (struct cell_command_render *) &buffer[pos];
             uint pos_incr;
             cmd_render(render, &pos_incr);
-            pos += sizeof(*render) / 4 + pos_incr;
+            pos += sizeof(*render) / 8 + ((pos_incr + 1) / 2);
          }
          break;
       case CELL_CMD_RELEASE_VERTS:
@@ -402,8 +395,7 @@ cmd_batch(uint opcode)
             struct cell_command_release_verts *release
                = (struct cell_command_release_verts *) &buffer[pos];
             cmd_release_verts(release);
-            ASSERT(sizeof(*release) == 8);
-            pos += sizeof(*release) / 4;
+            pos += sizeof(*release) / 8;
          }
          break;
       case CELL_CMD_FINISH:
@@ -413,36 +405,36 @@ cmd_batch(uint opcode)
       case CELL_CMD_STATE_BLEND:
          cmd_state_blend((struct pipe_blend_state *)
                                  &buffer[pos+1]);
-         pos += (1 + sizeof(struct pipe_blend_state) / 4);
+         pos += (1 + ROUNDUP8(sizeof(struct pipe_blend_state)) / 8);
          break;
       case CELL_CMD_STATE_DEPTH_STENCIL:
          cmd_state_depth_stencil((struct pipe_depth_stencil_alpha_state *)
                                  &buffer[pos+1]);
-         pos += (1 + sizeof(struct pipe_depth_stencil_alpha_state) / 4);
+         pos += (1 + ROUNDUP8(sizeof(struct pipe_depth_stencil_alpha_state)) / 8);
          break;
       case CELL_CMD_STATE_SAMPLER:
          cmd_state_sampler((struct pipe_sampler_state *) &buffer[pos+1]);
-         pos += (1 + sizeof(struct pipe_sampler_state) / 4);
+         pos += (1 + ROUNDUP8(sizeof(struct pipe_sampler_state)) / 8);
          break;
       case CELL_CMD_STATE_TEXTURE:
          cmd_state_texture((struct cell_command_texture *) &buffer[pos+1]);
-         pos += (1 + sizeof(struct cell_command_texture) / 4);
+         pos += (1 + ROUNDUP8(sizeof(struct cell_command_texture)) / 8);
          break;
       case CELL_CMD_STATE_VERTEX_INFO:
          cmd_state_vertex_info((struct vertex_info *) &buffer[pos+1]);
-         pos += (1 + sizeof(struct vertex_info) / 4);
+         pos += (1 + ROUNDUP8(sizeof(struct vertex_info)) / 8);
          break;
       case CELL_CMD_STATE_VIEWPORT:
          (void) memcpy(& draw.viewport, &buffer[pos+1],
                        sizeof(struct pipe_viewport_state));
-         pos += (1 + sizeof(struct pipe_viewport_state) / 4);
+         pos += (1 + ROUNDUP8(sizeof(struct pipe_viewport_state)) / 8);
          break;
       case CELL_CMD_STATE_VS_ARRAY_INFO:
-         cmd_state_vs_array_info((struct cell_array_info *) &buffer[pos]);
-         pos += (sizeof(struct cell_array_info) / 4);
+         cmd_state_vs_array_info((struct cell_array_info *) &buffer[pos+1]);
+         pos += (1 + ROUNDUP8(sizeof(struct cell_array_info)) / 8);
          break;
       default:
-         printf("SPU %u: bad opcode: 0x%x\n", spu.init.id, buffer[pos]);
+         printf("SPU %u: bad opcode: 0x%llx\n", spu.init.id, buffer[pos]);
          ASSERT(0);
          break;
       }
diff --git a/src/mesa/pipe/cell/spu/spu_vertex_fetch.c b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
index 1e846868e3..5b0f2a6470 100644
--- a/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
+++ b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
@@ -431,9 +431,8 @@ static void generic_vertex_fetch(struct spu_vs_context *draw,
    /* loop over vertex attributes (vertex shader inputs)
     */
    for (attr = 0; attr < nr_attrs; attr++) {
-
-      const unsigned pitch   = draw->vertex_fetch.pitch[attr];
-      const ubyte *src = draw->vertex_fetch.src_ptr[attr];
+      const unsigned pitch = draw->vertex_fetch.pitch[attr];
+      const uint64_t src = draw->vertex_fetch.src_ptr[attr];
       const spu_fetch_func fetch = draw->vertex_fetch.fetch[attr];
       unsigned i;
       float p[4][4];
@@ -447,7 +446,7 @@ static void generic_vertex_fetch(struct spu_vs_context *draw,
        */
       for (i = 0; i < count; i++) {
          uint8_t buffer[32] ALIGN16_ATTRIB;
-         const unsigned long addr = src + (elts[i] * pitch);
+         const uint64_t addr = src + (elts[i] * pitch);
          const unsigned size = ((addr & 0x0f) == 0) ? 16 : 32;
 
          mfc_get(buffer, addr & ~0x0f, size, TAG_VERTEX_BUFFER, 0, 0);
diff --git a/src/mesa/pipe/cell/spu/spu_vertex_shader.h b/src/mesa/pipe/cell/spu/spu_vertex_shader.h
index c52f38fd02..b261ab44a2 100644
--- a/src/mesa/pipe/cell/spu/spu_vertex_shader.h
+++ b/src/mesa/pipe/cell/spu/spu_vertex_shader.h
@@ -16,7 +16,7 @@ struct spu_vs_context {
    struct pipe_viewport_state viewport;
 
    struct {
-      const ubyte *src_ptr[PIPE_ATTRIB_MAX];
+      uint64_t src_ptr[PIPE_ATTRIB_MAX];
       unsigned pitch[PIPE_ATTRIB_MAX];
       enum pipe_format format[PIPE_ATTRIB_MAX];
       unsigned nr_attrs;
-- 
cgit v1.2.3


From 490a7b1c73babd528b6d883471a8636157c5853a Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Fri, 1 Feb 2008 17:12:20 -0800
Subject: Vectorize vertex puller

---
 src/mesa/pipe/cell/spu/spu_vertex_fetch.c  | 186 +++++++++--------------------
 src/mesa/pipe/cell/spu/spu_vertex_shader.h |   4 +-
 2 files changed, 61 insertions(+), 129 deletions(-)

(limited to 'src/mesa/pipe/cell/spu/spu_vertex_fetch.c')

diff --git a/src/mesa/pipe/cell/spu/spu_vertex_fetch.c b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
index 5b0f2a6470..4133fbba17 100644
--- a/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
+++ b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
@@ -42,6 +42,8 @@
 #define DRAW_DBG 0
 
 
+static const vec_float4 defaults = { 0.0, 0.0, 0.0, 1.0 };
+
 /**
  * Fetch a float[4] vertex attribute from memory, doing format/type
  * conversion as needed.
@@ -50,19 +52,16 @@
  * conversion, texture sampling etc.
  */
 #define FETCH_ATTRIB( NAME, SZ, CVT )			\
-static void						\
-fetch_##NAME(const void *ptr, float *attrib)		\
+static qword						\
+fetch_##NAME(const void *ptr)				\
 {							\
-   static const float defaults[4] = { 0,0,0,1 };	\
+   vec_float4 attrib = defaults;			\
    int i;						\
 							\
    for (i = 0; i < SZ; i++) {				\
-      attrib[i] = CVT;					\
-   }							\
-							\
-   for (; i < 4; i++) {					\
-      attrib[i] = defaults[i];				\
+      attrib = spu_insert(CVT, attrib, i);		\
    }							\
+   return (qword) attrib;				\
 }
 
 #define CVT_64_FLOAT   (float) ((double *) ptr)[i]
@@ -309,106 +308,59 @@ static spu_fetch_func get_fetch_func( enum pipe_format format )
 }
 
 
-static void 
-transpose_4x4( float *out, const float *in )
-{
-   /* This can be achieved in 12 sse instructions, plus the final
-    * stores I guess.  This is probably a bit more than that - maybe
-    * 32 or so?
-    */
-   out[0] = in[0];  out[1] = in[4];  out[2] = in[8];   out[3] = in[12];
-   out[4] = in[1];  out[5] = in[5];  out[6] = in[9];   out[7] = in[13];
-   out[8] = in[2];  out[9] = in[6];  out[10] = in[10]; out[11] = in[14];
-   out[12] = in[3]; out[13] = in[7]; out[14] = in[11]; out[15] = in[15];
-}
-
-
-
-static void fetch_xyz_rgb( struct spu_vs_context *draw,
-			   struct spu_exec_machine *machine,
-			   const unsigned *elts,
-			   unsigned count )
+void
+spu_transpose_4x4(qword *out, const qword *in)
 {
-   assert(count <= 4);
-
-//   _mesa_printf("%s\n", __FUNCTION__);
-
-   /* loop over vertex attributes (vertex shader inputs)
-    */
-
-   const unsigned *pitch   = draw->vertex_fetch.pitch;
-   const ubyte **src       = draw->vertex_fetch.src_ptr;
-   int i;
-
-   for (i = 0; i < 4; i++) {
+   static const qword masks[8] = {
       {
-	 const float *in = (const float *)(src[0] + elts[i] * pitch[0]);
-	 float *out = &machine->Inputs[0].xyzw[0].f[i];
-	 out[0] = in[0];
-	 out[4] = in[1];
-	 out[8] = in[2];
- 	 out[12] = 1.0f;
-      }
-
+         0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+      },
       {
-	 const float *in = (const float *)(src[1] + elts[i] * pitch[1]);
-	 float *out = &machine->Inputs[1].xyzw[0].f[i];
-	 out[0] = in[0];
-	 out[4] = in[1];
-	 out[8] = in[2];
- 	 out[12] = 1.0f;
-      }
-   }
-}
-
-
-
-
-static void fetch_xyz_rgb_st( struct spu_vs_context *draw,
-			      struct spu_exec_machine *machine,
-			      const unsigned *elts,
-			      unsigned count )
-{
-   assert(count <= 4);
-
-   /* loop over vertex attributes (vertex shader inputs)
-    */
-
-   const unsigned *pitch   = draw->vertex_fetch.pitch;
-   const ubyte **src       = draw->vertex_fetch.src_ptr;
-   int i;
-
-   for (i = 0; i < 4; i++) {
+         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+         0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+      },
+
+      { 
+         0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17,
+         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+      },
+      { 
+         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+         0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17,
+      },
+
+      { 
+         0x08, 0x09, 0x0a, 0x0b, 0x18, 0x19, 0x1a, 0x1b,
+         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+      },
+      { 
+         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+         0x08, 0x09, 0x0a, 0x0b, 0x18, 0x19, 0x1a, 0x1b,
+      },
+
+      { 
+         0x0c, 0x0d, 0x0e, 0x0f, 0x1c, 0x1d, 0x1e, 0x1f,
+         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+      },
       {
-	 const float *in = (const float *)(src[0] + elts[i] * pitch[0]);
-	 float *out = &machine->Inputs[0].xyzw[0].f[i];
-	 out[0] = in[0];
-	 out[4] = in[1];
-	 out[8] = in[2];
- 	 out[12] = 1.0f;
-      }
+         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+         0x0c, 0x0d, 0x0e, 0x0f, 0x1c, 0x1d, 0x1e, 0x1f,
+      },
+   };
 
-      {
-	 const float *in = (const float *)(src[1] + elts[i] * pitch[1]);
-	 float *out = &machine->Inputs[1].xyzw[0].f[i];
-	 out[0] = in[0];
-	 out[4] = in[1];
-	 out[8] = in[2];
- 	 out[12] = 1.0f;
-      }
+   out[0] = si_shufb(in[0], in[1], masks[0]);
+   out[0] = si_or(out[0], si_shufb(in[2], in[3], masks[1]));
 
-      {
-	 const float *in = (const float *)(src[2] + elts[i] * pitch[2]);
-	 float *out = &machine->Inputs[1].xyzw[0].f[i];
-	 out[0] = in[0];
-	 out[4] = in[1];
-	 out[8] = 0.0f;
- 	 out[12] = 1.0f;
-      }
-   }
-}
+   out[1] = si_shufb(in[0], in[1], masks[2]);
+   out[1] = si_or(out[1], si_shufb(in[2], in[3], masks[3]));
 
+   out[2] = si_shufb(in[0], in[1], masks[4]);
+   out[2] = si_or(out[2], si_shufb(in[2], in[3], masks[5]));
 
+   out[3] = si_shufb(in[0], in[1], masks[6]);
+   out[3] = si_or(out[3], si_shufb(in[2], in[3], masks[7]));
+}
 
 
 /**
@@ -435,7 +387,7 @@ static void generic_vertex_fetch(struct spu_vs_context *draw,
       const uint64_t src = draw->vertex_fetch.src_ptr[attr];
       const spu_fetch_func fetch = draw->vertex_fetch.fetch[attr];
       unsigned i;
-      float p[4][4];
+      qword p[4];
 
 
       /* Fetch four attributes for four vertices.  
@@ -452,17 +404,15 @@ static void generic_vertex_fetch(struct spu_vs_context *draw,
          mfc_get(buffer, addr & ~0x0f, size, TAG_VERTEX_BUFFER, 0, 0);
          wait_on_mask(1 << TAG_VERTEX_BUFFER);
 
-         memmove(& buffer, buffer + (addr & 0x0f), 16);
-
-         fetch(buffer, p[i]);
+         p[i] = (*fetch)(buffer + (addr & 0x0f));
       }
 
       /* Be nice and zero out any missing vertices: 
        */
       for (/* empty */; i < 4; i++) 
-          p[i][0] = p[i][1] = p[i][2] = p[i][3] = 0;
-      
-      /* Transpose/swizzle into sse-friendly format.  Currently
+          p[i] = si_xor(p[i], p[i]);
+
+      /* Transpose/swizzle into vector-friendly format.  Currently
        * assuming that all vertex shader inputs are float[4], but this
        * isn't true -- if the vertex shader only wants tex0.xy, we
        * could optimize for that.
@@ -471,7 +421,7 @@ static void generic_vertex_fetch(struct spu_vs_context *draw,
        * excessive number of fetch functions, but we could at least
        * minimize the transpose step:
        */
-      transpose_4x4( (float *)&machine->Inputs[attr].xyzw[0].f[0], (float *)p );
+      spu_transpose_4x4(&machine->Inputs[attr].xyzw[0].q, p);
    }
 }
 
@@ -487,24 +437,4 @@ void spu_update_vertex_fetch( struct spu_vs_context *draw )
    }
 
    draw->vertex_fetch.fetch_func = generic_vertex_fetch;
-
-   /* Disable the fast path because they don't use mfc_get yet.
-    */
-#if 0
-   switch (draw->vertex_fetch.nr_attrs) {
-   case 2:
-      if (draw->vertex_fetch.format[0] == PIPE_FORMAT_R32G32B32_FLOAT &&
-          draw->vertex_fetch.format[1] == PIPE_FORMAT_R32G32B32_FLOAT)
-          draw->vertex_fetch.fetch_func = fetch_xyz_rgb;
-      break;
-   case 3:
-      if (draw->vertex_fetch.format[0] == PIPE_FORMAT_R32G32B32_FLOAT &&
-          draw->vertex_fetch.format[1] == PIPE_FORMAT_R32G32B32_FLOAT &&
-          draw->vertex_fetch.format[2] == PIPE_FORMAT_R32G32_FLOAT)
-          draw->vertex_fetch.fetch_func = fetch_xyz_rgb_st;
-      break;
-   default:
-      break;
-   }
-#endif
 }
diff --git a/src/mesa/pipe/cell/spu/spu_vertex_shader.h b/src/mesa/pipe/cell/spu/spu_vertex_shader.h
index b261ab44a2..2435b7ddae 100644
--- a/src/mesa/pipe/cell/spu/spu_vertex_shader.h
+++ b/src/mesa/pipe/cell/spu/spu_vertex_shader.h
@@ -6,7 +6,7 @@
 
 struct spu_vs_context;
 
-typedef void (*spu_fetch_func)(const void *ptr, float *attrib);
+typedef qword (*spu_fetch_func)(const void *ptr);
 typedef void (*spu_full_fetch_func)( struct spu_vs_context *draw,
 				     struct spu_exec_machine *machine,
 				     const unsigned *elts,
@@ -39,6 +39,8 @@ struct spu_vs_context {
 
 extern void spu_update_vertex_fetch(struct spu_vs_context *draw);
 
+extern void spu_transpose_4x4(qword *out, const qword *in);
+
 static INLINE void spu_vertex_fetch(struct spu_vs_context *draw,
 				    struct spu_exec_machine *machine,
 				    const unsigned *elts,
-- 
cgit v1.2.3


From 45f4125fa83c4e43a01d44cb8eb2a4c97b72181f Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Mon, 4 Feb 2008 16:03:55 -0800
Subject: Add some debug messages

---
 src/mesa/pipe/cell/spu/spu_vertex_fetch.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'src/mesa/pipe/cell/spu/spu_vertex_fetch.c')

diff --git a/src/mesa/pipe/cell/spu/spu_vertex_fetch.c b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
index 4133fbba17..cfa449e813 100644
--- a/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
+++ b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
@@ -378,7 +378,10 @@ static void generic_vertex_fetch(struct spu_vs_context *draw,
 
    wait_on_mask(1 << TAG_VERTEX_BUFFER);
 
-//   _mesa_printf("%s %d\n", __FUNCTION__, count);
+#if DRAW_DBG
+   printf("SPU: %s count = %u, nr_attrs = %u\n", 
+          __FUNCTION__, count, nr_attrs);
+#endif
 
    /* loop over vertex attributes (vertex shader inputs)
     */
@@ -401,6 +404,9 @@ static void generic_vertex_fetch(struct spu_vs_context *draw,
          const uint64_t addr = src + (elts[i] * pitch);
          const unsigned size = ((addr & 0x0f) == 0) ? 16 : 32;
 
+#if DRAW_DBG
+         printf("SPU: fetching = 0x%llx\n", addr);
+#endif
          mfc_get(buffer, addr & ~0x0f, size, TAG_VERTEX_BUFFER, 0, 0);
          wait_on_mask(1 << TAG_VERTEX_BUFFER);
 
-- 
cgit v1.2.3


From c9f98142b6a47825c49aea72a79c1be62c2b7d89 Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Tue, 5 Feb 2008 09:43:52 -0800
Subject: Use _transpose_matrix4x4 from Cell SDK instead of my own version

---
 src/mesa/pipe/cell/spu/spu_exec.c          |  3 +-
 src/mesa/pipe/cell/spu/spu_vertex_fetch.c  | 59 ++----------------------------
 src/mesa/pipe/cell/spu/spu_vertex_shader.h |  2 -
 3 files changed, 5 insertions(+), 59 deletions(-)

(limited to 'src/mesa/pipe/cell/spu/spu_vertex_fetch.c')

diff --git a/src/mesa/pipe/cell/spu/spu_exec.c b/src/mesa/pipe/cell/spu/spu_exec.c
index 1bd8687d41..e51008b9b3 100644
--- a/src/mesa/pipe/cell/spu/spu_exec.c
+++ b/src/mesa/pipe/cell/spu/spu_exec.c
@@ -52,6 +52,7 @@
 
 #include <libmisc.h>
 #include <spu_mfcio.h>
+#include <transpose_matrix4x4.h>
 #include <simdmath/ceilf4.h>
 #include <simdmath/cosf4.h>
 #include <simdmath/divf4.h>
@@ -664,7 +665,7 @@ fetch_texel( struct spu_sampler *sampler,
 
    sampler->get_samples(sampler, s->f, t->f, p->f, lodbias, (float *) rgba);
 
-   spu_transpose_4x4(out, rgba);
+   _transpose_matrix4x4(out, rgba);
    r->q = out[0];
    g->q = out[1];
    b->q = out[2];
diff --git a/src/mesa/pipe/cell/spu/spu_vertex_fetch.c b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
index cfa449e813..6e86a919ce 100644
--- a/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
+++ b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
@@ -31,6 +31,8 @@
   */
 
 #include <spu_mfcio.h>
+#include <transpose_matrix4x4.h>
+
 #include "pipe/p_util.h"
 #include "pipe/p_state.h"
 #include "pipe/p_shader_tokens.h"
@@ -308,61 +310,6 @@ static spu_fetch_func get_fetch_func( enum pipe_format format )
 }
 
 
-void
-spu_transpose_4x4(qword *out, const qword *in)
-{
-   static const qword masks[8] = {
-      {
-         0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
-         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-      },
-      {
-         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-         0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
-      },
-
-      { 
-         0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17,
-         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-      },
-      { 
-         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-         0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17,
-      },
-
-      { 
-         0x08, 0x09, 0x0a, 0x0b, 0x18, 0x19, 0x1a, 0x1b,
-         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-      },
-      { 
-         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-         0x08, 0x09, 0x0a, 0x0b, 0x18, 0x19, 0x1a, 0x1b,
-      },
-
-      { 
-         0x0c, 0x0d, 0x0e, 0x0f, 0x1c, 0x1d, 0x1e, 0x1f,
-         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-      },
-      {
-         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-         0x0c, 0x0d, 0x0e, 0x0f, 0x1c, 0x1d, 0x1e, 0x1f,
-      },
-   };
-
-   out[0] = si_shufb(in[0], in[1], masks[0]);
-   out[0] = si_or(out[0], si_shufb(in[2], in[3], masks[1]));
-
-   out[1] = si_shufb(in[0], in[1], masks[2]);
-   out[1] = si_or(out[1], si_shufb(in[2], in[3], masks[3]));
-
-   out[2] = si_shufb(in[0], in[1], masks[4]);
-   out[2] = si_or(out[2], si_shufb(in[2], in[3], masks[5]));
-
-   out[3] = si_shufb(in[0], in[1], masks[6]);
-   out[3] = si_or(out[3], si_shufb(in[2], in[3], masks[7]));
-}
-
-
 /**
  * Fetch vertex attributes for 'count' vertices.
  */
@@ -427,7 +374,7 @@ static void generic_vertex_fetch(struct spu_vs_context *draw,
        * excessive number of fetch functions, but we could at least
        * minimize the transpose step:
        */
-      spu_transpose_4x4(&machine->Inputs[attr].xyzw[0].q, p);
+      _transpose_matrix4x4(&machine->Inputs[attr].xyzw[0].q, p);
    }
 }
 
diff --git a/src/mesa/pipe/cell/spu/spu_vertex_shader.h b/src/mesa/pipe/cell/spu/spu_vertex_shader.h
index 2435b7ddae..c96b93ff0a 100644
--- a/src/mesa/pipe/cell/spu/spu_vertex_shader.h
+++ b/src/mesa/pipe/cell/spu/spu_vertex_shader.h
@@ -39,8 +39,6 @@ struct spu_vs_context {
 
 extern void spu_update_vertex_fetch(struct spu_vs_context *draw);
 
-extern void spu_transpose_4x4(qword *out, const qword *in);
-
 static INLINE void spu_vertex_fetch(struct spu_vs_context *draw,
 				    struct spu_exec_machine *machine,
 				    const unsigned *elts,
-- 
cgit v1.2.3