From 3320b1874e810583f95b93a89697b2955987b84f Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Fri, 15 Feb 2008 11:03:54 -0800
Subject: Cell: Enable code gen for SPE attribute fetch

Doubles are still unsupported.
---
 src/gallium/drivers/cell/common.h                 |  15 +-
 src/gallium/drivers/cell/ppu/Makefile             |   1 +
 src/gallium/drivers/cell/ppu/cell_context.h       |   4 +
 src/gallium/drivers/cell/ppu/cell_vertex_fetch.c  |  15 +-
 src/gallium/drivers/cell/ppu/cell_vertex_shader.c |  21 +-
 src/gallium/drivers/cell/spu/spu_main.c           |  22 +-
 src/gallium/drivers/cell/spu/spu_vertex_fetch.c   | 477 +---------------------
 src/gallium/drivers/cell/spu/spu_vertex_shader.h  |   6 +-
 8 files changed, 71 insertions(+), 490 deletions(-)

(limited to 'src')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index 4de514c358..74b131fbef 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -90,6 +90,7 @@
 #define CELL_CMD_STATE_VS_ARRAY_INFO 16
 #define CELL_CMD_STATE_BLEND         17
 #define CELL_CMD_VS_EXECUTE          18
+#define CELL_CMD_STATE_ATTRIB_FETCH  19
 
 
 #define CELL_NUM_BUFFERS 4
@@ -128,13 +129,19 @@ struct cell_command_clear_surface
  */
 struct cell_array_info
 {
-    uint64_t base;      /**< Base address of the 0th element. */
-    uint attr;          /**< Attribute that this state is for. */
-    uint pitch;         /**< Byte pitch from one entry to the next. */
-    uint format;        /**< Pipe format of each entry. */
+   uint64_t base;      /**< Base address of the 0th element. */
+   uint attr;          /**< Attribute that this state is for. */
+   uint pitch;         /**< Byte pitch from one entry to the next. */
+   uint size;
+   uint function_offset;
 } ALIGN16_ATTRIB;
 
 
+struct cell_attribute_fetch_code {
+   uint64_t base;
+   uint size;
+};
+
 struct cell_shader_info
 {
    unsigned num_outputs;
diff --git a/src/gallium/drivers/cell/ppu/Makefile b/src/gallium/drivers/cell/ppu/Makefile
index a4c3f29e8a..196ab777f5 100644
--- a/src/gallium/drivers/cell/ppu/Makefile
+++ b/src/gallium/drivers/cell/ppu/Makefile
@@ -34,6 +34,7 @@ SOURCES = \
 	cell_surface.c \
 	cell_texture.c \
 	cell_vbuf.c \
+	cell_vertex_fetch.c \
 	cell_vertex_shader.c \
 	cell_winsys.c
 
diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h
index 6196c0c72f..91f8e542a2 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.h
+++ b/src/gallium/drivers/cell/ppu/cell_context.h
@@ -36,6 +36,7 @@
 #include "draw/draw_vbuf.h"
 #include "cell_winsys.h"
 #include "cell/common.h"
+#include "ppc/rtasm/spe_asm.h"
 
 
 struct cell_vbuf_render;
@@ -111,6 +112,9 @@ struct cell_context
    /** [4] to ensure 16-byte alignment for each status word */
    uint buffer_status[CELL_MAX_SPUS][CELL_NUM_BUFFERS][4] ALIGN16_ATTRIB;
 
+
+   struct spe_function attrib_fetch;
+   unsigned attrib_fetch_offsets[PIPE_ATTRIB_MAX];
 };
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
index f2432f4ff5..f10689a959 100644
--- a/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
+++ b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
@@ -27,10 +27,10 @@
 #include "pipe/p_context.h"
 #include "pipe/p_format.h"
 
-#include "pipe/draw/draw_context.h"
-#include "pipe/draw/draw_private.h"
+#include "../auxiliary/draw/draw_context.h"
+#include "../auxiliary/draw/draw_private.h"
 
-#include "pipe/cell/ppu/cell_context.h"
+#include "cell_context.h"
 #include "ppc/rtasm/spe_asm.h"
 
 typedef uint64_t register_mask;
@@ -380,13 +380,4 @@ void cell_update_vertex_fetch(struct draw_context *draw)
 	     cell->attrib_fetch_offsets[function_index[i]];
       }
    }
-   
-   static first_time = 1;
-   if (first_time) {
-      first_time = 0;
-      const unsigned instructions = p->csr - p->store;
-      for (i = 0; i < instructions; i++) {
-	 printf("\t.long\t0x%08x\n", p->store[i]);
-      }
-   }
 }
diff --git a/src/gallium/drivers/cell/ppu/cell_vertex_shader.c b/src/gallium/drivers/cell/ppu/cell_vertex_shader.c
index 0ba4506505..6a1d3bc20a 100644
--- a/src/gallium/drivers/cell/ppu/cell_vertex_shader.c
+++ b/src/gallium/drivers/cell/ppu/cell_vertex_shader.c
@@ -55,14 +55,32 @@ cell_vertex_shader_queue_flush(struct draw_context *draw)
    uint64_t *batch;
    struct cell_array_info *array_info;
    unsigned i, j;
+   struct cell_attribute_fetch_code *cf;
 
    assert(draw->vs.queue_nr != 0);
 
    /* XXX: do this on statechange: 
     */
    draw_update_vertex_fetch(draw);
+   cell_update_vertex_fetch(draw);
+
+
+   batch = cell_batch_alloc(cell, sizeof(batch[0]) + sizeof(*cf));
+   batch[0] = CELL_CMD_STATE_ATTRIB_FETCH;
+   cf = (struct cell_attribute_fetch_code *) (&batch[1]);
+   cf->base = cell->attrib_fetch.store;
+   cf->size = ROUNDUP16((unsigned)((void *) cell->attrib_fetch.csr 
+				   - (void *) cell->attrib_fetch.store));
+
 
    for (i = 0; i < draw->vertex_fetch.nr_attrs; i++) {
+      const enum pipe_format format = draw->vertex_element[i].src_format;
+      const unsigned count = ((pf_size_x(format) != 0)
+			      + (pf_size_y(format) != 0)
+			      + (pf_size_z(format) != 0)
+			      + (pf_size_w(format) != 0));
+      const unsigned size = pf_size_x(format) * count;
+
       batch = cell_batch_alloc(cell, sizeof(batch[0]) + sizeof(*array_info));
 
       batch[0] = CELL_CMD_STATE_VS_ARRAY_INFO;
@@ -72,7 +90,8 @@ cell_vertex_shader_queue_flush(struct draw_context *draw)
       array_info->base = (uintptr_t) draw->vertex_fetch.src_ptr[i];
       array_info->attr = i;
       array_info->pitch = draw->vertex_fetch.pitch[i];
-      array_info->format = draw->vertex_element[i].src_format;
+      array_info->size = size;
+      array_info->function_offset = cell->attrib_fetch_offsets[i];
    }
 
    batch = cell_batch_alloc(cell, sizeof(batch[0])
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index 1e7243b863..fcbf0f841e 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -54,6 +54,9 @@ struct spu_global spu;
 
 struct spu_vs_context draw;
 
+static unsigned char attribute_fetch_code_buffer[136 * PIPE_ATTRIB_MAX]
+    ALIGN16_ATTRIB;
+
 /**
  * Tell the PPU that this SPU has finished copying a buffer to
  * local store and that it may be reused by the PPU.
@@ -306,7 +309,8 @@ cmd_state_vs_array_info(const struct cell_array_info *vs_info)
    ASSERT(attr < PIPE_ATTRIB_MAX);
    draw.vertex_fetch.src_ptr[attr] = vs_info->base;
    draw.vertex_fetch.pitch[attr] = vs_info->pitch;
-   draw.vertex_fetch.format[attr] = vs_info->format;
+   draw.vertex_fetch.size[attr] = vs_info->size;
+   draw.vertex_fetch.code_offset[attr] = vs_info->function_offset;
    draw.vertex_fetch.dirty = 1;
 }
 
@@ -433,6 +437,22 @@ cmd_batch(uint opcode)
          cmd_state_vs_array_info((struct cell_array_info *) &buffer[pos+1]);
          pos += (1 + ROUNDUP8(sizeof(struct cell_array_info)) / 8);
          break;
+      case CELL_CMD_STATE_ATTRIB_FETCH: {
+         struct cell_attribute_fetch_code *code =
+             (struct cell_attribute_fetch_code *) &buffer[pos+1];
+
+              mfc_get(attribute_fetch_code_buffer,
+                      (unsigned int) code->base,  /* src */
+                      code->size,
+                      TAG_BATCH_BUFFER,
+                      0, /* tid */
+                      0  /* rid */);
+         wait_on_mask(1 << TAG_BATCH_BUFFER);
+
+         draw.vertex_fetch.code = attribute_fetch_code_buffer;
+         pos += (1 + ROUNDUP8(sizeof(struct cell_attribute_fetch_code)) / 8);
+         break;
+      }
       default:
          printf("SPU %u: bad opcode: 0x%llx\n", spu.init.id, buffer[pos]);
          ASSERT(0);
diff --git a/src/gallium/drivers/cell/spu/spu_vertex_fetch.c b/src/gallium/drivers/cell/spu/spu_vertex_fetch.c
index 45e3c26c00..55c6c28717 100644
--- a/src/gallium/drivers/cell/spu/spu_vertex_fetch.c
+++ b/src/gallium/drivers/cell/spu/spu_vertex_fetch.c
@@ -1,6 +1,7 @@
 /**************************************************************************
  * 
  * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * (C) Copyright IBM Corporation 2008
  * All Rights Reserved.
  * 
  * Permission is hereby granted, free of charge, to any person obtaining a
@@ -28,10 +29,10 @@
  /*
   * Authors:
   *   Keith Whitwell <keith@tungstengraphics.com>
+  *   Ian Romanick <idr@us.ibm.com>
   */
 
 #include <spu_mfcio.h>
-#include <transpose_matrix4x4.h>
 
 #include "pipe/p_util.h"
 #include "pipe/p_state.h"
@@ -59,6 +60,10 @@
 
 #define DRAW_DBG 0
 
+typedef void (*spu_fetch_func)(qword *out, const qword *in,
+			       const qword *shuffle_data);
+
+
 static const qword fetch_shuffle_data[] = {
    /* Shuffle used by CVT_64_FLOAT
     */
@@ -97,22 +102,6 @@ static const qword fetch_shuffle_data[] = {
 };
 
 
-static INLINE void
-trans4x4(qword row0, qword row1, qword row2, qword row3, qword *out,
-         const qword *shuffle)
-{
-   qword t1 = si_shufb(row0, row2, shuffle[3]);
-   qword t2 = si_shufb(row0, row2, shuffle[4]);
-   qword t3 = si_shufb(row1, row3, shuffle[3]);
-   qword t4 = si_shufb(row1, row3, shuffle[4]);
-
-   out[0] = si_shufb(t1, t3, shuffle[3]);
-   out[1] = si_shufb(t1, t3, shuffle[4]);
-   out[2] = si_shufb(t2, t4, shuffle[3]);
-   out[3] = si_shufb(t2, t4, shuffle[4]);
-}
-
-
 /**
  * Fetch between 1 and 32 bytes from an unaligned address
  */
@@ -151,446 +140,6 @@ fetch_unaligned(qword *dst, unsigned ea, unsigned size)
 }
 
 
-#define CVT_32_FLOAT(q, s)    (*(q))
-
-static INLINE qword
-CVT_64_FLOAT(const qword *qw, const qword *shuffle)
-{
-   qword a = si_frds(qw[0]);
-   qword b = si_frds(si_rotqbyi(qw[0], 8));
-   qword c = si_frds(qw[1]);
-   qword d = si_frds(si_rotqbyi(qw[1], 8));
-
-   qword ab = si_shufb(a, b, shuffle[0]);
-   qword cd = si_shufb(c, d, si_rotqbyi(shuffle[0], 8));
-   
-   return si_or(ab, cd);
-}
-
-
-static INLINE qword
-CVT_8_USCALED(const qword *qw, const qword *shuffle)
-{
-   return si_cuflt(si_shufb(*qw, *qw, shuffle[1]), 0);
-}
-
-
-static INLINE qword
-CVT_16_USCALED(const qword *qw, const qword *shuffle)
-{
-   return si_cuflt(si_shufb(*qw, *qw, shuffle[2]), 0);
-}
-
-
-static INLINE qword
-CVT_32_USCALED(const qword *qw, const qword *shuffle)
-{
-   (void) shuffle;
-   return si_cuflt(*qw, 0);
-}
-
-static INLINE qword
-CVT_8_SSCALED(const qword *qw, const qword *shuffle)
-{
-   return si_csflt(si_shufb(*qw, *qw, shuffle[1]), 0);
-}
-
-
-static INLINE qword
-CVT_16_SSCALED(const qword *qw, const qword *shuffle)
-{
-   return si_csflt(si_shufb(*qw, *qw, shuffle[2]), 0);
-}
-
-
-static INLINE qword
-CVT_32_SSCALED(const qword *qw, const qword *shuffle)
-{
-   (void) shuffle;
-   return si_csflt(*qw, 0);
-}
-
-
-static INLINE qword
-CVT_8_UNORM(const qword *qw, const qword *shuffle)
-{
-   const qword scale = (qword) spu_splats(1.0f / 255.0f);
-   return si_fm(CVT_8_USCALED(qw, shuffle), scale);
-}
-
-
-static INLINE qword
-CVT_16_UNORM(const qword *qw, const qword *shuffle)
-{
-   const qword scale = (qword) spu_splats(1.0f / 65535.0f);
-   return si_fm(CVT_16_USCALED(qw, shuffle), scale);
-}
-
-
-static INLINE qword
-CVT_32_UNORM(const qword *qw, const qword *shuffle)
-{
-   const qword scale = (qword) spu_splats(1.0f / 4294967295.0f);
-   return si_fm(CVT_32_USCALED(qw, shuffle), scale);
-}
-
-
-static INLINE qword
-CVT_8_SNORM(const qword *qw, const qword *shuffle)
-{
-   const qword scale = (qword) spu_splats(1.0f / 127.0f);
-   return si_fm(CVT_8_SSCALED(qw, shuffle), scale);
-}
-
-
-static INLINE qword
-CVT_16_SNORM(const qword *qw, const qword *shuffle)
-{
-   const qword scale = (qword) spu_splats(1.0f / 32767.0f);
-   return si_fm(CVT_16_SSCALED(qw, shuffle), scale);
-}
-
-
-static INLINE qword
-CVT_32_SNORM(const qword *qw, const qword *shuffle)
-{
-   const qword scale = (qword) spu_splats(1.0f / 2147483647.0f);
-   return si_fm(CVT_32_SSCALED(qw, shuffle), scale);
-}
-
-#define SZ_4 si_il(0U)
-#define SZ_3 si_fsmbi(0x000f)
-#define SZ_2 si_fsmbi(0x00ff)
-#define SZ_1 si_fsmbi(0x0fff)
-
-/**
- * Fetch a float[4] vertex attribute from memory, doing format/type
- * conversion as needed.
- *
- * This is probably needed/dupliocated elsewhere, eg format
- * conversion, texture sampling etc.
- */
-#define FETCH_ATTRIB( NAME, SZ, CVT, N )			\
-static void							\
-fetch_##NAME(qword *out, const qword *in, qword defaults, \
-                const qword *shuffle)	\
-{								\
-   qword tmp[4];						\
-								\
-   tmp[0] = si_selb(CVT(in + (0 * N), shuffle), defaults, SZ);		\
-   tmp[1] = si_selb(CVT(in + (1 * N), shuffle), defaults, SZ);		\
-   tmp[2] = si_selb(CVT(in + (2 * N), shuffle), defaults, SZ);		\
-   tmp[3] = si_selb(CVT(in + (3 * N), shuffle), defaults, SZ);		\
-   trans4x4(tmp[0], tmp[1], tmp[2], tmp[3], out, shuffle);		\
-}
-
-
-FETCH_ATTRIB( R64G64B64A64_FLOAT,   SZ_4, CVT_64_FLOAT, 2 )
-FETCH_ATTRIB( R64G64B64_FLOAT,      SZ_3, CVT_64_FLOAT, 2 )
-FETCH_ATTRIB( R64G64_FLOAT,         SZ_2, CVT_64_FLOAT, 2 )
-FETCH_ATTRIB( R64_FLOAT,            SZ_1, CVT_64_FLOAT, 2 )
-
-FETCH_ATTRIB( R32G32B32A32_FLOAT,   SZ_4, CVT_32_FLOAT, 1 )
-FETCH_ATTRIB( R32G32B32_FLOAT,      SZ_3, CVT_32_FLOAT, 1 )
-FETCH_ATTRIB( R32G32_FLOAT,         SZ_2, CVT_32_FLOAT, 1 )
-FETCH_ATTRIB( R32_FLOAT,            SZ_1, CVT_32_FLOAT, 1 )
-
-FETCH_ATTRIB( R32G32B32A32_USCALED, SZ_4, CVT_32_USCALED, 1 )
-FETCH_ATTRIB( R32G32B32_USCALED,    SZ_3, CVT_32_USCALED, 1 )
-FETCH_ATTRIB( R32G32_USCALED,       SZ_2, CVT_32_USCALED, 1 )
-FETCH_ATTRIB( R32_USCALED,          SZ_1, CVT_32_USCALED, 1 )
-
-FETCH_ATTRIB( R32G32B32A32_SSCALED, SZ_4, CVT_32_SSCALED, 1 )
-FETCH_ATTRIB( R32G32B32_SSCALED,    SZ_3, CVT_32_SSCALED, 1 )
-FETCH_ATTRIB( R32G32_SSCALED,       SZ_2, CVT_32_SSCALED, 1 )
-FETCH_ATTRIB( R32_SSCALED,          SZ_1, CVT_32_SSCALED, 1 )
-
-FETCH_ATTRIB( R32G32B32A32_UNORM, SZ_4, CVT_32_UNORM, 1 )
-FETCH_ATTRIB( R32G32B32_UNORM,    SZ_3, CVT_32_UNORM, 1 )
-FETCH_ATTRIB( R32G32_UNORM,       SZ_2, CVT_32_UNORM, 1 )
-FETCH_ATTRIB( R32_UNORM,          SZ_1, CVT_32_UNORM, 1 )
-
-FETCH_ATTRIB( R32G32B32A32_SNORM, SZ_4, CVT_32_SNORM, 1 )
-FETCH_ATTRIB( R32G32B32_SNORM,    SZ_3, CVT_32_SNORM, 1 )
-FETCH_ATTRIB( R32G32_SNORM,       SZ_2, CVT_32_SNORM, 1 )
-FETCH_ATTRIB( R32_SNORM,          SZ_1, CVT_32_SNORM, 1 )
-
-FETCH_ATTRIB( R16G16B16A16_USCALED, SZ_4, CVT_16_USCALED, 1 )
-FETCH_ATTRIB( R16G16B16_USCALED,    SZ_3, CVT_16_USCALED, 1 )
-FETCH_ATTRIB( R16G16_USCALED,       SZ_2, CVT_16_USCALED, 1 )
-FETCH_ATTRIB( R16_USCALED,          SZ_1, CVT_16_USCALED, 1 )
-
-FETCH_ATTRIB( R16G16B16A16_SSCALED, SZ_4, CVT_16_SSCALED, 1 )
-FETCH_ATTRIB( R16G16B16_SSCALED,    SZ_3, CVT_16_SSCALED, 1 )
-FETCH_ATTRIB( R16G16_SSCALED,       SZ_2, CVT_16_SSCALED, 1 )
-FETCH_ATTRIB( R16_SSCALED,          SZ_1, CVT_16_SSCALED, 1 )
-
-FETCH_ATTRIB( R16G16B16A16_UNORM, SZ_4, CVT_16_UNORM, 1 )
-FETCH_ATTRIB( R16G16B16_UNORM,    SZ_3, CVT_16_UNORM, 1 )
-FETCH_ATTRIB( R16G16_UNORM,       SZ_2, CVT_16_UNORM, 1 )
-FETCH_ATTRIB( R16_UNORM,          SZ_1, CVT_16_UNORM, 1 )
-
-FETCH_ATTRIB( R16G16B16A16_SNORM, SZ_4, CVT_16_SNORM, 1 )
-FETCH_ATTRIB( R16G16B16_SNORM,    SZ_3, CVT_16_SNORM, 1 )
-FETCH_ATTRIB( R16G16_SNORM,       SZ_2, CVT_16_SNORM, 1 )
-FETCH_ATTRIB( R16_SNORM,          SZ_1, CVT_16_SNORM, 1 )
-
-FETCH_ATTRIB( R8G8B8A8_USCALED,   SZ_4, CVT_8_USCALED, 1 )
-FETCH_ATTRIB( R8G8B8_USCALED,     SZ_3, CVT_8_USCALED, 1 )
-FETCH_ATTRIB( R8G8_USCALED,       SZ_2, CVT_8_USCALED, 1 )
-FETCH_ATTRIB( R8_USCALED,         SZ_1, CVT_8_USCALED, 1 )
-
-FETCH_ATTRIB( R8G8B8A8_SSCALED,  SZ_4, CVT_8_SSCALED, 1 )
-FETCH_ATTRIB( R8G8B8_SSCALED,    SZ_3, CVT_8_SSCALED, 1 )
-FETCH_ATTRIB( R8G8_SSCALED,      SZ_2, CVT_8_SSCALED, 1 )
-FETCH_ATTRIB( R8_SSCALED,        SZ_1, CVT_8_SSCALED, 1 )
-
-FETCH_ATTRIB( R8G8B8A8_UNORM,  SZ_4, CVT_8_UNORM, 1 )
-FETCH_ATTRIB( R8G8B8_UNORM,    SZ_3, CVT_8_UNORM, 1 )
-FETCH_ATTRIB( R8G8_UNORM,      SZ_2, CVT_8_UNORM, 1 )
-FETCH_ATTRIB( R8_UNORM,        SZ_1, CVT_8_UNORM, 1 )
-
-FETCH_ATTRIB( R8G8B8A8_SNORM,  SZ_4, CVT_8_SNORM, 1 )
-FETCH_ATTRIB( R8G8B8_SNORM,    SZ_3, CVT_8_SNORM, 1 )
-FETCH_ATTRIB( R8G8_SNORM,      SZ_2, CVT_8_SNORM, 1 )
-FETCH_ATTRIB( R8_SNORM,        SZ_1, CVT_8_SNORM, 1 )
-
-FETCH_ATTRIB( A8R8G8B8_UNORM,       SZ_4, CVT_8_UNORM, 1 )
-
-
-
-static spu_fetch_func get_fetch_func( enum pipe_format format )
-{
-   switch (format) {
-   case PIPE_FORMAT_R64_FLOAT:
-      return fetch_R64_FLOAT;
-   case PIPE_FORMAT_R64G64_FLOAT:
-      return fetch_R64G64_FLOAT;
-   case PIPE_FORMAT_R64G64B64_FLOAT:
-      return fetch_R64G64B64_FLOAT;
-   case PIPE_FORMAT_R64G64B64A64_FLOAT:
-      return fetch_R64G64B64A64_FLOAT;
-
-   case PIPE_FORMAT_R32_FLOAT:
-      return fetch_R32_FLOAT;
-   case PIPE_FORMAT_R32G32_FLOAT:
-      return fetch_R32G32_FLOAT;
-   case PIPE_FORMAT_R32G32B32_FLOAT:
-      return fetch_R32G32B32_FLOAT;
-   case PIPE_FORMAT_R32G32B32A32_FLOAT:
-      return fetch_R32G32B32A32_FLOAT;
-
-   case PIPE_FORMAT_R32_UNORM:
-      return fetch_R32_UNORM;
-   case PIPE_FORMAT_R32G32_UNORM:
-      return fetch_R32G32_UNORM;
-   case PIPE_FORMAT_R32G32B32_UNORM:
-      return fetch_R32G32B32_UNORM;
-   case PIPE_FORMAT_R32G32B32A32_UNORM:
-      return fetch_R32G32B32A32_UNORM;
-
-   case PIPE_FORMAT_R32_USCALED:
-      return fetch_R32_USCALED;
-   case PIPE_FORMAT_R32G32_USCALED:
-      return fetch_R32G32_USCALED;
-   case PIPE_FORMAT_R32G32B32_USCALED:
-      return fetch_R32G32B32_USCALED;
-   case PIPE_FORMAT_R32G32B32A32_USCALED:
-      return fetch_R32G32B32A32_USCALED;
-
-   case PIPE_FORMAT_R32_SNORM:
-      return fetch_R32_SNORM;
-   case PIPE_FORMAT_R32G32_SNORM:
-      return fetch_R32G32_SNORM;
-   case PIPE_FORMAT_R32G32B32_SNORM:
-      return fetch_R32G32B32_SNORM;
-   case PIPE_FORMAT_R32G32B32A32_SNORM:
-      return fetch_R32G32B32A32_SNORM;
-
-   case PIPE_FORMAT_R32_SSCALED:
-      return fetch_R32_SSCALED;
-   case PIPE_FORMAT_R32G32_SSCALED:
-      return fetch_R32G32_SSCALED;
-   case PIPE_FORMAT_R32G32B32_SSCALED:
-      return fetch_R32G32B32_SSCALED;
-   case PIPE_FORMAT_R32G32B32A32_SSCALED:
-      return fetch_R32G32B32A32_SSCALED;
-
-   case PIPE_FORMAT_R16_UNORM:
-      return fetch_R16_UNORM;
-   case PIPE_FORMAT_R16G16_UNORM:
-      return fetch_R16G16_UNORM;
-   case PIPE_FORMAT_R16G16B16_UNORM:
-      return fetch_R16G16B16_UNORM;
-   case PIPE_FORMAT_R16G16B16A16_UNORM:
-      return fetch_R16G16B16A16_UNORM;
-
-   case PIPE_FORMAT_R16_USCALED:
-      return fetch_R16_USCALED;
-   case PIPE_FORMAT_R16G16_USCALED:
-      return fetch_R16G16_USCALED;
-   case PIPE_FORMAT_R16G16B16_USCALED:
-      return fetch_R16G16B16_USCALED;
-   case PIPE_FORMAT_R16G16B16A16_USCALED:
-      return fetch_R16G16B16A16_USCALED;
-
-   case PIPE_FORMAT_R16_SNORM:
-      return fetch_R16_SNORM;
-   case PIPE_FORMAT_R16G16_SNORM:
-      return fetch_R16G16_SNORM;
-   case PIPE_FORMAT_R16G16B16_SNORM:
-      return fetch_R16G16B16_SNORM;
-   case PIPE_FORMAT_R16G16B16A16_SNORM:
-      return fetch_R16G16B16A16_SNORM;
-
-   case PIPE_FORMAT_R16_SSCALED:
-      return fetch_R16_SSCALED;
-   case PIPE_FORMAT_R16G16_SSCALED:
-      return fetch_R16G16_SSCALED;
-   case PIPE_FORMAT_R16G16B16_SSCALED:
-      return fetch_R16G16B16_SSCALED;
-   case PIPE_FORMAT_R16G16B16A16_SSCALED:
-      return fetch_R16G16B16A16_SSCALED;
-
-   case PIPE_FORMAT_R8_UNORM:
-      return fetch_R8_UNORM;
-   case PIPE_FORMAT_R8G8_UNORM:
-      return fetch_R8G8_UNORM;
-   case PIPE_FORMAT_R8G8B8_UNORM:
-      return fetch_R8G8B8_UNORM;
-   case PIPE_FORMAT_R8G8B8A8_UNORM:
-      return fetch_R8G8B8A8_UNORM;
-
-   case PIPE_FORMAT_R8_USCALED:
-      return fetch_R8_USCALED;
-   case PIPE_FORMAT_R8G8_USCALED:
-      return fetch_R8G8_USCALED;
-   case PIPE_FORMAT_R8G8B8_USCALED:
-      return fetch_R8G8B8_USCALED;
-   case PIPE_FORMAT_R8G8B8A8_USCALED:
-      return fetch_R8G8B8A8_USCALED;
-
-   case PIPE_FORMAT_R8_SNORM:
-      return fetch_R8_SNORM;
-   case PIPE_FORMAT_R8G8_SNORM:
-      return fetch_R8G8_SNORM;
-   case PIPE_FORMAT_R8G8B8_SNORM:
-      return fetch_R8G8B8_SNORM;
-   case PIPE_FORMAT_R8G8B8A8_SNORM:
-      return fetch_R8G8B8A8_SNORM;
-
-   case PIPE_FORMAT_R8_SSCALED:
-      return fetch_R8_SSCALED;
-   case PIPE_FORMAT_R8G8_SSCALED:
-      return fetch_R8G8_SSCALED;
-   case PIPE_FORMAT_R8G8B8_SSCALED:
-      return fetch_R8G8B8_SSCALED;
-   case PIPE_FORMAT_R8G8B8A8_SSCALED:
-      return fetch_R8G8B8A8_SSCALED;
-
-   case PIPE_FORMAT_A8R8G8B8_UNORM:
-      return fetch_A8R8G8B8_UNORM;
-
-   case 0:
-      return NULL;		/* not sure why this is needed */
-
-   default:
-      assert(0);
-      return NULL;
-   }
-}
-
-
-static unsigned get_vertex_size( enum pipe_format format )
-{
-   switch (format) {
-   case PIPE_FORMAT_R64_FLOAT:
-      return 8;
-   case PIPE_FORMAT_R64G64_FLOAT:
-      return 2 * 8;
-   case PIPE_FORMAT_R64G64B64_FLOAT:
-      return 3 * 8;
-   case PIPE_FORMAT_R64G64B64A64_FLOAT:
-      return 4 * 8;
-
-   case PIPE_FORMAT_R32_SSCALED:
-   case PIPE_FORMAT_R32_SNORM:
-   case PIPE_FORMAT_R32_USCALED:
-   case PIPE_FORMAT_R32_UNORM:
-   case PIPE_FORMAT_R32_FLOAT:
-      return 4;
-   case PIPE_FORMAT_R32G32_SSCALED:
-   case PIPE_FORMAT_R32G32_SNORM:
-   case PIPE_FORMAT_R32G32_USCALED:
-   case PIPE_FORMAT_R32G32_UNORM:
-   case PIPE_FORMAT_R32G32_FLOAT:
-      return 2 * 4;
-   case PIPE_FORMAT_R32G32B32_SSCALED:
-   case PIPE_FORMAT_R32G32B32_SNORM:
-   case PIPE_FORMAT_R32G32B32_USCALED:
-   case PIPE_FORMAT_R32G32B32_UNORM:
-   case PIPE_FORMAT_R32G32B32_FLOAT:
-      return 3 * 4;
-   case PIPE_FORMAT_R32G32B32A32_SSCALED:
-   case PIPE_FORMAT_R32G32B32A32_SNORM:
-   case PIPE_FORMAT_R32G32B32A32_USCALED:
-   case PIPE_FORMAT_R32G32B32A32_UNORM:
-   case PIPE_FORMAT_R32G32B32A32_FLOAT:
-      return 4 * 4;
-
-   case PIPE_FORMAT_R16_SSCALED:
-   case PIPE_FORMAT_R16_SNORM:
-   case PIPE_FORMAT_R16_UNORM:
-   case PIPE_FORMAT_R16_USCALED:
-      return 2;
-   case PIPE_FORMAT_R16G16_SSCALED:
-   case PIPE_FORMAT_R16G16_SNORM:
-   case PIPE_FORMAT_R16G16_USCALED:
-   case PIPE_FORMAT_R16G16_UNORM:
-      return 2 * 2;
-   case PIPE_FORMAT_R16G16B16_SSCALED:
-   case PIPE_FORMAT_R16G16B16_SNORM:
-   case PIPE_FORMAT_R16G16B16_USCALED:
-   case PIPE_FORMAT_R16G16B16_UNORM:
-      return 3 * 2;
-   case PIPE_FORMAT_R16G16B16A16_SSCALED:
-   case PIPE_FORMAT_R16G16B16A16_SNORM:
-   case PIPE_FORMAT_R16G16B16A16_USCALED:
-   case PIPE_FORMAT_R16G16B16A16_UNORM:
-      return 4 * 2;
-
-   case PIPE_FORMAT_R8_SSCALED:
-   case PIPE_FORMAT_R8_SNORM:
-   case PIPE_FORMAT_R8_USCALED:
-   case PIPE_FORMAT_R8_UNORM:
-      return 1;
-   case PIPE_FORMAT_R8G8_SSCALED:
-   case PIPE_FORMAT_R8G8_SNORM:
-   case PIPE_FORMAT_R8G8_USCALED:
-   case PIPE_FORMAT_R8G8_UNORM:
-      return 2 * 1;
-   case PIPE_FORMAT_R8G8B8_SSCALED:
-   case PIPE_FORMAT_R8G8B8_SNORM:
-   case PIPE_FORMAT_R8G8B8_USCALED:
-   case PIPE_FORMAT_R8G8B8_UNORM:
-      return 3 * 1;
-   case PIPE_FORMAT_A8R8G8B8_UNORM:
-   case PIPE_FORMAT_R8G8B8A8_SSCALED:
-   case PIPE_FORMAT_R8G8B8A8_SNORM:
-   case PIPE_FORMAT_R8G8B8A8_USCALED:
-   case PIPE_FORMAT_R8G8B8A8_UNORM:
-      return 4 * 1;
-
-   case 0:
-      return 0;		/* not sure why this is needed */
-
-   default:
-      assert(0);
-      return 0;
-   }
-}
-
-
 /**
  * Fetch vertex attributes for 'count' vertices.
  */
@@ -612,10 +161,10 @@ static void generic_vertex_fetch(struct spu_vs_context *draw,
    /* loop over vertex attributes (vertex shader inputs)
     */
    for (attr = 0; attr < nr_attrs; attr++) {
-      const qword default_values = (qword)(vec_float4){ 0.0, 0.0, 0.0, 1.0 };
       const unsigned pitch = draw->vertex_fetch.pitch[attr];
       const uint64_t src = draw->vertex_fetch.src_ptr[attr];
-      const spu_fetch_func fetch = draw->vertex_fetch.fetch[attr];
+      const spu_fetch_func fetch = (spu_fetch_func)
+	  (draw->vertex_fetch.code + draw->vertex_fetch.code_offset[attr]);
       unsigned i;
       unsigned idx;
       const unsigned bytes_per_entry = draw->vertex_fetch.size[attr];
@@ -644,8 +193,7 @@ static void generic_vertex_fetch(struct spu_vs_context *draw,
 
       /* Convert all 4 vertices to vectors of float.
        */
-      (*fetch)(&machine->Inputs[attr].xyzw[0].q, in, default_values,
-               fetch_shuffle_data);
+      (*fetch)(&machine->Inputs[attr].xyzw[0].q, in, fetch_shuffle_data);
    }
 }
 
@@ -662,12 +210,5 @@ void spu_update_vertex_fetch( struct spu_vs_context *draw )
    }
 
 
-   for (i = 0; i < draw->vertex_fetch.nr_attrs; i++) {
-      draw->vertex_fetch.fetch[i] =
-          get_fetch_func(draw->vertex_fetch.format[i]);
-      draw->vertex_fetch.size[i] =
-          get_vertex_size(draw->vertex_fetch.format[i]);
-   }
-
    draw->vertex_fetch.fetch_func = generic_vertex_fetch;
 }
diff --git a/src/gallium/drivers/cell/spu/spu_vertex_shader.h b/src/gallium/drivers/cell/spu/spu_vertex_shader.h
index b5bf31e67d..0fb0bc28d0 100644
--- a/src/gallium/drivers/cell/spu/spu_vertex_shader.h
+++ b/src/gallium/drivers/cell/spu/spu_vertex_shader.h
@@ -6,8 +6,6 @@
 
 struct spu_vs_context;
 
-typedef void (*spu_fetch_func)(qword *out, const qword *in, qword defaults,
-			       const qword *shuffle_data);
 typedef void (*spu_full_fetch_func)( struct spu_vs_context *draw,
 				     struct spu_exec_machine *machine,
 				     const unsigned *elts,
@@ -20,12 +18,12 @@ struct spu_vs_context {
       uint64_t src_ptr[PIPE_ATTRIB_MAX];
       unsigned pitch[PIPE_ATTRIB_MAX];
       unsigned size[PIPE_ATTRIB_MAX];
-      enum pipe_format format[PIPE_ATTRIB_MAX];
+      unsigned code_offset[PIPE_ATTRIB_MAX];
       unsigned nr_attrs;
       boolean dirty;
 
-      spu_fetch_func fetch[PIPE_ATTRIB_MAX];
       spu_full_fetch_func fetch_func;
+      void *code;
    } vertex_fetch;
    
    /* Clip derived state:
-- 
cgit v1.2.3