10 files changed, 322 insertions, 265 deletions
diff --git a/src/gallium/drivers/cell/spu/Makefile b/src/gallium/drivers/cell/spu/Makefile
index 30ef2450ec..c071de1900 100644
--- a/src/gallium/drivers/cell/spu/Makefile
+++ b/src/gallium/drivers/cell/spu/Makefile
@@ -18,6 +18,7 @@ PROG_SPU_EMBED_O = $(PROG)_spu-embed.o
 SOURCES = \
 	spu_main.c \
 	spu_blend.c \
+	spu_dcache.c \
 	spu_render.c \
 	spu_texture.c \
 	spu_tile.c \
diff --git a/src/gallium/drivers/cell/spu/spu_dcache.c b/src/gallium/drivers/cell/spu/spu_dcache.c
new file mode 100644
index 0000000000..a1701d80d1
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_dcache.c
@@ -0,0 +1,125 @@
+/*
+ * (C) Copyright IBM Corporation 2008
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "cell/common.h"
+#include "spu_main.h"
+#include "spu_dcache.h"
+
+#define CACHELINE_LOG2SIZE    7   /* log2 of the cache line size in bytes */
+#define LINE_SIZE             (1U << CACHELINE_LOG2SIZE)
+#define ALIGN_MASK            (~(LINE_SIZE - 1))   /* clears in-line offset */
+
+#define CACHE_NAME            data
+#define CACHED_TYPE           qword
+#define CACHE_TYPE            CACHE_TYPE_RO
+#define CACHE_SET_TAGID(set)  (((set) & 0x03) + TAG_DCACHE0)
+#define CACHE_LOG2NNWAY       2
+#define CACHE_LOG2NSETS       6
+#include <cache-api.h>
+
+/* Yes folks, this is ugly.
+ */
+#undef CACHE_NWAY
+#undef CACHE_NSETS
+#define CACHE_NAME            data
+#define CACHE_NWAY            4
+#define CACHE_NSETS           (1U << 6)
+
+
+/**
+ * Fetch an arbitrary number of bytes from an unaligned address
+ *
+ * \param dst   Destination data buffer
+ * \param ea    Main memory effective address of source data
+ * \param size  Number of bytes to read
+ *
+ * \warning
+ * As is hinted by the type of the \c dst pointer, this function writes whole
+ * qwords: ROUNDUP16(size) bytes of \c dst are stored to.
+ */
+void
+spu_dcache_fetch_unaligned(qword *dst, unsigned ea, unsigned size)
+{
+   const int shift = ea & 0x0f;   /* byte offset of ea within its qword */
+   const unsigned read_size = ROUNDUP16(size + shift);
+   const unsigned last_read = ROUNDUP16(ea + size);
+   const qword *const last_write = dst + (ROUNDUP16(size) / 16);
+   unsigned i;
+
+   /* last_read and last_write are the end points the ASSERTs below expect. */
+   if (shift == 0) {
+      /* Data is already aligned.  Fetch directly into the destination buffer.
+       */
+      for (i = 0; i < size; i += 16) {
+         *(dst++) = cache_rd(data, ea + i);
+      }
+   } else {
+      qword hi;
+
+
+      /* Please exercise extreme caution when modifying this code.  This code
+       * must not read past the end of the page containing the source data,
+       * and it must not write more than ((size + 15) / 16) qwords to the
+       * destination buffer.
+       */
+      ea &= ~0x0f;
+      hi = cache_rd(data, ea);
+      for (i = 16; i < read_size; i += 16) {
+         qword lo = cache_rd(data, ea + i);
+         /* Output = last (16 - shift) bytes of hi, then first shift of lo. */
+         *(dst++) = si_or((qword) spu_slqwbyte(hi, shift),
+                          (qword) spu_rlmaskqwbyte(lo, shift - 16));
+         hi = lo;
+      }
+      /* A final partial qword may remain in hi; pad it with zero bytes. */
+      if (dst != last_write) {
+         *(dst++) = si_or((qword) spu_slqwbyte(hi, shift), si_il(0));
+      }
+   }
+   
+   ASSERT((ea + i) == last_read);
+   ASSERT(dst == last_write);
+}
+
+
+/**
+ * Invalidate cached lines covering main memory that may have been modified
+ */
+void
+spu_dcache_mark_dirty(unsigned ea, unsigned size)
+{
+   /* Widen the byte range outward to whole cache lines. */
+   const unsigned first_line = ea & ALIGN_MASK;
+   const unsigned past_last_line = (ea + size + (LINE_SIZE - 1)) & ALIGN_MASK;
+   unsigned slot;
+   for (slot = 0; slot < (CACHE_NWAY * CACHE_NSETS); slot++) {
+      const unsigned tag = __cache_dir[slot];
+      const unsigned line_ea = tag & ~0x0f;
+      unsigned updated = tag;
+      if ((line_ea >= first_line) && (line_ea < past_last_line)) {
+         updated = tag & ~CACHELINE_VALID;
+      }
+      __cache_dir[slot] = updated;
+   }
+}
diff --git a/src/gallium/drivers/cell/spu/spu_dcache.h b/src/gallium/drivers/cell/spu/spu_dcache.h
new file mode 100644
index 0000000000..7a06b8c25a
--- /dev/null
+++ b/src/gallium/drivers/cell/spu/spu_dcache.h
@@ -0,0 +1,34 @@
+/*
+ * (C) Copyright IBM Corporation 2008
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef SPU_DCACHE_H
+#define SPU_DCACHE_H
+
+extern void
+spu_dcache_fetch_unaligned(qword *dst, unsigned ea, unsigned size);
+
+extern void
+spu_dcache_mark_dirty(unsigned ea, unsigned size);
+
+#endif /* SPU_DCACHE_H */
diff --git a/src/gallium/drivers/cell/spu/spu_exec.c b/src/gallium/drivers/cell/spu/spu_exec.c
index 109540b1f7..1560c0f157 100644
--- a/src/gallium/drivers/cell/spu/spu_exec.c
+++ b/src/gallium/drivers/cell/spu/spu_exec.c
@@ -50,8 +50,6 @@
  *   Brian Paul
  */
 
-#include <libmisc.h>
-#include <spu_mfcio.h>
 #include <transpose_matrix4x4.h>
 #include <simdmath/ceilf4.h>
 #include <simdmath/cosf4.h>
@@ -72,6 +70,8 @@
 #include "spu_exec.h"
 #include "spu_main.h"
 #include "spu_vertex_shader.h"
+#include "spu_dcache.h"
+#include "cell/common.h"
 
 #define TILE_TOP_LEFT     0
 #define TILE_TOP_RIGHT    1
@@ -146,17 +146,14 @@ spu_exec_machine_init(struct spu_exec_machine *mach,
                       struct spu_sampler *samplers,
                       unsigned processor)
 {
-   qword zero;
-   qword not_zero;
-   uint i;
+   const qword zero = si_il(0);
+   const qword not_zero = si_il(~0);
 
+   (void) numSamplers;
    mach->Samplers = samplers;
    mach->Processor = processor;
    mach->Addrs = &mach->Temps[TGSI_EXEC_NUM_TEMPS];
 
-   zero = si_xor(zero, zero);
-   not_zero = si_xori(zero, 0xff);
-
    /* Setup constants. */
    mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q = zero;
    mach->Temps[TEMP_FF_I].xyzw[TEMP_FF_C].q = not_zero;
@@ -356,19 +353,17 @@ fetch_src_file_channel(
    case TGSI_EXTSWIZZLE_W:
       switch( file ) {
       case TGSI_FILE_CONSTANT: {
-         unsigned char buffer[32] ALIGN16_ATTRIB;
          unsigned i;
 
          for (i = 0; i < 4; i++) {
             const float *ptr = mach->Consts[index->i[i]];
-            const uint64_t addr = (uint64_t)(uintptr_t) ptr;
-            const unsigned size = ((addr & 0x0f) == 0) ? 16 : 32;
+            float tmp[4] ALIGN16_ATTRIB;   /* target of a qword-wide store */
 
-            mfc_get(buffer, addr & ~0x0f, size, TAG_VERTEX_BUFFER, 0, 0);
-            wait_on_mask(1 << TAG_VERTEX_BUFFER);
+            spu_dcache_fetch_unaligned((qword *) tmp,
+                                       (uintptr_t)(ptr + swizzle),
+                                       sizeof(float));
 
-            (void) memcpy(& chan->f[i], &buffer[(addr & 0x0f) 
-                + (sizeof(float) * swizzle)], sizeof(float));
+            chan->f[i] = tmp[0];
          }
          break;
       }
@@ -663,9 +658,10 @@ fetch_texel( struct spu_sampler *sampler,
    qword rgba[4];
    qword out[4];
 
-   sampler->get_samples(sampler, s->f, t->f, p->f, lodbias, (float *) rgba);
+   sampler->get_samples(sampler, s->f, t->f, p->f, lodbias, 
+			(float (*)[4]) rgba);
 
-   _transpose_matrix4x4(out, rgba);
+   _transpose_matrix4x4((vec_float4 *) out, (vec_float4 *) rgba);
    r->q = out[0];
    g->q = out[1];
    b->q = out[2];
@@ -1903,32 +1899,28 @@ spu_exec_machine_run( struct spu_exec_machine *mach )
    /* execute declarations (interpolants) */
    if( mach->Processor == TGSI_PROCESSOR_FRAGMENT ) {
       for (i = 0; i < mach->NumDeclarations; i++) {
-	 uint8_t buffer[sizeof(struct tgsi_full_declaration) + 32] ALIGN16_ATTRIB;
-	 struct tgsi_full_declaration decl;
-	 unsigned long decl_addr = (unsigned long) (mach->Declarations+i);
-	 unsigned size = ((sizeof(decl) + (decl_addr & 0x0f) + 0x0f) & ~0x0f);
+         union {   /* qword view so the dcache can store straight into decl */
+            struct tgsi_full_declaration decl;
+            qword buffer[ROUNDUP16(sizeof(struct tgsi_full_declaration)) / 16];
+         } d ALIGN16_ATTRIB;
+         unsigned ea = (unsigned) (mach->Declarations + i);  /* i-th decl */
 
-	 mfc_get(buffer, decl_addr & ~0x0f, size, TAG_INSTRUCTION_FETCH, 0, 0);
-	 wait_on_mask(1 << TAG_INSTRUCTION_FETCH);
+         spu_dcache_fetch_unaligned(d.buffer, ea, sizeof(d.decl));
 
-	 memcpy(& decl, buffer + (decl_addr & 0x0f), sizeof(decl));
-	 exec_declaration( mach, &decl );
+         exec_declaration( mach, &d.decl );
       }
    }
 
    /* execute instructions, until pc is set to -1 */
    while (pc != -1) {
-      uint8_t buffer[sizeof(struct tgsi_full_instruction) + 32] ALIGN16_ATTRIB;
-      struct tgsi_full_instruction inst;
-      unsigned long inst_addr = (unsigned long) (mach->Instructions + pc);
-      unsigned size = ((sizeof(inst) + (inst_addr & 0x0f) + 0x0f) & ~0x0f);
-
-      assert(pc < mach->NumInstructions);
-      mfc_get(buffer, inst_addr & ~0x0f, size, TAG_INSTRUCTION_FETCH, 0, 0);
-      wait_on_mask(1 << TAG_INSTRUCTION_FETCH);
-
-      memcpy(& inst, buffer + (inst_addr & 0x0f), sizeof(inst));
-      exec_instruction( mach, & inst, &pc );
+      union {   /* qword view so the dcache can store straight into inst */
+         struct tgsi_full_instruction inst;
+         qword buffer[ROUNDUP16(sizeof(struct tgsi_full_instruction)) / 16];
+      } insn ALIGN16_ATTRIB;   /* not "i": that is the loop counter above */
+      unsigned ea = (unsigned) (mach->Instructions + pc);
+
+      spu_dcache_fetch_unaligned(insn.buffer, ea, sizeof(insn.inst));
+      exec_instruction( mach, & insn.inst, &pc );
    }
 
 #if 0
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index fcbf0f841e..59300028d4 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -38,6 +38,7 @@
 #include "spu_tile.h"
 //#include "spu_test.h"
 #include "spu_vertex_shader.h"
+#include "spu_dcache.h"
 #include "cell/common.h"
 #include "pipe/p_defines.h"
 
@@ -285,6 +286,8 @@ cmd_state_texture(const struct cell_command_texture *texture)
       { spu.texture.width, spu.texture.height, 0.0, 0.0};
    spu.tex_size_mask = (vector unsigned int)
       { spu.texture.width - 1, spu.texture.height - 1, 0, 0 };
+   spu.tex_size_x_mask = spu_splats(spu.texture.width - 1);
+   spu.tex_size_y_mask = spu_splats(spu.texture.height - 1);
 }
 
 
@@ -433,10 +436,19 @@ cmd_batch(uint opcode)
                        sizeof(struct pipe_viewport_state));
          pos += (1 + ROUNDUP8(sizeof(struct pipe_viewport_state)) / 8);
          break;
+      case CELL_CMD_STATE_UNIFORMS:
+         draw.constants = (const float (*)[4]) (uintptr_t) buffer[pos + 1];
+         pos += 2;
+         break;
       case CELL_CMD_STATE_VS_ARRAY_INFO:
          cmd_state_vs_array_info((struct cell_array_info *) &buffer[pos+1]);
          pos += (1 + ROUNDUP8(sizeof(struct cell_array_info)) / 8);
          break;
+      case CELL_CMD_STATE_BIND_VS:
+         spu_bind_vertex_shader(&draw,
+                                (struct cell_shader_info *) &buffer[pos+1]);
+         pos += (1 + ROUNDUP8(sizeof(struct cell_shader_info)) / 8);
+         break;
       case CELL_CMD_STATE_ATTRIB_FETCH: {
          struct cell_attribute_fetch_code *code =
              (struct cell_attribute_fetch_code *) &buffer[pos+1];
@@ -453,6 +465,14 @@ cmd_batch(uint opcode)
          pos += (1 + ROUNDUP8(sizeof(struct cell_attribute_fetch_code)) / 8);
          break;
       }
+      case CELL_CMD_FLUSH_BUFFER_RANGE: {
+	 struct cell_buffer_range *br = (struct cell_buffer_range *)
+	     &buffer[pos+1];
+
+	 spu_dcache_mark_dirty((unsigned) br->base, br->size);
+         pos += (1 + ROUNDUP8(sizeof(struct cell_buffer_range)) / 8);
+	 break;
+      }
       default:
          printf("SPU %u: bad opcode: 0x%llx\n", spu.init.id, buffer[pos]);
          ASSERT(0);
@@ -566,7 +586,7 @@ main(main_param_t speid, main_param_t argp)
    one_time_init();
 
    if (Debug)
-      printf("SPU: main() speid=%lu\n", speid);
+      printf("SPU: main() speid=%lu\n", (unsigned long) speid);
 
    mfc_get(&spu.init,  /* dest */
            (unsigned int) argp, /* src */
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 5c95d112ac..a13edd1702 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -107,6 +107,8 @@ struct spu_global
 
    vector float tex_size;
    vector unsigned int tex_size_mask; /**< == int(size - 1) */
+   vector unsigned int tex_size_x_mask; /**< == int(width - 1), all elements */
+   vector unsigned int tex_size_y_mask; /**< == int(height - 1), all elements */
 
    vector float (*sample_texture)(vector float texcoord);
 
@@ -130,8 +132,10 @@ extern boolean Debug;
 #define TAG_INDEX_BUFFER      16
 #define TAG_BATCH_BUFFER      17
 #define TAG_MISC              18
-#define TAG_TEXTURE_TILE      19
-#define TAG_INSTRUCTION_FETCH 20
+#define TAG_DCACHE0           20
+#define TAG_DCACHE1           21
+#define TAG_DCACHE2           22
+#define TAG_DCACHE3           23
 
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index 3962aaa4a9..67eb08196a 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -31,19 +31,7 @@
 #include "spu_texture.h"
 #include "spu_tile.h"
 #include "spu_colorpack.h"
-
-
-/**
- * Number of texture tiles to cache.
- * Note that this will probably be the largest consumer of SPU local store/
- * memory for this driver!
- */
-#define CACHE_SIZE 16
-
-static tile_t tex_tiles[CACHE_SIZE]  ALIGN16_ATTRIB;
-
-static vector unsigned int tex_tile_xy[CACHE_SIZE];
-
+#include "spu_dcache.h"
 
 
 /**
@@ -52,78 +40,60 @@ static vector unsigned int tex_tile_xy[CACHE_SIZE];
 void
 invalidate_tex_cache(void)
 {
-   /* XXX memset? */
-   uint i;
-   for (i = 0; i < CACHE_SIZE; i++) {
-      tex_tile_xy[i] = ((vector unsigned int) { ~0U, ~0U, ~0U, ~0U });
-   }
+   spu_dcache_mark_dirty((unsigned) spu.texture.start,
+                         4 * spu.texture.width * spu.texture.height);
 }
 
 
-/**
- * Return the cache pos/index which corresponds to tile (tx,ty)
- */
-static INLINE uint
-cache_pos(vector unsigned int txty)
+static uint
+get_texel(vec_uint4 coordinate)
 {
-   uint pos = (spu_extract(txty,0) + spu_extract(txty,1) * 4) % CACHE_SIZE;
-   return pos;
+   const unsigned texture_ea = (uintptr_t) spu.texture.start;
+   const unsigned x = spu_extract(coordinate, 0);
+   const unsigned y = spu_extract(coordinate, 1);
+   const unsigned tiles_per_row = spu.texture.width / TILE_SIZE;
+   /* Byte offset of the tile, then of the 4-byte texel inside that tile. */
+   const unsigned tile_offset = sizeof(tile_t)
+      * ((y / TILE_SIZE * tiles_per_row) + (x / TILE_SIZE));
+   const unsigned texel_offset = 4
+      * (((y % TILE_SIZE) * TILE_SIZE) + (x % TILE_SIZE));
+   vec_uint4 tmp;   /* the fetch stores a whole qword; texel is element 0 */
+   spu_dcache_fetch_unaligned((qword *) & tmp,
+                              texture_ea + tile_offset + texel_offset, 4);
+   return spu_extract(tmp, 0);
 }
 
 
-/**
- * Make sure the tile for texel (i,j) is present, return its position/index
- * in the cache.
- */
-static uint
-get_tex_tile(vector unsigned int ij)
+static void
+get_four_texels(vec_uint4 x, vec_uint4 y, vec_uint4 *texels)
 {
-   /* tile address: tx,ty */
-   const vector unsigned int txty = spu_rlmask(ij, -5);  /* divide by 32 */
-   const uint pos = cache_pos(txty);
-
-   if ((spu_extract(tex_tile_xy[pos], 0) != spu_extract(txty, 0)) ||
-       (spu_extract(tex_tile_xy[pos], 1) != spu_extract(txty, 1))) {
-
-      /* texture cache miss, fetch tile from main memory */
-      const uint tiles_per_row = spu.texture.width / TILE_SIZE;
-      const uint bytes_per_tile = sizeof(tile_t);
-      const void *src = (const ubyte *) spu.texture.start
-         + (spu_extract(txty,1) * tiles_per_row + spu_extract(txty,0)) * bytes_per_tile;
-
-      printf("SPU %u: tex cache miss at %d, %d  pos=%u  old=%d,%d\n",
-             spu.init.id,
-             spu_extract(txty,0),
-             spu_extract(txty,1),
-             pos,
-             spu_extract(tex_tile_xy[pos],0),
-             spu_extract(tex_tile_xy[pos],1));
-
-      ASSERT_ALIGN16(tex_tiles[pos].ui);
-      ASSERT_ALIGN16(src);
-
-      mfc_get(tex_tiles[pos].ui,  /* dest */
-              (unsigned int) src,
-              bytes_per_tile,      /* size */
-              TAG_TEXTURE_TILE,
-              0, /* tid */
-              0  /* rid */);
-
-      wait_on_mask(1 << TAG_TEXTURE_TILE);
-
-      tex_tile_xy[pos] = txty;
-   }
-   else {
-#if 0
-      printf("SPU %u: tex cache HIT at %d, %d\n",
-             spu.init.id, tx, ty);
-#endif
-   }
-
-   return pos;
+   /* Fetch the texels at (x[k], y[k]), k = 0..3, into element 0 of texels[k]. */
+   const unsigned texture_ea = (uintptr_t) spu.texture.start;
+   vec_uint4 tile_x = spu_rlmask(x, -5);            /* x / 32: tile column */
+   vec_uint4 tile_y = spu_rlmask(y, -5);            /* y / 32: tile row */
+   const qword offset_x = si_andi((qword) x, 0x1f); /* x % 32 inside the tile */
+   const qword offset_y = si_andi((qword) y, 0x1f); /* y % 32 inside the tile */
+   const qword tiles_per_row = (qword) spu_splats(spu.texture.width / TILE_SIZE);
+   const qword tile_size = (qword) spu_splats(sizeof(tile_t));
+   /* tile_offset = (tile_y * tiles_per_row + tile_x) * sizeof(tile_t) */
+   qword tile_offset = si_mpya((qword) tile_y, tiles_per_row, (qword) tile_x);
+   tile_offset = si_mpy((qword) tile_offset, tile_size);
+   /* texel_offset = (offset_y * 32 + offset_x) * 4 bytes within the tile */
+   qword texel_offset = si_a(si_mpyui(offset_y, 32), offset_x);
+   texel_offset = si_mpyui(texel_offset, 4);
+   /* One byte offset per element; each element is fetched separately below. */
+   vec_uint4 offset = (vec_uint4) si_a(tile_offset, texel_offset);
+   
+   spu_dcache_fetch_unaligned((qword *) & texels[0],
+                              texture_ea + spu_extract(offset, 0), 4);
+   spu_dcache_fetch_unaligned((qword *) & texels[1],
+                              texture_ea + spu_extract(offset, 1), 4);
+   spu_dcache_fetch_unaligned((qword *) & texels[2],
+                              texture_ea + spu_extract(offset, 2), 4);
+   spu_dcache_fetch_unaligned((qword *) & texels[3],
+                              texture_ea + spu_extract(offset, 3), 4);
 }
 
-
 /**
  * Get texture sample at texcoord.
  * XXX this is extremely primitive for now.
@@ -134,9 +104,7 @@ sample_texture_nearest(vector float texcoord)
    vector float tc = spu_mul(texcoord, spu.tex_size);
    vector unsigned int itc = spu_convtu(tc, 0);  /* convert to int */
    itc = spu_and(itc, spu.tex_size_mask);        /* mask (GL_REPEAT) */
-   vector unsigned int ij = spu_and(itc, TILE_SIZE-1); /* intra tile addr */
-   uint pos = get_tex_tile(itc);
-   uint texel = tex_tiles[pos].ui[spu_extract(ij, 1)][spu_extract(ij, 0)];
+   uint texel = get_texel(itc);
    return spu_unpack_A8R8G8B8(texel);
 }
 
@@ -144,49 +112,33 @@ sample_texture_nearest(vector float texcoord)
 vector float
 sample_texture_bilinear(vector float texcoord)
 {
-   static const vector unsigned int offset10 = {1, 0, 0, 0};
-   static const vector unsigned int offset01 = {0, 1, 0, 0};
+   static const vec_uint4 offset_x = {0, 0, 1, 1};
+   static const vec_uint4 offset_y = {0, 1, 0, 1};
 
    vector float tc = spu_mul(texcoord, spu.tex_size);
    tc = spu_add(tc, spu_splats(-0.5f));  /* half texel bias */
 
    /* integer texcoords S,T: */
-   vector unsigned int itc00 = spu_convtu(tc, 0);  /* convert to int */
-   vector unsigned int itc01 = spu_add(itc00, offset01);
-   vector unsigned int itc10 = spu_add(itc00, offset10);
-   vector unsigned int itc11 = spu_add(itc10, offset01);
-
-   /* mask (GL_REPEAT) */
-   itc00 = spu_and(itc00, spu.tex_size_mask);
-   itc01 = spu_and(itc01, spu.tex_size_mask);
-   itc10 = spu_and(itc10, spu.tex_size_mask);
-   itc11 = spu_and(itc11, spu.tex_size_mask);
-
-   /* intra tile addr */
-   vector unsigned int ij00 = spu_and(itc00, TILE_SIZE-1);
-   vector unsigned int ij01 = spu_and(itc01, TILE_SIZE-1);
-   vector unsigned int ij10 = spu_and(itc10, TILE_SIZE-1);
-   vector unsigned int ij11 = spu_and(itc11, TILE_SIZE-1);
-
-   /* get tile cache positions */
-   uint pos00 = get_tex_tile(itc00);
-   uint pos01, pos10, pos11;
-   if ((spu_extract(ij00, 0) < TILE_SIZE-1) &&
-       (spu_extract(ij00, 1) < TILE_SIZE-1)) {
-      /* all texels are in the same tile */
-      pos01 = pos10 = pos11 = pos00;
-   }
-   else {
-      pos01 = get_tex_tile(itc01);
-      pos10 = get_tex_tile(itc10);
-      pos11 = get_tex_tile(itc11);
-   }
-
-   /* get texels from tiles and convert to float[4] */
-   vector float texel00 = spu_unpack_A8R8G8B8(tex_tiles[pos00].ui[spu_extract(ij00, 1)][spu_extract(ij00, 0)]);
-   vector float texel01 = spu_unpack_A8R8G8B8(tex_tiles[pos01].ui[spu_extract(ij01, 1)][spu_extract(ij01, 0)]);
-   vector float texel10 = spu_unpack_A8R8G8B8(tex_tiles[pos10].ui[spu_extract(ij10, 1)][spu_extract(ij10, 0)]);
-   vector float texel11 = spu_unpack_A8R8G8B8(tex_tiles[pos11].ui[spu_extract(ij11, 1)][spu_extract(ij11, 0)]);
+   vec_uint4 itc = spu_convtu(tc, 0);  /* convert to int */
+
+   vec_uint4 texels[4];
+   
+   vec_uint4 x = spu_splats(spu_extract(itc, 0));
+   vec_uint4 y = spu_splats(spu_extract(itc, 1));
+
+   x = spu_add(x, offset_x);
+   y = spu_add(y, offset_y);
+
+   x = spu_and(x, spu.tex_size_x_mask);
+   y = spu_and(y, spu.tex_size_y_mask);
+
+   get_four_texels(x, y, texels);
+
+   vector float texel00 = spu_unpack_A8R8G8B8(spu_extract(texels[0], 0));
+   vector float texel01 = spu_unpack_A8R8G8B8(spu_extract(texels[1], 0));
+   vector float texel10 = spu_unpack_A8R8G8B8(spu_extract(texels[2], 0));
+   vector float texel11 = spu_unpack_A8R8G8B8(spu_extract(texels[3], 0));
+
 
    /* Compute weighting factors in [0,1]
     * Multiply texcoord by 1024, AND with 1023, convert back to float.
diff --git a/src/gallium/drivers/cell/spu/spu_vertex_fetch.c b/src/gallium/drivers/cell/spu/spu_vertex_fetch.c
index 55c6c28717..219fd90cc0 100644
--- a/src/gallium/drivers/cell/spu/spu_vertex_fetch.c
+++ b/src/gallium/drivers/cell/spu/spu_vertex_fetch.c
@@ -32,39 +32,19 @@
   *   Ian Romanick <idr@us.ibm.com>
   */
 
-#include <spu_mfcio.h>
-
 #include "pipe/p_util.h"
 #include "pipe/p_state.h"
 #include "pipe/p_shader_tokens.h"
 #include "spu_exec.h"
 #include "spu_vertex_shader.h"
 #include "spu_main.h"
-
-#define CACHE_NAME            attribute
-#define CACHED_TYPE           qword
-#define CACHE_TYPE            CACHE_TYPE_RO
-#define CACHE_SET_TAGID(set)  TAG_VERTEX_BUFFER
-#define CACHE_LOG2NNWAY       2
-#define CACHE_LOG2NSETS       6
-#include <cache-api.h>
-
-/* Yes folks, this is ugly.
- */
-#undef CACHE_NWAY
-#undef CACHE_NSETS
-#define CACHE_NAME            attribute
-#define CACHE_NWAY            4
-#define CACHE_NSETS           (1U << 6)
-
-
-#define DRAW_DBG 0
+#include "spu_dcache.h"
 
 typedef void (*spu_fetch_func)(qword *out, const qword *in,
 			       const qword *shuffle_data);
 
 
-static const qword fetch_shuffle_data[] = {
+static const qword fetch_shuffle_data[5] ALIGN16_ATTRIB = {
    /* Shuffle used by CVT_64_FLOAT
     */
    {
@@ -103,44 +83,6 @@ static const qword fetch_shuffle_data[] = {
 
 
 /**
- * Fetch between 1 and 32 bytes from an unaligned address
- */
-static INLINE void
-fetch_unaligned(qword *dst, unsigned ea, unsigned size)
-{
-   qword tmp[4];
-   const int shift = ea & 0x0f;
-   const unsigned aligned_start_ea = ea & ~0x0f;
-   const unsigned aligned_end_ea = (ea + size) & ~0x0f;
-   const unsigned num_entries = ((aligned_end_ea - aligned_start_ea) / 16) + 1;
-   unsigned i;
-
-
-   if (shift == 0) {
-      /* Data is already aligned.  Fetch directly into the destination buffer.
-       */
-      for (i = 0; i < num_entries; i++) {
-	 dst[i] = cache_rd(attribute, (ea & ~0x0f) + (i * 16));
-      }
-   } else {
-      /* Fetch data from the cache to the local buffer.
-       */
-      for (i = 0; i < num_entries; i++) {
-	 tmp[i] = cache_rd(attribute, (ea & ~0x0f) + (i * 16));
-      }
-
-
-      /* Fix the alignment of the data and write to the destination buffer.
-       */
-      for (i = 0; i < ((size + 15) / 16); i++) {
-	 dst[i] = si_or((qword) spu_slqwbyte(tmp[i], shift),
-			(qword) spu_rlmaskqwbyte(tmp[i + 1], shift - 16));
-      }
-   }
-}
-
-
-/**
  * Fetch vertex attributes for 'count' vertices.
  */
 static void generic_vertex_fetch(struct spu_vs_context *draw,
@@ -169,7 +111,7 @@ static void generic_vertex_fetch(struct spu_vs_context *draw,
       unsigned idx;
       const unsigned bytes_per_entry = draw->vertex_fetch.size[attr];
       const unsigned quads_per_entry = (bytes_per_entry + 15) / 16;
-      qword in[2 * 4];
+      qword in[2 * 4] ALIGN16_ATTRIB;
 
 
       /* Fetch four attributes for four vertices.  
@@ -182,7 +124,7 @@ static void generic_vertex_fetch(struct spu_vs_context *draw,
          printf("SPU: fetching = 0x%llx\n", addr);
 #endif
 
-         fetch_unaligned(& in[idx], addr, bytes_per_entry);
+         spu_dcache_fetch_unaligned(& in[idx], addr, bytes_per_entry);
          idx += quads_per_entry;
       }
 
@@ -200,15 +142,5 @@ static void generic_vertex_fetch(struct spu_vs_context *draw,
 
 void spu_update_vertex_fetch( struct spu_vs_context *draw )
 {
-   unsigned i;
-
-   
-   /* Invalidate the vertex cache.
-    */
-   for (i = 0; i < (CACHE_NWAY * CACHE_NSETS); i++) {
-      CACHELINE_CLEARVALID(i);
-   }
-
-
    draw->vertex_fetch.fetch_func = generic_vertex_fetch;
 }
diff --git a/src/gallium/drivers/cell/spu/spu_vertex_shader.c b/src/gallium/drivers/cell/spu/spu_vertex_shader.c
index 3f5bf41aa2..8363efeeb6 100644
--- a/src/gallium/drivers/cell/spu/spu_vertex_shader.c
+++ b/src/gallium/drivers/cell/spu/spu_vertex_shader.c
@@ -165,63 +165,55 @@ run_vertex_program(struct spu_vs_context *draw,
 }
 
 
-static void
-spu_bind_vertex_shader(struct spu_vs_context *draw,
-		       void *uniforms,
-		       void *planes,
-		       unsigned nr_planes,
-		       unsigned num_outputs
-		       )
-{
-   draw->constants = (float (*)[4]) uniforms;
-
-   (void) memcpy(draw->plane, planes, sizeof(float) * 4 * nr_planes);
-   draw->nr_planes = nr_planes;
-   draw->num_vs_outputs = num_outputs;
-
-   /* specify the shader to interpret/execute */
-   spu_exec_machine_init(&draw->machine,
-			 PIPE_MAX_SAMPLERS,
-			 NULL /*samplers*/,
-			 PIPE_SHADER_VERTEX);
-}
-
-
 unsigned char immediates[(sizeof(float) * 4 * TGSI_EXEC_NUM_IMMEDIATES) + 32]
     ALIGN16_ATTRIB;
 
+
 void
-spu_execute_vertex_shader(struct spu_vs_context *draw,
-                          const struct cell_command_vs *vs)
+spu_bind_vertex_shader(struct spu_vs_context *draw,
+		       struct cell_shader_info *vs)
 {
-   unsigned i;
-
-   const uint64_t immediate_addr = vs->shader.immediates;
+   const unsigned immediate_addr = vs->immediates;
    const unsigned immediate_size = 
-       ROUNDUP16((sizeof(float) * 4 * vs->shader.num_immediates)
-                 + (immediate_addr & 0x0f));
+       ROUNDUP16((sizeof(float) * 4 * vs->num_immediates)
+		 + (immediate_addr & 0x0f));
+ 
 
    mfc_get(immediates, immediate_addr & ~0x0f, immediate_size,
            TAG_VERTEX_BUFFER, 0, 0);
 
    draw->machine.Instructions = (struct tgsi_full_instruction *)
-       vs->shader.instructions;
-   draw->machine.NumInstructions = vs->shader.num_instructions;
+       vs->instructions;
+   draw->machine.NumInstructions = vs->num_instructions;
 
    draw->machine.Declarations = (struct tgsi_full_declaration *)
-       vs->shader.declarations;
-   draw->machine.NumDeclarations = vs->shader.num_declarations;
+       vs->declarations;
+   draw->machine.NumDeclarations = vs->num_declarations;
 
-   draw->vertex_fetch.nr_attrs = vs->nr_attrs;
+   draw->num_vs_outputs = vs->num_outputs;
+
+   /* specify the shader to interpret/execute */
+   spu_exec_machine_init(&draw->machine,
+			 PIPE_MAX_SAMPLERS,
+			 NULL /*samplers*/,
+			 PIPE_SHADER_VERTEX);
 
    wait_on_mask(1 << TAG_VERTEX_BUFFER);
 
    (void) memcpy(& draw->machine.Imms, &immediates[immediate_addr & 0x0f],
-                 sizeof(float) * 4 * vs->shader.num_immediates);
+                 sizeof(float) * 4 * vs->num_immediates);
+}
 
-   spu_bind_vertex_shader(draw, vs->shader.uniforms,
-                          vs->plane, vs->nr_planes,
-                          vs->shader.num_outputs);
+
+void
+spu_execute_vertex_shader(struct spu_vs_context *draw,
+                          const struct cell_command_vs *vs)
+{
+   unsigned i;
+
+   (void) memcpy(draw->plane, vs->plane, sizeof(float) * 4 * vs->nr_planes);
+   draw->nr_planes = vs->nr_planes;
+   draw->vertex_fetch.nr_attrs = vs->nr_attrs;
 
    for (i = 0; i < vs->num_elts; i += 4) {
       const unsigned batch_size = MIN2(vs->num_elts - i, 4);
diff --git a/src/gallium/drivers/cell/spu/spu_vertex_shader.h b/src/gallium/drivers/cell/spu/spu_vertex_shader.h
index 0fb0bc28d0..54a4b8d9b9 100644
--- a/src/gallium/drivers/cell/spu/spu_vertex_shader.h
+++ b/src/gallium/drivers/cell/spu/spu_vertex_shader.h
@@ -1,6 +1,7 @@
 #ifndef SPU_VERTEX_SHADER_H
 #define SPU_VERTEX_SHADER_H
 
+#include "cell/common.h"
 #include "pipe/p_format.h"
 #include "spu_exec.h"
 
@@ -55,6 +56,10 @@ static INLINE void spu_vertex_fetch(struct spu_vs_context *draw,
 struct cell_command_vs;
 
 extern void
+spu_bind_vertex_shader(struct spu_vs_context *draw,
+		       struct cell_shader_info *vs);
+
+extern void
 spu_execute_vertex_shader(struct spu_vs_context *draw,
 			  const struct cell_command_vs *vs);