From 2194675196260c0a5d44242d698b85c86f84074b Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Sun, 27 Jan 2008 12:01:47 -0700
Subject: Cell: generalize the batch buffer code for vertex buffers...

---
 src/mesa/pipe/cell/common.h           |  8 ++--
 src/mesa/pipe/cell/ppu/cell_batch.c   | 84 ++++++++++++++++++++---------------
 src/mesa/pipe/cell/ppu/cell_batch.h   |  3 ++
 src/mesa/pipe/cell/ppu/cell_context.c |  5 ++-
 src/mesa/pipe/cell/ppu/cell_context.h | 10 +++--
 src/mesa/pipe/cell/ppu/cell_spu.c     |  4 +-
 src/mesa/pipe/cell/spu/spu_main.c     | 22 ++++-----
 7 files changed, 79 insertions(+), 57 deletions(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/common.h b/src/mesa/pipe/cell/common.h
index 0b63ed39be..ce9c381907 100644
--- a/src/mesa/pipe/cell/common.h
+++ b/src/mesa/pipe/cell/common.h
@@ -81,8 +81,8 @@
 #define CELL_CMD_STATE_VERTEX_INFO   13
 
 
-#define CELL_NUM_BATCH_BUFFERS 3
-#define CELL_BATCH_BUFFER_SIZE 1024  /**< 16KB would be the max */
+#define CELL_NUM_BUFFERS 4
+#define CELL_BUFFER_SIZE (4*1024)  /**< 16KB would be the max */
 
 #define CELL_BUFFER_STATUS_FREE 10
 #define CELL_BUFFER_STATUS_USED 20
@@ -147,7 +147,9 @@ struct cell_init_info
    unsigned id;
    unsigned num_spus;
    struct cell_command *cmd;
-   ubyte *batch_buffers[CELL_NUM_BATCH_BUFFERS];
+
+   /** Buffers for command batches, vertex/index data */
+   ubyte *buffers[CELL_NUM_BUFFERS];
    uint *buffer_status;  /**< points at cell_context->buffer_status */
 } ALIGN16_ATTRIB;
 
diff --git a/src/mesa/pipe/cell/ppu/cell_batch.c b/src/mesa/pipe/cell/ppu/cell_batch.c
index c894ef8608..178caa74e1 100644
--- a/src/mesa/pipe/cell/ppu/cell_batch.c
+++ b/src/mesa/pipe/cell/ppu/cell_batch.c
@@ -31,12 +31,46 @@
 #include "cell_spu.h"
 
 
+
+uint
+cell_get_empty_buffer(struct cell_context *cell)
+{
+   uint buf = 0;
+
+   /* Find a buffer that's marked as free by all SPUs */
+   while (1) {
+      uint spu, num_free = 0;
+
+      for (spu = 0; spu < cell->num_spus; spu++) {
+         if (cell->buffer_status[spu][buf][0] == CELL_BUFFER_STATUS_FREE) {
+            num_free++;
+
+            if (num_free == cell->num_spus) {
+               /* found a free buffer, now mark status as used */
+               for (spu = 0; spu < cell->num_spus; spu++) {
+                  cell->buffer_status[spu][buf][0] = CELL_BUFFER_STATUS_USED;
+               }
+               return buf;
+            }
+         }
+         else {
+            break;
+         }
+      }
+
+      /* try next buf */
+      buf = (buf + 1) % CELL_NUM_BUFFERS;
+   }
+}
+
+
+
 void
 cell_batch_flush(struct cell_context *cell)
 {
    static boolean flushing = FALSE;
    uint batch = cell->cur_batch;
-   const uint size = cell->batch_buffer_size[batch];
+   const uint size = cell->buffer_size[batch];
    uint spu, cmd_word;
 
    assert(!flushing);
@@ -46,7 +80,7 @@ cell_batch_flush(struct cell_context *cell)
 
    flushing = TRUE;
 
-   assert(batch < CELL_NUM_BATCH_BUFFERS);
+   assert(batch < CELL_NUM_BUFFERS);
 
    /*
    printf("cell_batch_dispatch: buf %u at %p, size %u\n",
@@ -68,28 +102,9 @@ cell_batch_flush(struct cell_context *cell)
     * array indicating that the PPU can re-use the buffer.
     */
 
+   batch = cell_get_empty_buffer(cell);
 
-   /* Find a buffer that's marked as free by all SPUs */
-   while (1) {
-      uint num_free = 0;
-
-      batch = (batch + 1) % CELL_NUM_BATCH_BUFFERS;
-
-      for (spu = 0; spu < cell->num_spus; spu++) {
-         if (cell->buffer_status[spu][batch][0] == CELL_BUFFER_STATUS_FREE)
-            num_free++;
-      }
-
-      if (num_free == cell->num_spus) {
-         /* found a free buffer, now mark status as used */
-         for (spu = 0; spu < cell->num_spus; spu++) {
-            cell->buffer_status[spu][batch][0] = CELL_BUFFER_STATUS_USED;
-         }
-         break;
-      }
-   }
-
-   cell->batch_buffer_size[batch] = 0;  /* empty */
+   cell->buffer_size[batch] = 0;  /* empty */
    cell->cur_batch = batch;
 
    flushing = FALSE;
@@ -99,8 +114,7 @@ cell_batch_flush(struct cell_context *cell)
 uint
 cell_batch_free_space(const struct cell_context *cell)
 {
-   uint free = CELL_BATCH_BUFFER_SIZE
-      - cell->batch_buffer_size[cell->cur_batch];
+   uint free = CELL_BUFFER_SIZE - cell->buffer_size[cell->cur_batch];
    return free;
 }
 
@@ -117,18 +131,18 @@ cell_batch_append(struct cell_context *cell, const void *cmd, uint length)
    assert(length % 4 == 0);
    assert(cell->cur_batch >= 0);
 
-   size = cell->batch_buffer_size[cell->cur_batch];
+   size = cell->buffer_size[cell->cur_batch];
 
-   if (size + length > CELL_BATCH_BUFFER_SIZE) {
+   if (size + length > CELL_BUFFER_SIZE) {
       cell_batch_flush(cell);
       size = 0;
    }
 
-   assert(size + length <= CELL_BATCH_BUFFER_SIZE);
+   assert(size + length <= CELL_BUFFER_SIZE);
 
-   memcpy(cell->batch_buffer[cell->cur_batch] + size, cmd, length);
+   memcpy(cell->buffer[cell->cur_batch] + size, cmd, length);
 
-   cell->batch_buffer_size[cell->cur_batch] = size + length;
+   cell->buffer_size[cell->cur_batch] = size + length;
 }
 
 
@@ -142,18 +156,18 @@ cell_batch_alloc(struct cell_context *cell, uint bytes)
 
    assert(cell->cur_batch >= 0);
 
-   size = cell->batch_buffer_size[cell->cur_batch];
+   size = cell->buffer_size[cell->cur_batch];
 
-   if (size + bytes > CELL_BATCH_BUFFER_SIZE) {
+   if (size + bytes > CELL_BUFFER_SIZE) {
       cell_batch_flush(cell);
       size = 0;
    }
 
-   assert(size + bytes <= CELL_BATCH_BUFFER_SIZE);
+   assert(size + bytes <= CELL_BUFFER_SIZE);
 
-   pos = (void *) (cell->batch_buffer[cell->cur_batch] + size);
+   pos = (void *) (cell->buffer[cell->cur_batch] + size);
 
-   cell->batch_buffer_size[cell->cur_batch] = size + bytes;
+   cell->buffer_size[cell->cur_batch] = size + bytes;
 
    return pos;
 }
diff --git a/src/mesa/pipe/cell/ppu/cell_batch.h b/src/mesa/pipe/cell/ppu/cell_batch.h
index c4ba7feb3d..b4c96f465a 100644
--- a/src/mesa/pipe/cell/ppu/cell_batch.h
+++ b/src/mesa/pipe/cell/ppu/cell_batch.h
@@ -35,6 +35,9 @@
 struct cell_context;
 
 
+extern uint
+cell_get_empty_buffer(struct cell_context *cell);
+
 extern void
 cell_batch_flush(struct cell_context *cell);
 
diff --git a/src/mesa/pipe/cell/ppu/cell_context.c b/src/mesa/pipe/cell/ppu/cell_context.c
index 8cb0c48f40..e8020a49bc 100644
--- a/src/mesa/pipe/cell/ppu/cell_context.c
+++ b/src/mesa/pipe/cell/ppu/cell_context.c
@@ -254,8 +254,9 @@ cell_create_context(struct pipe_winsys *winsys, struct cell_winsys *cws)
 
    cell_start_spus(cell);
 
-   for (buf = 0; buf < CELL_NUM_BATCH_BUFFERS; buf++) {
-      cell->batch_buffer_size[buf] = 0;
+   /* init command, vertex/index buffer info */
+   for (buf = 0; buf < CELL_NUM_BUFFERS; buf++) {
+      cell->buffer_size[buf] = 0;
 
       /* init batch buffer status values,
        * mark 0th buffer as used, rest as free.
diff --git a/src/mesa/pipe/cell/ppu/cell_context.h b/src/mesa/pipe/cell/ppu/cell_context.h
index 3bd88bfd5b..de65fb5e9a 100644
--- a/src/mesa/pipe/cell/ppu/cell_context.h
+++ b/src/mesa/pipe/cell/ppu/cell_context.h
@@ -102,12 +102,14 @@ struct cell_context
 
    uint num_spus;
 
-   uint batch_buffer_size[CELL_NUM_BATCH_BUFFERS];
-   ubyte batch_buffer[CELL_NUM_BATCH_BUFFERS][CELL_BATCH_BUFFER_SIZE] ALIGN16_ATTRIB;
-   int cur_batch;  /**< which batch buffer is being filled */
+   /** Buffers for command batches, vertex/index data */
+   uint buffer_size[CELL_NUM_BUFFERS];
+   ubyte buffer[CELL_NUM_BUFFERS][CELL_BUFFER_SIZE] ALIGN16_ATTRIB;
+
+   int cur_batch;  /**< which buffer is being filled w/ commands */
 
    /** [4] to ensure 16-byte alignment for each status word */
-   uint buffer_status[CELL_MAX_SPUS][CELL_NUM_BATCH_BUFFERS][4] ALIGN16_ATTRIB;
+   uint buffer_status[CELL_MAX_SPUS][CELL_NUM_BUFFERS][4] ALIGN16_ATTRIB;
 
 };
 
diff --git a/src/mesa/pipe/cell/ppu/cell_spu.c b/src/mesa/pipe/cell/ppu/cell_spu.c
index 4627bc8d1f..7c83a47e57 100644
--- a/src/mesa/pipe/cell/ppu/cell_spu.c
+++ b/src/mesa/pipe/cell/ppu/cell_spu.c
@@ -111,8 +111,8 @@ cell_start_spus(struct cell_context *cell)
       cell_global.inits[i].id = i;
       cell_global.inits[i].num_spus = cell->num_spus;
       cell_global.inits[i].cmd = &cell_global.command[i];
-      for (j = 0; j < CELL_NUM_BATCH_BUFFERS; j++) {
-         cell_global.inits[i].batch_buffers[j] = cell->batch_buffer[j];
+      for (j = 0; j < CELL_NUM_BUFFERS; j++) {
+         cell_global.inits[i].buffers[j] = cell->buffer[j];
       }
       cell_global.inits[i].buffer_status = &cell->buffer_status[0][0][0];
 
diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index 0c83900a18..2097683b82 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -473,22 +473,22 @@ cmd_finish(void)
 
 
 /**
- * Tell the PPU that this SPU has finished copying a batch buffer to
+ * Tell the PPU that this SPU has finished copying a buffer to
  * local store and that it may be reused by the PPU.
  * This is done by writting a 16-byte batch-buffer-status block back into
- * main memory (in cell_contex->buffer_status[]).
+ * main memory (in cell_context->buffer_status[]).
  */
 static void
-release_batch_buffer(uint buffer)
+release_buffer(uint buffer)
 {
    /* Evidently, using less than a 16-byte status doesn't work reliably */
    static const uint status[4] ALIGN16_ATTRIB
       = {CELL_BUFFER_STATUS_FREE, 0, 0, 0};
 
-   const uint index = 4 * (spu.init.id * CELL_NUM_BATCH_BUFFERS + buffer);
+   const uint index = 4 * (spu.init.id * CELL_NUM_BUFFERS + buffer);
    uint *dst = spu.init.buffer_status + index;
 
-   ASSERT(buffer < CELL_NUM_BATCH_BUFFERS);
+   ASSERT(buffer < CELL_NUM_BUFFERS);
 
    /*
    printf("SPU %u: Set batch status buf=%u, index %u, at %p to FREE\n",
@@ -513,24 +513,24 @@ cmd_batch(uint opcode)
 {
    const uint buf = (opcode >> 8) & 0xff;
    uint size = (opcode >> 16);
-   uint buffer[CELL_BATCH_BUFFER_SIZE / 4] ALIGN16_ATTRIB;
+   uint buffer[CELL_BUFFER_SIZE / 4] ALIGN16_ATTRIB;
    const uint usize = size / sizeof(uint);
    uint pos;
 
    if (Debug)
       printf("SPU %u: BATCH buffer %u, len %u, from %p\n",
-             spu.init.id, buf, size, spu.init.batch_buffers[buf]);
+             spu.init.id, buf, size, spu.init.buffers[buf]);
 
    ASSERT((opcode & CELL_CMD_OPCODE_MASK) == CELL_CMD_BATCH);
 
-   ASSERT_ALIGN16(spu.init.batch_buffers[buf]);
+   ASSERT_ALIGN16(spu.init.buffers[buf]);
 
    size = ROUNDUP16(size);
 
-   ASSERT_ALIGN16(spu.init.batch_buffers[buf]);
+   ASSERT_ALIGN16(spu.init.buffers[buf]);
 
    mfc_get(buffer,  /* dest */
-           (unsigned int) spu.init.batch_buffers[buf],  /* src */
+           (unsigned int) spu.init.buffers[buf],  /* src */
            size,
            TAG_BATCH_BUFFER,
            0, /* tid */
@@ -538,7 +538,7 @@ cmd_batch(uint opcode)
    wait_on_mask(1 << TAG_BATCH_BUFFER);
 
    /* Tell PPU we're done copying the buffer to local store */
-   release_batch_buffer(buf);
+   release_buffer(buf);
 
    for (pos = 0; pos < usize; /* no incr */) {
       switch (buffer[pos]) {
-- 
cgit v1.2.3


From aaea9a121bc739db87e539214c23f76d4cd5bf49 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 28 Jan 2008 09:57:13 -0700
Subject: Cell: additional debug code, misc clean-up

---
 src/mesa/pipe/cell/ppu/cell_batch.c | 52 +++++++++++++++++++++++++++++--------
 src/mesa/pipe/cell/ppu/cell_batch.h |  2 +-
 2 files changed, 42 insertions(+), 12 deletions(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/ppu/cell_batch.c b/src/mesa/pipe/cell/ppu/cell_batch.c
index 178caa74e1..2d032fc902 100644
--- a/src/mesa/pipe/cell/ppu/cell_batch.c
+++ b/src/mesa/pipe/cell/ppu/cell_batch.c
@@ -35,7 +35,7 @@
 uint
 cell_get_empty_buffer(struct cell_context *cell)
 {
-   uint buf = 0;
+   uint buf = 0, tries = 0;
 
    /* Find a buffer that's marked as free by all SPUs */
    while (1) {
@@ -50,6 +50,9 @@ cell_get_empty_buffer(struct cell_context *cell)
                for (spu = 0; spu < cell->num_spus; spu++) {
                   cell->buffer_status[spu][buf][0] = CELL_BUFFER_STATUS_USED;
                }
+               /*
+               printf("PPU: ALLOC BUFFER %u\n", buf);
+               */
                return buf;
             }
          }
@@ -60,11 +63,17 @@ cell_get_empty_buffer(struct cell_context *cell)
 
       /* try next buf */
       buf = (buf + 1) % CELL_NUM_BUFFERS;
+
+      tries++;
+      if (tries == 100) {
+         /*
+         printf("PPU WAITING for buffer...\n");
+         */
+      }
    }
 }
 
 
-
 void
 cell_batch_flush(struct cell_context *cell)
 {
@@ -120,29 +129,39 @@ cell_batch_free_space(const struct cell_context *cell)
 
 
 /**
- * \param cmd  command to append
- * \param length  command size in bytes
+ * Append data to current batch.
  */
 void
-cell_batch_append(struct cell_context *cell, const void *cmd, uint length)
+cell_batch_append(struct cell_context *cell, const void *data, uint bytes)
 {
    uint size;
 
-   assert(length % 4 == 0);
-   assert(cell->cur_batch >= 0);
+   ASSERT(bytes % 4 == 0);
+   ASSERT(bytes <= CELL_BUFFER_SIZE);
+   ASSERT(cell->cur_batch >= 0);
+
+#ifdef ASSERT
+   {
+      uint spu;
+      for (spu = 0; spu < cell->num_spus; spu++) {
+         ASSERT(cell->buffer_status[spu][cell->cur_batch][0]
+                 == CELL_BUFFER_STATUS_USED);
+      }
+   }
+#endif
 
    size = cell->buffer_size[cell->cur_batch];
 
-   if (size + length > CELL_BUFFER_SIZE) {
+   if (size + bytes > CELL_BUFFER_SIZE) {
       cell_batch_flush(cell);
       size = 0;
    }
 
-   assert(size + length <= CELL_BUFFER_SIZE);
+   assert(size + bytes <= CELL_BUFFER_SIZE);
 
-   memcpy(cell->buffer[cell->cur_batch] + size, cmd, length);
+   memcpy(cell->buffer[cell->cur_batch] + size, data, bytes);
 
-   cell->buffer_size[cell->cur_batch] = size + length;
+   cell->buffer_size[cell->cur_batch] = size + bytes;
 }
 
 
@@ -153,9 +172,20 @@ cell_batch_alloc(struct cell_context *cell, uint bytes)
    uint size;
 
    ASSERT(bytes % 4 == 0);
+   ASSERT(bytes <= CELL_BUFFER_SIZE);
 
    assert(cell->cur_batch >= 0);
 
+#ifdef ASSERT
+   {
+      uint spu;
+      for (spu = 0; spu < cell->num_spus; spu++) {
+         ASSERT(cell->buffer_status[spu][cell->cur_batch][0]
+                 == CELL_BUFFER_STATUS_USED);
+      }
+   }
+#endif
+
    size = cell->buffer_size[cell->cur_batch];
 
    if (size + bytes > CELL_BUFFER_SIZE) {
diff --git a/src/mesa/pipe/cell/ppu/cell_batch.h b/src/mesa/pipe/cell/ppu/cell_batch.h
index b4c96f465a..f4f37314a4 100644
--- a/src/mesa/pipe/cell/ppu/cell_batch.h
+++ b/src/mesa/pipe/cell/ppu/cell_batch.h
@@ -45,7 +45,7 @@ extern uint
 cell_batch_free_space(const struct cell_context *cell);
 
 extern void
-cell_batch_append(struct cell_context *cell, const void *cmd, uint length);
+cell_batch_append(struct cell_context *cell, const void *data, uint bytes);
 
 extern void *
 cell_batch_alloc(struct cell_context *cell, uint bytes);
-- 
cgit v1.2.3


From 200dcb4760960f0d9c74a7053de63337e93dd85b Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 28 Jan 2008 09:57:51 -0700
Subject: Cell: If flushing for swapbuffers, wait for frame completion

---
 src/mesa/pipe/cell/ppu/cell_flush.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/ppu/cell_flush.c b/src/mesa/pipe/cell/ppu/cell_flush.c
index b98bb566b1..cf4e676645 100644
--- a/src/mesa/pipe/cell/ppu/cell_flush.c
+++ b/src/mesa/pipe/cell/ppu/cell_flush.c
@@ -39,6 +39,9 @@ cell_flush(struct pipe_context *pipe, unsigned flags)
 {
    struct cell_context *cell = cell_context(pipe);
 
+   if (flags & PIPE_FLUSH_SWAPBUFFERS)
+      flags |= PIPE_FLUSH_WAIT;
+
    draw_flush( cell->draw );
    cell_flush_int(pipe, flags);
 }
-- 
cgit v1.2.3


From 7024019d4e6e2a1618e910a127bea8c3b7661a54 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 28 Jan 2008 10:00:27 -0700
Subject: Cell: checkpoint commit: always inline prim indexes into batch buffer

Also add an explicit release-vertex-buffer command (CELL_CMD_RELEASE_VERTS).
Lots of debug/stale code is still in place...
---
 src/mesa/pipe/cell/common.h        |  12 ++++
 src/mesa/pipe/cell/ppu/cell_vbuf.c | 113 ++++++++++++++++++++++++++-----------
 src/mesa/pipe/cell/spu/spu_main.c  | 110 +++++++++++++++++++++++++-----------
 src/mesa/pipe/cell/spu/spu_main.h  |   2 +
 4 files changed, 171 insertions(+), 66 deletions(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/common.h b/src/mesa/pipe/cell/common.h
index ce9c381907..31637ed1cc 100644
--- a/src/mesa/pipe/cell/common.h
+++ b/src/mesa/pipe/cell/common.h
@@ -75,6 +75,7 @@
 #define CELL_CMD_FINISH               3
 #define CELL_CMD_RENDER               4
 #define CELL_CMD_BATCH                5
+#define CELL_CMD_RELEASE_VERTS        6
 #define CELL_CMD_STATE_FRAMEBUFFER   10
 #define CELL_CMD_STATE_DEPTH_STENCIL 11
 #define CELL_CMD_STATE_SAMPLER       12
@@ -124,7 +125,11 @@ struct cell_command_render
    uint vertex_size;  /**< bytes per vertex */
    uint dummy;        /* XXX this dummy field works around a compiler bug */
    uint num_indexes;
+#if 0
    const void *vertex_data;
+#else
+   uint vertex_buf;  /**< which cell->buffer[] contains the vertex data */
+#endif
    const ushort *index_data;
    float xmin, ymin, xmax, ymax;
    boolean inline_indexes;
@@ -132,6 +137,13 @@ struct cell_command_render
 } ALIGN16_ATTRIB;
 
 
+struct cell_command_release_verts
+{
+   int opcode;         /**< CELL_CMD_RELEASE_VERTS */
+   uint vertex_buf;    /**< in [0, CELL_NUM_BUFFERS-1] */
+};
+
+
 /** XXX unions don't seem to work */
 struct cell_command
 {
diff --git a/src/mesa/pipe/cell/ppu/cell_vbuf.c b/src/mesa/pipe/cell/ppu/cell_vbuf.c
index ee572b3a51..6e12e16fe0 100644
--- a/src/mesa/pipe/cell/ppu/cell_vbuf.c
+++ b/src/mesa/pipe/cell/ppu/cell_vbuf.c
@@ -40,8 +40,8 @@
 
 
 /** Allow prim indexes, verts to be inlined after RENDER command */
-#define ALLOW_INLINE_INDEXES 1
-#define ALLOW_INLINE_VERTS 1
+#define ALLOW_INLINE_INDEXES 01
+#define ALLOW_INLINE_VERTS 0
 
 
 /**
@@ -55,6 +55,9 @@ struct cell_vbuf_render
    uint prim;
    uint vertex_size;
    void *vertex_buffer;
+#if 1
+   uint vertex_buf;
+#endif
 };
 
 
@@ -81,13 +84,52 @@ cell_vbuf_allocate_vertices(struct vbuf_render *vbr,
 {
    struct cell_vbuf_render *cvbr = cell_vbuf_render(vbr);
    /*printf("Alloc verts %u * %u\n", vertex_size, nr_vertices);*/
+#if 0
    assert(!cvbr->vertex_buffer);
    cvbr->vertex_buffer = align_malloc(vertex_size * nr_vertices, 16);
+#else
+   assert(cvbr->vertex_buf == ~0);
+   cvbr->vertex_buf = cell_get_empty_buffer(cvbr->cell);
+   cvbr->vertex_buffer = cvbr->cell->buffer[cvbr->vertex_buf];
+   printf("%s vertex_buf = %u\n", __FUNCTION__, cvbr->vertex_buf);
+#endif
    cvbr->vertex_size = vertex_size;
    return cvbr->vertex_buffer;
 }
 
 
+static void
+cell_vbuf_release_vertices(struct vbuf_render *vbr, void *vertices, 
+                           unsigned vertex_size, unsigned vertices_used)
+{
+   struct cell_vbuf_render *cvbr = cell_vbuf_render(vbr);
+   struct cell_context *cell = cvbr->cell;
+
+   /*printf("Free verts %u * %u\n", vertex_size, vertices_used);*/
+#if 0
+   align_free(vertices);
+#else
+   printf("%s vertex_buf = %u  count = %u\n",
+          __FUNCTION__, cvbr->vertex_buf, vertices_used);
+
+   {
+      struct cell_command_release_verts *release
+         = (struct cell_command_release_verts *)
+         cell_batch_alloc(cell, sizeof(struct cell_command_release_verts));
+      release->opcode = CELL_CMD_RELEASE_VERTS;
+      release->vertex_buf = cvbr->vertex_buf;
+   }
+
+   cvbr->vertex_buf = ~0;
+   cell_flush_int(&cell->pipe, 0x0);/*NEW*/
+#endif
+
+   assert(vertices == cvbr->vertex_buffer);
+   cvbr->vertex_buffer = NULL;
+}
+
+
+
 static void
 cell_vbuf_set_primitive(struct vbuf_render *vbr, unsigned prim)
 {
@@ -124,7 +166,7 @@ cell_vbuf_draw(struct vbuf_render *vbr,
       printf("%u %u %u, ", indices[i+0], indices[i+1], indices[i+2]);
    }
    printf("\n");
-#elif 0
+#elif 01
    printf("cell_vbuf_draw() nr_indices = %u nr_verts = %u  indexes = [%u %u %u ...]\n",
           nr_indices, nr_vertices,
           indices[0], indices[1], indices[2]);
@@ -157,28 +199,26 @@ cell_vbuf_draw(struct vbuf_render *vbr,
       const uint index_bytes = ROUNDUP4(nr_indices * 2);
       const uint vertex_bytes = nr_vertices * 4 * cell->vertex_info.size;
 
+      const uint batch_size = sizeof(struct cell_command_render)
+         + index_bytes;
+
       struct cell_command_render *render
          = (struct cell_command_render *)
-         cell_batch_alloc(cell, sizeof(*render));
+         cell_batch_alloc(cell, batch_size);
+
       render->opcode = CELL_CMD_RENDER;
       render->prim_type = cvbr->prim;
 
       render->num_indexes = nr_indices;
-      if (ALLOW_INLINE_INDEXES &&
-          index_bytes <= cell_batch_free_space(cell)) {
-         /* indices inlined, right after render cmd */
-         void *dst = cell_batch_alloc(cell, index_bytes);
-         memcpy(dst, indices, nr_indices * 2);
-         render->inline_indexes = TRUE;
-         render->index_data = NULL;
-      }
-      else {
-         /* indices in separate buffer */
-         render->inline_indexes = FALSE;
-         render->index_data = indices;
-         ASSERT_ALIGN16(render->index_data);
-      }
 
+      /* append indices after render command */
+      memcpy(render + 1, indices, nr_indices * 2);
+      render->inline_indexes = TRUE;
+      render->index_data = NULL;
+
+      /* if there's room, append vertices after the indices, else leave
+       * vertices in the original/separate buffer.
+       */
       render->vertex_size = 4 * cell->vertex_info.size;
       render->num_verts = nr_vertices;
       if (ALLOW_INLINE_VERTS &&
@@ -188,12 +228,21 @@ cell_vbuf_draw(struct vbuf_render *vbr,
          void *dst = cell_batch_alloc(cell, vertex_bytes);
          memcpy(dst, vertices, vertex_bytes);
          render->inline_verts = TRUE;
+#if 0
          render->vertex_data = NULL;
+#else
+         render->vertex_buf = ~0;
+#endif
       }
       else {
          render->inline_verts = FALSE;
+#if 0
          render->vertex_data = vertices;
          ASSERT_ALIGN16(render->vertex_data);
+#else
+         ASSERT(cvbr->vertex_buf >= 0);
+         render->vertex_buf = cvbr->vertex_buf;
+#endif
       }
 
 
@@ -203,27 +252,13 @@ cell_vbuf_draw(struct vbuf_render *vbr,
       render->ymax = ymax;
    }
 
-#if 01
+#if 0
    /* XXX this is temporary */
    cell_flush_int(&cell->pipe, PIPE_FLUSH_WAIT);
 #endif
 }
 
 
-static void
-cell_vbuf_release_vertices(struct vbuf_render *vbr, void *vertices, 
-                           unsigned vertex_size, unsigned vertices_used)
-{
-   struct cell_vbuf_render *cvbr = cell_vbuf_render(vbr);
-
-   /*printf("Free verts %u * %u\n", vertex_size, vertices_used);*/
-   align_free(vertices);
-
-   assert(vertices == cvbr->vertex_buffer);
-   cvbr->vertex_buffer = NULL;
-}
-
-
 static void
 cell_vbuf_destroy(struct vbuf_render *vbr)
 {
@@ -244,8 +279,17 @@ cell_init_vbuf(struct cell_context *cell)
 
    cell->vbuf_render = CALLOC_STRUCT(cell_vbuf_render);
 
+#if 0
    cell->vbuf_render->base.max_indices = CELL_MAX_VBUF_INDEXES;
    cell->vbuf_render->base.max_vertex_buffer_bytes = CELL_MAX_VBUF_SIZE;
+#else
+   cell->vbuf_render->base.max_indices
+      = (CELL_BUFFER_SIZE
+         - sizeof(struct cell_command_render)
+         - sizeof(struct cell_command_release_verts))
+      / sizeof(ushort);
+   cell->vbuf_render->base.max_vertex_buffer_bytes = CELL_BUFFER_SIZE;
+#endif
 
    cell->vbuf_render->base.get_vertex_info = cell_vbuf_get_vertex_info;
    cell->vbuf_render->base.allocate_vertices = cell_vbuf_allocate_vertices;
@@ -255,6 +299,9 @@ cell_init_vbuf(struct cell_context *cell)
    cell->vbuf_render->base.destroy = cell_vbuf_destroy;
 
    cell->vbuf_render->cell = cell;
+#if 1
+   cell->vbuf_render->vertex_buf = ~0;
+#endif
 
    cell->vbuf = draw_vbuf_stage(cell->draw, &cell->vbuf_render->base);
 }
diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index 2097683b82..eb979718f8 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -69,6 +69,32 @@ wait_on_mask_all(unsigned tagMask)
 }
 
 
+/**
+ * Tell the PPU that this SPU has finished copying a buffer to
+ * local store and that it may be reused by the PPU.
+ * This is done by writting a 16-byte batch-buffer-status block back into
+ * main memory (in cell_context->buffer_status[]).
+ */
+static void
+release_buffer(uint buffer)
+{
+   /* Evidently, using less than a 16-byte status doesn't work reliably */
+   static const uint status[4] ALIGN16_ATTRIB
+      = {CELL_BUFFER_STATUS_FREE, 0, 0, 0};
+
+   const uint index = 4 * (spu.init.id * CELL_NUM_BUFFERS + buffer);
+   uint *dst = spu.init.buffer_status + index;
+
+   ASSERT(buffer < CELL_NUM_BUFFERS);
+
+   mfc_put((void *) &status,    /* src in local memory */
+           (unsigned int) dst,  /* dst in main memory */
+           sizeof(status),      /* size */
+           TAG_MISC,            /* tag is unimportant */
+           0, /* tid */
+           0  /* rid */);
+}
+
 
 /**
  * For tiles whose status is TILE_STATUS_CLEAR, write solid-filled
@@ -237,13 +263,18 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
       printf("       bound: %g, %g .. %g, %g\n",
              render->xmin, render->ymin, render->xmax, render->ymax);
       */
+      /*
       printf("SPU %u: indices at %p  vertices at %p\n",
              spu.init.id,
              render->index_data, render->vertex_data);
+      */
    }
 
    ASSERT(sizeof(*render) % 4 == 0);
+#if 0
    ASSERT_ALIGN16(render->vertex_data);
+#else
+#endif
    ASSERT_ALIGN16(render->index_data);
 
 
@@ -251,10 +282,18 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
     ** Get vertex, index buffers if not inlined
     **/
    if (!render->inline_verts) {
+      void *src;
       ASSERT(total_vertex_bytes % 16 == 0);
 
+#if 0
+      src = render->vertex_data;
+#else
+      spu.cur_vertex_buf = render->vertex_buf;
+      src = spu.init.buffers[render->vertex_buf];
+#endif
+
       mfc_get(vertex_data,  /* dest */
-              (unsigned int) render->vertex_data,  /* src */
+              (unsigned int) src,
               total_vertex_bytes,  /* size */
               TAG_VERTEX_BUFFER,
               0, /* tid */
@@ -298,6 +337,7 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
          /* vertices are after indexes, if inlined */
          vertices = (const ubyte *) (render + 1) + *pos_incr * 4;
          *pos_incr = *pos_incr + total_vertex_bytes / 4;
+         spu.cur_vertex_buf = ~0;
       }
    }
 
@@ -310,6 +350,12 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
       mask |= (1 << TAG_INDEX_BUFFER);
    wait_on_mask_all(mask);
 
+#if 0
+   if (!render->inline_verts) {
+      printf("SPU %u: release vbuf %u\n", spu.init.id, render->vertex_buf);
+      release_buffer(render->vertex_buf);
+   }
+#endif
 
    /**
     ** find tiles which intersect the prim bounding box
@@ -359,6 +405,14 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
       for (j = 0; j < render->num_indexes; j += 3) {
          const float *v0, *v1, *v2;
 
+         if (indexes[j] == 0xffff) {
+            printf("index[%u] = 0xffff\n", j);
+         }
+
+         ASSERT(indexes[j] != 0xffff);
+         ASSERT(indexes[j+1] != 0xffff);
+         ASSERT(indexes[j+2] != 0xffff);
+
          v0 = (const float *) (vertices + indexes[j+0] * vertex_size);
          v1 = (const float *) (vertices + indexes[j+1] * vertex_size);
          v2 = (const float *) (vertices + indexes[j+2] * vertex_size);
@@ -391,6 +445,17 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
 }
 
 
+static void
+cmd_release_verts(const struct cell_command_release_verts *release)
+{
+   if (Debug)
+      printf("SPU %u: RELEASE VERTS %u\n",
+             spu.init.id, spu.cur_vertex_buf);
+   ASSERT(spu.cur_vertex_buf == release->vertex_buf);
+   release_buffer(release->vertex_buf);
+}
+
+
 static void
 cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
 {
@@ -472,38 +537,6 @@ cmd_finish(void)
 }
 
 
-/**
- * Tell the PPU that this SPU has finished copying a buffer to
- * local store and that it may be reused by the PPU.
- * This is done by writting a 16-byte batch-buffer-status block back into
- * main memory (in cell_context->buffer_status[]).
- */
-static void
-release_buffer(uint buffer)
-{
-   /* Evidently, using less than a 16-byte status doesn't work reliably */
-   static const uint status[4] ALIGN16_ATTRIB
-      = {CELL_BUFFER_STATUS_FREE, 0, 0, 0};
-
-   const uint index = 4 * (spu.init.id * CELL_NUM_BUFFERS + buffer);
-   uint *dst = spu.init.buffer_status + index;
-
-   ASSERT(buffer < CELL_NUM_BUFFERS);
-
-   /*
-   printf("SPU %u: Set batch status buf=%u, index %u, at %p to FREE\n",
-          spu.init.id, buffer, index, dst);
-   */
-
-   mfc_put((void *) &status,    /* src in local memory */
-           (unsigned int) dst,  /* dst in main memory */
-           sizeof(status),      /* size */
-           TAG_MISC,            /* tag is unimportant */
-           0, /* tid */
-           0  /* rid */);
-}
-
-
 /**
  * Execute a batch of commands
  * The opcode param encodes the location of the buffer and its size.
@@ -538,6 +571,8 @@ cmd_batch(uint opcode)
    wait_on_mask(1 << TAG_BATCH_BUFFER);
 
    /* Tell PPU we're done copying the buffer to local store */
+   if (Debug)
+      printf("SPU %u: release batch buf %u\n", spu.init.id, buf);
    release_buffer(buf);
 
    for (pos = 0; pos < usize; /* no incr */) {
@@ -567,6 +602,15 @@ cmd_batch(uint opcode)
             pos += sizeof(*render) / 4 + pos_incr;
          }
          break;
+      case CELL_CMD_RELEASE_VERTS:
+         {
+            struct cell_command_release_verts *release
+               = (struct cell_command_release_verts *) &buffer[pos];
+            cmd_release_verts(release);
+            ASSERT(sizeof(*release) == 8);
+            pos += sizeof(*release) / 4;
+         }
+         break;
       case CELL_CMD_FINISH:
          cmd_finish();
          pos += 1;
diff --git a/src/mesa/pipe/cell/spu/spu_main.h b/src/mesa/pipe/cell/spu/spu_main.h
index 5bc5d9fa99..68c7263b7f 100644
--- a/src/mesa/pipe/cell/spu/spu_main.h
+++ b/src/mesa/pipe/cell/spu/spu_main.h
@@ -65,6 +65,8 @@ struct spu_global
 
    /* XXX more state to come */
 
+   uint cur_vertex_buf;
+
 } ALIGN16_ATTRIB;
 
 
-- 
cgit v1.2.3


From 5b5ec94663d566b4840975c4ef4740abb138bb12 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 28 Jan 2008 10:41:27 -0700
Subject: Cell: clean-up of render path

Finally removed a number of unneeded flush commands.  Vertex buffers are
now allocated from the general buffer pool and freed by the SPUs when done.
There is still an occasional failed assertion (invalid batch buffer command)...
---
 src/mesa/pipe/cell/common.h        |  12 +---
 src/mesa/pipe/cell/ppu/cell_vbuf.c |  60 ++++++--------------
 src/mesa/pipe/cell/spu/spu_main.c  | 112 +++++++------------------------------
 src/mesa/pipe/cell/spu/spu_main.h  |   2 -
 4 files changed, 38 insertions(+), 148 deletions(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/common.h b/src/mesa/pipe/cell/common.h
index 31637ed1cc..d6e1dd4f7d 100644
--- a/src/mesa/pipe/cell/common.h
+++ b/src/mesa/pipe/cell/common.h
@@ -68,7 +68,7 @@
  * The low byte of a mailbox word contains the command opcode.
  * Remaining higher bytes are command specific.
  */
-#define CELL_CMD_OPCODE_MASK 0xf
+#define CELL_CMD_OPCODE_MASK 0xff
 
 #define CELL_CMD_EXIT                 1
 #define CELL_CMD_CLEAR_SURFACE        2
@@ -113,10 +113,6 @@ struct cell_command_clear_surface
 } ALIGN16_ATTRIB;
 
 
-#define CELL_MAX_VBUF_SIZE    (16 * 1024)
-#define CELL_MAX_VBUF_INDEXES 1024
-
-
 struct cell_command_render
 {
    uint opcode;       /**< CELL_CMD_RENDER */
@@ -125,14 +121,8 @@ struct cell_command_render
    uint vertex_size;  /**< bytes per vertex */
    uint dummy;        /* XXX this dummy field works around a compiler bug */
    uint num_indexes;
-#if 0
-   const void *vertex_data;
-#else
    uint vertex_buf;  /**< which cell->buffer[] contains the vertex data */
-#endif
-   const ushort *index_data;
    float xmin, ymin, xmax, ymax;
-   boolean inline_indexes;
    boolean inline_verts;
 } ALIGN16_ATTRIB;
 
diff --git a/src/mesa/pipe/cell/ppu/cell_vbuf.c b/src/mesa/pipe/cell/ppu/cell_vbuf.c
index 6e12e16fe0..b2a25d767b 100644
--- a/src/mesa/pipe/cell/ppu/cell_vbuf.c
+++ b/src/mesa/pipe/cell/ppu/cell_vbuf.c
@@ -39,9 +39,8 @@
 #include "pipe/draw/draw_vbuf.h"
 
 
-/** Allow prim indexes, verts to be inlined after RENDER command */
-#define ALLOW_INLINE_INDEXES 01
-#define ALLOW_INLINE_VERTS 0
+/** Allow vertex data to be inlined after RENDER command */
+#define ALLOW_INLINE_VERTS 1
 
 
 /**
@@ -52,12 +51,10 @@ struct cell_vbuf_render
 {
    struct vbuf_render base;
    struct cell_context *cell;
-   uint prim;
-   uint vertex_size;
-   void *vertex_buffer;
-#if 1
-   uint vertex_buf;
-#endif
+   uint prim;            /**< PIPE_PRIM_x */
+   uint vertex_size;     /**< in bytes */
+   void *vertex_buffer;  /**< just for debug, really */
+   uint vertex_buf;      /**< in [0, CELL_NUM_BUFFERS-1] */
 };
 
 
@@ -84,15 +81,10 @@ cell_vbuf_allocate_vertices(struct vbuf_render *vbr,
 {
    struct cell_vbuf_render *cvbr = cell_vbuf_render(vbr);
    /*printf("Alloc verts %u * %u\n", vertex_size, nr_vertices);*/
-#if 0
-   assert(!cvbr->vertex_buffer);
-   cvbr->vertex_buffer = align_malloc(vertex_size * nr_vertices, 16);
-#else
+
    assert(cvbr->vertex_buf == ~0);
    cvbr->vertex_buf = cell_get_empty_buffer(cvbr->cell);
    cvbr->vertex_buffer = cvbr->cell->buffer[cvbr->vertex_buf];
-   printf("%s vertex_buf = %u\n", __FUNCTION__, cvbr->vertex_buf);
-#endif
    cvbr->vertex_size = vertex_size;
    return cvbr->vertex_buffer;
 }
@@ -105,14 +97,13 @@ cell_vbuf_release_vertices(struct vbuf_render *vbr, void *vertices,
    struct cell_vbuf_render *cvbr = cell_vbuf_render(vbr);
    struct cell_context *cell = cvbr->cell;
 
-   /*printf("Free verts %u * %u\n", vertex_size, vertices_used);*/
-#if 0
-   align_free(vertices);
-#else
+   /*
    printf("%s vertex_buf = %u  count = %u\n",
           __FUNCTION__, cvbr->vertex_buf, vertices_used);
+   */
 
-   {
+   /* Tell SPUs they can release the vert buf */
+   if (cvbr->vertex_buf != ~0U) {
       struct cell_command_release_verts *release
          = (struct cell_command_release_verts *)
          cell_batch_alloc(cell, sizeof(struct cell_command_release_verts));
@@ -121,8 +112,7 @@ cell_vbuf_release_vertices(struct vbuf_render *vbr, void *vertices,
    }
 
    cvbr->vertex_buf = ~0;
-   cell_flush_int(&cell->pipe, 0x0);/*NEW*/
-#endif
+   cell_flush_int(&cell->pipe, 0x0);
 
    assert(vertices == cvbr->vertex_buffer);
    cvbr->vertex_buffer = NULL;
@@ -166,7 +156,7 @@ cell_vbuf_draw(struct vbuf_render *vbr,
       printf("%u %u %u, ", indices[i+0], indices[i+1], indices[i+2]);
    }
    printf("\n");
-#elif 01
+#elif 0
    printf("cell_vbuf_draw() nr_indices = %u nr_verts = %u  indexes = [%u %u %u ...]\n",
           nr_indices, nr_vertices,
           indices[0], indices[1], indices[2]);
@@ -213,8 +203,6 @@ cell_vbuf_draw(struct vbuf_render *vbr,
 
       /* append indices after render command */
       memcpy(render + 1, indices, nr_indices * 2);
-      render->inline_indexes = TRUE;
-      render->index_data = NULL;
 
       /* if there's room, append vertices after the indices, else leave
        * vertices in the original/separate buffer.
@@ -222,30 +210,20 @@ cell_vbuf_draw(struct vbuf_render *vbr,
       render->vertex_size = 4 * cell->vertex_info.size;
       render->num_verts = nr_vertices;
       if (ALLOW_INLINE_VERTS &&
-         render->inline_indexes &&
           vertex_bytes <= cell_batch_free_space(cell)) {
          /* vertex data inlined, after indices */
          void *dst = cell_batch_alloc(cell, vertex_bytes);
          memcpy(dst, vertices, vertex_bytes);
          render->inline_verts = TRUE;
-#if 0
-         render->vertex_data = NULL;
-#else
          render->vertex_buf = ~0;
-#endif
       }
       else {
+         /* vertex data in separate buffer */
          render->inline_verts = FALSE;
-#if 0
-         render->vertex_data = vertices;
-         ASSERT_ALIGN16(render->vertex_data);
-#else
          ASSERT(cvbr->vertex_buf >= 0);
          render->vertex_buf = cvbr->vertex_buf;
-#endif
       }
 
-
       render->xmin = xmin;
       render->ymin = ymin;
       render->xmax = xmax;
@@ -253,7 +231,7 @@ cell_vbuf_draw(struct vbuf_render *vbr,
    }
 
 #if 0
-   /* XXX this is temporary */
+   /* helpful for debug */
    cell_flush_int(&cell->pipe, PIPE_FLUSH_WAIT);
 #endif
 }
@@ -279,17 +257,15 @@ cell_init_vbuf(struct cell_context *cell)
 
    cell->vbuf_render = CALLOC_STRUCT(cell_vbuf_render);
 
-#if 0
-   cell->vbuf_render->base.max_indices = CELL_MAX_VBUF_INDEXES;
-   cell->vbuf_render->base.max_vertex_buffer_bytes = CELL_MAX_VBUF_SIZE;
-#else
+   /* The max number of indexes is what can fit into a batch buffer,
+    * minus the render and release-verts commands.
+    */
    cell->vbuf_render->base.max_indices
       = (CELL_BUFFER_SIZE
          - sizeof(struct cell_command_render)
          - sizeof(struct cell_command_release_verts))
       / sizeof(ushort);
    cell->vbuf_render->base.max_vertex_buffer_bytes = CELL_BUFFER_SIZE;
-#endif
 
    cell->vbuf_render->base.get_vertex_info = cell_vbuf_get_vertex_info;
    cell->vbuf_render->base.allocate_vertices = cell_vbuf_allocate_vertices;
diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index eb979718f8..5b50ec6953 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -239,59 +239,45 @@ static void
 cmd_render(const struct cell_command_render *render, uint *pos_incr)
 {
    /* we'll DMA into these buffers */
-   ubyte vertex_data[CELL_MAX_VBUF_SIZE] ALIGN16_ATTRIB;
-   ushort index_data[CELL_MAX_VBUF_INDEXES] ALIGN16_ATTRIB;
+   ubyte vertex_data[CELL_BUFFER_SIZE] ALIGN16_ATTRIB;
    const uint vertex_size = render->vertex_size; /* in bytes */
    const uint total_vertex_bytes = render->num_verts * vertex_size;
    const ubyte *vertices;
    const ushort *indexes;
-   uint mask;
    uint i, j;
 
 
    if (Debug) {
       printf("SPU %u: RENDER prim %u, num_vert=%u  num_ind=%u  "
-             "inline_vert=%u  inline_ind=%u\n",
+             "inline_vert=%u\n",
              spu.init.id,
              render->prim_type,
              render->num_verts,
              render->num_indexes,
-             render->inline_verts,
-             render->inline_indexes);
+             render->inline_verts);
 
       /*
       printf("       bound: %g, %g .. %g, %g\n",
              render->xmin, render->ymin, render->xmax, render->ymax);
       */
-      /*
-      printf("SPU %u: indices at %p  vertices at %p\n",
-             spu.init.id,
-             render->index_data, render->vertex_data);
-      */
    }
 
    ASSERT(sizeof(*render) % 4 == 0);
-#if 0
-   ASSERT_ALIGN16(render->vertex_data);
-#else
-#endif
-   ASSERT_ALIGN16(render->index_data);
+   ASSERT(total_vertex_bytes % 16 == 0);
 
+   /* indexes are right after the render command in the batch buffer */
+   indexes = (const ushort *) (render + 1);
+   *pos_incr = (render->num_indexes * 2 + 3) / 4;
 
-   /**
-    ** Get vertex, index buffers if not inlined
-    **/
-   if (!render->inline_verts) {
-      void *src;
-      ASSERT(total_vertex_bytes % 16 == 0);
-
-#if 0
-      src = render->vertex_data;
-#else
-      spu.cur_vertex_buf = render->vertex_buf;
-      src = spu.init.buffers[render->vertex_buf];
-#endif
 
+   if (render->inline_verts) {
+      /* Vertices are right after indexes in batch buffer */
+      vertices = (const ubyte *) (render + 1) + *pos_incr * 4;
+      *pos_incr = *pos_incr + total_vertex_bytes / 4;
+   }
+   else {
+      /* Begin DMA fetch of vertex buffer */
+      void *src = spu.init.buffers[render->vertex_buf];
       mfc_get(vertex_data,  /* dest */
               (unsigned int) src,
               total_vertex_bytes,  /* size */
@@ -300,63 +286,11 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
               0  /* rid */);
 
       vertices = vertex_data;
-   }
-
-   if (!render->inline_indexes) {
-      uint total_index_bytes;
-
-      *pos_incr = 0;
-
-      total_index_bytes = render->num_indexes * sizeof(ushort);
-      if (total_index_bytes < 16)
-         total_index_bytes = 16;
-      else
-         total_index_bytes = ROUNDUP16(total_index_bytes);
 
-      indexes = index_data;
-
-      /* get index data from main memory */
-      mfc_get(index_data,  /* dest */
-              (unsigned int) render->index_data,  /* src */
-              total_index_bytes,
-              TAG_INDEX_BUFFER,
-              0, /* tid */
-              0  /* rid */);
-   }
-
-
-   /**
-    ** Get pointers to inlined indexes, verts, if present
-    **/
-   if (render->inline_indexes) {
-      /* indexes are right after the render command in the batch buffer */
-      indexes = (ushort *) (render + 1);
-      *pos_incr = (render->num_indexes * 2 + 3) / 4;
-
-      if (render->inline_verts) {
-         /* vertices are after indexes, if inlined */
-         vertices = (const ubyte *) (render + 1) + *pos_incr * 4;
-         *pos_incr = *pos_incr + total_vertex_bytes / 4;
-         spu.cur_vertex_buf = ~0;
-      }
+      wait_on_mask(1 << TAG_VERTEX_BUFFER);
    }
 
 
-   /* wait for vertex and/or index buffers if not inlined */
-   mask = 0x0;
-   if (!render->inline_verts)
-      mask |= (1 << TAG_VERTEX_BUFFER);
-   if (!render->inline_indexes)
-      mask |= (1 << TAG_INDEX_BUFFER);
-   wait_on_mask_all(mask);
-
-#if 0
-   if (!render->inline_verts) {
-      printf("SPU %u: release vbuf %u\n", spu.init.id, render->vertex_buf);
-      release_buffer(render->vertex_buf);
-   }
-#endif
-
    /**
     ** find tiles which intersect the prim bounding box
     **/
@@ -372,7 +306,7 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
 #endif
 
    /* make sure any pending clears have completed */
-   wait_on_mask(1 << TAG_SURFACE_CLEAR);
+   wait_on_mask(1 << TAG_SURFACE_CLEAR); /* XXX temporary */
 
 
    /**
@@ -405,14 +339,6 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
       for (j = 0; j < render->num_indexes; j += 3) {
          const float *v0, *v1, *v2;
 
-         if (indexes[j] == 0xffff) {
-            printf("index[%u] = 0xffff\n", j);
-         }
-
-         ASSERT(indexes[j] != 0xffff);
-         ASSERT(indexes[j+1] != 0xffff);
-         ASSERT(indexes[j+2] != 0xffff);
-
          v0 = (const float *) (vertices + indexes[j+0] * vertex_size);
          v1 = (const float *) (vertices + indexes[j+1] * vertex_size);
          v2 = (const float *) (vertices + indexes[j+2] * vertex_size);
@@ -450,8 +376,8 @@ cmd_release_verts(const struct cell_command_release_verts *release)
 {
    if (Debug)
       printf("SPU %u: RELEASE VERTS %u\n",
-             spu.init.id, spu.cur_vertex_buf);
-   ASSERT(spu.cur_vertex_buf == release->vertex_buf);
+             spu.init.id, release->vertex_buf);
+   ASSERT(release->vertex_buf != ~0U);
    release_buffer(release->vertex_buf);
 }
 
diff --git a/src/mesa/pipe/cell/spu/spu_main.h b/src/mesa/pipe/cell/spu/spu_main.h
index 68c7263b7f..5bc5d9fa99 100644
--- a/src/mesa/pipe/cell/spu/spu_main.h
+++ b/src/mesa/pipe/cell/spu/spu_main.h
@@ -65,8 +65,6 @@ struct spu_global
 
    /* XXX more state to come */
 
-   uint cur_vertex_buf;
-
 } ALIGN16_ATTRIB;
 
 
-- 
cgit v1.2.3


From 3f8a8eada693c9501b3e52d47986e46028c172b0 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 28 Jan 2008 10:41:51 -0700
Subject: Cell: remove unneeded flush(), dead code

---
 src/mesa/pipe/cell/ppu/cell_clear.c | 30 +-----------------------------
 1 file changed, 1 insertion(+), 29 deletions(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/ppu/cell_clear.c b/src/mesa/pipe/cell/ppu/cell_clear.c
index e01640b994..e61bfd9b0f 100644
--- a/src/mesa/pipe/cell/ppu/cell_clear.c
+++ b/src/mesa/pipe/cell/ppu/cell_clear.c
@@ -48,7 +48,6 @@ cell_clear_surface(struct pipe_context *pipe, struct pipe_surface *ps,
                    unsigned clearValue)
 {
    struct cell_context *cell = cell_context(pipe);
-   /*uint i;*/
    uint surfIndex;
 
    if (!cell->cbuf_map[0])
@@ -61,29 +60,7 @@ cell_clear_surface(struct pipe_context *pipe, struct pipe_surface *ps,
       surfIndex = 0;
    }
 
-#if 0
-   for (i = 0; i < cell->num_spus; i++) {
-#if 1
-      uint clr = clearValue;
-      if (surfIndex == 0) {
-         /* XXX debug: clear color varied per-SPU to visualize tiles */
-         if ((clr & 0xff) == 0)
-            clr |= 64 + i * 8;
-         if ((clr & 0xff00) == 0)
-            clr |= (64 + i * 8) << 8;
-         if ((clr & 0xff0000) == 0)
-            clr |= (64 + i * 8) << 16;
-         if ((clr & 0xff000000) == 0)
-            clr |= (64 + i * 8) << 24;
-      }
-      cell_global.command[i].clear.value = clr;
-#else
-      cell_global.command[i].clear.value = clearValue;
-#endif
-      cell_global.command[i].clear.surface = surfIndex;
-      send_mbox_message(cell_global.spe_contexts[i], CELL_CMD_CLEAR_SURFACE);
-   }
-#else
+
    {
       struct cell_command_clear_surface *clr
          = (struct cell_command_clear_surface *)
@@ -92,9 +69,4 @@ cell_clear_surface(struct pipe_context *pipe, struct pipe_surface *ps,
       clr->surface = surfIndex;
       clr->value = clearValue;
    }
-#endif
-
-   /* XXX temporary */
-   cell_flush(&cell->pipe, 0x0);
-
 }
-- 
cgit v1.2.3


From a8590e097e965c01ede7df47ff752b0e7adabace Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 28 Jan 2008 11:20:47 -0700
Subject: Cell: make sure state commands aren't split across batches

---
 src/mesa/pipe/cell/ppu/cell_state_emit.c | 29 ++++++++++++++++++-----------
 1 file changed, 18 insertions(+), 11 deletions(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/ppu/cell_state_emit.c b/src/mesa/pipe/cell/ppu/cell_state_emit.c
index dbca900c35..6776ec88c7 100644
--- a/src/mesa/pipe/cell/ppu/cell_state_emit.c
+++ b/src/mesa/pipe/cell/ppu/cell_state_emit.c
@@ -33,6 +33,17 @@
 
 
+static void
+emit_state_cmd(struct cell_context *cell, uint cmd,
+               const void *state, uint state_size)
+{
+   uint *dst = (uint *) cell_batch_alloc(cell, sizeof(uint) + state_size);
+   *dst = cmd;  /* opcode word first; it shares one allocation with the state */
+   memcpy(dst + 1, state, state_size);  /* state bytes directly after the opcode */
+}
+
+
+
 void
 cell_emit_state(struct cell_context *cell)
 {
@@ -51,22 +62,18 @@ cell_emit_state(struct cell_context *cell)
    }
 
    if (cell->dirty & CELL_NEW_DEPTH_STENCIL) {
-      uint cmd = CELL_CMD_STATE_DEPTH_STENCIL;
-      cell_batch_append(cell, &cmd, 4);
-      cell_batch_append(cell, cell->depth_stencil,
-                        sizeof(struct pipe_depth_stencil_alpha_state));
+      emit_state_cmd(cell, CELL_CMD_STATE_DEPTH_STENCIL,
+                     cell->depth_stencil,
+                     sizeof(struct pipe_depth_stencil_alpha_state));
    }
 
    if (cell->dirty & CELL_NEW_SAMPLER) {
-      uint cmd = CELL_CMD_STATE_SAMPLER;
-      cell_batch_append(cell, &cmd, 4);
-      cell_batch_append(cell, cell->sampler[0],
-                        sizeof(struct pipe_sampler_state));
+      emit_state_cmd(cell, CELL_CMD_STATE_SAMPLER,
+                     cell->sampler[0], sizeof(struct pipe_sampler_state));
    }
 
    if (cell->dirty & CELL_NEW_VERTEX_INFO) {
-      uint cmd = CELL_CMD_STATE_VERTEX_INFO;
-      cell_batch_append(cell, &cmd, 4);
-      cell_batch_append(cell, &cell->vertex_info, sizeof(struct vertex_info));
+      emit_state_cmd(cell, CELL_CMD_STATE_VERTEX_INFO,
+                     &cell->vertex_info, sizeof(struct vertex_info));
    }
 }
-- 
cgit v1.2.3


From 3d1b0f4c57edaf5707e4952617dcd6c57dfbdc65 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 28 Jan 2008 11:21:25 -0700
Subject: Cell: additional assertions

---
 src/mesa/pipe/cell/spu/spu_main.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index 5b50ec6953..62f6a357ba 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -441,9 +441,12 @@ cmd_state_sampler(const struct pipe_sampler_state *state)
 static void
 cmd_state_vertex_info(const struct vertex_info *vinfo)
 {
-   if (Debug)
+   if (Debug) {
       printf("SPU %u: VERTEX_INFO num_attribs=%u\n", spu.init.id,
              vinfo->num_attribs);
+   }
+   ASSERT(vinfo->num_attribs >= 1);
+   ASSERT(vinfo->num_attribs <= 8);
    memcpy(&spu.vertex_info, vinfo, sizeof(*vinfo));
 }
 
-- 
cgit v1.2.3


From 043fc00a60377f8cd1878e0d0e5157dfb4567289 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 28 Jan 2008 12:46:05 -0700
Subject: Cell: re-enable bounding boxes

The geometry bounding box is used to restrict rasterization to just those
tiles that are relevant.
Note another dummy field had to be added to the cell_command_render struct.
Apparently, every 4th word in a struct is susceptible to corruption in some
circumstances.  Might be a compiler bug.
---
 src/mesa/pipe/cell/common.h        |  2 +-
 src/mesa/pipe/cell/ppu/cell_vbuf.c |  4 ++++
 src/mesa/pipe/cell/spu/spu_main.c  | 30 +++++++++++++++++++-----------
 3 files changed, 24 insertions(+), 12 deletions(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/common.h b/src/mesa/pipe/cell/common.h
index d6e1dd4f7d..5e32b209e6 100644
--- a/src/mesa/pipe/cell/common.h
+++ b/src/mesa/pipe/cell/common.h
@@ -122,7 +122,7 @@ struct cell_command_render
    uint dummy;        /* XXX this dummy field works around a compiler bug */
    uint num_indexes;
    uint vertex_buf;  /**< which cell->buffer[] contains the vertex data */
-   float xmin, ymin, xmax, ymax;
+   float xmin, dummy2, ymin, xmax, ymax;  /* XXX another dummy field */
    boolean inline_verts;
 } ALIGN16_ATTRIB;
 
diff --git a/src/mesa/pipe/cell/ppu/cell_vbuf.c b/src/mesa/pipe/cell/ppu/cell_vbuf.c
index b2a25d767b..9f737287ad 100644
--- a/src/mesa/pipe/cell/ppu/cell_vbuf.c
+++ b/src/mesa/pipe/cell/ppu/cell_vbuf.c
@@ -180,6 +180,10 @@ cell_vbuf_draw(struct vbuf_render *vbr,
       if (v[1] > ymax)
          ymax = v[1];
    }
+#if 0
+   printf("PPU Bounds %g, %g .. %g, %g\n", xmin, ymin, xmax, ymax);
+   fflush(stdout);
+#endif
 
    if (cvbr->prim != PIPE_PRIM_TRIANGLES)
       return; /* only render tris for now */
diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index 62f6a357ba..c2b05ed5a2 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -200,7 +200,7 @@ tile_bounding_box(const struct cell_command_render *render,
                   uint *txmin, uint *tymin,
                   uint *box_num_tiles, uint *box_width_tiles)
 {
-#if 1
+#if 0
    /* Debug: full-window bounding box */
    uint txmax = spu.fb.width_tiles - 1;
    uint tymax = spu.fb.height_tiles - 1;
@@ -223,13 +223,24 @@ tile_bounding_box(const struct cell_command_render *render,
    *box_num_tiles = *box_width_tiles * box_height_tiles;
 #endif
 #if 0
-   printf("Render bounds: %g, %g  ...  %g, %g\n",
+   printf("SPU %u: bounds: %g, %g  ...  %g, %g\n", spu.init.id,
           render->xmin, render->ymin, render->xmax, render->ymax);
-   printf("Render tiles:  %u, %u .. %u, %u\n", *txmin, *tymin, txmax, tymax);
+   printf("SPU %u: tiles:  %u, %u .. %u, %u\n",
+           spu.init.id, *txmin, *tymin, txmax, tymax);
+   ASSERT(render->xmin <= render->xmax);
+   ASSERT(render->ymin <= render->ymax);
 #endif
 }
 
 
+/** Check if the tile at (tx,ty) belongs to this SPU (linear tile index mod num_spus == id) */
+static INLINE boolean
+my_tile(uint tx, uint ty)
+{
+   return (spu.fb.width_tiles * ty + tx) % spu.init.num_spus == spu.init.id;
+}
+
+
 /**
  * Render primitives
  * \param pos_incr  returns value indicating how may words to skip after
@@ -295,15 +306,9 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
     ** find tiles which intersect the prim bounding box
     **/
    uint txmin, tymin, box_width_tiles, box_num_tiles;
-#if 0
    tile_bounding_box(render, &txmin, &tymin,
                      &box_num_tiles, &box_width_tiles);
-#else
-   txmin = 0;
-   tymin = 0;
-   box_num_tiles = spu.fb.width_tiles * spu.fb.height_tiles;
-   box_width_tiles = spu.fb.width_tiles;
-#endif
+
 
    /* make sure any pending clears have completed */
    wait_on_mask(1 << TAG_SURFACE_CLEAR); /* XXX temporary */
@@ -312,13 +317,16 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
    /**
     ** loop over tiles, rendering tris
     **/
-   for (i = spu.init.id; i < box_num_tiles; i += spu.init.num_spus) {
+   for (i = 0; i < box_num_tiles; i++) {
       const uint tx = txmin + i % box_width_tiles;
       const uint ty = tymin + i / box_width_tiles;
 
       ASSERT(tx < spu.fb.width_tiles);
       ASSERT(ty < spu.fb.height_tiles);
 
+      if (!my_tile(tx, ty))
+         continue;
+
       /* Start fetching color/z tiles.  We'll wait for completion when
        * we need read/write to them later in triangle rasterization.
        */
-- 
cgit v1.2.3


From 41899c70a72cd6206acec6c4c41953fea17d4ecf Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 28 Jan 2008 13:02:11 -0700
Subject: Cell: emit state in cell_clear_surface() if dirty.

Without this, a program that does nothing but glClear() doesn't work.  We need
the framebuffer state.
---
 src/mesa/pipe/cell/ppu/cell_clear.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/ppu/cell_clear.c b/src/mesa/pipe/cell/ppu/cell_clear.c
index e61bfd9b0f..07b908eec5 100644
--- a/src/mesa/pipe/cell/ppu/cell_clear.c
+++ b/src/mesa/pipe/cell/ppu/cell_clear.c
@@ -50,6 +50,10 @@ cell_clear_surface(struct pipe_context *pipe, struct pipe_surface *ps,
    struct cell_context *cell = cell_context(pipe);
    uint surfIndex;
 
+   if (cell->dirty)
+      cell_update_derived(cell);
+
+
    if (!cell->cbuf_map[0])
       cell->cbuf_map[0] = pipe_surface_map(ps);
 
-- 
cgit v1.2.3


From c2372cc7481bf3985a6a3126952ab9d5dab4bf77 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 28 Jan 2008 17:22:12 -0700
Subject: Cell: initial texture cache/sampling code

---
 src/mesa/pipe/cell/spu/spu_texture.c | 139 +++++++++++++++++++++++++++++++++++
 src/mesa/pipe/cell/spu/spu_texture.h |  43 +++++++++++
 2 files changed, 182 insertions(+)
 create mode 100644 src/mesa/pipe/cell/spu/spu_texture.c
 create mode 100644 src/mesa/pipe/cell/spu/spu_texture.h

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/spu/spu_texture.c b/src/mesa/pipe/cell/spu/spu_texture.c
new file mode 100644
index 0000000000..6d566a5006
--- /dev/null
+++ b/src/mesa/pipe/cell/spu/spu_texture.c
@@ -0,0 +1,139 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#include "pipe/p_compiler.h"
+#include "spu_main.h"
+#include "spu_texture.h"
+#include "spu_tile.h"
+
+
+/**
+ * Number of texture tiles to cache.
+ * Note that this will probably be the largest consumer of SPU local store/
+ * memory for this driver!
+ */
+#define CACHE_SIZE 16
+
+static tile_t tex_tiles[CACHE_SIZE]  ALIGN16_ATTRIB;
+
+static int tex_tile_x[CACHE_SIZE], tex_tile_y[CACHE_SIZE];
+
+
+
+/**
+ * Mark all tex cache entries as invalid (tile x/y of every slot set to -1).
+ */
+void
+invalidate_tex_cache(void)
+{
+   /* XXX memset? */
+   uint i;
+   for (i = 0; i < CACHE_SIZE; i++)
+      tex_tile_x[i] = tex_tile_y[i] = -1;  /* -1 = no tile cached in this slot */
+}
+
+
+/**
+ * Return the cache pos/index for the tile containing texel (i,j).
+ */
+static INLINE uint
+cache_pos(uint i, uint j)
+{
+   uint tx = i / TILE_SIZE;  /* tile column of texel i */
+   uint ty = j / TILE_SIZE;  /* tile row of texel j */
+   uint pos = (tx + ty * 4) % CACHE_SIZE;  /* different tiles can share a slot */
+   return pos;
+}
+
+
+/**
+ * Make sure the tile for texel (i,j) is present (on a miss it is fetched
+ * with mfc_get, replacing the slot's old tile); return its cache index.
+ */
+static uint
+get_tex_tile(uint i, uint j)
+{
+   const int tx = i / TILE_SIZE;
+   const int ty = j / TILE_SIZE;
+   const uint pos = cache_pos(i, j);
+
+   if (tex_tile_x[pos] != tx || tex_tile_y[pos] != ty) {
+      /* texture cache miss: log it (unconditionally), then fetch the tile */
+      const uint tiles_per_row = spu.texture.width / TILE_SIZE;
+      const uint bytes_per_tile = sizeof(tile_t);
+      const void *src = (const ubyte *) spu.texture.start
+         + (ty * tiles_per_row + tx) * bytes_per_tile;
+
+      printf("SPU %u: tex cache miss at %d, %d  pos=%u  old=%d,%d\n",
+             spu.init.id, tx, ty, pos,
+             tex_tile_x[pos], tex_tile_y[pos]);
+#if 0
+      printf("SPU %u: get tex tile from %p to %p\n",
+             spu.init.id, src, tex_tiles[pos].t32);
+#endif
+
+      ASSERT_ALIGN16(tex_tiles[pos].t32);
+      ASSERT_ALIGN16(src);
+
+      mfc_get(tex_tiles[pos].t32,  /* dest */
+              (unsigned int) src,
+              bytes_per_tile,      /* size */
+              TAG_TEXTURE_TILE,
+              0, /* tid */
+              0  /* rid */);
+
+      wait_on_mask(1 << TAG_TEXTURE_TILE);  /* same tag as the mfc_get above */
+
+      tex_tile_x[pos] = tx;  /* remember which tile now occupies this slot */
+      tex_tile_y[pos] = ty;
+   }
+   else {
+#if 0
+      printf("SPU %u: tex cache HIT at %d, %d\n",
+             spu.init.id, tx, ty);
+#endif
+   }
+
+   return pos;
+}
+
+
+/**
+ * Get texture sample at texcoord; only texcoord[0] and texcoord[1] are read.
+ * XXX this is extremely primitive for now (one texel per lookup).
+ */
+uint
+sample_texture(const float *texcoord)
+{
+   /* wrap/repeat; NOTE(review): negative coords reach a float->uint cast */
+   uint i = (uint) (texcoord[0] * spu.texture.width) % spu.texture.width;
+   uint j = (uint) (texcoord[1] * spu.texture.height) % spu.texture.height;
+   uint pos = get_tex_tile(i, j);
+   uint texel = tex_tiles[pos].t32[j % TILE_SIZE][i % TILE_SIZE];  /* [row j][col i] in tile */
+   return texel;
+}
diff --git a/src/mesa/pipe/cell/spu/spu_texture.h b/src/mesa/pipe/cell/spu/spu_texture.h
new file mode 100644
index 0000000000..b75b7ac44f
--- /dev/null
+++ b/src/mesa/pipe/cell/spu/spu_texture.h
@@ -0,0 +1,43 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef SPU_TEXTURE_H
+#define SPU_TEXTURE_H
+
+
+#include "pipe/p_compiler.h"
+
+
+extern void
+invalidate_tex_cache(void);
+
+
+extern uint
+sample_texture(const float *texcoord);
+
+
+#endif /* SPU_TEXTURE_H */
-- 
cgit v1.2.3


From 425f270fcbfdbfce98adaf9da4b8eb7360f34447 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 28 Jan 2008 17:23:44 -0700
Subject: Cell: basic texture mapping

Texture images are tiled in PPU code.  SPUs use a texture cache for getting
texels from textures.
This is very rough code, but demos/texcyl.c works.
---
 src/mesa/pipe/cell/common.h                 | 10 +++-
 src/mesa/pipe/cell/ppu/cell_context.h       |  5 +-
 src/mesa/pipe/cell/ppu/cell_state_emit.c    | 12 +++-
 src/mesa/pipe/cell/ppu/cell_state_sampler.c | 10 +++-
 src/mesa/pipe/cell/ppu/cell_texture.c       | 87 +++++++++++++++++++++++++++++
 src/mesa/pipe/cell/ppu/cell_texture.h       |  6 ++
 src/mesa/pipe/cell/spu/Makefile             |  1 +
 src/mesa/pipe/cell/spu/spu_main.c           | 17 ++++++
 src/mesa/pipe/cell/spu/spu_main.h           |  3 +
 src/mesa/pipe/cell/spu/spu_tri.c            | 60 ++++++++++++--------
 10 files changed, 183 insertions(+), 28 deletions(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/common.h b/src/mesa/pipe/cell/common.h
index 5e32b209e6..f0d48ff403 100644
--- a/src/mesa/pipe/cell/common.h
+++ b/src/mesa/pipe/cell/common.h
@@ -79,7 +79,8 @@
 #define CELL_CMD_STATE_FRAMEBUFFER   10
 #define CELL_CMD_STATE_DEPTH_STENCIL 11
 #define CELL_CMD_STATE_SAMPLER       12
-#define CELL_CMD_STATE_VERTEX_INFO   13
+#define CELL_CMD_STATE_TEXTURE       13
+#define CELL_CMD_STATE_VERTEX_INFO   14
 
 
 #define CELL_NUM_BUFFERS 4
@@ -134,6 +135,13 @@ struct cell_command_release_verts
 };
 
 
+struct cell_command_texture
+{
+   void *start;         /**< Address in main memory */
+   uint width, height;  /**< cell_emit_state() fills these from base.width[0]/height[0] */
+};
+
 /** XXX unions don't seem to work */
 struct cell_command
 {
diff --git a/src/mesa/pipe/cell/ppu/cell_context.h b/src/mesa/pipe/cell/ppu/cell_context.h
index de65fb5e9a..7d234f3e45 100644
--- a/src/mesa/pipe/cell/ppu/cell_context.h
+++ b/src/mesa/pipe/cell/ppu/cell_context.h
@@ -76,7 +76,7 @@ struct cell_context
    struct pipe_framebuffer_state framebuffer;
    struct pipe_poly_stipple poly_stipple;
    struct pipe_scissor_state scissor;
-   struct pipe_texture *texture[PIPE_MAX_SAMPLERS];
+   struct cell_texture *texture[PIPE_MAX_SAMPLERS];
    struct pipe_viewport_state viewport;
    struct pipe_vertex_buffer vertex_buffer[PIPE_ATTRIB_MAX];
    struct pipe_vertex_element vertex_element[PIPE_ATTRIB_MAX];
@@ -84,6 +84,9 @@ struct cell_context
    ubyte *cbuf_map[PIPE_MAX_COLOR_BUFS];
    ubyte *zsbuf_map;
 
+   struct pipe_surface *tex_surf;
+   uint *tex_map;
+
    uint dirty;
 
    /** The primitive drawing context */
diff --git a/src/mesa/pipe/cell/ppu/cell_state_emit.c b/src/mesa/pipe/cell/ppu/cell_state_emit.c
index 6776ec88c7..391ff454ac 100644
--- a/src/mesa/pipe/cell/ppu/cell_state_emit.c
+++ b/src/mesa/pipe/cell/ppu/cell_state_emit.c
@@ -30,7 +30,7 @@
 #include "cell_state.h"
 #include "cell_state_emit.h"
 #include "cell_batch.h"
-
+#include "cell_texture.h"
 
 
 static void
@@ -72,6 +72,16 @@ cell_emit_state(struct cell_context *cell)
                      cell->sampler[0], sizeof(struct pipe_sampler_state));
    }
 
+   if (cell->dirty & CELL_NEW_TEXTURE) {
+      struct cell_command_texture texture;
+      texture.start = cell->texture[0]->tiled_data;
+      texture.width = cell->texture[0]->base.width[0];
+      texture.height = cell->texture[0]->base.height[0];
+
+      emit_state_cmd(cell, CELL_CMD_STATE_TEXTURE,
+                     &texture, sizeof(struct cell_command_texture));
+   }
+
    if (cell->dirty & CELL_NEW_VERTEX_INFO) {
       emit_state_cmd(cell, CELL_CMD_STATE_VERTEX_INFO,
                      &cell->vertex_info, sizeof(struct vertex_info));
diff --git a/src/mesa/pipe/cell/ppu/cell_state_sampler.c b/src/mesa/pipe/cell/ppu/cell_state_sampler.c
index ae1eeb4620..317f7603bb 100644
--- a/src/mesa/pipe/cell/ppu/cell_state_sampler.c
+++ b/src/mesa/pipe/cell/ppu/cell_state_sampler.c
@@ -30,12 +30,10 @@
  */
 
 #include "pipe/p_util.h"
+#include "pipe/draw/draw_context.h"
 #include "cell_context.h"
 #include "cell_state.h"
-#if 0
 #include "cell_texture.h"
-#include "cell_tile_cache.h"
-#endif
 
 
 void *
@@ -53,6 +51,8 @@ cell_bind_sampler_state(struct pipe_context *pipe,
 {
    struct cell_context *cell = cell_context(pipe);
 
+   draw_flush(cell->draw);
+
    assert(unit < PIPE_MAX_SAMPLERS);
    cell->sampler[unit] = (struct pipe_sampler_state *)sampler;
 
@@ -76,7 +76,11 @@ cell_set_sampler_texture(struct pipe_context *pipe,
 {
    struct cell_context *cell = cell_context(pipe);
 
+   draw_flush(cell->draw);
+
    cell->texture[sampler] = texture;
 
+   cell_update_texture_mapping(cell);
+
    cell->dirty |= CELL_NEW_TEXTURE;
 }
diff --git a/src/mesa/pipe/cell/ppu/cell_texture.c b/src/mesa/pipe/cell/ppu/cell_texture.c
index 0a8190d983..acbe4c79f0 100644
--- a/src/mesa/pipe/cell/ppu/cell_texture.c
+++ b/src/mesa/pipe/cell/ppu/cell_texture.c
@@ -163,3 +163,90 @@ cell_get_tex_surface(struct pipe_context *pipe,
    }
    return ps;
 }
+
+
+/** Re-order linear w x h texels (src) into tile_size-square tiles (dst). */
+static void
+tile_copy_data(uint w, uint h, uint tile_size, uint *dst, const uint *src)
+{
+   const uint tile_size2 = tile_size * tile_size;
+   const uint h_t = h / tile_size, w_t = w / tile_size;
+
+   uint it, jt;  /* tile row, tile column */
+   uint i, j;    /* texel row, texel column within a tile */
+
+   for (it = 0; it < h_t; it++) {
+      for (jt = 0; jt < w_t; jt++) {
+         /* fill in tile (it, jt); tiles are packed one after another */
+         uint *tdst = dst + (it * w_t + jt) * tile_size2;
+         for (i = 0; i < tile_size; i++) {
+            for (j = 0; j < tile_size; j++) {
+               const uint srci = it * tile_size + i;
+               const uint srcj = jt * tile_size + j;
+               *tdst++ = src[srci * w + srcj];  /* src row stride is w */
+            }
+         }
+      }
+   }
+}
+
+
+
+/**
+ * Convert linear texture image data to tiled format for SPU usage.
+ */
+static void
+cell_tile_texture(struct cell_context *cell,
+                  struct cell_texture *texture)
+{
+   uint face = 0, level = 0, zslice = 0;
+   struct pipe_surface *surf;
+   const uint w = texture->base.width[0], h = texture->base.height[0];
+   const uint *src;
+
+   /* temporary restrictions: */
+   assert(w >= TILE_SIZE);
+   assert(h >= TILE_SIZE);
+   assert(w % TILE_SIZE == 0);
+   assert(h % TILE_SIZE == 0);
+
+   surf = cell_get_tex_surface(&cell->pipe, &texture->base, face, level, zslice);
+   ASSERT(surf);
+
+   src = (const uint *) pipe_surface_map(surf);
+
+   if (texture->tiled_data) {
+      align_free(texture->tiled_data);
+   }
+   texture->tiled_data = align_malloc(w * h * 4, 16);
+
+   tile_copy_data(w, h, TILE_SIZE, texture->tiled_data, src);
+
+   pipe_surface_unmap(surf);
+
+   pipe_surface_reference(&surf, NULL);
+}
+
+
+
+void
+cell_update_texture_mapping(struct cell_context *cell)
+{
+   uint face = 0, level = 0, zslice = 0;
+
+   cell_tile_texture(cell, cell->texture[0]);
+#if 0
+   if (cell->tex_surf && cell->tex_map) {
+      pipe_surface_unmap(cell->tex_surf);
+      cell->tex_map = NULL;
+   }
+
+   /* XXX free old surface */
+
+   cell->tex_surf = cell_get_tex_surface(&cell->pipe,
+                                         &cell->texture[0]->base,
+                                         face, level, zslice);
+
+   cell->tex_map = pipe_surface_map(cell->tex_surf);
+#endif
+}
diff --git a/src/mesa/pipe/cell/ppu/cell_texture.h b/src/mesa/pipe/cell/ppu/cell_texture.h
index ef5808c086..bd434c8776 100644
--- a/src/mesa/pipe/cell/ppu/cell_texture.h
+++ b/src/mesa/pipe/cell/ppu/cell_texture.h
@@ -46,6 +46,8 @@ struct cell_texture
     */
    struct pipe_buffer *buffer;
    unsigned long buffer_size;
+
+   void *tiled_data;  /* XXX this may be temporary */ /*ALIGN16*/
 };
 
 
@@ -70,4 +72,8 @@ cell_get_tex_surface(struct pipe_context *pipe,
                      unsigned face, unsigned level, unsigned zslice);
 
 
+extern void
+cell_update_texture_mapping(struct cell_context *cell);
+
+
 #endif /* CELL_TEXTURE */
diff --git a/src/mesa/pipe/cell/spu/Makefile b/src/mesa/pipe/cell/spu/Makefile
index 417ae1b072..011fdcefe3 100644
--- a/src/mesa/pipe/cell/spu/Makefile
+++ b/src/mesa/pipe/cell/spu/Makefile
@@ -17,6 +17,7 @@ PROG_SPU_EMBED_O = $(PROG)_spu-embed.o
 
 SOURCES = \
 	spu_main.c \
+	spu_texture.c \
 	spu_tile.c \
 	spu_tri.c
 
diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index c2b05ed5a2..5a5b17dd89 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -34,6 +34,7 @@
 #include <spu_mfcio.h>
 
 #include "spu_main.h"
+#include "spu_texture.h"
 #include "spu_tri.h"
 #include "spu_tile.h"
 #include "pipe/cell/common.h"
@@ -446,6 +447,17 @@ cmd_state_sampler(const struct pipe_sampler_state *state)
 }
 
 
+static void
+cmd_state_texture(const struct cell_command_texture *texture)
+{
+   if (Debug)
+      printf("SPU %u: TEXTURE at %p  size %u x %u\n",
+             spu.init.id, texture->start, texture->width, texture->height);
+
+   memcpy(&spu.texture, texture, sizeof(*texture));
+}
+
+
 static void
 cmd_state_vertex_info(const struct vertex_info *vinfo)
 {
@@ -561,6 +573,10 @@ cmd_batch(uint opcode)
          cmd_state_sampler((struct pipe_sampler_state *) &buffer[pos+1]);
          pos += (1 + sizeof(struct pipe_sampler_state) / 4);
          break;
+      case CELL_CMD_STATE_TEXTURE:
+         cmd_state_texture((struct cell_command_texture *) &buffer[pos+1]);
+         pos += (1 + sizeof(struct cell_command_texture) / 4);
+         break;
       case CELL_CMD_STATE_VERTEX_INFO:
          cmd_state_vertex_info((struct vertex_info *) &buffer[pos+1]);
          pos += (1 + sizeof(struct vertex_info) / 4);
@@ -656,6 +672,7 @@ one_time_init(void)
 {
    memset(tile_status, TILE_STATUS_DEFINED, sizeof(tile_status));
    memset(tile_status_z, TILE_STATUS_DEFINED, sizeof(tile_status_z));
+   invalidate_tex_cache();
 }
 
 
diff --git a/src/mesa/pipe/cell/spu/spu_main.h b/src/mesa/pipe/cell/spu/spu_main.h
index 5bc5d9fa99..480c54ebd0 100644
--- a/src/mesa/pipe/cell/spu/spu_main.h
+++ b/src/mesa/pipe/cell/spu/spu_main.h
@@ -60,6 +60,7 @@ struct spu_global
    struct pipe_depth_stencil_alpha_state depth_stencil;
    struct pipe_blend_state blend;
    struct pipe_sampler_state sampler[PIPE_MAX_SAMPLERS];
+   struct cell_command_texture texture;
 
    struct vertex_info vertex_info;
 
@@ -84,6 +85,8 @@ extern struct spu_global spu;
 #define TAG_INDEX_BUFFER      16
 #define TAG_BATCH_BUFFER      17
 #define TAG_MISC              18
+#define TAG_TEXTURE_TILE      19
+
 
 
 extern void
diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index 3d0d106c10..aad28f1036 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -33,6 +33,7 @@
 #include "pipe/p_format.h"
 #include "pipe/p_util.h"
 #include "spu_main.h"
+#include "spu_texture.h"
 #include "spu_tile.h"
 #include "spu_tri.h"
 
@@ -362,9 +363,24 @@ emit_quad( struct setup_stage *setup, int x, int y, unsigned mask )
    /* Cell: "write" quad fragments to the tile by setting prim color */
    const int ix = x - setup->cliprect_minx;
    const int iy = y - setup->cliprect_miny;
-   float colors[4][4];
-
-   eval_coeff(setup, 1, (float) x, (float) y, colors);
+   uint colors[4];  /* indexed by QUAD_x */
+
+   if (spu.texture.start) {
+      float texcoords[4][4];
+      uint i;
+      eval_coeff(setup, 2, (float) x, (float) y, texcoords);
+      for (i = 0; i < 4; i++) {
+         colors[i] = sample_texture(texcoords[i]);
+      }
+   }
+   else {
+      float fcolors[4][4];
+      eval_coeff(setup, 1, (float) x, (float) y, fcolors);
+      colors[QUAD_TOP_LEFT] = pack_color(fcolors[QUAD_TOP_LEFT]);
+      colors[QUAD_TOP_RIGHT] = pack_color(fcolors[QUAD_TOP_RIGHT]);
+      colors[QUAD_BOTTOM_LEFT] = pack_color(fcolors[QUAD_BOTTOM_LEFT]);
+      colors[QUAD_BOTTOM_RIGHT] = pack_color(fcolors[QUAD_BOTTOM_RIGHT]);
+   }
 
    if (spu.depth_stencil.depth.enabled) {
       mask &= do_depth_test(setup, x, y, mask);
@@ -382,13 +398,13 @@ emit_quad( struct setup_stage *setup, int x, int y, unsigned mask )
       tile_status[setup->ty][setup->tx] = TILE_STATUS_DIRTY;
 
       if (mask & MASK_TOP_LEFT)
-         ctile.t32[iy][ix] = pack_color(colors[QUAD_TOP_LEFT]);
+         ctile.t32[iy][ix] = colors[QUAD_TOP_LEFT];
       if (mask & MASK_TOP_RIGHT)
-         ctile.t32[iy][ix+1] = pack_color(colors[QUAD_TOP_RIGHT]);
+         ctile.t32[iy][ix+1] = colors[QUAD_TOP_RIGHT];
       if (mask & MASK_BOTTOM_LEFT)
-         ctile.t32[iy+1][ix] = pack_color(colors[QUAD_BOTTOM_LEFT]);
+         ctile.t32[iy+1][ix] = colors[QUAD_BOTTOM_LEFT];
       if (mask & MASK_BOTTOM_RIGHT)
-         ctile.t32[iy+1][ix+1] = pack_color(colors[QUAD_BOTTOM_RIGHT]);
+         ctile.t32[iy+1][ix+1] = colors[QUAD_BOTTOM_RIGHT];
    }
 #endif
 }
@@ -606,7 +622,6 @@ static boolean setup_sort_vertices( struct setup_stage *setup,
 }
 
 
-#if 0
 /**
  * Compute a0 for a constant-valued coefficient (GL_FLAT shading).
  * The value value comes from vertex->data[slot][i].
@@ -614,21 +629,20 @@ static boolean setup_sort_vertices( struct setup_stage *setup,
  * \param slot  which attribute slot 
  * \param i  which component of the slot (0..3)
  */
-static void const_coeff( struct setup_stage *setup,
-			 unsigned slot,
-			 unsigned i )
+static void const_coeff(struct setup_stage *setup, uint slot)
 {
-   assert(slot < PIPE_MAX_SHADER_INPUTS);
-   assert(i <= 3);
+   uint i;
+   ASSERT(slot < PIPE_MAX_SHADER_INPUTS);
 
-   setup->coef[slot].dadx[i] = 0;
-   setup->coef[slot].dady[i] = 0;
+   for (i = 0; i < 4; i++) {
+      setup->coef[slot].dadx[i] = 0;
+      setup->coef[slot].dady[i] = 0;
 
-   /* need provoking vertex info!
-    */
-   setup->coef[slot].a0[i] = setup->vprovoke->data[slot][i];
+      /* need provoking vertex info!
+       */
+      setup->coef[slot].a0[i] = setup->vprovoke->data[slot][i];
+   }
 }
-#endif
 
 
 /**
@@ -735,15 +749,17 @@ static void setup_tri_coefficients( struct setup_stage *setup )
       case INTERP_NONE:
          break;
       case INTERP_POS:
-         tri_linear_coeff(setup, i, 2, 3);  /* slot 0, z */
+         tri_linear_coeff(setup, i, 2, 3);
          /* XXX interp W if PERSPECTIVE... */
          break;
       case INTERP_CONSTANT:
-         /* fall-through */
+         const_coeff(setup, i);
+         break;
       case INTERP_LINEAR:
-         tri_linear_coeff(setup, i, 0, 4);  /* slot 1, color */
+         tri_linear_coeff(setup, i, 0, 4);
          break;
       case INTERP_PERSPECTIVE:
+         tri_linear_coeff(setup, i, 0, 4); /* XXX temporary */
          break;
       default:
          ASSERT(0);
-- 
cgit v1.2.3


From 25105276b38451439516928d188e07f2eb3e250e Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 28 Jan 2008 17:32:23 -0700
Subject: Cell: minor optimization for flat shading

---
 src/mesa/pipe/cell/spu/spu_tri.c | 37 ++++++++++++++++++++++++++++---------
 1 file changed, 28 insertions(+), 9 deletions(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index aad28f1036..19a231d9c4 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -200,16 +200,35 @@ static INLINE void
 eval_coeff( struct setup_stage *setup, uint slot,
             float x, float y, float result[4][4])
 {
-   uint i;
-   const float *dadx = setup->coef[slot].dadx;
-   const float *dady = setup->coef[slot].dady;
+   switch (spu.vertex_info.interp_mode[slot]) {
+   case INTERP_CONSTANT:
+      {
+         uint i;
+         for (i = 0; i < 4; i++) {
+            result[QUAD_TOP_LEFT][i] =
+            result[QUAD_TOP_RIGHT][i] =
+            result[QUAD_BOTTOM_LEFT][i] =
+            result[QUAD_BOTTOM_RIGHT][i] = setup->coef[slot].a0[i];
+         }
+      }
+      break;
 
-   /* loop over XYZW comps */
-   for (i = 0; i < 4; i++) {
-      result[QUAD_TOP_LEFT][i] = setup->coef[slot].a0[i] + x * dadx[i] + y * dady[i];
-      result[QUAD_TOP_RIGHT][i] = result[0][i] + dadx[i];
-      result[QUAD_BOTTOM_LEFT][i] = result[0][i] + dady[i];
-      result[QUAD_BOTTOM_RIGHT][i] = result[0][i] + dadx[i] + dady[i];
+   case INTERP_LINEAR:
+      /* fall-through, for now */
+   default:
+      {
+         uint i;
+         const float *dadx = setup->coef[slot].dadx;
+         const float *dady = setup->coef[slot].dady;
+
+         /* loop over XYZW comps */
+         for (i = 0; i < 4; i++) {
+            result[QUAD_TOP_LEFT][i] = setup->coef[slot].a0[i] + x * dadx[i] + y * dady[i];
+            result[QUAD_TOP_RIGHT][i] = result[0][i] + dadx[i];
+            result[QUAD_BOTTOM_LEFT][i] = result[0][i] + dady[i];
+            result[QUAD_BOTTOM_RIGHT][i] = result[0][i] + dadx[i] + dady[i];
+         }
+      }
    }
 }
 
-- 
cgit v1.2.3


From e2406b47883d74933e74507af65695c8c7d7861a Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 28 Jan 2008 18:03:45 -0700
Subject: Cell: compute min index referenced in draw command, use it to reduce
 size of vertex data payload

---
 src/mesa/pipe/cell/common.h        |  2 ++
 src/mesa/pipe/cell/ppu/cell_vbuf.c | 13 +++++++++++--
 src/mesa/pipe/cell/spu/spu_main.c  | 20 ++++++++++++++++----
 3 files changed, 29 insertions(+), 6 deletions(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/common.h b/src/mesa/pipe/cell/common.h
index f0d48ff403..90aa46a534 100644
--- a/src/mesa/pipe/cell/common.h
+++ b/src/mesa/pipe/cell/common.h
@@ -124,6 +124,8 @@ struct cell_command_render
    uint num_indexes;
    uint vertex_buf;  /**< which cell->buffer[] contains the vertex data */
    float xmin, dummy2, ymin, xmax, ymax;  /* XXX another dummy field */
+   uint dummy3;
+   uint min_index;
    boolean inline_verts;
 } ALIGN16_ATTRIB;
 
diff --git a/src/mesa/pipe/cell/ppu/cell_vbuf.c b/src/mesa/pipe/cell/ppu/cell_vbuf.c
index 9f737287ad..e63b34cf52 100644
--- a/src/mesa/pipe/cell/ppu/cell_vbuf.c
+++ b/src/mesa/pipe/cell/ppu/cell_vbuf.c
@@ -138,16 +138,23 @@ cell_vbuf_draw(struct vbuf_render *vbr,
    struct cell_context *cell = cvbr->cell;
    float xmin, ymin, xmax, ymax;
    uint i;
-   uint nr_vertices = 0;
+   uint nr_vertices = 0, min_index = ~0;
    const void *vertices = cvbr->vertex_buffer;
    const uint vertex_size = cvbr->vertex_size;
 
    for (i = 0; i < nr_indices; i++) {
       if (indices[i] > nr_vertices)
          nr_vertices = indices[i];
+      if (indices[i] < min_index)
+         min_index = indices[i];
    }
    nr_vertices++;
 
+#if 0
+   /*if (min_index > 0)*/
+      printf("%s min_index = %u\n", __FUNCTION__, min_index);
+#endif
+
 #if 0
    printf("cell_vbuf_draw() nr_indices = %u nr_verts = %u\n",
           nr_indices, nr_vertices);
@@ -169,7 +176,7 @@ cell_vbuf_draw(struct vbuf_render *vbr,
    /* compute x/y bounding box */
    xmin = ymin = 1e50;
    xmax = ymax = -1e50;
-   for (i = 0; i < nr_vertices; i++) {
+   for (i = min_index; i < nr_vertices; i++) {
       const float *v = (float *) ((ubyte *) vertices + i * vertex_size);
       if (v[0] < xmin)
          xmin = v[0];
@@ -204,6 +211,7 @@ cell_vbuf_draw(struct vbuf_render *vbr,
       render->prim_type = cvbr->prim;
 
       render->num_indexes = nr_indices;
+      render->min_index = min_index;
 
       /* append indices after render command */
       memcpy(render + 1, indices, nr_indices * 2);
@@ -214,6 +222,7 @@ cell_vbuf_draw(struct vbuf_render *vbr,
       render->vertex_size = 4 * cell->vertex_info.size;
       render->num_verts = nr_vertices;
       if (ALLOW_INLINE_VERTS &&
+          min_index == 0 &&
           vertex_bytes <= cell_batch_free_space(cell)) {
          /* vertex data inlined, after indices */
          void *dst = cell_batch_alloc(cell, vertex_bytes);
diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index 5a5b17dd89..3c9efb4741 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -253,7 +253,7 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
    /* we'll DMA into these buffers */
    ubyte vertex_data[CELL_BUFFER_SIZE] ALIGN16_ATTRIB;
    const uint vertex_size = render->vertex_size; /* in bytes */
-   const uint total_vertex_bytes = render->num_verts * vertex_size;
+   /*const*/ uint total_vertex_bytes = render->num_verts * vertex_size;
    const ubyte *vertices;
    const ushort *indexes;
    uint i, j;
@@ -289,9 +289,21 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
    }
    else {
       /* Begin DMA fetch of vertex buffer */
-      void *src = spu.init.buffers[render->vertex_buf];
-      mfc_get(vertex_data,  /* dest */
-              (unsigned int) src,
+      ubyte *src = spu.init.buffers[render->vertex_buf];
+      ubyte *dest = vertex_data;
+
+      /* skip vertex data we won't use */
+#if 01
+      src += render->min_index * vertex_size;
+      dest += render->min_index * vertex_size;
+      total_vertex_bytes -= render->min_index * vertex_size;
+#endif
+      ASSERT(total_vertex_bytes % 16 == 0);
+      ASSERT_ALIGN16(dest);
+      ASSERT_ALIGN16(src);
+
+      mfc_get(dest,   /* in vertex_data[] array */
+              (unsigned int) src,  /* src in main memory */
               total_vertex_bytes,  /* size */
               TAG_VERTEX_BUFFER,
               0, /* tid */
-- 
cgit v1.2.3


From 4bede9219be1f93844c5897216c6674b46a23a88 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 28 Jan 2008 18:09:16 -0700
Subject: Cell: add a few null texture tests

---
 src/mesa/pipe/cell/ppu/cell_state_emit.c | 13 ++++++++++---
 src/mesa/pipe/cell/ppu/cell_texture.c    |  3 ++-
 2 files changed, 12 insertions(+), 4 deletions(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/ppu/cell_state_emit.c b/src/mesa/pipe/cell/ppu/cell_state_emit.c
index 391ff454ac..702184416b 100644
--- a/src/mesa/pipe/cell/ppu/cell_state_emit.c
+++ b/src/mesa/pipe/cell/ppu/cell_state_emit.c
@@ -74,9 +74,16 @@ cell_emit_state(struct cell_context *cell)
 
    if (cell->dirty & CELL_NEW_TEXTURE) {
       struct cell_command_texture texture;
-      texture.start = cell->texture[0]->tiled_data;
-      texture.width = cell->texture[0]->base.width[0];
-      texture.height = cell->texture[0]->base.height[0];
+      if (cell->texture[0]) {
+         texture.start = cell->texture[0]->tiled_data;
+         texture.width = cell->texture[0]->base.width[0];
+         texture.height = cell->texture[0]->base.height[0];
+      }
+      else {
+         texture.start = NULL;
+         texture.width = 0;
+         texture.height = 0;
+      }
 
       emit_state_cmd(cell, CELL_CMD_STATE_TEXTURE,
                      &texture, sizeof(struct cell_command_texture));
diff --git a/src/mesa/pipe/cell/ppu/cell_texture.c b/src/mesa/pipe/cell/ppu/cell_texture.c
index acbe4c79f0..2cf6022939 100644
--- a/src/mesa/pipe/cell/ppu/cell_texture.c
+++ b/src/mesa/pipe/cell/ppu/cell_texture.c
@@ -234,7 +234,8 @@ cell_update_texture_mapping(struct cell_context *cell)
 {
    uint face = 0, level = 0, zslice = 0;
 
-   cell_tile_texture(cell, cell->texture[0]);
+   if (cell->texture[0])
+      cell_tile_texture(cell, cell->texture[0]);
 #if 0
    if (cell->tex_surf && cell->tex_map) {
       pipe_surface_unmap(cell->tex_surf);
-- 
cgit v1.2.3


From 64935c875128d2d1254b6b39ced72b9848d477fe Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 28 Jan 2008 18:17:30 -0700
Subject: Cell: move cmd_render() into new spu_render.c file

---
 src/mesa/pipe/cell/spu/Makefile     |   1 +
 src/mesa/pipe/cell/spu/spu_main.c   | 206 +------------------------------
 src/mesa/pipe/cell/spu/spu_main.h   |   1 +
 src/mesa/pipe/cell/spu/spu_render.c | 240 ++++++++++++++++++++++++++++++++++++
 src/mesa/pipe/cell/spu/spu_render.h |  38 ++++++
 5 files changed, 283 insertions(+), 203 deletions(-)
 create mode 100644 src/mesa/pipe/cell/spu/spu_render.c
 create mode 100644 src/mesa/pipe/cell/spu/spu_render.h

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/spu/Makefile b/src/mesa/pipe/cell/spu/Makefile
index 011fdcefe3..d5b30e1f27 100644
--- a/src/mesa/pipe/cell/spu/Makefile
+++ b/src/mesa/pipe/cell/spu/Makefile
@@ -17,6 +17,7 @@ PROG_SPU_EMBED_O = $(PROG)_spu-embed.o
 
 SOURCES = \
 	spu_main.c \
+	spu_render.c \
 	spu_texture.c \
 	spu_tile.c \
 	spu_tri.c
diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index 3c9efb4741..6e02f2c964 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -34,8 +34,8 @@
 #include <spu_mfcio.h>
 
 #include "spu_main.h"
+#include "spu_render.h"
 #include "spu_texture.h"
-#include "spu_tri.h"
 #include "spu_tile.h"
 #include "pipe/cell/common.h"
 #include "pipe/p_defines.h"
@@ -47,7 +47,7 @@ helpful headers:
 /opt/ibm/cell-sdk/prototype/sysroot/usr/include/libmisc.h
 */
 
-static boolean Debug = FALSE;
+boolean Debug = FALSE;
 
 struct spu_global spu;
 
@@ -61,7 +61,7 @@ wait_on_mask(unsigned tagMask)
 }
 
 
-static void
+static INLINE void
 wait_on_mask_all(unsigned tagMask)
 {
    mfc_write_tag_mask( tagMask );
@@ -192,206 +192,6 @@ cmd_clear_surface(const struct cell_command_clear_surface *clear)
 }
 
 
-/**
- * Given a rendering command's bounding box (in pixels) compute the
- * location of the corresponding screen tile bounding box.
- */
-static INLINE void
-tile_bounding_box(const struct cell_command_render *render,
-                  uint *txmin, uint *tymin,
-                  uint *box_num_tiles, uint *box_width_tiles)
-{
-#if 0
-   /* Debug: full-window bounding box */
-   uint txmax = spu.fb.width_tiles - 1;
-   uint tymax = spu.fb.height_tiles - 1;
-   *txmin = 0;
-   *tymin = 0;
-   *box_num_tiles = spu.fb.width_tiles * spu.fb.height_tiles;
-   *box_width_tiles = spu.fb.width_tiles;
-   (void) render;
-   (void) txmax;
-   (void) tymax;
-#else
-   uint txmax, tymax, box_height_tiles;
-
-   *txmin = (uint) render->xmin / TILE_SIZE;
-   *tymin = (uint) render->ymin / TILE_SIZE;
-   txmax = (uint) render->xmax / TILE_SIZE;
-   tymax = (uint) render->ymax / TILE_SIZE;
-   *box_width_tiles = txmax - *txmin + 1;
-   box_height_tiles = tymax - *tymin + 1;
-   *box_num_tiles = *box_width_tiles * box_height_tiles;
-#endif
-#if 0
-   printf("SPU %u: bounds: %g, %g  ...  %g, %g\n", spu.init.id,
-          render->xmin, render->ymin, render->xmax, render->ymax);
-   printf("SPU %u: tiles:  %u, %u .. %u, %u\n",
-           spu.init.id, *txmin, *tymin, txmax, tymax);
-   ASSERT(render->xmin <= render->xmax);
-   ASSERT(render->ymin <= render->ymax);
-#endif
-}
-
-
-/** Check if the tile at (tx,ty) belongs to this SPU */
-static INLINE boolean
-my_tile(uint tx, uint ty)
-{
-   return (spu.fb.width_tiles * ty + tx) % spu.init.num_spus == spu.init.id;
-}
-
-
-/**
- * Render primitives
- * \param pos_incr  returns value indicating how may words to skip after
- *                  this command in the batch buffer
- */
-static void
-cmd_render(const struct cell_command_render *render, uint *pos_incr)
-{
-   /* we'll DMA into these buffers */
-   ubyte vertex_data[CELL_BUFFER_SIZE] ALIGN16_ATTRIB;
-   const uint vertex_size = render->vertex_size; /* in bytes */
-   /*const*/ uint total_vertex_bytes = render->num_verts * vertex_size;
-   const ubyte *vertices;
-   const ushort *indexes;
-   uint i, j;
-
-
-   if (Debug) {
-      printf("SPU %u: RENDER prim %u, num_vert=%u  num_ind=%u  "
-             "inline_vert=%u\n",
-             spu.init.id,
-             render->prim_type,
-             render->num_verts,
-             render->num_indexes,
-             render->inline_verts);
-
-      /*
-      printf("       bound: %g, %g .. %g, %g\n",
-             render->xmin, render->ymin, render->xmax, render->ymax);
-      */
-   }
-
-   ASSERT(sizeof(*render) % 4 == 0);
-   ASSERT(total_vertex_bytes % 16 == 0);
-
-   /* indexes are right after the render command in the batch buffer */
-   indexes = (const ushort *) (render + 1);
-   *pos_incr = (render->num_indexes * 2 + 3) / 4;
-
-
-   if (render->inline_verts) {
-      /* Vertices are right after indexes in batch buffer */
-      vertices = (const ubyte *) (render + 1) + *pos_incr * 4;
-      *pos_incr = *pos_incr + total_vertex_bytes / 4;
-   }
-   else {
-      /* Begin DMA fetch of vertex buffer */
-      ubyte *src = spu.init.buffers[render->vertex_buf];
-      ubyte *dest = vertex_data;
-
-      /* skip vertex data we won't use */
-#if 01
-      src += render->min_index * vertex_size;
-      dest += render->min_index * vertex_size;
-      total_vertex_bytes -= render->min_index * vertex_size;
-#endif
-      ASSERT(total_vertex_bytes % 16 == 0);
-      ASSERT_ALIGN16(dest);
-      ASSERT_ALIGN16(src);
-
-      mfc_get(dest,   /* in vertex_data[] array */
-              (unsigned int) src,  /* src in main memory */
-              total_vertex_bytes,  /* size */
-              TAG_VERTEX_BUFFER,
-              0, /* tid */
-              0  /* rid */);
-
-      vertices = vertex_data;
-
-      wait_on_mask(1 << TAG_VERTEX_BUFFER);
-   }
-
-
-   /**
-    ** find tiles which intersect the prim bounding box
-    **/
-   uint txmin, tymin, box_width_tiles, box_num_tiles;
-   tile_bounding_box(render, &txmin, &tymin,
-                     &box_num_tiles, &box_width_tiles);
-
-
-   /* make sure any pending clears have completed */
-   wait_on_mask(1 << TAG_SURFACE_CLEAR); /* XXX temporary */
-
-
-   /**
-    ** loop over tiles, rendering tris
-    **/
-   for (i = 0; i < box_num_tiles; i++) {
-      const uint tx = txmin + i % box_width_tiles;
-      const uint ty = tymin + i / box_width_tiles;
-
-      ASSERT(tx < spu.fb.width_tiles);
-      ASSERT(ty < spu.fb.height_tiles);
-
-      if (!my_tile(tx, ty))
-         continue;
-
-      /* Start fetching color/z tiles.  We'll wait for completion when
-       * we need read/write to them later in triangle rasterization.
-       */
-      if (spu.depth_stencil.depth.enabled) {
-         if (tile_status_z[ty][tx] != TILE_STATUS_CLEAR) {
-            get_tile(tx, ty, &ztile, TAG_READ_TILE_Z, 1);
-         }
-      }
-
-      if (tile_status[ty][tx] != TILE_STATUS_CLEAR) {
-         get_tile(tx, ty, &ctile, TAG_READ_TILE_COLOR, 0);
-      }
-
-      ASSERT(render->prim_type == PIPE_PRIM_TRIANGLES);
-      ASSERT(render->num_indexes % 3 == 0);
-
-      /* loop over tris */
-      for (j = 0; j < render->num_indexes; j += 3) {
-         const float *v0, *v1, *v2;
-
-         v0 = (const float *) (vertices + indexes[j+0] * vertex_size);
-         v1 = (const float *) (vertices + indexes[j+1] * vertex_size);
-         v2 = (const float *) (vertices + indexes[j+2] * vertex_size);
-
-         tri_draw(v0, v1, v2, tx, ty);
-      }
-
-      /* write color/z tiles back to main framebuffer, if dirtied */
-      if (tile_status[ty][tx] == TILE_STATUS_DIRTY) {
-         put_tile(tx, ty, &ctile, TAG_WRITE_TILE_COLOR, 0);
-         tile_status[ty][tx] = TILE_STATUS_DEFINED;
-      }
-      if (spu.depth_stencil.depth.enabled) {
-         if (tile_status_z[ty][tx] == TILE_STATUS_DIRTY) {
-            put_tile(tx, ty, &ztile, TAG_WRITE_TILE_Z, 1);
-            tile_status_z[ty][tx] = TILE_STATUS_DEFINED;
-         }
-      }
-
-      /* XXX move these... */
-      wait_on_mask(1 << TAG_WRITE_TILE_COLOR);
-      if (spu.depth_stencil.depth.enabled) {
-         wait_on_mask(1 << TAG_WRITE_TILE_Z);
-      }
-   }
-
-   if (Debug)
-      printf("SPU %u: RENDER done\n",
-             spu.init.id);
-}
-
-
 static void
 cmd_release_verts(const struct cell_command_release_verts *release)
 {
diff --git a/src/mesa/pipe/cell/spu/spu_main.h b/src/mesa/pipe/cell/spu/spu_main.h
index 480c54ebd0..009e046ba5 100644
--- a/src/mesa/pipe/cell/spu/spu_main.h
+++ b/src/mesa/pipe/cell/spu/spu_main.h
@@ -70,6 +70,7 @@ struct spu_global
 
 
 extern struct spu_global spu;
+extern boolean Debug;
 
 
diff --git a/src/mesa/pipe/cell/spu/spu_render.c b/src/mesa/pipe/cell/spu/spu_render.c
new file mode 100644
index 0000000000..21a286a23d
--- /dev/null
+++ b/src/mesa/pipe/cell/spu/spu_render.c
@@ -0,0 +1,240 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#include <stdio.h>
+#include <libmisc.h>
+#include <spu_mfcio.h>
+
+#include "spu_main.h"
+#include "spu_render.h"
+#include "spu_tri.h"
+#include "spu_tile.h"
+#include "pipe/cell/common.h"
+
+
+
+/**
+ * Given a rendering command's bounding box (in pixels) compute the
+ * location of the corresponding screen tile bounding box.
+ */
+static INLINE void
+tile_bounding_box(const struct cell_command_render *render,
+                  uint *txmin, uint *tymin,
+                  uint *box_num_tiles, uint *box_width_tiles)
+{
+#if 0
+   /* Debug: full-window bounding box */
+   uint txmax = spu.fb.width_tiles - 1;
+   uint tymax = spu.fb.height_tiles - 1;
+   *txmin = 0;
+   *tymin = 0;
+   *box_num_tiles = spu.fb.width_tiles * spu.fb.height_tiles;
+   *box_width_tiles = spu.fb.width_tiles;
+   (void) render;
+   (void) txmax;
+   (void) tymax;
+#else
+   uint txmax, tymax, box_height_tiles;
+
+   *txmin = (uint) render->xmin / TILE_SIZE;
+   *tymin = (uint) render->ymin / TILE_SIZE;
+   txmax = (uint) render->xmax / TILE_SIZE;
+   tymax = (uint) render->ymax / TILE_SIZE;
+   *box_width_tiles = txmax - *txmin + 1;
+   box_height_tiles = tymax - *tymin + 1;
+   *box_num_tiles = *box_width_tiles * box_height_tiles;
+#endif
+#if 0
+   printf("SPU %u: bounds: %g, %g  ...  %g, %g\n", spu.init.id,
+          render->xmin, render->ymin, render->xmax, render->ymax);
+   printf("SPU %u: tiles:  %u, %u .. %u, %u\n",
+           spu.init.id, *txmin, *tymin, txmax, tymax);
+   ASSERT(render->xmin <= render->xmax);
+   ASSERT(render->ymin <= render->ymax);
+#endif
+}
+
+
+/** Check if the tile at (tx,ty) belongs to this SPU */
+static INLINE boolean
+my_tile(uint tx, uint ty)
+{
+   return (spu.fb.width_tiles * ty + tx) % spu.init.num_spus == spu.init.id;
+}
+
+
+/**
+ * Render primitives
+ * \param pos_incr  returns value indicating how may words to skip after
+ *                  this command in the batch buffer
+ */
+void
+cmd_render(const struct cell_command_render *render, uint *pos_incr)
+{
+   /* we'll DMA into these buffers */
+   ubyte vertex_data[CELL_BUFFER_SIZE] ALIGN16_ATTRIB;
+   const uint vertex_size = render->vertex_size; /* in bytes */
+   /*const*/ uint total_vertex_bytes = render->num_verts * vertex_size;
+   const ubyte *vertices;
+   const ushort *indexes;
+   uint i, j;
+
+
+   if (Debug) {
+      printf("SPU %u: RENDER prim %u, num_vert=%u  num_ind=%u  "
+             "inline_vert=%u\n",
+             spu.init.id,
+             render->prim_type,
+             render->num_verts,
+             render->num_indexes,
+             render->inline_verts);
+
+      /*
+      printf("       bound: %g, %g .. %g, %g\n",
+             render->xmin, render->ymin, render->xmax, render->ymax);
+      */
+   }
+
+   ASSERT(sizeof(*render) % 4 == 0);
+   ASSERT(total_vertex_bytes % 16 == 0);
+
+   /* indexes are right after the render command in the batch buffer */
+   indexes = (const ushort *) (render + 1);
+   *pos_incr = (render->num_indexes * 2 + 3) / 4;
+
+
+   if (render->inline_verts) {
+      /* Vertices are right after indexes in batch buffer */
+      vertices = (const ubyte *) (render + 1) + *pos_incr * 4;
+      *pos_incr = *pos_incr + total_vertex_bytes / 4;
+   }
+   else {
+      /* Begin DMA fetch of vertex buffer */
+      ubyte *src = spu.init.buffers[render->vertex_buf];
+      ubyte *dest = vertex_data;
+
+      /* skip vertex data we won't use */
+#if 01
+      src += render->min_index * vertex_size;
+      dest += render->min_index * vertex_size;
+      total_vertex_bytes -= render->min_index * vertex_size;
+#endif
+      ASSERT(total_vertex_bytes % 16 == 0);
+      ASSERT_ALIGN16(dest);
+      ASSERT_ALIGN16(src);
+
+      mfc_get(dest,   /* in vertex_data[] array */
+              (unsigned int) src,  /* src in main memory */
+              total_vertex_bytes,  /* size */
+              TAG_VERTEX_BUFFER,
+              0, /* tid */
+              0  /* rid */);
+
+      vertices = vertex_data;
+
+      wait_on_mask(1 << TAG_VERTEX_BUFFER);
+   }
+
+
+   /**
+    ** find tiles which intersect the prim bounding box
+    **/
+   uint txmin, tymin, box_width_tiles, box_num_tiles;
+   tile_bounding_box(render, &txmin, &tymin,
+                     &box_num_tiles, &box_width_tiles);
+
+
+   /* make sure any pending clears have completed */
+   wait_on_mask(1 << TAG_SURFACE_CLEAR); /* XXX temporary */
+
+
+   /**
+    ** loop over tiles, rendering tris
+    **/
+   for (i = 0; i < box_num_tiles; i++) {
+      const uint tx = txmin + i % box_width_tiles;
+      const uint ty = tymin + i / box_width_tiles;
+
+      ASSERT(tx < spu.fb.width_tiles);
+      ASSERT(ty < spu.fb.height_tiles);
+
+      if (!my_tile(tx, ty))
+         continue;
+
+      /* Start fetching color/z tiles.  We'll wait for completion when
+       * we need read/write to them later in triangle rasterization.
+       */
+      if (spu.depth_stencil.depth.enabled) {
+         if (tile_status_z[ty][tx] != TILE_STATUS_CLEAR) {
+            get_tile(tx, ty, &ztile, TAG_READ_TILE_Z, 1);
+         }
+      }
+
+      if (tile_status[ty][tx] != TILE_STATUS_CLEAR) {
+         get_tile(tx, ty, &ctile, TAG_READ_TILE_COLOR, 0);
+      }
+
+      ASSERT(render->prim_type == PIPE_PRIM_TRIANGLES);
+      ASSERT(render->num_indexes % 3 == 0);
+
+      /* loop over tris */
+      for (j = 0; j < render->num_indexes; j += 3) {
+         const float *v0, *v1, *v2;
+
+         v0 = (const float *) (vertices + indexes[j+0] * vertex_size);
+         v1 = (const float *) (vertices + indexes[j+1] * vertex_size);
+         v2 = (const float *) (vertices + indexes[j+2] * vertex_size);
+
+         tri_draw(v0, v1, v2, tx, ty);
+      }
+
+      /* write color/z tiles back to main framebuffer, if dirtied */
+      if (tile_status[ty][tx] == TILE_STATUS_DIRTY) {
+         put_tile(tx, ty, &ctile, TAG_WRITE_TILE_COLOR, 0);
+         tile_status[ty][tx] = TILE_STATUS_DEFINED;
+      }
+      if (spu.depth_stencil.depth.enabled) {
+         if (tile_status_z[ty][tx] == TILE_STATUS_DIRTY) {
+            put_tile(tx, ty, &ztile, TAG_WRITE_TILE_Z, 1);
+            tile_status_z[ty][tx] = TILE_STATUS_DEFINED;
+         }
+      }
+
+      /* XXX move these... */
+      wait_on_mask(1 << TAG_WRITE_TILE_COLOR);
+      if (spu.depth_stencil.depth.enabled) {
+         wait_on_mask(1 << TAG_WRITE_TILE_Z);
+      }
+   }
+
+   if (Debug)
+      printf("SPU %u: RENDER done\n",
+             spu.init.id);
+}
+
+
diff --git a/src/mesa/pipe/cell/spu/spu_render.h b/src/mesa/pipe/cell/spu/spu_render.h
new file mode 100644
index 0000000000..fbcdc5ec31
--- /dev/null
+++ b/src/mesa/pipe/cell/spu/spu_render.h
@@ -0,0 +1,38 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#ifndef SPU_RENDER_H
+#define SPU_RENDER_H
+
+#include "pipe/cell/common.h"
+
+extern void
+cmd_render(const struct cell_command_render *render, uint *pos_incr);
+
+#endif /* SPU_RENDER_H */
+
-- 
cgit v1.2.3


From 7f2713a29ff46a608de0feac2f56f034dbc738cb Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Tue, 29 Jan 2008 11:22:57 -0700
Subject: Cell: use _pack_rgba8() from pack_rgba8.h to do float[4]->uint color
 conversion

texcyl.c is twice as fast now in non-texture mode
---
 src/mesa/pipe/cell/spu/spu_tri.c | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index 19a231d9c4..7c6a54134f 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -29,6 +29,8 @@
  * Triangle rendering within a tile.
  */
 
+#include <pack_rgba8.h>
+
 #include "pipe/p_compiler.h"
 #include "pipe/p_format.h"
 #include "pipe/p_util.h"
@@ -38,7 +40,6 @@
 #include "spu_tri.h"
 
 
-
 /**
  * Simplified types taken from other parts of Gallium
  */
@@ -252,19 +253,11 @@ eval_z( struct setup_stage *setup,
 static INLINE uint
 pack_color(const float color[4])
 {
-   uint r = (uint) (color[0] * 255.0);
-   uint g = (uint) (color[1] * 255.0);
-   uint b = (uint) (color[2] * 255.0);
-   uint a = (uint) (color[3] * 255.0);
-   r = MIN2(r, 255);
-   g = MIN2(g, 255);
-   b = MIN2(b, 255);
-   a = MIN2(a, 255);
    switch (spu.fb.color_format) {
    case PIPE_FORMAT_A8R8G8B8_UNORM:
-      return (a << 24) | (r << 16) | (g << 8) | b;
+      return _pack_rgba8(color[3], color[0], color[1], color[2]);
    case PIPE_FORMAT_B8G8R8A8_UNORM:
-      return (b << 24) | (g << 16) | (r << 8) | a;
+      return _pack_rgba8(color[2], color[1], color[0], color[3]);
    default:
       ASSERT(0);
       return 0;
-- 
cgit v1.2.3


From da6eac242d9b79ad77389b6ab579804bc0261005 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Wed, 30 Jan 2008 11:49:26 -0700
Subject: Cell: move CELL_MAX_SPUS

---
 src/mesa/pipe/cell/common.h           | 2 ++
 src/mesa/pipe/cell/ppu/cell_context.h | 3 ---
 2 files changed, 2 insertions(+), 3 deletions(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/common.h b/src/mesa/pipe/cell/common.h
index 90aa46a534..d5e86863d4 100644
--- a/src/mesa/pipe/cell/common.h
+++ b/src/mesa/pipe/cell/common.h
@@ -61,6 +61,8 @@
 #define ROUNDUP16(k)  (((k) + 0xf) & ~0xf)
 
 
+#define CELL_MAX_SPUS 6
+
 #define TILE_SIZE 32
 
 
diff --git a/src/mesa/pipe/cell/ppu/cell_context.h b/src/mesa/pipe/cell/ppu/cell_context.h
index 7d234f3e45..65b89518ad 100644
--- a/src/mesa/pipe/cell/ppu/cell_context.h
+++ b/src/mesa/pipe/cell/ppu/cell_context.h
@@ -38,9 +38,6 @@
 #include "pipe/cell/common.h"
 
 
-#define CELL_MAX_SPUS 6
-
-
 struct cell_vbuf_render;
 
 struct cell_vertex_shader_state
-- 
cgit v1.2.3


From 41bdf4cf4c924e4c04c62dc144584cf7ead3cf44 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Wed, 30 Jan 2008 11:49:51 -0700
Subject: Cell: make wait_on_mask() static/inlined

---
 src/mesa/pipe/cell/spu/spu_main.c | 19 -------------------
 src/mesa/pipe/cell/spu/spu_main.h | 23 +++++++++++++++++++++--
 2 files changed, 21 insertions(+), 21 deletions(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index 6e02f2c964..6886f283be 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -31,7 +31,6 @@
 
 #include <stdio.h>
 #include <libmisc.h>
-#include <spu_mfcio.h>
 
 #include "spu_main.h"
 #include "spu_render.h"
@@ -52,24 +51,6 @@ boolean Debug = FALSE;
 struct spu_global spu;
 
 
-void
-wait_on_mask(unsigned tagMask)
-{
-   mfc_write_tag_mask( tagMask );
-   /* wait for completion of _any_ DMAs specified by tagMask */
-   mfc_read_tag_status_any();
-}
-
-
-static INLINE void
-wait_on_mask_all(unsigned tagMask)
-{
-   mfc_write_tag_mask( tagMask );
-   /* wait for completion of _any_ DMAs specified by tagMask */
-   mfc_read_tag_status_all();
-}
-
-
 /**
  * Tell the PPU that this SPU has finished copying a buffer to
  * local store and that it may be reused by the PPU.
diff --git a/src/mesa/pipe/cell/spu/spu_main.h b/src/mesa/pipe/cell/spu/spu_main.h
index 009e046ba5..8908bf8bc0 100644
--- a/src/mesa/pipe/cell/spu/spu_main.h
+++ b/src/mesa/pipe/cell/spu/spu_main.h
@@ -29,6 +29,8 @@
 #define SPU_MAIN_H
 
 
+#include <spu_mfcio.h>
+
 #include "pipe/cell/common.h"
 #include "pipe/draw/draw_vertex.h"
 #include "pipe/p_state.h"
@@ -90,8 +92,25 @@ extern boolean Debug;
 
 
-extern void
-wait_on_mask(unsigned tag);
+static INLINE void
+wait_on_mask(unsigned tagMask)
+{
+   mfc_write_tag_mask( tagMask );
+   /* wait for completion of _any_ DMAs specified by tagMask */
+   mfc_read_tag_status_any();
+}
+
+
+static INLINE void
+wait_on_mask_all(unsigned tagMask)
+{
+   mfc_write_tag_mask( tagMask );
+   /* wait for completion of _all_ DMAs specified by tagMask */
+   mfc_read_tag_status_all();
+}
+
+
+
 
 
 static INLINE void
-- 
cgit v1.2.3


From 0d3f60ec64965a07ef26b551436f0d768154e4d3 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Wed, 30 Jan 2008 11:56:14 -0700
Subject: Cell: check tile status before wait_on_mask()

---
 src/mesa/pipe/cell/spu/spu_tri.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index 7c6a54134f..01a47a4851 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -278,7 +278,7 @@ do_depth_test(struct setup_stage *setup, int x, int y, unsigned mask)
       /* now, _really_ clear the tile */
       clear_z_tile(&ztile);
    }
-   else {
+   else if (tile_status_z[setup->ty][setup->tx] != TILE_STATUS_DIRTY) {
       /* make sure we've got the tile from main mem */
       wait_on_mask(1 << TAG_READ_TILE_Z);
    }
@@ -403,7 +403,7 @@ emit_quad( struct setup_stage *setup, int x, int y, unsigned mask )
          /* now, _really_ clear the tile */
          clear_c_tile(&ctile);
       }
-      else {
+      else if (tile_status[setup->ty][setup->tx] != TILE_STATUS_DIRTY) {
          /* make sure we've got the tile from main mem */
          wait_on_mask(1 << TAG_READ_TILE_COLOR);
       }
-- 
cgit v1.2.3


From dcf41a0eed71a67060b4efa9ab4befc86eafc177 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Wed, 30 Jan 2008 11:56:41 -0700
Subject: Cell: minor code refactoring, movement

---
 src/mesa/pipe/cell/spu/spu_render.c | 85 ++++++++++++++++++++++++-------------
 1 file changed, 55 insertions(+), 30 deletions(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/spu/spu_render.c b/src/mesa/pipe/cell/spu/spu_render.c
index 21a286a23d..f506095116 100644
--- a/src/mesa/pipe/cell/spu/spu_render.c
+++ b/src/mesa/pipe/cell/spu/spu_render.c
@@ -88,6 +88,55 @@ my_tile(uint tx, uint ty)
 }
 
 
+/**
+ * Start fetching non-clear color/Z tiles from main memory
+ */
+static INLINE void
+get_cz_tiles(uint tx, uint ty)
+{
+   if (spu.depth_stencil.depth.enabled) {
+      if (tile_status_z[ty][tx] != TILE_STATUS_CLEAR) {
+         get_tile(tx, ty, &ztile, TAG_READ_TILE_Z, 1);
+      }
+   }
+
+   if (tile_status[ty][tx] != TILE_STATUS_CLEAR) {
+      get_tile(tx, ty, &ctile, TAG_READ_TILE_COLOR, 0);
+   }
+}
+
+
+/**
+ * Start putting dirty color/Z tiles back to main memory
+ */
+static INLINE void
+put_cz_tiles(uint tx, uint ty)
+{
+   if (tile_status_z[ty][tx] == TILE_STATUS_DIRTY) {
+      put_tile(tx, ty, &ztile, TAG_WRITE_TILE_Z, 1);
+      tile_status_z[ty][tx] = TILE_STATUS_DEFINED;
+   }
+
+   if (tile_status[ty][tx] == TILE_STATUS_DIRTY) {
+      put_tile(tx, ty, &ctile, TAG_WRITE_TILE_COLOR, 0);
+      tile_status[ty][tx] = TILE_STATUS_DEFINED;
+   }
+}
+
+
+/**
+ * Wait for 'put' of color/z tiles to complete.
+ */
+static INLINE void
+wait_put_cz_tiles(void)
+{
+   wait_on_mask(1 << TAG_WRITE_TILE_COLOR);
+   if (spu.depth_stencil.depth.enabled) {
+      wait_on_mask(1 << TAG_WRITE_TILE_Z);
+   }
+}
+
+
 /**
  * Render primitives
  * \param pos_incr  returns value indicating how may words to skip after
@@ -122,6 +171,9 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
 
    ASSERT(sizeof(*render) % 4 == 0);
    ASSERT(total_vertex_bytes % 16 == 0);
+   ASSERT(render->prim_type == PIPE_PRIM_TRIANGLES);
+   ASSERT(render->num_indexes % 3 == 0);
+
 
    /* indexes are right after the render command in the batch buffer */
    indexes = (const ushort *) (render + 1);
@@ -186,21 +238,7 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
       if (!my_tile(tx, ty))
          continue;
 
-      /* Start fetching color/z tiles.  We'll wait for completion when
-       * we need read/write to them later in triangle rasterization.
-       */
-      if (spu.depth_stencil.depth.enabled) {
-         if (tile_status_z[ty][tx] != TILE_STATUS_CLEAR) {
-            get_tile(tx, ty, &ztile, TAG_READ_TILE_Z, 1);
-         }
-      }
-
-      if (tile_status[ty][tx] != TILE_STATUS_CLEAR) {
-         get_tile(tx, ty, &ctile, TAG_READ_TILE_COLOR, 0);
-      }
-
-      ASSERT(render->prim_type == PIPE_PRIM_TRIANGLES);
-      ASSERT(render->num_indexes % 3 == 0);
+      get_cz_tiles(tx, ty);
 
       /* loop over tris */
       for (j = 0; j < render->num_indexes; j += 3) {
@@ -214,22 +252,9 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
       }
 
       /* write color/z tiles back to main framebuffer, if dirtied */
-      if (tile_status[ty][tx] == TILE_STATUS_DIRTY) {
-         put_tile(tx, ty, &ctile, TAG_WRITE_TILE_COLOR, 0);
-         tile_status[ty][tx] = TILE_STATUS_DEFINED;
-      }
-      if (spu.depth_stencil.depth.enabled) {
-         if (tile_status_z[ty][tx] == TILE_STATUS_DIRTY) {
-            put_tile(tx, ty, &ztile, TAG_WRITE_TILE_Z, 1);
-            tile_status_z[ty][tx] = TILE_STATUS_DEFINED;
-         }
-      }
+      put_cz_tiles(tx, ty);
 
-      /* XXX move these... */
-      wait_on_mask(1 << TAG_WRITE_TILE_COLOR);
-      if (spu.depth_stencil.depth.enabled) {
-         wait_on_mask(1 << TAG_WRITE_TILE_Z);
-      }
+      wait_put_cz_tiles(); /* XXX seems unnecessary... */
    }
 
    if (Debug)
-- 
cgit v1.2.3


From 022bf6dfa1ef1c18f0439553e39e473b678848e2 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Wed, 30 Jan 2008 12:08:23 -0700
Subject: Cell: make 'setup' a regular var instead of passing around a pointer
 everywhere

We'll never have more than one of these objects.
Avoiding pointer deref improves performance a bit.
---
 src/mesa/pipe/cell/spu/spu_tri.c | 419 +++++++++++++++++++--------------------
 1 file changed, 209 insertions(+), 210 deletions(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index 01a47a4851..5bb2cb12e3 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -135,6 +135,12 @@ struct setup_stage {
 };
 
 
+
+static struct setup_stage setup;
+
+
+
+
 #if 0
 /**
  * Basically a cast wrapper.
@@ -147,33 +153,33 @@ static INLINE struct setup_stage *setup_stage( struct draw_stage *stage )
 
 #if 0
 /**
- * Clip setup->quad against the scissor/surface bounds.
+ * Clip setup.quad against the scissor/surface bounds.
  */
 static INLINE void
 quad_clip(struct setup_stage *setup)
 {
-   const struct pipe_scissor_state *cliprect = &setup->softpipe->cliprect;
+   const struct pipe_scissor_state *cliprect = &setup.softpipe->cliprect;
    const int minx = (int) cliprect->minx;
    const int maxx = (int) cliprect->maxx;
    const int miny = (int) cliprect->miny;
    const int maxy = (int) cliprect->maxy;
 
-   if (setup->quad.x0 >= maxx ||
-       setup->quad.y0 >= maxy ||
-       setup->quad.x0 + 1 < minx ||
-       setup->quad.y0 + 1 < miny) {
+   if (setup.quad.x0 >= maxx ||
+       setup.quad.y0 >= maxy ||
+       setup.quad.x0 + 1 < minx ||
+       setup.quad.y0 + 1 < miny) {
       /* totally clipped */
-      setup->quad.mask = 0x0;
+      setup.quad.mask = 0x0;
       return;
    }
-   if (setup->quad.x0 < minx)
-      setup->quad.mask &= (MASK_BOTTOM_RIGHT | MASK_TOP_RIGHT);
-   if (setup->quad.y0 < miny)
-      setup->quad.mask &= (MASK_BOTTOM_LEFT | MASK_BOTTOM_RIGHT);
-   if (setup->quad.x0 == maxx - 1)
-      setup->quad.mask &= (MASK_BOTTOM_LEFT | MASK_TOP_LEFT);
-   if (setup->quad.y0 == maxy - 1)
-      setup->quad.mask &= (MASK_TOP_LEFT | MASK_TOP_RIGHT);
+   if (setup.quad.x0 < minx)
+      setup.quad.mask &= (MASK_BOTTOM_RIGHT | MASK_TOP_RIGHT);
+   if (setup.quad.y0 < miny)
+      setup.quad.mask &= (MASK_BOTTOM_LEFT | MASK_BOTTOM_RIGHT);
+   if (setup.quad.x0 == maxx - 1)
+      setup.quad.mask &= (MASK_BOTTOM_LEFT | MASK_TOP_LEFT);
+   if (setup.quad.y0 == maxy - 1)
+      setup.quad.mask &= (MASK_TOP_LEFT | MASK_TOP_RIGHT);
 }
 #endif
 
@@ -185,9 +191,9 @@ static INLINE void
 clip_emit_quad(struct setup_stage *setup)
 {
    quad_clip(setup);
-   if (setup->quad.mask) {
-      struct softpipe_context *sp = setup->softpipe;
-      sp->quad.first->run(sp->quad.first, &setup->quad);
+   if (setup.quad.mask) {
+      struct softpipe_context *sp = setup.softpipe;
+      sp->quad.first->run(sp->quad.first, &setup.quad);
    }
 }
 #endif
@@ -198,8 +204,7 @@ clip_emit_quad(struct setup_stage *setup)
  * Eg: four colors will be compute.
  */
 static INLINE void
-eval_coeff( struct setup_stage *setup, uint slot,
-            float x, float y, float result[4][4])
+eval_coeff(uint slot, float x, float y, float result[4][4])
 {
    switch (spu.vertex_info.interp_mode[slot]) {
    case INTERP_CONSTANT:
@@ -209,7 +214,7 @@ eval_coeff( struct setup_stage *setup, uint slot,
             result[QUAD_TOP_LEFT][i] =
             result[QUAD_TOP_RIGHT][i] =
             result[QUAD_BOTTOM_LEFT][i] =
-            result[QUAD_BOTTOM_RIGHT][i] = setup->coef[slot].a0[i];
+            result[QUAD_BOTTOM_RIGHT][i] = setup.coef[slot].a0[i];
          }
       }
       break;
@@ -219,12 +224,12 @@ eval_coeff( struct setup_stage *setup, uint slot,
    default:
       {
          uint i;
-         const float *dadx = setup->coef[slot].dadx;
-         const float *dady = setup->coef[slot].dady;
+         const float *dadx = setup.coef[slot].dadx;
+         const float *dady = setup.coef[slot].dady;
 
          /* loop over XYZW comps */
          for (i = 0; i < 4; i++) {
-            result[QUAD_TOP_LEFT][i] = setup->coef[slot].a0[i] + x * dadx[i] + y * dady[i];
+            result[QUAD_TOP_LEFT][i] = setup.coef[slot].a0[i] + x * dadx[i] + y * dady[i];
             result[QUAD_TOP_RIGHT][i] = result[0][i] + dadx[i];
             result[QUAD_BOTTOM_LEFT][i] = result[0][i] + dady[i];
             result[QUAD_BOTTOM_RIGHT][i] = result[0][i] + dadx[i] + dady[i];
@@ -235,15 +240,14 @@ eval_coeff( struct setup_stage *setup, uint slot,
 
 
 static INLINE void
-eval_z( struct setup_stage *setup,
-        float x, float y, float result[4])
+eval_z(float x, float y, float result[4])
 {
    const uint slot = 0;
    const uint i = 2;
-   const float *dadx = setup->coef[slot].dadx;
-   const float *dady = setup->coef[slot].dady;
+   const float *dadx = setup.coef[slot].dadx;
+   const float *dady = setup.coef[slot].dady;
 
-   result[QUAD_TOP_LEFT] = setup->coef[slot].a0[i] + x * dadx[i] + y * dady[i];
+   result[QUAD_TOP_LEFT] = setup.coef[slot].a0[i] + x * dadx[i] + y * dady[i];
    result[QUAD_TOP_RIGHT] = result[0] + dadx[i];
    result[QUAD_BOTTOM_LEFT] = result[0] + dady[i];
    result[QUAD_BOTTOM_RIGHT] = result[0] + dadx[i] + dady[i];
@@ -266,23 +270,23 @@ pack_color(const float color[4])
 
 
 static uint
-do_depth_test(struct setup_stage *setup, int x, int y, unsigned mask)
+do_depth_test(int x, int y, unsigned mask)
 {
-   int ix = x - setup->cliprect_minx;
-   int iy = y - setup->cliprect_miny;
+   int ix = x - setup.cliprect_minx;
+   int iy = y - setup.cliprect_miny;
    float zvals[4];
 
-   eval_z(setup, (float) x, (float) y, zvals);
+   eval_z((float) x, (float) y, zvals);
 
-   if (tile_status_z[setup->ty][setup->tx] == TILE_STATUS_CLEAR) {
+   if (tile_status_z[setup.ty][setup.tx] == TILE_STATUS_CLEAR) {
       /* now, _really_ clear the tile */
       clear_z_tile(&ztile);
    }
-   else if (tile_status_z[setup->ty][setup->tx] != TILE_STATUS_DIRTY) {
+   else if (tile_status_z[setup.ty][setup.tx] != TILE_STATUS_DIRTY) {
       /* make sure we've got the tile from main mem */
       wait_on_mask(1 << TAG_READ_TILE_Z);
    }
-   tile_status_z[setup->ty][setup->tx] = TILE_STATUS_DIRTY;
+   tile_status_z[setup.ty][setup.tx] = TILE_STATUS_DIRTY;
 
 
    if (spu.fb.depth_format == PIPE_FORMAT_Z16_UNORM) {
@@ -363,31 +367,31 @@ do_depth_test(struct setup_stage *setup, int x, int y, unsigned mask)
  * Emit a quad (pass to next stage).  No clipping is done.
  */
 static INLINE void
-emit_quad( struct setup_stage *setup, int x, int y, unsigned mask )
+emit_quad( int x, int y, unsigned mask )
 {
 #if 0
-   struct softpipe_context *sp = setup->softpipe;
-   setup->quad.x0 = x;
-   setup->quad.y0 = y;
-   setup->quad.mask = mask;
-   sp->quad.first->run(sp->quad.first, &setup->quad);
+   struct softpipe_context *sp = setup.softpipe;
+   setup.quad.x0 = x;
+   setup.quad.y0 = y;
+   setup.quad.mask = mask;
+   sp->quad.first->run(sp->quad.first, &setup.quad);
 #else
    /* Cell: "write" quad fragments to the tile by setting prim color */
-   const int ix = x - setup->cliprect_minx;
-   const int iy = y - setup->cliprect_miny;
+   const int ix = x - setup.cliprect_minx;
+   const int iy = y - setup.cliprect_miny;
    uint colors[4];  /* indexed by QUAD_x */
 
    if (spu.texture.start) {
       float texcoords[4][4];
       uint i;
-      eval_coeff(setup, 2, (float) x, (float) y, texcoords);
+      eval_coeff(2, (float) x, (float) y, texcoords);
       for (i = 0; i < 4; i++) {
          colors[i] = sample_texture(texcoords[i]);
       }
    }
    else {
       float fcolors[4][4];
-      eval_coeff(setup, 1, (float) x, (float) y, fcolors);
+      eval_coeff(1, (float) x, (float) y, fcolors);
       colors[QUAD_TOP_LEFT] = pack_color(fcolors[QUAD_TOP_LEFT]);
       colors[QUAD_TOP_RIGHT] = pack_color(fcolors[QUAD_TOP_RIGHT]);
       colors[QUAD_BOTTOM_LEFT] = pack_color(fcolors[QUAD_BOTTOM_LEFT]);
@@ -395,19 +399,19 @@ emit_quad( struct setup_stage *setup, int x, int y, unsigned mask )
    }
 
    if (spu.depth_stencil.depth.enabled) {
-      mask &= do_depth_test(setup, x, y, mask);
+      mask &= do_depth_test(x, y, mask);
    }
 
    if (mask) {
-      if (tile_status[setup->ty][setup->tx] == TILE_STATUS_CLEAR) {
+      if (tile_status[setup.ty][setup.tx] == TILE_STATUS_CLEAR) {
          /* now, _really_ clear the tile */
          clear_c_tile(&ctile);
       }
-      else if (tile_status[setup->ty][setup->tx] != TILE_STATUS_DIRTY) {
+      else if (tile_status[setup.ty][setup.tx] != TILE_STATUS_DIRTY) {
          /* make sure we've got the tile from main mem */
          wait_on_mask(1 << TAG_READ_TILE_COLOR);
       }
-      tile_status[setup->ty][setup->tx] = TILE_STATUS_DIRTY;
+      tile_status[setup.ty][setup.tx] = TILE_STATUS_DIRTY;
 
       if (mask & MASK_TOP_LEFT)
          ctile.t32[iy][ix] = colors[QUAD_TOP_LEFT];
@@ -439,20 +443,20 @@ static INLINE int block( int x )
  * this is pretty nasty...  may need to rework flush_spans again to
  * fix it, if possible.
  */
-static unsigned calculate_mask( struct setup_stage *setup, int x )
+static unsigned calculate_mask( int x )
 {
    unsigned mask = 0x0;
 
-   if (x >= setup->span.left[0] && x < setup->span.right[0]) 
+   if (x >= setup.span.left[0] && x < setup.span.right[0]) 
       mask |= MASK_TOP_LEFT;
 
-   if (x >= setup->span.left[1] && x < setup->span.right[1]) 
+   if (x >= setup.span.left[1] && x < setup.span.right[1]) 
       mask |= MASK_BOTTOM_LEFT;
       
-   if (x+1 >= setup->span.left[0] && x+1 < setup->span.right[0]) 
+   if (x+1 >= setup.span.left[0] && x+1 < setup.span.right[0]) 
       mask |= MASK_TOP_RIGHT;
 
-   if (x+1 >= setup->span.left[1] && x+1 < setup->span.right[1]) 
+   if (x+1 >= setup.span.left[1] && x+1 < setup.span.right[1]) 
       mask |= MASK_BOTTOM_RIGHT;
 
    return mask;
@@ -462,28 +466,28 @@ static unsigned calculate_mask( struct setup_stage *setup, int x )
 /**
  * Render a horizontal span of quads
  */
-static void flush_spans( struct setup_stage *setup )
+static void flush_spans( void )
 {
    int minleft, maxright;
    int x;
 
-   switch (setup->span.y_flags) {
+   switch (setup.span.y_flags) {
    case 0x3:
       /* both odd and even lines written (both quad rows) */
-      minleft = MIN2(setup->span.left[0], setup->span.left[1]);
-      maxright = MAX2(setup->span.right[0], setup->span.right[1]);
+      minleft = MIN2(setup.span.left[0], setup.span.left[1]);
+      maxright = MAX2(setup.span.right[0], setup.span.right[1]);
       break;
 
    case 0x1:
       /* only even line written (quad top row) */
-      minleft = setup->span.left[0];
-      maxright = setup->span.right[0];
+      minleft = setup.span.left[0];
+      maxright = setup.span.right[0];
       break;
 
    case 0x2:
       /* only odd line written (quad bottom row) */
-      minleft = setup->span.left[1];
-      maxright = setup->span.right[1];
+      minleft = setup.span.left[1];
+      maxright = setup.span.right[1];
       break;
 
    default:
@@ -494,31 +498,29 @@ static void flush_spans( struct setup_stage *setup )
     * calculate_mask() could be simplified a bit...
     */
    for (x = block(minleft); x <= block(maxright); x += 2) {
-      emit_quad( setup, x, setup->span.y, 
-                 calculate_mask( setup, x ) );
+      emit_quad( x, setup.span.y, 
+                 calculate_mask( x ) );
    }
 
-   setup->span.y = 0;
-   setup->span.y_flags = 0;
-   setup->span.right[0] = 0;
-   setup->span.right[1] = 0;
+   setup.span.y = 0;
+   setup.span.y_flags = 0;
+   setup.span.right[0] = 0;
+   setup.span.right[1] = 0;
 }
 
 #if DEBUG_VERTS
-static void print_vertex(const struct setup_stage *setup,
-                         const struct vertex_header *v)
+static void print_vertex(const struct vertex_header *v)
 {
    int i;
    fprintf(stderr, "Vertex: (%p)\n", v);
-   for (i = 0; i < setup->quad.nr_attrs; i++) {
+   for (i = 0; i < setup.quad.nr_attrs; i++) {
       fprintf(stderr, "  %d: %f %f %f %f\n",  i, 
               v->data[i][0], v->data[i][1], v->data[i][2], v->data[i][3]);
    }
 }
 #endif
 
-static boolean setup_sort_vertices( struct setup_stage *setup,
-				      const struct prim_header *prim )
+static boolean setup_sort_vertices(const struct prim_header *prim )
 {
    const struct vertex_header *v0 = prim->v[0];
    const struct vertex_header *v1 = prim->v[1];
@@ -526,12 +528,12 @@ static boolean setup_sort_vertices( struct setup_stage *setup,
 
 #if DEBUG_VERTS
    fprintf(stderr, "Triangle:\n");
-   print_vertex(setup, v0);
-   print_vertex(setup, v1);
-   print_vertex(setup, v2);
+   print_vertex(v0);
+   print_vertex(v1);
+   print_vertex(v2);
 #endif
 
-   setup->vprovoke = v2;
+   setup.vprovoke = v2;
 
    /* determine bottom to top order of vertices */
    {
@@ -541,65 +543,65 @@ static boolean setup_sort_vertices( struct setup_stage *setup,
       if (y0 <= y1) {
 	 if (y1 <= y2) {
 	    /* y0<=y1<=y2 */
-	    setup->vmin = v0;   
-	    setup->vmid = v1;   
-	    setup->vmax = v2;
+	    setup.vmin = v0;   
+	    setup.vmid = v1;   
+	    setup.vmax = v2;
 	 }
 	 else if (y2 <= y0) {
 	    /* y2<=y0<=y1 */
-	    setup->vmin = v2;   
-	    setup->vmid = v0;   
-	    setup->vmax = v1;   
+	    setup.vmin = v2;   
+	    setup.vmid = v0;   
+	    setup.vmax = v1;   
 	 }
 	 else {
 	    /* y0<=y2<=y1 */
-	    setup->vmin = v0;   
-	    setup->vmid = v2;   
-	    setup->vmax = v1;  
+	    setup.vmin = v0;   
+	    setup.vmid = v2;   
+	    setup.vmax = v1;  
 	 }
       }
       else {
 	 if (y0 <= y2) {
 	    /* y1<=y0<=y2 */
-	    setup->vmin = v1;   
-	    setup->vmid = v0;   
-	    setup->vmax = v2;  
+	    setup.vmin = v1;   
+	    setup.vmid = v0;   
+	    setup.vmax = v2;  
 	 }
 	 else if (y2 <= y1) {
 	    /* y2<=y1<=y0 */
-	    setup->vmin = v2;   
-	    setup->vmid = v1;   
-	    setup->vmax = v0;  
+	    setup.vmin = v2;   
+	    setup.vmid = v1;   
+	    setup.vmax = v0;  
 	 }
 	 else {
 	    /* y1<=y2<=y0 */
-	    setup->vmin = v1;   
-	    setup->vmid = v2;   
-	    setup->vmax = v0;
+	    setup.vmin = v1;   
+	    setup.vmid = v2;   
+	    setup.vmax = v0;
 	 }
       }
    }
 
    /* Check if triangle is completely outside the tile bounds */
-   if (setup->vmin->data[0][1] > setup->cliprect_maxy)
+   if (setup.vmin->data[0][1] > setup.cliprect_maxy)
       return FALSE;
-   if (setup->vmax->data[0][1] < setup->cliprect_miny)
+   if (setup.vmax->data[0][1] < setup.cliprect_miny)
       return FALSE;
-   if (setup->vmin->data[0][0] < setup->cliprect_minx &&
-       setup->vmid->data[0][0] < setup->cliprect_minx &&
-       setup->vmax->data[0][0] < setup->cliprect_minx)
+   if (setup.vmin->data[0][0] < setup.cliprect_minx &&
+       setup.vmid->data[0][0] < setup.cliprect_minx &&
+       setup.vmax->data[0][0] < setup.cliprect_minx)
       return FALSE;
-   if (setup->vmin->data[0][0] > setup->cliprect_maxx &&
-       setup->vmid->data[0][0] > setup->cliprect_maxx &&
-       setup->vmax->data[0][0] > setup->cliprect_maxx)
+   if (setup.vmin->data[0][0] > setup.cliprect_maxx &&
+       setup.vmid->data[0][0] > setup.cliprect_maxx &&
+       setup.vmax->data[0][0] > setup.cliprect_maxx)
       return FALSE;
 
-   setup->ebot.dx = setup->vmid->data[0][0] - setup->vmin->data[0][0];
-   setup->ebot.dy = setup->vmid->data[0][1] - setup->vmin->data[0][1];
-   setup->emaj.dx = setup->vmax->data[0][0] - setup->vmin->data[0][0];
-   setup->emaj.dy = setup->vmax->data[0][1] - setup->vmin->data[0][1];
-   setup->etop.dx = setup->vmax->data[0][0] - setup->vmid->data[0][0];
-   setup->etop.dy = setup->vmax->data[0][1] - setup->vmid->data[0][1];
+   setup.ebot.dx = setup.vmid->data[0][0] - setup.vmin->data[0][0];
+   setup.ebot.dy = setup.vmid->data[0][1] - setup.vmin->data[0][1];
+   setup.emaj.dx = setup.vmax->data[0][0] - setup.vmin->data[0][0];
+   setup.emaj.dy = setup.vmax->data[0][1] - setup.vmin->data[0][1];
+   setup.etop.dx = setup.vmax->data[0][0] - setup.vmid->data[0][0];
+   setup.etop.dy = setup.vmax->data[0][1] - setup.vmid->data[0][1];
 
    /*
     * Compute triangle's area.  Use 1/area to compute partial
@@ -612,13 +614,13 @@ static boolean setup_sort_vertices( struct setup_stage *setup,
     * use the prim->det value because its sign is correct.
     */
    {
-      const float area = (setup->emaj.dx * setup->ebot.dy - 
-			    setup->ebot.dx * setup->emaj.dy);
+      const float area = (setup.emaj.dx * setup.ebot.dy - 
+			    setup.ebot.dx * setup.emaj.dy);
 
-      setup->oneoverarea = 1.0f / area;
+      setup.oneoverarea = 1.0f / area;
       /*
       _mesa_printf("%s one-over-area %f  area %f  det %f\n",
-                   __FUNCTION__, setup->oneoverarea, area, prim->det );
+                   __FUNCTION__, setup.oneoverarea, area, prim->det );
       */
    }
 
@@ -627,7 +629,7 @@ static boolean setup_sort_vertices( struct setup_stage *setup,
     *  - the GLSL gl_FrontFacing fragment attribute (bool)
     *  - two-sided stencil test
     */
-   setup->quad.facing = (prim->det > 0.0) ^ (setup->softpipe->rasterizer->front_winding == PIPE_WINDING_CW);
+   setup.quad.facing = (prim->det > 0.0) ^ (setup.softpipe->rasterizer->front_winding == PIPE_WINDING_CW);
 #endif
 
    return TRUE;
@@ -637,22 +639,22 @@ static boolean setup_sort_vertices( struct setup_stage *setup,
 /**
  * Compute a0 for a constant-valued coefficient (GL_FLAT shading).
  * The value value comes from vertex->data[slot][i].
- * The result will be put into setup->coef[slot].a0[i].
+ * The result will be put into setup.coef[slot].a0[i].
  * \param slot  which attribute slot 
  * \param i  which component of the slot (0..3)
  */
-static void const_coeff(struct setup_stage *setup, uint slot)
+static void const_coeff(uint slot)
 {
    uint i;
    ASSERT(slot < PIPE_MAX_SHADER_INPUTS);
 
    for (i = 0; i < 4; i++) {
-      setup->coef[slot].dadx[i] = 0;
-      setup->coef[slot].dady[i] = 0;
+      setup.coef[slot].dadx[i] = 0;
+      setup.coef[slot].dady[i] = 0;
 
       /* need provoking vertex info!
        */
-      setup->coef[slot].a0[i] = setup->vprovoke->data[slot][i];
+      setup.coef[slot].a0[i] = setup.vprovoke->data[slot][i];
    }
 }
 
@@ -661,20 +663,19 @@ static void const_coeff(struct setup_stage *setup, uint slot)
  * Compute a0, dadx and dady for a linearly interpolated coefficient,
  * for a triangle.
  */
-static void tri_linear_coeff( struct setup_stage *setup,
-                              uint slot, uint firstComp, uint lastComp )
+static void tri_linear_coeff( uint slot, uint firstComp, uint lastComp )
 {
    uint i;
    for (i = firstComp; i < lastComp; i++) {
-      float botda = setup->vmid->data[slot][i] - setup->vmin->data[slot][i];
-      float majda = setup->vmax->data[slot][i] - setup->vmin->data[slot][i];
-      float a = setup->ebot.dy * majda - botda * setup->emaj.dy;
-      float b = setup->emaj.dx * botda - majda * setup->ebot.dx;
+      float botda = setup.vmid->data[slot][i] - setup.vmin->data[slot][i];
+      float majda = setup.vmax->data[slot][i] - setup.vmin->data[slot][i];
+      float a = setup.ebot.dy * majda - botda * setup.emaj.dy;
+      float b = setup.emaj.dx * botda - majda * setup.ebot.dx;
    
       ASSERT(slot < PIPE_MAX_SHADER_INPUTS);
 
-      setup->coef[slot].dadx[i] = a * setup->oneoverarea;
-      setup->coef[slot].dady[i] = b * setup->oneoverarea;
+      setup.coef[slot].dadx[i] = a * setup.oneoverarea;
+      setup.coef[slot].dady[i] = b * setup.oneoverarea;
 
       /* calculate a0 as the value which would be sampled for the
        * fragment at (0,0), taking into account that we want to sample at
@@ -688,17 +689,17 @@ static void tri_linear_coeff( struct setup_stage *setup,
        * to define a0 as the sample at a pixel center somewhere near vmin
        * instead - i'll switch to this later.
        */
-      setup->coef[slot].a0[i] = (setup->vmin->data[slot][i] - 
-                                 (setup->coef[slot].dadx[i] * (setup->vmin->data[0][0] - 0.5f) + 
-                                  setup->coef[slot].dady[i] * (setup->vmin->data[0][1] - 0.5f)));
+      setup.coef[slot].a0[i] = (setup.vmin->data[slot][i] - 
+                                 (setup.coef[slot].dadx[i] * (setup.vmin->data[0][0] - 0.5f) + 
+                                  setup.coef[slot].dady[i] * (setup.vmin->data[0][1] - 0.5f)));
    }
 
    /*
    _mesa_printf("attr[%d].%c: %f dx:%f dy:%f\n",
 		slot, "xyzw"[i], 
-		setup->coef[slot].a0[i],
-		setup->coef[slot].dadx[i],
-		setup->coef[slot].dady[i]);
+		setup.coef[slot].a0[i],
+		setup.coef[slot].dadx[i],
+		setup.coef[slot].dady[i]);
    */
 }
 
@@ -712,46 +713,45 @@ static void tri_linear_coeff( struct setup_stage *setup,
  * Later, when we compute the value at a particular fragment position we'll
  * divide the interpolated value by the interpolated W at that fragment.
  */
-static void tri_persp_coeff( struct setup_stage *setup,
-                             unsigned slot,
+static void tri_persp_coeff( unsigned slot,
                              unsigned i )
 {
    /* premultiply by 1/w:
     */
-   float mina = setup->vmin->data[slot][i] * setup->vmin->data[0][3];
-   float mida = setup->vmid->data[slot][i] * setup->vmid->data[0][3];
-   float maxa = setup->vmax->data[slot][i] * setup->vmax->data[0][3];
+   float mina = setup.vmin->data[slot][i] * setup.vmin->data[0][3];
+   float mida = setup.vmid->data[slot][i] * setup.vmid->data[0][3];
+   float maxa = setup.vmax->data[slot][i] * setup.vmax->data[0][3];
 
    float botda = mida - mina;
    float majda = maxa - mina;
-   float a = setup->ebot.dy * majda - botda * setup->emaj.dy;
-   float b = setup->emaj.dx * botda - majda * setup->ebot.dx;
+   float a = setup.ebot.dy * majda - botda * setup.emaj.dy;
+   float b = setup.emaj.dx * botda - majda * setup.ebot.dx;
       
    /*
    printf("tri persp %d,%d: %f %f %f\n", slot, i,
-          setup->vmin->data[slot][i],
-          setup->vmid->data[slot][i],
-          setup->vmax->data[slot][i]
+          setup.vmin->data[slot][i],
+          setup.vmid->data[slot][i],
+          setup.vmax->data[slot][i]
           );
    */
 
    assert(slot < PIPE_MAX_SHADER_INPUTS);
    assert(i <= 3);
 
-   setup->coef[slot].dadx[i] = a * setup->oneoverarea;
-   setup->coef[slot].dady[i] = b * setup->oneoverarea;
-   setup->coef[slot].a0[i] = (mina - 
-			    (setup->coef[slot].dadx[i] * (setup->vmin->data[0][0] - 0.5f) + 
-			     setup->coef[slot].dady[i] * (setup->vmin->data[0][1] - 0.5f)));
+   setup.coef[slot].dadx[i] = a * setup.oneoverarea;
+   setup.coef[slot].dady[i] = b * setup.oneoverarea;
+   setup.coef[slot].a0[i] = (mina - 
+			    (setup.coef[slot].dadx[i] * (setup.vmin->data[0][0] - 0.5f) + 
+			     setup.coef[slot].dady[i] * (setup.vmin->data[0][1] - 0.5f)));
 }
 #endif
 
 
 /**
- * Compute the setup->coef[] array dadx, dady, a0 values.
- * Must be called after setup->vmin,vmid,vmax,vprovoke are initialized.
+ * Compute the setup.coef[] array dadx, dady, a0 values.
+ * Must be called after setup.vmin,vmid,vmax,vprovoke are initialized.
  */
-static void setup_tri_coefficients( struct setup_stage *setup )
+static void setup_tri_coefficients(void)
 {
 #if 1
    uint i;
@@ -761,17 +761,17 @@ static void setup_tri_coefficients( struct setup_stage *setup )
       case INTERP_NONE:
          break;
       case INTERP_POS:
-         tri_linear_coeff(setup, i, 2, 3);
+         tri_linear_coeff(i, 2, 3);
          /* XXX interp W if PERSPECTIVE... */
          break;
       case INTERP_CONSTANT:
-         const_coeff(setup, i);
+         const_coeff(i);
          break;
       case INTERP_LINEAR:
-         tri_linear_coeff(setup, i, 0, 4);
+         tri_linear_coeff(i, 0, 4);
          break;
       case INTERP_PERSPECTIVE:
-         tri_linear_coeff(setup, i, 0, 4); /* XXX temporary */
+         tri_linear_coeff(i, 0, 4); /* XXX temporary */
          break;
       default:
          ASSERT(0);
@@ -781,35 +781,35 @@ static void setup_tri_coefficients( struct setup_stage *setup )
    ASSERT(spu.vertex_info.interp_mode[0] == INTERP_POS);
    ASSERT(spu.vertex_info.interp_mode[1] == INTERP_LINEAR ||
           spu.vertex_info.interp_mode[1] == INTERP_CONSTANT);
-   tri_linear_coeff(setup, 0, 2, 3);  /* slot 0, z */
-   tri_linear_coeff(setup, 1, 0, 4);  /* slot 1, color */
+   tri_linear_coeff(0, 2, 3);  /* slot 0, z */
+   tri_linear_coeff(1, 0, 4);  /* slot 1, color */
 #endif
 }
 
 
-static void setup_tri_edges( struct setup_stage *setup )
+static void setup_tri_edges(void)
 {
-   float vmin_x = setup->vmin->data[0][0] + 0.5f;
-   float vmid_x = setup->vmid->data[0][0] + 0.5f;
-
-   float vmin_y = setup->vmin->data[0][1] - 0.5f;
-   float vmid_y = setup->vmid->data[0][1] - 0.5f;
-   float vmax_y = setup->vmax->data[0][1] - 0.5f;
-
-   setup->emaj.sy = CEILF(vmin_y);
-   setup->emaj.lines = (int) CEILF(vmax_y - setup->emaj.sy);
-   setup->emaj.dxdy = setup->emaj.dx / setup->emaj.dy;
-   setup->emaj.sx = vmin_x + (setup->emaj.sy - vmin_y) * setup->emaj.dxdy;
-
-   setup->etop.sy = CEILF(vmid_y);
-   setup->etop.lines = (int) CEILF(vmax_y - setup->etop.sy);
-   setup->etop.dxdy = setup->etop.dx / setup->etop.dy;
-   setup->etop.sx = vmid_x + (setup->etop.sy - vmid_y) * setup->etop.dxdy;
-
-   setup->ebot.sy = CEILF(vmin_y);
-   setup->ebot.lines = (int) CEILF(vmid_y - setup->ebot.sy);
-   setup->ebot.dxdy = setup->ebot.dx / setup->ebot.dy;
-   setup->ebot.sx = vmin_x + (setup->ebot.sy - vmin_y) * setup->ebot.dxdy;
+   float vmin_x = setup.vmin->data[0][0] + 0.5f;
+   float vmid_x = setup.vmid->data[0][0] + 0.5f;
+
+   float vmin_y = setup.vmin->data[0][1] - 0.5f;
+   float vmid_y = setup.vmid->data[0][1] - 0.5f;
+   float vmax_y = setup.vmax->data[0][1] - 0.5f;
+
+   setup.emaj.sy = CEILF(vmin_y);
+   setup.emaj.lines = (int) CEILF(vmax_y - setup.emaj.sy);
+   setup.emaj.dxdy = setup.emaj.dx / setup.emaj.dy;
+   setup.emaj.sx = vmin_x + (setup.emaj.sy - vmin_y) * setup.emaj.dxdy;
+
+   setup.etop.sy = CEILF(vmid_y);
+   setup.etop.lines = (int) CEILF(vmax_y - setup.etop.sy);
+   setup.etop.dxdy = setup.etop.dx / setup.etop.dy;
+   setup.etop.sx = vmid_x + (setup.etop.sy - vmid_y) * setup.etop.dxdy;
+
+   setup.ebot.sy = CEILF(vmin_y);
+   setup.ebot.lines = (int) CEILF(vmid_y - setup.ebot.sy);
+   setup.ebot.dxdy = setup.ebot.dx / setup.ebot.dy;
+   setup.ebot.sx = vmin_x + (setup.ebot.sy - vmin_y) * setup.ebot.dxdy;
 }
 
 
@@ -817,15 +817,14 @@ static void setup_tri_edges( struct setup_stage *setup )
  * Render the upper or lower half of a triangle.
  * Scissoring/cliprect is applied here too.
  */
-static void subtriangle( struct setup_stage *setup,
-			 struct edge *eleft,
+static void subtriangle( struct edge *eleft,
 			 struct edge *eright,
 			 unsigned lines )
 {
-   const int minx = setup->cliprect_minx;
-   const int maxx = setup->cliprect_maxx;
-   const int miny = setup->cliprect_miny;
-   const int maxy = setup->cliprect_maxy;
+   const int minx = setup.cliprect_minx;
+   const int maxx = setup.cliprect_maxx;
+   const int miny = setup.cliprect_miny;
+   const int maxy = setup.cliprect_maxy;
    int y, start_y, finish_y;
    int sy = (int)eleft->sy;
 
@@ -867,14 +866,14 @@ static void subtriangle( struct setup_stage *setup,
 
       if (left < right) {
          int _y = sy + y;
-         if (block(_y) != setup->span.y) {
-            flush_spans(setup);
-            setup->span.y = block(_y);
+         if (block(_y) != setup.span.y) {
+            flush_spans();
+            setup.span.y = block(_y);
          }
 
-         setup->span.left[_y&1] = left;
-         setup->span.right[_y&1] = right;
-         setup->span.y_flags |= 1<<(_y&1);
+         setup.span.left[_y&1] = left;
+         setup.span.right[_y&1] = right;
+         setup.span.y_flags |= 1<<(_y&1);
       }
    }
 
@@ -892,41 +891,41 @@ static void subtriangle( struct setup_stage *setup,
  * Do setup for triangle rasterization, then render the triangle.
  */
 static void
-setup_tri(struct setup_stage *setup, struct prim_header *prim)
+setup_tri(struct prim_header *prim)
 {
-   if (!setup_sort_vertices( setup, prim )) {
+   if (!setup_sort_vertices( prim )) {
       return; /* totally clipped */
    }
 
-   setup_tri_coefficients( setup );
-   setup_tri_edges( setup );
+   setup_tri_coefficients();
+   setup_tri_edges();
 
 #if 0
-   setup->quad.prim = PRIM_TRI;
+   setup.quad.prim = PRIM_TRI;
 #endif
 
-   setup->span.y = 0;
-   setup->span.y_flags = 0;
-   setup->span.right[0] = 0;
-   setup->span.right[1] = 0;
-   /*   setup->span.z_mode = tri_z_mode( setup->ctx ); */
+   setup.span.y = 0;
+   setup.span.y_flags = 0;
+   setup.span.right[0] = 0;
+   setup.span.right[1] = 0;
+   /*   setup.span.z_mode = tri_z_mode( setup.ctx ); */
 
    /*   init_constant_attribs( setup ); */
       
-   if (setup->oneoverarea < 0.0) {
+   if (setup.oneoverarea < 0.0) {
       /* emaj on left:
        */
-      subtriangle( setup, &setup->emaj, &setup->ebot, setup->ebot.lines );
-      subtriangle( setup, &setup->emaj, &setup->etop, setup->etop.lines );
+      subtriangle( &setup.emaj, &setup.ebot, setup.ebot.lines );
+      subtriangle( &setup.emaj, &setup.etop, setup.etop.lines );
    }
    else {
       /* emaj on right:
        */
-      subtriangle( setup, &setup->ebot, &setup->emaj, setup->ebot.lines );
-      subtriangle( setup, &setup->etop, &setup->emaj, setup->etop.lines );
+      subtriangle( &setup.ebot, &setup.emaj, setup.ebot.lines );
+      subtriangle( &setup.etop, &setup.emaj, setup.etop.lines );
    }
 
-   flush_spans( setup );
+   flush_spans();
 }
 
 
@@ -939,7 +938,7 @@ void
 tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty)
 {
    struct prim_header tri;
-   struct setup_stage setup;
+   /*struct setup_stage setup;*/
 
    tri.v[0] = (struct vertex_header *) v0;
    tri.v[1] = (struct vertex_header *) v1;
@@ -954,5 +953,5 @@ tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty)
    setup.cliprect_maxx = (tx + 1) * TILE_SIZE;
    setup.cliprect_maxy = (ty + 1) * TILE_SIZE;
 
-   setup_tri(&setup, &tri);
+   setup_tri(&tri);
 }
-- 
cgit v1.2.3


From 7b149449df3a7de62f79eb96d5b722cc9d3b5912 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Wed, 30 Jan 2008 12:13:04 -0700
Subject: Cell: fold setup_tri() into tri_draw()

---
 src/mesa/pipe/cell/spu/spu_tri.c | 62 ++++++++++++----------------------------
 1 file changed, 19 insertions(+), 43 deletions(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index 5bb2cb12e3..1c615a6e6a 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -47,9 +47,6 @@ struct vertex_header {
    float data[0][4];
 };
 
-struct prim_header {
-   struct vertex_header *v[3];
-};
 
 
 /* XXX fix this */
@@ -520,11 +517,10 @@ static void print_vertex(const struct vertex_header *v)
 }
 #endif
 
-static boolean setup_sort_vertices(const struct prim_header *prim )
+static boolean setup_sort_vertices(const struct vertex_header *v0,
+                                   const struct vertex_header *v1,
+                                   const struct vertex_header *v2)
 {
-   const struct vertex_header *v0 = prim->v[0];
-   const struct vertex_header *v1 = prim->v[1];
-   const struct vertex_header *v2 = prim->v[2];
 
 #if DEBUG_VERTS
    fprintf(stderr, "Triangle:\n");
@@ -888,22 +884,30 @@ static void subtriangle( struct edge *eleft,
 
 
 /**
- * Do setup for triangle rasterization, then render the triangle.
+ * Draw triangle into tile at (tx, ty) (tile coords)
+ * The tile data should have already been fetched.
  */
-static void
-setup_tri(struct prim_header *prim)
+void
+tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty)
 {
-   if (!setup_sort_vertices( prim )) {
+   setup.tx = tx;
+   setup.ty = ty;
+
+   /* set clipping bounds to tile bounds */
+   setup.cliprect_minx = tx * TILE_SIZE;
+   setup.cliprect_miny = ty * TILE_SIZE;
+   setup.cliprect_maxx = (tx + 1) * TILE_SIZE;
+   setup.cliprect_maxy = (ty + 1) * TILE_SIZE;
+
+   if (!setup_sort_vertices((struct vertex_header *) v0,
+                            (struct vertex_header *) v1,
+                            (struct vertex_header *) v2)) {
       return; /* totally clipped */
    }
 
    setup_tri_coefficients();
    setup_tri_edges();
 
-#if 0
-   setup.quad.prim = PRIM_TRI;
-#endif
-
    setup.span.y = 0;
    setup.span.y_flags = 0;
    setup.span.right[0] = 0;
@@ -927,31 +931,3 @@ setup_tri(struct prim_header *prim)
 
    flush_spans();
 }
-
-
-
-/**
- * Draw triangle into tile at (tx, ty) (tile coords)
- * The tile data should have already been fetched.
- */
-void
-tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty)
-{
-   struct prim_header tri;
-   /*struct setup_stage setup;*/
-
-   tri.v[0] = (struct vertex_header *) v0;
-   tri.v[1] = (struct vertex_header *) v1;
-   tri.v[2] = (struct vertex_header *) v2;
-
-   setup.tx = tx;
-   setup.ty = ty;
-
-   /* set clipping bounds to tile bounds */
-   setup.cliprect_minx = tx * TILE_SIZE;
-   setup.cliprect_miny = ty * TILE_SIZE;
-   setup.cliprect_maxx = (tx + 1) * TILE_SIZE;
-   setup.cliprect_maxy = (ty + 1) * TILE_SIZE;
-
-   setup_tri(&tri);
-}
-- 
cgit v1.2.3


From 24f0e54c1b9ff43dcb75758c8e0faba355c0617c Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Wed, 30 Jan 2008 15:26:51 -0700
Subject: Cell: start to SIMD-ize triangle attribute interpolation

Using the spu_add(), etc intrinsics.
About a 15% speed-up with some tests.
---
 src/mesa/pipe/cell/spu/spu_main.h    |   7 ++
 src/mesa/pipe/cell/spu/spu_texture.c |   6 +-
 src/mesa/pipe/cell/spu/spu_texture.h |   2 +-
 src/mesa/pipe/cell/spu/spu_tri.c     | 126 +++++++++++++++++++----------------
 4 files changed, 79 insertions(+), 62 deletions(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/spu/spu_main.h b/src/mesa/pipe/cell/spu/spu_main.h
index 8908bf8bc0..73f9ed29d6 100644
--- a/src/mesa/pipe/cell/spu/spu_main.h
+++ b/src/mesa/pipe/cell/spu/spu_main.h
@@ -36,6 +36,13 @@
 #include "pipe/p_state.h"
 
 
+typedef union
+{
+   vector float v;
+   float f[4];
+} float4;
+
+
 struct spu_framebuffer {
    void *color_start;              /**< addr of color surface in main memory */
    void *depth_start;              /**< addr of depth surface in main memory */
diff --git a/src/mesa/pipe/cell/spu/spu_texture.c b/src/mesa/pipe/cell/spu/spu_texture.c
index 6d566a5006..7a1ca097c0 100644
--- a/src/mesa/pipe/cell/spu/spu_texture.c
+++ b/src/mesa/pipe/cell/spu/spu_texture.c
@@ -128,11 +128,11 @@ get_tex_tile(uint i, uint j)
  * XXX this is extremely primitive for now.
  */
 uint
-sample_texture(const float *texcoord)
+sample_texture(float4 texcoord)
 {
    /* wrap/repeat */
-   uint i = (uint) (texcoord[0] * spu.texture.width) % spu.texture.width;
-   uint j = (uint) (texcoord[1] * spu.texture.height) % spu.texture.height;
+   uint i = (uint) (texcoord.f[0] * spu.texture.width) % spu.texture.width;
+   uint j = (uint) (texcoord.f[1] * spu.texture.height) % spu.texture.height;
    uint pos = get_tex_tile(i, j);
    uint texel = tex_tiles[pos].t32[j % TILE_SIZE][i % TILE_SIZE];
    return texel;
diff --git a/src/mesa/pipe/cell/spu/spu_texture.h b/src/mesa/pipe/cell/spu/spu_texture.h
index b75b7ac44f..938a42b549 100644
--- a/src/mesa/pipe/cell/spu/spu_texture.h
+++ b/src/mesa/pipe/cell/spu/spu_texture.h
@@ -37,7 +37,7 @@ invalidate_tex_cache(void);
 
 
 extern uint
-sample_texture(const float *texcoord);
+sample_texture(float4 texcoord);
 
 
 #endif /* SPU_TEXTURE_H */
diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index 1c615a6e6a..4fc6d90895 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -81,9 +81,9 @@ struct edge {
 
 struct interp_coef
 {
-   float a0[4];
-   float dadx[4];
-   float dady[4];
+   float4 a0;
+   float4 dadx;
+   float4 dady;
 };
 
 
@@ -201,36 +201,31 @@ clip_emit_quad(struct setup_stage *setup)
  * Eg: four colors will be compute.
  */
 static INLINE void
-eval_coeff(uint slot, float x, float y, float result[4][4])
+eval_coeff(uint slot, float x, float y, float4 result[4])
 {
    switch (spu.vertex_info.interp_mode[slot]) {
    case INTERP_CONSTANT:
-      {
-         uint i;
-         for (i = 0; i < 4; i++) {
-            result[QUAD_TOP_LEFT][i] =
-            result[QUAD_TOP_RIGHT][i] =
-            result[QUAD_BOTTOM_LEFT][i] =
-            result[QUAD_BOTTOM_RIGHT][i] = setup.coef[slot].a0[i];
-         }
-      }
+      result[QUAD_TOP_LEFT] =
+      result[QUAD_TOP_RIGHT] =
+      result[QUAD_BOTTOM_LEFT] =
+      result[QUAD_BOTTOM_RIGHT] = setup.coef[slot].a0;
       break;
 
    case INTERP_LINEAR:
       /* fall-through, for now */
    default:
       {
-         uint i;
-         const float *dadx = setup.coef[slot].dadx;
-         const float *dady = setup.coef[slot].dady;
-
-         /* loop over XYZW comps */
-         for (i = 0; i < 4; i++) {
-            result[QUAD_TOP_LEFT][i] = setup.coef[slot].a0[i] + x * dadx[i] + y * dady[i];
-            result[QUAD_TOP_RIGHT][i] = result[0][i] + dadx[i];
-            result[QUAD_BOTTOM_LEFT][i] = result[0][i] + dady[i];
-            result[QUAD_BOTTOM_RIGHT][i] = result[0][i] + dadx[i] + dady[i];
-         }
+         register vector float dadx = setup.coef[slot].dadx.v;
+         register vector float dady = setup.coef[slot].dady.v;
+         register vector float topLeft
+            = spu_add(setup.coef[slot].a0.v,
+                      spu_add(spu_mul(spu_splats(x), dadx),
+                              spu_mul(spu_splats(y), dady)));
+
+         result[QUAD_TOP_LEFT].v = topLeft;
+         result[QUAD_TOP_RIGHT].v = spu_add(topLeft, dadx);
+         result[QUAD_BOTTOM_LEFT].v = spu_add(topLeft, dady);
+         result[QUAD_BOTTOM_RIGHT].v = spu_add(spu_add(topLeft, dadx), dady);
       }
    }
 }
@@ -240,28 +235,46 @@ static INLINE void
 eval_z(float x, float y, float result[4])
 {
    const uint slot = 0;
-   const uint i = 2;
-   const float *dadx = setup.coef[slot].dadx;
-   const float *dady = setup.coef[slot].dady;
-
-   result[QUAD_TOP_LEFT] = setup.coef[slot].a0[i] + x * dadx[i] + y * dady[i];
-   result[QUAD_TOP_RIGHT] = result[0] + dadx[i];
-   result[QUAD_BOTTOM_LEFT] = result[0] + dady[i];
-   result[QUAD_BOTTOM_RIGHT] = result[0] + dadx[i] + dady[i];
+   const float dzdx = setup.coef[slot].dadx.f[2];
+   const float dzdy = setup.coef[slot].dady.f[2];
+   const float topLeft = setup.coef[slot].a0.f[2] + x * dzdx + y * dzdy;
+#if 1
+   result[QUAD_TOP_LEFT] = topLeft;
+   result[QUAD_TOP_RIGHT] = topLeft + dzdx;
+   result[QUAD_BOTTOM_LEFT] = topLeft + dzdy;
+   result[QUAD_BOTTOM_RIGHT] = topLeft + dzdx + dzdy;
+#else
+   /* XXX vectorize */
+   const vector float topLeftv = spu_splats(topLeft);
+   const vector float derivs
+      = (vector float) { 0.0, dzdx, dzdy, dzdx + dzdy };
+   vector float *res = (vector float *) result;
+   *res = spu_add(topLeftv, derivs);
+#endif
 }
 
 
-static INLINE uint
-pack_color(const float color[4])
+static INLINE void
+pack_colors(uint uicolors[4], const float4 fcolors[4])
 {
+   /* XXX grab the code for _pack_rgba8() and use the shuffle
+    * command to do the swizzling seen here.
+    */
    switch (spu.fb.color_format) {
    case PIPE_FORMAT_A8R8G8B8_UNORM:
-      return _pack_rgba8(color[3], color[0], color[1], color[2]);
+      uicolors[0] = _pack_rgba8(fcolors[0].f[3], fcolors[0].f[0], fcolors[0].f[1], fcolors[0].f[2]);
+      uicolors[1] = _pack_rgba8(fcolors[1].f[3], fcolors[1].f[0], fcolors[1].f[1], fcolors[1].f[2]);
+      uicolors[2] = _pack_rgba8(fcolors[2].f[3], fcolors[2].f[0], fcolors[2].f[1], fcolors[2].f[2]);
+      uicolors[3] = _pack_rgba8(fcolors[3].f[3], fcolors[3].f[0], fcolors[3].f[1], fcolors[3].f[2]);
+      break;
    case PIPE_FORMAT_B8G8R8A8_UNORM:
-      return _pack_rgba8(color[2], color[1], color[0], color[3]);
+      uicolors[0] = _pack_rgba8(fcolors[0].f[2], fcolors[0].f[1], fcolors[0].f[0], fcolors[0].f[3]);
+      uicolors[1] = _pack_rgba8(fcolors[1].f[2], fcolors[1].f[1], fcolors[1].f[0], fcolors[1].f[3]);
+      uicolors[2] = _pack_rgba8(fcolors[2].f[2], fcolors[2].f[1], fcolors[2].f[0], fcolors[2].f[3]);
+      uicolors[3] = _pack_rgba8(fcolors[3].f[2], fcolors[3].f[1], fcolors[3].f[0], fcolors[3].f[3]);
+      break;
    default:
       ASSERT(0);
-      return 0;
    }
 }
 
@@ -379,7 +392,7 @@ emit_quad( int x, int y, unsigned mask )
    uint colors[4];  /* indexed by QUAD_x */
 
    if (spu.texture.start) {
-      float texcoords[4][4];
+      float4 texcoords[4];
       uint i;
       eval_coeff(2, (float) x, (float) y, texcoords);
       for (i = 0; i < 4; i++) {
@@ -387,12 +400,9 @@ emit_quad( int x, int y, unsigned mask )
       }
    }
    else {
-      float fcolors[4][4];
+      float4 fcolors[4];
       eval_coeff(1, (float) x, (float) y, fcolors);
-      colors[QUAD_TOP_LEFT] = pack_color(fcolors[QUAD_TOP_LEFT]);
-      colors[QUAD_TOP_RIGHT] = pack_color(fcolors[QUAD_TOP_RIGHT]);
-      colors[QUAD_BOTTOM_LEFT] = pack_color(fcolors[QUAD_BOTTOM_LEFT]);
-      colors[QUAD_BOTTOM_RIGHT] = pack_color(fcolors[QUAD_BOTTOM_RIGHT]);
+      pack_colors(colors, fcolors);
    }
 
    if (spu.depth_stencil.depth.enabled) {
@@ -645,12 +655,12 @@ static void const_coeff(uint slot)
    ASSERT(slot < PIPE_MAX_SHADER_INPUTS);
 
    for (i = 0; i < 4; i++) {
-      setup.coef[slot].dadx[i] = 0;
-      setup.coef[slot].dady[i] = 0;
+      setup.coef[slot].dadx.f[i] = 0;
+      setup.coef[slot].dady.f[i] = 0;
 
       /* need provoking vertex info!
        */
-      setup.coef[slot].a0[i] = setup.vprovoke->data[slot][i];
+      setup.coef[slot].a0.f[i] = setup.vprovoke->data[slot][i];
    }
 }
 
@@ -670,8 +680,8 @@ static void tri_linear_coeff( uint slot, uint firstComp, uint lastComp )
    
       ASSERT(slot < PIPE_MAX_SHADER_INPUTS);
 
-      setup.coef[slot].dadx[i] = a * setup.oneoverarea;
-      setup.coef[slot].dady[i] = b * setup.oneoverarea;
+      setup.coef[slot].dadx.f[i] = a * setup.oneoverarea;
+      setup.coef[slot].dady.f[i] = b * setup.oneoverarea;
 
       /* calculate a0 as the value which would be sampled for the
        * fragment at (0,0), taking into account that we want to sample at
@@ -685,17 +695,17 @@ static void tri_linear_coeff( uint slot, uint firstComp, uint lastComp )
        * to define a0 as the sample at a pixel center somewhere near vmin
        * instead - i'll switch to this later.
        */
-      setup.coef[slot].a0[i] = (setup.vmin->data[slot][i] - 
-                                 (setup.coef[slot].dadx[i] * (setup.vmin->data[0][0] - 0.5f) + 
-                                  setup.coef[slot].dady[i] * (setup.vmin->data[0][1] - 0.5f)));
+      setup.coef[slot].a0.f[i] = (setup.vmin->data[slot][i] - 
+                                 (setup.coef[slot].dadx.f[i] * (setup.vmin->data[0][0] - 0.5f) + 
+                                  setup.coef[slot].dady.f[i] * (setup.vmin->data[0][1] - 0.5f)));
    }
 
    /*
    _mesa_printf("attr[%d].%c: %f dx:%f dy:%f\n",
 		slot, "xyzw"[i], 
 		setup.coef[slot].a0[i],
-		setup.coef[slot].dadx[i],
-		setup.coef[slot].dady[i]);
+		setup.coef[slot].dadx.f[i],
+		setup.coef[slot].dady.f[i]);
    */
 }
 
@@ -734,11 +744,11 @@ static void tri_persp_coeff( unsigned slot,
    assert(slot < PIPE_MAX_SHADER_INPUTS);
    assert(i <= 3);
 
-   setup.coef[slot].dadx[i] = a * setup.oneoverarea;
-   setup.coef[slot].dady[i] = b * setup.oneoverarea;
-   setup.coef[slot].a0[i] = (mina - 
-			    (setup.coef[slot].dadx[i] * (setup.vmin->data[0][0] - 0.5f) + 
-			     setup.coef[slot].dady[i] * (setup.vmin->data[0][1] - 0.5f)));
+   setup.coef[slot].dadx.f[i] = a * setup.oneoverarea;
+   setup.coef[slot].dady.f[i] = b * setup.oneoverarea;
+   setup.coef[slot].a0.f[i] = (mina - 
+			    (setup.coef[slot].dadx.f[i] * (setup.vmin->data[0][0] - 0.5f) + 
+			     setup.coef[slot].dady.f[i] * (setup.vmin->data[0][1] - 0.5f)));
 }
 #endif
 
-- 
cgit v1.2.3


From 8fb73a59939ac9ec1e41abf89a4a8c8dde09b8df Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Wed, 30 Jan 2008 20:40:26 -0700
Subject: Cell: prototype SIMD code for z testing

---
 src/mesa/pipe/cell/spu/spu_tile.h |  10 +++
 src/mesa/pipe/cell/spu/spu_tri.c  | 147 +++++++++++++++++++++++++++++---------
 2 files changed, 123 insertions(+), 34 deletions(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/spu/spu_tile.h b/src/mesa/pipe/cell/spu/spu_tile.h
index f83dc009c2..18d1b3c117 100644
--- a/src/mesa/pipe/cell/spu/spu_tile.h
+++ b/src/mesa/pipe/cell/spu/spu_tile.h
@@ -42,6 +42,7 @@
 typedef union {
    ushort t16[TILE_SIZE][TILE_SIZE];
    uint   t32[TILE_SIZE][TILE_SIZE];
+   float4 f4[TILE_SIZE/2][TILE_SIZE/2];
 } tile_t;
 
 
@@ -83,9 +84,18 @@ clear_z_tile(tile_t *ztile)
                TILE_SIZE * TILE_SIZE);
    }
    else {
+      ASSERT(spu.fb.depth_format == PIPE_FORMAT_Z32_UNORM);
+#if SIMD_Z
+      union fi z;
+      z.f = 1.0;
+      memset32((uint*) ztile->t32,
+               z.i,/*spu.fb.depth_clear_value,*/
+               TILE_SIZE * TILE_SIZE);
+#else
       memset32((uint*) ztile->t32,
                spu.fb.depth_clear_value,
                TILE_SIZE * TILE_SIZE);
+#endif
    }
 }
 
diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index 4fc6d90895..e436e153ec 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -40,6 +40,19 @@
 #include "spu_tri.h"
 
 
+/*
+ * If SIMD_Z=1 the Z buffer is floating point and we use vector instructions
+ * to do Z testing/updating.
+ */
+#define SIMD_Z 0
+
+#if SIMD_Z
+typedef vector unsigned int mask_t;
+#else
+typedef uint mask_t;
+#endif
+
+
 /**
  * Simplified types taken from other parts of Gallium
  */
@@ -231,26 +244,16 @@ eval_coeff(uint slot, float x, float y, float4 result[4])
 }
 
 
-static INLINE void
-eval_z(float x, float y, float result[4])
+static INLINE vector float
+eval_z(float x, float y)
 {
    const uint slot = 0;
    const float dzdx = setup.coef[slot].dadx.f[2];
    const float dzdy = setup.coef[slot].dady.f[2];
    const float topLeft = setup.coef[slot].a0.f[2] + x * dzdx + y * dzdy;
-#if 1
-   result[QUAD_TOP_LEFT] = topLeft;
-   result[QUAD_TOP_RIGHT] = topLeft + dzdx;
-   result[QUAD_BOTTOM_LEFT] = topLeft + dzdy;
-   result[QUAD_BOTTOM_RIGHT] = topLeft + dzdx + dzdy;
-#else
-   /* XXX vectorize */
    const vector float topLeftv = spu_splats(topLeft);
-   const vector float derivs
-      = (vector float) { 0.0, dzdx, dzdy, dzdx + dzdy };
-   vector float *res = (vector float *) result;
-   *res = spu_add(topLeftv, derivs);
-#endif
+   const vector float derivs = (vector float) { 0.0, dzdx, dzdy, dzdx + dzdy };
+   return spu_add(topLeftv, derivs);
 }
 
 
@@ -279,14 +282,22 @@ pack_colors(uint uicolors[4], const float4 fcolors[4])
 }
 
 
-static uint
-do_depth_test(int x, int y, unsigned mask)
+
+static unsigned int
+do_depth_test(int x, int y, unsigned int mask)
 {
+   static const float4 zscale16
+      = {.f={65535.0, 65535.0, 65535.0, 65535.0}};
+   static const float4 zscale32
+      = {.f={(float)0xffffffff,
+             (float)0xffffffff,
+             (float)0xffffffff,
+             (float)0xffffffff}};
    int ix = x - setup.cliprect_minx;
    int iy = y - setup.cliprect_miny;
-   float zvals[4];
+   float4 zvals;
 
-   eval_z((float) x, (float) y, zvals);
+   zvals.v = eval_z((float) x, (float) y);
 
    if (tile_status_z[setup.ty][setup.tx] == TILE_STATUS_CLEAR) {
       /* now, _really_ clear the tile */
@@ -300,9 +311,9 @@ do_depth_test(int x, int y, unsigned mask)
 
 
    if (spu.fb.depth_format == PIPE_FORMAT_Z16_UNORM) {
-      const float zscale = 65535.0;
+      zvals.v = spu_mul(zvals.v, zscale16.v);
       if (mask & MASK_TOP_LEFT) {
-         uint z = (uint) (zvals[0] * zscale);
+         uint z = (uint) zvals.f[0];
          if (z < ztile.t16[iy][ix])
             ztile.t16[iy][ix] = z;
          else
@@ -310,7 +321,7 @@ do_depth_test(int x, int y, unsigned mask)
       }
 
       if (mask & MASK_TOP_RIGHT) {
-         uint z = (uint) (zvals[1] * zscale);
+         uint z = (uint) zvals.f[1];
          if (z < ztile.t16[iy][ix+1])
             ztile.t16[iy][ix+1] = z;
          else
@@ -318,7 +329,7 @@ do_depth_test(int x, int y, unsigned mask)
       }
 
       if (mask & MASK_BOTTOM_LEFT) {
-         uint z = (uint) (zvals[2] * zscale);
+         uint z = (uint) zvals.f[2];
          if (z < ztile.t16[iy+1][ix])
             ztile.t16[iy+1][ix] = z;
          else
@@ -326,7 +337,7 @@ do_depth_test(int x, int y, unsigned mask)
       }
 
       if (mask & MASK_BOTTOM_RIGHT) {
-         uint z = (uint) (zvals[3] * zscale);
+         uint z = (uint) zvals.f[3];
          if (z < ztile.t16[iy+1][ix+1])
             ztile.t16[iy+1][ix+1] = z;
          else
@@ -334,10 +345,10 @@ do_depth_test(int x, int y, unsigned mask)
       }
    }
    else {
-      const float zscale = (float) 0xffffffff;
+      zvals.v = spu_mul(zvals.v, zscale32.v);
       ASSERT(spu.fb.depth_format == PIPE_FORMAT_Z32_UNORM);
       if (mask & MASK_TOP_LEFT) {
-         uint z = (uint) (zvals[0] * zscale);
+         uint z = (uint) zvals.f[0];
          if (z < ztile.t32[iy][ix])
             ztile.t32[iy][ix] = z;
          else
@@ -345,7 +356,7 @@ do_depth_test(int x, int y, unsigned mask)
       }
 
       if (mask & MASK_TOP_RIGHT) {
-         uint z = (uint) (zvals[1] * zscale);
+         uint z = (uint) zvals.f[1];
          if (z < ztile.t32[iy][ix+1])
             ztile.t32[iy][ix+1] = z;
          else
@@ -353,7 +364,7 @@ do_depth_test(int x, int y, unsigned mask)
       }
 
       if (mask & MASK_BOTTOM_LEFT) {
-         uint z = (uint) (zvals[2] * zscale);
+         uint z = (uint) zvals.f[2];
          if (z < ztile.t32[iy+1][ix])
             ztile.t32[iy+1][ix] = z;
          else
@@ -361,7 +372,7 @@ do_depth_test(int x, int y, unsigned mask)
       }
 
       if (mask & MASK_BOTTOM_RIGHT) {
-         uint z = (uint) (zvals[3] * zscale);
+         uint z = (uint) zvals.f[3];
          if (z < ztile.t32[iy+1][ix+1])
             ztile.t32[iy+1][ix+1] = z;
          else
@@ -373,11 +384,45 @@ do_depth_test(int x, int y, unsigned mask)
 }
 
 
+
+
+static vector unsigned int
+do_depth_test_simd(int x, int y, vector unsigned int quadmask)
+{
+   int ix = (x - setup.cliprect_minx) / 2;
+   int iy = (y - setup.cliprect_miny) / 2;
+   float4 zvals;
+
+   vector unsigned int zmask;
+
+   zvals.v = eval_z((float) x, (float) y);
+
+   if (tile_status_z[setup.ty][setup.tx] == TILE_STATUS_CLEAR) {
+      /* now, _really_ clear the tile */
+      clear_z_tile(&ztile);
+   }
+   else if (tile_status_z[setup.ty][setup.tx] != TILE_STATUS_DIRTY) {
+      /* make sure we've got the tile from main mem */
+      wait_on_mask(1 << TAG_READ_TILE_Z);
+   }
+   tile_status_z[setup.ty][setup.tx] = TILE_STATUS_DIRTY;
+
+   /* XXX fetch Z value sooner to hide latency here */
+   zmask = spu_cmpgt(ztile.f4[ix][iy].v, zvals.v);
+   zmask = spu_and(zmask, quadmask);
+
+   ztile.f4[ix][iy].v = spu_sel(ztile.f4[ix][iy].v, zvals.v, zmask);
+   //ztile.f4[ix][iy].v = spu_sel(zvals.v, ztile.f4[ix][iy].v, mask4);
+
+   return zmask;
+}
+
+
 /**
  * Emit a quad (pass to next stage).  No clipping is done.
  */
 static INLINE void
-emit_quad( int x, int y, unsigned mask )
+emit_quad( int x, int y, mask_t mask )
 {
 #if 0
    struct softpipe_context *sp = setup.softpipe;
@@ -406,10 +451,17 @@ emit_quad( int x, int y, unsigned mask )
    }
 
    if (spu.depth_stencil.depth.enabled) {
-      mask &= do_depth_test(x, y, mask);
+#if SIMD_Z
+      mask = do_depth_test_simd(x, y, mask);
+#else
+      mask = do_depth_test(x, y, mask);
+#endif
    }
 
-   if (mask) {
+#if !SIMD_Z
+   if (mask)
+#endif
+   {
       if (tile_status[setup.ty][setup.tx] == TILE_STATUS_CLEAR) {
          /* now, _really_ clear the tile */
          clear_c_tile(&ctile);
@@ -420,6 +472,21 @@ emit_quad( int x, int y, unsigned mask )
       }
       tile_status[setup.ty][setup.tx] = TILE_STATUS_DIRTY;
 
+#if SIMD_Z
+      if (spu_extract(mask, 0))
+         ctile.t32[iy][ix] = colors[QUAD_TOP_LEFT];
+      if (spu_extract(mask, 1))
+         ctile.t32[iy][ix+1] = colors[QUAD_TOP_RIGHT];
+      if (spu_extract(mask, 2))
+         ctile.t32[iy+1][ix] = colors[QUAD_BOTTOM_LEFT];
+      if (spu_extract(mask, 3))
+         ctile.t32[iy+1][ix+1] = colors[QUAD_BOTTOM_RIGHT];
+#elif 0
+      /* SIMD_Z with swizzled color buffer (someday) */
+      vector float icolors = *((vector float *) &colors);
+      ctile.f4[iy/2][ix/2].v = spu_sel(ctile.f4[iy/2][ix/2].v, icolors, mask);
+
+#else
       if (mask & MASK_TOP_LEFT)
          ctile.t32[iy][ix] = colors[QUAD_TOP_LEFT];
       if (mask & MASK_TOP_RIGHT)
@@ -428,7 +495,9 @@ emit_quad( int x, int y, unsigned mask )
          ctile.t32[iy+1][ix] = colors[QUAD_BOTTOM_LEFT];
       if (mask & MASK_BOTTOM_RIGHT)
          ctile.t32[iy+1][ix+1] = colors[QUAD_BOTTOM_RIGHT];
+#endif
    }
+
 #endif
 }
 
@@ -450,8 +519,18 @@ static INLINE int block( int x )
  * this is pretty nasty...  may need to rework flush_spans again to
  * fix it, if possible.
  */
-static unsigned calculate_mask( int x )
+static mask_t calculate_mask( int x )
 {
+#if SIMD_Z
+   uint m0, m1, m2, m3;
+
+   m0 = (x >= setup.span.left[0] && x < setup.span.right[0]) * ~0;
+   m1 = (x+1 >= setup.span.left[0] && x+1 < setup.span.right[0]) * ~0;
+   m2 = (x >= setup.span.left[1] && x < setup.span.right[1]) * ~0;
+   m3 = (x+1 >= setup.span.left[1] && x+1 < setup.span.right[1]) * ~0;
+
+   return (vector unsigned int) {m0, m1, m2, m3};
+#else
    unsigned mask = 0x0;
 
    if (x >= setup.span.left[0] && x < setup.span.right[0]) 
@@ -467,6 +546,7 @@ static unsigned calculate_mask( int x )
       mask |= MASK_BOTTOM_RIGHT;
 
    return mask;
+#endif
 }
 
 
@@ -505,8 +585,7 @@ static void flush_spans( void )
     * calculate_mask() could be simplified a bit...
     */
    for (x = block(minleft); x <= block(maxright); x += 2) {
-      emit_quad( x, setup.span.y, 
-                 calculate_mask( x ) );
+      emit_quad( x, setup.span.y, calculate_mask( x ) );
    }
 
    setup.span.y = 0;
-- 
cgit v1.2.3


From 524bba17a75cee597f588da9c19f25d758aa237b Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Tue, 29 Jan 2008 10:37:18 -0800
Subject: Initial pass at vertex shader on SPU using TGSI VM

All of the code is wired in on the SPU side, but it is not called from
the PPU yet.  Instruction / declaration fetch still needs to be
implemented in spu_exec.c.
---
 src/mesa/pipe/cell/common.h                |   38 +
 src/mesa/pipe/cell/spu/Makefile            |    6 +-
 src/mesa/pipe/cell/spu/spu_exec.c          | 2355 ++++++++++++++++++++++++++++
 src/mesa/pipe/cell/spu/spu_exec.h          |  171 ++
 src/mesa/pipe/cell/spu/spu_main.c          |   28 +
 src/mesa/pipe/cell/spu/spu_util.c          |  165 ++
 src/mesa/pipe/cell/spu/spu_vertex_fetch.c  |  493 ++++++
 src/mesa/pipe/cell/spu/spu_vertex_shader.c |  224 +++
 src/mesa/pipe/cell/spu/spu_vertex_shader.h |   61 +
 9 files changed, 3540 insertions(+), 1 deletion(-)
 create mode 100644 src/mesa/pipe/cell/spu/spu_exec.c
 create mode 100644 src/mesa/pipe/cell/spu/spu_exec.h
 create mode 100644 src/mesa/pipe/cell/spu/spu_util.c
 create mode 100644 src/mesa/pipe/cell/spu/spu_vertex_fetch.c
 create mode 100644 src/mesa/pipe/cell/spu/spu_vertex_shader.c
 create mode 100644 src/mesa/pipe/cell/spu/spu_vertex_shader.h

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/common.h b/src/mesa/pipe/cell/common.h
index d5e86863d4..80a1425ec7 100644
--- a/src/mesa/pipe/cell/common.h
+++ b/src/mesa/pipe/cell/common.h
@@ -83,6 +83,9 @@
 #define CELL_CMD_STATE_SAMPLER       12
 #define CELL_CMD_STATE_TEXTURE       13
 #define CELL_CMD_STATE_VERTEX_INFO   14
+#define CELL_CMD_STATE_VIEWPORT      15
+#define CELL_CMD_STATE_VS_ARRAY_INFO 16
+#define CELL_CMD_VS_EXECUTE          17
 
 
 #define CELL_NUM_BUFFERS 4
@@ -116,6 +119,41 @@ struct cell_command_clear_surface
 } ALIGN16_ATTRIB;
 
 
+/**
+ * Array info used by the vertex shader's vertex puller.
+ */
+struct cell_array_info
+{
+    void *base;               /**< Base address of the 0th element. */
+    uint attr;                /**< Attribute that this state is for. */
+    uint pitch;               /**< Byte pitch from one entry to the next. */
+    enum pipe_format format;  /**< Pipe format of each entry. */
+} ALIGN16_ATTRIB;
+
+
+struct cell_shader_info
+{
+   unsigned processor;
+   unsigned num_outputs;
+
+   void *declarations;
+   unsigned num_declarations;
+   void *instructions;
+   unsigned num_instructions;
+   void *uniforms;
+} ALIGN16_ATTRIB;
+
+
+struct cell_command_vs
+{
+   struct cell_shader_info   shader;
+   void *elts;
+   unsigned num_elts;
+   unsigned bytes_per_elt;
+   void *vOut;
+} ALIGN16_ATTRIB;
+
+
 struct cell_command_render
 {
    uint opcode;       /**< CELL_CMD_RENDER */
diff --git a/src/mesa/pipe/cell/spu/Makefile b/src/mesa/pipe/cell/spu/Makefile
index d5b30e1f27..2d031bfbc6 100644
--- a/src/mesa/pipe/cell/spu/Makefile
+++ b/src/mesa/pipe/cell/spu/Makefile
@@ -20,7 +20,11 @@ SOURCES = \
 	spu_render.c \
 	spu_texture.c \
 	spu_tile.c \
-	spu_tri.c
+	spu_tri.c \
+	spu_exec.c \
+	spu_util.c \
+	spu_vertex_fetch.c \
+	spu_vertex_shader.c
 
 SPU_OBJECTS = $(SOURCES:.c=.o) \
 
diff --git a/src/mesa/pipe/cell/spu/spu_exec.c b/src/mesa/pipe/cell/spu/spu_exec.c
new file mode 100644
index 0000000000..6888e97caf
--- /dev/null
+++ b/src/mesa/pipe/cell/spu/spu_exec.c
@@ -0,0 +1,2355 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * TGSI interpreter/executor.
+ *
+ * Flow control information:
+ *
+ * Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
+ * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
+ * care since a condition may be true for some quad components but false
+ * for other components.
+ *
+ * We basically execute all statements (even if they're in the part of
+ * an IF/ELSE clause that's "not taken") and use a special mask to
+ * control writing to destination registers.  This is the ExecMask.
+ * See store_dest().
+ *
+ * The ExecMask is computed from four other masks (CondMask, LoopMask,
+ * ContMask and FuncMask; see UPDATE_EXEC_MASK).  The first three are
+ * controlled by flow control instructions (IF/ELSE/ENDIF, LOOP/ENDLOOP, CONT).
+ *
+ *
+ * Authors:
+ *   Michal Krol
+ *   Brian Paul
+ */
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_state.h"
+#include "pipe/p_util.h"
+#include "pipe/p_shader_tokens.h"
+#include "pipe/tgsi/util/tgsi_parse.h"
+#include "pipe/tgsi/util/tgsi_util.h"
+#include "spu_exec.h"
+
+#define TILE_TOP_LEFT     0
+#define TILE_TOP_RIGHT    1
+#define TILE_BOTTOM_LEFT  2
+#define TILE_BOTTOM_RIGHT 3
+
+/*
+ * Shorthand locations of various utility registers (_I = Index, _C = Channel)
+ */
+#define TEMP_0_I           TGSI_EXEC_TEMP_00000000_I
+#define TEMP_0_C           TGSI_EXEC_TEMP_00000000_C
+#define TEMP_7F_I          TGSI_EXEC_TEMP_7FFFFFFF_I
+#define TEMP_7F_C          TGSI_EXEC_TEMP_7FFFFFFF_C
+#define TEMP_80_I          TGSI_EXEC_TEMP_80000000_I
+#define TEMP_80_C          TGSI_EXEC_TEMP_80000000_C
+#define TEMP_FF_I          TGSI_EXEC_TEMP_FFFFFFFF_I
+#define TEMP_FF_C          TGSI_EXEC_TEMP_FFFFFFFF_C
+#define TEMP_1_I           TGSI_EXEC_TEMP_ONE_I
+#define TEMP_1_C           TGSI_EXEC_TEMP_ONE_C
+#define TEMP_2_I           TGSI_EXEC_TEMP_TWO_I
+#define TEMP_2_C           TGSI_EXEC_TEMP_TWO_C
+#define TEMP_128_I         TGSI_EXEC_TEMP_128_I
+#define TEMP_128_C         TGSI_EXEC_TEMP_128_C
+#define TEMP_M128_I        TGSI_EXEC_TEMP_MINUS_128_I
+#define TEMP_M128_C        TGSI_EXEC_TEMP_MINUS_128_C
+#define TEMP_KILMASK_I     TGSI_EXEC_TEMP_KILMASK_I
+#define TEMP_KILMASK_C     TGSI_EXEC_TEMP_KILMASK_C
+#define TEMP_OUTPUT_I      TGSI_EXEC_TEMP_OUTPUT_I
+#define TEMP_OUTPUT_C      TGSI_EXEC_TEMP_OUTPUT_C
+#define TEMP_PRIMITIVE_I   TGSI_EXEC_TEMP_PRIMITIVE_I
+#define TEMP_PRIMITIVE_C   TGSI_EXEC_TEMP_PRIMITIVE_C
+#define TEMP_R0            TGSI_EXEC_TEMP_R0
+
+#define FOR_EACH_CHANNEL(CHAN)\
+   for (CHAN = 0; CHAN < 4; CHAN++)
+
+#define IS_CHANNEL_ENABLED(INST, CHAN)\
+   ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))
+
+#define IS_CHANNEL_ENABLED2(INST, CHAN)\
+   ((INST).FullDstRegisters[1].DstRegister.WriteMask & (1 << (CHAN)))
+
+#define FOR_EACH_ENABLED_CHANNEL(INST, CHAN)\
+   FOR_EACH_CHANNEL( CHAN )\
+      if (IS_CHANNEL_ENABLED( INST, CHAN ))
+
+#define FOR_EACH_ENABLED_CHANNEL2(INST, CHAN)\
+   FOR_EACH_CHANNEL( CHAN )\
+      if (IS_CHANNEL_ENABLED2( INST, CHAN ))
+
+
+/** The execution mask is the AND of the cond, loop, cont and func masks */
+#define UPDATE_EXEC_MASK(MACH) \
+      MACH->ExecMask = MACH->CondMask & MACH->LoopMask & MACH->ContMask & MACH->FuncMask
+
+
+#define CHAN_X  0
+#define CHAN_Y  1
+#define CHAN_Z  2
+#define CHAN_W  3
+
+
+
+/**
+ * Initialize machine state: record the samplers and processor type, point
+ * Addrs at Temps[TGSI_EXEC_NUM_TEMPS], and fill in the constant temporaries.
+ * After this, we can call spu_exec_machine_run() many times.
+ */
+void
+spu_exec_machine_init(struct spu_exec_machine *mach,
+                      uint numSamplers,
+                      struct spu_sampler *samplers,
+                      unsigned processor)
+{
+   uint i;
+
+   mach->Samplers = samplers;
+   mach->Processor = processor;
+   mach->Addrs = &mach->Temps[TGSI_EXEC_NUM_TEMPS];
+
+   /* Setup constants. */
+   for( i = 0; i < 4; i++ ) {
+      mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].u[i] = 0x00000000;
+      mach->Temps[TEMP_7F_I].xyzw[TEMP_7F_C].u[i] = 0x7FFFFFFF;
+      mach->Temps[TEMP_80_I].xyzw[TEMP_80_C].u[i] = 0x80000000;
+      mach->Temps[TEMP_FF_I].xyzw[TEMP_FF_C].u[i] = 0xFFFFFFFF;
+      mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].f[i] = 1.0f;
+      mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].f[i] = 2.0f;
+      mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].f[i] = 128.0f;
+      mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].f[i] = -128.0f;
+   }
+}
+
+
+static void
+micro_abs(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src )
+{
+   dst->f[0] = (float) fabs( (double) src->f[0] );
+   dst->f[1] = (float) fabs( (double) src->f[1] );
+   dst->f[2] = (float) fabs( (double) src->f[2] );
+   dst->f[3] = (float) fabs( (double) src->f[3] );
+}
+
+static void
+micro_add(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1 )
+{
+   dst->f[0] = src0->f[0] + src1->f[0];
+   dst->f[1] = src0->f[1] + src1->f[1];
+   dst->f[2] = src0->f[2] + src1->f[2];
+   dst->f[3] = src0->f[3] + src1->f[3];
+}
+
+static void
+micro_iadd(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1 )
+{
+   dst->i[0] = src0->i[0] + src1->i[0];
+   dst->i[1] = src0->i[1] + src1->i[1];
+   dst->i[2] = src0->i[2] + src1->i[2];
+   dst->i[3] = src0->i[3] + src1->i[3];
+}
+
+static void
+micro_and(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1 )
+{
+   dst->u[0] = src0->u[0] & src1->u[0];
+   dst->u[1] = src0->u[1] & src1->u[1];
+   dst->u[2] = src0->u[2] & src1->u[2];
+   dst->u[3] = src0->u[3] & src1->u[3];
+}
+
+static void
+micro_ceil(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src )
+{
+#if 0
+   dst->f[0] = (float) ceil( (double) src->f[0] );
+   dst->f[1] = (float) ceil( (double) src->f[1] );
+   dst->f[2] = (float) ceil( (double) src->f[2] );
+   dst->f[3] = (float) ceil( (double) src->f[3] );
+#endif
+}
+
+static void
+micro_cos(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src )
+{
+#if 0
+   dst->f[0] = (float) cos( (double) src->f[0] );
+   dst->f[1] = (float) cos( (double) src->f[1] );
+   dst->f[2] = (float) cos( (double) src->f[2] );
+   dst->f[3] = (float) cos( (double) src->f[3] );
+#endif
+}
+
+static void
+micro_ddx(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src )
+{
+   dst->f[0] =
+   dst->f[1] =
+   dst->f[2] =
+   dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
+}
+
+static void
+micro_ddy(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src )
+{
+   dst->f[0] =
+   dst->f[1] =
+   dst->f[2] =
+   dst->f[3] = src->f[TILE_TOP_LEFT] - src->f[TILE_BOTTOM_LEFT];
+}
+
+static void
+micro_div(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1 )
+{
+   dst->f[0] = src0->f[0] / src1->f[0];
+   dst->f[1] = src0->f[1] / src1->f[1];
+   dst->f[2] = src0->f[2] / src1->f[2];
+   dst->f[3] = src0->f[3] / src1->f[3];
+}
+
+static void
+micro_udiv(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1 )
+{
+   dst->u[0] = src0->u[0] / src1->u[0];
+   dst->u[1] = src0->u[1] / src1->u[1];
+   dst->u[2] = src0->u[2] / src1->u[2];
+   dst->u[3] = src0->u[3] / src1->u[3];
+}
+
+static void
+micro_eq(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1,
+   const union spu_exec_channel *src2,
+   const union spu_exec_channel *src3 )
+{
+   dst->f[0] = src0->f[0] == src1->f[0] ? src2->f[0] : src3->f[0];
+   dst->f[1] = src0->f[1] == src1->f[1] ? src2->f[1] : src3->f[1];
+   dst->f[2] = src0->f[2] == src1->f[2] ? src2->f[2] : src3->f[2];
+   dst->f[3] = src0->f[3] == src1->f[3] ? src2->f[3] : src3->f[3];
+}
+
+static void
+micro_ieq(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1,
+   const union spu_exec_channel *src2,
+   const union spu_exec_channel *src3 )
+{
+   dst->i[0] = src0->i[0] == src1->i[0] ? src2->i[0] : src3->i[0];
+   dst->i[1] = src0->i[1] == src1->i[1] ? src2->i[1] : src3->i[1];
+   dst->i[2] = src0->i[2] == src1->i[2] ? src2->i[2] : src3->i[2];
+   dst->i[3] = src0->i[3] == src1->i[3] ? src2->i[3] : src3->i[3];
+}
+
+static void
+micro_exp2(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src)
+{
+#if 0
+   dst->f[0] = (float) pow( 2.0, (double) src->f[0] );
+   dst->f[1] = (float) pow( 2.0, (double) src->f[1] );
+   dst->f[2] = (float) pow( 2.0, (double) src->f[2] );
+   dst->f[3] = (float) pow( 2.0, (double) src->f[3] );
+#endif
+}
+
+static void
+micro_f2it(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src )
+{
+   dst->i[0] = (int) src->f[0];
+   dst->i[1] = (int) src->f[1];
+   dst->i[2] = (int) src->f[2];
+   dst->i[3] = (int) src->f[3];
+}
+
+static void
+micro_f2ut(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src )
+{
+   dst->u[0] = (uint) src->f[0];
+   dst->u[1] = (uint) src->f[1];
+   dst->u[2] = (uint) src->f[2];
+   dst->u[3] = (uint) src->f[3];
+}
+
+static void
+micro_flr(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src )
+{
+#if 0
+   dst->f[0] = (float) floor( (double) src->f[0] );
+   dst->f[1] = (float) floor( (double) src->f[1] );
+   dst->f[2] = (float) floor( (double) src->f[2] );
+   dst->f[3] = (float) floor( (double) src->f[3] );
+#endif
+}
+
+static void
+micro_frc(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src )
+{
+#if 0
+   dst->f[0] = src->f[0] - (float) floor( (double) src->f[0] );
+   dst->f[1] = src->f[1] - (float) floor( (double) src->f[1] );
+   dst->f[2] = src->f[2] - (float) floor( (double) src->f[2] );
+   dst->f[3] = src->f[3] - (float) floor( (double) src->f[3] );
+#endif
+}
+
+static void
+micro_ge(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1,
+   const union spu_exec_channel *src2,
+   const union spu_exec_channel *src3 )
+{
+   dst->f[0] = src0->f[0] >= src1->f[0] ? src2->f[0] : src3->f[0];
+   dst->f[1] = src0->f[1] >= src1->f[1] ? src2->f[1] : src3->f[1];
+   dst->f[2] = src0->f[2] >= src1->f[2] ? src2->f[2] : src3->f[2];
+   dst->f[3] = src0->f[3] >= src1->f[3] ? src2->f[3] : src3->f[3];
+}
+
+static void
+micro_i2f(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src )
+{
+   dst->f[0] = (float) src->i[0];
+   dst->f[1] = (float) src->i[1];
+   dst->f[2] = (float) src->i[2];
+   dst->f[3] = (float) src->i[3];
+}
+
+static void
+micro_lg2(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src )
+{
+#if 0
+   dst->f[0] = (float) log( (double) src->f[0] ) * 1.442695f;
+   dst->f[1] = (float) log( (double) src->f[1] ) * 1.442695f;
+   dst->f[2] = (float) log( (double) src->f[2] ) * 1.442695f;
+   dst->f[3] = (float) log( (double) src->f[3] ) * 1.442695f;
+#endif
+}
+
+static void
+micro_lt(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1,
+   const union spu_exec_channel *src2,
+   const union spu_exec_channel *src3 )
+{
+   dst->f[0] = src0->f[0] < src1->f[0] ? src2->f[0] : src3->f[0];
+   dst->f[1] = src0->f[1] < src1->f[1] ? src2->f[1] : src3->f[1];
+   dst->f[2] = src0->f[2] < src1->f[2] ? src2->f[2] : src3->f[2];
+   dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3];
+}
+
+static void
+micro_ilt(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1,
+   const union spu_exec_channel *src2,
+   const union spu_exec_channel *src3 )
+{
+   dst->i[0] = src0->i[0] < src1->i[0] ? src2->i[0] : src3->i[0];
+   dst->i[1] = src0->i[1] < src1->i[1] ? src2->i[1] : src3->i[1];
+   dst->i[2] = src0->i[2] < src1->i[2] ? src2->i[2] : src3->i[2];
+   dst->i[3] = src0->i[3] < src1->i[3] ? src2->i[3] : src3->i[3];
+}
+
+static void
+micro_ult(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1,
+   const union spu_exec_channel *src2,
+   const union spu_exec_channel *src3 )
+{
+   dst->u[0] = src0->u[0] < src1->u[0] ? src2->u[0] : src3->u[0];
+   dst->u[1] = src0->u[1] < src1->u[1] ? src2->u[1] : src3->u[1];
+   dst->u[2] = src0->u[2] < src1->u[2] ? src2->u[2] : src3->u[2];
+   dst->u[3] = src0->u[3] < src1->u[3] ? src2->u[3] : src3->u[3];
+}
+
+static void
+micro_max(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1 )
+{
+   dst->f[0] = src0->f[0] > src1->f[0] ? src0->f[0] : src1->f[0];
+   dst->f[1] = src0->f[1] > src1->f[1] ? src0->f[1] : src1->f[1];
+   dst->f[2] = src0->f[2] > src1->f[2] ? src0->f[2] : src1->f[2];
+   dst->f[3] = src0->f[3] > src1->f[3] ? src0->f[3] : src1->f[3];
+}
+
+static void
+micro_imax(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1 )
+{
+   dst->i[0] = src0->i[0] > src1->i[0] ? src0->i[0] : src1->i[0];
+   dst->i[1] = src0->i[1] > src1->i[1] ? src0->i[1] : src1->i[1];
+   dst->i[2] = src0->i[2] > src1->i[2] ? src0->i[2] : src1->i[2];
+   dst->i[3] = src0->i[3] > src1->i[3] ? src0->i[3] : src1->i[3];
+}
+
+static void
+micro_umax(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1 )
+{
+   dst->u[0] = src0->u[0] > src1->u[0] ? src0->u[0] : src1->u[0];
+   dst->u[1] = src0->u[1] > src1->u[1] ? src0->u[1] : src1->u[1];
+   dst->u[2] = src0->u[2] > src1->u[2] ? src0->u[2] : src1->u[2];
+   dst->u[3] = src0->u[3] > src1->u[3] ? src0->u[3] : src1->u[3];
+}
+
+static void
+micro_min(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1 )
+{
+   dst->f[0] = src0->f[0] < src1->f[0] ? src0->f[0] : src1->f[0];
+   dst->f[1] = src0->f[1] < src1->f[1] ? src0->f[1] : src1->f[1];
+   dst->f[2] = src0->f[2] < src1->f[2] ? src0->f[2] : src1->f[2];
+   dst->f[3] = src0->f[3] < src1->f[3] ? src0->f[3] : src1->f[3];
+}
+
+static void
+micro_imin(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1 )
+{
+   dst->i[0] = src0->i[0] < src1->i[0] ? src0->i[0] : src1->i[0];
+   dst->i[1] = src0->i[1] < src1->i[1] ? src0->i[1] : src1->i[1];
+   dst->i[2] = src0->i[2] < src1->i[2] ? src0->i[2] : src1->i[2];
+   dst->i[3] = src0->i[3] < src1->i[3] ? src0->i[3] : src1->i[3];
+}
+
+static void
+micro_umin(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1 )
+{
+   dst->u[0] = src0->u[0] < src1->u[0] ? src0->u[0] : src1->u[0];
+   dst->u[1] = src0->u[1] < src1->u[1] ? src0->u[1] : src1->u[1];
+   dst->u[2] = src0->u[2] < src1->u[2] ? src0->u[2] : src1->u[2];
+   dst->u[3] = src0->u[3] < src1->u[3] ? src0->u[3] : src1->u[3];
+}
+
+static void
+micro_umod(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1 )
+{
+   dst->u[0] = src0->u[0] % src1->u[0];
+   dst->u[1] = src0->u[1] % src1->u[1];
+   dst->u[2] = src0->u[2] % src1->u[2];
+   dst->u[3] = src0->u[3] % src1->u[3];
+}
+
+static void
+micro_mul(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1 )
+{
+   dst->f[0] = src0->f[0] * src1->f[0];
+   dst->f[1] = src0->f[1] * src1->f[1];
+   dst->f[2] = src0->f[2] * src1->f[2];
+   dst->f[3] = src0->f[3] * src1->f[3];
+}
+
+static void
+micro_imul(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1 )
+{
+   dst->i[0] = src0->i[0] * src1->i[0];
+   dst->i[1] = src0->i[1] * src1->i[1];
+   dst->i[2] = src0->i[2] * src1->i[2];
+   dst->i[3] = src0->i[3] * src1->i[3];
+}
+
+static void
+micro_imul64(
+   union spu_exec_channel *dst0,
+   union spu_exec_channel *dst1,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1 )
+{
+   dst1->i[0] = src0->i[0] * src1->i[0];
+   dst1->i[1] = src0->i[1] * src1->i[1];
+   dst1->i[2] = src0->i[2] * src1->i[2];
+   dst1->i[3] = src0->i[3] * src1->i[3];
+   dst0->i[0] = 0;
+   dst0->i[1] = 0;
+   dst0->i[2] = 0;
+   dst0->i[3] = 0;
+}
+
+static void
+micro_umul64(
+   union spu_exec_channel *dst0,
+   union spu_exec_channel *dst1,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1 )
+{
+   dst1->u[0] = src0->u[0] * src1->u[0];
+   dst1->u[1] = src0->u[1] * src1->u[1];
+   dst1->u[2] = src0->u[2] * src1->u[2];
+   dst1->u[3] = src0->u[3] * src1->u[3];
+   dst0->u[0] = 0;
+   dst0->u[1] = 0;
+   dst0->u[2] = 0;
+   dst0->u[3] = 0;
+}
+
+static void
+micro_movc(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1,
+   const union spu_exec_channel *src2 )
+{
+   dst->u[0] = src0->u[0] ? src1->u[0] : src2->u[0];
+   dst->u[1] = src0->u[1] ? src1->u[1] : src2->u[1];
+   dst->u[2] = src0->u[2] ? src1->u[2] : src2->u[2];
+   dst->u[3] = src0->u[3] ? src1->u[3] : src2->u[3];
+}
+
+static void
+micro_neg(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src )
+{
+   dst->f[0] = -src->f[0];
+   dst->f[1] = -src->f[1];
+   dst->f[2] = -src->f[2];
+   dst->f[3] = -src->f[3];
+}
+
+/**
+ * Per-channel signed integer negation: dst = -src for all four channels.
+ */
+static void
+micro_ineg(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src )
+{
+   uint lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      dst->i[lane] = -src->i[lane];
+   }
+}
+
+/**
+ * Per-channel bitwise complement of the unsigned view: dst = ~src.
+ */
+static void
+micro_not(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src )
+{
+   uint lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      dst->u[lane] = ~src->u[lane];
+   }
+}
+
+/**
+ * Per-channel bitwise OR of the unsigned views: dst = src0 | src1.
+ */
+static void
+micro_or(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1 )
+{
+   uint lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      dst->u[lane] = src0->u[lane] | src1->u[lane];
+   }
+}
+
+/**
+ * Per-channel power function: intended to compute src0 raised to src1.
+ *
+ * The pow()-based implementation is compiled out with "#if 0", so this
+ * function currently does nothing and leaves \p dst unmodified.
+ */
+static void
+micro_pow(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1 )
+{
+#if 0
+   dst->f[0] = (float) pow( (double) src0->f[0], (double) src1->f[0] );
+   dst->f[1] = (float) pow( (double) src0->f[1], (double) src1->f[1] );
+   dst->f[2] = (float) pow( (double) src0->f[2], (double) src1->f[2] );
+   dst->f[3] = (float) pow( (double) src0->f[3], (double) src1->f[3] );
+#endif
+}
+
+/**
+ * Per-channel rounding: intended to compute floor(src + 0.5).
+ *
+ * The floor()-based implementation is compiled out with "#if 0", so this
+ * function currently does nothing and leaves \p dst unmodified.
+ */
+static void
+micro_rnd(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src )
+{
+#if 0
+   dst->f[0] = (float) floor( (double) (src->f[0] + 0.5f) );
+   dst->f[1] = (float) floor( (double) (src->f[1] + 0.5f) );
+   dst->f[2] = (float) floor( (double) (src->f[2] + 0.5f) );
+   dst->f[3] = (float) floor( (double) (src->f[3] + 0.5f) );
+#endif
+}
+
+/**
+ * Per-channel left shift of the signed integer view:
+ * dst = src0 << src1, with the shift count taken from the same channel
+ * of \p src1.
+ */
+static void
+micro_shl(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1 )
+{
+   uint lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      dst->i[lane] = src0->i[lane] << src1->i[lane];
+   }
+}
+
+/**
+ * Per-channel right shift of the signed integer view:
+ * dst = src0 >> src1, with the shift count taken from the same channel
+ * of \p src1.
+ */
+static void
+micro_ishr(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1 )
+{
+   uint lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      dst->i[lane] = src0->i[lane] >> src1->i[lane];
+   }
+}
+
+/**
+ * Per-channel truncation: each float in \p src0 is converted to int
+ * and back to float, discarding the fractional part.
+ */
+static void
+micro_trunc(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0 )
+{
+   uint lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      dst->f[lane] = (float) (int) src0->f[lane];
+   }
+}
+
+/**
+ * Per-channel right shift of the unsigned integer view:
+ * dst = src0 >> src1, with the shift count taken from the same channel
+ * of \p src1.
+ */
+static void
+micro_ushr(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1 )
+{
+   uint lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      dst->u[lane] = src0->u[lane] >> src1->u[lane];
+   }
+}
+
+/**
+ * Per-channel sine: intended to compute sin(src).
+ *
+ * The sin()-based implementation is compiled out with "#if 0", so this
+ * function currently does nothing and leaves \p dst unmodified.
+ */
+static void
+micro_sin(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src )
+{
+#if 0
+   dst->f[0] = (float) sin( (double) src->f[0] );
+   dst->f[1] = (float) sin( (double) src->f[1] );
+   dst->f[2] = (float) sin( (double) src->f[2] );
+   dst->f[3] = (float) sin( (double) src->f[3] );
+#endif
+}
+
+/**
+ * Per-channel square root: intended to compute sqrt(src).
+ *
+ * The sqrt()-based implementation is compiled out with "#if 0", so this
+ * function currently does nothing and leaves \p dst unmodified.
+ */
+static void
+micro_sqrt( union spu_exec_channel *dst,
+            const union spu_exec_channel *src )
+{
+#if 0
+   dst->f[0] = (float) sqrt( (double) src->f[0] );
+   dst->f[1] = (float) sqrt( (double) src->f[1] );
+   dst->f[2] = (float) sqrt( (double) src->f[2] );
+   dst->f[3] = (float) sqrt( (double) src->f[3] );
+#endif
+}
+
+/**
+ * Per-channel floating-point subtraction: dst = src0 - src1.
+ */
+static void
+micro_sub(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1 )
+{
+   uint lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      dst->f[lane] = src0->f[lane] - src1->f[lane];
+   }
+}
+
+/**
+ * Per-channel conversion from the unsigned integer view of \p src to
+ * floating point in \p dst.
+ */
+static void
+micro_u2f(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src )
+{
+   uint lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      dst->f[lane] = (float) src->u[lane];
+   }
+}
+
+/**
+ * Per-channel bitwise XOR of the unsigned views: dst = src0 ^ src1.
+ */
+static void
+micro_xor(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1 )
+{
+   uint lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      dst->u[lane] = src0->u[lane] ^ src1->u[lane];
+   }
+}
+
+/**
+ * Read one (already swizzled) component of a source register for all
+ * four channels.
+ *
+ * \param mach     interpreter state holding the register files
+ * \param file     TGSI_FILE_x register file to read from
+ * \param swizzle  TGSI_EXTSWIZZLE_x selector; X..W pick a component of
+ *                 the register, ZERO/ONE return the machine's constant
+ *                 0 / 1 temporaries regardless of \p file and \p index
+ * \param index    per-channel register index within \p file
+ * \param chan     receives the four fetched values
+ *
+ * Unknown files or swizzles trigger assert(0) and leave \p chan as is.
+ */
+static void
+fetch_src_file_channel(
+   const struct spu_exec_machine *mach,
+   const uint file,
+   const uint swizzle,
+   const union spu_exec_channel *index,
+   union spu_exec_channel *chan )
+{
+   uint q;
+
+   /* The two constant swizzles do not look at 'file' or 'index'. */
+   if (swizzle == TGSI_EXTSWIZZLE_ZERO) {
+      *chan = mach->Temps[TEMP_0_I].xyzw[TEMP_0_C];
+      return;
+   }
+   if (swizzle == TGSI_EXTSWIZZLE_ONE) {
+      *chan = mach->Temps[TEMP_1_I].xyzw[TEMP_1_C];
+      return;
+   }
+
+   if (swizzle != TGSI_EXTSWIZZLE_X &&
+       swizzle != TGSI_EXTSWIZZLE_Y &&
+       swizzle != TGSI_EXTSWIZZLE_Z &&
+       swizzle != TGSI_EXTSWIZZLE_W) {
+      assert( 0 );
+      return;
+   }
+
+   /* Regular component fetch: channel q uses its own index->i[q]. */
+   switch( file ) {
+   case TGSI_FILE_CONSTANT:
+      for (q = 0; q < 4; q++) {
+         chan->f[q] = mach->Consts[index->i[q]][swizzle];
+      }
+      break;
+
+   case TGSI_FILE_INPUT:
+      for (q = 0; q < 4; q++) {
+         chan->u[q] = mach->Inputs[index->i[q]].xyzw[swizzle].u[q];
+      }
+      break;
+
+   case TGSI_FILE_TEMPORARY:
+      for (q = 0; q < 4; q++) {
+         chan->u[q] = mach->Temps[index->i[q]].xyzw[swizzle].u[q];
+      }
+      break;
+
+   case TGSI_FILE_IMMEDIATE:
+      /* validate every index before touching the immediates */
+      for (q = 0; q < 4; q++) {
+         assert( index->i[q] < (int) mach->ImmLimit );
+      }
+      for (q = 0; q < 4; q++) {
+         chan->f[q] = mach->Imms[index->i[q]][swizzle];
+      }
+      break;
+
+   case TGSI_FILE_ADDRESS:
+      for (q = 0; q < 4; q++) {
+         chan->u[q] = mach->Addrs[index->i[q]].xyzw[swizzle].u[q];
+      }
+      break;
+
+   case TGSI_FILE_OUTPUT:
+      /* vertex/fragment output vars can be read too */
+      for (q = 0; q < 4; q++) {
+         chan->u[q] = mach->Outputs[index->i[q]].xyzw[swizzle].u[q];
+      }
+      break;
+
+   default:
+      assert( 0 );
+   }
+}
+
+/**
+ * Fetch one component of a full source operand for all four channels,
+ * applying register indexing and source modifiers.
+ *
+ * Steps, in order:
+ *  1. start from the register's static index;
+ *  2. if the register is indirect, add the per-channel value read from
+ *     the indirection register (its X swizzle);
+ *  3. if the register is two-dimensional, scale the index by a
+ *     file-specific stride (17 for inputs, 4096 for constants), add the
+ *     dimension index and, if that is indirect too, the value read from
+ *     the dimension's indirection register;
+ *  4. read the component selected by the extended swizzle;
+ *  5. apply the sign mode (abs / -abs / negate / keep);
+ *  6. if the complement modifier is set, replace the value by 1 - value.
+ *
+ * \param mach        interpreter state
+ * \param chan        receives the fetched values
+ * \param reg         source operand description
+ * \param chan_index  which component (CHAN_X..CHAN_W) of the operand
+ */
+static void
+fetch_source(
+   const struct spu_exec_machine *mach,
+   union spu_exec_channel *chan,
+   const struct tgsi_full_src_register *reg,
+   const uint chan_index )
+{
+   union spu_exec_channel index;
+   uint swizzle;
+   uint q;
+
+   for (q = 0; q < 4; q++) {
+      index.i[q] = reg->SrcRegister.Index;
+   }
+
+   if (reg->SrcRegister.Indirect) {
+      union spu_exec_channel ind_reg;
+      union spu_exec_channel offset;
+
+      for (q = 0; q < 4; q++) {
+         ind_reg.i[q] = reg->SrcRegisterInd.Index;
+      }
+
+      swizzle = tgsi_util_get_src_register_swizzle(&reg->SrcRegisterInd,
+                                                   CHAN_X);
+      fetch_src_file_channel(mach, reg->SrcRegisterInd.File, swizzle,
+                             &ind_reg, &offset);
+
+      for (q = 0; q < 4; q++) {
+         index.i[q] += offset.i[q];
+      }
+   }
+
+   if( reg->SrcRegister.Dimension ) {
+      /* scale by the per-file stride of the second dimension */
+      switch( reg->SrcRegister.File ) {
+      case TGSI_FILE_INPUT:
+         for (q = 0; q < 4; q++) {
+            index.i[q] *= 17;
+         }
+         break;
+      case TGSI_FILE_CONSTANT:
+         for (q = 0; q < 4; q++) {
+            index.i[q] *= 4096;
+         }
+         break;
+      default:
+         assert( 0 );
+      }
+
+      for (q = 0; q < 4; q++) {
+         index.i[q] += reg->SrcRegisterDim.Index;
+      }
+
+      if (reg->SrcRegisterDim.Indirect) {
+         union spu_exec_channel ind_reg;
+         union spu_exec_channel offset;
+
+         for (q = 0; q < 4; q++) {
+            ind_reg.i[q] = reg->SrcRegisterDimInd.Index;
+         }
+
+         swizzle = tgsi_util_get_src_register_swizzle( &reg->SrcRegisterDimInd, CHAN_X );
+         fetch_src_file_channel(mach, reg->SrcRegisterDimInd.File, swizzle,
+                                &ind_reg, &offset);
+
+         for (q = 0; q < 4; q++) {
+            index.i[q] += offset.i[q];
+         }
+      }
+   }
+
+   swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );
+   fetch_src_file_channel(mach, reg->SrcRegister.File, swizzle, &index, chan);
+
+   switch (tgsi_util_get_full_src_register_sign_mode( reg, chan_index )) {
+   case TGSI_UTIL_SIGN_KEEP:
+      break;
+
+   case TGSI_UTIL_SIGN_TOGGLE:
+      micro_neg( chan, chan );
+      break;
+
+   case TGSI_UTIL_SIGN_CLEAR:
+      micro_abs( chan, chan );
+      break;
+
+   case TGSI_UTIL_SIGN_SET:
+      /* force the sign bit on: -|x| */
+      micro_abs( chan, chan );
+      micro_neg( chan, chan );
+      break;
+   }
+
+   if (reg->SrcRegisterExtMod.Complement) {
+      /* complement modifier: 1.0 - value */
+      micro_sub( chan, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], chan );
+   }
+}
+
+/**
+ * Store one component of an instruction result into its destination
+ * register, honouring the saturate modifier and the execution mask.
+ *
+ * \param mach        interpreter state
+ * \param chan        four-channel value to store
+ * \param reg         destination operand (NULL, OUTPUT, TEMPORARY or
+ *                    ADDRESS file; anything else asserts)
+ * \param inst        instruction being executed (supplies Saturate)
+ * \param chan_index  destination component (CHAN_X..CHAN_W)
+ *
+ * Only channels whose bit is set in mach->ExecMask are written.  This
+ * also holds for TGSI_SAT_ZERO_ONE: the value is clamped to [0,1] in a
+ * temporary first so that inactive channels of the destination are not
+ * overwritten.  TGSI_SAT_MINUS_PLUS_ONE is not implemented (asserts and
+ * stores nothing).
+ */
+static void
+store_dest(
+   struct spu_exec_machine *mach,
+   const union spu_exec_channel *chan,
+   const struct tgsi_full_dst_register *reg,
+   const struct tgsi_full_instruction *inst,
+   uint chan_index )
+{
+   union spu_exec_channel *dst;
+   union spu_exec_channel clamped;
+   const union spu_exec_channel *value;
+
+   switch( reg->DstRegister.File ) {
+   case TGSI_FILE_NULL:
+      return;
+
+   case TGSI_FILE_OUTPUT:
+      /* outputs are offset by the current output base held in a temp */
+      dst = &mach->Outputs[mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0]
+                           + reg->DstRegister.Index].xyzw[chan_index];
+      break;
+
+   case TGSI_FILE_TEMPORARY:
+      dst = &mach->Temps[reg->DstRegister.Index].xyzw[chan_index];
+      break;
+
+   case TGSI_FILE_ADDRESS:
+      dst = &mach->Addrs[reg->DstRegister.Index].xyzw[chan_index];
+      break;
+
+   default:
+      assert( 0 );
+      return;
+   }
+
+   switch (inst->Instruction.Saturate)
+   {
+   case TGSI_SAT_NONE:
+      value = chan;
+      break;
+
+   case TGSI_SAT_ZERO_ONE:
+      /* clamp into a scratch register so ExecMask can be applied below */
+      micro_max(&clamped, chan, &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
+      micro_min(&clamped, &clamped, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C]);
+      value = &clamped;
+      break;
+
+   case TGSI_SAT_MINUS_PLUS_ONE:
+      assert( 0 );
+      return;
+
+   default:
+      assert( 0 );
+      return;
+   }
+
+   /* masked store: bit N of ExecMask enables channel N */
+   if (mach->ExecMask & 0x1)
+      dst->i[0] = value->i[0];
+   if (mach->ExecMask & 0x2)
+      dst->i[1] = value->i[1];
+   if (mach->ExecMask & 0x4)
+      dst->i[2] = value->i[2];
+   if (mach->ExecMask & 0x8)
+      dst->i[3] = value->i[3];
+}
+
+/*
+ * Shorthands for fetch_source() / store_dest() on the current
+ * instruction.  Both expand to code that uses variables named 'mach'
+ * and 'inst', which must be in scope at the point of use.
+ *   VAL   - pointer to the channel to fill (FETCH) or to store (STORE)
+ *   INDEX - source (FETCH) or destination (STORE) operand number
+ *   CHAN  - component index
+ */
+#define FETCH(VAL,INDEX,CHAN)\
+    fetch_source (mach, VAL, &inst->FullSrcRegisters[INDEX], CHAN)
+
+#define STORE(VAL,INDEX,CHAN)\
+    store_dest (mach, VAL, &inst->FullDstRegisters[INDEX], inst, CHAN )
+
+
+/**
+ * Execute ARB-style KIL which is predicated by a src register.
+ * Kill fragment if any of the four values is less than zero.
+ *
+ * Each distinct source component (after swizzling) is tested only once;
+ * the constant 0 and 1 swizzles can never be negative and are skipped.
+ * Only fragments that are currently executing (bit set in
+ * mach->ExecMask) are killed; the resulting bits are OR-ed into the
+ * machine's kill-mask temporary.
+ */
+static void
+exec_kilp(struct spu_exec_machine *mach,
+          const struct tgsi_full_instruction *inst)
+{
+   uint uniquemask;
+   uint chan_index;
+   uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
+   union spu_exec_channel r[1];
+
+   /* This mask stores component bits that were already tested. Note that
+    * we test if the value is less than zero, so 1.0 and 0.0 need not to be
+    * tested. */
+   uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE);
+
+   for (chan_index = 0; chan_index < 4; chan_index++)
+   {
+      uint swizzle;
+      uint i;
+
+      /* unswizzle channel */
+      swizzle = tgsi_util_get_full_src_register_extswizzle (
+                        &inst->FullSrcRegisters[0],
+                        chan_index);
+
+      /* check if the component has not been already tested */
+      if (uniquemask & (1 << swizzle))
+         continue;
+      uniquemask |= 1 << swizzle;
+
+      FETCH(&r[0], 0, chan_index);
+      for (i = 0; i < 4; i++)
+         if (r[0].f[i] < 0.0f)
+            kilmask |= 1 << i;
+   }
+
+   /* restrict the kill to fragments that are currently executing, the
+    * same way the unconditional KIL only touches ExecMask bits */
+   kilmask &= mach->ExecMask;
+
+   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
+}
+
+
+/*
+ * Fetch a texel using STR texture coordinates.
+ *
+ * sampler  - sampler whose get_samples() callback does the lookup
+ * s, t, p  - texture coordinates for the four channels; t and p may be
+ *            NULL (1D lookups pass NULL for both), in which case NULL is
+ *            handed on to the sampler for that coordinate
+ * lodbias  - single LOD bias applied to all four channels
+ * r,g,b,a  - receive the sampled colour; they may alias s/t/p because
+ *            the result is staged in a local array first
+ */
+static void
+fetch_texel( struct spu_sampler *sampler,
+             const union spu_exec_channel *s,
+             const union spu_exec_channel *t,
+             const union spu_exec_channel *p,
+             float lodbias,  /* XXX should be float[4] */
+             union spu_exec_channel *r,
+             union spu_exec_channel *g,
+             union spu_exec_channel *b,
+             union spu_exec_channel *a )
+{
+   uint j;
+   float rgba[NUM_CHANNELS][QUAD_SIZE];
+
+   /* do not form t->f / p->f from a NULL coordinate pointer */
+   sampler->get_samples(sampler,
+                        s->f,
+                        t ? t->f : NULL,
+                        p ? p->f : NULL,
+                        lodbias, rgba);
+
+   for (j = 0; j < 4; j++) {
+      r->f[j] = rgba[0][j];
+      g->f[j] = rgba[1][j];
+      b->f[j] = rgba[2][j];
+      a->f[j] = rgba[3][j];
+   }
+}
+
+
+/**
+ * Execute a texture sampling instruction.
+ *
+ * The texture coordinate comes from source operand 0 and the sampler
+ * unit from the register index of source operand 1.  When the operand's
+ * ExtDivide selects W, the coordinates are divided by the W component
+ * first.  When \p biasLod is true, the LOD bias is taken from channel 0
+ * of the coordinate's W component, otherwise it is 0.  The sampled
+ * colour is stored to the enabled channels of destination 0.
+ *
+ * \param mach     interpreter state
+ * \param inst     instruction being executed
+ * \param biasLod  whether to apply a LOD bias read from coordinate W
+ */
+static void
+exec_tex(struct spu_exec_machine *mach,
+         const struct tgsi_full_instruction *inst,
+         boolean biasLod)
+{
+   const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index;
+   union spu_exec_channel r[8];
+   uint chan_index;
+   float lodBias;
+
+   switch (inst->InstructionExtTexture.Texture) {
+   case TGSI_TEXTURE_1D:
+
+      FETCH(&r[0], 0, CHAN_X);
+
+      switch (inst->FullSrcRegisters[0].SrcRegisterExtSwz.ExtDivide) {
+      case TGSI_EXTSWIZZLE_W:
+         FETCH(&r[1], 0, CHAN_W);
+         micro_div( &r[0], &r[0], &r[1] );
+         break;
+
+      case TGSI_EXTSWIZZLE_ONE:
+         break;
+
+      default:
+         assert (0);
+      }
+
+      if (biasLod) {
+         /* W was fetched into r[1]; read the bias from that register */
+         FETCH(&r[1], 0, CHAN_W);
+         lodBias = r[1].f[0];
+      }
+      else
+         lodBias = 0.0;
+
+      fetch_texel(&mach->Samplers[unit],
+                  &r[0], NULL, NULL, lodBias,  /* S, T, P, BIAS */
+                  &r[0], &r[1], &r[2], &r[3]); /* R, G, B, A */
+      break;
+
+   case TGSI_TEXTURE_2D:
+   case TGSI_TEXTURE_RECT:
+   case TGSI_TEXTURE_3D:
+   case TGSI_TEXTURE_CUBE:
+      /* all of these sample with three coordinates in the same way */
+
+      FETCH(&r[0], 0, CHAN_X);
+      FETCH(&r[1], 0, CHAN_Y);
+      FETCH(&r[2], 0, CHAN_Z);
+
+      switch (inst->FullSrcRegisters[0].SrcRegisterExtSwz.ExtDivide) {
+      case TGSI_EXTSWIZZLE_W:
+         FETCH(&r[3], 0, CHAN_W);
+         micro_div( &r[0], &r[0], &r[3] );
+         micro_div( &r[1], &r[1], &r[3] );
+         micro_div( &r[2], &r[2], &r[3] );
+         break;
+
+      case TGSI_EXTSWIZZLE_ONE:
+         break;
+
+      default:
+         assert (0);
+      }
+
+      if (biasLod) {
+         FETCH(&r[3], 0, CHAN_W);
+         lodBias = r[3].f[0];
+      }
+      else
+         lodBias = 0.0;
+
+      fetch_texel(&mach->Samplers[unit],
+                  &r[0], &r[1], &r[2], lodBias,  /* inputs */
+                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
+      break;
+
+   default:
+      assert (0);
+   }
+
+   /* r[0..3] now hold R, G, B, A */
+   FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+      STORE( &r[chan_index], 0, chan_index );
+   }
+}
+
+
+
+/**
+ * Constant (flat) interpolation of one input component: every channel
+ * of Inputs[attrib].xyzw[chan] is set to the attribute's a0 coefficient.
+ */
+static void
+constant_interpolation(
+   struct spu_exec_machine *mach,
+   unsigned attrib,
+   unsigned chan )
+{
+   unsigned q = 0;
+
+   while( q < QUAD_SIZE ) {
+      mach->Inputs[attrib].xyzw[chan].f[q] = mach->InterpCoefs[attrib].a0[chan];
+      q++;
+   }
+}
+
+/**
+ * Linear interpolation of one input component.
+ *
+ * The attribute value is evaluated at the position taken from channel 0
+ * of QuadPos x/y; the remaining channels are derived by adding dadx,
+ * dady, and both.
+ * NOTE(review): this presumably corresponds to a 2x2 quad laid out as
+ * (x,y), (x+1,y), (x,y+1), (x+1,y+1) -- confirm against the rasterizer.
+ */
+static void
+linear_interpolation(
+   struct spu_exec_machine *mach,
+   unsigned attrib,
+   unsigned chan )
+{
+   const float x = mach->QuadPos.xyzw[0].f[0];
+   const float y = mach->QuadPos.xyzw[1].f[0];
+   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
+   const float dady = mach->InterpCoefs[attrib].dady[chan];
+   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
+   float quad[4];
+   unsigned q;
+
+   quad[0] = a0;
+   quad[1] = a0 + dadx;
+   quad[2] = a0 + dady;
+   quad[3] = a0 + dadx + dady;
+
+   for( q = 0; q < 4; q++ ) {
+      mach->Inputs[attrib].xyzw[chan].f[q] = quad[q];
+   }
+}
+
+/**
+ * Perspective-correct interpolation of one input component.
+ *
+ * Same plane evaluation as the linear case (base value at channel 0 of
+ * QuadPos x/y, then + dadx, + dady, + both), but each channel's value
+ * is divided by that channel's QuadPos W.
+ */
+static void
+perspective_interpolation(
+   struct spu_exec_machine *mach,
+   unsigned attrib,
+   unsigned chan )
+{
+   const float x = mach->QuadPos.xyzw[0].f[0];
+   const float y = mach->QuadPos.xyzw[1].f[0];
+   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
+   const float dady = mach->InterpCoefs[attrib].dady[chan];
+   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
+   const float *w = mach->QuadPos.xyzw[3].f;
+   float numer[4];
+   unsigned q;
+
+   numer[0] = a0;
+   numer[1] = a0 + dadx;
+   numer[2] = a0 + dady;
+   numer[3] = a0 + dadx + dady;
+
+   /* divide by W here */
+   for( q = 0; q < 4; q++ ) {
+      mach->Inputs[attrib].xyzw[chan].f[q] = numer[q] / w[q];
+   }
+}
+
+
+/**
+ * Signature shared by constant_interpolation(), linear_interpolation()
+ * and perspective_interpolation(): fill component \p chan of input
+ * attribute \p attrib in \p mach.
+ */
+typedef void (* interpolation_func)(
+   struct spu_exec_machine *mach,
+   unsigned attrib,
+   unsigned chan );
+
+/**
+ * Process a declaration token.
+ *
+ * Only fragment-shader input declarations have an effect: for every
+ * register in the declared range and every component enabled in the
+ * declaration's usage mask, the input is interpolated using the mode
+ * (constant, linear or perspective) named by the declaration.  All
+ * other declarations are ignored.
+ *
+ * An unknown interpolation mode asserts; with asserts compiled out the
+ * declaration is skipped instead of calling through an uninitialised
+ * function pointer.
+ */
+static void
+exec_declaration(struct spu_exec_machine *mach,
+                 const struct tgsi_full_declaration *decl)
+{
+   if( mach->Processor == TGSI_PROCESSOR_FRAGMENT ) {
+      if( decl->Declaration.File == TGSI_FILE_INPUT ) {
+         unsigned first, last, mask;
+         interpolation_func interp;
+
+         assert( decl->Declaration.Declare == TGSI_DECLARE_RANGE );
+
+         first = decl->u.DeclarationRange.First;
+         last = decl->u.DeclarationRange.Last;
+         mask = decl->Declaration.UsageMask;
+
+         switch( decl->Interpolation.Interpolate ) {
+         case TGSI_INTERPOLATE_CONSTANT:
+            interp = constant_interpolation;
+            break;
+
+         case TGSI_INTERPOLATE_LINEAR:
+            interp = linear_interpolation;
+            break;
+
+         case TGSI_INTERPOLATE_PERSPECTIVE:
+            interp = perspective_interpolation;
+            break;
+
+         default:
+            assert( 0 );
+            return;   /* never call through an unset 'interp' */
+         }
+
+         if( mask == TGSI_WRITEMASK_XYZW ) {
+            unsigned i, j;
+
+            /* fast path: every component of every register */
+            for( i = first; i <= last; i++ ) {
+               for( j = 0; j < NUM_CHANNELS; j++ ) {
+                  interp( mach, i, j );
+               }
+            }
+         }
+         else {
+            unsigned i, j;
+
+            /* only the components selected by the usage mask */
+            for( j = 0; j < NUM_CHANNELS; j++ ) {
+               if( mask & (1 << j) ) {
+                  for( i = first; i <= last; i++ ) {
+                     interp( mach, i, j );
+                  }
+               }
+            }
+         }
+      }
+   }
+}
+
+static void
+exec_instruction(
+   struct spu_exec_machine *mach,
+   const struct tgsi_full_instruction *inst,
+   int *pc )
+{
+   uint chan_index;
+   union spu_exec_channel r[8];
+
+   (*pc)++;
+
+   switch (inst->Instruction.Opcode) {
+   case TGSI_OPCODE_ARL:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+	 FETCH( &r[0], 0, chan_index );
+	 micro_f2it( &r[0], &r[0] );
+	 STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_MOV:
+   /* TGSI_OPCODE_SWZ */
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_LIT:
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
+	 STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X );
+      }
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y ) || IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
+	 FETCH( &r[0], 0, CHAN_X );
+	 if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
+	    micro_max( &r[0], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
+	    STORE( &r[0], 0, CHAN_Y );
+	 }
+
+	 if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
+	    FETCH( &r[1], 0, CHAN_Y );
+	    micro_max( &r[1], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
+
+	    FETCH( &r[2], 0, CHAN_W );
+	    micro_min( &r[2], &r[2], &mach->Temps[TEMP_128_I].xyzw[TEMP_128_C] );
+	    micro_max( &r[2], &r[2], &mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C] );
+	    micro_pow( &r[1], &r[1], &r[2] );
+	    micro_lt( &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
+	    STORE( &r[0], 0, CHAN_Z );
+	 }
+      }
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
+	 STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
+      }
+      break;
+
+   case TGSI_OPCODE_RCP:
+   /* TGSI_OPCODE_RECIP */
+      FETCH( &r[0], 0, CHAN_X );
+      micro_div( &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0] );
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+	 STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_RSQ:
+   /* TGSI_OPCODE_RECIPSQRT */
+      FETCH( &r[0], 0, CHAN_X );
+      micro_sqrt( &r[0], &r[0] );
+      micro_div( &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0] );
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+	 STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_EXP:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_LOG:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_MUL:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index )
+      {
+         FETCH(&r[0], 0, chan_index);
+         FETCH(&r[1], 1, chan_index);
+
+         micro_mul( &r[0], &r[0], &r[1] );
+
+         STORE(&r[0], 0, chan_index);
+      }
+      break;
+
+   case TGSI_OPCODE_ADD:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         micro_add( &r[0], &r[0], &r[1] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_DP3:
+   /* TGSI_OPCODE_DOT3 */
+      FETCH( &r[0], 0, CHAN_X );
+      FETCH( &r[1], 1, CHAN_X );
+      micro_mul( &r[0], &r[0], &r[1] );
+
+      FETCH( &r[1], 0, CHAN_Y );
+      FETCH( &r[2], 1, CHAN_Y );
+      micro_mul( &r[1], &r[1], &r[2] );
+      micro_add( &r[0], &r[0], &r[1] );
+
+      FETCH( &r[1], 0, CHAN_Z );
+      FETCH( &r[2], 1, CHAN_Z );
+      micro_mul( &r[1], &r[1], &r[2] );
+      micro_add( &r[0], &r[0], &r[1] );
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+    case TGSI_OPCODE_DP4:
+    /* TGSI_OPCODE_DOT4 */
+       FETCH(&r[0], 0, CHAN_X);
+       FETCH(&r[1], 1, CHAN_X);
+
+       micro_mul( &r[0], &r[0], &r[1] );
+
+       FETCH(&r[1], 0, CHAN_Y);
+       FETCH(&r[2], 1, CHAN_Y);
+
+       micro_mul( &r[1], &r[1], &r[2] );
+       micro_add( &r[0], &r[0], &r[1] );
+
+       FETCH(&r[1], 0, CHAN_Z);
+       FETCH(&r[2], 1, CHAN_Z);
+
+       micro_mul( &r[1], &r[1], &r[2] );
+       micro_add( &r[0], &r[0], &r[1] );
+
+       FETCH(&r[1], 0, CHAN_W);
+       FETCH(&r[2], 1, CHAN_W);
+
+       micro_mul( &r[1], &r[1], &r[2] );
+       micro_add( &r[0], &r[0], &r[1] );
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+	 STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_DST:
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
+	 STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X );
+      }
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
+	 FETCH( &r[0], 0, CHAN_Y );
+	 FETCH( &r[1], 1, CHAN_Y);
+	 micro_mul( &r[0], &r[0], &r[1] );
+	 STORE( &r[0], 0, CHAN_Y );
+      }
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
+	 FETCH( &r[0], 0, CHAN_Z );
+	 STORE( &r[0], 0, CHAN_Z );
+      }
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
+	 FETCH( &r[0], 1, CHAN_W );
+	 STORE( &r[0], 0, CHAN_W );
+      }
+      break;
+
+   case TGSI_OPCODE_MIN:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH(&r[0], 0, chan_index);
+         FETCH(&r[1], 1, chan_index);
+
+         /* XXX use micro_min()?? */
+         micro_lt( &r[0], &r[0], &r[1], &r[0], &r[1] );
+
+         STORE(&r[0], 0, chan_index);
+      }
+      break;
+
+   case TGSI_OPCODE_MAX:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH(&r[0], 0, chan_index);
+         FETCH(&r[1], 1, chan_index);
+
+         /* XXX use micro_max()?? */
+         micro_lt( &r[0], &r[0], &r[1], &r[1], &r[0] );
+
+         STORE(&r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SLT:
+   /* TGSI_OPCODE_SETLT */
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         micro_lt( &r[0], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SGE:
+   /* TGSI_OPCODE_SETGE */
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         micro_ge( &r[0], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_MAD:
+   /* TGSI_OPCODE_MADD */
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         micro_mul( &r[0], &r[0], &r[1] );
+         FETCH( &r[1], 2, chan_index );
+         micro_add( &r[0], &r[0], &r[1] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SUB:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH(&r[0], 0, chan_index);
+         FETCH(&r[1], 1, chan_index);
+
+         micro_sub( &r[0], &r[0], &r[1] );
+
+         STORE(&r[0], 0, chan_index);
+      }
+      break;
+
+   case TGSI_OPCODE_LERP:
+   /* TGSI_OPCODE_LRP */
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH(&r[0], 0, chan_index);
+         FETCH(&r[1], 1, chan_index);
+         FETCH(&r[2], 2, chan_index);
+
+         micro_sub( &r[1], &r[1], &r[2] );
+         micro_mul( &r[0], &r[0], &r[1] );
+         micro_add( &r[0], &r[0], &r[2] );
+
+         STORE(&r[0], 0, chan_index);
+      }
+      break;
+
+   case TGSI_OPCODE_CND:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_CND0:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_DOT2ADD:
+      /* TGSI_OPCODE_DP2A */
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_INDEX:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_NEGATE:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_FRAC:
+   /* TGSI_OPCODE_FRC */
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         micro_frc( &r[0], &r[0] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_CLAMP:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_FLOOR:
+   /* TGSI_OPCODE_FLR */
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         micro_flr( &r[0], &r[0] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_ROUND:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         micro_rnd( &r[0], &r[0] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_EXPBASE2:
+    /* TGSI_OPCODE_EX2 */
+      FETCH(&r[0], 0, CHAN_X);
+
+      micro_pow( &r[0], &mach->Temps[TEMP_2_I].xyzw[TEMP_2_C], &r[0] );
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+	 STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_LOGBASE2:
+   /* TGSI_OPCODE_LG2 */
+      FETCH( &r[0], 0, CHAN_X );
+      micro_lg2( &r[0], &r[0] );
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_POWER:
+      /* TGSI_OPCODE_POW */
+      FETCH(&r[0], 0, CHAN_X);
+      FETCH(&r[1], 1, CHAN_X);
+
+      micro_pow( &r[0], &r[0], &r[1] );
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+	 STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_CROSSPRODUCT:
+      /* TGSI_OPCODE_XPD */
+      FETCH(&r[0], 0, CHAN_Y);
+      FETCH(&r[1], 1, CHAN_Z);
+
+      micro_mul( &r[2], &r[0], &r[1] );
+
+      FETCH(&r[3], 0, CHAN_Z);
+      FETCH(&r[4], 1, CHAN_Y);
+
+      micro_mul( &r[5], &r[3], &r[4] );
+      micro_sub( &r[2], &r[2], &r[5] );
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
+         STORE( &r[2], 0, CHAN_X );
+      }
+
+      FETCH(&r[2], 1, CHAN_X);
+
+      micro_mul( &r[3], &r[3], &r[2] );
+
+      FETCH(&r[5], 0, CHAN_X);
+
+      micro_mul( &r[1], &r[1], &r[5] );
+      micro_sub( &r[3], &r[3], &r[1] );
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
+         STORE( &r[3], 0, CHAN_Y );
+      }
+
+      micro_mul( &r[5], &r[5], &r[4] );
+      micro_mul( &r[0], &r[0], &r[2] );
+      micro_sub( &r[5], &r[5], &r[0] );
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
+         STORE( &r[5], 0, CHAN_Z );
+      }
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
+         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
+      }
+      break;
+
+    case TGSI_OPCODE_MULTIPLYMATRIX:
+       assert (0);
+       break;
+
+    case TGSI_OPCODE_ABS:
+       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+          FETCH(&r[0], 0, chan_index);
+
+          micro_abs( &r[0], &r[0] );
+
+          STORE(&r[0], 0, chan_index);
+       }
+       break;
+
+   case TGSI_OPCODE_RCC:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_DPH:
+      FETCH(&r[0], 0, CHAN_X);
+      FETCH(&r[1], 1, CHAN_X);
+
+      micro_mul( &r[0], &r[0], &r[1] );
+
+      FETCH(&r[1], 0, CHAN_Y);
+      FETCH(&r[2], 1, CHAN_Y);
+
+      micro_mul( &r[1], &r[1], &r[2] );
+      micro_add( &r[0], &r[0], &r[1] );
+
+      FETCH(&r[1], 0, CHAN_Z);
+      FETCH(&r[2], 1, CHAN_Z);
+
+      micro_mul( &r[1], &r[1], &r[2] );
+      micro_add( &r[0], &r[0], &r[1] );
+
+      FETCH(&r[1], 1, CHAN_W);
+
+      micro_add( &r[0], &r[0], &r[1] );
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+	 STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_COS:
+      FETCH(&r[0], 0, CHAN_X);
+
+      micro_cos( &r[0], &r[0] );
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+	 STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_DDX:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         micro_ddx( &r[0], &r[0] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_DDY:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         micro_ddy( &r[0], &r[0] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_KILP:
+      exec_kilp (mach, inst);
+      break;
+
+   case TGSI_OPCODE_KIL:
+      /* for enabled ExecMask bits, set the killed bit */
+      mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= mach->ExecMask;
+      break;
+
+   case TGSI_OPCODE_PK2H:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_PK2US:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_PK4B:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_PK4UB:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_RFL:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_SEQ:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         micro_eq( &r[0], &r[0], &r[1],
+                   &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C],
+                   &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SFL:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_SGT:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         micro_lt( &r[0], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SIN:
+      FETCH( &r[0], 0, CHAN_X );
+      micro_sin( &r[0], &r[0] );
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SLE:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         micro_ge( &r[0], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SNE:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         micro_eq( &r[0], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_STR:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_TEX:
+      /* simple texture lookup */
+      /* src[0] = texcoord */
+      /* src[1] = sampler unit */
+      exec_tex(mach, inst, FALSE);
+      break;
+
+   case TGSI_OPCODE_TXB:
+      /* Texture lookup with lod bias */
+      /* src[0] = texcoord (src[0].w = lod bias) */
+      /* src[1] = sampler unit */
+      exec_tex(mach, inst, TRUE);
+      break;
+
+   case TGSI_OPCODE_TXD:
+      /* Texture lookup with explicit partial derivatives */
+      /* src[0] = texcoord */
+      /* src[1] = d[strq]/dx */
+      /* src[2] = d[strq]/dy */
+      /* src[3] = sampler unit */
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_TXL:
+      /* Texture lookup with explicit LOD */
+      /* src[0] = texcoord (src[0].w = lod bias) */
+      /* src[1] = sampler unit */
+      exec_tex(mach, inst, TRUE);
+      break;
+
+   case TGSI_OPCODE_UP2H:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_UP2US:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_UP4B:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_UP4UB:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_X2D:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_ARA:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_ARR:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_BRA:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_CAL:
+      /* skip the call if no execution channels are enabled */
+      if (mach->ExecMask) {
+         /* do the call */
+
+         /* push the Cond, Loop, Cont stacks */
+         assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
+         mach->CondStack[mach->CondStackTop++] = mach->CondMask;
+         assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
+         mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
+         assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
+         mach->ContStack[mach->ContStackTop++] = mach->ContMask;
+
+         assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
+         mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
+
+         /* note that PC was already incremented above */
+         mach->CallStack[mach->CallStackTop++] = *pc;
+         *pc = inst->InstructionExtLabel.Label;
+      }
+      break;
+
+   case TGSI_OPCODE_RET:
+      mach->FuncMask &= ~mach->ExecMask;
+      UPDATE_EXEC_MASK(mach);
+
+      if (mach->ExecMask == 0x0) {
+         /* really return now (otherwise, keep executing) */
+
+         if (mach->CallStackTop == 0) {
+            /* returning from main() */
+            *pc = -1;
+            return;
+         }
+         *pc = mach->CallStack[--mach->CallStackTop];
+
+         /* pop the Cond, Loop, Cont stacks */
+         assert(mach->CondStackTop > 0);
+         mach->CondMask = mach->CondStack[--mach->CondStackTop];
+         assert(mach->LoopStackTop > 0);
+         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
+         assert(mach->ContStackTop > 0);
+         mach->ContMask = mach->ContStack[--mach->ContStackTop];
+         assert(mach->FuncStackTop > 0);
+         mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
+
+         UPDATE_EXEC_MASK(mach);
+      }
+      break;
+
+   case TGSI_OPCODE_SSG:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_CMP:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH(&r[0], 0, chan_index);
+         FETCH(&r[1], 1, chan_index);
+         FETCH(&r[2], 2, chan_index);
+
+         micro_lt( &r[0], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[1], &r[2] );
+
+         STORE(&r[0], 0, chan_index);
+      }
+      break;
+
+   case TGSI_OPCODE_SCS:
+      if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) || IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
+         FETCH( &r[0], 0, CHAN_X );
+      }
+      if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
+         micro_cos( &r[1], &r[0] );
+         STORE( &r[1], 0, CHAN_X );
+      }
+      if( IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
+         micro_sin( &r[1], &r[0] );
+         STORE( &r[1], 0, CHAN_Y );
+      }
+      if( IS_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
+         STORE( &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, CHAN_Z );
+      }
+      if( IS_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
+         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
+      }
+      break;
+
+   case TGSI_OPCODE_NRM:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_DIV:
+      assert( 0 );
+      break;
+
+   case TGSI_OPCODE_DP2:
+      FETCH( &r[0], 0, CHAN_X );
+      FETCH( &r[1], 1, CHAN_X );
+      micro_mul( &r[0], &r[0], &r[1] );
+
+      FETCH( &r[1], 0, CHAN_Y );
+      FETCH( &r[2], 1, CHAN_Y );
+      micro_mul( &r[1], &r[1], &r[2] );
+      micro_add( &r[0], &r[0], &r[1] );
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_IF:
+      /* push CondMask */
+      assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
+      mach->CondStack[mach->CondStackTop++] = mach->CondMask;
+      FETCH( &r[0], 0, CHAN_X );
+      /* update CondMask */
+      if( ! r[0].u[0] ) {
+         mach->CondMask &= ~0x1;
+      }
+      if( ! r[0].u[1] ) {
+         mach->CondMask &= ~0x2;
+      }
+      if( ! r[0].u[2] ) {
+         mach->CondMask &= ~0x4;
+      }
+      if( ! r[0].u[3] ) {
+         mach->CondMask &= ~0x8;
+      }
+      UPDATE_EXEC_MASK(mach);
+      /* Todo: If CondMask==0, jump to ELSE */
+      break;
+
+   case TGSI_OPCODE_ELSE:
+      /* invert CondMask wrt previous mask */
+      {
+         uint prevMask;
+         assert(mach->CondStackTop > 0);
+         prevMask = mach->CondStack[mach->CondStackTop - 1];
+         mach->CondMask = ~mach->CondMask & prevMask;
+         UPDATE_EXEC_MASK(mach);
+         /* Todo: If CondMask==0, jump to ENDIF */
+      }
+      break;
+
+   case TGSI_OPCODE_ENDIF:
+      /* pop CondMask */
+      assert(mach->CondStackTop > 0);
+      mach->CondMask = mach->CondStack[--mach->CondStackTop];
+      UPDATE_EXEC_MASK(mach);
+      break;
+
+   case TGSI_OPCODE_END:
+      /* halt execution */
+      *pc = -1;
+      break;
+
+   case TGSI_OPCODE_REP:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_ENDREP:
+       assert (0);
+       break;
+
+   case TGSI_OPCODE_PUSHA:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_POPA:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_CEIL:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         micro_ceil( &r[0], &r[0] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_I2F:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         micro_i2f( &r[0], &r[0] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_NOT:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         micro_not( &r[0], &r[0] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_TRUNC:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         micro_trunc( &r[0], &r[0] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SHL:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         micro_shl( &r[0], &r[0], &r[1] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SHR:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         micro_ishr( &r[0], &r[0], &r[1] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_AND:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         micro_and( &r[0], &r[0], &r[1] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_OR:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         micro_or( &r[0], &r[0], &r[1] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_MOD:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_XOR:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         micro_xor( &r[0], &r[0], &r[1] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SAD:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_TXF:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_TXQ:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_EMIT:
+      mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += 16;
+      mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]++;
+      break;
+
+   case TGSI_OPCODE_ENDPRIM:
+      mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]++;
+      mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]] = 0;
+      break;
+
+   case TGSI_OPCODE_LOOP:
+      /* fall-through (for now) */
+   case TGSI_OPCODE_BGNLOOP2:
+      /* push LoopMask and ContMasks */
+      assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
+      mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
+      assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
+      mach->ContStack[mach->ContStackTop++] = mach->ContMask;
+      break;
+
+   case TGSI_OPCODE_ENDLOOP:
+      /* fall-through (for now at least) */
+   case TGSI_OPCODE_ENDLOOP2:
+      /* Restore ContMask, but don't pop */
+      assert(mach->ContStackTop > 0);
+      mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
+      if (mach->LoopMask) {
+         /* repeat loop: jump to instruction just past BGNLOOP */
+         *pc = inst->InstructionExtLabel.Label + 1;
+      }
+      else {
+         /* exit loop: pop LoopMask */
+         assert(mach->LoopStackTop > 0);
+         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
+         /* pop ContMask */
+         assert(mach->ContStackTop > 0);
+         mach->ContMask = mach->ContStack[--mach->ContStackTop];
+      }
+      UPDATE_EXEC_MASK(mach);
+      break;
+
+   case TGSI_OPCODE_BRK:
+      /* turn off loop channels for each enabled exec channel */
+      mach->LoopMask &= ~mach->ExecMask;
+      /* Todo: if mach->LoopMask == 0, jump to end of loop */
+      UPDATE_EXEC_MASK(mach);
+      break;
+
+   case TGSI_OPCODE_CONT:
+      /* turn off cont channels for each enabled exec channel */
+      mach->ContMask &= ~mach->ExecMask;
+      /* Todo: if mach->LoopMask == 0, jump to end of loop */
+      UPDATE_EXEC_MASK(mach);
+      break;
+
+   case TGSI_OPCODE_BGNSUB:
+      /* no-op */
+      break;
+
+   case TGSI_OPCODE_ENDSUB:
+      /* no-op */
+      break;
+
+   case TGSI_OPCODE_NOISE1:
+      assert( 0 );
+      break;
+
+   case TGSI_OPCODE_NOISE2:
+      assert( 0 );
+      break;
+
+   case TGSI_OPCODE_NOISE3:
+      assert( 0 );
+      break;
+
+   case TGSI_OPCODE_NOISE4:
+      assert( 0 );
+      break;
+
+   case TGSI_OPCODE_NOP:
+      break;
+
+   default:
+      assert( 0 );
+   }
+}
+
+
+/**
+ * Execute the TGSI program loaded into \p mach: reset the execution masks,
+ * run every declaration, then step instructions until the pc becomes -1.
+ * \return bitmask of "alive" quad components (complement of the kill mask)
+ */
+uint
+spu_exec_machine_run( struct spu_exec_machine *mach )
+{
+   unsigned *const kill_mask = &mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
+   uint i;
+   int pc;
+
+   /* every execution mask starts out as 0xf */
+   mach->CondMask = mach->LoopMask = mach->ContMask = 0xf;
+   mach->FuncMask = mach->ExecMask = 0xf;
+
+   mach->CondStackTop = 0; /* temporarily subvert the assertion just below */
+   assert(mach->CondStackTop == 0);
+   assert(mach->LoopStackTop == 0);
+   assert(mach->ContStackTop == 0);
+   assert(mach->CallStackTop == 0);
+
+   /* clear the kill mask and the output temporary before running */
+   *kill_mask = 0;
+   mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0;
+
+   if( mach->Processor == TGSI_PROCESSOR_GEOMETRY ) {
+      mach->Primitives[0] = 0;
+      mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0] = 0;
+   }
+
+   /* execute declarations (interpolants) */
+   for (i = 0; i < mach->NumDeclarations; i++) {
+      exec_declaration( mach, &mach->Declarations[i] );
+   }
+
+   /* execute instructions; an instruction ends the run by storing -1 in pc */
+   for (pc = 0; pc != -1; ) {
+      assert(pc < mach->NumInstructions);
+      exec_instruction( mach, &mach->Instructions[pc], &pc );
+   }
+
+#if 0
+   /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */
+   if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
+      /*
+       * Scale back depth component.
+       */
+      for (i = 0; i < 4; i++)
+         mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF;
+   }
+#endif
+
+   return ~*kill_mask;
+}
+
+
diff --git a/src/mesa/pipe/cell/spu/spu_exec.h b/src/mesa/pipe/cell/spu/spu_exec.h
new file mode 100644
index 0000000000..89e422ba48
--- /dev/null
+++ b/src/mesa/pipe/cell/spu/spu_exec.h
@@ -0,0 +1,171 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#if !defined SPU_EXEC_H
+#define SPU_EXEC_H
+
+#include "pipe/p_compiler.h"
+#include "pipe/tgsi/exec/tgsi_exec.h"
+
+#if defined __cplusplus
+extern "C" {
+#endif
+
+/**
+  * Registers may be treated as float, signed int or unsigned int.
+  */
+union spu_exec_channel
+{
+   float    f[QUAD_SIZE];  /**< float view, QUAD_SIZE values */
+   int      i[QUAD_SIZE];  /**< signed integer view of the same storage */
+   unsigned u[QUAD_SIZE];  /**< unsigned integer view of the same storage */
+};
+
+/**
+  * A vector[RGBA] of channels[4 pixels]
+  */
+struct spu_exec_vector
+{
+   union spu_exec_channel xyzw[NUM_CHANNELS];  /**< one channel union per vector component */
+};
+
+/**
+ * For fragment programs, information for computing fragment input
+ * values from plane equation of the triangle/line.
+ */
+struct spu_interp_coef
+{
+   float a0[NUM_CHANNELS];	/* in an xyzw layout */
+   float dadx[NUM_CHANNELS];	/* NOTE(review): presumably the per-channel d/dx term -- confirm */
+   float dady[NUM_CHANNELS];	/* NOTE(review): presumably the per-channel d/dy term -- confirm */
+};
+
+
+struct softpipe_tile_cache;  /**< Opaque to TGSI */
+
+/**
+ * Information for sampling textures, which must be implemented
+ * by code outside the TGSI executor.
+ */
+struct spu_sampler
+{
+   const struct pipe_sampler_state *state;  /* read-only sampler state; contents not used in this header */
+   struct pipe_texture *texture;  /* NOTE(review): presumably the texture bound to this sampler -- confirm */
+   /** Get samples for four fragments in a quad */
+   void (*get_samples)(struct spu_sampler *sampler,
+                       const float s[QUAD_SIZE],
+                       const float t[QUAD_SIZE],
+                       const float p[QUAD_SIZE],
+                       float lodbias,
+                       float rgba[NUM_CHANNELS][QUAD_SIZE]);  /* only non-const array: presumably the result -- confirm */
+   void *pipe; /*XXX temporary*/
+   struct softpipe_tile_cache *cache;  /* only declared, never defined, in this header */
+};
+
+
+/**
+ * Run-time virtual machine state for executing TGSI shader.
+ */
+struct spu_exec_machine
+{
+   /*
+    * 32 program temporaries
+    * 4  internal temporaries
+    * 1  address
+    */
+   struct spu_exec_vector       Temps[TGSI_EXEC_NUM_TEMPS 
+				      + TGSI_EXEC_NUM_ADDRS + 1]
+       ALIGN16_ATTRIB;
+
+   struct spu_exec_vector       *Addrs;  /* NOTE(review): presumably points at the address register(s) -- confirm */
+
+   struct spu_sampler           *Samplers;  /* NOTE(review): presumably the array given to spu_exec_machine_init -- confirm */
+
+   float                         Imms[TGSI_EXEC_NUM_IMMEDIATES][4];
+   unsigned                      ImmLimit;  /* NOTE(review): presumably the number of Imms entries in use -- confirm */
+   float                         (*Consts)[4];
+   struct spu_exec_vector       *Inputs;
+   struct spu_exec_vector       *Outputs;
+   unsigned                      Processor;  /* TGSI_PROCESSOR_* value; the run function checks for GEOMETRY */
+
+   /* GEOMETRY processor only. */
+   unsigned                      *Primitives;  /* per-primitive counters: EMIT increments the current one, ENDPRIM zeroes the next */
+
+   /* FRAGMENT processor only. */
+   const struct spu_interp_coef *InterpCoefs;
+   struct spu_exec_vector       QuadPos;
+
+   /* Conditional execution masks */
+   uint CondMask;  /**< For IF/ELSE/ENDIF */
+   uint LoopMask;  /**< For BGNLOOP/ENDLOOP */
+   uint ContMask;  /**< For loop CONT statements */
+   uint FuncMask;  /**< For function calls */
+   uint ExecMask;  /**< Recomputed by UPDATE_EXEC_MASK (NOTE(review): presumably from all four masks above -- confirm) */
+
+   /** Condition mask stack (for nested conditionals) */
+   uint CondStack[TGSI_EXEC_MAX_COND_NESTING];
+   int CondStackTop;  /**< number of entries currently pushed */
+
+   /** Loop mask stack (for nested loops) */
+   uint LoopStack[TGSI_EXEC_MAX_LOOP_NESTING];
+   int LoopStackTop;  /**< number of entries currently pushed */
+
+   /** Loop continue mask stack (see comments in tgsi_exec.c) */
+   uint ContStack[TGSI_EXEC_MAX_LOOP_NESTING];
+   int ContStackTop;  /**< number of entries currently pushed */
+
+   /** Function execution mask stack (for executing subroutine code) */
+   uint FuncStack[TGSI_EXEC_MAX_CALL_NESTING];
+   int FuncStackTop;  /**< number of entries currently pushed */
+
+   /** Function call stack for saving/restoring the program counter */
+   uint CallStack[TGSI_EXEC_MAX_CALL_NESTING];
+   int CallStackTop;  /**< number of saved pcs; CAL pushes one, RET pops one */
+
+   struct tgsi_full_instruction *Instructions;  /**< program stepped through by spu_exec_machine_run() */
+   uint NumInstructions;  /**< upper bound asserted on the pc while running */
+
+   struct tgsi_full_declaration *Declarations;  /**< executed once, in order, before the instructions */
+   uint NumDeclarations;  /**< number of entries in Declarations */
+};
+
+/* NOTE(review): definition is not in this header; presumably one-time setup of \p mach -- confirm */
+extern void
+spu_exec_machine_init(struct spu_exec_machine *mach,
+                      uint numSamplers,
+                      struct spu_sampler *samplers,
+                      unsigned processor);
+
+/** Run the TGSI interpreter; returns the bitmask of "alive" quad components. */
+extern uint
+spu_exec_machine_run( struct spu_exec_machine *mach );
+
+#if defined __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* SPU_EXEC_H */
diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index 6886f283be..9daa3ec735 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -36,6 +36,7 @@
 #include "spu_render.h"
 #include "spu_texture.h"
 #include "spu_tile.h"
+#include "spu_vertex_shader.h"
 #include "pipe/cell/common.h"
 #include "pipe/p_defines.h"
 
@@ -50,6 +51,7 @@ boolean Debug = FALSE;
 
 struct spu_global spu;
 
+struct spu_vs_context draw;
 
 /**
  * Tell the PPU that this SPU has finished copying a buffer to
@@ -264,6 +266,18 @@ cmd_state_vertex_info(const struct vertex_info *vinfo)
 }
 
 
+/* Latch one vertex array's fetch parameters and flag the fetch state dirty. */
+static void
+cmd_state_vs_array_info(const struct cell_array_info *vs_info)
+{
+   const unsigned slot = vs_info->attr;
+   ASSERT(slot < PIPE_ATTRIB_MAX);
+   draw.vertex_fetch.format[slot] = vs_info->format;
+   draw.vertex_fetch.pitch[slot] = vs_info->pitch;
+   draw.vertex_fetch.src_ptr[slot] = vs_info->base;
+   draw.vertex_fetch.dirty = 1;
+}
+
 
 static void
 cmd_finish(void)
@@ -374,6 +388,20 @@ cmd_batch(uint opcode)
          cmd_state_vertex_info((struct vertex_info *) &buffer[pos+1]);
          pos += (1 + sizeof(struct vertex_info) / 4);
          break;
+      case CELL_CMD_STATE_VIEWPORT:
+         (void) memcpy(& draw.viewport, &buffer[pos+1],
+                       sizeof(struct pipe_viewport_state));
+         pos += (1 + sizeof(struct pipe_viewport_state) / 4);
+         break;
+      case CELL_CMD_STATE_VS_ARRAY_INFO:
+         cmd_state_vs_array_info((struct cell_array_info *) &buffer[pos+1]);
+         pos += (1 + sizeof(struct cell_array_info) / 4);
+         break;
+      case CELL_CMD_VS_EXECUTE:
+         spu_execute_vertex_shader(&draw,
+                                   (struct cell_command_vs *) &buffer[pos+1]);
+         pos += (1 + sizeof(struct cell_command_vs) / 4);
+         break;
       default:
          printf("SPU %u: bad opcode: 0x%x\n", spu.init.id, buffer[pos]);
          ASSERT(0);
diff --git a/src/mesa/pipe/cell/spu/spu_util.c b/src/mesa/pipe/cell/spu/spu_util.c
new file mode 100644
index 0000000000..ac373240c1
--- /dev/null
+++ b/src/mesa/pipe/cell/spu/spu_util.c
@@ -0,0 +1,165 @@
+#include "pipe/p_util.h"
+#include "pipe/p_shader_tokens.h"
+#include "pipe/tgsi/util/tgsi_parse.h"
+//#include "tgsi_build.h"
+#include "pipe/tgsi/util/tgsi_util.h"
+
+/**
+ * Look up the simple swizzle stored in \p reg for one channel.
+ * \param component  channel index, 0..3 selecting SwizzleX..SwizzleW
+ * \return the selector; asserts and returns 0 for any other index
+ */
+unsigned
+tgsi_util_get_src_register_swizzle(
+   const struct tgsi_src_register *reg,
+   unsigned component )
+{
+   if( component > 3 ) {
+      assert( 0 );
+      return 0;
+   }
+   return component == 0 ? reg->SwizzleX
+        : component == 1 ? reg->SwizzleY
+        : component == 2 ? reg->SwizzleZ
+        : reg->SwizzleW;
+}
+
+/**
+ * Look up the extended swizzle stored in \p reg for one channel.
+ * \param component  channel index, 0..3 selecting ExtSwizzleX..ExtSwizzleW
+ * \return the selector; asserts and returns 0 for any other index
+ */
+unsigned
+tgsi_util_get_src_register_extswizzle(
+   const struct tgsi_src_register_ext_swz *reg,
+   unsigned component )
+{
+   if( component > 3 ) {
+      assert( 0 );
+      return 0;
+   }
+   return component == 0 ? reg->ExtSwizzleX
+        : component == 1 ? reg->ExtSwizzleY
+        : component == 2 ? reg->ExtSwizzleZ
+        : reg->ExtSwizzleW;
+}
+
+/**
+ * Resolve the extended swizzle and then the simple swizzle of \p reg for
+ * channel \p component (0..3).
+ * \return a simple swizzle selector, or the extended ZERO/ONE constant selector
+ */
+unsigned
+tgsi_util_get_full_src_register_extswizzle(
+   const struct tgsi_full_src_register  *reg,
+   unsigned component )
+{
+   unsigned swizzle;
+
+   /*
+    * First, calculate the extended swizzle for a given channel. This will give
+    * us either a channel index into the simple swizzle or a constant 1 or 0.
+    */
+   swizzle = tgsi_util_get_src_register_extswizzle( &reg->SrcRegisterExtSwz, component );
+
+   assert (TGSI_SWIZZLE_X == TGSI_EXTSWIZZLE_X);
+   assert (TGSI_SWIZZLE_Y == TGSI_EXTSWIZZLE_Y);
+   assert (TGSI_SWIZZLE_Z == TGSI_EXTSWIZZLE_Z);
+   assert (TGSI_SWIZZLE_W == TGSI_EXTSWIZZLE_W);
+   assert (TGSI_EXTSWIZZLE_ZERO > TGSI_SWIZZLE_W);
+   assert (TGSI_EXTSWIZZLE_ONE > TGSI_SWIZZLE_W);
+
+   /*
+    * Second, feed that channel index (not the original component, which would
+    * discard the extended swizzle) through the simple swizzle. Constants stay.
+    */
+   if( swizzle <= TGSI_SWIZZLE_W ) {
+      swizzle = tgsi_util_get_src_register_swizzle( &reg->SrcRegister, swizzle );
+   }
+   return swizzle;
+}
+
+/**
+ * Look up the per-channel negate flag stored in \p reg.
+ * \param component  channel index, 0..3 selecting NegateX..NegateW
+ * \return the flag; asserts and returns 0 for any other index
+ */
+unsigned
+tgsi_util_get_src_register_extnegate(
+   const struct tgsi_src_register_ext_swz *reg,
+   unsigned component )
+{
+   if( component > 3 ) {
+      assert( 0 );
+      return 0;
+   }
+   return component == 0 ? reg->NegateX
+        : component == 1 ? reg->NegateY
+        : component == 2 ? reg->NegateZ
+        : reg->NegateW;
+}
+
+/**
+ * Store \p negate as the per-channel negate flag of \p reg.
+ * \param component  channel index, 0..3 selecting NegateX..NegateW;
+ *                   any other index asserts and leaves \p reg untouched
+ */
+void
+tgsi_util_set_src_register_extnegate(
+   struct tgsi_src_register_ext_swz *reg,
+   unsigned negate,
+   unsigned component )
+{
+   if( component == 0 ) {
+      reg->NegateX = negate;
+   } else if( component == 1 ) {
+      reg->NegateY = negate;
+   } else if( component == 2 ) {
+      reg->NegateZ = negate;
+   } else if( component == 3 ) {
+      reg->NegateW = negate;
+   } else {
+      assert( 0 );
+   }
+}
+
+/**
+ * Work out how the sign of one channel of a source operand is modified.
+ *
+ * When the Absolute modifier is set only the modifier's own Negate bit
+ * counts: TGSI_UTIL_SIGN_SET if it is set, TGSI_UTIL_SIGN_CLEAR otherwise.
+ * Without Absolute, the register Negate bit, the per-channel extended
+ * negate bit and the modifier Negate bit are combined by parity: an odd
+ * number of them gives TGSI_UTIL_SIGN_TOGGLE, an even number
+ * TGSI_UTIL_SIGN_KEEP.
+ * \param component  channel whose extended negate bit is consulted
+ * \return one of the TGSI_UTIL_SIGN_* values named above
+ */
+unsigned
+tgsi_util_get_full_src_register_sign_mode(
+   const struct tgsi_full_src_register *reg,
+   unsigned component )
+{
+   unsigned flips;
+
+   if( reg->SrcRegisterExtMod.Absolute ) {
+      /* Consider only the post-abs negation. */
+      return reg->SrcRegisterExtMod.Negate
+         ? TGSI_UTIL_SIGN_SET
+         : TGSI_UTIL_SIGN_CLEAR;
+   }
+
+   /* Accumulate the three negations; each one that is set flips the result. */
+   flips = reg->SrcRegister.Negate ? 1 : 0;
+   if( tgsi_util_get_src_register_extnegate( &reg->SrcRegisterExtSwz, component ) ) {
+      flips ^= 1;
+   }
+   if( reg->SrcRegisterExtMod.Negate ) {
+      flips ^= 1;
+   }
+
+   if( flips ) {
+      return TGSI_UTIL_SIGN_TOGGLE;
+   }
+   return TGSI_UTIL_SIGN_KEEP;
+}
diff --git a/src/mesa/pipe/cell/spu/spu_vertex_fetch.c b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
new file mode 100644
index 0000000000..b8f8c52eed
--- /dev/null
+++ b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
@@ -0,0 +1,493 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+#include "pipe/p_util.h"
+#include "pipe/p_state.h"
+#include "pipe/p_shader_tokens.h"
+#include "spu_exec.h"
+#include "spu_vertex_shader.h"
+
+
+#define DRAW_DBG 0
+
+
+/**
+ * Define fetch_NAME(): convert SZ components at 'ptr' to float with the
+ * CVT expression (it uses 'ptr' and 'i'); pad up to 4 from (0,0,0,1).
+ *
+ * This is probably needed/duplicated elsewhere, e.g. format
+ * conversion, texture sampling etc.
+ */
+#define FETCH_ATTRIB( NAME, SZ, CVT )			\
+static void						\
+fetch_##NAME(const void *ptr, float *attrib)		\
+{							\
+   static const float defaults[4] = { 0,0,0,1 };	\
+   int i;						\
+							\
+   for (i = 0; i < SZ; i++) {				\
+      attrib[i] = CVT;					\
+   }							\
+							\
+   for (; i < 4; i++) {					\
+      attrib[i] = defaults[i];				\
+   }							\
+}
+/* CVT_* expressions: need 'ptr' (source) and 'i' (component) in scope. */
+#define CVT_64_FLOAT   ((float) ((const double *) ptr)[i])
+#define CVT_32_FLOAT   (((const float *) ptr)[i])
+
+#define CVT_8_USCALED  ((float) ((const unsigned char *) ptr)[i])
+#define CVT_16_USCALED ((float) ((const unsigned short *) ptr)[i])
+#define CVT_32_USCALED ((float) ((const unsigned int *) ptr)[i])
+
+#define CVT_8_SSCALED  ((float) ((const signed char *) ptr)[i])  /* plain char may be unsigned */
+#define CVT_16_SSCALED ((float) ((const short *) ptr)[i])
+#define CVT_32_SSCALED ((float) ((const int *) ptr)[i])
+
+#define CVT_8_UNORM    ((float) ((const unsigned char *) ptr)[i] / 255.0f)
+#define CVT_16_UNORM   ((float) ((const unsigned short *) ptr)[i] / 65535.0f)
+#define CVT_32_UNORM   ((float) ((const unsigned int *) ptr)[i] / 4294967295.0f)
+
+#define CVT_8_SNORM    ((float) ((const signed char *) ptr)[i] / 127.0f)  /* plain char may be unsigned */
+#define CVT_16_SNORM   ((float) ((const short *) ptr)[i] / 32767.0f)
+#define CVT_32_SNORM   ((float) ((const int *) ptr)[i] / 2147483647.0f)
+/* Instantiate one fetch_<FORMAT>() converter per supported vertex format. */
+FETCH_ATTRIB( R64G64B64A64_FLOAT,   4, CVT_64_FLOAT )
+FETCH_ATTRIB( R64G64B64_FLOAT,      3, CVT_64_FLOAT )
+FETCH_ATTRIB( R64G64_FLOAT,         2, CVT_64_FLOAT )
+FETCH_ATTRIB( R64_FLOAT,            1, CVT_64_FLOAT )
+
+FETCH_ATTRIB( R32G32B32A32_FLOAT,   4, CVT_32_FLOAT )
+FETCH_ATTRIB( R32G32B32_FLOAT,      3, CVT_32_FLOAT )
+FETCH_ATTRIB( R32G32_FLOAT,         2, CVT_32_FLOAT )
+FETCH_ATTRIB( R32_FLOAT,            1, CVT_32_FLOAT )
+
+FETCH_ATTRIB( R32G32B32A32_USCALED, 4, CVT_32_USCALED )
+FETCH_ATTRIB( R32G32B32_USCALED,    3, CVT_32_USCALED )
+FETCH_ATTRIB( R32G32_USCALED,       2, CVT_32_USCALED )
+FETCH_ATTRIB( R32_USCALED,          1, CVT_32_USCALED )
+
+FETCH_ATTRIB( R32G32B32A32_SSCALED, 4, CVT_32_SSCALED )
+FETCH_ATTRIB( R32G32B32_SSCALED,    3, CVT_32_SSCALED )
+FETCH_ATTRIB( R32G32_SSCALED,       2, CVT_32_SSCALED )
+FETCH_ATTRIB( R32_SSCALED,          1, CVT_32_SSCALED )
+
+FETCH_ATTRIB( R32G32B32A32_UNORM, 4, CVT_32_UNORM )
+FETCH_ATTRIB( R32G32B32_UNORM,    3, CVT_32_UNORM )
+FETCH_ATTRIB( R32G32_UNORM,       2, CVT_32_UNORM )
+FETCH_ATTRIB( R32_UNORM,          1, CVT_32_UNORM )
+
+FETCH_ATTRIB( R32G32B32A32_SNORM, 4, CVT_32_SNORM )
+FETCH_ATTRIB( R32G32B32_SNORM,    3, CVT_32_SNORM )
+FETCH_ATTRIB( R32G32_SNORM,       2, CVT_32_SNORM )
+FETCH_ATTRIB( R32_SNORM,          1, CVT_32_SNORM )
+
+FETCH_ATTRIB( R16G16B16A16_USCALED, 4, CVT_16_USCALED )
+FETCH_ATTRIB( R16G16B16_USCALED,    3, CVT_16_USCALED )
+FETCH_ATTRIB( R16G16_USCALED,       2, CVT_16_USCALED )
+FETCH_ATTRIB( R16_USCALED,          1, CVT_16_USCALED )
+
+FETCH_ATTRIB( R16G16B16A16_SSCALED, 4, CVT_16_SSCALED )
+FETCH_ATTRIB( R16G16B16_SSCALED,    3, CVT_16_SSCALED )
+FETCH_ATTRIB( R16G16_SSCALED,       2, CVT_16_SSCALED )
+FETCH_ATTRIB( R16_SSCALED,          1, CVT_16_SSCALED )
+
+FETCH_ATTRIB( R16G16B16A16_UNORM, 4, CVT_16_UNORM )
+FETCH_ATTRIB( R16G16B16_UNORM,    3, CVT_16_UNORM )
+FETCH_ATTRIB( R16G16_UNORM,       2, CVT_16_UNORM )
+FETCH_ATTRIB( R16_UNORM,          1, CVT_16_UNORM )
+
+FETCH_ATTRIB( R16G16B16A16_SNORM, 4, CVT_16_SNORM )
+FETCH_ATTRIB( R16G16B16_SNORM,    3, CVT_16_SNORM )
+FETCH_ATTRIB( R16G16_SNORM,       2, CVT_16_SNORM )
+FETCH_ATTRIB( R16_SNORM,          1, CVT_16_SNORM )
+
+FETCH_ATTRIB( R8G8B8A8_USCALED,   4, CVT_8_USCALED )
+FETCH_ATTRIB( R8G8B8_USCALED,     3, CVT_8_USCALED )
+FETCH_ATTRIB( R8G8_USCALED,       2, CVT_8_USCALED )
+FETCH_ATTRIB( R8_USCALED,         1, CVT_8_USCALED )
+
+FETCH_ATTRIB( R8G8B8A8_SSCALED,  4, CVT_8_SSCALED )
+FETCH_ATTRIB( R8G8B8_SSCALED,    3, CVT_8_SSCALED )
+FETCH_ATTRIB( R8G8_SSCALED,      2, CVT_8_SSCALED )
+FETCH_ATTRIB( R8_SSCALED,        1, CVT_8_SSCALED )
+
+FETCH_ATTRIB( R8G8B8A8_UNORM,  4, CVT_8_UNORM )
+FETCH_ATTRIB( R8G8B8_UNORM,    3, CVT_8_UNORM )
+FETCH_ATTRIB( R8G8_UNORM,      2, CVT_8_UNORM )
+FETCH_ATTRIB( R8_UNORM,        1, CVT_8_UNORM )
+
+FETCH_ATTRIB( R8G8B8A8_SNORM,  4, CVT_8_SNORM )
+FETCH_ATTRIB( R8G8B8_SNORM,    3, CVT_8_SNORM )
+FETCH_ATTRIB( R8G8_SNORM,      2, CVT_8_SNORM )
+FETCH_ATTRIB( R8_SNORM,        1, CVT_8_SNORM )
+
+FETCH_ATTRIB( A8R8G8B8_UNORM,       4, CVT_8_UNORM )
+/* NOTE(review): same conversion as R8G8B8A8_UNORM, no channel swizzle -- confirm. */
+
+/* Return the fetch_<FORMAT>() converter for 'format'; NULL for format 0
+ * and (after asserting) for any format without a converter. */
+static spu_fetch_func get_fetch_func( enum pipe_format format )
+{
+#if 0
+   {
+      char tmp[80];
+      pf_sprint_name(tmp, format);
+      _mesa_printf("%s: %s\n", __FUNCTION__, tmp);
+   }
+#endif
+
+   switch (format) {
+   case PIPE_FORMAT_R64_FLOAT:
+      return fetch_R64_FLOAT;
+   case PIPE_FORMAT_R64G64_FLOAT:
+      return fetch_R64G64_FLOAT;
+   case PIPE_FORMAT_R64G64B64_FLOAT:
+      return fetch_R64G64B64_FLOAT;
+   case PIPE_FORMAT_R64G64B64A64_FLOAT:
+      return fetch_R64G64B64A64_FLOAT;
+
+   case PIPE_FORMAT_R32_FLOAT:
+      return fetch_R32_FLOAT;
+   case PIPE_FORMAT_R32G32_FLOAT:
+      return fetch_R32G32_FLOAT;
+   case PIPE_FORMAT_R32G32B32_FLOAT:
+      return fetch_R32G32B32_FLOAT;
+   case PIPE_FORMAT_R32G32B32A32_FLOAT:
+      return fetch_R32G32B32A32_FLOAT;
+
+   case PIPE_FORMAT_R32_UNORM:
+      return fetch_R32_UNORM;
+   case PIPE_FORMAT_R32G32_UNORM:
+      return fetch_R32G32_UNORM;
+   case PIPE_FORMAT_R32G32B32_UNORM:
+      return fetch_R32G32B32_UNORM;
+   case PIPE_FORMAT_R32G32B32A32_UNORM:
+      return fetch_R32G32B32A32_UNORM;
+
+   case PIPE_FORMAT_R32_USCALED:
+      return fetch_R32_USCALED;
+   case PIPE_FORMAT_R32G32_USCALED:
+      return fetch_R32G32_USCALED;
+   case PIPE_FORMAT_R32G32B32_USCALED:
+      return fetch_R32G32B32_USCALED;
+   case PIPE_FORMAT_R32G32B32A32_USCALED:
+      return fetch_R32G32B32A32_USCALED;
+
+   case PIPE_FORMAT_R32_SNORM:
+      return fetch_R32_SNORM;
+   case PIPE_FORMAT_R32G32_SNORM:
+      return fetch_R32G32_SNORM;
+   case PIPE_FORMAT_R32G32B32_SNORM:
+      return fetch_R32G32B32_SNORM;
+   case PIPE_FORMAT_R32G32B32A32_SNORM:
+      return fetch_R32G32B32A32_SNORM;
+
+   case PIPE_FORMAT_R32_SSCALED:
+      return fetch_R32_SSCALED;
+   case PIPE_FORMAT_R32G32_SSCALED:
+      return fetch_R32G32_SSCALED;
+   case PIPE_FORMAT_R32G32B32_SSCALED:
+      return fetch_R32G32B32_SSCALED;
+   case PIPE_FORMAT_R32G32B32A32_SSCALED:
+      return fetch_R32G32B32A32_SSCALED;
+
+   case PIPE_FORMAT_R16_UNORM:
+      return fetch_R16_UNORM;
+   case PIPE_FORMAT_R16G16_UNORM:
+      return fetch_R16G16_UNORM;
+   case PIPE_FORMAT_R16G16B16_UNORM:
+      return fetch_R16G16B16_UNORM;
+   case PIPE_FORMAT_R16G16B16A16_UNORM:
+      return fetch_R16G16B16A16_UNORM;
+
+   case PIPE_FORMAT_R16_USCALED:
+      return fetch_R16_USCALED;
+   case PIPE_FORMAT_R16G16_USCALED:
+      return fetch_R16G16_USCALED;
+   case PIPE_FORMAT_R16G16B16_USCALED:
+      return fetch_R16G16B16_USCALED;
+   case PIPE_FORMAT_R16G16B16A16_USCALED:
+      return fetch_R16G16B16A16_USCALED;
+
+   case PIPE_FORMAT_R16_SNORM:
+      return fetch_R16_SNORM;
+   case PIPE_FORMAT_R16G16_SNORM:
+      return fetch_R16G16_SNORM;
+   case PIPE_FORMAT_R16G16B16_SNORM:
+      return fetch_R16G16B16_SNORM;
+   case PIPE_FORMAT_R16G16B16A16_SNORM:
+      return fetch_R16G16B16A16_SNORM;
+
+   case PIPE_FORMAT_R16_SSCALED:
+      return fetch_R16_SSCALED;
+   case PIPE_FORMAT_R16G16_SSCALED:
+      return fetch_R16G16_SSCALED;
+   case PIPE_FORMAT_R16G16B16_SSCALED:
+      return fetch_R16G16B16_SSCALED;
+   case PIPE_FORMAT_R16G16B16A16_SSCALED:
+      return fetch_R16G16B16A16_SSCALED;
+
+   case PIPE_FORMAT_R8_UNORM:
+      return fetch_R8_UNORM;
+   case PIPE_FORMAT_R8G8_UNORM:
+      return fetch_R8G8_UNORM;
+   case PIPE_FORMAT_R8G8B8_UNORM:
+      return fetch_R8G8B8_UNORM;
+   case PIPE_FORMAT_R8G8B8A8_UNORM:
+      return fetch_R8G8B8A8_UNORM;
+
+   case PIPE_FORMAT_R8_USCALED:
+      return fetch_R8_USCALED;
+   case PIPE_FORMAT_R8G8_USCALED:
+      return fetch_R8G8_USCALED;
+   case PIPE_FORMAT_R8G8B8_USCALED:
+      return fetch_R8G8B8_USCALED;
+   case PIPE_FORMAT_R8G8B8A8_USCALED:
+      return fetch_R8G8B8A8_USCALED;
+
+   case PIPE_FORMAT_R8_SNORM:
+      return fetch_R8_SNORM;
+   case PIPE_FORMAT_R8G8_SNORM:
+      return fetch_R8G8_SNORM;
+   case PIPE_FORMAT_R8G8B8_SNORM:
+      return fetch_R8G8B8_SNORM;
+   case PIPE_FORMAT_R8G8B8A8_SNORM:
+      return fetch_R8G8B8A8_SNORM;
+
+   case PIPE_FORMAT_R8_SSCALED:
+      return fetch_R8_SSCALED;
+   case PIPE_FORMAT_R8G8_SSCALED:
+      return fetch_R8G8_SSCALED;
+   case PIPE_FORMAT_R8G8B8_SSCALED:
+      return fetch_R8G8B8_SSCALED;
+   case PIPE_FORMAT_R8G8B8A8_SSCALED:
+      return fetch_R8G8B8A8_SSCALED;
+
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
+      return fetch_A8R8G8B8_UNORM;
+
+   case 0:
+      return NULL;		/* not sure why this is needed */
+
+   default:
+      assert(0);   /* no converter exists for this format */
+      return NULL;
+   }
+}
+
+
+static void
+transpose_4x4( float *out, const float *in )
+{
+   /* out[4*r + c] = in[4*c + r]: rows of 'in' become columns of 'out'. */
+   unsigned r, c;
+
+   for (r = 0; r < 4; r++) {
+      for (c = 0; c < 4; c++) {
+         out[r * 4 + c] = in[c * 4 + r];
+      }
+   }
+}
+
+
+
+/**
+ * Fast path for two R32G32B32_FLOAT attributes (input slots 0 and 1).
+ * Fetched vertices get w = 1.  Lanes at or beyond 'count' are zeroed
+ * rather than fetched, since their elts[] entries are not meaningful.
+ */
+static void fetch_xyz_rgb( struct spu_vs_context *draw,
+                           struct spu_exec_machine *machine,
+                           const unsigned *elts,
+                           unsigned count )
+{
+   const unsigned *pitch   = draw->vertex_fetch.pitch;
+   const ubyte **src       = draw->vertex_fetch.src_ptr;
+   unsigned attr, i;
+
+   assert(count <= 4);
+
+   for (attr = 0; attr < 2; attr++) {
+      for (i = 0; i < 4; i++) {
+         /* Component c of lane i is addressed as out[4 * c]. */
+         float *out = &machine->Inputs[attr].xyzw[0].f[i];
+
+         if (i < count) {
+            const float *in = (const float *)(src[attr] + elts[i] * pitch[attr]);
+
+            out[0] = in[0];
+            out[4] = in[1];
+            out[8] = in[2];
+            out[12] = 1.0f;
+         }
+         else {
+            /* Missing vertex: do not dereference through elts[i]. */
+            out[0] = out[4] = out[8] = out[12] = 0.0f;
+         }
+      }
+   }
+}
+
+
+
+
+/**
+ * Fast path for two R32G32B32_FLOAT attributes (input slots 0 and 1)
+ * plus one R32G32_FLOAT attribute (input slot 2).  Fetched vertices
+ * get w = 1 and the two-component attribute gets z = 0.  Lanes at or
+ * beyond 'count' are zeroed rather than fetched.
+ */
+static void fetch_xyz_rgb_st( struct spu_vs_context *draw,
+                              struct spu_exec_machine *machine,
+                              const unsigned *elts,
+                              unsigned count )
+{
+   const unsigned *pitch   = draw->vertex_fetch.pitch;
+   const ubyte **src       = draw->vertex_fetch.src_ptr;
+   unsigned attr, i;
+
+   assert(count <= 4);
+
+   for (attr = 0; attr < 3; attr++) {
+      /* Only the first two attributes carry a third component. */
+      const int has_z = (attr < 2);
+
+      for (i = 0; i < 4; i++) {
+         /* Every attribute is written to its own input slot, so the
+          * third one must go to Inputs[2] and leave Inputs[1] alone.
+          */
+         float *out = &machine->Inputs[attr].xyzw[0].f[i];
+
+         if (i < count) {
+            const float *in =
+               (const float *)(src[attr] + elts[i] * pitch[attr]);
+
+            out[0] = in[0];
+            out[4] = in[1];
+            out[8] = has_z ? in[2] : 0.0f;
+            out[12] = 1.0f;
+         }
+         else {
+            /* Missing vertex: do not dereference through elts[i]. */
+            out[0] = out[4] = out[8] = out[12] = 0.0f;
+         }
+      }
+   }
+}
+
+
+
+
+/**
+ * Generic fetch path: convert every attribute of 'count' vertices
+ * (at most four) and store the result in machine->Inputs[].
+ */
+static void generic_vertex_fetch( struct spu_vs_context *draw,
+                                  struct spu_exec_machine *machine,
+                                  const unsigned *elts,
+                                  unsigned count )
+{
+   const unsigned num_attribs = draw->vertex_fetch.nr_attrs;
+   unsigned a;
+
+   assert(count <= 4);
+
+   /* One pass per vertex attribute (i.e. per vertex shader input);
+    * each pass handles all four vertex lanes of that attribute.
+    */
+   for (a = 0; a < num_attribs; a++) {
+      const spu_fetch_func convert = draw->vertex_fetch.fetch[a];
+      const unsigned stride = draw->vertex_fetch.pitch[a];
+      const ubyte *base = draw->vertex_fetch.src_ptr[a];
+      float aos[4][4];
+      unsigned v;
+
+      /* Convert the attribute of each requested vertex into one row
+       * of 'aos' (one float[4] per vertex).
+       *
+       * Writing straight into the transposed layout would be
+       * possible, but this code doubles as a prototype for an sse
+       * implementation, which would have difficulties doing that.
+       */
+      for (v = 0; v < count; v++) {
+         convert( base + elts[v] * stride, aos[v] );
+      }
+
+      /* Rows of vertices that were not requested are cleared. */
+      while (v < 4) {
+         aos[v][0] = aos[v][1] = aos[v][2] = aos[v][3] = 0;
+         v++;
+      }
+
+      /* Transpose into the per-component layout of the machine
+       * inputs.  All vertex shader inputs are treated as float[4]
+       * here even though a shader may use fewer components (e.g.
+       * only tex0.xy); special-casing that would need many more
+       * fetch functions unless code generation is used, but the
+       * transpose step could at least be reduced.
+       */
+      transpose_4x4( (float *) &machine->Inputs[a].xyzw[0].f[0],
+                     (float *) aos );
+   }
+}
+
+
+void spu_update_vertex_fetch( struct spu_vs_context *draw )
+{
+   const unsigned num_attribs = draw->vertex_fetch.nr_attrs;
+   const enum pipe_format *fmt = draw->vertex_fetch.format;
+   unsigned a;
+
+   /* Look up the per-attribute converter for each format in use. */
+   for (a = 0; a < num_attribs; a++) {
+      draw->vertex_fetch.fetch[a] = get_fetch_func(fmt[a]);
+   }
+
+   /* The generic routine handles any layout; the checks below swap
+    * in a specialized routine for two known attribute layouts.
+    */
+   draw->vertex_fetch.fetch_func = generic_vertex_fetch;
+
+   if (num_attribs == 2 &&
+       fmt[0] == PIPE_FORMAT_R32G32B32_FLOAT &&
+       fmt[1] == PIPE_FORMAT_R32G32B32_FLOAT) {
+      draw->vertex_fetch.fetch_func = fetch_xyz_rgb;
+   }
+   else if (num_attribs == 3 &&
+            fmt[0] == PIPE_FORMAT_R32G32B32_FLOAT &&
+            fmt[1] == PIPE_FORMAT_R32G32B32_FLOAT &&
+            fmt[2] == PIPE_FORMAT_R32G32_FLOAT) {
+      draw->vertex_fetch.fetch_func = fetch_xyz_rgb_st;
+   }
+}
diff --git a/src/mesa/pipe/cell/spu/spu_vertex_shader.c b/src/mesa/pipe/cell/spu/spu_vertex_shader.c
new file mode 100644
index 0000000000..e694ff729f
--- /dev/null
+++ b/src/mesa/pipe/cell/spu/spu_vertex_shader.c
@@ -0,0 +1,224 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  *   Brian Paul
+  *   Ian Romanick <idr@us.ibm.com>
+  */
+
+#include "pipe/p_util.h"
+#include "pipe/p_state.h"
+#include "pipe/p_shader_tokens.h"
+#include "spu_vertex_shader.h"
+#include "spu_exec.h"
+#include "pipe/draw/draw_private.h"
+#include "pipe/draw/draw_context.h"
+#include "pipe/cell/common.h"
+
+#define DBG_VS 0
+
+
+static INLINE unsigned
+compute_clipmask(const float *clip, /*const*/ float plane[][4], unsigned nr)
+{
+   const float x = clip[0], y = clip[1], z = clip[2], w = clip[3];
+   unsigned bits = 0;
+   unsigned p = 6;
+
+   /* The first six planes are hardwired: each one tests the sign
+    * of w plus or minus a single coordinate.
+    */
+   bits |= (-x + w < 0) ? CLIP_RIGHT_BIT  : 0;
+   bits |= ( x + w < 0) ? CLIP_LEFT_BIT   : 0;
+   bits |= (-y + w < 0) ? CLIP_TOP_BIT    : 0;
+   bits |= ( y + w < 0) ? CLIP_BOTTOM_BIT : 0;
+   bits |= (-z + w < 0) ? CLIP_FAR_BIT    : 0;
+   bits |= ( z + w < 0) ? CLIP_NEAR_BIT   : 0;
+
+   /* Planes 6 .. nr-1 come from 'plane' and set bit number p. */
+   for (; p < nr; p++) {
+      if (dot4(clip, plane[p]) < 0)
+         bits |= (1<<p);
+   }
+   return bits;
+}
+
+
+/**
+ * Transform vertices with the current vertex program/shader.
+ * Up to four vertices can be shaded at a time.
+ * \param draw  vertex shader context (fetch state, machine, viewport)
+ * \param elts  indexes of four input vertices
+ * \param count  number of vertices to shade [1..4]
+ * \param vOut  array of pointers to four output vertices
+ */
+static void
+run_vertex_program(struct spu_vs_context *draw,
+                   unsigned elts[4], unsigned count,
+                   struct vertex_header *vOut[])
+{
+   struct spu_exec_machine *machine = &draw->machine;
+   unsigned int j;
+
+   ALIGN16_DECL(struct spu_exec_vector, inputs, PIPE_ATTRIB_MAX);
+   ALIGN16_DECL(struct spu_exec_vector, outputs, PIPE_ATTRIB_MAX);
+   const float *scale = draw->viewport.scale;
+   const float *trans = draw->viewport.translate;
+
+   assert(count <= 4);
+
+   /* NOTE(review): constants are asserted 16-byte aligned here -- confirm it is required. */
+   ASSERT_ALIGN16(draw->constants);
+   machine->Consts = (float (*)[4]) draw->constants;
+
+   machine->Inputs = ALIGN16_ASSIGN(inputs);
+   machine->Outputs = ALIGN16_ASSIGN(outputs);
+
+   spu_vertex_fetch( draw, machine, elts, count );
+
+   /* run shader */
+   spu_exec_machine_run( machine );
+
+
+   /* store machine results */
+   for (j = 0; j < count; j++) {
+      unsigned slot;
+      float x, y, z, w;
+
+      /* Handle attr[0] (position) specially:
+       *
+       * XXX: Computing the clipmask should be done in the vertex
+       * program as a set of DP4 instructions appended to the
+       * user-provided code.
+       */
+      x = vOut[j]->clip[0] = machine->Outputs[0].xyzw[0].f[j];
+      y = vOut[j]->clip[1] = machine->Outputs[0].xyzw[1].f[j];
+      z = vOut[j]->clip[2] = machine->Outputs[0].xyzw[2].f[j];
+      w = vOut[j]->clip[3] = machine->Outputs[0].xyzw[3].f[j];
+
+      vOut[j]->clipmask = compute_clipmask(vOut[j]->clip, draw->plane,
+					   draw->nr_planes);
+      vOut[j]->edgeflag = 1;
+
+      /* divide by w */
+      w = 1.0f / w;
+      x *= w;
+      y *= w;
+      z *= w;
+
+      /* Viewport mapping */
+      vOut[j]->data[0][0] = x * scale[0] + trans[0];
+      vOut[j]->data[0][1] = y * scale[1] + trans[1];
+      vOut[j]->data[0][2] = z * scale[2] + trans[2];
+      vOut[j]->data[0][3] = w;   /* holds 1/w after the divide above */
+
+#if DBG_VS
+      printf("output[%d]win: %f %f %f %f\n", j,
+             vOut[j]->data[0][0],
+             vOut[j]->data[0][1],
+             vOut[j]->data[0][2],
+             vOut[j]->data[0][3]);
+#endif
+      /* Remaining attributes are packed into sequential post-transform
+       * vertex attrib slots.
+       */
+      for (slot = 1; slot < draw->num_vs_outputs; slot++) {
+         vOut[j]->data[slot][0] = machine->Outputs[slot].xyzw[0].f[j];
+         vOut[j]->data[slot][1] = machine->Outputs[slot].xyzw[1].f[j];
+         vOut[j]->data[slot][2] = machine->Outputs[slot].xyzw[2].f[j];
+         vOut[j]->data[slot][3] = machine->Outputs[slot].xyzw[3].f[j];
+#if DBG_VS
+         printf("output[%d][%d]: %f %f %f %f\n", j, slot,
+                vOut[j]->data[slot][0],
+                vOut[j]->data[slot][1],
+                vOut[j]->data[slot][2],
+                vOut[j]->data[slot][3]);
+#endif
+      }
+   } /* loop over vertices */
+}
+
+/* Bind shader state; a NULL or oversized plane list selects no user planes. */
+static void
+spu_bind_vertex_shader(struct spu_vs_context *draw,
+                       void *uniforms, void *planes,
+                       unsigned nr_planes, unsigned num_outputs)
+{
+   const unsigned max_planes = sizeof(draw->plane) / sizeof(draw->plane[0]);
+   draw->constants = (float (*)[4]) uniforms;
+
+   /* Never copy from NULL or past the end of draw->plane[]. */
+   if (planes == NULL || nr_planes > max_planes)
+      nr_planes = 0;
+   else
+      (void) memcpy(draw->plane, planes, sizeof(draw->plane[0]) * nr_planes);
+   draw->nr_planes = nr_planes;
+   draw->num_vs_outputs = num_outputs;
+
+   /* specify the shader to interpret/execute */
+   spu_exec_machine_init(&draw->machine, PIPE_MAX_SAMPLERS,
+                         NULL /*samplers*/, PIPE_SHADER_VERTEX);
+}
+
+/* Run the vertex shader described by 'vs' over its elements, 4 per batch. */
+void
+spu_execute_vertex_shader(struct spu_vs_context *draw,
+                          const struct cell_command_vs *vs)
+{
+   unsigned i, j;
+
+   draw->machine.Instructions = (struct tgsi_full_instruction *)
+       vs->shader.instructions;
+   draw->machine.NumInstructions = vs->shader.num_instructions;
+
+   draw->machine.Declarations = (struct tgsi_full_declaration *)
+       vs->shader.declarations;
+   draw->machine.NumDeclarations = vs->shader.num_declarations;
+
+   /* No user clip planes: a count of -1 would wrap to UINT_MAX here. */
+   spu_bind_vertex_shader(draw, vs->shader.uniforms, NULL, 0,
+                          vs->shader.num_outputs);
+
+   for (i = 0; i < vs->num_elts; i += 4) {
+      const unsigned batch_size = MIN2(vs->num_elts - i, 4);
+      unsigned elts[4] = { 0, 0, 0, 0 };  /* lanes past batch_size stay 0 */
+
+      for (j = 0; j < batch_size; j++) {
+         switch (vs->bytes_per_elt) {
+         case 1: elts[j] = ((unsigned char *) vs->elts)[i + j]; break;
+         case 2: elts[j] = ((unsigned short *)vs->elts)[i + j]; break;
+         case 4: elts[j] = ((unsigned int *)  vs->elts)[i + j]; break;
+         default: assert(0); break;  /* unsupported index size */
+         }
+      }
+      /* NOTE(review): every batch gets the same vOut base -- confirm. */
+      run_vertex_program(draw, elts, batch_size,
+                         (struct vertex_header (*)[]) vs->vOut);
+   }
+}
diff --git a/src/mesa/pipe/cell/spu/spu_vertex_shader.h b/src/mesa/pipe/cell/spu/spu_vertex_shader.h
new file mode 100644
index 0000000000..c52f38fd02
--- /dev/null
+++ b/src/mesa/pipe/cell/spu/spu_vertex_shader.h
@@ -0,0 +1,61 @@
+#ifndef SPU_VERTEX_SHADER_H
+#define SPU_VERTEX_SHADER_H
+
+#include "pipe/p_format.h"
+#include "spu_exec.h"
+
+struct spu_vs_context;
+
+typedef void (*spu_fetch_func)(const void *ptr, float *attrib);
+typedef void (*spu_full_fetch_func)( struct spu_vs_context *draw,
+				     struct spu_exec_machine *machine,
+				     const unsigned *elts,
+				     unsigned count );
+
+struct spu_vs_context {
+   struct pipe_viewport_state viewport;  /**< scale/translate for the viewport mapping */
+
+   struct {
+      const ubyte *src_ptr[PIPE_ATTRIB_MAX];    /**< base address of each attribute */
+      unsigned pitch[PIPE_ATTRIB_MAX];          /**< byte offset between consecutive elements */
+      enum pipe_format format[PIPE_ATTRIB_MAX]; /**< selects the per-attribute fetch function */
+      unsigned nr_attrs;                        /**< number of entries in use above */
+      boolean dirty;                            /**< set: fetch functions are re-derived before the next fetch */
+
+      spu_fetch_func fetch[PIPE_ATTRIB_MAX];    /**< per-attribute converters */
+      spu_full_fetch_func fetch_func;           /**< fetches all attributes of a batch */
+   } vertex_fetch;
+   
+   /* Clip derived state:
+    */
+   float plane[12][4];   /**< clip planes; entries 6 and up are tested with dot4() */
+   unsigned nr_planes;   /**< planes to test, counting the six hardwired ones */
+
+   struct spu_exec_machine machine;   /**< shader interpreter state */
+   const float (*constants)[4];       /**< shader constants, handed to machine.Consts */
+
+   unsigned num_vs_outputs;   /**< output slots copied to each shaded vertex */
+};
+
+extern void spu_update_vertex_fetch(struct spu_vs_context *draw);
+
+static INLINE void spu_vertex_fetch(struct spu_vs_context *draw,
+                                    struct spu_exec_machine *machine,
+                                    const unsigned *elts,
+                                    unsigned count)
+{
+   /* Re-derive the fetch functions first if the fetch state changed. */
+   if (draw->vertex_fetch.dirty) {
+      spu_update_vertex_fetch(draw);
+      draw->vertex_fetch.dirty = 0;
+   }
+   draw->vertex_fetch.fetch_func(draw, machine, elts, count);
+}
+
+struct cell_command_vs;
+
+extern void
+spu_execute_vertex_shader(struct spu_vs_context *draw,
+			  const struct cell_command_vs *vs);
+
+#endif /* SPU_VERTEX_SHADER_H */
-- 
cgit v1.2.3


From 33cac4824195337d9cf3dfda3fc1147c429ae43c Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Tue, 29 Jan 2008 10:56:53 -0800
Subject: Initial pass at instruction / declaration fetch

---
 src/mesa/pipe/cell/spu/spu_exec.c | 22 ++++++++++++++++++++++
 src/mesa/pipe/cell/spu/spu_main.h |  1 +
 2 files changed, 23 insertions(+)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/spu/spu_exec.c b/src/mesa/pipe/cell/spu/spu_exec.c
index 6888e97caf..f43278198e 100644
--- a/src/mesa/pipe/cell/spu/spu_exec.c
+++ b/src/mesa/pipe/cell/spu/spu_exec.c
@@ -50,6 +50,9 @@
  *   Brian Paul
  */
 
+#include <libmisc.h>
+#include <spu_mfcio.h>
+
 #include "pipe/p_compiler.h"
 #include "pipe/p_state.h"
 #include "pipe/p_util.h"
@@ -57,6 +60,7 @@
 #include "pipe/tgsi/util/tgsi_parse.h"
 #include "pipe/tgsi/util/tgsi_util.h"
 #include "spu_exec.h"
+#include "spu_main.h"
 
 #define TILE_TOP_LEFT     0
 #define TILE_TOP_RIGHT    1
@@ -2329,12 +2333,30 @@ spu_exec_machine_run( struct spu_exec_machine *mach )
 
    /* execute declarations (interpolants) */
    for (i = 0; i < mach->NumDeclarations; i++) {
+      uint8_t buffer[sizeof(struct tgsi_full_declaration) + 32] ALIGN16_ATTRIB;
+      struct tgsi_full_declaration decl;
+      unsigned long decl_addr = (unsigned long) (mach->Declarations+i);
+      unsigned size = ((sizeof(decl) + (decl_addr & 0x0f) + 0x0f) & ~0x0f);
+
+      mfc_get(buffer, decl_addr & ~0x0f, size, TAG_INSTRUCTION_FETCH, 0, 0);
+      wait_on_mask(1 << TAG_INSTRUCTION_FETCH);
+
+      memcpy(& decl, buffer + (decl_addr & 0x0f), sizeof(decl));
       exec_declaration( mach, mach->Declarations+i );
    }
 
    /* execute instructions, until pc is set to -1 */
    while (pc != -1) {
+      uint8_t buffer[sizeof(struct tgsi_full_instruction) + 32] ALIGN16_ATTRIB;
+      struct tgsi_full_instruction inst;
+      unsigned long inst_addr = (unsigned long) (mach->Instructions + pc);
+      unsigned size = ((sizeof(inst) + (inst_addr & 0x0f) + 0x0f) & ~0x0f);
+
       assert(pc < mach->NumInstructions);
+      mfc_get(buffer, inst_addr & ~0x0f, size, TAG_INSTRUCTION_FETCH, 0, 0);
+      wait_on_mask(1 << TAG_INSTRUCTION_FETCH);
+
+      memcpy(& inst, buffer + (inst_addr & 0x0f), sizeof(inst));
       exec_instruction( mach, mach->Instructions + pc, &pc );
    }
 
diff --git a/src/mesa/pipe/cell/spu/spu_main.h b/src/mesa/pipe/cell/spu/spu_main.h
index 73f9ed29d6..8be5268f52 100644
--- a/src/mesa/pipe/cell/spu/spu_main.h
+++ b/src/mesa/pipe/cell/spu/spu_main.h
@@ -96,6 +96,7 @@ extern boolean Debug;
 #define TAG_BATCH_BUFFER      17
 #define TAG_MISC              18
 #define TAG_TEXTURE_TILE      19
+#define TAG_INSTRUCTION_FETCH 20
 
 
-- 
cgit v1.2.3


From 13eec106881b846538bef13d694c9d2d9cf1ae6b Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Tue, 29 Jan 2008 11:28:06 -0800
Subject: Implement vertex fetch / vertex shader output write-back

---
 src/mesa/pipe/cell/spu/spu_vertex_fetch.c  | 32 +++++++++++----
 src/mesa/pipe/cell/spu/spu_vertex_shader.c | 62 +++++++++++++++---------------
 src/mesa/pipe/draw/draw_context.c          |  5 ++-
 3 files changed, 58 insertions(+), 41 deletions(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/spu/spu_vertex_fetch.c b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
index b8f8c52eed..0192227d57 100644
--- a/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
+++ b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
@@ -30,11 +30,13 @@
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
 
+#include <spu_mfcio.h>
 #include "pipe/p_util.h"
 #include "pipe/p_state.h"
 #include "pipe/p_shader_tokens.h"
 #include "spu_exec.h"
 #include "spu_vertex_shader.h"
+#include "spu_main.h"
 
 
 #define DRAW_DBG 0
@@ -412,16 +414,18 @@ static void fetch_xyz_rgb_st( struct spu_vs_context *draw,
 /**
  * Fetch vertex attributes for 'count' vertices.
  */
-static void generic_vertex_fetch( struct spu_vs_context *draw,
-				  struct spu_exec_machine *machine,
-				  const unsigned *elts,
-				  unsigned count )
+static void generic_vertex_fetch(struct spu_vs_context *draw,
+                                 struct spu_exec_machine *machine,
+                                 const unsigned *elts,
+                                 unsigned count)
 {
    unsigned nr_attrs = draw->vertex_fetch.nr_attrs;
    unsigned attr;
 
    assert(count <= 4);
 
+   wait_on_mask(1 << TAG_VERTEX_BUFFER);
+
 //   _mesa_printf("%s %d\n", __FUNCTION__, count);
 
    /* loop over vertex attributes (vertex shader inputs)
@@ -441,13 +445,23 @@ static void generic_vertex_fetch( struct spu_vs_context *draw,
        * a prototype for an sse implementation, which would have
        * difficulties doing that.
        */
-      for (i = 0; i < count; i++) 
-	 fetch( src + elts[i] * pitch, p[i] );
+      for (i = 0; i < count; i++) {
+         uint8_t buffer[32 + (sizeof(float) * 4)] ALIGN16_ATTRIB;
+         const unsigned long addr = src + elts[i] * pitch;
+         const unsigned size = (sizeof(float) * 4) + (addr & 0x0f);
+
+         mfc_get(buffer, addr & ~0x0f, size, TAG_VERTEX_BUFFER, 0, 0);
+         wait_on_mask(1 << TAG_VERTEX_BUFFER);
+
+         memcpy(& buffer, buffer + (addr & 0x0f), sizeof(float) * 4);
+
+         fetch(buffer, p[i]);
+      }
 
       /* Be nice and zero out any missing vertices: 
        */
       for (/* empty */; i < 4; i++) 
-	 p[i][0] = p[i][1] = p[i][2] = p[i][3] = 0;
+          p[i][0] = p[i][1] = p[i][2] = p[i][3] = 0;
       
       /* Transpose/swizzle into sse-friendly format.  Currently
        * assuming that all vertex shader inputs are float[4], but this
@@ -475,6 +489,9 @@ void spu_update_vertex_fetch( struct spu_vs_context *draw )
 
    draw->vertex_fetch.fetch_func = generic_vertex_fetch;
 
+   /* Disable the fast path because they don't use mfc_get yet.
+    */
+#if 0
    switch (draw->vertex_fetch.nr_attrs) {
    case 2:
       if (draw->vertex_fetch.format[0] == PIPE_FORMAT_R32G32B32_FLOAT &&
@@ -490,4 +507,5 @@ void spu_update_vertex_fetch( struct spu_vs_context *draw )
    default:
       break;
    }
+#endif
 }
diff --git a/src/mesa/pipe/cell/spu/spu_vertex_shader.c b/src/mesa/pipe/cell/spu/spu_vertex_shader.c
index e694ff729f..595f54b0eb 100644
--- a/src/mesa/pipe/cell/spu/spu_vertex_shader.c
+++ b/src/mesa/pipe/cell/spu/spu_vertex_shader.c
@@ -32,6 +32,8 @@
   *   Ian Romanick <idr@us.ibm.com>
   */
 
+#include <spu_mfcio.h>
+
 #include "pipe/p_util.h"
 #include "pipe/p_state.h"
 #include "pipe/p_shader_tokens.h"
@@ -40,9 +42,7 @@
 #include "pipe/draw/draw_private.h"
 #include "pipe/draw/draw_context.h"
 #include "pipe/cell/common.h"
-
-#define DBG_VS 0
-
+#include "spu_main.h"
 
 static INLINE unsigned
 compute_clipmask(const float *clip, /*const*/ float plane[][4], unsigned nr)
@@ -110,6 +110,12 @@ run_vertex_program(struct spu_vs_context *draw,
    for (j = 0; j < count; j++) {
       unsigned slot;
       float x, y, z, w;
+      unsigned char buffer[sizeof(struct vertex_header)
+			   + MAX_VERTEX_SIZE] ALIGN16_ATTRIB;
+      struct vertex_header *const tmpOut =
+	  (struct vertex_header *) buffer;
+      const unsigned vert_size = sizeof(struct vertex_header)
+	  + (sizeof(float) * 4 * draw->num_vs_outputs);
 
       /* Handle attr[0] (position) specially:
        *
@@ -117,14 +123,14 @@ run_vertex_program(struct spu_vs_context *draw,
        * program as a set of DP4 instructions appended to the
        * user-provided code.
        */
-      x = vOut[j]->clip[0] = machine->Outputs[0].xyzw[0].f[j];
-      y = vOut[j]->clip[1] = machine->Outputs[0].xyzw[1].f[j];
-      z = vOut[j]->clip[2] = machine->Outputs[0].xyzw[2].f[j];
-      w = vOut[j]->clip[3] = machine->Outputs[0].xyzw[3].f[j];
+      x = tmpOut->clip[0] = machine->Outputs[0].xyzw[0].f[j];
+      y = tmpOut->clip[1] = machine->Outputs[0].xyzw[1].f[j];
+      z = tmpOut->clip[2] = machine->Outputs[0].xyzw[2].f[j];
+      w = tmpOut->clip[3] = machine->Outputs[0].xyzw[3].f[j];
 
-      vOut[j]->clipmask = compute_clipmask(vOut[j]->clip, draw->plane,
+      tmpOut->clipmask = compute_clipmask(tmpOut->clip, draw->plane,
 					   draw->nr_planes);
-      vOut[j]->edgeflag = 1;
+      tmpOut->edgeflag = 1;
 
       /* divide by w */
       w = 1.0f / w;
@@ -133,35 +139,27 @@ run_vertex_program(struct spu_vs_context *draw,
       z *= w;
 
       /* Viewport mapping */
-      vOut[j]->data[0][0] = x * scale[0] + trans[0];
-      vOut[j]->data[0][1] = y * scale[1] + trans[1];
-      vOut[j]->data[0][2] = z * scale[2] + trans[2];
-      vOut[j]->data[0][3] = w;
-
-#if DBG_VS
-      printf("output[%d]win: %f %f %f %f\n", j,
-             vOut[j]->data[0][0],
-             vOut[j]->data[0][1],
-             vOut[j]->data[0][2],
-             vOut[j]->data[0][3]);
-#endif
+      tmpOut->data[0][0] = x * scale[0] + trans[0];
+      tmpOut->data[0][1] = y * scale[1] + trans[1];
+      tmpOut->data[0][2] = z * scale[2] + trans[2];
+      tmpOut->data[0][3] = w;
+
       /* Remaining attributes are packed into sequential post-transform
        * vertex attrib slots.
        */
       for (slot = 1; slot < draw->num_vs_outputs; slot++) {
-         vOut[j]->data[slot][0] = machine->Outputs[slot].xyzw[0].f[j];
-         vOut[j]->data[slot][1] = machine->Outputs[slot].xyzw[1].f[j];
-         vOut[j]->data[slot][2] = machine->Outputs[slot].xyzw[2].f[j];
-         vOut[j]->data[slot][3] = machine->Outputs[slot].xyzw[3].f[j];
-#if DBG_VS
-         printf("output[%d][%d]: %f %f %f %f\n", j, slot,
-                vOut[j]->data[slot][0],
-                vOut[j]->data[slot][1],
-                vOut[j]->data[slot][2],
-                vOut[j]->data[slot][3]);
-#endif
+         tmpOut->data[slot][0] = machine->Outputs[slot].xyzw[0].f[j];
+         tmpOut->data[slot][1] = machine->Outputs[slot].xyzw[1].f[j];
+         tmpOut->data[slot][2] = machine->Outputs[slot].xyzw[2].f[j];
+         tmpOut->data[slot][3] = machine->Outputs[slot].xyzw[3].f[j];
       }
+
+      wait_on_mask(1 << TAG_VERTEX_BUFFER);
+      mfc_put(tmpOut, vOut[j], vert_size, TAG_VERTEX_BUFFER, 0, 0);
+
    } /* loop over vertices */
+
+   wait_on_mask(1 << TAG_VERTEX_BUFFER);
 }
 
 
diff --git a/src/mesa/pipe/draw/draw_context.c b/src/mesa/pipe/draw/draw_context.c
index e8ca1f035b..711bcd02f6 100644
--- a/src/mesa/pipe/draw/draw_context.c
+++ b/src/mesa/pipe/draw/draw_context.c
@@ -71,10 +71,11 @@ struct draw_context *draw_create( void )
     */
    {
       uint i;
-      char *tmp = (char*) MALLOC( Elements(draw->vcache.vertex) * MAX_VERTEX_SIZE );
+      const unsigned size = (MAX_VERTEX_SIZE + 0x0f) & ~0x0f;
+      char *tmp = align_malloc(Elements(draw->vcache.vertex) * size, 16);
 
       for (i = 0; i < Elements(draw->vcache.vertex); i++)
-	 draw->vcache.vertex[i] = (struct vertex_header *)(tmp + i * MAX_VERTEX_SIZE);
+	 draw->vcache.vertex[i] = (struct vertex_header *)(tmp + i * size);
    }
 
    draw->convert_wide_points = TRUE;
-- 
cgit v1.2.3


From fcf944177325cdf8bf6e4f1b70296c19476e2375 Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Wed, 30 Jan 2008 10:43:23 -0800
Subject: Pass ptr to local memory copy instead of main memory to
 exec_instruction

This was essentially a cut-and-paste bug when the instruction fetcher
was added.  Also, the test for TGSI_PROCESSOR_FRAGMENT was moved
outside the loop for exec_declaration.
---
 src/mesa/pipe/cell/spu/spu_exec.c | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/spu/spu_exec.c b/src/mesa/pipe/cell/spu/spu_exec.c
index f43278198e..b3db6716d5 100644
--- a/src/mesa/pipe/cell/spu/spu_exec.c
+++ b/src/mesa/pipe/cell/spu/spu_exec.c
@@ -2332,17 +2332,19 @@ spu_exec_machine_run( struct spu_exec_machine *mach )
 
 
    /* execute declarations (interpolants) */
-   for (i = 0; i < mach->NumDeclarations; i++) {
-      uint8_t buffer[sizeof(struct tgsi_full_declaration) + 32] ALIGN16_ATTRIB;
-      struct tgsi_full_declaration decl;
-      unsigned long decl_addr = (unsigned long) (mach->Declarations+i);
-      unsigned size = ((sizeof(decl) + (decl_addr & 0x0f) + 0x0f) & ~0x0f);
+   if( mach->Processor == TGSI_PROCESSOR_FRAGMENT ) {
+      for (i = 0; i < mach->NumDeclarations; i++) {
+	 uint8_t buffer[sizeof(struct tgsi_full_declaration) + 32] ALIGN16_ATTRIB;
+	 struct tgsi_full_declaration decl;
+	 unsigned long decl_addr = (unsigned long) (mach->Declarations+i);
+	 unsigned size = ((sizeof(decl) + (decl_addr & 0x0f) + 0x0f) & ~0x0f);
 
-      mfc_get(buffer, decl_addr & ~0x0f, size, TAG_INSTRUCTION_FETCH, 0, 0);
-      wait_on_mask(1 << TAG_INSTRUCTION_FETCH);
+	 mfc_get(buffer, decl_addr & ~0x0f, size, TAG_INSTRUCTION_FETCH, 0, 0);
+	 wait_on_mask(1 << TAG_INSTRUCTION_FETCH);
 
-      memcpy(& decl, buffer + (decl_addr & 0x0f), sizeof(decl));
-      exec_declaration( mach, mach->Declarations+i );
+	 memcpy(& decl, buffer + (decl_addr & 0x0f), sizeof(decl));
+	 exec_declaration( mach, decl );
+      }
    }
 
    /* execute instructions, until pc is set to -1 */
@@ -2357,7 +2359,7 @@ spu_exec_machine_run( struct spu_exec_machine *mach )
       wait_on_mask(1 << TAG_INSTRUCTION_FETCH);
 
       memcpy(& inst, buffer + (inst_addr & 0x0f), sizeof(inst));
-      exec_instruction( mach, mach->Instructions + pc, &pc );
+      exec_instruction( mach, & inst, &pc );
    }
 
 #if 0
-- 
cgit v1.2.3


From a0a707342a353024271f09cd52bd955d8df310a8 Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Wed, 30 Jan 2008 10:46:55 -0800
Subject: Missing ampersand in previous commit.  Oops.

---
 src/mesa/pipe/cell/spu/spu_exec.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/spu/spu_exec.c b/src/mesa/pipe/cell/spu/spu_exec.c
index b3db6716d5..85b5815cad 100644
--- a/src/mesa/pipe/cell/spu/spu_exec.c
+++ b/src/mesa/pipe/cell/spu/spu_exec.c
@@ -2343,7 +2343,7 @@ spu_exec_machine_run( struct spu_exec_machine *mach )
 	 wait_on_mask(1 << TAG_INSTRUCTION_FETCH);
 
 	 memcpy(& decl, buffer + (decl_addr & 0x0f), sizeof(decl));
-	 exec_declaration( mach, decl );
+	 exec_declaration( mach, &decl );
       }
    }
 
-- 
cgit v1.2.3


From 708d699e0cebb2dfbca7b6639ee5b177dc8c4c61 Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Wed, 30 Jan 2008 12:59:09 -0800
Subject: Fetch uniforms from main memory.

---
 src/mesa/pipe/cell/spu/spu_exec.c | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/spu/spu_exec.c b/src/mesa/pipe/cell/spu/spu_exec.c
index 85b5815cad..78f7d0962f 100644
--- a/src/mesa/pipe/cell/spu/spu_exec.c
+++ b/src/mesa/pipe/cell/spu/spu_exec.c
@@ -791,12 +791,23 @@ fetch_src_file_channel(
    case TGSI_EXTSWIZZLE_Z:
    case TGSI_EXTSWIZZLE_W:
       switch( file ) {
-      case TGSI_FILE_CONSTANT:
-         chan->f[0] = mach->Consts[index->i[0]][swizzle];
-         chan->f[1] = mach->Consts[index->i[1]][swizzle];
-         chan->f[2] = mach->Consts[index->i[2]][swizzle];
-         chan->f[3] = mach->Consts[index->i[3]][swizzle];
+      case TGSI_FILE_CONSTANT: {
+         unsigned char buffer[32] ALIGN16_ATTRIB;
+         unsigned i;
+
+         for (i = 0; i < 4; i++) {
+            const float *ptr = mach->Consts[index->i[i]];
+            const uint64_t addr = (uint64_t)(uintptr_t) ptr;
+            const unsigned size = ((addr & 0x0f) == 0) ? 16 : 32;
+
+            mfc_get(buffer, addr & ~0x0f, size, TAG_VERTEX_BUFFER, 0, 0);
+            wait_on_mask(1 << TAG_VERTEX_BUFFER);
+
+            (void) memcpy(& chan->f[i], &buffer[(addr & 0x0f) 
+                + (sizeof(float) * swizzle)], sizeof(float));
+         }
          break;
+      }
 
       case TGSI_FILE_INPUT:
          chan->u[0] = mach->Inputs[index->i[0]].xyzw[swizzle].u[0];
-- 
cgit v1.2.3


From 7b27d9fd660c122fb2ec50007129d67e78814587 Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Wed, 30 Jan 2008 17:26:22 -0800
Subject: Fix size calculation in attribute fetch.

---
 src/mesa/pipe/cell/spu/spu_vertex_fetch.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/spu/spu_vertex_fetch.c b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
index 0192227d57..1e846868e3 100644
--- a/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
+++ b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
@@ -446,14 +446,14 @@ static void generic_vertex_fetch(struct spu_vs_context *draw,
        * difficulties doing that.
        */
       for (i = 0; i < count; i++) {
-         uint8_t buffer[32 + (sizeof(float) * 4)] ALIGN16_ATTRIB;
-         const unsigned long addr = src + elts[i] * pitch;
-         const unsigned size = (sizeof(float) * 4) + (addr & 0x0f);
+         uint8_t buffer[32] ALIGN16_ATTRIB;
+         const unsigned long addr = src + (elts[i] * pitch);
+         const unsigned size = ((addr & 0x0f) == 0) ? 16 : 32;
 
          mfc_get(buffer, addr & ~0x0f, size, TAG_VERTEX_BUFFER, 0, 0);
          wait_on_mask(1 << TAG_VERTEX_BUFFER);
 
-         memcpy(& buffer, buffer + (addr & 0x0f), sizeof(float) * 4);
+         memmove(& buffer, buffer + (addr & 0x0f), 16);
 
          fetch(buffer, p[i]);
       }
-- 
cgit v1.2.3


From 334986114665df650649634b63184be6f1b9cd9b Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Wed, 30 Jan 2008 17:28:48 -0800
Subject: Implement micro_pow and micro_sqrt

Unimplemented micro ops get assertions for now.
---
 src/mesa/pipe/cell/spu/spu_exec.c | 43 ++++++++++++++++++++++++++++-----------
 1 file changed, 31 insertions(+), 12 deletions(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/spu/spu_exec.c b/src/mesa/pipe/cell/spu/spu_exec.c
index 78f7d0962f..168bada3bb 100644
--- a/src/mesa/pipe/cell/spu/spu_exec.c
+++ b/src/mesa/pipe/cell/spu/spu_exec.c
@@ -52,6 +52,8 @@
 
 #include <libmisc.h>
 #include <spu_mfcio.h>
+#include <simdmath/sqrtf4.h>
+#include <simdmath/powf4.h>
 
 #include "pipe/p_compiler.h"
 #include "pipe/p_state.h"
@@ -207,6 +209,7 @@ micro_ceil(
    union spu_exec_channel *dst,
    const union spu_exec_channel *src )
 {
+   ASSERT(0);
 #if 0
    dst->f[0] = (float) ceil( (double) src->f[0] );
    dst->f[1] = (float) ceil( (double) src->f[1] );
@@ -220,6 +223,7 @@ micro_cos(
    union spu_exec_channel *dst,
    const union spu_exec_channel *src )
 {
+   ASSERT(0);
 #if 0
    dst->f[0] = (float) cos( (double) src->f[0] );
    dst->f[1] = (float) cos( (double) src->f[1] );
@@ -307,6 +311,7 @@ micro_exp2(
    union spu_exec_channel *dst,
    const union spu_exec_channel *src)
 {
+   ASSERT(0);
 #if 0
    dst->f[0] = (float) pow( 2.0, (double) src->f[0] );
    dst->f[1] = (float) pow( 2.0, (double) src->f[1] );
@@ -342,6 +347,7 @@ micro_flr(
    union spu_exec_channel *dst,
    const union spu_exec_channel *src )
 {
+   ASSERT(0);
 #if 0
    dst->f[0] = (float) floor( (double) src->f[0] );
    dst->f[1] = (float) floor( (double) src->f[1] );
@@ -355,6 +361,7 @@ micro_frc(
    union spu_exec_channel *dst,
    const union spu_exec_channel *src )
 {
+   ASSERT(0);
 #if 0
    dst->f[0] = src->f[0] - (float) floor( (double) src->f[0] );
    dst->f[1] = src->f[1] - (float) floor( (double) src->f[1] );
@@ -393,6 +400,7 @@ micro_lg2(
    union spu_exec_channel *dst,
    const union spu_exec_channel *src )
 {
+   ASSERT(0);
 #if 0
    dst->f[0] = (float) log( (double) src->f[0] ) * 1.442695f;
    dst->f[1] = (float) log( (double) src->f[1] ) * 1.442695f;
@@ -649,12 +657,18 @@ micro_pow(
    const union spu_exec_channel *src0,
    const union spu_exec_channel *src1 )
 {
-#if 0
-   dst->f[0] = (float) pow( (double) src0->f[0], (double) src1->f[0] );
-   dst->f[1] = (float) pow( (double) src0->f[1], (double) src1->f[1] );
-   dst->f[2] = (float) pow( (double) src0->f[2], (double) src1->f[2] );
-   dst->f[3] = (float) pow( (double) src0->f[3], (double) src1->f[3] );
-#endif
+   vec_float4 s0 = (vec_float4) {
+      src0->f[0], src0->f[1], src0->f[2], src0->f[3]
+   };
+   vec_float4 s1 = (vec_float4) {
+      src1->f[0], src1->f[1], src1->f[2], src1->f[3]
+   };
+   vec_float4 d = _powf4(s0, s1);
+
+   dst->f[0] = spu_extract(d, 0);
+   dst->f[1] = spu_extract(d, 1);
+   dst->f[2] = spu_extract(d, 2);
+   dst->f[3] = spu_extract(d, 3);
 }
 
 static void
@@ -662,6 +676,7 @@ micro_rnd(
    union spu_exec_channel *dst,
    const union spu_exec_channel *src )
 {
+   ASSERT(0);
 #if 0
    dst->f[0] = (float) floor( (double) (src->f[0] + 0.5f) );
    dst->f[1] = (float) floor( (double) (src->f[1] + 0.5f) );
@@ -722,6 +737,7 @@ micro_sin(
    union spu_exec_channel *dst,
    const union spu_exec_channel *src )
 {
+   ASSERT(0);
 #if 0
    dst->f[0] = (float) sin( (double) src->f[0] );
    dst->f[1] = (float) sin( (double) src->f[1] );
@@ -734,12 +750,15 @@ static void
 micro_sqrt( union spu_exec_channel *dst,
             const union spu_exec_channel *src )
 {
-#if 0
-   dst->f[0] = (float) sqrt( (double) src->f[0] );
-   dst->f[1] = (float) sqrt( (double) src->f[1] );
-   dst->f[2] = (float) sqrt( (double) src->f[2] );
-   dst->f[3] = (float) sqrt( (double) src->f[3] );
-#endif
+   vec_float4 s = (vec_float4) {
+      src->f[0], src->f[1], src->f[2], src->f[3]
+   };
+   vec_float4 d = _sqrtf4(s);
+
+   dst->f[0] = spu_extract(d, 0);
+   dst->f[1] = spu_extract(d, 1);
+   dst->f[2] = spu_extract(d, 2);
+   dst->f[3] = spu_extract(d, 3);
 }
 
 static void
-- 
cgit v1.2.3


From 137cb72284a115d8f5ffadf2154b6f5eb5323a7d Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Wed, 30 Jan 2008 19:24:40 -0800
Subject: Elts are always ints, pass vOut pointers in-line in command

---
 src/mesa/pipe/cell/common.h                |  6 +++---
 src/mesa/pipe/cell/spu/spu_vertex_shader.c | 14 ++------------
 2 files changed, 5 insertions(+), 15 deletions(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/common.h b/src/mesa/pipe/cell/common.h
index 80a1425ec7..fbbdf728a1 100644
--- a/src/mesa/pipe/cell/common.h
+++ b/src/mesa/pipe/cell/common.h
@@ -144,13 +144,13 @@ struct cell_shader_info
 } ALIGN16_ATTRIB;
 
 
+#define SPU_VERTS_PER_BATCH 64
 struct cell_command_vs
 {
    struct cell_shader_info   shader;
-   void *elts;
    unsigned num_elts;
-   unsigned bytes_per_elt;
-   void *vOut;
+   unsigned elts[SPU_VERTS_PER_BATCH];
+   uint64_t vOut[SPU_VERTS_PER_BATCH];
 } ALIGN16_ATTRIB;
 
 
diff --git a/src/mesa/pipe/cell/spu/spu_vertex_shader.c b/src/mesa/pipe/cell/spu/spu_vertex_shader.c
index 595f54b0eb..82165501c5 100644
--- a/src/mesa/pipe/cell/spu/spu_vertex_shader.c
+++ b/src/mesa/pipe/cell/spu/spu_vertex_shader.c
@@ -81,7 +81,7 @@ compute_clipmask(const float *clip, /*const*/ float plane[][4], unsigned nr)
 static void
 run_vertex_program(struct spu_vs_context *draw,
                    unsigned elts[4], unsigned count,
-                   struct vertex_header *vOut[])
+                   const uint64_t *vOut)
 {
    struct spu_exec_machine *machine = &draw->machine;
    unsigned int j;
@@ -206,17 +206,7 @@ spu_execute_vertex_shader(struct spu_vs_context *draw,
    
    for (i = 0; i < vs->num_elts; i += 4) {
       const unsigned batch_size = MIN2(vs->num_elts - i, 4);
-      unsigned elts[4];
-
-      for (j = 0; j < batch_size; j++) {
-	 switch (vs->bytes_per_elt) {
-	 case 1: elts[j] = ((unsigned char *) vs->elts)[i + j]; break;
-	 case 2: elts[j] = ((unsigned short *)vs->elts)[i + j]; break;
-	 case 4: elts[j] = ((unsigned int *)  vs->elts)[i + j]; break;
-	 }
-      }
 
-      run_vertex_program(draw, elts, batch_size,
-			 (struct vertex_header (*)[]) vs->vOut);
+      run_vertex_program(draw, & vs->elts[i], batch_size, &vs->vOut[i]);
    }
 }
-- 
cgit v1.2.3


From fb348c2cb16d0bc216d29889474972d5c14d0980 Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Wed, 30 Jan 2008 19:25:47 -0800
Subject: Set machine->Processor

The default value is 0, which is TGSI_PROCESSOR_FRAGMENT...not correct
for a vertex shader!
---
 src/mesa/pipe/cell/spu/spu_vertex_shader.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/spu/spu_vertex_shader.c b/src/mesa/pipe/cell/spu/spu_vertex_shader.c
index 82165501c5..125b2c3a43 100644
--- a/src/mesa/pipe/cell/spu/spu_vertex_shader.c
+++ b/src/mesa/pipe/cell/spu/spu_vertex_shader.c
@@ -93,7 +93,8 @@ run_vertex_program(struct spu_vs_context *draw,
 
    assert(count <= 4);
 
-   /* Consts does not require 16 byte alignment. */
+   machine->Processor = TGSI_PROCESSOR_VERTEX;
+
    ASSERT_ALIGN16(draw->constants);
    machine->Consts = (float (*)[4]) draw->constants;
 
-- 
cgit v1.2.3


From 193491cbd3ad2ad95243181c201da4640f3a29c2 Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Wed, 30 Jan 2008 19:30:15 -0800
Subject: Handle CELL_CMD_VS_EXECUTE *only* outside batch commands.

---
 src/mesa/pipe/cell/common.h       | 3 ++-
 src/mesa/pipe/cell/spu/spu_main.c | 8 +++-----
 2 files changed, 5 insertions(+), 6 deletions(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/common.h b/src/mesa/pipe/cell/common.h
index fbbdf728a1..a40cfb8210 100644
--- a/src/mesa/pipe/cell/common.h
+++ b/src/mesa/pipe/cell/common.h
@@ -133,7 +133,6 @@ struct cell_array_info
 
 struct cell_shader_info
 {
-   unsigned processor;
    unsigned num_outputs;
 
    void *declarations;
@@ -147,6 +146,7 @@ struct cell_shader_info
 #define SPU_VERTS_PER_BATCH 64
 struct cell_command_vs
 {
+   uint opcode;       /**< CELL_CMD_VS_EXECUTE */
    struct cell_shader_info   shader;
    unsigned num_elts;
    unsigned elts[SPU_VERTS_PER_BATCH];
@@ -190,6 +190,7 @@ struct cell_command
    struct cell_command_framebuffer fb;
    struct cell_command_clear_surface clear;
    struct cell_command_render render;
+   struct cell_command_vs vs;
 } ALIGN16_ATTRIB;
 
 
diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index 9daa3ec735..7105c0f897 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -397,11 +397,6 @@ cmd_batch(uint opcode)
          cmd_state_vs_array_info((struct cell_array_info *) &buffer[pos+1]);
          pos += (1 + sizeof(struct cell_array_info) / 4);
          break;
-      case CELL_CMD_VS_EXECUTE:
-         spu_execute_vertex_shader(&draw,
-                                   (struct cell_command_vs *) &buffer[pos+1]);
-         pos += (1 + sizeof(struct cell_command_vs) / 4);
-         break;
       default:
          printf("SPU %u: bad opcode: 0x%x\n", spu.init.id, buffer[pos]);
          ASSERT(0);
@@ -470,6 +465,9 @@ main_loop(void)
             assert(pos_incr == 0);
          }
          break;
+      case CELL_CMD_VS_EXECUTE:
+         spu_execute_vertex_shader(&draw, &cmd.vs);
+         break;
       case CELL_CMD_BATCH:
          cmd_batch(opcode);
          break;
-- 
cgit v1.2.3


From 10270fbe2d362fe8f27384b9a5423381e2882460 Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Wed, 30 Jan 2008 19:33:30 -0800
Subject: Correctly read / write vertex header from / to main memory

---
 src/mesa/pipe/cell/spu/spu_vertex_shader.c | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/spu/spu_vertex_shader.c b/src/mesa/pipe/cell/spu/spu_vertex_shader.c
index 125b2c3a43..ea5ffae6bc 100644
--- a/src/mesa/pipe/cell/spu/spu_vertex_shader.c
+++ b/src/mesa/pipe/cell/spu/spu_vertex_shader.c
@@ -112,11 +112,16 @@ run_vertex_program(struct spu_vs_context *draw,
       unsigned slot;
       float x, y, z, w;
       unsigned char buffer[sizeof(struct vertex_header)
-			   + MAX_VERTEX_SIZE] ALIGN16_ATTRIB;
+          + MAX_VERTEX_SIZE] ALIGN16_ATTRIB;
       struct vertex_header *const tmpOut =
-	  (struct vertex_header *) buffer;
-      const unsigned vert_size = sizeof(struct vertex_header)
-	  + (sizeof(float) * 4 * draw->num_vs_outputs);
+          (struct vertex_header *) buffer;
+      const unsigned vert_size = ROUNDUP16(sizeof(struct vertex_header)
+                                           + (sizeof(float) * 4 
+                                              * draw->num_vs_outputs));
+
+      mfc_get(tmpOut, vOut[j], vert_size, TAG_VERTEX_BUFFER, 0, 0);
+      wait_on_mask(1 << TAG_VERTEX_BUFFER);
+
 
       /* Handle attr[0] (position) specially:
        *
@@ -155,12 +160,8 @@ run_vertex_program(struct spu_vs_context *draw,
          tmpOut->data[slot][3] = machine->Outputs[slot].xyzw[3].f[j];
       }
 
-      wait_on_mask(1 << TAG_VERTEX_BUFFER);
       mfc_put(tmpOut, vOut[j], vert_size, TAG_VERTEX_BUFFER, 0, 0);
-
    } /* loop over vertices */
-
-   wait_on_mask(1 << TAG_VERTEX_BUFFER);
 }
 
 
-- 
cgit v1.2.3


From de949a471ed66f0e6db0819bf55b2ec74b7e4048 Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Wed, 30 Jan 2008 19:34:22 -0800
Subject: cell_array_info should not be 16-byte aligned

Forcing cell_array_info to be 16-byte aligned makes it more difficult
to stuff that state in batch commands.
---
 src/mesa/pipe/cell/common.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/common.h b/src/mesa/pipe/cell/common.h
index a40cfb8210..533ad2cf6e 100644
--- a/src/mesa/pipe/cell/common.h
+++ b/src/mesa/pipe/cell/common.h
@@ -128,7 +128,7 @@ struct cell_array_info
     uint attr;                /**< Attribute that this state if for. */
     uint pitch;               /**< Byte pitch from one entry to the next. */
     enum pipe_format format;  /**< Pipe format of each entry. */
-} ALIGN16_ATTRIB;
+};
 
 
 struct cell_shader_info
-- 
cgit v1.2.3


From 9ad986b88763f6baefa73830dcd5762156ab9b20 Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Wed, 30 Jan 2008 19:40:24 -0800
Subject: Numerous small fixes to PPU-SPU vertex shader protocol

---
 src/mesa/pipe/cell/common.h                | 19 ++++++++++++-------
 src/mesa/pipe/cell/spu/spu_vertex_shader.c | 27 ++++++++++++++++++++++-----
 2 files changed, 34 insertions(+), 12 deletions(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/common.h b/src/mesa/pipe/cell/common.h
index 533ad2cf6e..28b0c59a0a 100644
--- a/src/mesa/pipe/cell/common.h
+++ b/src/mesa/pipe/cell/common.h
@@ -124,10 +124,10 @@ struct cell_command_clear_surface
  */
 struct cell_array_info
 {
-    void *base;               /**< Base address of the 0th element. */
-    uint attr;                /**< Attribute that this state if for. */
-    uint pitch;               /**< Byte pitch from one entry to the next. */
-    enum pipe_format format;  /**< Pipe format of each entry. */
+    uint64_t base;      /**< Base address of the 0th element. */
+    uint attr;          /**< Attribute that this state if for. */
+    uint pitch;         /**< Byte pitch from one entry to the next. */
+    uint format;        /**< Pipe format of each entry. */
 };
 
 
@@ -135,11 +135,13 @@ struct cell_shader_info
 {
    unsigned num_outputs;
 
-   void *declarations;
+   uint64_t declarations;
    unsigned num_declarations;
-   void *instructions;
+   uint64_t instructions;
    unsigned num_instructions;
-   void *uniforms;
+   uint64_t uniforms;
+   uint64_t  immediates;
+   unsigned num_immediates;
 } ALIGN16_ATTRIB;
 
 
@@ -151,6 +153,9 @@ struct cell_command_vs
    unsigned num_elts;
    unsigned elts[SPU_VERTS_PER_BATCH];
    uint64_t vOut[SPU_VERTS_PER_BATCH];
+   float plane[12][4];
+   unsigned nr_planes;
+   unsigned nr_attrs;
 } ALIGN16_ATTRIB;
 
 
diff --git a/src/mesa/pipe/cell/spu/spu_vertex_shader.c b/src/mesa/pipe/cell/spu/spu_vertex_shader.c
index ea5ffae6bc..c1cbbb6d1e 100644
--- a/src/mesa/pipe/cell/spu/spu_vertex_shader.c
+++ b/src/mesa/pipe/cell/spu/spu_vertex_shader.c
@@ -187,12 +187,22 @@ spu_bind_vertex_shader(struct spu_vs_context *draw,
 }
 
 
+unsigned char immediates[(sizeof(float) * 4 * TGSI_EXEC_NUM_IMMEDIATES) + 32]
+    ALIGN16_ATTRIB;
+
 void
 spu_execute_vertex_shader(struct spu_vs_context *draw,
-			  const struct cell_command_vs *vs)
+                          const struct cell_command_vs *vs)
 {
    unsigned i;
-   unsigned j;
+
+   const uint64_t immediate_addr = vs->shader.immediates;
+   const unsigned immediate_size = 
+       ROUNDUP16((sizeof(float) * 4 * vs->shader.num_immediates)
+                 + (immediate_addr & 0x0f));
+
+   mfc_get(immediates, immediate_addr & ~0x0f, immediate_size,
+           TAG_VERTEX_BUFFER, 0, 0);
 
    draw->machine.Instructions = (struct tgsi_full_instruction *)
        vs->shader.instructions;
@@ -202,10 +212,17 @@ spu_execute_vertex_shader(struct spu_vs_context *draw,
        vs->shader.declarations;
    draw->machine.NumDeclarations = vs->shader.num_declarations;
 
+   draw->vertex_fetch.nr_attrs = vs->nr_attrs;
+
+   wait_on_mask(1 << TAG_VERTEX_BUFFER);
+
+   (void) memcpy(& draw->machine.Imms, &immediates[immediate_addr & 0x0f],
+                 sizeof(float) * 4 * vs->shader.num_immediates);
+
    spu_bind_vertex_shader(draw, vs->shader.uniforms,
-			  NULL, -1,
-			  vs->shader.num_outputs);
-   
+                          vs->plane, vs->nr_planes,
+                          vs->shader.num_outputs);
+
    for (i = 0; i < vs->num_elts; i += 4) {
       const unsigned batch_size = MIN2(vs->num_elts - i, 4);
 
-- 
cgit v1.2.3


From 5a6fd9393021b9476273b7831bcda2186c9324a1 Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Wed, 30 Jan 2008 20:12:00 -0800
Subject: Use SPUs for vertex shader processing

---
 src/mesa/pipe/cell/ppu/Makefile             |   1 +
 src/mesa/pipe/cell/ppu/cell_context.c       |  12 ++-
 src/mesa/pipe/cell/ppu/cell_context.h       |   2 +
 src/mesa/pipe/cell/ppu/cell_vertex_shader.c | 118 ++++++++++++++++++++++++++++
 4 files changed, 132 insertions(+), 1 deletion(-)
 create mode 100644 src/mesa/pipe/cell/ppu/cell_vertex_shader.c

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/ppu/Makefile b/src/mesa/pipe/cell/ppu/Makefile
index e7f2562da7..50060f5cd3 100644
--- a/src/mesa/pipe/cell/ppu/Makefile
+++ b/src/mesa/pipe/cell/ppu/Makefile
@@ -34,6 +34,7 @@ SOURCES = \
 	cell_surface.c \
 	cell_texture.c \
 	cell_vbuf.c \
+	cell_vertex_shader.c \
 	cell_winsys.c
 
 
diff --git a/src/mesa/pipe/cell/ppu/cell_context.c b/src/mesa/pipe/cell/ppu/cell_context.c
index e8020a49bc..4885cd0d2c 100644
--- a/src/mesa/pipe/cell/ppu/cell_context.c
+++ b/src/mesa/pipe/cell/ppu/cell_context.c
@@ -39,6 +39,7 @@
 #include "pipe/p_winsys.h"
 #include "pipe/cell/common.h"
 #include "pipe/draw/draw_context.h"
+#include "pipe/draw/draw_private.h"
 #include "cell_clear.h"
 #include "cell_context.h"
 #include "cell_draw_arrays.h"
@@ -156,6 +157,15 @@ cell_destroy_context( struct pipe_context *pipe )
 }
 
 
+static struct draw_context *
+cell_draw_create(struct cell_context *cell)
+{
+   struct draw_context *draw = draw_create();
+
+   draw->shader_queue_flush = cell_vertex_shader_queue_flush;
+   draw->driver_private = cell;
+   return draw;
+}
 
 
 struct pipe_context *
@@ -242,7 +252,7 @@ cell_create_context(struct pipe_winsys *winsys, struct cell_winsys *cws)
 
    cell_init_surface_functions(cell);
 
-   cell->draw = draw_create();
+   cell->draw = cell_draw_create(cell);
 
    cell_init_vbuf(cell);
    draw_set_rasterize_stage(cell->draw, cell->vbuf);
diff --git a/src/mesa/pipe/cell/ppu/cell_context.h b/src/mesa/pipe/cell/ppu/cell_context.h
index 65b89518ad..3b63419b5e 100644
--- a/src/mesa/pipe/cell/ppu/cell_context.h
+++ b/src/mesa/pipe/cell/ppu/cell_context.h
@@ -126,6 +126,8 @@ cell_context(struct pipe_context *pipe)
 extern struct pipe_context *
 cell_create_context(struct pipe_winsys *ws, struct cell_winsys *cws);
 
+extern void
+cell_vertex_shader_queue_flush(struct draw_context *draw);
 
 
diff --git a/src/mesa/pipe/cell/ppu/cell_vertex_shader.c b/src/mesa/pipe/cell/ppu/cell_vertex_shader.c
new file mode 100644
index 0000000000..aef329a902
--- /dev/null
+++ b/src/mesa/pipe/cell/ppu/cell_vertex_shader.c
@@ -0,0 +1,118 @@
+/*
+ * (C) Copyright IBM Corporation 2008
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \file cell_vertex_shader.c
+ * Vertex shader interface routines for Cell.
+ *
+ * \author Ian Romanick <idr@us.ibm.com>
+ */
+
+#include "pipe/p_defines.h"
+#include "pipe/p_context.h"
+#include "pipe/p_winsys.h"
+
+#include "cell_context.h"
+#include "cell_draw_arrays.h"
+#include "cell_spu.h"
+#include "cell_batch.h"
+
+#include "pipe/cell/common.h"
+#include "pipe/draw/draw_context.h"
+#include "pipe/draw/draw_private.h"
+
+/**
+ * Run the vertex shader on all vertices in the vertex queue.
+ * Called by the draw module when the vertex cache needs to be flushed.
+ */
+void
+cell_vertex_shader_queue_flush(struct draw_context *draw)
+{
+   struct cell_context *const cell =
+       (struct cell_context *) draw->driver_private;
+   struct cell_command_vs *const vs = &cell_global.command[0].vs;
+   unsigned *batch;
+   struct cell_array_info array_info;
+   unsigned i, j;
+
+   assert(draw->vs.queue_nr != 0);
+
+   /* XXX: do this on statechange: 
+    */
+   draw_update_vertex_fetch(draw);
+
+   for (i = 0; i < draw->vertex_fetch.nr_attrs; i++) {
+      array_info.opcode = CELL_CMD_STATE_VS_ARRAY_INFO;
+      assert(draw->vertex_fetch.src_ptr[i] != NULL);
+      array_info.base = (uintptr_t) draw->vertex_fetch.src_ptr[i];
+      array_info.attr = i;
+      array_info.pitch = draw->vertex_fetch.pitch[i];
+      array_info.format = draw->vertex_element[i].src_format;
+
+      cell_batch_append(cell, & array_info, sizeof(array_info));
+   }
+
+   batch = cell_batch_alloc(cell, sizeof(unsigned)
+                            + sizeof(struct pipe_viewport_state));
+   batch[0] = CELL_CMD_STATE_VIEWPORT;
+   (void) memcpy(&batch[1], &draw->viewport,
+                 sizeof(struct pipe_viewport_state));
+
+   cell_batch_flush(cell);
+
+   vs->opcode = CELL_CMD_VS_EXECUTE;
+   vs->shader.num_outputs = draw->num_vs_outputs;
+   vs->shader.declarations = (uintptr_t) draw->machine.Declarations;
+   vs->shader.num_declarations = draw->machine.NumDeclarations;
+   vs->shader.instructions = (uintptr_t) draw->machine.Instructions;
+   vs->shader.num_instructions = draw->machine.NumInstructions;
+   vs->shader.uniforms = (uintptr_t) draw->user.constants;
+   vs->shader.immediates = (uintptr_t) draw->machine.Imms;
+   vs->shader.num_immediates = draw->machine.ImmLimit / 4;
+   vs->nr_attrs = draw->vertex_fetch.nr_attrs;
+
+   (void) memcpy(vs->plane, draw->plane, sizeof(draw->plane));
+   vs->nr_planes = draw->nr_planes;
+
+   for (i = 0; i < draw->vs.queue_nr; i += SPU_VERTS_PER_BATCH) {
+      const unsigned n = MIN2(SPU_VERTS_PER_BATCH, draw->vs.queue_nr - i);
+
+      for (j = 0; j < n; j++) {
+         vs->elts[j] = draw->vs.queue[i + j].elt;
+         vs->vOut[j] = (uintptr_t) draw->vs.queue[i + j].dest;
+      }
+
+      for (/* empty */; j < SPU_VERTS_PER_BATCH; j++) {
+         vs->elts[j] = vs->elts[0];
+         vs->vOut[j] = vs->vOut[0];
+      }
+
+      vs->num_elts = n;
+      send_mbox_message(cell_global.spe_contexts[0], CELL_CMD_VS_EXECUTE);
+
+      cell_flush_int(& cell->pipe, PIPE_FLUSH_WAIT);
+   }
+
+   draw->vs.queue_nr = 0;
+}
-- 
cgit v1.2.3


From 62d11b98c4a4904b56fab153407f49619d6d331d Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Wed, 30 Jan 2008 20:14:14 -0800
Subject: I don't know why using uint64_t for "base" doesn't work.  Ugh.

---
 src/mesa/pipe/cell/common.h       | 5 +++--
 src/mesa/pipe/cell/spu/spu_main.c | 4 ++--
 2 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/common.h b/src/mesa/pipe/cell/common.h
index 28b0c59a0a..05aeed83ab 100644
--- a/src/mesa/pipe/cell/common.h
+++ b/src/mesa/pipe/cell/common.h
@@ -124,11 +124,12 @@ struct cell_command_clear_surface
  */
 struct cell_array_info
 {
-    uint64_t base;      /**< Base address of the 0th element. */
+    uint opcode;
+    uint base;          /**< Base address of the 0th element. */
     uint attr;          /**< Attribute that this state if for. */
     uint pitch;         /**< Byte pitch from one entry to the next. */
     uint format;        /**< Pipe format of each entry. */
-};
+} ALIGN16_ATTRIB;
 
 
 struct cell_shader_info
diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index 7105c0f897..d6393048f5 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -394,8 +394,8 @@ cmd_batch(uint opcode)
          pos += (1 + sizeof(struct pipe_viewport_state) / 4);
          break;
       case CELL_CMD_STATE_VS_ARRAY_INFO:
-         cmd_state_vs_array_info((struct cell_array_info *) &buffer[pos+1]);
-         pos += (1 + sizeof(struct cell_array_info) / 4);
+         cmd_state_vs_array_info((struct cell_array_info *) &buffer[pos]);
+         pos += (sizeof(struct cell_array_info) / 4);
          break;
       default:
          printf("SPU %u: bad opcode: 0x%x\n", spu.init.id, buffer[pos]);
-- 
cgit v1.2.3


From 256486829f0bc2be7a986a6bdc08df5fc16b77d8 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Thu, 31 Jan 2008 08:12:47 -0700
Subject: Cell: set GALLIUM_CELL_VS env var to enable SPU-based vertex
 transformation

---
 src/mesa/pipe/cell/ppu/cell_context.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/ppu/cell_context.c b/src/mesa/pipe/cell/ppu/cell_context.c
index 4885cd0d2c..bbe1fd7a11 100644
--- a/src/mesa/pipe/cell/ppu/cell_context.c
+++ b/src/mesa/pipe/cell/ppu/cell_context.c
@@ -162,8 +162,12 @@ cell_draw_create(struct cell_context *cell)
 {
    struct draw_context *draw = draw_create();
 
-   draw->shader_queue_flush = cell_vertex_shader_queue_flush;
-   draw->driver_private = cell;
+   if (getenv("GALLIUM_CELL_VS")) {
+      /* plug in SPU-based vertex transformation code */
+      draw->shader_queue_flush = cell_vertex_shader_queue_flush;
+      draw->driver_private = cell;
+   }
+
    return draw;
 }
 
-- 
cgit v1.2.3


From cd53eb0db19daf1c9aac94011a54e902eb10fe75 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Thu, 31 Jan 2008 08:21:38 -0700
Subject: Cell: SIMD-ize const_coeff()

---
 src/mesa/pipe/cell/spu/spu_tri.c | 24 +++++++++---------------
 1 file changed, 9 insertions(+), 15 deletions(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index e436e153ec..08b8bf0c9c 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -723,24 +723,18 @@ static boolean setup_sort_vertices(const struct vertex_header *v0,
 
 /**
  * Compute a0 for a constant-valued coefficient (GL_FLAT shading).
- * The value value comes from vertex->data[slot][i].
- * The result will be put into setup.coef[slot].a0[i].
+ * The value comes from vertex->data[slot].
+ * The result will be put into setup.coef[slot].a0.
  * \param slot  which attribute slot 
- * \param i  which component of the slot (0..3)
  */
-static void const_coeff(uint slot)
+static INLINE void const_coeff(uint slot)
 {
-   uint i;
-   ASSERT(slot < PIPE_MAX_SHADER_INPUTS);
-
-   for (i = 0; i < 4; i++) {
-      setup.coef[slot].dadx.f[i] = 0;
-      setup.coef[slot].dady.f[i] = 0;
-
-      /* need provoking vertex info!
-       */
-      setup.coef[slot].a0.f[i] = setup.vprovoke->data[slot][i];
-   }
+   setup.coef[slot].dadx.v = (vector float) {0.0, 0.0, 0.0, 0.0};
+   setup.coef[slot].dady.v = (vector float) {0.0, 0.0, 0.0, 0.0};
+   setup.coef[slot].a0.f[0] = setup.vprovoke->data[slot][0];
+   setup.coef[slot].a0.f[1] = setup.vprovoke->data[slot][1];
+   setup.coef[slot].a0.f[2] = setup.vprovoke->data[slot][2];
+   setup.coef[slot].a0.f[3] = setup.vprovoke->data[slot][3];
 }
 
 
-- 
cgit v1.2.3


From b108bea6b44c1abc6d61e3e47096e5122de89cd1 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Fri, 1 Feb 2008 09:27:57 -0700
Subject: Cell: store current tile status in cur_tile_status_c/z, add
 TILE_STATUS_GETTING

---
 src/mesa/pipe/cell/spu/spu_render.c | 36 ++++++++++++++++-----
 src/mesa/pipe/cell/spu/spu_tile.c   |  1 +
 src/mesa/pipe/cell/spu/spu_tile.h   |  8 +++--
 src/mesa/pipe/cell/spu/spu_tri.c    | 62 ++++++++++++++++++++++++++++++-------
 src/mesa/pipe/cell/spu/spu_tri.h    |  2 +-
 5 files changed, 87 insertions(+), 22 deletions(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/spu/spu_render.c b/src/mesa/pipe/cell/spu/spu_render.c
index f506095116..ca54a103bd 100644
--- a/src/mesa/pipe/cell/spu/spu_render.c
+++ b/src/mesa/pipe/cell/spu/spu_render.c
@@ -95,13 +95,15 @@ static INLINE void
 get_cz_tiles(uint tx, uint ty)
 {
    if (spu.depth_stencil.depth.enabled) {
-      if (tile_status_z[ty][tx] != TILE_STATUS_CLEAR) {
+      if (cur_tile_status_z != TILE_STATUS_CLEAR) {
          get_tile(tx, ty, &ztile, TAG_READ_TILE_Z, 1);
+         cur_tile_status_z = TILE_STATUS_GETTING;
       }
    }
 
-   if (tile_status[ty][tx] != TILE_STATUS_CLEAR) {
+   if (cur_tile_status_c != TILE_STATUS_CLEAR) {
       get_tile(tx, ty, &ctile, TAG_READ_TILE_COLOR, 0);
+      cur_tile_status_c = TILE_STATUS_GETTING;
    }
 }
 
@@ -112,14 +114,24 @@ get_cz_tiles(uint tx, uint ty)
 static INLINE void
 put_cz_tiles(uint tx, uint ty)
 {
-   if (tile_status_z[ty][tx] == TILE_STATUS_DIRTY) {
+   if (cur_tile_status_z == TILE_STATUS_DIRTY) {
+      /* tile was modified and needs to be written back */
       put_tile(tx, ty, &ztile, TAG_WRITE_TILE_Z, 1);
-      tile_status_z[ty][tx] = TILE_STATUS_DEFINED;
+      cur_tile_status_z = TILE_STATUS_DEFINED;
+   }
+   else if (cur_tile_status_z == TILE_STATUS_GETTING) {
+      /* tile was never used */
+      cur_tile_status_z = TILE_STATUS_DEFINED;
    }
 
-   if (tile_status[ty][tx] == TILE_STATUS_DIRTY) {
+   if (cur_tile_status_c == TILE_STATUS_DIRTY) {
+      /* tile was modified and needs to be written back */
       put_tile(tx, ty, &ctile, TAG_WRITE_TILE_COLOR, 0);
-      tile_status[ty][tx] = TILE_STATUS_DEFINED;
+      cur_tile_status_c = TILE_STATUS_DEFINED;
+   }
+   else if (cur_tile_status_c == TILE_STATUS_GETTING) {
+      /* tile was never used */
+      cur_tile_status_c = TILE_STATUS_DEFINED;
    }
 }
 
@@ -238,8 +250,13 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
       if (!my_tile(tx, ty))
          continue;
 
+      cur_tile_status_c = tile_status[ty][tx];
+      cur_tile_status_z = tile_status_z[ty][tx];
+
       get_cz_tiles(tx, ty);
 
+      uint drawn = 0;
+
       /* loop over tris */
       for (j = 0; j < render->num_indexes; j += 3) {
          const float *v0, *v1, *v2;
@@ -248,13 +265,18 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
          v1 = (const float *) (vertices + indexes[j+1] * vertex_size);
          v2 = (const float *) (vertices + indexes[j+2] * vertex_size);
 
-         tri_draw(v0, v1, v2, tx, ty);
+         drawn += tri_draw(v0, v1, v2, tx, ty);
       }
 
+      //printf("SPU %u: drew %u of %u\n", spu.init.id, drawn, render->num_indexes/3);
+
       /* write color/z tiles back to main framebuffer, if dirtied */
       put_cz_tiles(tx, ty);
 
       wait_put_cz_tiles(); /* XXX seems unnecessary... */
+
+      tile_status[ty][tx] = cur_tile_status_c;
+      tile_status_z[ty][tx] = cur_tile_status_z;
    }
 
    if (Debug)
diff --git a/src/mesa/pipe/cell/spu/spu_tile.c b/src/mesa/pipe/cell/spu/spu_tile.c
index ca1352f9f8..aea4785bc2 100644
--- a/src/mesa/pipe/cell/spu/spu_tile.c
+++ b/src/mesa/pipe/cell/spu/spu_tile.c
@@ -37,6 +37,7 @@ tile_t ztile ALIGN16_ATTRIB;
 ubyte tile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
 ubyte tile_status_z[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
 
+ubyte cur_tile_status_c, cur_tile_status_z;
 
 
 void
diff --git a/src/mesa/pipe/cell/spu/spu_tile.h b/src/mesa/pipe/cell/spu/spu_tile.h
index 18d1b3c117..1f123a2b7b 100644
--- a/src/mesa/pipe/cell/spu/spu_tile.h
+++ b/src/mesa/pipe/cell/spu/spu_tile.h
@@ -51,12 +51,16 @@ extern tile_t ztile ALIGN16_ATTRIB;
 
 
 #define TILE_STATUS_CLEAR   1
-#define TILE_STATUS_DEFINED 2  /**< defined pixel data */
-#define TILE_STATUS_DIRTY   3  /**< modified, but not put back yet */
+#define TILE_STATUS_DEFINED 2  /**< defined in FB, but not in local store */
+#define TILE_STATUS_CLEAN   3  /**< in local store, but not changed */
+#define TILE_STATUS_DIRTY   4  /**< modified locally, but not put back yet */
+#define TILE_STATUS_GETTING 5  /**< mfc_get() called but not yet arrived */
 
 extern ubyte tile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
 extern ubyte tile_status_z[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
 
+extern ubyte cur_tile_status_c, cur_tile_status_z;
+
 
 void
 get_tile(uint tx, uint ty, tile_t *tile, int tag, int zBuf);
diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index 08b8bf0c9c..a32878d917 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -299,16 +299,23 @@ do_depth_test(int x, int y, unsigned int mask)
 
    zvals.v = eval_z((float) x, (float) y);
 
-   if (tile_status_z[setup.ty][setup.tx] == TILE_STATUS_CLEAR) {
+   if (cur_tile_status_c == TILE_STATUS_CLEAR) {
       /* now, _really_ clear the tile */
       clear_z_tile(&ztile);
+      cur_tile_status_z = TILE_STATUS_DIRTY;
    }
-   else if (tile_status_z[setup.ty][setup.tx] != TILE_STATUS_DIRTY) {
+
+#if 0
+   if (cur_tile_status_z == TILE_STATUS_CLEAR) {
+      /* now, _really_ clear the tile */
+      clear_z_tile(&ztile);
+   }
+   else if (cur_tile_status_z != TILE_STATUS_DIRTY) {
       /* make sure we've got the tile from main mem */
       wait_on_mask(1 << TAG_READ_TILE_Z);
    }
-   tile_status_z[setup.ty][setup.tx] = TILE_STATUS_DIRTY;
-
+   cur_tile_status_z = TILE_STATUS_DIRTY;
+#endif
 
    if (spu.fb.depth_format == PIPE_FORMAT_Z16_UNORM) {
       zvals.v = spu_mul(zvals.v, zscale16.v);
@@ -380,6 +387,9 @@ do_depth_test(int x, int y, unsigned int mask)
       }
    }
 
+   if (mask)
+      cur_tile_status_z = TILE_STATUS_DIRTY;
+
    return mask;
 }
 
@@ -397,15 +407,15 @@ do_depth_test_simd(int x, int y, vector unsigned int quadmask)
 
    zvals.v = eval_z((float) x, (float) y);
 
-   if (tile_status_z[setup.ty][setup.tx] == TILE_STATUS_CLEAR) {
+   if (cur_tile_status_z == TILE_STATUS_CLEAR) {
       /* now, _really_ clear the tile */
       clear_z_tile(&ztile);
    }
-   else if (tile_status_z[setup.ty][setup.tx] != TILE_STATUS_DIRTY) {
+   else if (cur_tile_status_z != TILE_STATUS_DIRTY) {
       /* make sure we've got the tile from main mem */
       wait_on_mask(1 << TAG_READ_TILE_Z);
    }
-   tile_status_z[setup.ty][setup.tx] = TILE_STATUS_DIRTY;
+   cur_tile_status_z = TILE_STATUS_DIRTY;
 
    /* XXX fetch Z value sooner to hide latency here */
    zmask = spu_cmpgt(ztile.f4[ix][iy].v, zvals.v);
@@ -462,15 +472,23 @@ emit_quad( int x, int y, mask_t mask )
    if (mask)
 #endif
    {
-      if (tile_status[setup.ty][setup.tx] == TILE_STATUS_CLEAR) {
+      if (cur_tile_status_c == TILE_STATUS_CLEAR) {
          /* now, _really_ clear the tile */
          clear_c_tile(&ctile);
       }
-      else if (tile_status[setup.ty][setup.tx] != TILE_STATUS_DIRTY) {
+
+#if 0
+      if (cur_tile_status_c == TILE_STATUS_CLEAR) {
+         /* now, _really_ clear the tile */
+         clear_c_tile(&ctile);
+         cur_tile_status_c = TILE_STATUS_DIRTY;
+      }
+      else if (cur_tile_status_c != TILE_STATUS_DIRTY) {
          /* make sure we've got the tile from main mem */
          wait_on_mask(1 << TAG_READ_TILE_COLOR);
       }
-      tile_status[setup.ty][setup.tx] = TILE_STATUS_DIRTY;
+#endif
+      cur_tile_status_c = TILE_STATUS_DIRTY;
 
 #if SIMD_Z
       if (spu_extract(mask, 0))
@@ -970,7 +988,7 @@ static void subtriangle( struct edge *eleft,
  * Draw triangle into tile at (tx, ty) (tile coords)
  * The tile data should have already been fetched.
  */
-void
+boolean
 tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty)
 {
    setup.tx = tx;
@@ -985,7 +1003,7 @@ tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty)
    if (!setup_sort_vertices((struct vertex_header *) v0,
                             (struct vertex_header *) v1,
                             (struct vertex_header *) v2)) {
-      return; /* totally clipped */
+      return FALSE; /* totally clipped */
    }
 
    setup_tri_coefficients();
@@ -999,6 +1017,24 @@ tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty)
 
    /*   init_constant_attribs( setup ); */
       
+   if (cur_tile_status_c == TILE_STATUS_GETTING) {
+      /* wait for mfc_get() to complete */
+      wait_on_mask(1 << TAG_READ_TILE_COLOR);
+      cur_tile_status_c = TILE_STATUS_CLEAN;
+   }
+
+   ASSERT(cur_tile_status_c != TILE_STATUS_DEFINED);
+
+   if (spu.depth_stencil.depth.enabled) {
+      if (cur_tile_status_z == TILE_STATUS_GETTING) {
+         /* wait for mfc_get() to complete */
+         wait_on_mask(1 << TAG_READ_TILE_Z);
+         cur_tile_status_z = TILE_STATUS_CLEAN;
+      }
+   ASSERT(cur_tile_status_z != TILE_STATUS_DEFINED);
+   }
+
+
    if (setup.oneoverarea < 0.0) {
       /* emaj on left:
        */
@@ -1013,4 +1049,6 @@ tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty)
    }
 
    flush_spans();
+
+   return TRUE;
 }
diff --git a/src/mesa/pipe/cell/spu/spu_tri.h b/src/mesa/pipe/cell/spu/spu_tri.h
index 86c42b6339..aa694dd7c9 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.h
+++ b/src/mesa/pipe/cell/spu/spu_tri.h
@@ -30,7 +30,7 @@
 #define SPU_TRI_H
 
 
-extern void
+extern boolean
 tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty);
 
 
-- 
cgit v1.2.3


From 59be082909de6021ec7d08476253bd4c9920e137 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Fri, 1 Feb 2008 13:45:58 -0700
Subject: Cell: implement Z16 and Z32 testing with SIMD instructions.

---
 src/mesa/pipe/cell/spu/spu_tile.h  |   3 +-
 src/mesa/pipe/cell/spu/spu_tri.c   | 222 +++++--------------------------------
 src/mesa/pipe/cell/spu/spu_ztest.h | 135 ++++++++++++++++++++++
 3 files changed, 163 insertions(+), 197 deletions(-)
 create mode 100644 src/mesa/pipe/cell/spu/spu_ztest.h

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/spu/spu_tile.h b/src/mesa/pipe/cell/spu/spu_tile.h
index 1f123a2b7b..4b1ef2a4c8 100644
--- a/src/mesa/pipe/cell/spu/spu_tile.h
+++ b/src/mesa/pipe/cell/spu/spu_tile.h
@@ -42,7 +42,8 @@
 typedef union {
    ushort t16[TILE_SIZE][TILE_SIZE];
    uint   t32[TILE_SIZE][TILE_SIZE];
-   float4 f4[TILE_SIZE/2][TILE_SIZE/2];
+   vector unsigned short us8[TILE_SIZE/2][TILE_SIZE/4];
+   vector unsigned int ui4[TILE_SIZE/2][TILE_SIZE/2];
 } tile_t;
 
 
diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index a32878d917..a26a4f098d 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -39,18 +39,11 @@
 #include "spu_tile.h"
 #include "spu_tri.h"
 
+#include "spu_ztest.h"
 
-/*
- * If SIMD_Z=1 the Z buffer is floating point and we use vector instructions
- * to do Z testing/updating.
- */
-#define SIMD_Z 0
 
-#if SIMD_Z
+/** Masks are uint[4] vectors with each element being 0 or 0xffffffff */
 typedef vector unsigned int mask_t;
-#else
-typedef uint mask_t;
-#endif
 
 
 /**
@@ -282,20 +275,11 @@ pack_colors(uint uicolors[4], const float4 fcolors[4])
 }
 
 
-
-static unsigned int
-do_depth_test(int x, int y, unsigned int mask)
+static INLINE mask_t
+do_depth_test(int x, int y, mask_t quadmask)
 {
-   static const float4 zscale16
-      = {.f={65535.0, 65535.0, 65535.0, 65535.0}};
-   static const float4 zscale32
-      = {.f={(float)0xffffffff,
-             (float)0xffffffff,
-             (float)0xffffffff,
-             (float)0xffffffff}};
-   int ix = x - setup.cliprect_minx;
-   int iy = y - setup.cliprect_miny;
    float4 zvals;
+   mask_t mask;
 
    zvals.v = eval_z((float) x, (float) y);
 
@@ -305,129 +289,20 @@ do_depth_test(int x, int y, unsigned int mask)
       cur_tile_status_z = TILE_STATUS_DIRTY;
    }
 
-#if 0
-   if (cur_tile_status_z == TILE_STATUS_CLEAR) {
-      /* now, _really_ clear the tile */
-      clear_z_tile(&ztile);
-   }
-   else if (cur_tile_status_z != TILE_STATUS_DIRTY) {
-      /* make sure we've got the tile from main mem */
-      wait_on_mask(1 << TAG_READ_TILE_Z);
-   }
-   cur_tile_status_z = TILE_STATUS_DIRTY;
-#endif
-
    if (spu.fb.depth_format == PIPE_FORMAT_Z16_UNORM) {
-      zvals.v = spu_mul(zvals.v, zscale16.v);
-      if (mask & MASK_TOP_LEFT) {
-         uint z = (uint) zvals.f[0];
-         if (z < ztile.t16[iy][ix])
-            ztile.t16[iy][ix] = z;
-         else
-            mask &= ~MASK_TOP_LEFT;
-      }
-
-      if (mask & MASK_TOP_RIGHT) {
-         uint z = (uint) zvals.f[1];
-         if (z < ztile.t16[iy][ix+1])
-            ztile.t16[iy][ix+1] = z;
-         else
-            mask &= ~MASK_TOP_RIGHT;
-      }
-
-      if (mask & MASK_BOTTOM_LEFT) {
-         uint z = (uint) zvals.f[2];
-         if (z < ztile.t16[iy+1][ix])
-            ztile.t16[iy+1][ix] = z;
-         else
-            mask &= ~MASK_BOTTOM_LEFT;
-      }
-
-      if (mask & MASK_BOTTOM_RIGHT) {
-         uint z = (uint) zvals.f[3];
-         if (z < ztile.t16[iy+1][ix+1])
-            ztile.t16[iy+1][ix+1] = z;
-         else
-            mask &= ~MASK_BOTTOM_RIGHT;
-      }
+      int ix = (x - setup.cliprect_minx) / 4;
+      int iy = (y - setup.cliprect_miny) / 2;
+      mask = spu_z16_test_less(zvals.v, &ztile.us8[iy][ix], x>>1, quadmask);
    }
    else {
-      zvals.v = spu_mul(zvals.v, zscale32.v);
-      ASSERT(spu.fb.depth_format == PIPE_FORMAT_Z32_UNORM);
-      if (mask & MASK_TOP_LEFT) {
-         uint z = (uint) zvals.f[0];
-         if (z < ztile.t32[iy][ix])
-            ztile.t32[iy][ix] = z;
-         else
-            mask &= ~MASK_TOP_LEFT;
-      }
-
-      if (mask & MASK_TOP_RIGHT) {
-         uint z = (uint) zvals.f[1];
-         if (z < ztile.t32[iy][ix+1])
-            ztile.t32[iy][ix+1] = z;
-         else
-            mask &= ~MASK_TOP_RIGHT;
-      }
-
-      if (mask & MASK_BOTTOM_LEFT) {
-         uint z = (uint) zvals.f[2];
-         if (z < ztile.t32[iy+1][ix])
-            ztile.t32[iy+1][ix] = z;
-         else
-            mask &= ~MASK_BOTTOM_LEFT;
-      }
-
-      if (mask & MASK_BOTTOM_RIGHT) {
-         uint z = (uint) zvals.f[3];
-         if (z < ztile.t32[iy+1][ix+1])
-            ztile.t32[iy+1][ix+1] = z;
-         else
-            mask &= ~MASK_BOTTOM_RIGHT;
-      }
+      int ix = (x - setup.cliprect_minx) / 2;
+      int iy = (y - setup.cliprect_miny) / 2;
+      mask = spu_z32_test_less(zvals.v, &ztile.ui4[iy][ix], quadmask);
    }
-
-   if (mask)
-      cur_tile_status_z = TILE_STATUS_DIRTY;
-
    return mask;
 }
 
 
-
-
-static vector unsigned int
-do_depth_test_simd(int x, int y, vector unsigned int quadmask)
-{
-   int ix = (x - setup.cliprect_minx) / 2;
-   int iy = (y - setup.cliprect_miny) / 2;
-   float4 zvals;
-
-   vector unsigned int zmask;
-
-   zvals.v = eval_z((float) x, (float) y);
-
-   if (cur_tile_status_z == TILE_STATUS_CLEAR) {
-      /* now, _really_ clear the tile */
-      clear_z_tile(&ztile);
-   }
-   else if (cur_tile_status_z != TILE_STATUS_DIRTY) {
-      /* make sure we've got the tile from main mem */
-      wait_on_mask(1 << TAG_READ_TILE_Z);
-   }
-   cur_tile_status_z = TILE_STATUS_DIRTY;
-
-   /* XXX fetch Z value sooner to hide latency here */
-   zmask = spu_cmpgt(ztile.f4[ix][iy].v, zvals.v);
-   zmask = spu_and(zmask, quadmask);
-
-   ztile.f4[ix][iy].v = spu_sel(ztile.f4[ix][iy].v, zvals.v, zmask);
-   //ztile.f4[ix][iy].v = spu_sel(zvals.v, ztile.f4[ix][iy].v, mask4);
-
-   return zmask;
-}
-
-
 /**
  * Emit a quad (pass to next stage).  No clipping is done.
  */
@@ -461,36 +336,18 @@ emit_quad( int x, int y, mask_t mask )
    }
 
    if (spu.depth_stencil.depth.enabled) {
-#if SIMD_Z
-      mask = do_depth_test_simd(x, y, mask);
-#else
       mask = do_depth_test(x, y, mask);
-#endif
    }
 
-#if !SIMD_Z
-   if (mask)
-#endif
-   {
-      if (cur_tile_status_c == TILE_STATUS_CLEAR) {
-         /* now, _really_ clear the tile */
-         clear_c_tile(&ctile);
-      }
+   /* If any bits in mask are set... */
+   if (spu_extract(spu_orx(mask), 0)) {
 
-#if 0
       if (cur_tile_status_c == TILE_STATUS_CLEAR) {
          /* now, _really_ clear the tile */
          clear_c_tile(&ctile);
-         cur_tile_status_c = TILE_STATUS_DIRTY;
       }
-      else if (cur_tile_status_c != TILE_STATUS_DIRTY) {
-         /* make sure we've got the tile from main mem */
-         wait_on_mask(1 << TAG_READ_TILE_COLOR);
-      }
-#endif
       cur_tile_status_c = TILE_STATUS_DIRTY;
 
-#if SIMD_Z
       if (spu_extract(mask, 0))
          ctile.t32[iy][ix] = colors[QUAD_TOP_LEFT];
       if (spu_extract(mask, 1))
@@ -499,20 +356,11 @@ emit_quad( int x, int y, mask_t mask )
          ctile.t32[iy+1][ix] = colors[QUAD_BOTTOM_LEFT];
       if (spu_extract(mask, 3))
          ctile.t32[iy+1][ix+1] = colors[QUAD_BOTTOM_RIGHT];
-#elif 0
+
+#if 0
       /* SIMD_Z with swizzled color buffer (someday) */
       vector float icolors = *((vector float *) &colors);
       ctile.f4[iy/2][ix/2].v = spu_sel(ctile.f4[iy/2][ix/2].v, icolors, mask);
-
-#else
-      if (mask & MASK_TOP_LEFT)
-         ctile.t32[iy][ix] = colors[QUAD_TOP_LEFT];
-      if (mask & MASK_TOP_RIGHT)
-         ctile.t32[iy][ix+1] = colors[QUAD_TOP_RIGHT];
-      if (mask & MASK_BOTTOM_LEFT)
-         ctile.t32[iy+1][ix] = colors[QUAD_BOTTOM_LEFT];
-      if (mask & MASK_BOTTOM_RIGHT)
-         ctile.t32[iy+1][ix+1] = colors[QUAD_BOTTOM_RIGHT];
 #endif
    }
 
@@ -533,38 +381,20 @@ static INLINE int block( int x )
 /**
  * Compute mask which indicates which pixels in the 2x2 quad are actually inside
  * the triangle's bounds.
- *
- * this is pretty nasty...  may need to rework flush_spans again to
- * fix it, if possible.
+ * The mask is a uint4 vector and each element will be 0 or 0xffffffff.
  */
-static mask_t calculate_mask( int x )
+static INLINE mask_t calculate_mask( int x )
 {
-#if SIMD_Z
-   uint m0, m1, m2, m3;
-
-   m0 = (x >= setup.span.left[0] && x < setup.span.right[0]) * ~0;
-   m1 = (x+1 >= setup.span.left[0] && x+1 < setup.span.right[0]) * ~0;
-   m2 = (x >= setup.span.left[1] && x < setup.span.right[1]) * ~0;
-   m3 = (x+1 >= setup.span.left[1] && x+1 < setup.span.right[1]) * ~0;
-
-   return (vector unsigned int) {m0, m1, m2, m3};
-#else
-   unsigned mask = 0x0;
-
-   if (x >= setup.span.left[0] && x < setup.span.right[0]) 
-      mask |= MASK_TOP_LEFT;
-
-   if (x >= setup.span.left[1] && x < setup.span.right[1]) 
-      mask |= MASK_BOTTOM_LEFT;
-      
-   if (x+1 >= setup.span.left[0] && x+1 < setup.span.right[0]) 
-      mask |= MASK_TOP_RIGHT;
-
-   if (x+1 >= setup.span.left[1] && x+1 < setup.span.right[1]) 
-      mask |= MASK_BOTTOM_RIGHT;
-
+   /* This is a little tricky.
+    * Use & instead of && to avoid branches.
+    * Use negation to convert true/false to ~0/0 values.
+    */
+   mask_t mask;
+   mask = spu_insert(-((x   >= setup.span.left[0]) & (x   < setup.span.right[0])), mask, 0);
+   mask = spu_insert(-((x+1 >= setup.span.left[0]) & (x+1 < setup.span.right[0])), mask, 1);
+   mask = spu_insert(-((x   >= setup.span.left[1]) & (x   < setup.span.right[1])), mask, 2);
+   mask = spu_insert(-((x+1 >= setup.span.left[1]) & (x+1 < setup.span.right[1])), mask, 3);
    return mask;
-#endif
 }
 
 
diff --git a/src/mesa/pipe/cell/spu/spu_ztest.h b/src/mesa/pipe/cell/spu/spu_ztest.h
new file mode 100644
index 0000000000..5fefb15176
--- /dev/null
+++ b/src/mesa/pipe/cell/spu/spu_ztest.h
@@ -0,0 +1,135 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+/**
+ * Zbuffer/depth test code.
+ */
+
+
+#ifndef SPU_ZTEST_H
+#define SPU_ZTEST_H
+
+
+#ifdef __SPU__
+#include <spu_intrinsics.h>
+#endif
+
+
+
+/**
+ * Perform Z testing for a 16-bit/value Z buffer.
+ *
+ * \param zvals  vector of four fragment zvalues as floats
+ * \param zbuf   ptr to vector of ushort[8] zbuffer values.  Note that this
+ *               contains the Z values for 2 quads, 8 pixels.
+ * \param x      x coordinate of quad (only lsbit is significant)
+ * \param inMask indicates which fragments in the quad are alive
+ * \return new mask indicating which fragments are alive after ztest
+ */
+static INLINE vector unsigned int
+spu_z16_test_less(vector float zvals, vector unsigned short *zbuf,
+                  uint x, vector unsigned int inMask)
+{
+#define ZERO 0x80
+   vector unsigned int zvals_ui4, zbuf_ui4, mask;
+
+   /* convert floats to uints in [0, 65535] */
+   zvals_ui4 = spu_convtu(zvals, 32); /* convert to [0, 2^32] */
+   zvals_ui4 = spu_rlmask(zvals_ui4, -16);  /* right shift 16 */
+
+   /* XXX this conditional could be removed with a bit of work */
+   if (x & 1) {
+      /* convert zbuffer values from ushorts to uints */
+      /* gather lower four ushorts */
+      zbuf_ui4 = spu_shuffle((vector unsigned int) *zbuf,
+                             (vector unsigned int) *zbuf,
+                             VEC_LITERAL(vector unsigned char,
+                                      ZERO, ZERO,  8,  9, ZERO, ZERO, 10, 11,
+                                      ZERO, ZERO, 12, 13, ZERO, ZERO, 14, 15));
+      /* mask = (zbuf_ui4 > zvals_ui4) ? ~0 : 0 */
+      mask = spu_cmpgt(zbuf_ui4, zvals_ui4);
+      /* mask &= inMask */
+      mask = spu_and(mask, inMask);
+      /* zbuf = mask ? zval : zbuf */
+      zbuf_ui4 = spu_sel(zbuf_ui4, zvals_ui4, mask);
+      /* convert zbuffer values from uints back to ushorts, preserve upper 4 */
+      *zbuf = (vector unsigned short)
+         spu_shuffle(zbuf_ui4, (vector unsigned int) *zbuf,
+                     VEC_LITERAL(vector unsigned char,
+                                 16, 17, 18, 19, 20, 21, 22, 23,
+                                 2, 3, 6, 7, 10, 11, 14, 15));
+   }
+   else {
+      /* convert zbuffer values from ushorts to uints */
+      /* gather upper four ushorts */
+      zbuf_ui4 = spu_shuffle((vector unsigned int) *zbuf,
+                             (vector unsigned int) *zbuf,
+                             VEC_LITERAL(vector unsigned char,
+                                         ZERO, ZERO, 0, 1, ZERO, ZERO, 2, 3,
+                                         ZERO, ZERO, 4, 5, ZERO, ZERO, 6, 7));
+      /* mask = (zbuf_ui4 > zvals_ui4) ? ~0 : 0 */
+      mask = spu_cmpgt(zbuf_ui4, zvals_ui4);
+      /* mask &= inMask */
+      mask = spu_and(mask, inMask);
+      /* zbuf = mask ? zval : zbuf */
+      zbuf_ui4 = spu_sel(zbuf_ui4, zvals_ui4, mask);
+      /* convert zbuffer values from uints back to ushorts, preserve lower 4 */
+      *zbuf = (vector unsigned short)
+         spu_shuffle(zbuf_ui4, (vector unsigned int) *zbuf,
+                     VEC_LITERAL(vector unsigned char,
+                                 2, 3, 6, 7, 10, 11, 14, 15,
+                                 24, 25, 26, 27, 28, 29, 30, 31));
+   }
+   return mask;
+#undef ZERO
+}
+
+
+/**
+ * As above, but Zbuffer values as 32-bit uints
+ */
+static INLINE vector unsigned int
+spu_z32_test_less(vector float zvals, vector unsigned int *zbuf_ptr,
+                  vector unsigned int inMask)
+{
+   vector unsigned int zvals_ui4, mask, zbuf = *zbuf_ptr;
+
+   /* convert floats to uints in [0, 0xffffffff] */
+   zvals_ui4 = spu_convtu(zvals, 32);
+   /* mask = (zbuf > zvals_ui4) ? ~0 : 0 */
+   mask = spu_cmpgt(zbuf, zvals_ui4);
+   /* mask &= inMask */
+   mask = spu_and(mask, inMask);
+   /* zbuf = mask ? zval : zbuf */
+   *zbuf_ptr = spu_sel(zbuf, zvals_ui4, mask);
+
+   return mask;
+}
+
+
+#endif /* SPU_ZTEST_H */
-- 
cgit v1.2.3


From c392cc8f1bcaaecc2cc723fc5550e5f6462602f3 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Fri, 1 Feb 2008 13:49:51 -0700
Subject: Cell: rename fields of the tile_t union

---
 src/mesa/pipe/cell/spu/spu_main.c    |  8 ++++++++
 src/mesa/pipe/cell/spu/spu_texture.c |  6 +++---
 src/mesa/pipe/cell/spu/spu_tile.c    |  4 ++--
 src/mesa/pipe/cell/spu/spu_tile.h    | 18 +++++-------------
 src/mesa/pipe/cell/spu/spu_tri.c     |  8 ++++----
 5 files changed, 22 insertions(+), 22 deletions(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index d6393048f5..7d6e910ad5 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -36,6 +36,7 @@
 #include "spu_render.h"
 #include "spu_texture.h"
 #include "spu_tile.h"
+//#include "spu_test.h"
 #include "spu_vertex_shader.h"
 #include "pipe/cell/common.h"
 #include "pipe/p_defines.h"
@@ -495,6 +496,7 @@ one_time_init(void)
 }
 
 
+
 /* In some versions of the SDK the SPE main takes 'unsigned long' as a
  * parameter.  In others it takes 'unsigned long long'.  Use a define to
  * select between the two.
@@ -515,6 +517,8 @@ main(main_param_t speid, main_param_t argp)
 
    (void) speid;
 
+   ASSERT(sizeof(tile_t) == TILE_SIZE * TILE_SIZE * 4);
+
    one_time_init();
 
    if (Debug)
@@ -528,6 +532,10 @@ main(main_param_t speid, main_param_t argp)
            0  /* rid */);
    wait_on_mask( 1 << tag );
 
+#if 0
+   if (spu.init.id==0)
+      spu_test_misc();
+#endif
 
    main_loop();
 
diff --git a/src/mesa/pipe/cell/spu/spu_texture.c b/src/mesa/pipe/cell/spu/spu_texture.c
index 7a1ca097c0..c1dc6bfe90 100644
--- a/src/mesa/pipe/cell/spu/spu_texture.c
+++ b/src/mesa/pipe/cell/spu/spu_texture.c
@@ -97,10 +97,10 @@ get_tex_tile(uint i, uint j)
              spu.init.id, src, tex_tiles[pos].t32);
 #endif
 
-      ASSERT_ALIGN16(tex_tiles[pos].t32);
+      ASSERT_ALIGN16(tex_tiles[pos].ui);
       ASSERT_ALIGN16(src);
 
-      mfc_get(tex_tiles[pos].t32,  /* dest */
+      mfc_get(tex_tiles[pos].ui,  /* dest */
               (unsigned int) src,
               bytes_per_tile,      /* size */
               TAG_TEXTURE_TILE,
@@ -134,6 +134,6 @@ sample_texture(float4 texcoord)
    uint i = (uint) (texcoord.f[0] * spu.texture.width) % spu.texture.width;
    uint j = (uint) (texcoord.f[1] * spu.texture.height) % spu.texture.height;
    uint pos = get_tex_tile(i, j);
-   uint texel = tex_tiles[pos].t32[j % TILE_SIZE][i % TILE_SIZE];
+   uint texel = tex_tiles[pos].ui[j % TILE_SIZE][i % TILE_SIZE];
    return texel;
 }
diff --git a/src/mesa/pipe/cell/spu/spu_tile.c b/src/mesa/pipe/cell/spu/spu_tile.c
index aea4785bc2..fd65c2b49c 100644
--- a/src/mesa/pipe/cell/spu/spu_tile.c
+++ b/src/mesa/pipe/cell/spu/spu_tile.c
@@ -56,7 +56,7 @@ get_tile(uint tx, uint ty, tile_t *tile, int tag, int zBuf)
    printf("get_tile:  dest: %p  src: 0x%x  size: %d\n",
           tile, (unsigned int) src, bytesPerTile);
    */
-   mfc_get(tile->t32,  /* dest in local memory */
+   mfc_get(tile->ui,  /* dest in local memory */
            (unsigned int) src, /* src in main memory */
            bytesPerTile,
            tag,
@@ -82,7 +82,7 @@ put_tile(uint tx, uint ty, const tile_t *tile, int tag, int zBuf)
           spu.init.id,
           tile, (unsigned int) dst, bytesPerTile);
    */
-   mfc_put((void *) tile->t32,  /* src in local memory */
+   mfc_put((void *) tile->ui,  /* src in local memory */
            (unsigned int) dst,  /* dst in main memory */
            bytesPerTile,
            tag,
diff --git a/src/mesa/pipe/cell/spu/spu_tile.h b/src/mesa/pipe/cell/spu/spu_tile.h
index 4b1ef2a4c8..85a0d55807 100644
--- a/src/mesa/pipe/cell/spu/spu_tile.h
+++ b/src/mesa/pipe/cell/spu/spu_tile.h
@@ -40,8 +40,8 @@
 
 
 typedef union {
-   ushort t16[TILE_SIZE][TILE_SIZE];
-   uint   t32[TILE_SIZE][TILE_SIZE];
+   ushort us[TILE_SIZE][TILE_SIZE];
+   uint   ui[TILE_SIZE][TILE_SIZE];
    vector unsigned short us8[TILE_SIZE/2][TILE_SIZE/4];
    vector unsigned int ui4[TILE_SIZE/2][TILE_SIZE/2];
 } tile_t;
@@ -74,7 +74,7 @@ put_tile(uint tx, uint ty, const tile_t *tile, int tag, int zBuf);
 static INLINE void
 clear_c_tile(tile_t *ctile)
 {
-   memset32((uint*) ctile->t32,
+   memset32((uint*) ctile->ui,
             spu.fb.color_clear_value,
             TILE_SIZE * TILE_SIZE);
 }
@@ -84,23 +84,15 @@ static INLINE void
 clear_z_tile(tile_t *ztile)
 {
    if (spu.fb.depth_format == PIPE_FORMAT_Z16_UNORM) {
-      memset16((ushort*) ztile->t16,
+      memset16((ushort*) ztile->us,
                spu.fb.depth_clear_value,
                TILE_SIZE * TILE_SIZE);
    }
    else {
       ASSERT(spu.fb.depth_format == PIPE_FORMAT_Z32_UNORM);
-#if SIMD_Z
-      union fi z;
-      z.f = 1.0;
-      memset32((uint*) ztile->t32,
-               z.i,/*spu.fb.depth_clear_value,*/
-               TILE_SIZE * TILE_SIZE);
-#else
-      memset32((uint*) ztile->t32,
+      memset32((uint*) ztile->ui,
                spu.fb.depth_clear_value,
                TILE_SIZE * TILE_SIZE);
-#endif
    }
 }
 
diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index a26a4f098d..b04b6841c0 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -349,13 +349,13 @@ emit_quad( int x, int y, mask_t mask )
       cur_tile_status_c = TILE_STATUS_DIRTY;
 
       if (spu_extract(mask, 0))
-         ctile.t32[iy][ix] = colors[QUAD_TOP_LEFT];
+         ctile.ui[iy][ix] = colors[QUAD_TOP_LEFT];
       if (spu_extract(mask, 1))
-         ctile.t32[iy][ix+1] = colors[QUAD_TOP_RIGHT];
+         ctile.ui[iy][ix+1] = colors[QUAD_TOP_RIGHT];
       if (spu_extract(mask, 2))
-         ctile.t32[iy+1][ix] = colors[QUAD_BOTTOM_LEFT];
+         ctile.ui[iy+1][ix] = colors[QUAD_BOTTOM_LEFT];
       if (spu_extract(mask, 3))
-         ctile.t32[iy+1][ix+1] = colors[QUAD_BOTTOM_RIGHT];
+         ctile.ui[iy+1][ix+1] = colors[QUAD_BOTTOM_RIGHT];
 
 #if 0
       /* SIMD_Z with swizzled color buffer (someday) */
-- 
cgit v1.2.3


From 0e9a370ae2fa7a6d8bbc7d236e63dae1e3dcac37 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Fri, 1 Feb 2008 14:02:22 -0700
Subject: Cell: move ztest before color interp/packing

---
 src/mesa/pipe/cell/spu/spu_tri.c | 43 ++++++++++++++++++++--------------------
 1 file changed, 21 insertions(+), 22 deletions(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index b04b6841c0..ae8fd17cc6 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -316,24 +316,6 @@ emit_quad( int x, int y, mask_t mask )
    setup.quad.mask = mask;
    sp->quad.first->run(sp->quad.first, &setup.quad);
 #else
-   /* Cell: "write" quad fragments to the tile by setting prim color */
-   const int ix = x - setup.cliprect_minx;
-   const int iy = y - setup.cliprect_miny;
-   uint colors[4];  /* indexed by QUAD_x */
-
-   if (spu.texture.start) {
-      float4 texcoords[4];
-      uint i;
-      eval_coeff(2, (float) x, (float) y, texcoords);
-      for (i = 0; i < 4; i++) {
-         colors[i] = sample_texture(texcoords[i]);
-      }
-   }
-   else {
-      float4 fcolors[4];
-      eval_coeff(1, (float) x, (float) y, fcolors);
-      pack_colors(colors, fcolors);
-   }
 
    if (spu.depth_stencil.depth.enabled) {
       mask = do_depth_test(x, y, mask);
@@ -341,6 +323,23 @@ emit_quad( int x, int y, mask_t mask )
 
    /* If any bits in mask are set... */
    if (spu_extract(spu_orx(mask), 0)) {
+      const int ix = x - setup.cliprect_minx;
+      const int iy = y - setup.cliprect_miny;
+      uint colors[4];  /* indexed by QUAD_x */
+
+      if (spu.texture.start) {
+         float4 texcoords[4];
+         uint i;
+         eval_coeff(2, (float) x, (float) y, texcoords);
+         for (i = 0; i < 4; i++) {
+            colors[i] = sample_texture(texcoords[i]);
+         }
+      }
+      else {
+         float4 fcolors[4];
+         eval_coeff(1, (float) x, (float) y, fcolors);
+         pack_colors(colors, fcolors);
+      }
 
       if (cur_tile_status_c == TILE_STATUS_CLEAR) {
          /* now, _really_ clear the tile */
@@ -348,6 +347,7 @@ emit_quad( int x, int y, mask_t mask )
       }
       cur_tile_status_c = TILE_STATUS_DIRTY;
 
+#if 1
       if (spu_extract(mask, 0))
          ctile.ui[iy][ix] = colors[QUAD_TOP_LEFT];
       if (spu_extract(mask, 1))
@@ -356,11 +356,10 @@ emit_quad( int x, int y, mask_t mask )
          ctile.ui[iy+1][ix] = colors[QUAD_BOTTOM_LEFT];
       if (spu_extract(mask, 3))
          ctile.ui[iy+1][ix+1] = colors[QUAD_BOTTOM_RIGHT];
-
-#if 0
+#else
       /* SIMD_Z with swizzled color buffer (someday) */
-      vector float icolors = *((vector float *) &colors);
-      ctile.f4[iy/2][ix/2].v = spu_sel(ctile.f4[iy/2][ix/2].v, icolors, mask);
+      vector unsigned int uicolors = *((vector unsigned int *) &colors);
+      ctile.ui4[iy/2][ix/2] = spu_sel(ctile.ui4[iy/2][ix/2], uicolors, mask);
 #endif
    }
 
-- 
cgit v1.2.3


From b1a472bfb7df5ba273574e1799c5b8e85ca5f2d9 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Fri, 1 Feb 2008 15:20:07 -0700
Subject: Cell: remove commands from top-level while loop which should only
 appear in batch buffers

---
 src/mesa/pipe/cell/spu/spu_main.c | 20 ++++----------------
 1 file changed, 4 insertions(+), 16 deletions(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index 7d6e910ad5..1760de02b7 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -447,34 +447,22 @@ main_loop(void)
               0  /* rid */);
       wait_on_mask( 1 << tag );
 
+      /*
+       * NOTE: most commands should be contained in a batch buffer
+       */
+
       switch (opcode & CELL_CMD_OPCODE_MASK) {
       case CELL_CMD_EXIT:
          if (Debug)
             printf("SPU %u: EXIT\n", spu.init.id);
          exitFlag = 1;
          break;
-      case CELL_CMD_STATE_FRAMEBUFFER:
-         cmd_state_framebuffer(&cmd.fb);
-         break;
-      case CELL_CMD_CLEAR_SURFACE:
-         cmd_clear_surface(&cmd.clear);
-         break;
-      case CELL_CMD_RENDER:
-         {
-            uint pos_incr;
-            cmd_render(&cmd.render, &pos_incr);
-            assert(pos_incr == 0);
-         }
-         break;
       case CELL_CMD_VS_EXECUTE:
          spu_execute_vertex_shader(&draw, &cmd.vs);
          break;
       case CELL_CMD_BATCH:
          cmd_batch(opcode);
          break;
-      case CELL_CMD_FINISH:
-         cmd_finish();
-         break;
       default:
          printf("Bad opcode!\n");
       }
-- 
cgit v1.2.3


From 17305489f0d2a0681d4c0d4952957af517019ab6 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Fri, 1 Feb 2008 15:24:00 -0700
Subject: Cell: deprecate some use of struct cell_command - it should go away
 completely

Also, remove ALIGN16_ATTRIB from structs that no longer need it.
---
 src/mesa/pipe/cell/common.h | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/common.h b/src/mesa/pipe/cell/common.h
index 05aeed83ab..7e193f31be 100644
--- a/src/mesa/pipe/cell/common.h
+++ b/src/mesa/pipe/cell/common.h
@@ -105,7 +105,7 @@ struct cell_command_framebuffer
    int width, height;
    void *color_start, *depth_start;
    enum pipe_format color_format, depth_format;
-} ALIGN16_ATTRIB;
+};
 
 
 /**
@@ -116,7 +116,7 @@ struct cell_command_clear_surface
    uint opcode;
    uint surface; /**< Temporary: 0=color, 1=Z */
    uint value;
-} ALIGN16_ATTRIB;
+};
 
 
 /**
@@ -173,7 +173,7 @@ struct cell_command_render
    uint dummy3;
    uint min_index;
    boolean inline_verts;
-} ALIGN16_ATTRIB;
+};
 
 
 struct cell_command_release_verts
@@ -191,11 +191,14 @@ struct cell_command_texture
 
 
 /** XXX unions don't seem to work */
+/* XXX this should go away; all commands should be placed in batch buffers */
 struct cell_command
 {
+#if 0
    struct cell_command_framebuffer fb;
    struct cell_command_clear_surface clear;
    struct cell_command_render render;
+#endif
    struct cell_command_vs vs;
 } ALIGN16_ATTRIB;
 
-- 
cgit v1.2.3


From 42201d7574ebb1582563988820c248680081c42f Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Fri, 1 Feb 2008 15:33:53 -0700
Subject: Cell: rename/move global vars

Put tile-related globals into spu_global struct.
Rename c/ztile fields to be more consistent.
---
 src/mesa/pipe/cell/spu/spu_main.c   | 28 +++++++++++++-------------
 src/mesa/pipe/cell/spu/spu_main.h   | 32 +++++++++++++++++++++++++++++
 src/mesa/pipe/cell/spu/spu_render.c | 40 ++++++++++++++++++-------------------
 src/mesa/pipe/cell/spu/spu_tile.c   | 11 +---------
 src/mesa/pipe/cell/spu/spu_tile.h   | 27 -------------------------
 src/mesa/pipe/cell/spu/spu_tri.c    | 38 +++++++++++++++++------------------
 6 files changed, 86 insertions(+), 90 deletions(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index 1760de02b7..8e3987f6ef 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -92,24 +92,24 @@ really_clear_tiles(uint surfaceIndex)
    uint i;
 
    if (surfaceIndex == 0) {
-      clear_c_tile(&ctile);
+      clear_c_tile(&spu.ctile);
 
       for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) {
          uint tx = i % spu.fb.width_tiles;
          uint ty = i / spu.fb.width_tiles;
-         if (tile_status[ty][tx] == TILE_STATUS_CLEAR) {
-            put_tile(tx, ty, &ctile, TAG_SURFACE_CLEAR, 0);
+         if (spu.ctile_status[ty][tx] == TILE_STATUS_CLEAR) {
+            put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 0);
          }
       }
    }
    else {
-      clear_z_tile(&ztile);
+      clear_z_tile(&spu.ztile);
 
       for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) {
          uint tx = i % spu.fb.width_tiles;
          uint ty = i / spu.fb.width_tiles;
-         if (tile_status_z[ty][tx] == TILE_STATUS_CLEAR)
-            put_tile(tx, ty, &ctile, TAG_SURFACE_CLEAR, 1);
+         if (spu.ztile_status[ty][tx] == TILE_STATUS_CLEAR)
+            put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 1);
       }
    }
 
@@ -133,11 +133,11 @@ cmd_clear_surface(const struct cell_command_clear_surface *clear)
 #if CLEAR_OPT
    /* set all tile's status to CLEAR */
    if (clear->surface == 0) {
-      memset(tile_status, TILE_STATUS_CLEAR, sizeof(tile_status));
+      memset(spu.ctile_status, TILE_STATUS_CLEAR, sizeof(spu.ctile_status));
       spu.fb.color_clear_value = clear->value;
    }
    else {
-      memset(tile_status_z, TILE_STATUS_CLEAR, sizeof(tile_status_z));
+      memset(spu.ztile_status, TILE_STATUS_CLEAR, sizeof(spu.ztile_status));
       spu.fb.depth_clear_value = clear->value;
    }
    return;
@@ -145,11 +145,11 @@ cmd_clear_surface(const struct cell_command_clear_surface *clear)
 
    if (clear->surface == 0) {
       spu.fb.color_clear_value = clear->value;
-      clear_c_tile(&ctile);
+      clear_c_tile(&spu.ctile);
    }
    else {
       spu.fb.depth_clear_value = clear->value;
-      clear_z_tile(&ztile);
+      clear_z_tile(&spu.ztile);
    }
 
    /*
@@ -161,9 +161,9 @@ cmd_clear_surface(const struct cell_command_clear_surface *clear)
       uint tx = i % spu.fb.width_tiles;
       uint ty = i / spu.fb.width_tiles;
       if (clear->surface == 0)
-         put_tile(tx, ty, &ctile, TAG_SURFACE_CLEAR, 0);
+         put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 0);
       else
-         put_tile(tx, ty, &ztile, TAG_SURFACE_CLEAR, 1);
+         put_tile(tx, ty, &spu.ztile, TAG_SURFACE_CLEAR, 1);
       /* XXX we don't want this here, but it fixes bad tile results */
    }
 
@@ -478,8 +478,8 @@ main_loop(void)
 static void
 one_time_init(void)
 {
-   memset(tile_status, TILE_STATUS_DEFINED, sizeof(tile_status));
-   memset(tile_status_z, TILE_STATUS_DEFINED, sizeof(tile_status_z));
+   memset(spu.ctile_status, TILE_STATUS_DEFINED, sizeof(spu.ctile_status));
+   memset(spu.ztile_status, TILE_STATUS_DEFINED, sizeof(spu.ztile_status));
    invalidate_tex_cache();
 }
 
diff --git a/src/mesa/pipe/cell/spu/spu_main.h b/src/mesa/pipe/cell/spu/spu_main.h
index 8be5268f52..cce5e70802 100644
--- a/src/mesa/pipe/cell/spu/spu_main.h
+++ b/src/mesa/pipe/cell/spu/spu_main.h
@@ -36,6 +36,11 @@
 #include "pipe/p_state.h"
 
 
+
+#define MAX_WIDTH 1024
+#define MAX_HEIGHT 1024
+
+
 typedef union
 {
    vector float v;
@@ -43,6 +48,21 @@ typedef union
 } float4;
 
 
+typedef union {
+   ushort us[TILE_SIZE][TILE_SIZE];
+   uint   ui[TILE_SIZE][TILE_SIZE];
+   vector unsigned short us8[TILE_SIZE/2][TILE_SIZE/4];
+   vector unsigned int ui4[TILE_SIZE/2][TILE_SIZE/2];
+} tile_t;
+
+
+#define TILE_STATUS_CLEAR   1
+#define TILE_STATUS_DEFINED 2  /**< defined in FB, but not in local store */
+#define TILE_STATUS_CLEAN   3  /**< in local store, but not changed */
+#define TILE_STATUS_DIRTY   4  /**< modified locally, but not put back yet */
+#define TILE_STATUS_GETTING 5  /**< mfc_get() called but not yet arrived */
+
+
 struct spu_framebuffer {
    void *color_start;              /**< addr of color surface in main memory */
    void *depth_start;              /**< addr of depth surface in main memory */
@@ -75,6 +95,18 @@ struct spu_global
 
    /* XXX more state to come */
 
+
+   /** current color and Z tiles */
+   tile_t ctile ALIGN16_ATTRIB;
+   tile_t ztile ALIGN16_ATTRIB;
+
+   /** Current tiles' status */
+   ubyte cur_ctile_status, cur_ztile_status;
+
+   /** Status of all tiles in framebuffer */
+   ubyte ctile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
+   ubyte ztile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
+
 } ALIGN16_ATTRIB;
 
 
diff --git a/src/mesa/pipe/cell/spu/spu_render.c b/src/mesa/pipe/cell/spu/spu_render.c
index ca54a103bd..ab711d67fe 100644
--- a/src/mesa/pipe/cell/spu/spu_render.c
+++ b/src/mesa/pipe/cell/spu/spu_render.c
@@ -95,15 +95,15 @@ static INLINE void
 get_cz_tiles(uint tx, uint ty)
 {
    if (spu.depth_stencil.depth.enabled) {
-      if (cur_tile_status_z != TILE_STATUS_CLEAR) {
-         get_tile(tx, ty, &ztile, TAG_READ_TILE_Z, 1);
-         cur_tile_status_z = TILE_STATUS_GETTING;
+      if (spu.cur_ztile_status != TILE_STATUS_CLEAR) {
+         get_tile(tx, ty, &spu.ztile, TAG_READ_TILE_Z, 1);
+         spu.cur_ztile_status = TILE_STATUS_GETTING;
       }
    }
 
-   if (cur_tile_status_c != TILE_STATUS_CLEAR) {
-      get_tile(tx, ty, &ctile, TAG_READ_TILE_COLOR, 0);
-      cur_tile_status_c = TILE_STATUS_GETTING;
+   if (spu.cur_ctile_status != TILE_STATUS_CLEAR) {
+      get_tile(tx, ty, &spu.ctile, TAG_READ_TILE_COLOR, 0);
+      spu.cur_ctile_status = TILE_STATUS_GETTING;
    }
 }
 
@@ -114,24 +114,24 @@ get_cz_tiles(uint tx, uint ty)
 static INLINE void
 put_cz_tiles(uint tx, uint ty)
 {
-   if (cur_tile_status_z == TILE_STATUS_DIRTY) {
+   if (spu.cur_ztile_status == TILE_STATUS_DIRTY) {
       /* tile was modified and needs to be written back */
-      put_tile(tx, ty, &ztile, TAG_WRITE_TILE_Z, 1);
-      cur_tile_status_z = TILE_STATUS_DEFINED;
+      put_tile(tx, ty, &spu.ztile, TAG_WRITE_TILE_Z, 1);
+      spu.cur_ztile_status = TILE_STATUS_DEFINED;
    }
-   else if (cur_tile_status_z == TILE_STATUS_GETTING) {
+   else if (spu.cur_ztile_status == TILE_STATUS_GETTING) {
       /* tile was never used */
-      cur_tile_status_z = TILE_STATUS_DEFINED;
+      spu.cur_ztile_status = TILE_STATUS_DEFINED;
    }
 
-   if (cur_tile_status_c == TILE_STATUS_DIRTY) {
+   if (spu.cur_ctile_status == TILE_STATUS_DIRTY) {
       /* tile was modified and needs to be written back */
-      put_tile(tx, ty, &ctile, TAG_WRITE_TILE_COLOR, 0);
-      cur_tile_status_c = TILE_STATUS_DEFINED;
+      put_tile(tx, ty, &spu.ctile, TAG_WRITE_TILE_COLOR, 0);
+      spu.cur_ctile_status = TILE_STATUS_DEFINED;
    }
-   else if (cur_tile_status_c == TILE_STATUS_GETTING) {
+   else if (spu.cur_ctile_status == TILE_STATUS_GETTING) {
       /* tile was never used */
-      cur_tile_status_c = TILE_STATUS_DEFINED;
+      spu.cur_ctile_status = TILE_STATUS_DEFINED;
    }
 }
 
@@ -250,8 +250,8 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
       if (!my_tile(tx, ty))
          continue;
 
-      cur_tile_status_c = tile_status[ty][tx];
-      cur_tile_status_z = tile_status_z[ty][tx];
+      spu.cur_ctile_status = spu.ctile_status[ty][tx];
+      spu.cur_ztile_status = spu.ztile_status[ty][tx];
 
       get_cz_tiles(tx, ty);
 
@@ -275,8 +275,8 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
 
       wait_put_cz_tiles(); /* XXX seems unnecessary... */
 
-      tile_status[ty][tx] = cur_tile_status_c;
-      tile_status_z[ty][tx] = cur_tile_status_z;
+      spu.ctile_status[ty][tx] = spu.cur_ctile_status;
+      spu.ztile_status[ty][tx] = spu.cur_ztile_status;
    }
 
    if (Debug)
diff --git a/src/mesa/pipe/cell/spu/spu_tile.c b/src/mesa/pipe/cell/spu/spu_tile.c
index fd65c2b49c..12dc246328 100644
--- a/src/mesa/pipe/cell/spu/spu_tile.c
+++ b/src/mesa/pipe/cell/spu/spu_tile.c
@@ -28,16 +28,7 @@
 
 
 #include "spu_tile.h"
-
-
-
-tile_t ctile ALIGN16_ATTRIB;
-tile_t ztile ALIGN16_ATTRIB;
-
-ubyte tile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
-ubyte tile_status_z[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
-
-ubyte cur_tile_status_c, cur_tile_status_z;
+#include "spu_main.h"
 
 
 void
diff --git a/src/mesa/pipe/cell/spu/spu_tile.h b/src/mesa/pipe/cell/spu/spu_tile.h
index 85a0d55807..e53340a55a 100644
--- a/src/mesa/pipe/cell/spu/spu_tile.h
+++ b/src/mesa/pipe/cell/spu/spu_tile.h
@@ -35,33 +35,6 @@
 #include "pipe/cell/common.h"
 
 
-#define MAX_WIDTH 1024
-#define MAX_HEIGHT 1024
-
-
-typedef union {
-   ushort us[TILE_SIZE][TILE_SIZE];
-   uint   ui[TILE_SIZE][TILE_SIZE];
-   vector unsigned short us8[TILE_SIZE/2][TILE_SIZE/4];
-   vector unsigned int ui4[TILE_SIZE/2][TILE_SIZE/2];
-} tile_t;
-
-
-extern tile_t ctile ALIGN16_ATTRIB;
-extern tile_t ztile ALIGN16_ATTRIB;
-
-
-#define TILE_STATUS_CLEAR   1
-#define TILE_STATUS_DEFINED 2  /**< defined in FB, but not in local store */
-#define TILE_STATUS_CLEAN   3  /**< in local store, but not changed */
-#define TILE_STATUS_DIRTY   4  /**< modified locally, but not put back yet */
-#define TILE_STATUS_GETTING 5  /**< mfc_get() called but not yet arrived */
-
-extern ubyte tile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
-extern ubyte tile_status_z[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
-
-extern ubyte cur_tile_status_c, cur_tile_status_z;
-
 
 void
 get_tile(uint tx, uint ty, tile_t *tile, int tag, int zBuf);
diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index ae8fd17cc6..6f61a3d816 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -283,21 +283,21 @@ do_depth_test(int x, int y, mask_t quadmask)
 
    zvals.v = eval_z((float) x, (float) y);
 
-   if (cur_tile_status_c == TILE_STATUS_CLEAR) {
+   if (spu.cur_ctile_status == TILE_STATUS_CLEAR) {
       /* now, _really_ clear the tile */
-      clear_z_tile(&ztile);
-      cur_tile_status_z = TILE_STATUS_DIRTY;
+      clear_z_tile(&spu.ztile);
+      spu.cur_ztile_status = TILE_STATUS_DIRTY;
    }
 
    if (spu.fb.depth_format == PIPE_FORMAT_Z16_UNORM) {
       int ix = (x - setup.cliprect_minx) / 4;
       int iy = (y - setup.cliprect_miny) / 2;
-      mask = spu_z16_test_less(zvals.v, &ztile.us8[iy][ix], x>>1, quadmask);
+      mask = spu_z16_test_less(zvals.v, &spu.ztile.us8[iy][ix], x>>1, quadmask);
    }
    else {
       int ix = (x - setup.cliprect_minx) / 2;
       int iy = (y - setup.cliprect_miny) / 2;
-      mask = spu_z32_test_less(zvals.v, &ztile.ui4[iy][ix], quadmask);
+      mask = spu_z32_test_less(zvals.v, &spu.ztile.ui4[iy][ix], quadmask);
    }
    return mask;
 }
@@ -341,25 +341,25 @@ emit_quad( int x, int y, mask_t mask )
          pack_colors(colors, fcolors);
       }
 
-      if (cur_tile_status_c == TILE_STATUS_CLEAR) {
+      if (spu.cur_ctile_status == TILE_STATUS_CLEAR) {
          /* now, _really_ clear the tile */
-         clear_c_tile(&ctile);
+         clear_c_tile(&spu.ctile);
       }
-      cur_tile_status_c = TILE_STATUS_DIRTY;
+      spu.cur_ctile_status = TILE_STATUS_DIRTY;
 
 #if 1
       if (spu_extract(mask, 0))
-         ctile.ui[iy][ix] = colors[QUAD_TOP_LEFT];
+         spu.ctile.ui[iy][ix] = colors[QUAD_TOP_LEFT];
       if (spu_extract(mask, 1))
-         ctile.ui[iy][ix+1] = colors[QUAD_TOP_RIGHT];
+         spu.ctile.ui[iy][ix+1] = colors[QUAD_TOP_RIGHT];
       if (spu_extract(mask, 2))
-         ctile.ui[iy+1][ix] = colors[QUAD_BOTTOM_LEFT];
+         spu.ctile.ui[iy+1][ix] = colors[QUAD_BOTTOM_LEFT];
       if (spu_extract(mask, 3))
-         ctile.ui[iy+1][ix+1] = colors[QUAD_BOTTOM_RIGHT];
+         spu.ctile.ui[iy+1][ix+1] = colors[QUAD_BOTTOM_RIGHT];
 #else
       /* SIMD_Z with swizzled color buffer (someday) */
       vector unsigned int uicolors = *((vector unsigned int *) &colors);
-      ctile.ui4[iy/2][ix/2] = spu_sel(ctile.ui4[iy/2][ix/2], uicolors, mask);
+      spu.ctile.ui4[iy/2][ix/2] = spu_sel(spu.ctile.ui4[iy/2][ix/2], uicolors, mask);
 #endif
    }
 
@@ -846,21 +846,21 @@ tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty)
 
    /*   init_constant_attribs( setup ); */
       
-   if (cur_tile_status_c == TILE_STATUS_GETTING) {
+   if (spu.cur_ctile_status == TILE_STATUS_GETTING) {
       /* wait for mfc_get() to complete */
       wait_on_mask(1 << TAG_READ_TILE_COLOR);
-      cur_tile_status_c = TILE_STATUS_CLEAN;
+      spu.cur_ctile_status = TILE_STATUS_CLEAN;
    }
 
-   ASSERT(cur_tile_status_c != TILE_STATUS_DEFINED);
+   ASSERT(spu.cur_ctile_status != TILE_STATUS_DEFINED);
 
    if (spu.depth_stencil.depth.enabled) {
-      if (cur_tile_status_z == TILE_STATUS_GETTING) {
+      if (spu.cur_ztile_status == TILE_STATUS_GETTING) {
          /* wait for mfc_get() to complete */
          wait_on_mask(1 << TAG_READ_TILE_Z);
-         cur_tile_status_z = TILE_STATUS_CLEAN;
+         spu.cur_ztile_status = TILE_STATUS_CLEAN;
       }
-   ASSERT(cur_tile_status_z != TILE_STATUS_DEFINED);
+      ASSERT(spu.cur_ztile_status != TILE_STATUS_DEFINED);
    }
 
 
-- 
cgit v1.2.3


From d7c2eb0df47bd79291172727539b99331a3c6724 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Fri, 1 Feb 2008 15:45:02 -0700
Subject: Cell: New color packing functions (A8R8G8B8 and B8G8R8A8)

---
 src/mesa/pipe/cell/spu/spu_colorpack.h | 60 ++++++++++++++++++++++++++++++++++
 src/mesa/pipe/cell/spu/spu_tri.c       | 22 +++++--------
 2 files changed, 69 insertions(+), 13 deletions(-)
 create mode 100644 src/mesa/pipe/cell/spu/spu_colorpack.h

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/spu/spu_colorpack.h b/src/mesa/pipe/cell/spu/spu_colorpack.h
new file mode 100644
index 0000000000..56709bd9f3
--- /dev/null
+++ b/src/mesa/pipe/cell/spu/spu_colorpack.h
@@ -0,0 +1,60 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+
+#ifndef SPU_COLORPACK_H
+#define SPU_COLORPACK_H
+
+
+#include <vec_literal.h>
+#include <spu_intrinsics.h>
+
+
+static INLINE unsigned int
+spu_pack_A8R8G8B8(vector float rgba)
+{
+  vector unsigned int out = spu_convtu(rgba, 32);
+  out = spu_shuffle(out, out, VEC_LITERAL(vector unsigned char,
+					  12, 0, 4, 8, 0, 0, 0, 0, 
+                                          0, 0, 0, 0, 0, 0, 0, 0));
+  return spu_extract(out, 0);
+}
+
+
+static INLINE unsigned int
+spu_pack_B8G8R8A8(vector float rgba)
+{
+  vector unsigned int out = spu_convtu(rgba, 32);
+  out = spu_shuffle(out, out, VEC_LITERAL(vector unsigned char,
+					  8, 4, 0, 12, 0, 0, 0, 0, 
+                                          0, 0, 0, 0, 0, 0, 0, 0));
+  return spu_extract(out, 0);
+}
+
+
+#endif /* SPU_COLORPACK_H */
diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index 6f61a3d816..c82ca51000 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -29,11 +29,10 @@
  * Triangle rendering within a tile.
  */
 
-#include <pack_rgba8.h>
-
 #include "pipe/p_compiler.h"
 #include "pipe/p_format.h"
 #include "pipe/p_util.h"
+#include "spu_colorpack.h"
 #include "spu_main.h"
 #include "spu_texture.h"
 #include "spu_tile.h"
@@ -253,21 +252,18 @@ eval_z(float x, float y)
 static INLINE void
 pack_colors(uint uicolors[4], const float4 fcolors[4])
 {
-   /* XXX grab the code for _pack_rgba8() and use the shuffle
-    * command to do the swizzling seen here.
-    */
    switch (spu.fb.color_format) {
    case PIPE_FORMAT_A8R8G8B8_UNORM:
-      uicolors[0] = _pack_rgba8(fcolors[0].f[3], fcolors[0].f[0], fcolors[0].f[1], fcolors[0].f[2]);
-      uicolors[1] = _pack_rgba8(fcolors[1].f[3], fcolors[1].f[0], fcolors[1].f[1], fcolors[1].f[2]);
-      uicolors[2] = _pack_rgba8(fcolors[2].f[3], fcolors[2].f[0], fcolors[2].f[1], fcolors[2].f[2]);
-      uicolors[3] = _pack_rgba8(fcolors[3].f[3], fcolors[0].f[0], fcolors[3].f[1], fcolors[3].f[2]);
+      uicolors[0] = spu_pack_A8R8G8B8(fcolors[0].v);
+      uicolors[1] = spu_pack_A8R8G8B8(fcolors[1].v);
+      uicolors[2] = spu_pack_A8R8G8B8(fcolors[2].v);
+      uicolors[3] = spu_pack_A8R8G8B8(fcolors[3].v);
       break;
    case PIPE_FORMAT_B8G8R8A8_UNORM:
-      uicolors[0] = _pack_rgba8(fcolors[0].f[2], fcolors[0].f[1], fcolors[0].f[0], fcolors[0].f[3]);
-      uicolors[1] = _pack_rgba8(fcolors[1].f[2], fcolors[1].f[1], fcolors[1].f[0], fcolors[1].f[3]);
-      uicolors[2] = _pack_rgba8(fcolors[2].f[2], fcolors[2].f[1], fcolors[2].f[0], fcolors[2].f[3]);
-      uicolors[3] = _pack_rgba8(fcolors[3].f[2], fcolors[3].f[1], fcolors[3].f[0], fcolors[3].f[3]);
+      uicolors[0] = spu_pack_B8G8R8A8(fcolors[0].v);
+      uicolors[1] = spu_pack_B8G8R8A8(fcolors[1].v);
+      uicolors[2] = spu_pack_B8G8R8A8(fcolors[2].v);
+      uicolors[3] = spu_pack_B8G8R8A8(fcolors[3].v);
       break;
    default:
       ASSERT(0);
-- 
cgit v1.2.3


From 8bd566a9cb8bb01ef5ce9c526047bafc0fbf0aef Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Fri, 1 Feb 2008 16:25:42 -0700
Subject: Cell: use global color_shuffle to remove a switch statement

---
 src/mesa/pipe/cell/spu/Makefile        |  2 +
 src/mesa/pipe/cell/spu/spu_colorpack.h |  9 ++++
 src/mesa/pipe/cell/spu/spu_main.c      | 12 ++++++
 src/mesa/pipe/cell/spu/spu_main.h      |  3 ++
 src/mesa/pipe/cell/spu/spu_tri.c       | 76 ++++++++++++++--------------------
 5 files changed, 56 insertions(+), 46 deletions(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/spu/Makefile b/src/mesa/pipe/cell/spu/Makefile
index 2d031bfbc6..91a631b699 100644
--- a/src/mesa/pipe/cell/spu/Makefile
+++ b/src/mesa/pipe/cell/spu/Makefile
@@ -8,6 +8,8 @@ TOP = ../../../../..
 include $(TOP)/configs/linux-cell
 
 
+OPT_FLAGS=-g
+OPT_FLAGS=-O3
 PROG = g3d
 
 PROG_SPU = $(PROG)_spu
diff --git a/src/mesa/pipe/cell/spu/spu_colorpack.h b/src/mesa/pipe/cell/spu/spu_colorpack.h
index 56709bd9f3..9977a6ece0 100644
--- a/src/mesa/pipe/cell/spu/spu_colorpack.h
+++ b/src/mesa/pipe/cell/spu/spu_colorpack.h
@@ -57,4 +57,13 @@ spu_pack_B8G8R8A8(vector float rgba)
 }
 
 
+static INLINE unsigned int
+spu_pack_color_shuffle(vector float rgba, vector unsigned char shuffle)
+{
+  vector unsigned int out = spu_convtu(rgba, 32);
+  out = spu_shuffle(out, out, shuffle);
+  return spu_extract(out, 0);
+}
+
+
 #endif /* SPU_COLORPACK_H */
diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index 8e3987f6ef..ba4d180cc0 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -31,6 +31,7 @@
 
 #include <stdio.h>
 #include <libmisc.h>
+#include <vec_literal.h>
 
 #include "spu_main.h"
 #include "spu_render.h"
@@ -217,6 +218,17 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
       spu.fb.zsize = 2;
    else
       spu.fb.zsize = 0;
+
+   if (spu.fb.color_format == PIPE_FORMAT_A8R8G8B8_UNORM)
+      spu.color_shuffle = VEC_LITERAL(vector unsigned char,
+                                      12, 0, 4, 8, 0, 0, 0, 0, 
+                                      0, 0, 0, 0, 0, 0, 0, 0);
+   else if (spu.fb.color_format == PIPE_FORMAT_B8G8R8A8_UNORM)
+      spu.color_shuffle = VEC_LITERAL(vector unsigned char,
+                                      8, 4, 0, 12, 0, 0, 0, 0, 
+                                      0, 0, 0, 0, 0, 0, 0, 0);
+   else
+      ASSERT(0);
 }
 
 
diff --git a/src/mesa/pipe/cell/spu/spu_main.h b/src/mesa/pipe/cell/spu/spu_main.h
index cce5e70802..7a12715b0b 100644
--- a/src/mesa/pipe/cell/spu/spu_main.h
+++ b/src/mesa/pipe/cell/spu/spu_main.h
@@ -107,6 +107,9 @@ struct spu_global
    ubyte ctile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
    ubyte ztile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
 
+
+   /** for converting RGBA to PIPE_FORMAT_x colors */
+   vector unsigned char color_shuffle;
 } ALIGN16_ATTRIB;
 
 
diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index c82ca51000..165e41a781 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -249,28 +249,6 @@ eval_z(float x, float y)
 }
 
 
-static INLINE void
-pack_colors(uint uicolors[4], const float4 fcolors[4])
-{
-   switch (spu.fb.color_format) {
-   case PIPE_FORMAT_A8R8G8B8_UNORM:
-      uicolors[0] = spu_pack_A8R8G8B8(fcolors[0].v);
-      uicolors[1] = spu_pack_A8R8G8B8(fcolors[1].v);
-      uicolors[2] = spu_pack_A8R8G8B8(fcolors[2].v);
-      uicolors[3] = spu_pack_A8R8G8B8(fcolors[3].v);
-      break;
-   case PIPE_FORMAT_B8G8R8A8_UNORM:
-      uicolors[0] = spu_pack_B8G8R8A8(fcolors[0].v);
-      uicolors[1] = spu_pack_B8G8R8A8(fcolors[1].v);
-      uicolors[2] = spu_pack_B8G8R8A8(fcolors[2].v);
-      uicolors[3] = spu_pack_B8G8R8A8(fcolors[3].v);
-      break;
-   default:
-      ASSERT(0);
-   }
-}
-
-
 static INLINE mask_t
 do_depth_test(int x, int y, mask_t quadmask)
 {
@@ -321,38 +299,44 @@ emit_quad( int x, int y, mask_t mask )
    if (spu_extract(spu_orx(mask), 0)) {
       const int ix = x - setup.cliprect_minx;
       const int iy = y - setup.cliprect_miny;
-      uint colors[4];  /* indexed by QUAD_x */
+
+      if (spu.cur_ctile_status == TILE_STATUS_CLEAR) {
+         /* now, _really_ clear the tile */
+         clear_c_tile(&spu.ctile);
+      }
+      spu.cur_ctile_status = TILE_STATUS_DIRTY;
 
       if (spu.texture.start) {
+         /* texture mapping */
          float4 texcoords[4];
-         uint i;
          eval_coeff(2, (float) x, (float) y, texcoords);
-         for (i = 0; i < 4; i++) {
-            colors[i] = sample_texture(texcoords[i]);
-         }
+
+         if (spu_extract(mask, 0))
+            spu.ctile.ui[iy][ix] = sample_texture(texcoords[0]);
+         if (spu_extract(mask, 1))
+            spu.ctile.ui[iy][ix+1] = sample_texture(texcoords[1]);
+         if (spu_extract(mask, 2))
+            spu.ctile.ui[iy+1][ix] = sample_texture(texcoords[2]);
+         if (spu_extract(mask, 3))
+            spu.ctile.ui[iy+1][ix+1] = sample_texture(texcoords[3]);
       }
       else {
-         float4 fcolors[4];
-         eval_coeff(1, (float) x, (float) y, fcolors);
-         pack_colors(colors, fcolors);
+         /* simple shading */
+         const vector unsigned char shuffle = spu.color_shuffle;
+         float4 colors[4];
+         eval_coeff(1, (float) x, (float) y, colors);
+
+         if (spu_extract(mask, 0))
+            spu.ctile.ui[iy][ix] = spu_pack_color_shuffle(colors[0].v, shuffle);
+         if (spu_extract(mask, 1))
+            spu.ctile.ui[iy][ix+1] = spu_pack_color_shuffle(colors[1].v, shuffle);
+         if (spu_extract(mask, 2))
+            spu.ctile.ui[iy+1][ix] = spu_pack_color_shuffle(colors[2].v, shuffle);
+         if (spu_extract(mask, 3))
+            spu.ctile.ui[iy+1][ix+1] = spu_pack_color_shuffle(colors[3].v, shuffle);
       }
 
-      if (spu.cur_ctile_status == TILE_STATUS_CLEAR) {
-         /* now, _really_ clear the tile */
-         clear_c_tile(&spu.ctile);
-      }
-      spu.cur_ctile_status = TILE_STATUS_DIRTY;
-
-#if 1
-      if (spu_extract(mask, 0))
-         spu.ctile.ui[iy][ix] = colors[QUAD_TOP_LEFT];
-      if (spu_extract(mask, 1))
-         spu.ctile.ui[iy][ix+1] = colors[QUAD_TOP_RIGHT];
-      if (spu_extract(mask, 2))
-         spu.ctile.ui[iy+1][ix] = colors[QUAD_BOTTOM_LEFT];
-      if (spu_extract(mask, 3))
-         spu.ctile.ui[iy+1][ix+1] = colors[QUAD_BOTTOM_RIGHT];
-#else
+#if 0
       /* SIMD_Z with swizzled color buffer (someday) */
       vector unsigned int uicolors = *((vector unsigned int *) &colors);
       spu.ctile.ui4[iy/2][ix/2] = spu_sel(spu.ctile.ui4[iy/2][ix/2], uicolors, mask);
-- 
cgit v1.2.3


From 44d32693562e2fb83572bd10e4d489a7cb6f74f3 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Fri, 1 Feb 2008 16:42:09 -0700
Subject: Cell: move some tile get/clear code

Also, we weren't marking the ztile as dirty after Z testing; this fixes glitches in gears.
---
 src/mesa/pipe/cell/spu/spu_tri.c | 27 ++++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index 165e41a781..f0758c42e7 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -257,12 +257,6 @@ do_depth_test(int x, int y, mask_t quadmask)
 
    zvals.v = eval_z((float) x, (float) y);
 
-   if (spu.cur_ctile_status == TILE_STATUS_CLEAR) {
-      /* now, _really_ clear the tile */
-      clear_z_tile(&spu.ztile);
-      spu.cur_ztile_status = TILE_STATUS_DIRTY;
-   }
-
    if (spu.fb.depth_format == PIPE_FORMAT_Z16_UNORM) {
       int ix = (x - setup.cliprect_minx) / 4;
       int iy = (y - setup.cliprect_miny) / 2;
@@ -273,6 +267,10 @@ do_depth_test(int x, int y, mask_t quadmask)
       int iy = (y - setup.cliprect_miny) / 2;
       mask = spu_z32_test_less(zvals.v, &spu.ztile.ui4[iy][ix], quadmask);
    }
+
+   if (spu_extract(spu_orx(mask), 0))
+      spu.cur_ztile_status = TILE_STATUS_DIRTY;
+
    return mask;
 }
 
@@ -300,10 +298,6 @@ emit_quad( int x, int y, mask_t mask )
       const int ix = x - setup.cliprect_minx;
       const int iy = y - setup.cliprect_miny;
 
-      if (spu.cur_ctile_status == TILE_STATUS_CLEAR) {
-         /* now, _really_ clear the tile */
-         clear_c_tile(&spu.ctile);
-      }
       spu.cur_ctile_status = TILE_STATUS_DIRTY;
 
       if (spu.texture.start) {
@@ -408,6 +402,18 @@ static void flush_spans( void )
       return;
    }
 
+
+   /* _really_ clear tiles now if needed */
+   if (spu.cur_ctile_status == TILE_STATUS_CLEAR) {
+      clear_c_tile(&spu.ctile);
+      spu.cur_ctile_status = TILE_STATUS_DIRTY;
+   }
+   if (spu.depth_stencil.depth.enabled &&
+       spu.cur_ztile_status == TILE_STATUS_CLEAR) {
+      clear_z_tile(&spu.ztile);
+      spu.cur_ztile_status = TILE_STATUS_DIRTY;
+   }
+
    /* XXX this loop could be moved into the above switch cases and
     * calculate_mask() could be simplified a bit...
     */
@@ -831,7 +837,6 @@ tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty)
       wait_on_mask(1 << TAG_READ_TILE_COLOR);
       spu.cur_ctile_status = TILE_STATUS_CLEAN;
    }
-
    ASSERT(spu.cur_ctile_status != TILE_STATUS_DEFINED);
 
    if (spu.depth_stencil.depth.enabled) {
-- 
cgit v1.2.3


From aa761b160520479efcf09d12ae4a161fc2f872f7 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Fri, 1 Feb 2008 16:54:46 -0700
Subject: Cell: comment about emit_quad() mask

---
 src/mesa/pipe/cell/spu/spu_tri.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index f0758c42e7..83bb247b22 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -277,6 +277,9 @@ do_depth_test(int x, int y, mask_t quadmask)
 
 /**
  * Emit a quad (pass to next stage).  No clipping is done.
+ * Note: about 1/5 to 1/7 of the time, mask is zero and this function
+ * should be skipped.  But adding the test for that slows things down
+ * overall.
  */
 static INLINE void
 emit_quad( int x, int y, mask_t mask )
-- 
cgit v1.2.3


From 69cc19751dd0122116cab03d808d5a1f5d0ade84 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 4 Feb 2008 08:45:33 -0700
Subject: Cell: insert some draw_flush() calls

---
 src/mesa/pipe/cell/ppu/cell_state_blend.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/ppu/cell_state_blend.c b/src/mesa/pipe/cell/ppu/cell_state_blend.c
index 34ae0128ea..2c19aa3971 100644
--- a/src/mesa/pipe/cell/ppu/cell_state_blend.c
+++ b/src/mesa/pipe/cell/ppu/cell_state_blend.c
@@ -29,6 +29,7 @@
  */
 
 #include "pipe/p_util.h"
+#include "pipe/draw/draw_context.h"
 #include "cell_context.h"
 #include "cell_state.h"
 
@@ -49,6 +50,8 @@ cell_bind_blend_state(struct pipe_context *pipe, void *blend)
 {
    struct cell_context *cell = cell_context(pipe);
 
+   draw_flush(cell->draw);
+
    cell->blend = (const struct pipe_blend_state *)blend;
 
    cell->dirty |= CELL_NEW_BLEND;
@@ -68,6 +71,8 @@ cell_set_blend_color(struct pipe_context *pipe,
 {
    struct cell_context *cell = cell_context(pipe);
 
+   draw_flush(cell->draw);
+
    cell->blend_color = *blend_color;
 
    cell->dirty |= CELL_NEW_BLEND;
@@ -93,6 +98,8 @@ cell_bind_depth_stencil_alpha_state(struct pipe_context *pipe,
 {
    struct cell_context *cell = cell_context(pipe);
 
+   draw_flush(cell->draw);
+
    cell->depth_stencil
       = (const struct pipe_depth_stencil_alpha_state *) depth_stencil;
 
-- 
cgit v1.2.3


From 6023311c7ce336f727d7aa6d5266e88a55b88d36 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 4 Feb 2008 08:46:44 -0700
Subject: Cell: clamp txmax, tymax in tile_bounding_box()

Also, added some commented-out debug printfs.
---
 src/mesa/pipe/cell/spu/spu_render.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/spu/spu_render.c b/src/mesa/pipe/cell/spu/spu_render.c
index ab711d67fe..e8705eeeba 100644
--- a/src/mesa/pipe/cell/spu/spu_render.c
+++ b/src/mesa/pipe/cell/spu/spu_render.c
@@ -65,6 +65,10 @@ tile_bounding_box(const struct cell_command_render *render,
    *tymin = (uint) render->ymin / TILE_SIZE;
    txmax = (uint) render->xmax / TILE_SIZE;
    tymax = (uint) render->ymax / TILE_SIZE;
+   if (txmax >= spu.fb.width_tiles)
+      txmax = spu.fb.width_tiles-1;
+   if (tymax >= spu.fb.height_tiles)
+      tymax = spu.fb.height_tiles-1;
    *box_width_tiles = txmax - *txmin + 1;
    box_height_tiles = tymax - *tymin + 1;
    *box_num_tiles = *box_width_tiles * box_height_tiles;
@@ -96,12 +100,14 @@ get_cz_tiles(uint tx, uint ty)
 {
    if (spu.depth_stencil.depth.enabled) {
       if (spu.cur_ztile_status != TILE_STATUS_CLEAR) {
+         //printf("SPU %u: getting Z tile %u, %u\n", spu.init.id, tx, ty);
          get_tile(tx, ty, &spu.ztile, TAG_READ_TILE_Z, 1);
          spu.cur_ztile_status = TILE_STATUS_GETTING;
       }
    }
 
    if (spu.cur_ctile_status != TILE_STATUS_CLEAR) {
+      //printf("SPU %u: getting C tile %u, %u\n", spu.init.id, tx, ty);
       get_tile(tx, ty, &spu.ctile, TAG_READ_TILE_COLOR, 0);
       spu.cur_ctile_status = TILE_STATUS_GETTING;
    }
@@ -116,22 +122,26 @@ put_cz_tiles(uint tx, uint ty)
 {
    if (spu.cur_ztile_status == TILE_STATUS_DIRTY) {
       /* tile was modified and needs to be written back */
+      //printf("SPU %u: put dirty Z tile %u, %u\n", spu.init.id, tx, ty);
       put_tile(tx, ty, &spu.ztile, TAG_WRITE_TILE_Z, 1);
       spu.cur_ztile_status = TILE_STATUS_DEFINED;
    }
    else if (spu.cur_ztile_status == TILE_STATUS_GETTING) {
       /* tile was never used */
       spu.cur_ztile_status = TILE_STATUS_DEFINED;
+      //printf("SPU %u: put getting Z tile %u, %u\n", spu.init.id, tx, ty);
    }
 
    if (spu.cur_ctile_status == TILE_STATUS_DIRTY) {
       /* tile was modified and needs to be written back */
+      //printf("SPU %u: put dirty C tile %u, %u\n", spu.init.id, tx, ty);
       put_tile(tx, ty, &spu.ctile, TAG_WRITE_TILE_COLOR, 0);
       spu.cur_ctile_status = TILE_STATUS_DEFINED;
    }
    else if (spu.cur_ctile_status == TILE_STATUS_GETTING) {
       /* tile was never used */
       spu.cur_ctile_status = TILE_STATUS_DEFINED;
+      //printf("SPU %u: put getting C tile %u, %u\n", spu.init.id, tx, ty);
    }
 }
 
-- 
cgit v1.2.3


From e967a5c746f340a76b27181b4ead1035101cece3 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 4 Feb 2008 08:53:18 -0700
Subject: Cell: move tile clear code to flush_spans()

---
 src/mesa/pipe/cell/spu/spu_tri.c | 51 ++++++++++++++++++++++------------------
 1 file changed, 28 insertions(+), 23 deletions(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index 83bb247b22..3f46e75d7c 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -406,22 +406,44 @@ static void flush_spans( void )
    }
 
 
-   /* _really_ clear tiles now if needed */
-   if (spu.cur_ctile_status == TILE_STATUS_CLEAR) {
+   /* OK, we're very likely to need the tile data now.
+    * clear or finish waiting if needed.
+    */
+   if (spu.cur_ctile_status == TILE_STATUS_GETTING) {
+      /* wait for mfc_get() to complete */
+      //printf("SPU: %u: waiting for ctile\n", spu.init.id);
+      wait_on_mask(1 << TAG_READ_TILE_COLOR);
+      spu.cur_ctile_status = TILE_STATUS_CLEAN;
+   }
+   else if (spu.cur_ctile_status == TILE_STATUS_CLEAR) {
+      //printf("SPU %u: clearing C tile %u, %u\n", spu.init.id, setup.tx, setup.ty);
       clear_c_tile(&spu.ctile);
       spu.cur_ctile_status = TILE_STATUS_DIRTY;
    }
-   if (spu.depth_stencil.depth.enabled &&
-       spu.cur_ztile_status == TILE_STATUS_CLEAR) {
-      clear_z_tile(&spu.ztile);
-      spu.cur_ztile_status = TILE_STATUS_DIRTY;
+   ASSERT(spu.cur_ctile_status != TILE_STATUS_DEFINED);
+
+   if (spu.depth_stencil.depth.enabled) {
+      if (spu.cur_ztile_status == TILE_STATUS_GETTING) {
+         /* wait for mfc_get() to complete */
+         //printf("SPU: %u: waiting for ztile\n", spu.init.id);
+         wait_on_mask(1 << TAG_READ_TILE_Z);
+         spu.cur_ztile_status = TILE_STATUS_CLEAN;
+      }
+      else if (spu.cur_ztile_status == TILE_STATUS_CLEAR) {
+         //printf("SPU %u: clearing Z tile %u, %u\n", spu.init.id, setup.tx, setup.ty);
+         clear_z_tile(&spu.ztile);
+         spu.cur_ztile_status = TILE_STATUS_DIRTY;
+      }
+      ASSERT(spu.cur_ztile_status != TILE_STATUS_DEFINED);
    }
 
    /* XXX this loop could be moved into the above switch cases and
     * calculate_mask() could be simplified a bit...
     */
    for (x = block(minleft); x <= block(maxright); x += 2) {
+#if 1
       emit_quad( x, setup.span.y, calculate_mask( x ) );
+#endif
    }
 
    setup.span.y = 0;
@@ -835,23 +857,6 @@ tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty)
 
    /*   init_constant_attribs( setup ); */
       
-   if (spu.cur_ctile_status == TILE_STATUS_GETTING) {
-      /* wait for mfc_get() to complete */
-      wait_on_mask(1 << TAG_READ_TILE_COLOR);
-      spu.cur_ctile_status = TILE_STATUS_CLEAN;
-   }
-   ASSERT(spu.cur_ctile_status != TILE_STATUS_DEFINED);
-
-   if (spu.depth_stencil.depth.enabled) {
-      if (spu.cur_ztile_status == TILE_STATUS_GETTING) {
-         /* wait for mfc_get() to complete */
-         wait_on_mask(1 << TAG_READ_TILE_Z);
-         spu.cur_ztile_status = TILE_STATUS_CLEAN;
-      }
-      ASSERT(spu.cur_ztile_status != TILE_STATUS_DEFINED);
-   }
-
-
    if (setup.oneoverarea < 0.0) {
       /* emaj on left:
        */
-- 
cgit v1.2.3


From 18105195a86b8294b578462febf47692832e8705 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 4 Feb 2008 09:54:21 -0700
Subject: Cell: checkpoint: start to SIMD-ize texture sampling

---
 src/mesa/pipe/cell/spu/spu_main.c    | 10 ++++++++++
 src/mesa/pipe/cell/spu/spu_main.h    |  4 ++++
 src/mesa/pipe/cell/spu/spu_texture.c | 17 ++++++++++++++---
 src/mesa/pipe/cell/spu/spu_texture.h |  2 +-
 src/mesa/pipe/cell/spu/spu_tri.c     |  8 ++++----
 5 files changed, 33 insertions(+), 8 deletions(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index ba4d180cc0..412661061a 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -263,6 +263,16 @@ cmd_state_texture(const struct cell_command_texture *texture)
              spu.init.id, texture->start, texture->width, texture->height);
 
    memcpy(&spu.texture, texture, sizeof(*texture));
+   spu.tex_size = VEC_LITERAL(vector float,
+                              spu.texture.width,
+                              spu.texture.height,
+                              0.0,
+                              0.0);
+   spu.tex_size_mask = VEC_LITERAL(vector unsigned int,
+                                   spu.texture.width - 1,
+                                   spu.texture.height - 1,
+                                   0,
+                                   0);
 }
 
 
diff --git a/src/mesa/pipe/cell/spu/spu_main.h b/src/mesa/pipe/cell/spu/spu_main.h
index 7a12715b0b..02b62ee5cd 100644
--- a/src/mesa/pipe/cell/spu/spu_main.h
+++ b/src/mesa/pipe/cell/spu/spu_main.h
@@ -110,6 +110,10 @@ struct spu_global
 
    /** for converting RGBA to PIPE_FORMAT_x colors */
    vector unsigned char color_shuffle;
+
+   vector float tex_size;
+   vector unsigned int tex_size_mask; /**< == int(size - 1) */
+
 } ALIGN16_ATTRIB;
 
 
diff --git a/src/mesa/pipe/cell/spu/spu_texture.c b/src/mesa/pipe/cell/spu/spu_texture.c
index c1dc6bfe90..1cf958806f 100644
--- a/src/mesa/pipe/cell/spu/spu_texture.c
+++ b/src/mesa/pipe/cell/spu/spu_texture.c
@@ -128,12 +128,23 @@ get_tex_tile(uint i, uint j)
  * XXX this is extremely primitive for now.
  */
 uint
-sample_texture(float4 texcoord)
+sample_texture(vector float texcoord)
 {
+#if 0
    /* wrap/repeat */
-   uint i = (uint) (texcoord.f[0] * spu.texture.width) % spu.texture.width;
-   uint j = (uint) (texcoord.f[1] * spu.texture.height) % spu.texture.height;
+   uint i = (uint) (spu_extract(texcoord, 0) * spu.texture.width) % spu.texture.width;
+   uint j = (uint) (spu_extract(texcoord, 1) * spu.texture.height) % spu.texture.height;
    uint pos = get_tex_tile(i, j);
    uint texel = tex_tiles[pos].ui[j % TILE_SIZE][i % TILE_SIZE];
    return texel;
+#else
+   vector float tc = spu_mul(texcoord, spu.tex_size);
+   vector unsigned int itc = spu_convtu(tc, 0);
+   itc = spu_and(itc, spu.tex_size_mask);
+   uint i = spu_extract(itc, 0);
+   uint j = spu_extract(itc, 1);
+   uint pos = get_tex_tile(i, j);
+   uint texel = tex_tiles[pos].ui[j % TILE_SIZE][i % TILE_SIZE];
+   return texel;
+#endif
 }
diff --git a/src/mesa/pipe/cell/spu/spu_texture.h b/src/mesa/pipe/cell/spu/spu_texture.h
index 938a42b549..5bc8e71879 100644
--- a/src/mesa/pipe/cell/spu/spu_texture.h
+++ b/src/mesa/pipe/cell/spu/spu_texture.h
@@ -37,7 +37,7 @@ invalidate_tex_cache(void);
 
 
 extern uint
-sample_texture(float4 texcoord);
+sample_texture(vector float texcoord);
 
 
 #endif /* SPU_TEXTURE_H */
diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index 3f46e75d7c..c148c75dd6 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -309,13 +309,13 @@ emit_quad( int x, int y, mask_t mask )
          eval_coeff(2, (float) x, (float) y, texcoords);
 
          if (spu_extract(mask, 0))
-            spu.ctile.ui[iy][ix] = sample_texture(texcoords[0]);
+            spu.ctile.ui[iy][ix] = sample_texture(texcoords[0].v);
          if (spu_extract(mask, 1))
-            spu.ctile.ui[iy][ix+1] = sample_texture(texcoords[1]);
+            spu.ctile.ui[iy][ix+1] = sample_texture(texcoords[1].v);
          if (spu_extract(mask, 2))
-            spu.ctile.ui[iy+1][ix] = sample_texture(texcoords[2]);
+            spu.ctile.ui[iy+1][ix] = sample_texture(texcoords[2].v);
          if (spu_extract(mask, 3))
-            spu.ctile.ui[iy+1][ix+1] = sample_texture(texcoords[3]);
+            spu.ctile.ui[iy+1][ix+1] = sample_texture(texcoords[3].v);
       }
       else {
          /* simple shading */
-- 
cgit v1.2.3


From 703a8691553386242bf3d6662c314fc35b617194 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 4 Feb 2008 11:02:47 -0700
Subject: Cell: SIMD-ize more of texture sampling

---
 src/mesa/pipe/cell/spu/spu_texture.c | 66 ++++++++++++++++--------------------
 1 file changed, 29 insertions(+), 37 deletions(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/spu/spu_texture.c b/src/mesa/pipe/cell/spu/spu_texture.c
index 1cf958806f..b52df970d0 100644
--- a/src/mesa/pipe/cell/spu/spu_texture.c
+++ b/src/mesa/pipe/cell/spu/spu_texture.c
@@ -26,6 +26,8 @@
  **************************************************************************/
 
 
+#include <vec_literal.h>
+
 #include "pipe/p_compiler.h"
 #include "spu_main.h"
 #include "spu_texture.h"
@@ -41,7 +43,7 @@
 
 static tile_t tex_tiles[CACHE_SIZE]  ALIGN16_ATTRIB;
 
-static int tex_tile_x[CACHE_SIZE], tex_tile_y[CACHE_SIZE];
+static vector unsigned int tex_tile_xy[CACHE_SIZE];
 
 
@@ -53,20 +55,19 @@ invalidate_tex_cache(void)
 {
    /* XXX memset? */
    uint i;
-   for (i = 0; i < CACHE_SIZE; i++)
-      tex_tile_x[i] = tex_tile_y[i] = -1;
+   for (i = 0; i < CACHE_SIZE; i++) {
+      tex_tile_xy[i] = VEC_LITERAL(vector unsigned int, ~0U, ~0U, ~0U, ~0U);
+   }
 }
 
 
 /**
- * Return the cache pos/index which corresponds to texel (i,j)
+ * Return the cache pos/index which corresponds to tile (tx,ty)
  */
 static INLINE uint
-cache_pos(uint i, uint j)
+cache_pos(vector unsigned int txty)
 {
-   uint tx = i / TILE_SIZE;
-   uint ty = j / TILE_SIZE;
-   uint pos = (tx + ty * 4) % CACHE_SIZE;
+   uint pos = (spu_extract(txty,0) + spu_extract(txty,1) * 4) % CACHE_SIZE;
    return pos;
 }
 
@@ -76,26 +77,28 @@ cache_pos(uint i, uint j)
  * in the cache.
  */
 static uint
-get_tex_tile(uint i, uint j)
+get_tex_tile(vector unsigned int ij)
 {
-   const int tx = i / TILE_SIZE;
-   const int ty = j / TILE_SIZE;
-   const uint pos = cache_pos(i, j);
+   /* tile address: tx,ty */
+   const vector unsigned int txty = spu_rlmask(ij, -5);  /* divide by 32 */
+   const uint pos = cache_pos(txty);
+
+   if ((spu_extract(tex_tile_xy[pos], 0) != spu_extract(txty, 0)) ||
+       (spu_extract(tex_tile_xy[pos], 1) != spu_extract(txty, 1))) {
 
-   if (tex_tile_x[pos] != tx || tex_tile_y[pos] != ty) {
       /* texture cache miss, fetch tile from main memory */
       const uint tiles_per_row = spu.texture.width / TILE_SIZE;
       const uint bytes_per_tile = sizeof(tile_t);
       const void *src = (const ubyte *) spu.texture.start
-         + (ty * tiles_per_row + tx) * bytes_per_tile;
+         + (spu_extract(txty,1) * tiles_per_row + spu_extract(txty,0)) * bytes_per_tile;
 
       printf("SPU %u: tex cache miss at %d, %d  pos=%u  old=%d,%d\n",
-             spu.init.id, tx, ty, pos,
-             tex_tile_x[pos], tex_tile_y[pos]);
-#if 0
-      printf("SPU %u: get tex tile from %p to %p\n",
-             spu.init.id, src, tex_tiles[pos].t32);
-#endif
+             spu.init.id,
+             spu_extract(txty,0),
+             spu_extract(txty,1),
+             pos,
+             spu_extract(tex_tile_xy[pos],0),
+             spu_extract(tex_tile_xy[pos],1));
 
       ASSERT_ALIGN16(tex_tiles[pos].ui);
       ASSERT_ALIGN16(src);
@@ -109,8 +112,7 @@ get_tex_tile(uint i, uint j)
 
       wait_on_mask(1 << TAG_TEXTURE_TILE);
 
-      tex_tile_x[pos] = tx;
-      tex_tile_y[pos] = ty;
+      tex_tile_xy[pos] = txty;
    }
    else {
 #if 0
@@ -130,21 +132,11 @@ get_tex_tile(uint i, uint j)
 uint
 sample_texture(vector float texcoord)
 {
-#if 0
-   /* wrap/repeat */
-   uint i = (uint) (spu_extract(texcoord, 0) * spu.texture.width) % spu.texture.width;
-   uint j = (uint) (spu_extract(texcoord, 1) * spu.texture.height) % spu.texture.height;
-   uint pos = get_tex_tile(i, j);
-   uint texel = tex_tiles[pos].ui[j % TILE_SIZE][i % TILE_SIZE];
-   return texel;
-#else
    vector float tc = spu_mul(texcoord, spu.tex_size);
-   vector unsigned int itc = spu_convtu(tc, 0);
-   itc = spu_and(itc, spu.tex_size_mask);
-   uint i = spu_extract(itc, 0);
-   uint j = spu_extract(itc, 1);
-   uint pos = get_tex_tile(i, j);
-   uint texel = tex_tiles[pos].ui[j % TILE_SIZE][i % TILE_SIZE];
+   vector unsigned int itc = spu_convtu(tc, 0);  /* convert to int */
+   itc = spu_and(itc, spu.tex_size_mask);        /* mask (GL_REPEAT) */
+   vector unsigned int ij = spu_and(itc, TILE_SIZE-1); /* intra tile addr */
+   uint pos = get_tex_tile(itc);
+   uint texel = tex_tiles[pos].ui[spu_extract(ij, 1)][spu_extract(ij, 0)];
    return texel;
-#endif
 }
-- 
cgit v1.2.3


From 9a5074217fd3be8feff2be597bb124a2a3637d0a Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 4 Feb 2008 12:50:16 -0700
Subject: Cell: added spu_unpack_color(), spu_pack_R8G8B8A8()

---
 src/mesa/pipe/cell/spu/spu_colorpack.h | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/spu/spu_colorpack.h b/src/mesa/pipe/cell/spu/spu_colorpack.h
index 9977a6ece0..0c93c06562 100644
--- a/src/mesa/pipe/cell/spu/spu_colorpack.h
+++ b/src/mesa/pipe/cell/spu/spu_colorpack.h
@@ -35,6 +35,17 @@
 #include <spu_intrinsics.h>
 
 
+static INLINE unsigned int
+spu_pack_R8G8B8A8(vector float rgba)
+{
+  vector unsigned int out = spu_convtu(rgba, 32);
+  out = spu_shuffle(out, out, VEC_LITERAL(vector unsigned char,
+					  0, 4, 8, 12, 0, 0, 0, 0, 
+                                          0, 0, 0, 0, 0, 0, 0, 0));
+  return spu_extract(out, 0);
+}
+
+
 static INLINE unsigned int
 spu_pack_A8R8G8B8(vector float rgba)
 {
@@ -66,4 +77,18 @@ spu_pack_color_shuffle(vector float rgba, vector unsigned char shuffle)
 }
 
 
+static INLINE vector float
+spu_unpack_color(uint color)
+{
+   vector unsigned int color_u4 = spu_splats(color);
+   color_u4 = spu_shuffle(color_u4, color_u4,
+                          VEC_LITERAL(vector unsigned char,
+                                      0, 0, 0, 0,
+                                      5, 5, 5, 5,
+                                      10, 10, 10, 10,
+                                      15, 15, 15, 15));
+   return spu_convtf(color_u4, 32);
+}
+
+
 #endif /* SPU_COLORPACK_H */
-- 
cgit v1.2.3


From 0a45f7594870cb7296100fb5f5d5dc82888a467d Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 4 Feb 2008 12:50:42 -0700
Subject: Cell: implement basic bilinear texture sampler

---
 src/mesa/pipe/cell/spu/spu_texture.c | 67 ++++++++++++++++++++++++++++++++++++
 src/mesa/pipe/cell/spu/spu_texture.h |  4 +++
 2 files changed, 71 insertions(+)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/spu/spu_texture.c b/src/mesa/pipe/cell/spu/spu_texture.c
index b52df970d0..26a5eefc48 100644
--- a/src/mesa/pipe/cell/spu/spu_texture.c
+++ b/src/mesa/pipe/cell/spu/spu_texture.c
@@ -32,6 +32,7 @@
 #include "spu_main.h"
 #include "spu_texture.h"
 #include "spu_tile.h"
+#include "spu_colorpack.h"
 
 
 /**
@@ -140,3 +141,69 @@ sample_texture(vector float texcoord)
    uint texel = tex_tiles[pos].ui[spu_extract(ij, 1)][spu_extract(ij, 0)];
    return texel;
 }
+
+
+uint
+sample_texture_bilinear(vector float texcoord)
+{
+   static const vector unsigned int offset10 = {1, 0, 0, 0};
+   static const vector unsigned int offset01 = {0, 1, 0, 0};
+
+   vector float tc = spu_mul(texcoord, spu.tex_size);
+   /* itcST */
+   vector unsigned int itc00 = spu_convtu(tc, 0);  /* convert to int */
+   vector unsigned int itc01 = spu_add(itc00, offset01);
+   vector unsigned int itc10 = spu_add(itc00, offset10);
+   vector unsigned int itc11 = spu_add(itc10, offset01);
+
+   itc00 = spu_and(itc00, spu.tex_size_mask);        /* mask (GL_REPEAT) */
+   itc01 = spu_and(itc01, spu.tex_size_mask);        /* mask (GL_REPEAT) */
+   itc10 = spu_and(itc10, spu.tex_size_mask);        /* mask (GL_REPEAT) */
+   itc11 = spu_and(itc11, spu.tex_size_mask);        /* mask (GL_REPEAT) */
+
+   /* intra tile addr */
+   vector unsigned int ij00 = spu_and(itc00, TILE_SIZE-1);
+   vector unsigned int ij01 = spu_and(itc01, TILE_SIZE-1);
+   vector unsigned int ij10 = spu_and(itc10, TILE_SIZE-1);
+   vector unsigned int ij11 = spu_and(itc11, TILE_SIZE-1);
+
+   uint pos00 = get_tex_tile(itc00);
+   uint pos01 = get_tex_tile(itc01);
+   uint pos10 = get_tex_tile(itc10);
+   uint pos11 = get_tex_tile(itc11);
+
+   vector float texel00 = spu_unpack_color(tex_tiles[pos00].ui[spu_extract(ij00, 1)][spu_extract(ij00, 0)]);
+   vector float texel01 = spu_unpack_color(tex_tiles[pos01].ui[spu_extract(ij01, 1)][spu_extract(ij01, 0)]);
+   vector float texel10 = spu_unpack_color(tex_tiles[pos10].ui[spu_extract(ij10, 1)][spu_extract(ij10, 0)]);
+   vector float texel11 = spu_unpack_color(tex_tiles[pos11].ui[spu_extract(ij11, 1)][spu_extract(ij11, 0)]);
+
+   /* Compute weighting factors in [0,1]
+    * Multiply texcoord by 1024, AND with 1023, convert back to float.
+    */
+   vector float tc1024 = spu_mul(tc, spu_splats(1024.0f));
+   vector signed int itc1024 = spu_convts(tc1024, 0);
+   itc1024 = spu_and(itc1024, spu_splats((1 << 10) - 1));
+   vector float weight = spu_convtf(itc1024, 10);
+
+   /* smeared frac and 1-frac */
+   vector float sfrac = spu_splats(spu_extract(weight, 0));
+   vector float tfrac = spu_splats(spu_extract(weight, 1));
+   vector float sfrac1 = spu_sub(spu_splats(1.0f), sfrac);
+   vector float tfrac1 = spu_sub(spu_splats(1.0f), tfrac);
+
+   /* multiply the samples (colors) by the S/T weights */
+   texel00 = spu_mul(spu_mul(texel00, sfrac1), tfrac1);
+   texel10 = spu_mul(spu_mul(texel10, sfrac ), tfrac1);
+   texel01 = spu_mul(spu_mul(texel01, sfrac1), tfrac );
+   texel11 = spu_mul(spu_mul(texel11, sfrac ), tfrac );
+
+   /* compute sum of weighted samples */
+   vector float texel_sum = spu_add(texel00, texel01);
+   texel_sum = spu_add(texel_sum, texel10);
+   texel_sum = spu_add(texel_sum, texel11);
+
+   /* convert to uint color */
+   uint texel = spu_pack_R8G8B8A8(texel_sum);
+
+   return texel;
+}
diff --git a/src/mesa/pipe/cell/spu/spu_texture.h b/src/mesa/pipe/cell/spu/spu_texture.h
index 5bc8e71879..25cbe9b3c6 100644
--- a/src/mesa/pipe/cell/spu/spu_texture.h
+++ b/src/mesa/pipe/cell/spu/spu_texture.h
@@ -40,4 +40,8 @@ extern uint
 sample_texture(vector float texcoord);
 
 
+extern uint
+sample_texture_bilinear(vector float texcoord);
+
+
 #endif /* SPU_TEXTURE_H */
-- 
cgit v1.2.3


From ca1d2fc5f6fb138025f6848591e3494e4b881930 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 4 Feb 2008 13:16:10 -0700
Subject: Cell: improved bilinear filtering

Avoid calling get_tex_tile() for the other three texels when all four texels are in the same tile.
---
 src/mesa/pipe/cell/spu/spu_texture.c | 27 +++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/spu/spu_texture.c b/src/mesa/pipe/cell/spu/spu_texture.c
index 26a5eefc48..6e243f7fa3 100644
--- a/src/mesa/pipe/cell/spu/spu_texture.c
+++ b/src/mesa/pipe/cell/spu/spu_texture.c
@@ -150,16 +150,17 @@ sample_texture_bilinear(vector float texcoord)
    static const vector unsigned int offset01 = {0, 1, 0, 0};
 
    vector float tc = spu_mul(texcoord, spu.tex_size);
-   /* itcST */
+   /* integer texcoords S,T: */
    vector unsigned int itc00 = spu_convtu(tc, 0);  /* convert to int */
    vector unsigned int itc01 = spu_add(itc00, offset01);
    vector unsigned int itc10 = spu_add(itc00, offset10);
    vector unsigned int itc11 = spu_add(itc10, offset01);
 
-   itc00 = spu_and(itc00, spu.tex_size_mask);        /* mask (GL_REPEAT) */
-   itc01 = spu_and(itc01, spu.tex_size_mask);        /* mask (GL_REPEAT) */
-   itc10 = spu_and(itc10, spu.tex_size_mask);        /* mask (GL_REPEAT) */
-   itc11 = spu_and(itc11, spu.tex_size_mask);        /* mask (GL_REPEAT) */
+   /* mask (GL_REPEAT) */
+   itc00 = spu_and(itc00, spu.tex_size_mask);
+   itc01 = spu_and(itc01, spu.tex_size_mask);
+   itc10 = spu_and(itc10, spu.tex_size_mask);
+   itc11 = spu_and(itc11, spu.tex_size_mask);
 
    /* intra tile addr */
    vector unsigned int ij00 = spu_and(itc00, TILE_SIZE-1);
@@ -167,11 +168,21 @@ sample_texture_bilinear(vector float texcoord)
    vector unsigned int ij10 = spu_and(itc10, TILE_SIZE-1);
    vector unsigned int ij11 = spu_and(itc11, TILE_SIZE-1);
 
+   /* get tile cache positions */
    uint pos00 = get_tex_tile(itc00);
-   uint pos01 = get_tex_tile(itc01);
-   uint pos10 = get_tex_tile(itc10);
-   uint pos11 = get_tex_tile(itc11);
+   uint pos01, pos10, pos11;
+   if ((spu_extract(ij00, 0) < TILE_SIZE-1) &&
+       (spu_extract(ij00, 1) < TILE_SIZE-1)) {
+      /* all texels are in the same tile */
+      pos01 = pos10 = pos11 = pos00;
+   }
+   else {
+      pos01 = get_tex_tile(itc01);
+      pos10 = get_tex_tile(itc10);
+      pos11 = get_tex_tile(itc11);
+   }
 
+   /* get texels from tiles and convert to float[4] */
    vector float texel00 = spu_unpack_color(tex_tiles[pos00].ui[spu_extract(ij00, 1)][spu_extract(ij00, 0)]);
    vector float texel01 = spu_unpack_color(tex_tiles[pos01].ui[spu_extract(ij01, 1)][spu_extract(ij01, 0)]);
    vector float texel10 = spu_unpack_color(tex_tiles[pos10].ui[spu_extract(ij10, 1)][spu_extract(ij10, 0)]);
-- 
cgit v1.2.3


From 8f924e4df06a5d45dda338e7a0a87308e48df57e Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 4 Feb 2008 13:23:07 -0700
Subject: Cell: choose bilinear vs. nearest filtering according to sampler
 state

---
 src/mesa/pipe/cell/spu/spu_main.c    | 4 ++++
 src/mesa/pipe/cell/spu/spu_main.h    | 2 ++
 src/mesa/pipe/cell/spu/spu_texture.c | 2 +-
 src/mesa/pipe/cell/spu/spu_texture.h | 2 +-
 src/mesa/pipe/cell/spu/spu_tri.c     | 8 ++++----
 5 files changed, 12 insertions(+), 6 deletions(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index 412661061a..48e016fc8b 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -252,6 +252,10 @@ cmd_state_sampler(const struct pipe_sampler_state *state)
              spu.init.id);
 
    memcpy(&spu.sampler[0], state, sizeof(*state));
+   if (spu.sampler[0].min_img_filter == PIPE_TEX_FILTER_LINEAR)
+      spu.sample_texture = sample_texture_bilinear;
+   else
+      spu.sample_texture = sample_texture_nearest;
 }
 
 
diff --git a/src/mesa/pipe/cell/spu/spu_main.h b/src/mesa/pipe/cell/spu/spu_main.h
index 02b62ee5cd..fb98b0d889 100644
--- a/src/mesa/pipe/cell/spu/spu_main.h
+++ b/src/mesa/pipe/cell/spu/spu_main.h
@@ -114,6 +114,8 @@ struct spu_global
    vector float tex_size;
    vector unsigned int tex_size_mask; /**< == int(size - 1) */
 
+   uint (*sample_texture)(vector float texcoord);
+
 } ALIGN16_ATTRIB;
 
 
diff --git a/src/mesa/pipe/cell/spu/spu_texture.c b/src/mesa/pipe/cell/spu/spu_texture.c
index 6e243f7fa3..ecacf2ec88 100644
--- a/src/mesa/pipe/cell/spu/spu_texture.c
+++ b/src/mesa/pipe/cell/spu/spu_texture.c
@@ -131,7 +131,7 @@ get_tex_tile(vector unsigned int ij)
  * XXX this is extremely primitive for now.
  */
 uint
-sample_texture(vector float texcoord)
+sample_texture_nearest(vector float texcoord)
 {
    vector float tc = spu_mul(texcoord, spu.tex_size);
    vector unsigned int itc = spu_convtu(tc, 0);  /* convert to int */
diff --git a/src/mesa/pipe/cell/spu/spu_texture.h b/src/mesa/pipe/cell/spu/spu_texture.h
index 25cbe9b3c6..0e000bfebf 100644
--- a/src/mesa/pipe/cell/spu/spu_texture.h
+++ b/src/mesa/pipe/cell/spu/spu_texture.h
@@ -37,7 +37,7 @@ invalidate_tex_cache(void);
 
 
 extern uint
-sample_texture(vector float texcoord);
+sample_texture_nearest(vector float texcoord);
 
 
 extern uint
diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index c148c75dd6..7b422f71a8 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -309,13 +309,13 @@ emit_quad( int x, int y, mask_t mask )
          eval_coeff(2, (float) x, (float) y, texcoords);
 
          if (spu_extract(mask, 0))
-            spu.ctile.ui[iy][ix] = sample_texture(texcoords[0].v);
+            spu.ctile.ui[iy][ix] = spu.sample_texture(texcoords[0].v);
          if (spu_extract(mask, 1))
-            spu.ctile.ui[iy][ix+1] = sample_texture(texcoords[1].v);
+            spu.ctile.ui[iy][ix+1] = spu.sample_texture(texcoords[1].v);
          if (spu_extract(mask, 2))
-            spu.ctile.ui[iy+1][ix] = sample_texture(texcoords[2].v);
+            spu.ctile.ui[iy+1][ix] = spu.sample_texture(texcoords[2].v);
          if (spu_extract(mask, 3))
-            spu.ctile.ui[iy+1][ix+1] = sample_texture(texcoords[3].v);
+            spu.ctile.ui[iy+1][ix+1] = spu.sample_texture(texcoords[3].v);
       }
       else {
          /* simple shading */
-- 
cgit v1.2.3


From 7a1d01f2a0d8f0875a265e7d4e31e1348fd82677 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 4 Feb 2008 15:06:10 -0700
Subject: Cell: emit blend state to SPUs

---
 src/mesa/pipe/cell/common.h              |  3 ++-
 src/mesa/pipe/cell/ppu/cell_state_emit.c |  6 ++++++
 src/mesa/pipe/cell/spu/spu_main.c        | 17 +++++++++++++++++
 src/mesa/pipe/cell/spu/spu_main.h        |  1 +
 4 files changed, 26 insertions(+), 1 deletion(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/common.h b/src/mesa/pipe/cell/common.h
index 7e193f31be..d861e82d33 100644
--- a/src/mesa/pipe/cell/common.h
+++ b/src/mesa/pipe/cell/common.h
@@ -85,7 +85,8 @@
 #define CELL_CMD_STATE_VERTEX_INFO   14
 #define CELL_CMD_STATE_VIEWPORT      15
 #define CELL_CMD_STATE_VS_ARRAY_INFO 16
-#define CELL_CMD_VS_EXECUTE          17
+#define CELL_CMD_STATE_BLEND         17
+#define CELL_CMD_VS_EXECUTE          18
 
 
 #define CELL_NUM_BUFFERS 4
diff --git a/src/mesa/pipe/cell/ppu/cell_state_emit.c b/src/mesa/pipe/cell/ppu/cell_state_emit.c
index 702184416b..3b2670f786 100644
--- a/src/mesa/pipe/cell/ppu/cell_state_emit.c
+++ b/src/mesa/pipe/cell/ppu/cell_state_emit.c
@@ -61,6 +61,12 @@ cell_emit_state(struct cell_context *cell)
       fb->height = cell->framebuffer.cbufs[0]->height;
    }
 
+   if (cell->dirty & CELL_NEW_BLEND) {
+      emit_state_cmd(cell, CELL_CMD_STATE_BLEND,
+                     cell->blend,
+                     sizeof(struct pipe_blend_state));
+   }
+
    if (cell->dirty & CELL_NEW_DEPTH_STENCIL) {
       emit_state_cmd(cell, CELL_CMD_STATE_DEPTH_STENCIL,
                      cell->depth_stencil,
diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index 48e016fc8b..9d8e6df0e3 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -232,6 +232,18 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
 }
 
 
+static void
+cmd_state_blend(const struct pipe_blend_state *state)
+{
+   if (Debug)
+      printf("SPU %u: BLEND: ztest %d\n",
+             spu.init.id,
+             state->blend_enable);
+
+   memcpy(&spu.blend, state, sizeof(*state));
+}
+
+
 static void
 cmd_state_depth_stencil(const struct pipe_depth_stencil_alpha_state *state)
 {
@@ -398,6 +410,11 @@ cmd_batch(uint opcode)
          cmd_finish();
          pos += 1;
          break;
+      case CELL_CMD_STATE_BLEND:
+         cmd_state_blend((struct pipe_blend_state *)
+                                 &buffer[pos+1]);
+         pos += (1 + sizeof(struct pipe_blend_state) / 4);
+         break;
       case CELL_CMD_STATE_DEPTH_STENCIL:
          cmd_state_depth_stencil((struct pipe_depth_stencil_alpha_state *)
                                  &buffer[pos+1]);
diff --git a/src/mesa/pipe/cell/spu/spu_main.h b/src/mesa/pipe/cell/spu/spu_main.h
index fb98b0d889..b22d563551 100644
--- a/src/mesa/pipe/cell/spu/spu_main.h
+++ b/src/mesa/pipe/cell/spu/spu_main.h
@@ -86,6 +86,7 @@ struct spu_global
    struct cell_init_info init;
 
    struct spu_framebuffer fb;
+   struct pipe_blend_state blend_stencil;
    struct pipe_depth_stencil_alpha_state depth_stencil;
    struct pipe_blend_state blend;
    struct pipe_sampler_state sampler[PIPE_MAX_SAMPLERS];
-- 
cgit v1.2.3


From 168247d1caee28ef577ad4c3c4308451f1193062 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 4 Feb 2008 15:10:35 -0700
Subject: Cell: replace float 4 with vector float in eval_coeff()

---
 src/mesa/pipe/cell/spu/spu_tri.c | 33 +++++++++++++++++----------------
 1 file changed, 17 insertions(+), 16 deletions(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index 7b422f71a8..199afa1aa6 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -32,6 +32,7 @@
 #include "pipe/p_compiler.h"
 #include "pipe/p_format.h"
 #include "pipe/p_util.h"
+#include "spu_blend.h"
 #include "spu_colorpack.h"
 #include "spu_main.h"
 #include "spu_texture.h"
@@ -206,14 +207,14 @@ clip_emit_quad(struct setup_stage *setup)
  * Eg: four colors will be compute.
  */
 static INLINE void
-eval_coeff(uint slot, float x, float y, float4 result[4])
+eval_coeff(uint slot, float x, float y, vector float result[4])
 {
    switch (spu.vertex_info.interp_mode[slot]) {
    case INTERP_CONSTANT:
       result[QUAD_TOP_LEFT] =
       result[QUAD_TOP_RIGHT] =
       result[QUAD_BOTTOM_LEFT] =
-      result[QUAD_BOTTOM_RIGHT] = setup.coef[slot].a0;
+      result[QUAD_BOTTOM_RIGHT] = setup.coef[slot].a0.v;
       break;
 
    case INTERP_LINEAR:
@@ -227,10 +228,10 @@ eval_coeff(uint slot, float x, float y, float4 result[4])
                       spu_add(spu_mul(spu_splats(x), dadx),
                               spu_mul(spu_splats(y), dady)));
 
-         result[QUAD_TOP_LEFT].v = topLeft;
-         result[QUAD_TOP_RIGHT].v = spu_add(topLeft, dadx);
-         result[QUAD_BOTTOM_LEFT].v = spu_add(topLeft, dady);
-         result[QUAD_BOTTOM_RIGHT].v = spu_add(spu_add(topLeft, dadx), dady);
+         result[QUAD_TOP_LEFT] = topLeft;
+         result[QUAD_TOP_RIGHT] = spu_add(topLeft, dadx);
+         result[QUAD_BOTTOM_LEFT] = spu_add(topLeft, dady);
+         result[QUAD_BOTTOM_RIGHT] = spu_add(spu_add(topLeft, dadx), dady);
       }
    }
 }
@@ -305,32 +306,32 @@ emit_quad( int x, int y, mask_t mask )
 
       if (spu.texture.start) {
          /* texture mapping */
-         float4 texcoords[4];
+         vector float texcoords[4];
          eval_coeff(2, (float) x, (float) y, texcoords);
 
          if (spu_extract(mask, 0))
-            spu.ctile.ui[iy][ix] = spu.sample_texture(texcoords[0].v);
+            spu.ctile.ui[iy][ix] = spu.sample_texture(texcoords[0]);
          if (spu_extract(mask, 1))
-            spu.ctile.ui[iy][ix+1] = spu.sample_texture(texcoords[1].v);
+            spu.ctile.ui[iy][ix+1] = spu.sample_texture(texcoords[1]);
          if (spu_extract(mask, 2))
-            spu.ctile.ui[iy+1][ix] = spu.sample_texture(texcoords[2].v);
+            spu.ctile.ui[iy+1][ix] = spu.sample_texture(texcoords[2]);
          if (spu_extract(mask, 3))
-            spu.ctile.ui[iy+1][ix+1] = spu.sample_texture(texcoords[3].v);
+            spu.ctile.ui[iy+1][ix+1] = spu.sample_texture(texcoords[3]);
       }
       else {
          /* simple shading */
          const vector unsigned char shuffle = spu.color_shuffle;
-         float4 colors[4];
+         vector float colors[4];
          eval_coeff(1, (float) x, (float) y, colors);
 
          if (spu_extract(mask, 0))
-            spu.ctile.ui[iy][ix] = spu_pack_color_shuffle(colors[0].v, shuffle);
+            spu.ctile.ui[iy][ix] = spu_pack_color_shuffle(colors[0], shuffle);
          if (spu_extract(mask, 1))
-            spu.ctile.ui[iy][ix+1] = spu_pack_color_shuffle(colors[1].v, shuffle);
+            spu.ctile.ui[iy][ix+1] = spu_pack_color_shuffle(colors[1], shuffle);
          if (spu_extract(mask, 2))
-            spu.ctile.ui[iy+1][ix] = spu_pack_color_shuffle(colors[2].v, shuffle);
+            spu.ctile.ui[iy+1][ix] = spu_pack_color_shuffle(colors[2], shuffle);
          if (spu_extract(mask, 3))
-            spu.ctile.ui[iy+1][ix+1] = spu_pack_color_shuffle(colors[3].v, shuffle);
+            spu.ctile.ui[iy+1][ix+1] = spu_pack_color_shuffle(colors[3], shuffle);
       }
 
 #if 0
-- 
cgit v1.2.3


From bc1ad6bcbd5c63da9c10d0276c9d7535b6139437 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 4 Feb 2008 15:17:50 -0700
Subject: Cell: some basic blending code

---
 src/mesa/pipe/cell/spu/Makefile    |  1 +
 src/mesa/pipe/cell/spu/spu_blend.c | 62 ++++++++++++++++++++++++++++++++++++++
 src/mesa/pipe/cell/spu/spu_blend.h | 37 +++++++++++++++++++++++
 src/mesa/pipe/cell/spu/spu_tri.c   |  5 +++
 4 files changed, 105 insertions(+)
 create mode 100644 src/mesa/pipe/cell/spu/spu_blend.c
 create mode 100644 src/mesa/pipe/cell/spu/spu_blend.h

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/spu/Makefile b/src/mesa/pipe/cell/spu/Makefile
index 91a631b699..66f16cde9b 100644
--- a/src/mesa/pipe/cell/spu/Makefile
+++ b/src/mesa/pipe/cell/spu/Makefile
@@ -19,6 +19,7 @@ PROG_SPU_EMBED_O = $(PROG)_spu-embed.o
 
 SOURCES = \
 	spu_main.c \
+	spu_blend.c \
 	spu_render.c \
 	spu_texture.c \
 	spu_tile.c \
diff --git a/src/mesa/pipe/cell/spu/spu_blend.c b/src/mesa/pipe/cell/spu/spu_blend.c
new file mode 100644
index 0000000000..23ec0eeb45
--- /dev/null
+++ b/src/mesa/pipe/cell/spu/spu_blend.c
@@ -0,0 +1,62 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#include "spu_main.h"
+#include "spu_blend.h"
+#include "spu_colorpack.h"
+
+
+void
+blend_quad(uint itx, uint ity, vector float colors[4])
+{
+   /* simple SRC_ALPHA, ONE_MINUS_SRC_ALPHA blending */
+   vector float fbc00 = spu_unpack_color(spu.ctile.ui[ity][itx]);
+   vector float fbc01 = spu_unpack_color(spu.ctile.ui[ity][itx+1]);
+   vector float fbc10 = spu_unpack_color(spu.ctile.ui[ity+1][itx]);
+   vector float fbc11 = spu_unpack_color(spu.ctile.ui[ity+1][itx+1]);
+
+   vector float alpha00 = spu_splats(spu_extract(colors[0], 3));
+   vector float alpha01 = spu_splats(spu_extract(colors[1], 3));
+   vector float alpha10 = spu_splats(spu_extract(colors[2], 3));
+   vector float alpha11 = spu_splats(spu_extract(colors[3], 3));
+
+   vector float one_minus_alpha00 = spu_sub(spu_splats(1.0f), alpha00);
+   vector float one_minus_alpha01 = spu_sub(spu_splats(1.0f), alpha01);
+   vector float one_minus_alpha10 = spu_sub(spu_splats(1.0f), alpha10);
+   vector float one_minus_alpha11 = spu_sub(spu_splats(1.0f), alpha11);
+
+   colors[0] = spu_add(spu_mul(colors[0], alpha00),
+                       spu_mul(fbc00, one_minus_alpha00));
+   colors[1] = spu_add(spu_mul(colors[1], alpha01),
+                       spu_mul(fbc01, one_minus_alpha01));
+   colors[2] = spu_add(spu_mul(colors[2], alpha10),
+                       spu_mul(fbc10, one_minus_alpha10));
+   colors[3] = spu_add(spu_mul(colors[3], alpha11),
+                       spu_mul(fbc11, one_minus_alpha11));
+}
+
diff --git a/src/mesa/pipe/cell/spu/spu_blend.h b/src/mesa/pipe/cell/spu/spu_blend.h
new file mode 100644
index 0000000000..2b594b578b
--- /dev/null
+++ b/src/mesa/pipe/cell/spu/spu_blend.h
@@ -0,0 +1,37 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#ifndef SPU_BLEND_H
+#define SPU_BLEND_H
+
+
+extern void
+blend_quad(uint itx, uint ity, vector float colors[4]);
+
+
+#endif /* SPU_BLEND_H */
diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index 199afa1aa6..89aaca9a72 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -324,6 +324,11 @@ emit_quad( int x, int y, mask_t mask )
          vector float colors[4];
          eval_coeff(1, (float) x, (float) y, colors);
 
+#if 0
+         if (spu.blend.blend_enable)
+            blend_quad(ix % TILE_SIZE, iy % TILE_SIZE, colors);
+#endif
+
          if (spu_extract(mask, 0))
             spu.ctile.ui[iy][ix] = spu_pack_color_shuffle(colors[0], shuffle);
          if (spu_extract(mask, 1))
-- 
cgit v1.2.3


From 76c1a10eb121f040ef510124bf6aa24c4c5c3f8f Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 4 Feb 2008 16:04:50 -0700
Subject: Cell: fix typo

---
 src/mesa/pipe/cell/spu/spu_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index 9d8e6df0e3..b0311db1aa 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -236,7 +236,7 @@ static void
 cmd_state_blend(const struct pipe_blend_state *state)
 {
    if (Debug)
-      printf("SPU %u: BLEND: ztest %d\n",
+      printf("SPU %u: BLEND: enabled %d\n",
              spu.init.id,
              state->blend_enable);
 
-- 
cgit v1.2.3


From 5068b573c417bdb317e1938585bebfe931bda049 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 4 Feb 2008 16:05:13 -0700
Subject: Cell: added spu_unpack_A8R8G8B8()

---
 src/mesa/pipe/cell/spu/spu_colorpack.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/spu/spu_colorpack.h b/src/mesa/pipe/cell/spu/spu_colorpack.h
index 0c93c06562..57ea3525c2 100644
--- a/src/mesa/pipe/cell/spu/spu_colorpack.h
+++ b/src/mesa/pipe/cell/spu/spu_colorpack.h
@@ -91,4 +91,19 @@ spu_unpack_color(uint color)
 }
 
 
+static INLINE vector float
+spu_unpack_A8R8G8B8(uint color)
+{
+   vector unsigned int color_u4 = spu_splats(color);
+   color_u4 = spu_shuffle(color_u4, color_u4,
+                          VEC_LITERAL(vector unsigned char,
+                                      5, 5, 5, 5,
+                                      10, 10, 10, 10,
+                                      15, 15, 15, 15,
+                                      0, 0, 0, 0));
+
+   return spu_convtf(color_u4, 32);
+}
+
+
 #endif /* SPU_COLORPACK_H */
-- 
cgit v1.2.3


From efa8e03a6f3f7c27b019d20cca93bf7e624d7035 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 4 Feb 2008 16:06:51 -0700
Subject: Cell: texture sampler functions always return vector float now

Texture colors look the same now, regardless of X display/pixel format
---
 src/mesa/pipe/cell/spu/spu_main.h    |  2 +-
 src/mesa/pipe/cell/spu/spu_texture.c | 19 ++++++++-----------
 src/mesa/pipe/cell/spu/spu_texture.h |  4 ++--
 src/mesa/pipe/cell/spu/spu_tri.c     | 36 ++++++++++++++++++------------------
 4 files changed, 29 insertions(+), 32 deletions(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/spu/spu_main.h b/src/mesa/pipe/cell/spu/spu_main.h
index b22d563551..cfd4d72729 100644
--- a/src/mesa/pipe/cell/spu/spu_main.h
+++ b/src/mesa/pipe/cell/spu/spu_main.h
@@ -115,7 +115,7 @@ struct spu_global
    vector float tex_size;
    vector unsigned int tex_size_mask; /**< == int(size - 1) */
 
-   uint (*sample_texture)(vector float texcoord);
+   vector float (*sample_texture)(vector float texcoord);
 
 } ALIGN16_ATTRIB;
 
diff --git a/src/mesa/pipe/cell/spu/spu_texture.c b/src/mesa/pipe/cell/spu/spu_texture.c
index ecacf2ec88..9ee2b45e24 100644
--- a/src/mesa/pipe/cell/spu/spu_texture.c
+++ b/src/mesa/pipe/cell/spu/spu_texture.c
@@ -130,7 +130,7 @@ get_tex_tile(vector unsigned int ij)
  * Get texture sample at texcoord.
  * XXX this is extremely primitive for now.
  */
-uint
+vector float
 sample_texture_nearest(vector float texcoord)
 {
    vector float tc = spu_mul(texcoord, spu.tex_size);
@@ -139,11 +139,11 @@ sample_texture_nearest(vector float texcoord)
    vector unsigned int ij = spu_and(itc, TILE_SIZE-1); /* intra tile addr */
    uint pos = get_tex_tile(itc);
    uint texel = tex_tiles[pos].ui[spu_extract(ij, 1)][spu_extract(ij, 0)];
-   return texel;
+   return spu_unpack_A8R8G8B8(texel);
 }
 
 
-uint
+vector float
 sample_texture_bilinear(vector float texcoord)
 {
    static const vector unsigned int offset10 = {1, 0, 0, 0};
@@ -183,10 +183,10 @@ sample_texture_bilinear(vector float texcoord)
    }
 
    /* get texels from tiles and convert to float[4] */
-   vector float texel00 = spu_unpack_color(tex_tiles[pos00].ui[spu_extract(ij00, 1)][spu_extract(ij00, 0)]);
-   vector float texel01 = spu_unpack_color(tex_tiles[pos01].ui[spu_extract(ij01, 1)][spu_extract(ij01, 0)]);
-   vector float texel10 = spu_unpack_color(tex_tiles[pos10].ui[spu_extract(ij10, 1)][spu_extract(ij10, 0)]);
-   vector float texel11 = spu_unpack_color(tex_tiles[pos11].ui[spu_extract(ij11, 1)][spu_extract(ij11, 0)]);
+   vector float texel00 = spu_unpack_A8R8G8B8(tex_tiles[pos00].ui[spu_extract(ij00, 1)][spu_extract(ij00, 0)]);
+   vector float texel01 = spu_unpack_A8R8G8B8(tex_tiles[pos01].ui[spu_extract(ij01, 1)][spu_extract(ij01, 0)]);
+   vector float texel10 = spu_unpack_A8R8G8B8(tex_tiles[pos10].ui[spu_extract(ij10, 1)][spu_extract(ij10, 0)]);
+   vector float texel11 = spu_unpack_A8R8G8B8(tex_tiles[pos11].ui[spu_extract(ij11, 1)][spu_extract(ij11, 0)]);
 
    /* Compute weighting factors in [0,1]
     * Multiply texcoord by 1024, AND with 1023, convert back to float.
@@ -213,8 +213,5 @@ sample_texture_bilinear(vector float texcoord)
    texel_sum = spu_add(texel_sum, texel10);
    texel_sum = spu_add(texel_sum, texel11);
 
-   /* convert to uint color */
-   uint texel = spu_pack_R8G8B8A8(texel_sum);
-
-   return texel;
+   return texel_sum;
 }
diff --git a/src/mesa/pipe/cell/spu/spu_texture.h b/src/mesa/pipe/cell/spu/spu_texture.h
index 0e000bfebf..95eb87080f 100644
--- a/src/mesa/pipe/cell/spu/spu_texture.h
+++ b/src/mesa/pipe/cell/spu/spu_texture.h
@@ -36,11 +36,11 @@ extern void
 invalidate_tex_cache(void);
 
 
-extern uint
+extern vector float
 sample_texture_nearest(vector float texcoord);
 
 
-extern uint
+extern vector float
 sample_texture_bilinear(vector float texcoord);
 
 
diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index 89aaca9a72..4c6de56eda 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -301,6 +301,8 @@ emit_quad( int x, int y, mask_t mask )
    if (spu_extract(spu_orx(mask), 0)) {
       const int ix = x - setup.cliprect_minx;
       const int iy = y - setup.cliprect_miny;
+      const vector unsigned char shuffle = spu.color_shuffle;
+      vector float colors[4];
 
       spu.cur_ctile_status = TILE_STATUS_DIRTY;
 
@@ -310,34 +312,32 @@ emit_quad( int x, int y, mask_t mask )
          eval_coeff(2, (float) x, (float) y, texcoords);
 
          if (spu_extract(mask, 0))
-            spu.ctile.ui[iy][ix] = spu.sample_texture(texcoords[0]);
+            colors[0] = spu.sample_texture(texcoords[0]);
          if (spu_extract(mask, 1))
-            spu.ctile.ui[iy][ix+1] = spu.sample_texture(texcoords[1]);
+            colors[1] = spu.sample_texture(texcoords[1]);
          if (spu_extract(mask, 2))
-            spu.ctile.ui[iy+1][ix] = spu.sample_texture(texcoords[2]);
+            colors[2] = spu.sample_texture(texcoords[2]);
          if (spu_extract(mask, 3))
-            spu.ctile.ui[iy+1][ix+1] = spu.sample_texture(texcoords[3]);
+            colors[3] = spu.sample_texture(texcoords[3]);
       }
       else {
          /* simple shading */
-         const vector unsigned char shuffle = spu.color_shuffle;
-         vector float colors[4];
          eval_coeff(1, (float) x, (float) y, colors);
+      }
 
-#if 0
-         if (spu.blend.blend_enable)
-            blend_quad(ix % TILE_SIZE, iy % TILE_SIZE, colors);
+#if 1
+      if (spu.blend.blend_enable)
+         blend_quad(ix % TILE_SIZE, iy % TILE_SIZE, colors);
 #endif
 
-         if (spu_extract(mask, 0))
-            spu.ctile.ui[iy][ix] = spu_pack_color_shuffle(colors[0], shuffle);
-         if (spu_extract(mask, 1))
-            spu.ctile.ui[iy][ix+1] = spu_pack_color_shuffle(colors[1], shuffle);
-         if (spu_extract(mask, 2))
-            spu.ctile.ui[iy+1][ix] = spu_pack_color_shuffle(colors[2], shuffle);
-         if (spu_extract(mask, 3))
-            spu.ctile.ui[iy+1][ix+1] = spu_pack_color_shuffle(colors[3], shuffle);
-      }
+      if (spu_extract(mask, 0))
+         spu.ctile.ui[iy][ix] = spu_pack_color_shuffle(colors[0], shuffle);
+      if (spu_extract(mask, 1))
+         spu.ctile.ui[iy][ix+1] = spu_pack_color_shuffle(colors[1], shuffle);
+      if (spu_extract(mask, 2))
+         spu.ctile.ui[iy+1][ix] = spu_pack_color_shuffle(colors[2], shuffle);
+      if (spu_extract(mask, 3))
+         spu.ctile.ui[iy+1][ix+1] = spu_pack_color_shuffle(colors[3], shuffle);
 
 #if 0
       /* SIMD_Z with swizzled color buffer (someday) */
-- 
cgit v1.2.3


From 1a75464cdc12a1e83f1452707cd624c53f808308 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 4 Feb 2008 16:48:00 -0700
Subject: Cell: fix small sampling error in sample_texture_bilinear()

---
 src/mesa/pipe/cell/spu/spu_texture.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/spu/spu_texture.c b/src/mesa/pipe/cell/spu/spu_texture.c
index 9ee2b45e24..01ff33a857 100644
--- a/src/mesa/pipe/cell/spu/spu_texture.c
+++ b/src/mesa/pipe/cell/spu/spu_texture.c
@@ -150,6 +150,8 @@ sample_texture_bilinear(vector float texcoord)
    static const vector unsigned int offset01 = {0, 1, 0, 0};
 
    vector float tc = spu_mul(texcoord, spu.tex_size);
+   tc = spu_add(tc, spu_splats(-0.5f));  /* half texel bias */
+
    /* integer texcoords S,T: */
    vector unsigned int itc00 = spu_convtu(tc, 0);  /* convert to int */
    vector unsigned int itc01 = spu_add(itc00, offset01);
-- 
cgit v1.2.3


From 93d061b217e31d27a1c54e50a14538e94f1404d6 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 4 Feb 2008 16:48:36 -0700
Subject: Cell: move float4 typedef (temporary datatype)

---
 src/mesa/pipe/cell/spu/spu_main.h | 7 -------
 src/mesa/pipe/cell/spu/spu_tri.c  | 6 ++++++
 2 files changed, 6 insertions(+), 7 deletions(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/spu/spu_main.h b/src/mesa/pipe/cell/spu/spu_main.h
index cfd4d72729..1710a17512 100644
--- a/src/mesa/pipe/cell/spu/spu_main.h
+++ b/src/mesa/pipe/cell/spu/spu_main.h
@@ -41,13 +41,6 @@
 #define MAX_HEIGHT 1024
 
 
-typedef union
-{
-   vector float v;
-   float f[4];
-} float4;
-
-
 typedef union {
    ushort us[TILE_SIZE][TILE_SIZE];
    uint   ui[TILE_SIZE][TILE_SIZE];
diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index 4c6de56eda..688c8646ab 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -45,6 +45,12 @@
 /** Masks are uint[4] vectors with each element being 0 or 0xffffffff */
 typedef vector unsigned int mask_t;
 
+typedef union
+{
+   vector float v;
+   float f[4];
+} float4;
+
 
 /**
  * Simplified types taken from other parts of Gallium
-- 
cgit v1.2.3


From 7cbe5cf212d296c19ccf8e1b74d3a5b1bcb2d9e9 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 4 Feb 2008 18:02:21 -0700
Subject: Cell: don't use VEC_LITERAL macro, doesn't work w/ SDK 3.0

---
 src/mesa/pipe/cell/spu/spu_colorpack.h | 41 +++++++++++++++++-----------------
 src/mesa/pipe/cell/spu/spu_ztest.h     | 24 ++++++++++----------
 2 files changed, 33 insertions(+), 32 deletions(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/spu/spu_colorpack.h b/src/mesa/pipe/cell/spu/spu_colorpack.h
index 57ea3525c2..e9fee8a3a6 100644
--- a/src/mesa/pipe/cell/spu/spu_colorpack.h
+++ b/src/mesa/pipe/cell/spu/spu_colorpack.h
@@ -31,7 +31,6 @@
 #define SPU_COLORPACK_H
 
 
-#include <vec_literal.h>
 #include <spu_intrinsics.h>
 
 
@@ -39,9 +38,11 @@ static INLINE unsigned int
 spu_pack_R8G8B8A8(vector float rgba)
 {
   vector unsigned int out = spu_convtu(rgba, 32);
-  out = spu_shuffle(out, out, VEC_LITERAL(vector unsigned char,
-					  0, 4, 8, 12, 0, 0, 0, 0, 
-                                          0, 0, 0, 0, 0, 0, 0, 0));
+
+  out = spu_shuffle(out, out, ((vector unsigned char) {
+                                  0, 4, 8, 12, 0, 0, 0, 0, 
+                                  0, 0, 0, 0, 0, 0, 0, 0 }) );
+
   return spu_extract(out, 0);
 }
 
@@ -50,9 +51,9 @@ static INLINE unsigned int
 spu_pack_A8R8G8B8(vector float rgba)
 {
   vector unsigned int out = spu_convtu(rgba, 32);
-  out = spu_shuffle(out, out, VEC_LITERAL(vector unsigned char,
-					  12, 0, 4, 8, 0, 0, 0, 0, 
-                                          0, 0, 0, 0, 0, 0, 0, 0));
+  out = spu_shuffle(out, out, ((vector unsigned char) {
+                                  12, 0, 4, 8, 0, 0, 0, 0, 
+                                  0, 0, 0, 0, 0, 0, 0, 0}) );
   return spu_extract(out, 0);
 }
 
@@ -61,9 +62,9 @@ static INLINE unsigned int
 spu_pack_B8G8R8A8(vector float rgba)
 {
   vector unsigned int out = spu_convtu(rgba, 32);
-  out = spu_shuffle(out, out, VEC_LITERAL(vector unsigned char,
-					  8, 4, 0, 12, 0, 0, 0, 0, 
-                                          0, 0, 0, 0, 0, 0, 0, 0));
+  out = spu_shuffle(out, out, ((vector unsigned char) {
+                                  8, 4, 0, 12, 0, 0, 0, 0, 
+                                  0, 0, 0, 0, 0, 0, 0, 0}) );
   return spu_extract(out, 0);
 }
 
@@ -82,11 +83,11 @@ spu_unpack_color(uint color)
 {
    vector unsigned int color_u4 = spu_splats(color);
    color_u4 = spu_shuffle(color_u4, color_u4,
-                          VEC_LITERAL(vector unsigned char,
-                                      0, 0, 0, 0,
-                                      5, 5, 5, 5,
-                                      10, 10, 10, 10,
-                                      15, 15, 15, 15));
+                          ((vector unsigned char) {
+                             0, 0, 0, 0,
+                             5, 5, 5, 5,
+                             10, 10, 10, 10,
+                             15, 15, 15, 15}) );
    return spu_convtf(color_u4, 32);
 }
 
@@ -96,11 +97,11 @@ spu_unpack_A8R8G8B8(uint color)
 {
    vector unsigned int color_u4 = spu_splats(color);
    color_u4 = spu_shuffle(color_u4, color_u4,
-                          VEC_LITERAL(vector unsigned char,
-                                      5, 5, 5, 5,
-                                      10, 10, 10, 10,
-                                      15, 15, 15, 15,
-                                      0, 0, 0, 0));
+                          ((vector unsigned char) {
+                             5, 5, 5, 5,
+                             10, 10, 10, 10,
+                             15, 15, 15, 15,
+                             0, 0, 0, 0}) );
 
    return spu_convtf(color_u4, 32);
 }
diff --git a/src/mesa/pipe/cell/spu/spu_ztest.h b/src/mesa/pipe/cell/spu/spu_ztest.h
index 5fefb15176..ce8ad00339 100644
--- a/src/mesa/pipe/cell/spu/spu_ztest.h
+++ b/src/mesa/pipe/cell/spu/spu_ztest.h
@@ -68,9 +68,9 @@ spu_z16_test_less(vector float zvals, vector unsigned short *zbuf,
       /* gather lower four ushorts */
       zbuf_ui4 = spu_shuffle((vector unsigned int) *zbuf,
                              (vector unsigned int) *zbuf,
-                             VEC_LITERAL(vector unsigned char,
-                                      ZERO, ZERO,  8,  9, ZERO, ZERO, 10, 11,
-                                      ZERO, ZERO, 12, 13, ZERO, ZERO, 14, 15));
+                             ((vector unsigned char) {
+                                ZERO, ZERO,  8,  9, ZERO, ZERO, 10, 11,
+                                ZERO, ZERO, 12, 13, ZERO, ZERO, 14, 15}));
       /* mask = (zbuf_ui4 < zvals_ui4) ? ~0 : 0 */
       mask = spu_cmpgt(zbuf_ui4, zvals_ui4);
       /* mask &= inMask */
@@ -80,18 +80,18 @@ spu_z16_test_less(vector float zvals, vector unsigned short *zbuf,
       /* convert zbuffer values from uints back to ushorts, preserve lower 4 */
       *zbuf = (vector unsigned short)
          spu_shuffle(zbuf_ui4, (vector unsigned int) *zbuf,
-                     VEC_LITERAL(vector unsigned char,
-                                 16, 17, 18, 19, 20, 21, 22, 23,
-                                 2, 3, 6, 7, 10, 11, 14, 15));
+                     ((vector unsigned char) {
+                        16, 17, 18, 19, 20, 21, 22, 23,
+                        2, 3, 6, 7, 10, 11, 14, 15}));
    }
    else {
       /* convert zbuffer values from ushorts to uints */
       /* gather upper four ushorts */
       zbuf_ui4 = spu_shuffle((vector unsigned int) *zbuf,
                              (vector unsigned int) *zbuf,
-                             VEC_LITERAL(vector unsigned char,
-                                         ZERO, ZERO, 0, 1, ZERO, ZERO, 2, 3,
-                                         ZERO, ZERO, 4, 5, ZERO, ZERO, 6, 7));
+                             ((vector unsigned char) {
+                                ZERO, ZERO, 0, 1, ZERO, ZERO, 2, 3,
+                                ZERO, ZERO, 4, 5, ZERO, ZERO, 6, 7}));
       /* mask = (zbuf_ui4 < zvals_ui4) ? ~0 : 0 */
       mask = spu_cmpgt(zbuf_ui4, zvals_ui4);
       /* mask &= inMask */
@@ -101,9 +101,9 @@ spu_z16_test_less(vector float zvals, vector unsigned short *zbuf,
       /* convert zbuffer values from uints back to ushorts, preserve upper 4 */
       *zbuf = (vector unsigned short)
          spu_shuffle(zbuf_ui4, (vector unsigned int) *zbuf,
-                     VEC_LITERAL(vector unsigned char,
-                                 2, 3, 6, 7, 10, 11, 14, 15,
-                                 24, 25, 26, 27, 28, 29, 30, 31));
+                     ((vector unsigned char) {
+                        2, 3, 6, 7, 10, 11, 14, 15,
+                        24, 25, 26, 27, 28, 29, 30, 31}));
    }
    return mask;
 #undef ZERO
-- 
cgit v1.2.3


From 684d320ea2e7ec03d01275a544068cc6b45e1e9a Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 4 Feb 2008 18:03:05 -0700
Subject: Cell: don't use VEC_LITERAL macro, doesn't work w/ SDK 3.0

---
 src/mesa/pipe/cell/spu/spu_texture.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/spu/spu_texture.c b/src/mesa/pipe/cell/spu/spu_texture.c
index 01ff33a857..3962aaa4a9 100644
--- a/src/mesa/pipe/cell/spu/spu_texture.c
+++ b/src/mesa/pipe/cell/spu/spu_texture.c
@@ -26,8 +26,6 @@
  **************************************************************************/
 
 
-#include <vec_literal.h>
-
 #include "pipe/p_compiler.h"
 #include "spu_main.h"
 #include "spu_texture.h"
@@ -57,7 +55,7 @@ invalidate_tex_cache(void)
    /* XXX memset? */
    uint i;
    for (i = 0; i < CACHE_SIZE; i++) {
-      tex_tile_xy[i] = VEC_LITERAL(vector unsigned int, ~0U, ~0U, ~0U, ~0U);
+      tex_tile_xy[i] = ((vector unsigned int) { ~0U, ~0U, ~0U, ~0U });
    }
 }
 
-- 
cgit v1.2.3


From 5db1593c78192b764ad2ef7bdc5182d8ec4aed7c Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 4 Feb 2008 18:05:37 -0700
Subject: Cell: fix some alignment issues by aligning commands to 8-byte
 boundaries

Contributed by Ian Romanick.
Also, temporarily disable inlined vertex buffers.  They need to be 16-byte
aligned...
---
 src/mesa/pipe/cell/common.h                 | 16 ++++----
 src/mesa/pipe/cell/ppu/cell_batch.c         |  4 +-
 src/mesa/pipe/cell/ppu/cell_flush.c         |  2 +-
 src/mesa/pipe/cell/ppu/cell_state_emit.c    |  3 +-
 src/mesa/pipe/cell/ppu/cell_vbuf.c          |  4 +-
 src/mesa/pipe/cell/ppu/cell_vertex_shader.c | 22 ++++++-----
 src/mesa/pipe/cell/spu/spu_main.c           | 58 +++++++++++++----------------
 src/mesa/pipe/cell/spu/spu_vertex_fetch.c   |  7 ++--
 src/mesa/pipe/cell/spu/spu_vertex_shader.h  |  2 +-
 9 files changed, 57 insertions(+), 61 deletions(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/common.h b/src/mesa/pipe/cell/common.h
index d861e82d33..cf8fc94ebf 100644
--- a/src/mesa/pipe/cell/common.h
+++ b/src/mesa/pipe/cell/common.h
@@ -57,6 +57,9 @@
 /** round up value to next multiple of 4 */
 #define ROUNDUP4(k)  (((k) + 0x3) & ~0x3)
 
+/** round up value to next multiple of 8 */
+#define ROUNDUP8(k)  (((k) + 0x7) & ~0x7)
+
 /** round up value to next multiple of 16 */
 #define ROUNDUP16(k)  (((k) + 0xf) & ~0xf)
 
@@ -102,7 +105,7 @@
  */
 struct cell_command_framebuffer
 {
-   uint opcode;
+   uint64_t opcode;
    int width, height;
    void *color_start, *depth_start;
    enum pipe_format color_format, depth_format;
@@ -114,7 +117,7 @@ struct cell_command_framebuffer
  */
 struct cell_command_clear_surface
 {
-   uint opcode;
+   uint64_t opcode;
    uint surface; /**< Temporary: 0=color, 1=Z */
    uint value;
 };
@@ -125,8 +128,7 @@ struct cell_command_clear_surface
  */
 struct cell_array_info
 {
-    uint opcode;
-    uint base;          /**< Base address of the 0th element. */
+    uint64_t base;          /**< Base address of the 0th element. */
     uint attr;          /**< Attribute that this state if for. */
     uint pitch;         /**< Byte pitch from one entry to the next. */
     uint format;        /**< Pipe format of each entry. */
@@ -150,7 +152,7 @@ struct cell_shader_info
 #define SPU_VERTS_PER_BATCH 64
 struct cell_command_vs
 {
-   uint opcode;       /**< CELL_CMD_VS_EXECUTE */
+   uint64_t opcode;       /**< CELL_CMD_VS_EXECUTE */
    struct cell_shader_info   shader;
    unsigned num_elts;
    unsigned elts[SPU_VERTS_PER_BATCH];
@@ -163,7 +165,7 @@ struct cell_command_vs
 
 struct cell_command_render
 {
-   uint opcode;       /**< CELL_CMD_RENDER */
+   uint64_t opcode;   /**< CELL_CMD_RENDER */
    uint prim_type;    /**< PIPE_PRIM_x */
    uint num_verts;
    uint vertex_size;  /**< bytes per vertex */
@@ -179,7 +181,7 @@ struct cell_command_render
 
 struct cell_command_release_verts
 {
-   int opcode;         /**< CELL_CMD_RELEASE_VERTS */
+   uint64_t opcode;         /**< CELL_CMD_RELEASE_VERTS */
    uint vertex_buf;    /**< in [0, CELL_NUM_BUFFERS-1] */
 };
 
diff --git a/src/mesa/pipe/cell/ppu/cell_batch.c b/src/mesa/pipe/cell/ppu/cell_batch.c
index 2d032fc902..2fb49711b2 100644
--- a/src/mesa/pipe/cell/ppu/cell_batch.c
+++ b/src/mesa/pipe/cell/ppu/cell_batch.c
@@ -136,7 +136,7 @@ cell_batch_append(struct cell_context *cell, const void *data, uint bytes)
 {
    uint size;
 
-   ASSERT(bytes % 4 == 0);
+   ASSERT(bytes % 8 == 0);
    ASSERT(bytes <= CELL_BUFFER_SIZE);
    ASSERT(cell->cur_batch >= 0);
 
@@ -171,7 +171,7 @@ cell_batch_alloc(struct cell_context *cell, uint bytes)
    void *pos;
    uint size;
 
-   ASSERT(bytes % 4 == 0);
+   ASSERT(bytes % 8 == 0);
    ASSERT(bytes <= CELL_BUFFER_SIZE);
 
    assert(cell->cur_batch >= 0);
diff --git a/src/mesa/pipe/cell/ppu/cell_flush.c b/src/mesa/pipe/cell/ppu/cell_flush.c
index cf4e676645..f62bc4650c 100644
--- a/src/mesa/pipe/cell/ppu/cell_flush.c
+++ b/src/mesa/pipe/cell/ppu/cell_flush.c
@@ -59,7 +59,7 @@ cell_flush_int(struct pipe_context *pipe, unsigned flags)
    flushing = TRUE;
 
    if (flags & PIPE_FLUSH_WAIT) {
-      uint *cmd = (uint *) cell_batch_alloc(cell, sizeof(uint));
+      uint64_t *cmd = (uint64_t *) cell_batch_alloc(cell, sizeof(uint64_t));
       *cmd = CELL_CMD_FINISH;
    }
 
diff --git a/src/mesa/pipe/cell/ppu/cell_state_emit.c b/src/mesa/pipe/cell/ppu/cell_state_emit.c
index 3b2670f786..5d2a786449 100644
--- a/src/mesa/pipe/cell/ppu/cell_state_emit.c
+++ b/src/mesa/pipe/cell/ppu/cell_state_emit.c
@@ -37,7 +37,8 @@ static void
 emit_state_cmd(struct cell_context *cell, uint cmd,
                const void *state, uint state_size)
 {
-   uint *dst = (uint *) cell_batch_alloc(cell, sizeof(uint) + state_size);
+   uint64_t *dst = (uint64_t *) 
+       cell_batch_alloc(cell, ROUNDUP8(sizeof(uint64_t) + state_size));
    *dst = cmd;
    memcpy(dst + 1, state, state_size);
 }
diff --git a/src/mesa/pipe/cell/ppu/cell_vbuf.c b/src/mesa/pipe/cell/ppu/cell_vbuf.c
index e63b34cf52..0fee61821a 100644
--- a/src/mesa/pipe/cell/ppu/cell_vbuf.c
+++ b/src/mesa/pipe/cell/ppu/cell_vbuf.c
@@ -40,7 +40,7 @@
 
 
 /** Allow vertex data to be inlined after RENDER command */
-#define ALLOW_INLINE_VERTS 1
+#define ALLOW_INLINE_VERTS 0
 
 
 /**
@@ -197,7 +197,7 @@ cell_vbuf_draw(struct vbuf_render *vbr,
 
    /* build/insert batch RENDER command */
    {
-      const uint index_bytes = ROUNDUP4(nr_indices * 2);
+      const uint index_bytes = ROUNDUP8(nr_indices * 2);
       const uint vertex_bytes = nr_vertices * 4 * cell->vertex_info.size;
 
       const uint batch_size = sizeof(struct cell_command_render)
diff --git a/src/mesa/pipe/cell/ppu/cell_vertex_shader.c b/src/mesa/pipe/cell/ppu/cell_vertex_shader.c
index aef329a902..80dd500b34 100644
--- a/src/mesa/pipe/cell/ppu/cell_vertex_shader.c
+++ b/src/mesa/pipe/cell/ppu/cell_vertex_shader.c
@@ -52,8 +52,8 @@ cell_vertex_shader_queue_flush(struct draw_context *draw)
    struct cell_context *const cell =
        (struct cell_context *) draw->driver_private;
    struct cell_command_vs *const vs = &cell_global.command[0].vs;
-   unsigned *batch;
-   struct cell_array_info array_info;
+   uint64_t *batch;
+   struct cell_array_info *array_info;
    unsigned i, j;
 
    assert(draw->vs.queue_nr != 0);
@@ -63,17 +63,19 @@ cell_vertex_shader_queue_flush(struct draw_context *draw)
    draw_update_vertex_fetch(draw);
 
    for (i = 0; i < draw->vertex_fetch.nr_attrs; i++) {
-      array_info.opcode = CELL_CMD_STATE_VS_ARRAY_INFO;
-      assert(draw->vertex_fetch.src_ptr[i] != NULL);
-      array_info.base = (uintptr_t) draw->vertex_fetch.src_ptr[i];
-      array_info.attr = i;
-      array_info.pitch = draw->vertex_fetch.pitch[i];
-      array_info.format = draw->vertex_element[i].src_format;
+      batch = cell_batch_alloc(cell, sizeof(batch[0]) + sizeof(*array_info));
+
+      batch[0] = CELL_CMD_STATE_VS_ARRAY_INFO;
 
-      cell_batch_append(cell, & array_info, sizeof(array_info));
+      array_info = (struct cell_array_info *) &batch[1];
+      assert(draw->vertex_fetch.src_ptr[i] != NULL);
+      array_info->base = (uintptr_t) draw->vertex_fetch.src_ptr[i];
+      array_info->attr = i;
+      array_info->pitch = draw->vertex_fetch.pitch[i];
+      array_info->format = draw->vertex_element[i].src_format;
    }
 
-   batch = cell_batch_alloc(cell, sizeof(unsigned)
+   batch = cell_batch_alloc(cell, sizeof(batch[0])
                             + sizeof(struct pipe_viewport_state));
    batch[0] = CELL_CMD_STATE_VIEWPORT;
    (void) memcpy(&batch[1], &draw->viewport,
diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index b0311db1aa..4f126d5e5b 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -31,7 +31,6 @@
 
 #include <stdio.h>
 #include <libmisc.h>
-#include <vec_literal.h>
 
 #include "spu_main.h"
 #include "spu_render.h"
@@ -220,13 +219,13 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
       spu.fb.zsize = 0;
 
    if (spu.fb.color_format == PIPE_FORMAT_A8R8G8B8_UNORM)
-      spu.color_shuffle = VEC_LITERAL(vector unsigned char,
-                                      12, 0, 4, 8, 0, 0, 0, 0, 
-                                      0, 0, 0, 0, 0, 0, 0, 0);
+      spu.color_shuffle = ((vector unsigned char) {
+                              12, 0, 4, 8, 0, 0, 0, 0, 
+                              0, 0, 0, 0, 0, 0, 0, 0});
    else if (spu.fb.color_format == PIPE_FORMAT_B8G8R8A8_UNORM)
-      spu.color_shuffle = VEC_LITERAL(vector unsigned char,
-                                      8, 4, 0, 12, 0, 0, 0, 0, 
-                                      0, 0, 0, 0, 0, 0, 0, 0);
+      spu.color_shuffle = ((vector unsigned char) {
+                              8, 4, 0, 12, 0, 0, 0, 0, 
+                              0, 0, 0, 0, 0, 0, 0, 0});
    else
       ASSERT(0);
 }
@@ -279,16 +278,10 @@ cmd_state_texture(const struct cell_command_texture *texture)
              spu.init.id, texture->start, texture->width, texture->height);
 
    memcpy(&spu.texture, texture, sizeof(*texture));
-   spu.tex_size = VEC_LITERAL(vector float,
-                              spu.texture.width,
-                              spu.texture.height,
-                              0.0,
-                              0.0);
-   spu.tex_size_mask = VEC_LITERAL(vector unsigned int,
-                                   spu.texture.width - 1,
-                                   spu.texture.height - 1,
-                                   0,
-                                   0);
+   spu.tex_size = (vector float)
+      { spu.texture.width, spu.texture.height, 0.0, 0.0};
+   spu.tex_size_mask = (vector unsigned int)
+      { spu.texture.width - 1, spu.texture.height - 1, 0, 0 };
 }
 
 
@@ -341,8 +334,8 @@ cmd_batch(uint opcode)
 {
    const uint buf = (opcode >> 8) & 0xff;
    uint size = (opcode >> 16);
-   uint buffer[CELL_BUFFER_SIZE / 4] ALIGN16_ATTRIB;
-   const uint usize = size / sizeof(uint);
+   uint64_t buffer[CELL_BUFFER_SIZE / 8] ALIGN16_ATTRIB;
+   const unsigned usize = size / sizeof(buffer[0]);
    uint pos;
 
    if (Debug)
@@ -377,7 +370,7 @@ cmd_batch(uint opcode)
             struct cell_command_framebuffer *fb
                = (struct cell_command_framebuffer *) &buffer[pos];
             cmd_state_framebuffer(fb);
-            pos += sizeof(*fb) / 4;
+            pos += sizeof(*fb) / 8;
          }
          break;
       case CELL_CMD_CLEAR_SURFACE:
@@ -385,7 +378,7 @@ cmd_batch(uint opcode)
             struct cell_command_clear_surface *clr
                = (struct cell_command_clear_surface *) &buffer[pos];
             cmd_clear_surface(clr);
-            pos += sizeof(*clr) / 4;
+            pos += sizeof(*clr) / 8;
          }
          break;
       case CELL_CMD_RENDER:
@@ -394,7 +387,7 @@ cmd_batch(uint opcode)
                = (struct cell_command_render *) &buffer[pos];
             uint pos_incr;
             cmd_render(render, &pos_incr);
-            pos += sizeof(*render) / 4 + pos_incr;
+            pos += sizeof(*render) / 8 + ((pos_incr + 1) / 2);
          }
          break;
       case CELL_CMD_RELEASE_VERTS:
@@ -402,8 +395,7 @@ cmd_batch(uint opcode)
             struct cell_command_release_verts *release
                = (struct cell_command_release_verts *) &buffer[pos];
             cmd_release_verts(release);
-            ASSERT(sizeof(*release) == 8);
-            pos += sizeof(*release) / 4;
+            pos += sizeof(*release) / 8;
          }
          break;
       case CELL_CMD_FINISH:
@@ -413,36 +405,36 @@ cmd_batch(uint opcode)
       case CELL_CMD_STATE_BLEND:
          cmd_state_blend((struct pipe_blend_state *)
                                  &buffer[pos+1]);
-         pos += (1 + sizeof(struct pipe_blend_state) / 4);
+         pos += (1 + ROUNDUP8(sizeof(struct pipe_blend_state)) / 8);
          break;
       case CELL_CMD_STATE_DEPTH_STENCIL:
          cmd_state_depth_stencil((struct pipe_depth_stencil_alpha_state *)
                                  &buffer[pos+1]);
-         pos += (1 + sizeof(struct pipe_depth_stencil_alpha_state) / 4);
+         pos += (1 + ROUNDUP8(sizeof(struct pipe_depth_stencil_alpha_state)) / 8);
          break;
       case CELL_CMD_STATE_SAMPLER:
          cmd_state_sampler((struct pipe_sampler_state *) &buffer[pos+1]);
-         pos += (1 + sizeof(struct pipe_sampler_state) / 4);
+         pos += (1 + ROUNDUP8(sizeof(struct pipe_sampler_state)) / 8);
          break;
       case CELL_CMD_STATE_TEXTURE:
          cmd_state_texture((struct cell_command_texture *) &buffer[pos+1]);
-         pos += (1 + sizeof(struct cell_command_texture) / 4);
+         pos += (1 + ROUNDUP8(sizeof(struct cell_command_texture)) / 8);
          break;
       case CELL_CMD_STATE_VERTEX_INFO:
          cmd_state_vertex_info((struct vertex_info *) &buffer[pos+1]);
-         pos += (1 + sizeof(struct vertex_info) / 4);
+         pos += (1 + ROUNDUP8(sizeof(struct vertex_info)) / 8);
          break;
       case CELL_CMD_STATE_VIEWPORT:
          (void) memcpy(& draw.viewport, &buffer[pos+1],
                        sizeof(struct pipe_viewport_state));
-         pos += (1 + sizeof(struct pipe_viewport_state) / 4);
+         pos += (1 + ROUNDUP8(sizeof(struct pipe_viewport_state)) / 8);
          break;
       case CELL_CMD_STATE_VS_ARRAY_INFO:
-         cmd_state_vs_array_info((struct cell_array_info *) &buffer[pos]);
-         pos += (sizeof(struct cell_array_info) / 4);
+         cmd_state_vs_array_info((struct cell_array_info *) &buffer[pos+1]);
+         pos += (1 + ROUNDUP8(sizeof(struct cell_array_info)) / 8);
          break;
       default:
-         printf("SPU %u: bad opcode: 0x%x\n", spu.init.id, buffer[pos]);
+         printf("SPU %u: bad opcode: 0x%llx\n", spu.init.id, buffer[pos]);
          ASSERT(0);
          break;
       }
diff --git a/src/mesa/pipe/cell/spu/spu_vertex_fetch.c b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
index 1e846868e3..5b0f2a6470 100644
--- a/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
+++ b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
@@ -431,9 +431,8 @@ static void generic_vertex_fetch(struct spu_vs_context *draw,
    /* loop over vertex attributes (vertex shader inputs)
     */
    for (attr = 0; attr < nr_attrs; attr++) {
-
-      const unsigned pitch   = draw->vertex_fetch.pitch[attr];
-      const ubyte *src = draw->vertex_fetch.src_ptr[attr];
+      const unsigned pitch = draw->vertex_fetch.pitch[attr];
+      const uint64_t src = draw->vertex_fetch.src_ptr[attr];
       const spu_fetch_func fetch = draw->vertex_fetch.fetch[attr];
       unsigned i;
       float p[4][4];
@@ -447,7 +446,7 @@ static void generic_vertex_fetch(struct spu_vs_context *draw,
        */
       for (i = 0; i < count; i++) {
          uint8_t buffer[32] ALIGN16_ATTRIB;
-         const unsigned long addr = src + (elts[i] * pitch);
+         const uint64_t addr = src + (elts[i] * pitch);
          const unsigned size = ((addr & 0x0f) == 0) ? 16 : 32;
 
          mfc_get(buffer, addr & ~0x0f, size, TAG_VERTEX_BUFFER, 0, 0);
diff --git a/src/mesa/pipe/cell/spu/spu_vertex_shader.h b/src/mesa/pipe/cell/spu/spu_vertex_shader.h
index c52f38fd02..b261ab44a2 100644
--- a/src/mesa/pipe/cell/spu/spu_vertex_shader.h
+++ b/src/mesa/pipe/cell/spu/spu_vertex_shader.h
@@ -16,7 +16,7 @@ struct spu_vs_context {
    struct pipe_viewport_state viewport;
 
    struct {
-      const ubyte *src_ptr[PIPE_ATTRIB_MAX];
+      uint64_t src_ptr[PIPE_ATTRIB_MAX];
       unsigned pitch[PIPE_ATTRIB_MAX];
       enum pipe_format format[PIPE_ATTRIB_MAX];
       unsigned nr_attrs;
-- 
cgit v1.2.3


From 8fc2355949b67cd99403c1184ce711a344877375 Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Fri, 1 Feb 2008 14:58:38 -0800
Subject: Vectorize all micro ops

Fold single instruction micro ops inline.  Remove unused micro ops.
---
 src/mesa/pipe/cell/spu/spu_exec.c | 912 ++++++++++----------------------------
 src/mesa/pipe/cell/spu/spu_exec.h |   1 +
 2 files changed, 230 insertions(+), 683 deletions(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/spu/spu_exec.c b/src/mesa/pipe/cell/spu/spu_exec.c
index 168bada3bb..1ac9c031e3 100644
--- a/src/mesa/pipe/cell/spu/spu_exec.c
+++ b/src/mesa/pipe/cell/spu/spu_exec.c
@@ -52,8 +52,15 @@
 
 #include <libmisc.h>
 #include <spu_mfcio.h>
-#include <simdmath/sqrtf4.h>
+#include <simdmath/ceilf4.h>
+#include <simdmath/cosf4.h>
+#include <simdmath/divf4.h>
+#include <simdmath/floorf4.h>
+#include <simdmath/log2f4.h>
 #include <simdmath/powf4.h>
+#include <simdmath/sinf4.h>
+#include <simdmath/sqrtf4.h>
+#include <simdmath/truncf4.h>
 
 #include "pipe/p_compiler.h"
 #include "pipe/p_state.h"
@@ -157,643 +164,175 @@ spu_exec_machine_init(struct spu_exec_machine *mach,
 }
 
 
-static void
-micro_abs(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src )
-{
-   dst->f[0] = (float) fabs( (double) src->f[0] );
-   dst->f[1] = (float) fabs( (double) src->f[1] );
-   dst->f[2] = (float) fabs( (double) src->f[2] );
-   dst->f[3] = (float) fabs( (double) src->f[3] );
-}
-
-static void
-micro_add(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
-{
-   dst->f[0] = src0->f[0] + src1->f[0];
-   dst->f[1] = src0->f[1] + src1->f[1];
-   dst->f[2] = src0->f[2] + src1->f[2];
-   dst->f[3] = src0->f[3] + src1->f[3];
-}
-
-static void
-micro_iadd(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
-{
-   dst->i[0] = src0->i[0] + src1->i[0];
-   dst->i[1] = src0->i[1] + src1->i[1];
-   dst->i[2] = src0->i[2] + src1->i[2];
-   dst->i[3] = src0->i[3] + src1->i[3];
-}
-
-static void
-micro_and(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
-{
-   dst->u[0] = src0->u[0] & src1->u[0];
-   dst->u[1] = src0->u[1] & src1->u[1];
-   dst->u[2] = src0->u[2] & src1->u[2];
-   dst->u[3] = src0->u[3] & src1->u[3];
-}
-
-static void
-micro_ceil(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src )
-{
-   ASSERT(0);
-#if 0
-   dst->f[0] = (float) ceil( (double) src->f[0] );
-   dst->f[1] = (float) ceil( (double) src->f[1] );
-   dst->f[2] = (float) ceil( (double) src->f[2] );
-   dst->f[3] = (float) ceil( (double) src->f[3] );
-#endif
-}
-
-static void
-micro_cos(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src )
-{
-   ASSERT(0);
-#if 0
-   dst->f[0] = (float) cos( (double) src->f[0] );
-   dst->f[1] = (float) cos( (double) src->f[1] );
-   dst->f[2] = (float) cos( (double) src->f[2] );
-   dst->f[3] = (float) cos( (double) src->f[3] );
-#endif
-}
-
-static void
-micro_ddx(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src )
-{
-   dst->f[0] =
-   dst->f[1] =
-   dst->f[2] =
-   dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
-}
-
-static void
-micro_ddy(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src )
-{
-   dst->f[0] =
-   dst->f[1] =
-   dst->f[2] =
-   dst->f[3] = src->f[TILE_TOP_LEFT] - src->f[TILE_BOTTOM_LEFT];
-}
-
-static void
-micro_div(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
-{
-   dst->f[0] = src0->f[0] / src1->f[0];
-   dst->f[1] = src0->f[1] / src1->f[1];
-   dst->f[2] = src0->f[2] / src1->f[2];
-   dst->f[3] = src0->f[3] / src1->f[3];
-}
-
-static void
-micro_udiv(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
-{
-   dst->u[0] = src0->u[0] / src1->u[0];
-   dst->u[1] = src0->u[1] / src1->u[1];
-   dst->u[2] = src0->u[2] / src1->u[2];
-   dst->u[3] = src0->u[3] / src1->u[3];
-}
-
-static void
-micro_eq(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1,
-   const union spu_exec_channel *src2,
-   const union spu_exec_channel *src3 )
-{
-   dst->f[0] = src0->f[0] == src1->f[0] ? src2->f[0] : src3->f[0];
-   dst->f[1] = src0->f[1] == src1->f[1] ? src2->f[1] : src3->f[1];
-   dst->f[2] = src0->f[2] == src1->f[2] ? src2->f[2] : src3->f[2];
-   dst->f[3] = src0->f[3] == src1->f[3] ? src2->f[3] : src3->f[3];
-}
-
-static void
-micro_ieq(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1,
-   const union spu_exec_channel *src2,
-   const union spu_exec_channel *src3 )
-{
-   dst->i[0] = src0->i[0] == src1->i[0] ? src2->i[0] : src3->i[0];
-   dst->i[1] = src0->i[1] == src1->i[1] ? src2->i[1] : src3->i[1];
-   dst->i[2] = src0->i[2] == src1->i[2] ? src2->i[2] : src3->i[2];
-   dst->i[3] = src0->i[3] == src1->i[3] ? src2->i[3] : src3->i[3];
-}
-
-static void
-micro_exp2(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src)
-{
-   ASSERT(0);
-#if 0
-   dst->f[0] = (float) pow( 2.0, (double) src->f[0] );
-   dst->f[1] = (float) pow( 2.0, (double) src->f[1] );
-   dst->f[2] = (float) pow( 2.0, (double) src->f[2] );
-   dst->f[3] = (float) pow( 2.0, (double) src->f[3] );
-#endif
-}
-
-static void
-micro_f2it(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src )
-{
-   dst->i[0] = (int) src->f[0];
-   dst->i[1] = (int) src->f[1];
-   dst->i[2] = (int) src->f[2];
-   dst->i[3] = (int) src->f[3];
-}
-
-static void
-micro_f2ut(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src )
-{
-   dst->u[0] = (uint) src->f[0];
-   dst->u[1] = (uint) src->f[1];
-   dst->u[2] = (uint) src->f[2];
-   dst->u[3] = (uint) src->f[3];
-}
-
-static void
-micro_flr(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src )
-{
-   ASSERT(0);
-#if 0
-   dst->f[0] = (float) floor( (double) src->f[0] );
-   dst->f[1] = (float) floor( (double) src->f[1] );
-   dst->f[2] = (float) floor( (double) src->f[2] );
-   dst->f[3] = (float) floor( (double) src->f[3] );
-#endif
-}
-
-static void
-micro_frc(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src )
-{
-   ASSERT(0);
-#if 0
-   dst->f[0] = src->f[0] - (float) floor( (double) src->f[0] );
-   dst->f[1] = src->f[1] - (float) floor( (double) src->f[1] );
-   dst->f[2] = src->f[2] - (float) floor( (double) src->f[2] );
-   dst->f[3] = src->f[3] - (float) floor( (double) src->f[3] );
-#endif
-}
-
-static void
-micro_ge(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1,
-   const union spu_exec_channel *src2,
-   const union spu_exec_channel *src3 )
-{
-   dst->f[0] = src0->f[0] >= src1->f[0] ? src2->f[0] : src3->f[0];
-   dst->f[1] = src0->f[1] >= src1->f[1] ? src2->f[1] : src3->f[1];
-   dst->f[2] = src0->f[2] >= src1->f[2] ? src2->f[2] : src3->f[2];
-   dst->f[3] = src0->f[3] >= src1->f[3] ? src2->f[3] : src3->f[3];
-}
-
-static void
-micro_i2f(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src )
-{
-   dst->f[0] = (float) src->i[0];
-   dst->f[1] = (float) src->i[1];
-   dst->f[2] = (float) src->i[2];
-   dst->f[3] = (float) src->i[3];
-}
-
-static void
-micro_lg2(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src )
-{
-   ASSERT(0);
-#if 0
-   dst->f[0] = (float) log( (double) src->f[0] ) * 1.442695f;
-   dst->f[1] = (float) log( (double) src->f[1] ) * 1.442695f;
-   dst->f[2] = (float) log( (double) src->f[2] ) * 1.442695f;
-   dst->f[3] = (float) log( (double) src->f[3] ) * 1.442695f;
-#endif
-}
-
-static void
-micro_lt(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1,
-   const union spu_exec_channel *src2,
-   const union spu_exec_channel *src3 )
+static INLINE qword
+micro_abs(qword src)
 {
-   dst->f[0] = src0->f[0] < src1->f[0] ? src2->f[0] : src3->f[0];
-   dst->f[1] = src0->f[1] < src1->f[1] ? src2->f[1] : src3->f[1];
-   dst->f[2] = src0->f[2] < src1->f[2] ? src2->f[2] : src3->f[2];
-   dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3];
+   return si_rotmi(si_shli(src, 1), -1);
 }
 
-static void
-micro_ilt(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1,
-   const union spu_exec_channel *src2,
-   const union spu_exec_channel *src3 )
+static INLINE qword
+micro_ceil(qword src)
 {
-   dst->i[0] = src0->i[0] < src1->i[0] ? src2->i[0] : src3->i[0];
-   dst->i[1] = src0->i[1] < src1->i[1] ? src2->i[1] : src3->i[1];
-   dst->i[2] = src0->i[2] < src1->i[2] ? src2->i[2] : src3->i[2];
-   dst->i[3] = src0->i[3] < src1->i[3] ? src2->i[3] : src3->i[3];
+   return (qword) _ceilf4((vec_float4) src);
 }
 
-static void
-micro_ult(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1,
-   const union spu_exec_channel *src2,
-   const union spu_exec_channel *src3 )
+static INLINE qword
+micro_cos(qword src)
 {
-   dst->u[0] = src0->u[0] < src1->u[0] ? src2->u[0] : src3->u[0];
-   dst->u[1] = src0->u[1] < src1->u[1] ? src2->u[1] : src3->u[1];
-   dst->u[2] = src0->u[2] < src1->u[2] ? src2->u[2] : src3->u[2];
-   dst->u[3] = src0->u[3] < src1->u[3] ? src2->u[3] : src3->u[3];
+   return (qword) _cosf4((vec_float4) src);
 }
 
-static void
-micro_max(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
+static const qword br_shuf = {
+   TILE_BOTTOM_RIGHT + 0, TILE_BOTTOM_RIGHT + 1,
+   TILE_BOTTOM_RIGHT + 2, TILE_BOTTOM_RIGHT + 3,
+   TILE_BOTTOM_RIGHT + 0, TILE_BOTTOM_RIGHT + 1,
+   TILE_BOTTOM_RIGHT + 2, TILE_BOTTOM_RIGHT + 3,
+   TILE_BOTTOM_RIGHT + 0, TILE_BOTTOM_RIGHT + 1,
+   TILE_BOTTOM_RIGHT + 2, TILE_BOTTOM_RIGHT + 3,
+   TILE_BOTTOM_RIGHT + 0, TILE_BOTTOM_RIGHT + 1,
+   TILE_BOTTOM_RIGHT + 2, TILE_BOTTOM_RIGHT + 3,
+};
+
+static const qword bl_shuf = {
+   TILE_BOTTOM_LEFT + 0, TILE_BOTTOM_LEFT + 1,
+   TILE_BOTTOM_LEFT + 2, TILE_BOTTOM_LEFT + 3,
+   TILE_BOTTOM_LEFT + 0, TILE_BOTTOM_LEFT + 1,
+   TILE_BOTTOM_LEFT + 2, TILE_BOTTOM_LEFT + 3,
+   TILE_BOTTOM_LEFT + 0, TILE_BOTTOM_LEFT + 1,
+   TILE_BOTTOM_LEFT + 2, TILE_BOTTOM_LEFT + 3,
+   TILE_BOTTOM_LEFT + 0, TILE_BOTTOM_LEFT + 1,
+   TILE_BOTTOM_LEFT + 2, TILE_BOTTOM_LEFT + 3,
+};
+
+static const qword tl_shuf = {
+   TILE_TOP_LEFT + 0, TILE_TOP_LEFT + 1,
+   TILE_TOP_LEFT + 2, TILE_TOP_LEFT + 3,
+   TILE_TOP_LEFT + 0, TILE_TOP_LEFT + 1,
+   TILE_TOP_LEFT + 2, TILE_TOP_LEFT + 3,
+   TILE_TOP_LEFT + 0, TILE_TOP_LEFT + 1,
+   TILE_TOP_LEFT + 2, TILE_TOP_LEFT + 3,
+   TILE_TOP_LEFT + 0, TILE_TOP_LEFT + 1,
+   TILE_TOP_LEFT + 2, TILE_TOP_LEFT + 3,
+};
+
+static qword
+micro_ddx(qword src)
 {
-   dst->f[0] = src0->f[0] > src1->f[0] ? src0->f[0] : src1->f[0];
-   dst->f[1] = src0->f[1] > src1->f[1] ? src0->f[1] : src1->f[1];
-   dst->f[2] = src0->f[2] > src1->f[2] ? src0->f[2] : src1->f[2];
-   dst->f[3] = src0->f[3] > src1->f[3] ? src0->f[3] : src1->f[3];
-}
+   qword bottom_right = si_shufb(src, src, br_shuf);
+   qword bottom_left = si_shufb(src, src, bl_shuf);
 
-static void
-micro_imax(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
-{
-   dst->i[0] = src0->i[0] > src1->i[0] ? src0->i[0] : src1->i[0];
-   dst->i[1] = src0->i[1] > src1->i[1] ? src0->i[1] : src1->i[1];
-   dst->i[2] = src0->i[2] > src1->i[2] ? src0->i[2] : src1->i[2];
-   dst->i[3] = src0->i[3] > src1->i[3] ? src0->i[3] : src1->i[3];
+   return si_fs(bottom_right, bottom_left);
 }
 
-static void
-micro_umax(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
+static qword
+micro_ddy(qword src)
 {
-   dst->u[0] = src0->u[0] > src1->u[0] ? src0->u[0] : src1->u[0];
-   dst->u[1] = src0->u[1] > src1->u[1] ? src0->u[1] : src1->u[1];
-   dst->u[2] = src0->u[2] > src1->u[2] ? src0->u[2] : src1->u[2];
-   dst->u[3] = src0->u[3] > src1->u[3] ? src0->u[3] : src1->u[3];
-}
+   qword top_left = si_shufb(src, src, tl_shuf);
+   qword bottom_left = si_shufb(src, src, bl_shuf);
 
-static void
-micro_min(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
-{
-   dst->f[0] = src0->f[0] < src1->f[0] ? src0->f[0] : src1->f[0];
-   dst->f[1] = src0->f[1] < src1->f[1] ? src0->f[1] : src1->f[1];
-   dst->f[2] = src0->f[2] < src1->f[2] ? src0->f[2] : src1->f[2];
-   dst->f[3] = src0->f[3] < src1->f[3] ? src0->f[3] : src1->f[3];
+   return si_fs(top_left, bottom_left);
 }
 
-static void
-micro_imin(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
+static INLINE qword
+micro_div(qword src0, qword src1)
 {
-   dst->i[0] = src0->i[0] < src1->i[0] ? src0->i[0] : src1->i[0];
-   dst->i[1] = src0->i[1] < src1->i[1] ? src0->i[1] : src1->i[1];
-   dst->i[2] = src0->i[2] < src1->i[2] ? src0->i[2] : src1->i[2];
-   dst->i[3] = src0->i[3] < src1->i[3] ? src0->i[3] : src1->i[3];
+   return (qword) _divf4((vec_float4) src0, (vec_float4) src1);
 }
 
-static void
-micro_umin(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
+static qword
+micro_flr(qword src)
 {
-   dst->u[0] = src0->u[0] < src1->u[0] ? src0->u[0] : src1->u[0];
-   dst->u[1] = src0->u[1] < src1->u[1] ? src0->u[1] : src1->u[1];
-   dst->u[2] = src0->u[2] < src1->u[2] ? src0->u[2] : src1->u[2];
-   dst->u[3] = src0->u[3] < src1->u[3] ? src0->u[3] : src1->u[3];
+   return (qword) _floorf4((vec_float4) src);
 }
 
-static void
-micro_umod(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
+static qword
+micro_frc(qword src)
 {
-   dst->u[0] = src0->u[0] % src1->u[0];
-   dst->u[1] = src0->u[1] % src1->u[1];
-   dst->u[2] = src0->u[2] % src1->u[2];
-   dst->u[3] = src0->u[3] % src1->u[3];
+   return si_fs(src, (qword) _floorf4((vec_float4) src));
 }
 
-static void
-micro_mul(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
-{
-   dst->f[0] = src0->f[0] * src1->f[0];
-   dst->f[1] = src0->f[1] * src1->f[1];
-   dst->f[2] = src0->f[2] * src1->f[2];
-   dst->f[3] = src0->f[3] * src1->f[3];
-}
-
-static void
-micro_imul(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
-{
-   dst->i[0] = src0->i[0] * src1->i[0];
-   dst->i[1] = src0->i[1] * src1->i[1];
-   dst->i[2] = src0->i[2] * src1->i[2];
-   dst->i[3] = src0->i[3] * src1->i[3];
-}
-
-static void
-micro_imul64(
-   union spu_exec_channel *dst0,
-   union spu_exec_channel *dst1,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
-{
-   dst1->i[0] = src0->i[0] * src1->i[0];
-   dst1->i[1] = src0->i[1] * src1->i[1];
-   dst1->i[2] = src0->i[2] * src1->i[2];
-   dst1->i[3] = src0->i[3] * src1->i[3];
-   dst0->i[0] = 0;
-   dst0->i[1] = 0;
-   dst0->i[2] = 0;
-   dst0->i[3] = 0;
-}
-
-static void
-micro_umul64(
-   union spu_exec_channel *dst0,
-   union spu_exec_channel *dst1,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
-{
-   dst1->u[0] = src0->u[0] * src1->u[0];
-   dst1->u[1] = src0->u[1] * src1->u[1];
-   dst1->u[2] = src0->u[2] * src1->u[2];
-   dst1->u[3] = src0->u[3] * src1->u[3];
-   dst0->u[0] = 0;
-   dst0->u[1] = 0;
-   dst0->u[2] = 0;
-   dst0->u[3] = 0;
-}
-
-static void
-micro_movc(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1,
-   const union spu_exec_channel *src2 )
-{
-   dst->u[0] = src0->u[0] ? src1->u[0] : src2->u[0];
-   dst->u[1] = src0->u[1] ? src1->u[1] : src2->u[1];
-   dst->u[2] = src0->u[2] ? src1->u[2] : src2->u[2];
-   dst->u[3] = src0->u[3] ? src1->u[3] : src2->u[3];
-}
-
-static void
-micro_neg(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src )
+static INLINE qword
+micro_ge(qword src0, qword src1)
 {
-   dst->f[0] = -src->f[0];
-   dst->f[1] = -src->f[1];
-   dst->f[2] = -src->f[2];
-   dst->f[3] = -src->f[3];
+   return si_or(si_fceq(src0, src1), si_fcgt(src0, src1));
 }
 
-static void
-micro_ineg(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src )
+static qword
+micro_lg2(qword src)
 {
-   dst->i[0] = -src->i[0];
-   dst->i[1] = -src->i[1];
-   dst->i[2] = -src->i[2];
-   dst->i[3] = -src->i[3];
+   return (qword) _log2f4((vec_float4) src);
 }
 
-static void
-micro_not(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src )
+static INLINE qword
+micro_lt(qword src0, qword src1)
 {
-   dst->u[0] = ~src->u[0];
-   dst->u[1] = ~src->u[1];
-   dst->u[2] = ~src->u[2];
-   dst->u[3] = ~src->u[3];
-}
+   /* (src0 < src1) is (src1 > src0); each word is all ones when true, else 0 */
 
-static void
-micro_or(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
-{
-   dst->u[0] = src0->u[0] | src1->u[0];
-   dst->u[1] = src0->u[1] | src1->u[1];
-   dst->u[2] = src0->u[2] | src1->u[2];
-   dst->u[3] = src0->u[3] | src1->u[3];
+   return si_fcgt(src1, src0);
 }
 
-static void
-micro_pow(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
+static INLINE qword
+micro_max(qword src0, qword src1)
 {
-   vec_float4 s0 = (vec_float4) {
-      src0->f[0], src0->f[1], src0->f[2], src0->f[3]
-   };
-   vec_float4 s1 = (vec_float4) {
-      src1->f[0], src1->f[1], src1->f[2], src1->f[3]
-   };
-   vec_float4 d = _powf4(s0, s1);
-
-   dst->f[0] = spu_extract(d, 0);
-   dst->f[1] = spu_extract(d, 1);
-   dst->f[2] = spu_extract(d, 2);
-   dst->f[3] = spu_extract(d, 3);
+   return si_selb(src1, src0, si_fcgt(src0, src1));
 }
 
-static void
-micro_rnd(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src )
+static INLINE qword
+micro_min(qword src0, qword src1)
 {
-   ASSERT(0);
-#if 0
-   dst->f[0] = (float) floor( (double) (src->f[0] + 0.5f) );
-   dst->f[1] = (float) floor( (double) (src->f[1] + 0.5f) );
-   dst->f[2] = (float) floor( (double) (src->f[2] + 0.5f) );
-   dst->f[3] = (float) floor( (double) (src->f[3] + 0.5f) );
-#endif
+   return si_selb(src0, src1, si_fcgt(src0, src1));
 }
 
-static void
-micro_shl(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
+static qword
+micro_neg(qword src)
 {
-   dst->i[0] = src0->i[0] << src1->i[0];
-   dst->i[1] = src0->i[1] << src1->i[1];
-   dst->i[2] = src0->i[2] << src1->i[2];
-   dst->i[3] = src0->i[3] << src1->i[3];
+   return si_xor(src, (qword) spu_splats(0x80000000));
 }
 
-static void
-micro_ishr(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
+static qword
+micro_set_sign(qword src)
 {
-   dst->i[0] = src0->i[0] >> src1->i[0];
-   dst->i[1] = src0->i[1] >> src1->i[1];
-   dst->i[2] = src0->i[2] >> src1->i[2];
-   dst->i[3] = src0->i[3] >> src1->i[3];
+   return si_or(src, (qword) spu_splats(0x80000000));
 }
 
-static void
-micro_trunc(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0 )
+static qword
+micro_pow(qword src0, qword src1)
 {
-   dst->f[0] = (float) (int) src0->f[0];
-   dst->f[1] = (float) (int) src0->f[1];
-   dst->f[2] = (float) (int) src0->f[2];
-   dst->f[3] = (float) (int) src0->f[3];
+   return (qword) _powf4((vec_float4) src0, (vec_float4) src1);
 }
 
-static void
-micro_ushr(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
+static qword
+micro_rnd(qword src)
 {
-   dst->u[0] = src0->u[0] >> src1->u[0];
-   dst->u[1] = src0->u[1] >> src1->u[1];
-   dst->u[2] = src0->u[2] >> src1->u[2];
-   dst->u[3] = src0->u[3] >> src1->u[3];
-}
+   const qword half = (qword) spu_splats(0.5f);
 
-static void
-micro_sin(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src )
-{
-   ASSERT(0);
-#if 0
-   dst->f[0] = (float) sin( (double) src->f[0] );
-   dst->f[1] = (float) sin( (double) src->f[1] );
-   dst->f[2] = (float) sin( (double) src->f[2] );
-   dst->f[3] = (float) sin( (double) src->f[3] );
-#endif
+   /* May be able to use _roundf4.  There may be some difference, though.
+    */
+   return (qword) _floorf4((vec_float4) si_fa(src, half));
 }
 
-static void
-micro_sqrt( union spu_exec_channel *dst,
-            const union spu_exec_channel *src )
+static INLINE qword
+micro_ishr(qword src0, qword src1)
 {
-   vec_float4 s = (vec_float4) {
-      src->f[0], src->f[1], src->f[2], src->f[3]
-   };
-   vec_float4 d = _sqrtf4(s);
-
-   dst->f[0] = spu_extract(d, 0);
-   dst->f[1] = spu_extract(d, 1);
-   dst->f[2] = spu_extract(d, 2);
-   dst->f[3] = spu_extract(d, 3);
+   return si_rotma(src0, si_sfi(src1, 0));
 }
 
-static void
-micro_sub(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
+static qword
+micro_trunc(qword src)
 {
-   dst->f[0] = src0->f[0] - src1->f[0];
-   dst->f[1] = src0->f[1] - src1->f[1];
-   dst->f[2] = src0->f[2] - src1->f[2];
-   dst->f[3] = src0->f[3] - src1->f[3];
+   return (qword) _truncf4((vec_float4) src);
 }
 
-static void
-micro_u2f(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src )
+static qword
+micro_sin(qword src)
 {
-   dst->f[0] = (float) src->u[0];
-   dst->f[1] = (float) src->u[1];
-   dst->f[2] = (float) src->u[2];
-   dst->f[3] = (float) src->u[3];
+   return (qword) _sinf4((vec_float4) src);
 }
 
-static void
-micro_xor(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
+static INLINE qword
+micro_sqrt(qword src)
 {
-   dst->u[0] = src0->u[0] ^ src1->u[0];
-   dst->u[1] = src0->u[1] ^ src1->u[1];
-   dst->u[2] = src0->u[2] ^ src1->u[2];
-   dst->u[3] = src0->u[3] ^ src1->u[3];
+   return (qword) _sqrtf4((vec_float4) src);
 }
 
 static void
@@ -983,16 +522,15 @@ fetch_source(
 
    switch (tgsi_util_get_full_src_register_sign_mode( reg, chan_index )) {
    case TGSI_UTIL_SIGN_CLEAR:
-      micro_abs( chan, chan );
+      chan->q = micro_abs(chan->q);
       break;
 
    case TGSI_UTIL_SIGN_SET:
-      micro_abs( chan, chan );
-      micro_neg( chan, chan );
+      chan->q = micro_set_sign(chan->q);
       break;
 
    case TGSI_UTIL_SIGN_TOGGLE:
-      micro_neg( chan, chan );
+      chan->q = micro_neg(chan->q);
       break;
 
    case TGSI_UTIL_SIGN_KEEP:
@@ -1000,7 +538,7 @@ fetch_source(
    }
 
    if (reg->SrcRegisterExtMod.Complement) {
-      micro_sub( chan, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], chan );
+      chan->q = si_fs(mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q, chan->q);
    }
 }
 
@@ -1051,8 +589,8 @@ store_dest(
 
    case TGSI_SAT_ZERO_ONE:
       /* XXX need to obey ExecMask here */
-      micro_max(dst, chan, &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
-      micro_min(dst, dst, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C]);
+      dst->q = micro_max(chan->q, mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q);
+      dst->q = micro_min(dst->q, mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q);
       break;
 
    case TGSI_SAT_MINUS_PLUS_ONE:
@@ -1162,7 +700,7 @@ exec_tex(struct spu_exec_machine *mach,
       switch (inst->FullSrcRegisters[0].SrcRegisterExtSwz.ExtDivide) {
       case TGSI_EXTSWIZZLE_W:
          FETCH(&r[1], 0, CHAN_W);
-         micro_div( &r[0], &r[0], &r[1] );
+         r[0].q = micro_div(r[0].q, r[1].q);
          break;
 
       case TGSI_EXTSWIZZLE_ONE:
@@ -1194,9 +732,9 @@ exec_tex(struct spu_exec_machine *mach,
       switch (inst->FullSrcRegisters[0].SrcRegisterExtSwz.ExtDivide) {
       case TGSI_EXTSWIZZLE_W:
          FETCH(&r[3], 0, CHAN_W);
-         micro_div( &r[0], &r[0], &r[3] );
-         micro_div( &r[1], &r[1], &r[3] );
-         micro_div( &r[2], &r[2], &r[3] );
+         r[0].q = micro_div(r[0].q, r[3].q);
+         r[1].q = micro_div(r[1].q, r[3].q);
+         r[2].q = micro_div(r[2].q, r[3].q);
          break;
 
       case TGSI_EXTSWIZZLE_ONE:
@@ -1228,9 +766,9 @@ exec_tex(struct spu_exec_machine *mach,
       switch (inst->FullSrcRegisters[0].SrcRegisterExtSwz.ExtDivide) {
       case TGSI_EXTSWIZZLE_W:
          FETCH(&r[3], 0, CHAN_W);
-         micro_div( &r[0], &r[0], &r[3] );
-         micro_div( &r[1], &r[1], &r[3] );
-         micro_div( &r[2], &r[2], &r[3] );
+         r[0].q = micro_div(r[0].q, r[3].q);
+         r[1].q = micro_div(r[1].q, r[3].q);
+         r[2].q = micro_div(r[2].q, r[3].q);
          break;
 
       case TGSI_EXTSWIZZLE_ONE:
@@ -1389,7 +927,7 @@ exec_instruction(
    case TGSI_OPCODE_ARL:
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
 	 FETCH( &r[0], 0, chan_index );
-	 micro_f2it( &r[0], &r[0] );
+         r[0].q = si_cflts(r[0].q, 0);
 	 STORE( &r[0], 0, chan_index );
       }
       break;
@@ -1409,22 +947,27 @@ exec_instruction(
 
       if (IS_CHANNEL_ENABLED( *inst, CHAN_Y ) || IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
 	 FETCH( &r[0], 0, CHAN_X );
-	 if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
-	    micro_max( &r[0], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
+         if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
+            r[0].q = micro_max(r[0].q, mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q);
 	    STORE( &r[0], 0, CHAN_Y );
 	 }
 
-	 if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
-	    FETCH( &r[1], 0, CHAN_Y );
-	    micro_max( &r[1], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
-
-	    FETCH( &r[2], 0, CHAN_W );
-	    micro_min( &r[2], &r[2], &mach->Temps[TEMP_128_I].xyzw[TEMP_128_C] );
-	    micro_max( &r[2], &r[2], &mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C] );
-	    micro_pow( &r[1], &r[1], &r[2] );
-	    micro_lt( &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
-	    STORE( &r[0], 0, CHAN_Z );
-	 }
+         if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
+            FETCH( &r[1], 0, CHAN_Y );
+            r[1].q = micro_max(r[1].q, mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q);
+
+            FETCH( &r[2], 0, CHAN_W );
+            r[2].q = micro_min(r[2].q, mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].q);
+            r[2].q = micro_max(r[2].q, mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].q);
+            r[1].q = micro_pow(r[1].q, r[2].q);
+
+            /* r0 = (r0 > 0.0) ? r1 : 0.0
+             */
+            r[0].q = si_fcgt(r[0].q, mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q);
+            r[0].q = si_selb(mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q, r[1].q,
+                             r[0].q);
+            STORE( &r[0], 0, CHAN_Z );
+         }
       }
 
       if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
@@ -1435,7 +978,7 @@ exec_instruction(
    case TGSI_OPCODE_RCP:
    /* TGSI_OPCODE_RECIP */
       FETCH( &r[0], 0, CHAN_X );
-      micro_div( &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0] );
+      r[0].q = micro_div(mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q, r[0].q);
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
 	 STORE( &r[0], 0, chan_index );
       }
@@ -1444,8 +987,8 @@ exec_instruction(
    case TGSI_OPCODE_RSQ:
    /* TGSI_OPCODE_RECIPSQRT */
       FETCH( &r[0], 0, CHAN_X );
-      micro_sqrt( &r[0], &r[0] );
-      micro_div( &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0] );
+      r[0].q = micro_sqrt(r[0].q);
+      r[0].q = micro_div(mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q, r[0].q);
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
 	 STORE( &r[0], 0, chan_index );
       }
@@ -1465,7 +1008,7 @@ exec_instruction(
          FETCH(&r[0], 0, chan_index);
          FETCH(&r[1], 1, chan_index);
 
-         micro_mul( &r[0], &r[0], &r[1] );
+         r[0].q = si_fm(r[0].q, r[1].q);
 
          STORE(&r[0], 0, chan_index);
       }
@@ -1475,7 +1018,7 @@ exec_instruction(
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
          FETCH( &r[1], 1, chan_index );
-         micro_add( &r[0], &r[0], &r[1] );
+         r[0].q = si_fa(r[0].q, r[1].q);
          STORE( &r[0], 0, chan_index );
       }
       break;
@@ -1484,17 +1027,16 @@ exec_instruction(
    /* TGSI_OPCODE_DOT3 */
       FETCH( &r[0], 0, CHAN_X );
       FETCH( &r[1], 1, CHAN_X );
-      micro_mul( &r[0], &r[0], &r[1] );
+      r[0].q = si_fm(r[0].q, r[1].q);
 
       FETCH( &r[1], 0, CHAN_Y );
       FETCH( &r[2], 1, CHAN_Y );
-      micro_mul( &r[1], &r[1], &r[2] );
-      micro_add( &r[0], &r[0], &r[1] );
+      r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
+
 
       FETCH( &r[1], 0, CHAN_Z );
       FETCH( &r[2], 1, CHAN_Z );
-      micro_mul( &r[1], &r[1], &r[2] );
-      micro_add( &r[0], &r[0], &r[1] );
+      r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
 
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          STORE( &r[0], 0, chan_index );
@@ -1506,25 +1048,22 @@ exec_instruction(
        FETCH(&r[0], 0, CHAN_X);
        FETCH(&r[1], 1, CHAN_X);
 
-       micro_mul( &r[0], &r[0], &r[1] );
+      r[0].q = si_fm(r[0].q, r[1].q);
 
        FETCH(&r[1], 0, CHAN_Y);
        FETCH(&r[2], 1, CHAN_Y);
 
-       micro_mul( &r[1], &r[1], &r[2] );
-       micro_add( &r[0], &r[0], &r[1] );
+      r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
 
        FETCH(&r[1], 0, CHAN_Z);
        FETCH(&r[2], 1, CHAN_Z);
 
-       micro_mul( &r[1], &r[1], &r[2] );
-       micro_add( &r[0], &r[0], &r[1] );
+      r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
 
        FETCH(&r[1], 0, CHAN_W);
        FETCH(&r[2], 1, CHAN_W);
 
-       micro_mul( &r[1], &r[1], &r[2] );
-       micro_add( &r[0], &r[0], &r[1] );
+      r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
 
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
 	 STORE( &r[0], 0, chan_index );
@@ -1539,7 +1078,7 @@ exec_instruction(
       if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
 	 FETCH( &r[0], 0, CHAN_Y );
 	 FETCH( &r[1], 1, CHAN_Y);
-	 micro_mul( &r[0], &r[0], &r[1] );
+      r[0].q = si_fm(r[0].q, r[1].q);
 	 STORE( &r[0], 0, CHAN_Y );
       }
 
@@ -1559,8 +1098,7 @@ exec_instruction(
          FETCH(&r[0], 0, chan_index);
          FETCH(&r[1], 1, chan_index);
 
-         /* XXX use micro_min()?? */
-         micro_lt( &r[0], &r[0], &r[1], &r[0], &r[1] );
+         r[0].q = micro_min(r[0].q, r[1].q);
 
          STORE(&r[0], 0, chan_index);
       }
@@ -1571,8 +1109,7 @@ exec_instruction(
          FETCH(&r[0], 0, chan_index);
          FETCH(&r[1], 1, chan_index);
 
-         /* XXX use micro_max()?? */
-         micro_lt( &r[0], &r[0], &r[1], &r[1], &r[0] );
+         r[0].q = micro_max(r[0].q, r[1].q);
 
          STORE(&r[0], 0, chan_index );
       }
@@ -1583,7 +1120,10 @@ exec_instruction(
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
          FETCH( &r[1], 1, chan_index );
-         micro_lt( &r[0], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
+         /* r0 = (r0 < r1) ? 1.0 : 0.0.  The compare gives an all-ones or
+          * all-zeros word; ANDing it with 1.0 yields 1.0f or 0.0f. */
+         r[0].q = si_fcgt(r[1].q, r[0].q);
+         r[0].q = si_and(r[0].q, mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q);
          STORE( &r[0], 0, chan_index );
       }
       break;
@@ -1593,7 +1133,7 @@ exec_instruction(
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
          FETCH( &r[1], 1, chan_index );
-         micro_ge( &r[0], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
+         r[0].q = si_and(micro_ge(r[0].q, r[1].q), mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q); /* mask -> 1.0 : 0.0 */
          STORE( &r[0], 0, chan_index );
       }
       break;
@@ -1603,9 +1143,8 @@ exec_instruction(
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
          FETCH( &r[1], 1, chan_index );
-         micro_mul( &r[0], &r[0], &r[1] );
-         FETCH( &r[1], 2, chan_index );
-         micro_add( &r[0], &r[0], &r[1] );
+         FETCH( &r[2], 2, chan_index );
+         r[0].q = si_fma(r[0].q, r[1].q, r[2].q);
          STORE( &r[0], 0, chan_index );
       }
       break;
@@ -1615,7 +1154,7 @@ exec_instruction(
          FETCH(&r[0], 0, chan_index);
          FETCH(&r[1], 1, chan_index);
 
-         micro_sub( &r[0], &r[0], &r[1] );
+         r[0].q = si_fs(r[0].q, r[1].q);
 
          STORE(&r[0], 0, chan_index);
       }
@@ -1628,9 +1167,8 @@ exec_instruction(
          FETCH(&r[1], 1, chan_index);
          FETCH(&r[2], 2, chan_index);
 
-         micro_sub( &r[1], &r[1], &r[2] );
-         micro_mul( &r[0], &r[0], &r[1] );
-         micro_add( &r[0], &r[0], &r[2] );
+         r[1].q = si_fs(r[1].q, r[2].q);
+         r[0].q = si_fma(r[0].q, r[1].q, r[2].q);
 
          STORE(&r[0], 0, chan_index);
       }
@@ -1661,7 +1199,7 @@ exec_instruction(
    /* TGSI_OPCODE_FRC */
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
-         micro_frc( &r[0], &r[0] );
+         r[0].q = micro_frc(r[0].q);
          STORE( &r[0], 0, chan_index );
       }
       break;
@@ -1674,7 +1212,7 @@ exec_instruction(
    /* TGSI_OPCODE_FLR */
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
-         micro_flr( &r[0], &r[0] );
+         r[0].q = micro_flr(r[0].q);
          STORE( &r[0], 0, chan_index );
       }
       break;
@@ -1682,7 +1220,7 @@ exec_instruction(
    case TGSI_OPCODE_ROUND:
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
-         micro_rnd( &r[0], &r[0] );
+         r[0].q = micro_rnd(r[0].q);
          STORE( &r[0], 0, chan_index );
       }
       break;
@@ -1691,7 +1229,7 @@ exec_instruction(
     /* TGSI_OPCODE_EX2 */
       FETCH(&r[0], 0, CHAN_X);
 
-      micro_pow( &r[0], &mach->Temps[TEMP_2_I].xyzw[TEMP_2_C], &r[0] );
+      r[0].q = micro_pow(mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].q, r[0].q);
 
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
 	 STORE( &r[0], 0, chan_index );
@@ -1701,7 +1239,7 @@ exec_instruction(
    case TGSI_OPCODE_LOGBASE2:
    /* TGSI_OPCODE_LG2 */
       FETCH( &r[0], 0, CHAN_X );
-      micro_lg2( &r[0], &r[0] );
+      r[0].q = micro_lg2(r[0].q);
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          STORE( &r[0], 0, chan_index );
       }
@@ -1712,7 +1250,7 @@ exec_instruction(
       FETCH(&r[0], 0, CHAN_X);
       FETCH(&r[1], 1, CHAN_X);
 
-      micro_pow( &r[0], &r[0], &r[1] );
+      r[0].q = micro_pow(r[0].q, r[1].q);
 
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
 	 STORE( &r[0], 0, chan_index );
@@ -1723,35 +1261,34 @@ exec_instruction(
       /* TGSI_OPCODE_XPD */
       FETCH(&r[0], 0, CHAN_Y);
       FETCH(&r[1], 1, CHAN_Z);
-
-      micro_mul( &r[2], &r[0], &r[1] );
-
       FETCH(&r[3], 0, CHAN_Z);
       FETCH(&r[4], 1, CHAN_Y);
 
-      micro_mul( &r[5], &r[3], &r[4] );
-      micro_sub( &r[2], &r[2], &r[5] );
+      /* r2 = (r0 * r1) - (r3 * r4)
+       * (r4 is src1.y; r5 is not loaded until after X has been stored) */
+      r[2].q = si_fm(r[3].q, r[4].q);
+      r[2].q = si_fms(r[0].q, r[1].q, r[2].q);
 
       if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
          STORE( &r[2], 0, CHAN_X );
       }
 
       FETCH(&r[2], 1, CHAN_X);
-
-      micro_mul( &r[3], &r[3], &r[2] );
-
       FETCH(&r[5], 0, CHAN_X);
 
-      micro_mul( &r[1], &r[1], &r[5] );
-      micro_sub( &r[3], &r[3], &r[1] );
+      /* r3 = (r3 * r2) - (r1 * r5)
+       */
+      r[1].q = si_fm(r[1].q, r[5].q);
+      r[3].q = si_fms(r[3].q, r[2].q, r[1].q);
 
       if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
          STORE( &r[3], 0, CHAN_Y );
       }
 
-      micro_mul( &r[5], &r[5], &r[4] );
-      micro_mul( &r[0], &r[0], &r[2] );
-      micro_sub( &r[5], &r[5], &r[0] );
+      /* r5 = (r5 * r4) - (r0 * r2)
+       */
+      r[0].q = si_fm(r[0].q, r[2].q);
+      r[5].q = si_fms(r[5].q, r[4].q, r[0].q);
 
       if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
          STORE( &r[5], 0, CHAN_Z );
@@ -1770,7 +1307,7 @@ exec_instruction(
        FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
           FETCH(&r[0], 0, chan_index);
 
-          micro_abs( &r[0], &r[0] );
+          r[0].q = micro_abs(r[0].q);
 
           STORE(&r[0], 0, chan_index);
        }
@@ -1784,23 +1321,21 @@ exec_instruction(
       FETCH(&r[0], 0, CHAN_X);
       FETCH(&r[1], 1, CHAN_X);
 
-      micro_mul( &r[0], &r[0], &r[1] );
+      r[0].q = si_fm(r[0].q, r[1].q);
 
       FETCH(&r[1], 0, CHAN_Y);
       FETCH(&r[2], 1, CHAN_Y);
 
-      micro_mul( &r[1], &r[1], &r[2] );
-      micro_add( &r[0], &r[0], &r[1] );
+      r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
 
       FETCH(&r[1], 0, CHAN_Z);
       FETCH(&r[2], 1, CHAN_Z);
 
-      micro_mul( &r[1], &r[1], &r[2] );
-      micro_add( &r[0], &r[0], &r[1] );
+      r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
 
       FETCH(&r[1], 1, CHAN_W);
 
-      micro_add( &r[0], &r[0], &r[1] );
+      r[0].q = si_fa(r[0].q, r[1].q);
 
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
 	 STORE( &r[0], 0, chan_index );
@@ -1810,7 +1345,7 @@ exec_instruction(
    case TGSI_OPCODE_COS:
       FETCH(&r[0], 0, CHAN_X);
 
-      micro_cos( &r[0], &r[0] );
+      r[0].q = micro_cos(r[0].q);
 
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
 	 STORE( &r[0], 0, chan_index );
@@ -1820,7 +1355,7 @@ exec_instruction(
    case TGSI_OPCODE_DDX:
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
-         micro_ddx( &r[0], &r[0] );
+         r[0].q = micro_ddx(r[0].q);
          STORE( &r[0], 0, chan_index );
       }
       break;
@@ -1828,7 +1363,7 @@ exec_instruction(
    case TGSI_OPCODE_DDY:
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
-         micro_ddy( &r[0], &r[0] );
+         r[0].q = micro_ddy(r[0].q);
          STORE( &r[0], 0, chan_index );
       }
       break;
@@ -1866,9 +1401,9 @@ exec_instruction(
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
          FETCH( &r[1], 1, chan_index );
-         micro_eq( &r[0], &r[0], &r[1],
-                   &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C],
-                   &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
+         /* r0 = (r0 == r1) ? 1.0 : 0.0  (compare mask ANDed with 1.0) */
+         r[0].q = si_fceq(r[0].q, r[1].q);
+         r[0].q = si_and(r[0].q, mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q);
          STORE( &r[0], 0, chan_index );
       }
       break;
@@ -1881,14 +1416,14 @@ exec_instruction(
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
          FETCH( &r[1], 1, chan_index );
-         micro_lt( &r[0], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C] );
+         r[0].q = si_and(si_fcgt(r[0].q, r[1].q), mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q); /* mask -> 1.0 : 0.0 */
          STORE( &r[0], 0, chan_index );
       }
       break;
 
    case TGSI_OPCODE_SIN:
       FETCH( &r[0], 0, CHAN_X );
-      micro_sin( &r[0], &r[0] );
+      r[0].q = micro_sin(r[0].q);
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          STORE( &r[0], 0, chan_index );
       }
@@ -1898,7 +1433,10 @@ exec_instruction(
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
          FETCH( &r[1], 1, chan_index );
-         micro_ge( &r[0], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C] );
+         /* r0 = (r0 > r1) ? 0.0 : 1.0.  si_andc(a, b) is a & ~b, so it
+          * inverts the compare mask and turns it into 1.0f / 0.0f at once. */
+         r[0].q = si_fcgt(r[0].q, r[1].q);
+         r[0].q = si_andc(mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q, r[0].q);
          STORE( &r[0], 0, chan_index );
       }
       break;
@@ -1907,7 +1445,10 @@ exec_instruction(
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
          FETCH( &r[1], 1, chan_index );
-         micro_eq( &r[0], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C] );
+         /* r0 = (r0 == r1) ? 0.0 : 1.0.  si_andc(a, b) is a & ~b, so it
+          * inverts the compare mask and turns it into 1.0f / 0.0f at once. */
+         r[0].q = si_fceq(r[0].q, r[1].q);
+         r[0].q = si_andc(mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q, r[0].q);
          STORE( &r[0], 0, chan_index );
       }
       break;
@@ -2038,7 +1579,11 @@ exec_instruction(
          FETCH(&r[1], 1, chan_index);
          FETCH(&r[2], 2, chan_index);
 
-         micro_lt( &r[0], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[1], &r[2] );
+         /* r0 = (r0 < 0.0) ? r1 : r2
+          * (0.0 > r0 is the same test; si_selb takes its 2nd operand where
+          * the mask bits are set, so r1 has to be passed second). */
+         r[0].q = si_fcgt(mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q, r[0].q);
+         r[0].q = si_selb(r[2].q, r[1].q, r[0].q);
 
          STORE(&r[0], 0, chan_index);
       }
@@ -2049,11 +1594,11 @@ exec_instruction(
          FETCH( &r[0], 0, CHAN_X );
       }
       if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
-         micro_cos( &r[1], &r[0] );
+         r[1].q = micro_cos(r[0].q);
          STORE( &r[1], 0, CHAN_X );
       }
       if( IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
-         micro_sin( &r[1], &r[0] );
+         r[1].q = micro_sin(r[0].q);
          STORE( &r[1], 0, CHAN_Y );
       }
       if( IS_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
@@ -2075,12 +1620,11 @@ exec_instruction(
    case TGSI_OPCODE_DP2:
       FETCH( &r[0], 0, CHAN_X );
       FETCH( &r[1], 1, CHAN_X );
-      micro_mul( &r[0], &r[0], &r[1] );
+      r[0].q = si_fm(r[0].q, r[1].q);
 
       FETCH( &r[1], 0, CHAN_Y );
       FETCH( &r[2], 1, CHAN_Y );
-      micro_mul( &r[1], &r[1], &r[2] );
-      micro_add( &r[0], &r[0], &r[1] );
+      r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
 
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          STORE( &r[0], 0, chan_index );
@@ -2152,7 +1696,7 @@ exec_instruction(
    case TGSI_OPCODE_CEIL:
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
-         micro_ceil( &r[0], &r[0] );
+         r[0].q = micro_ceil(r[0].q);
          STORE( &r[0], 0, chan_index );
       }
       break;
@@ -2160,7 +1704,7 @@ exec_instruction(
    case TGSI_OPCODE_I2F:
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
-         micro_i2f( &r[0], &r[0] );
+         r[0].q = si_csflt(r[0].q, 0);
          STORE( &r[0], 0, chan_index );
       }
       break;
@@ -2168,7 +1712,7 @@ exec_instruction(
    case TGSI_OPCODE_NOT:
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
-         micro_not( &r[0], &r[0] );
+         r[0].q = si_xorbi(r[0].q, 0xff);
          STORE( &r[0], 0, chan_index );
       }
       break;
@@ -2176,7 +1720,7 @@ exec_instruction(
    case TGSI_OPCODE_TRUNC:
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
-         micro_trunc( &r[0], &r[0] );
+         r[0].q = micro_trunc(r[0].q);
          STORE( &r[0], 0, chan_index );
       }
       break;
@@ -2185,7 +1729,9 @@ exec_instruction(
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
          FETCH( &r[1], 1, chan_index );
-         micro_shl( &r[0], &r[0], &r[1] );
+
+         r[0].q = si_shl(r[0].q, r[1].q);
+
          STORE( &r[0], 0, chan_index );
       }
       break;
@@ -2194,7 +1740,7 @@ exec_instruction(
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
          FETCH( &r[1], 1, chan_index );
-         micro_ishr( &r[0], &r[0], &r[1] );
+         r[0].q = micro_ishr(r[0].q, r[1].q);
          STORE( &r[0], 0, chan_index );
       }
       break;
@@ -2203,7 +1749,7 @@ exec_instruction(
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
          FETCH( &r[1], 1, chan_index );
-         micro_and( &r[0], &r[0], &r[1] );
+         r[0].q = si_and(r[0].q, r[1].q);
          STORE( &r[0], 0, chan_index );
       }
       break;
@@ -2212,7 +1758,7 @@ exec_instruction(
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
          FETCH( &r[1], 1, chan_index );
-         micro_or( &r[0], &r[0], &r[1] );
+         r[0].q = si_or(r[0].q, r[1].q);
          STORE( &r[0], 0, chan_index );
       }
       break;
@@ -2225,7 +1771,7 @@ exec_instruction(
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
          FETCH( &r[1], 1, chan_index );
-         micro_xor( &r[0], &r[0], &r[1] );
+         r[0].q = si_xor(r[0].q, r[1].q);
          STORE( &r[0], 0, chan_index );
       }
       break;
diff --git a/src/mesa/pipe/cell/spu/spu_exec.h b/src/mesa/pipe/cell/spu/spu_exec.h
index 89e422ba48..b4c7661ef6 100644
--- a/src/mesa/pipe/cell/spu/spu_exec.h
+++ b/src/mesa/pipe/cell/spu/spu_exec.h
@@ -43,6 +43,7 @@ union spu_exec_channel
    float    f[QUAD_SIZE];
    int      i[QUAD_SIZE];
    unsigned u[QUAD_SIZE];
+   qword    q;
 };
 
 /**
-- 
cgit v1.2.3


From 490a7b1c73babd528b6d883471a8636157c5853a Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Fri, 1 Feb 2008 17:12:20 -0800
Subject: Vectorize vertex puller

---
 src/mesa/pipe/cell/spu/spu_vertex_fetch.c  | 186 +++++++++--------------------
 src/mesa/pipe/cell/spu/spu_vertex_shader.h |   4 +-
 2 files changed, 61 insertions(+), 129 deletions(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/spu/spu_vertex_fetch.c b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
index 5b0f2a6470..4133fbba17 100644
--- a/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
+++ b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
@@ -42,6 +42,8 @@
 #define DRAW_DBG 0
 
 
+static const vec_float4 defaults = { 0.0, 0.0, 0.0, 1.0 };
+
 /**
  * Fetch a float[4] vertex attribute from memory, doing format/type
  * conversion as needed.
@@ -50,19 +52,16 @@
  * conversion, texture sampling etc.
  */
 #define FETCH_ATTRIB( NAME, SZ, CVT )			\
-static void						\
-fetch_##NAME(const void *ptr, float *attrib)		\
+static qword						\
+fetch_##NAME(const void *ptr)				\
 {							\
-   static const float defaults[4] = { 0,0,0,1 };	\
+   vec_float4 attrib = defaults;			\
    int i;						\
 							\
    for (i = 0; i < SZ; i++) {				\
-      attrib[i] = CVT;					\
-   }							\
-							\
-   for (; i < 4; i++) {					\
-      attrib[i] = defaults[i];				\
+      attrib = spu_insert(CVT, attrib, i);		\
    }							\
+   return (qword) attrib;				\
 }
 
 #define CVT_64_FLOAT   (float) ((double *) ptr)[i]
@@ -309,106 +308,59 @@ static spu_fetch_func get_fetch_func( enum pipe_format format )
 }
 
 
-static void 
-transpose_4x4( float *out, const float *in )
-{
-   /* This can be achieved in 12 sse instructions, plus the final
-    * stores I guess.  This is probably a bit more than that - maybe
-    * 32 or so?
-    */
-   out[0] = in[0];  out[1] = in[4];  out[2] = in[8];   out[3] = in[12];
-   out[4] = in[1];  out[5] = in[5];  out[6] = in[9];   out[7] = in[13];
-   out[8] = in[2];  out[9] = in[6];  out[10] = in[10]; out[11] = in[14];
-   out[12] = in[3]; out[13] = in[7]; out[14] = in[11]; out[15] = in[15];
-}
-
-
-
-static void fetch_xyz_rgb( struct spu_vs_context *draw,
-			   struct spu_exec_machine *machine,
-			   const unsigned *elts,
-			   unsigned count )
+void
+spu_transpose_4x4(qword *out, const qword *in)
 {
-   assert(count <= 4);
-
-//   _mesa_printf("%s\n", __FUNCTION__);
-
-   /* loop over vertex attributes (vertex shader inputs)
-    */
-
-   const unsigned *pitch   = draw->vertex_fetch.pitch;
-   const ubyte **src       = draw->vertex_fetch.src_ptr;
-   int i;
-
-   for (i = 0; i < 4; i++) {
+   static const qword masks[8] = {
       {
-	 const float *in = (const float *)(src[0] + elts[i] * pitch[0]);
-	 float *out = &machine->Inputs[0].xyzw[0].f[i];
-	 out[0] = in[0];
-	 out[4] = in[1];
-	 out[8] = in[2];
- 	 out[12] = 1.0f;
-      }
-
+         0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+      },
       {
-	 const float *in = (const float *)(src[1] + elts[i] * pitch[1]);
-	 float *out = &machine->Inputs[1].xyzw[0].f[i];
-	 out[0] = in[0];
-	 out[4] = in[1];
-	 out[8] = in[2];
- 	 out[12] = 1.0f;
-      }
-   }
-}
-
-
-
-
-static void fetch_xyz_rgb_st( struct spu_vs_context *draw,
-			      struct spu_exec_machine *machine,
-			      const unsigned *elts,
-			      unsigned count )
-{
-   assert(count <= 4);
-
-   /* loop over vertex attributes (vertex shader inputs)
-    */
-
-   const unsigned *pitch   = draw->vertex_fetch.pitch;
-   const ubyte **src       = draw->vertex_fetch.src_ptr;
-   int i;
-
-   for (i = 0; i < 4; i++) {
+         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+         0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+      },
+
+      { 
+         0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17,
+         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+      },
+      { 
+         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+         0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17,
+      },
+
+      { 
+         0x08, 0x09, 0x0a, 0x0b, 0x18, 0x19, 0x1a, 0x1b,
+         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+      },
+      { 
+         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+         0x08, 0x09, 0x0a, 0x0b, 0x18, 0x19, 0x1a, 0x1b,
+      },
+
+      { 
+         0x0c, 0x0d, 0x0e, 0x0f, 0x1c, 0x1d, 0x1e, 0x1f,
+         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+      },
       {
-	 const float *in = (const float *)(src[0] + elts[i] * pitch[0]);
-	 float *out = &machine->Inputs[0].xyzw[0].f[i];
-	 out[0] = in[0];
-	 out[4] = in[1];
-	 out[8] = in[2];
- 	 out[12] = 1.0f;
-      }
+         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+         0x0c, 0x0d, 0x0e, 0x0f, 0x1c, 0x1d, 0x1e, 0x1f,
+      },
+   };
 
-      {
-	 const float *in = (const float *)(src[1] + elts[i] * pitch[1]);
-	 float *out = &machine->Inputs[1].xyzw[0].f[i];
-	 out[0] = in[0];
-	 out[4] = in[1];
-	 out[8] = in[2];
- 	 out[12] = 1.0f;
-      }
+   out[0] = si_shufb(in[0], in[1], masks[0]);
+   out[0] = si_or(out[0], si_shufb(in[2], in[3], masks[1]));
 
-      {
-	 const float *in = (const float *)(src[2] + elts[i] * pitch[2]);
-	 float *out = &machine->Inputs[1].xyzw[0].f[i];
-	 out[0] = in[0];
-	 out[4] = in[1];
-	 out[8] = 0.0f;
- 	 out[12] = 1.0f;
-      }
-   }
-}
+   out[1] = si_shufb(in[0], in[1], masks[2]);
+   out[1] = si_or(out[1], si_shufb(in[2], in[3], masks[3]));
 
+   out[2] = si_shufb(in[0], in[1], masks[4]);
+   out[2] = si_or(out[2], si_shufb(in[2], in[3], masks[5]));
 
+   out[3] = si_shufb(in[0], in[1], masks[6]);
+   out[3] = si_or(out[3], si_shufb(in[2], in[3], masks[7]));
+}
 
 
 /**
@@ -435,7 +387,7 @@ static void generic_vertex_fetch(struct spu_vs_context *draw,
       const uint64_t src = draw->vertex_fetch.src_ptr[attr];
       const spu_fetch_func fetch = draw->vertex_fetch.fetch[attr];
       unsigned i;
-      float p[4][4];
+      qword p[4];
 
 
       /* Fetch four attributes for four vertices.  
@@ -452,17 +404,15 @@ static void generic_vertex_fetch(struct spu_vs_context *draw,
          mfc_get(buffer, addr & ~0x0f, size, TAG_VERTEX_BUFFER, 0, 0);
          wait_on_mask(1 << TAG_VERTEX_BUFFER);
 
-         memmove(& buffer, buffer + (addr & 0x0f), 16);
-
-         fetch(buffer, p[i]);
+         p[i] = (*fetch)(buffer + (addr & 0x0f));
       }
 
       /* Be nice and zero out any missing vertices: 
        */
       for (/* empty */; i < 4; i++) 
-          p[i][0] = p[i][1] = p[i][2] = p[i][3] = 0;
-      
-      /* Transpose/swizzle into sse-friendly format.  Currently
+          p[i] = si_il(0);  /* all-zero qword; avoids reading uninitialized p[i] */
+
+      /* Transpose/swizzle into vector-friendly format.  Currently
        * assuming that all vertex shader inputs are float[4], but this
        * isn't true -- if the vertex shader only wants tex0.xy, we
        * could optimize for that.
@@ -471,7 +421,7 @@ static void generic_vertex_fetch(struct spu_vs_context *draw,
        * excessive number of fetch functions, but we could at least
        * minimize the transpose step:
        */
-      transpose_4x4( (float *)&machine->Inputs[attr].xyzw[0].f[0], (float *)p );
+      spu_transpose_4x4(&machine->Inputs[attr].xyzw[0].q, p);
    }
 }
 
@@ -487,24 +437,4 @@ void spu_update_vertex_fetch( struct spu_vs_context *draw )
    }
 
    draw->vertex_fetch.fetch_func = generic_vertex_fetch;
-
-   /* Disable the fast path because they don't use mfc_get yet.
-    */
-#if 0
-   switch (draw->vertex_fetch.nr_attrs) {
-   case 2:
-      if (draw->vertex_fetch.format[0] == PIPE_FORMAT_R32G32B32_FLOAT &&
-          draw->vertex_fetch.format[1] == PIPE_FORMAT_R32G32B32_FLOAT)
-          draw->vertex_fetch.fetch_func = fetch_xyz_rgb;
-      break;
-   case 3:
-      if (draw->vertex_fetch.format[0] == PIPE_FORMAT_R32G32B32_FLOAT &&
-          draw->vertex_fetch.format[1] == PIPE_FORMAT_R32G32B32_FLOAT &&
-          draw->vertex_fetch.format[2] == PIPE_FORMAT_R32G32_FLOAT)
-          draw->vertex_fetch.fetch_func = fetch_xyz_rgb_st;
-      break;
-   default:
-      break;
-   }
-#endif
 }
diff --git a/src/mesa/pipe/cell/spu/spu_vertex_shader.h b/src/mesa/pipe/cell/spu/spu_vertex_shader.h
index b261ab44a2..2435b7ddae 100644
--- a/src/mesa/pipe/cell/spu/spu_vertex_shader.h
+++ b/src/mesa/pipe/cell/spu/spu_vertex_shader.h
@@ -6,7 +6,7 @@
 
 struct spu_vs_context;
 
-typedef void (*spu_fetch_func)(const void *ptr, float *attrib);
+typedef qword (*spu_fetch_func)(const void *ptr);
 typedef void (*spu_full_fetch_func)( struct spu_vs_context *draw,
 				     struct spu_exec_machine *machine,
 				     const unsigned *elts,
@@ -39,6 +39,8 @@ struct spu_vs_context {
 
 extern void spu_update_vertex_fetch(struct spu_vs_context *draw);
 
+extern void spu_transpose_4x4(qword *out, const qword *in);
+
 static INLINE void spu_vertex_fetch(struct spu_vs_context *draw,
 				    struct spu_exec_machine *machine,
 				    const unsigned *elts,
-- 
cgit v1.2.3


From e8a80c8627972537c595f06fb28cd383569e7ea0 Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Fri, 1 Feb 2008 17:14:09 -0800
Subject: More semi-trivial vectorization in the shader VM

---
 src/mesa/pipe/cell/spu/spu_exec.c | 62 +++++++++++++++++----------------------
 1 file changed, 27 insertions(+), 35 deletions(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/spu/spu_exec.c b/src/mesa/pipe/cell/spu/spu_exec.c
index 1ac9c031e3..1bd8687d41 100644
--- a/src/mesa/pipe/cell/spu/spu_exec.c
+++ b/src/mesa/pipe/cell/spu/spu_exec.c
@@ -70,6 +70,7 @@
 #include "pipe/tgsi/util/tgsi_util.h"
 #include "spu_exec.h"
 #include "spu_main.h"
+#include "spu_vertex_shader.h"
 
 #define TILE_TOP_LEFT     0
 #define TILE_TOP_RIGHT    1
@@ -144,23 +145,27 @@ spu_exec_machine_init(struct spu_exec_machine *mach,
                       struct spu_sampler *samplers,
                       unsigned processor)
 {
+   qword zero;
+   qword not_zero;
    uint i;
 
    mach->Samplers = samplers;
    mach->Processor = processor;
    mach->Addrs = &mach->Temps[TGSI_EXEC_NUM_TEMPS];
 
+   zero = si_il(0);       /* load immediate: 0x00000000 in every word */
+   not_zero = si_il(-1);  /* sign-extended immediate: 0xFFFFFFFF in every word */
+
    /* Setup constants. */
-   for( i = 0; i < 4; i++ ) {
-      mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].u[i] = 0x00000000;
-      mach->Temps[TEMP_7F_I].xyzw[TEMP_7F_C].u[i] = 0x7FFFFFFF;
-      mach->Temps[TEMP_80_I].xyzw[TEMP_80_C].u[i] = 0x80000000;
-      mach->Temps[TEMP_FF_I].xyzw[TEMP_FF_C].u[i] = 0xFFFFFFFF;
-      mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].f[i] = 1.0f;
-      mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].f[i] = 2.0f;
-      mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].f[i] = 128.0f;
-      mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].f[i] = -128.0f;
-   }
+   mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q = zero;
+   mach->Temps[TEMP_FF_I].xyzw[TEMP_FF_C].q = not_zero;
+   mach->Temps[TEMP_7F_I].xyzw[TEMP_7F_C].q = si_rotmi(not_zero, -1); /* logical >> 1: 0x7FFFFFFF */
+   mach->Temps[TEMP_80_I].xyzw[TEMP_80_C].q = si_shli(not_zero, 31);  /* 0x80000000 */
+
+   mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q = (qword) spu_splats(1.0f);
+   mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].q = (qword) spu_splats(2.0f);
+   mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].q = (qword) spu_splats(128.0f);
+   mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].q = (qword) spu_splats(-128.0f);
 }
 
 
@@ -459,25 +464,16 @@ fetch_source(
          &index2,
          &indir_index );
 
-      index.i[0] += indir_index.i[0];
-      index.i[1] += indir_index.i[1];
-      index.i[2] += indir_index.i[2];
-      index.i[3] += indir_index.i[3];
+      index.q = si_a(index.q, indir_index.q);
    }
 
    if( reg->SrcRegister.Dimension ) {
       switch( reg->SrcRegister.File ) {
       case TGSI_FILE_INPUT:
-         index.i[0] *= 17;
-         index.i[1] *= 17;
-         index.i[2] *= 17;
-         index.i[3] *= 17;
+         index.q = si_mpyi(index.q, 17);
          break;
       case TGSI_FILE_CONSTANT:
-         index.i[0] *= 4096;
-         index.i[1] *= 4096;
-         index.i[2] *= 4096;
-         index.i[3] *= 4096;
+         index.q = si_shli(index.q, 12);
          break;
       default:
          assert( 0 );
@@ -505,10 +501,7 @@ fetch_source(
             &index2,
             &indir_index );
 
-         index.i[0] += indir_index.i[0];
-         index.i[1] += indir_index.i[1];
-         index.i[2] += indir_index.i[2];
-         index.i[3] += indir_index.i[3];
+         index.q = si_a(index.q, indir_index.q);
       }
    }
 
@@ -666,17 +659,16 @@ fetch_texel( struct spu_sampler *sampler,
              union spu_exec_channel *b,
              union spu_exec_channel *a )
 {
-   uint j;
-   float rgba[NUM_CHANNELS][QUAD_SIZE];
+   qword rgba[4];
+   qword out[4];
 
-   sampler->get_samples(sampler, s->f, t->f, p->f, lodbias, rgba);
+   sampler->get_samples(sampler, s->f, t->f, p->f, lodbias, (float *) rgba);
 
-   for (j = 0; j < 4; j++) {
-      r->f[j] = rgba[0][j];
-      g->f[j] = rgba[1][j];
-      b->f[j] = rgba[2][j];
-      a->f[j] = rgba[3][j];
-   }
+   spu_transpose_4x4(out, rgba);
+   r->q = out[0];
+   g->q = out[1];
+   b->q = out[2];
+   a->q = out[3];
 }
 
 
-- 
cgit v1.2.3


From 45f4125fa83c4e43a01d44cb8eb2a4c97b72181f Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Mon, 4 Feb 2008 16:03:55 -0800
Subject: Add some debug messages

---
 src/mesa/pipe/cell/spu/spu_vertex_fetch.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/spu/spu_vertex_fetch.c b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
index 4133fbba17..cfa449e813 100644
--- a/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
+++ b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
@@ -378,7 +378,10 @@ static void generic_vertex_fetch(struct spu_vs_context *draw,
 
    wait_on_mask(1 << TAG_VERTEX_BUFFER);
 
-//   _mesa_printf("%s %d\n", __FUNCTION__, count);
+#if DRAW_DBG
+   printf("SPU: %s count = %u, nr_attrs = %u\n", 
+          __FUNCTION__, count, nr_attrs);
+#endif
 
    /* loop over vertex attributes (vertex shader inputs)
     */
@@ -401,6 +404,9 @@ static void generic_vertex_fetch(struct spu_vs_context *draw,
          const uint64_t addr = src + (elts[i] * pitch);
          const unsigned size = ((addr & 0x0f) == 0) ? 16 : 32;
 
+#if DRAW_DBG
+         printf("SPU: fetching = 0x%llx\n", (unsigned long long) addr);
+#endif
          mfc_get(buffer, addr & ~0x0f, size, TAG_VERTEX_BUFFER, 0, 0);
          wait_on_mask(1 << TAG_VERTEX_BUFFER);
 
-- 
cgit v1.2.3


From c9f98142b6a47825c49aea72a79c1be62c2b7d89 Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Tue, 5 Feb 2008 09:43:52 -0800
Subject: Use _transpose_matrix4x4 from Cell SDK instead of my own version

---
 src/mesa/pipe/cell/spu/spu_exec.c          |  3 +-
 src/mesa/pipe/cell/spu/spu_vertex_fetch.c  | 59 ++----------------------------
 src/mesa/pipe/cell/spu/spu_vertex_shader.h |  2 -
 3 files changed, 5 insertions(+), 59 deletions(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/spu/spu_exec.c b/src/mesa/pipe/cell/spu/spu_exec.c
index 1bd8687d41..e51008b9b3 100644
--- a/src/mesa/pipe/cell/spu/spu_exec.c
+++ b/src/mesa/pipe/cell/spu/spu_exec.c
@@ -52,6 +52,7 @@
 
 #include <libmisc.h>
 #include <spu_mfcio.h>
+#include <transpose_matrix4x4.h>
 #include <simdmath/ceilf4.h>
 #include <simdmath/cosf4.h>
 #include <simdmath/divf4.h>
@@ -664,7 +665,7 @@ fetch_texel( struct spu_sampler *sampler,
 
    sampler->get_samples(sampler, s->f, t->f, p->f, lodbias, (float *) rgba);
 
-   spu_transpose_4x4(out, rgba);
+   _transpose_matrix4x4(out, rgba);
    r->q = out[0];
    g->q = out[1];
    b->q = out[2];
diff --git a/src/mesa/pipe/cell/spu/spu_vertex_fetch.c b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
index cfa449e813..6e86a919ce 100644
--- a/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
+++ b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
@@ -31,6 +31,8 @@
   */
 
 #include <spu_mfcio.h>
+#include <transpose_matrix4x4.h>
+
 #include "pipe/p_util.h"
 #include "pipe/p_state.h"
 #include "pipe/p_shader_tokens.h"
@@ -308,61 +310,6 @@ static spu_fetch_func get_fetch_func( enum pipe_format format )
 }
 
 
-void
-spu_transpose_4x4(qword *out, const qword *in)
-{
-   static const qword masks[8] = {
-      {
-         0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
-         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-      },
-      {
-         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-         0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
-      },
-
-      { 
-         0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17,
-         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-      },
-      { 
-         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-         0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17,
-      },
-
-      { 
-         0x08, 0x09, 0x0a, 0x0b, 0x18, 0x19, 0x1a, 0x1b,
-         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-      },
-      { 
-         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-         0x08, 0x09, 0x0a, 0x0b, 0x18, 0x19, 0x1a, 0x1b,
-      },
-
-      { 
-         0x0c, 0x0d, 0x0e, 0x0f, 0x1c, 0x1d, 0x1e, 0x1f,
-         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-      },
-      {
-         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-         0x0c, 0x0d, 0x0e, 0x0f, 0x1c, 0x1d, 0x1e, 0x1f,
-      },
-   };
-
-   out[0] = si_shufb(in[0], in[1], masks[0]);
-   out[0] = si_or(out[0], si_shufb(in[2], in[3], masks[1]));
-
-   out[1] = si_shufb(in[0], in[1], masks[2]);
-   out[1] = si_or(out[1], si_shufb(in[2], in[3], masks[3]));
-
-   out[2] = si_shufb(in[0], in[1], masks[4]);
-   out[2] = si_or(out[2], si_shufb(in[2], in[3], masks[5]));
-
-   out[3] = si_shufb(in[0], in[1], masks[6]);
-   out[3] = si_or(out[3], si_shufb(in[2], in[3], masks[7]));
-}
-
-
 /**
  * Fetch vertex attributes for 'count' vertices.
  */
@@ -427,7 +374,7 @@ static void generic_vertex_fetch(struct spu_vs_context *draw,
        * excessive number of fetch functions, but we could at least
        * minimize the transpose step:
        */
-      spu_transpose_4x4(&machine->Inputs[attr].xyzw[0].q, p);
+      _transpose_matrix4x4(&machine->Inputs[attr].xyzw[0].q, p);
    }
 }
 
diff --git a/src/mesa/pipe/cell/spu/spu_vertex_shader.h b/src/mesa/pipe/cell/spu/spu_vertex_shader.h
index 2435b7ddae..c96b93ff0a 100644
--- a/src/mesa/pipe/cell/spu/spu_vertex_shader.h
+++ b/src/mesa/pipe/cell/spu/spu_vertex_shader.h
@@ -39,8 +39,6 @@ struct spu_vs_context {
 
 extern void spu_update_vertex_fetch(struct spu_vs_context *draw);
 
-extern void spu_transpose_4x4(qword *out, const qword *in);
-
 static INLINE void spu_vertex_fetch(struct spu_vs_context *draw,
 				    struct spu_exec_machine *machine,
 				    const unsigned *elts,
-- 
cgit v1.2.3


From b0974420f4dab55d398f4015cf71a62fa643f713 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Tue, 5 Feb 2008 14:21:01 -0700
Subject: Cell: added cell_batch_alloc_aligned()

---
 src/mesa/pipe/cell/ppu/cell_batch.c | 26 ++++++++++++++++++++------
 src/mesa/pipe/cell/ppu/cell_batch.h |  4 ++++
 2 files changed, 24 insertions(+), 6 deletions(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/ppu/cell_batch.c b/src/mesa/pipe/cell/ppu/cell_batch.c
index 2fb49711b2..f45e5f25b6 100644
--- a/src/mesa/pipe/cell/ppu/cell_batch.c
+++ b/src/mesa/pipe/cell/ppu/cell_batch.c
@@ -157,7 +157,7 @@ cell_batch_append(struct cell_context *cell, const void *data, uint bytes)
       size = 0;
    }
 
-   assert(size + bytes <= CELL_BUFFER_SIZE);
+   ASSERT(size + bytes <= CELL_BUFFER_SIZE);
 
    memcpy(cell->buffer[cell->cur_batch] + size, data, bytes);
 
@@ -167,14 +167,22 @@ cell_batch_append(struct cell_context *cell, const void *data, uint bytes)
 
 void *
 cell_batch_alloc(struct cell_context *cell, uint bytes)
+{
+   return cell_batch_alloc_aligned(cell, bytes, 1);
+}
+
+
+void *
+cell_batch_alloc_aligned(struct cell_context *cell, uint bytes,
+                         uint alignment)
 {
    void *pos;
-   uint size;
+   uint size, padbytes;
 
    ASSERT(bytes % 8 == 0);
    ASSERT(bytes <= CELL_BUFFER_SIZE);
-
-   assert(cell->cur_batch >= 0);
+   ASSERT(alignment > 0);
+   ASSERT(cell->cur_batch >= 0);
 
 #ifdef ASSERT
    {
@@ -188,12 +196,18 @@ cell_batch_alloc(struct cell_context *cell, uint bytes)
 
    size = cell->buffer_size[cell->cur_batch];
 
-   if (size + bytes > CELL_BUFFER_SIZE) {
+   padbytes = (alignment - (size % alignment)) % alignment;
+
+   if (padbytes + size + bytes > CELL_BUFFER_SIZE) {
       cell_batch_flush(cell);
       size = 0;
    }
+   else {
+      size += padbytes;
+   }
 
-   assert(size + bytes <= CELL_BUFFER_SIZE);
+   ASSERT(size % alignment == 0);
+   ASSERT(size + bytes <= CELL_BUFFER_SIZE);
 
    pos = (void *) (cell->buffer[cell->cur_batch] + size);
 
diff --git a/src/mesa/pipe/cell/ppu/cell_batch.h b/src/mesa/pipe/cell/ppu/cell_batch.h
index f4f37314a4..a6eee0a8b1 100644
--- a/src/mesa/pipe/cell/ppu/cell_batch.h
+++ b/src/mesa/pipe/cell/ppu/cell_batch.h
@@ -50,5 +50,9 @@ cell_batch_append(struct cell_context *cell, const void *data, uint bytes);
 extern void *
 cell_batch_alloc(struct cell_context *cell, uint bytes);
 
+extern void *
+cell_batch_alloc_aligned(struct cell_context *cell, uint bytes,
+                         uint alignment);
+
 
 #endif /* CELL_BATCH_H */
-- 
cgit v1.2.3


From 2174890ed030bde8494b7f13b7090e27771695fa Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Tue, 5 Feb 2008 14:21:48 -0700
Subject: Cell: remove dummy fields, update/add some comments

---
 src/mesa/pipe/cell/common.h | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/common.h b/src/mesa/pipe/cell/common.h
index cf8fc94ebf..4de514c358 100644
--- a/src/mesa/pipe/cell/common.h
+++ b/src/mesa/pipe/cell/common.h
@@ -51,7 +51,7 @@
 
 /** for sanity checking */
 #define ASSERT_ALIGN16(ptr) \
-   assert((((unsigned long) (ptr)) & 0xf) == 0);
+  ASSERT((((unsigned long) (ptr)) & 0xf) == 0);
 
 
 /** round up value to next multiple of 4 */
@@ -105,7 +105,7 @@
  */
 struct cell_command_framebuffer
 {
-   uint64_t opcode;
+   uint64_t opcode;     /**< CELL_CMD_FRAMEBUFFER */
    int width, height;
    void *color_start, *depth_start;
    enum pipe_format color_format, depth_format;
@@ -117,7 +117,7 @@ struct cell_command_framebuffer
  */
 struct cell_command_clear_surface
 {
-   uint64_t opcode;
+   uint64_t opcode;     /**< CELL_CMD_CLEAR_SURFACE */
    uint surface; /**< Temporary: 0=color, 1=Z */
    uint value;
 };
@@ -128,8 +128,8 @@ struct cell_command_clear_surface
  */
 struct cell_array_info
 {
-    uint64_t base;          /**< Base address of the 0th element. */
-    uint attr;          /**< Attribute that this state if for. */
+    uint64_t base;      /**< Base address of the 0th element. */
+    uint attr;          /**< Attribute that this state is for. */
     uint pitch;         /**< Byte pitch from one entry to the next. */
     uint format;        /**< Pipe format of each entry. */
 } ALIGN16_ATTRIB;
@@ -169,11 +169,9 @@ struct cell_command_render
    uint prim_type;    /**< PIPE_PRIM_x */
    uint num_verts;
    uint vertex_size;  /**< bytes per vertex */
-   uint dummy;        /* XXX this dummy field works around a compiler bug */
    uint num_indexes;
    uint vertex_buf;  /**< which cell->buffer[] contains the vertex data */
-   float xmin, dummy2, ymin, xmax, ymax;  /* XXX another dummy field */
-   uint dummy3;
+   float xmin, ymin, xmax, ymax;  /* NOTE(review): looks like a bounding box -- confirm */
    uint min_index;
    boolean inline_verts;
 };
-- 
cgit v1.2.3


From 4da82fd5c5e0a7535e30aa81f08dcbe1a26358b7 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Tue, 5 Feb 2008 14:23:34 -0700
Subject: Cell: re-enable inlined vertex buffers

Vertex data must be on a 16-byte address/offset so SIMD operations will work
properly in the SPU code.
---
 src/mesa/pipe/cell/ppu/cell_vbuf.c  | 12 +++++-------
 src/mesa/pipe/cell/spu/spu_main.c   |  3 ++-
 src/mesa/pipe/cell/spu/spu_render.c | 12 ++++++++----
 3 files changed, 15 insertions(+), 12 deletions(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/ppu/cell_vbuf.c b/src/mesa/pipe/cell/ppu/cell_vbuf.c
index 0fee61821a..e9fafe492e 100644
--- a/src/mesa/pipe/cell/ppu/cell_vbuf.c
+++ b/src/mesa/pipe/cell/ppu/cell_vbuf.c
@@ -40,7 +40,7 @@
 
 
 /** Allow vertex data to be inlined after RENDER command */
-#define ALLOW_INLINE_VERTS 0
+#define ALLOW_INLINE_VERTS 1
 
 
 /**
@@ -199,9 +199,7 @@ cell_vbuf_draw(struct vbuf_render *vbr,
    {
       const uint index_bytes = ROUNDUP8(nr_indices * 2);
       const uint vertex_bytes = nr_vertices * 4 * cell->vertex_info.size;
-
-      const uint batch_size = sizeof(struct cell_command_render)
-         + index_bytes;
+      const uint batch_size = sizeof(struct cell_command_render) + index_bytes;
 
       struct cell_command_render *render
          = (struct cell_command_render *)
@@ -223,9 +221,9 @@ cell_vbuf_draw(struct vbuf_render *vbr,
       render->num_verts = nr_vertices;
       if (ALLOW_INLINE_VERTS &&
           min_index == 0 &&
-          vertex_bytes <= cell_batch_free_space(cell)) {
-         /* vertex data inlined, after indices */
-         void *dst = cell_batch_alloc(cell, vertex_bytes);
+          vertex_bytes + 16 <= cell_batch_free_space(cell)) {
+         /* vertex data inlined, after indices, at 16-byte boundary */
+         void *dst = cell_batch_alloc_aligned(cell, vertex_bytes, 16);
          memcpy(dst, vertices, vertex_bytes);
          render->inline_verts = TRUE;
          render->vertex_buf = ~0;
diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index 4f126d5e5b..e375197fe6 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -387,7 +387,7 @@ cmd_batch(uint opcode)
                = (struct cell_command_render *) &buffer[pos];
             uint pos_incr;
             cmd_render(render, &pos_incr);
-            pos += sizeof(*render) / 8 + ((pos_incr + 1) / 2);
+            pos += pos_incr;
          }
          break;
       case CELL_CMD_RELEASE_VERTS:
@@ -541,6 +541,7 @@ main(main_param_t speid, main_param_t argp)
    (void) speid;
 
    ASSERT(sizeof(tile_t) == TILE_SIZE * TILE_SIZE * 4);
+   ASSERT(sizeof(struct cell_command_render) % 8 == 0);
 
    one_time_init();
 
diff --git a/src/mesa/pipe/cell/spu/spu_render.c b/src/mesa/pipe/cell/spu/spu_render.c
index e8705eeeba..932fb500b3 100644
--- a/src/mesa/pipe/cell/spu/spu_render.c
+++ b/src/mesa/pipe/cell/spu/spu_render.c
@@ -171,6 +171,7 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
    ubyte vertex_data[CELL_BUFFER_SIZE] ALIGN16_ATTRIB;
    const uint vertex_size = render->vertex_size; /* in bytes */
    /*const*/ uint total_vertex_bytes = render->num_verts * vertex_size;
+   uint index_bytes;
    const ubyte *vertices;
    const ushort *indexes;
    uint i, j;
@@ -199,13 +200,16 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
 
    /* indexes are right after the render command in the batch buffer */
    indexes = (const ushort *) (render + 1);
-   *pos_incr = (render->num_indexes * 2 + 3) / 4;
+   index_bytes = ROUNDUP8(render->num_indexes * 2);
+   *pos_incr = index_bytes / 8 + sizeof(*render) / 8;
 
 
    if (render->inline_verts) {
-      /* Vertices are right after indexes in batch buffer */
-      vertices = (const ubyte *) (render + 1) + *pos_incr * 4;
-      *pos_incr = *pos_incr + total_vertex_bytes / 4;
+      /* Vertices are after indexes in batch buffer at next 16-byte addr */
+      vertices = (const ubyte *) render + (*pos_incr * 8);
+      vertices = (const ubyte *) align_pointer((void *) vertices, 16);
+      ASSERT_ALIGN16(vertices);
+      *pos_incr = ((vertices + total_vertex_bytes) - (ubyte *) render) / 8;
    }
    else {
       /* Begin DMA fetch of vertex buffer */
-- 
cgit v1.2.3


From 1730f7bad462ac7f29857b8b2347e38c1b6c9820 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Tue, 5 Feb 2008 15:07:36 -0700
Subject: Cell: SIMD-ize tri_linear_coeff(), use vector float for vertex
 attributes in struct vertex_header

---
 src/mesa/pipe/cell/spu/spu_tri.c | 112 ++++++++++++++++++++++++++-------------
 1 file changed, 75 insertions(+), 37 deletions(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index 688c8646ab..be9624cf7d 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -56,7 +56,7 @@ typedef union
  * Simplified types taken from other parts of Gallium
  */
 struct vertex_header {
-   float data[0][4];
+   vector float data[1];
 };
 
 
@@ -476,6 +476,7 @@ static void print_vertex(const struct vertex_header *v)
 }
 #endif
 
+
 static boolean setup_sort_vertices(const struct vertex_header *v0,
                                    const struct vertex_header *v1,
                                    const struct vertex_header *v2)
@@ -492,9 +493,9 @@ static boolean setup_sort_vertices(const struct vertex_header *v0,
 
    /* determine bottom to top order of vertices */
    {
-      float y0 = v0->data[0][1];
-      float y1 = v1->data[0][1];
-      float y2 = v2->data[0][1];
+      float y0 = spu_extract(v0->data[0], 1);
+      float y1 = spu_extract(v1->data[0], 1);
+      float y2 = spu_extract(v2->data[0], 1);
       if (y0 <= y1) {
 	 if (y1 <= y2) {
 	    /* y0<=y1<=y2 */
@@ -538,25 +539,25 @@ static boolean setup_sort_vertices(const struct vertex_header *v0,
    }
 
    /* Check if triangle is completely outside the tile bounds */
-   if (setup.vmin->data[0][1] > setup.cliprect_maxy)
+   if (spu_extract(setup.vmin->data[0], 1) > setup.cliprect_maxy)
       return FALSE;
-   if (setup.vmax->data[0][1] < setup.cliprect_miny)
+   if (spu_extract(setup.vmax->data[0], 1) < setup.cliprect_miny)
       return FALSE;
-   if (setup.vmin->data[0][0] < setup.cliprect_minx &&
-       setup.vmid->data[0][0] < setup.cliprect_minx &&
-       setup.vmax->data[0][0] < setup.cliprect_minx)
+   if (spu_extract(setup.vmin->data[0], 0) < setup.cliprect_minx &&
+       spu_extract(setup.vmid->data[0], 0) < setup.cliprect_minx &&
+       spu_extract(setup.vmax->data[0], 0) < setup.cliprect_minx)
       return FALSE;
-   if (setup.vmin->data[0][0] > setup.cliprect_maxx &&
-       setup.vmid->data[0][0] > setup.cliprect_maxx &&
-       setup.vmax->data[0][0] > setup.cliprect_maxx)
+   if (spu_extract(setup.vmin->data[0], 0) > setup.cliprect_maxx &&
+       spu_extract(setup.vmid->data[0], 0) > setup.cliprect_maxx &&
+       spu_extract(setup.vmax->data[0], 0) > setup.cliprect_maxx)
       return FALSE;
 
-   setup.ebot.dx = setup.vmid->data[0][0] - setup.vmin->data[0][0];
-   setup.ebot.dy = setup.vmid->data[0][1] - setup.vmin->data[0][1];
-   setup.emaj.dx = setup.vmax->data[0][0] - setup.vmin->data[0][0];
-   setup.emaj.dy = setup.vmax->data[0][1] - setup.vmin->data[0][1];
-   setup.etop.dx = setup.vmax->data[0][0] - setup.vmid->data[0][0];
-   setup.etop.dy = setup.vmax->data[0][1] - setup.vmid->data[0][1];
+   setup.ebot.dx = spu_extract(setup.vmid->data[0], 0) - spu_extract(setup.vmin->data[0], 0);
+   setup.ebot.dy = spu_extract(setup.vmid->data[0], 1) - spu_extract(setup.vmin->data[0], 1);
+   setup.emaj.dx = spu_extract(setup.vmax->data[0], 0) - spu_extract(setup.vmin->data[0], 0);
+   setup.emaj.dy = spu_extract(setup.vmax->data[0], 1) - spu_extract(setup.vmin->data[0], 1);
+   setup.etop.dx = spu_extract(setup.vmax->data[0], 0) - spu_extract(setup.vmid->data[0], 0);
+   setup.etop.dy = spu_extract(setup.vmax->data[0], 1) - spu_extract(setup.vmid->data[0], 1);
 
    /*
     * Compute triangle's area.  Use 1/area to compute partial
@@ -597,14 +598,12 @@ static boolean setup_sort_vertices(const struct vertex_header *v0,
  * The result will be put into setup.coef[slot].a0.
  * \param slot  which attribute slot 
  */
-static INLINE void const_coeff(uint slot)
+static INLINE void
+const_coeff(uint slot)
 {
    setup.coef[slot].dadx.v = (vector float) {0.0, 0.0, 0.0, 0.0};
    setup.coef[slot].dady.v = (vector float) {0.0, 0.0, 0.0, 0.0};
-   setup.coef[slot].a0.f[0] = setup.vprovoke->data[slot][0];
-   setup.coef[slot].a0.f[1] = setup.vprovoke->data[slot][1];
-   setup.coef[slot].a0.f[2] = setup.vprovoke->data[slot][2];
-   setup.coef[slot].a0.f[3] = setup.vprovoke->data[slot][3];
+   setup.coef[slot].a0.v = setup.vprovoke->data[slot];
 }
 
 
@@ -612,12 +611,19 @@ static INLINE void const_coeff(uint slot)
  * Compute a0, dadx and dady for a linearly interpolated coefficient,
  * for a triangle.
  */
-static void tri_linear_coeff( uint slot, uint firstComp, uint lastComp )
+static INLINE void
+tri_linear_coeff(uint slot, uint firstComp, uint lastComp)
 {
    uint i;
+   const float *vmin_d = (float *) &setup.vmin->data[slot];
+   const float *vmid_d = (float *) &setup.vmid->data[slot];
+   const float *vmax_d = (float *) &setup.vmax->data[slot];
+   const float x = spu_extract(setup.vmin->data[0], 0) - 0.5f;
+   const float y = spu_extract(setup.vmin->data[0], 1) - 0.5f;
+
    for (i = firstComp; i < lastComp; i++) {
-      float botda = setup.vmid->data[slot][i] - setup.vmin->data[slot][i];
-      float majda = setup.vmax->data[slot][i] - setup.vmin->data[slot][i];
+      float botda = vmid_d[i] - vmin_d[i];
+      float majda = vmax_d[i] - vmin_d[i];
       float a = setup.ebot.dy * majda - botda * setup.emaj.dy;
       float b = setup.emaj.dx * botda - majda * setup.ebot.dx;
    
@@ -638,9 +644,9 @@ static void tri_linear_coeff( uint slot, uint firstComp, uint lastComp )
        * to define a0 as the sample at a pixel center somewhere near vmin
        * instead - i'll switch to this later.
        */
-      setup.coef[slot].a0.f[i] = (setup.vmin->data[slot][i] - 
-                                 (setup.coef[slot].dadx.f[i] * (setup.vmin->data[0][0] - 0.5f) + 
-                                  setup.coef[slot].dady.f[i] * (setup.vmin->data[0][1] - 0.5f)));
+      setup.coef[slot].a0.f[i] = (vmin_d[i] - 
+                                 (setup.coef[slot].dadx.f[i] * x + 
+                                  setup.coef[slot].dady.f[i] * y));
    }
 
    /*
@@ -653,6 +659,37 @@ static void tri_linear_coeff( uint slot, uint firstComp, uint lastComp )
 }
 
 
+/**
+ * As above, but sets up interpolation for all four vector components at once.
+ */
+static INLINE void
+tri_linear_coeff4(uint slot)
+{
+   const vector float vmin_d = setup.vmin->data[slot];
+   const vector float vmid_d = setup.vmid->data[slot];
+   const vector float vmax_d = setup.vmax->data[slot];
+   const vector float xxxx = spu_splats(spu_extract(setup.vmin->data[0], 0) - 0.5f);
+   const vector float yyyy = spu_splats(spu_extract(setup.vmin->data[0], 1) - 0.5f);
+
+   vector float botda = vmid_d - vmin_d;
+   vector float majda = vmax_d - vmin_d;
+
+   vector float a = spu_sub(spu_mul(spu_splats(setup.ebot.dy), majda),
+                            spu_mul(botda, spu_splats(setup.emaj.dy)));
+   vector float b = spu_sub(spu_mul(spu_splats(setup.emaj.dx), botda),
+                            spu_mul(majda, spu_splats(setup.ebot.dx)));
+
+   setup.coef[slot].dadx.v = spu_mul(a, spu_splats(setup.oneoverarea));
+   setup.coef[slot].dady.v = spu_mul(b, spu_splats(setup.oneoverarea));
+
+   vector float tempx = spu_mul(setup.coef[slot].dadx.v, xxxx);
+   vector float tempy = spu_mul(setup.coef[slot].dady.v, yyyy);
+                         
+   setup.coef[slot].a0.v = spu_sub(vmin_d, spu_add(tempx, tempy));
+}
+
+
+
 #if 0
 /**
  * Compute a0, dadx and dady for a perspective-corrected interpolant,
@@ -710,17 +747,18 @@ static void setup_tri_coefficients(void)
       case INTERP_NONE:
          break;
       case INTERP_POS:
-         tri_linear_coeff(i, 2, 3);
+         /*tri_linear_coeff(i, 2, 3);*/
          /* XXX interp W if PERSPECTIVE... */
+         tri_linear_coeff4(i);
          break;
       case INTERP_CONSTANT:
          const_coeff(i);
          break;
       case INTERP_LINEAR:
-         tri_linear_coeff(i, 0, 4);
+         tri_linear_coeff4(i);
          break;
       case INTERP_PERSPECTIVE:
-         tri_linear_coeff(i, 0, 4); /* XXX temporary */
+         tri_linear_coeff4(i);  /* temporary */
          break;
       default:
          ASSERT(0);
@@ -738,12 +776,12 @@ static void setup_tri_coefficients(void)
 
 static void setup_tri_edges(void)
 {
-   float vmin_x = setup.vmin->data[0][0] + 0.5f;
-   float vmid_x = setup.vmid->data[0][0] + 0.5f;
+   float vmin_x = spu_extract(setup.vmin->data[0], 0) + 0.5f;
+   float vmid_x = spu_extract(setup.vmid->data[0], 0) + 0.5f;
 
-   float vmin_y = setup.vmin->data[0][1] - 0.5f;
-   float vmid_y = setup.vmid->data[0][1] - 0.5f;
-   float vmax_y = setup.vmax->data[0][1] - 0.5f;
+   float vmin_y = spu_extract(setup.vmin->data[0], 1) - 0.5f;
+   float vmid_y = spu_extract(setup.vmid->data[0], 1) - 0.5f;
+   float vmax_y = spu_extract(setup.vmax->data[0], 1) - 0.5f;
 
    setup.emaj.sy = CEILF(vmin_y);
    setup.emaj.lines = (int) CEILF(vmax_y - setup.emaj.sy);
-- 
cgit v1.2.3


From e39fccc34c07a015d8713841a69037e32187dd6d Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Tue, 5 Feb 2008 15:12:18 -0700
Subject: Cell: remove accidentally added OPT_FLAGS lines

---
 src/mesa/pipe/cell/spu/Makefile | 2 --
 1 file changed, 2 deletions(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/spu/Makefile b/src/mesa/pipe/cell/spu/Makefile
index 66f16cde9b..f202971d73 100644
--- a/src/mesa/pipe/cell/spu/Makefile
+++ b/src/mesa/pipe/cell/spu/Makefile
@@ -8,8 +8,6 @@ TOP = ../../../../..
 include $(TOP)/configs/linux-cell
 
 
-OPT_FLAGS=-g
-OPT_FLAGS=-O3
 PROG = g3d
 
 PROG_SPU = $(PROG)_spu
-- 
cgit v1.2.3


From 31c98eafb043cbc82e5de206ceecc5888174b5e6 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Wed, 6 Feb 2008 09:24:30 -0700
Subject: gallium: change pipe->texture_create() to operate like the CSO
 functions

Now, pass in a template object and return a new object.
---
 src/mesa/pipe/cell/ppu/cell_texture.c     | 31 ++++++++++++++---------------
 src/mesa/pipe/cell/ppu/cell_texture.h     |  5 +++--
 src/mesa/pipe/i915simple/i915_texture.c   | 17 ++++++++--------
 src/mesa/pipe/i915simple/i915_texture.h   |  5 +++--
 src/mesa/pipe/i965simple/brw_tex_layout.c | 15 +++++++-------
 src/mesa/pipe/i965simple/brw_tex_layout.h |  4 ++--
 src/mesa/pipe/p_context.h                 |  4 ++--
 src/mesa/pipe/softpipe/sp_texture.c       | 33 +++++++++++++++----------------
 src/mesa/pipe/softpipe/sp_texture.h       |  5 +++--
 src/mesa/state_tracker/st_texture.c       | 31 ++++++++++++-----------------
 10 files changed, 73 insertions(+), 77 deletions(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/ppu/cell_texture.c b/src/mesa/pipe/cell/ppu/cell_texture.c
index 2cf6022939..df178d9ca2 100644
--- a/src/mesa/pipe/cell/ppu/cell_texture.c
+++ b/src/mesa/pipe/cell/ppu/cell_texture.c
@@ -79,31 +79,30 @@ cell_texture_layout(struct cell_texture * spt)
 }
 
 
-void
-cell_texture_create(struct pipe_context *pipe, struct pipe_texture **pt)
+struct pipe_texture *
+cell_texture_create(struct pipe_context *pipe, const struct pipe_texture *templat)
 {
-   struct cell_texture *spt = REALLOC(*pt, sizeof(struct pipe_texture),
-					  sizeof(struct cell_texture));
+   struct cell_texture *spt = CALLOC_STRUCT(cell_texture);
+   if (!spt)
+      return NULL;
 
-   if (spt) {
-      memset(&spt->base + 1, 0,
-	     sizeof(struct cell_texture) - sizeof(struct pipe_texture));
+   spt->base = *templat;
 
-      cell_texture_layout(spt);
+   cell_texture_layout(spt);
 
-      spt->buffer = pipe->winsys->buffer_create(pipe->winsys, 32,
-                                                PIPE_BUFFER_USAGE_PIXEL,
-                                                spt->buffer_size);
+   spt->buffer = pipe->winsys->buffer_create(pipe->winsys, 32,
+                                             PIPE_BUFFER_USAGE_PIXEL,
+                                             spt->buffer_size);
 
-      if (!spt->buffer) {
-	 FREE(spt);
-	 spt = NULL;
-      }
+   if (!spt->buffer) {
+      FREE(spt);
+      return NULL;
    }
 
-   *pt = &spt->base;
+   return &spt->base;
 }
 
+
 void
 cell_texture_release(struct pipe_context *pipe, struct pipe_texture **pt)
 {
diff --git a/src/mesa/pipe/cell/ppu/cell_texture.h b/src/mesa/pipe/cell/ppu/cell_texture.h
index bd434c8776..0264fed88e 100644
--- a/src/mesa/pipe/cell/ppu/cell_texture.h
+++ b/src/mesa/pipe/cell/ppu/cell_texture.h
@@ -60,8 +60,9 @@ cell_texture(struct pipe_texture *pt)
 
 
-extern void
-cell_texture_create(struct pipe_context *pipe, struct pipe_texture **pt);
+extern struct pipe_texture *
+cell_texture_create(struct pipe_context *pipe,
+                    const struct pipe_texture *templat);
 
 extern void
 cell_texture_release(struct pipe_context *pipe, struct pipe_texture **pt);
diff --git a/src/mesa/pipe/i915simple/i915_texture.c b/src/mesa/pipe/i915simple/i915_texture.c
index 61944fe7d9..6faeab134a 100644
--- a/src/mesa/pipe/i915simple/i915_texture.c
+++ b/src/mesa/pipe/i915simple/i915_texture.c
@@ -477,17 +477,17 @@ i945_miptree_layout(struct pipe_context *pipe, struct i915_texture * tex)
    return TRUE;
 }
 
-void
-i915_texture_create(struct pipe_context *pipe, struct pipe_texture **pt)
+
+struct pipe_texture *
+i915_texture_create(struct pipe_context *pipe,
+                    const struct pipe_texture *templat)
 {
-   struct i915_texture *tex = REALLOC(*pt, sizeof(struct pipe_texture),
-				      sizeof(struct i915_texture));
+   struct i915_texture *tex = CALLOC_STRUCT(i915_texture);
 
    if (tex) {
       struct i915_context *i915 = i915_context(pipe);
 
-      memset(&tex->base + 1, 0,
-	     sizeof(struct i915_texture) - sizeof(struct pipe_texture));
+      tex->base = *templat;
 
       if (i915->flags.is_i945 ? i945_miptree_layout(pipe, tex) :
 	  i915_miptree_layout(pipe, tex))
@@ -498,13 +498,14 @@ i915_texture_create(struct pipe_context *pipe, struct pipe_texture **pt)
 
       if (!tex->buffer) {
 	 FREE(tex);
-	 tex = NULL;
+	 return NULL;
       }
    }
 
-   *pt = &tex->base;
+   return &tex->base;
 }
 
+
 void
 i915_texture_release(struct pipe_context *pipe, struct pipe_texture **pt)
 {
diff --git a/src/mesa/pipe/i915simple/i915_texture.h b/src/mesa/pipe/i915simple/i915_texture.h
index 84a0502e81..330d111dc7 100644
--- a/src/mesa/pipe/i915simple/i915_texture.h
+++ b/src/mesa/pipe/i915simple/i915_texture.h
@@ -6,8 +6,9 @@ struct pipe_context;
 struct pipe_texture;
 
 
-extern void
-i915_texture_create(struct pipe_context *pipe, struct pipe_texture **pt);
+struct pipe_texture *
+i915_texture_create(struct pipe_context *pipe,
+                    const struct pipe_texture *templat);
 
 extern void
 i915_texture_release(struct pipe_context *pipe, struct pipe_texture **pt);
diff --git a/src/mesa/pipe/i965simple/brw_tex_layout.c b/src/mesa/pipe/i965simple/brw_tex_layout.c
index b8b6b579e2..405fd1f794 100644
--- a/src/mesa/pipe/i965simple/brw_tex_layout.c
+++ b/src/mesa/pipe/i965simple/brw_tex_layout.c
@@ -299,15 +299,14 @@ static boolean brw_miptree_layout(struct pipe_context *pipe, struct brw_texture
    return TRUE;
 }
 
-void
-brw_texture_create(struct pipe_context *pipe, struct pipe_texture **pt)
+
+struct pipe_texture *
+brw_texture_create(struct pipe_context *pipe, const struct pipe_texture *templat)
 {
-   struct brw_texture *tex = REALLOC(*pt, sizeof(struct pipe_texture),
-                                     sizeof(struct brw_texture));
+   struct brw_texture *tex = CALLOC_STRUCT(brw_texture);
 
    if (tex) {
-      memset(&tex->base + 1, 0,
-	     sizeof(struct brw_texture) - sizeof(struct pipe_texture));
+      tex->base = *templat;
 
       if (brw_miptree_layout(pipe, tex))
 	 tex->buffer = pipe->winsys->buffer_create(pipe->winsys, 64,
@@ -317,11 +316,11 @@ brw_texture_create(struct pipe_context *pipe, struct pipe_texture **pt)
 
       if (!tex->buffer) {
 	 FREE(tex);
-	 tex = NULL;
+         return NULL;
       }
    }
 
-   *pt = &tex->base;
+   return &tex->base;
 }
 
 void
diff --git a/src/mesa/pipe/i965simple/brw_tex_layout.h b/src/mesa/pipe/i965simple/brw_tex_layout.h
index 15e275058a..cfd6b1ef3a 100644
--- a/src/mesa/pipe/i965simple/brw_tex_layout.h
+++ b/src/mesa/pipe/i965simple/brw_tex_layout.h
@@ -6,8 +6,8 @@
 struct pipe_context;
 struct pipe_texture;
 
-extern void
-brw_texture_create(struct pipe_context *pipe, struct pipe_texture **pt);
+extern struct pipe_texture *
+brw_texture_create(struct pipe_context *pipe, const struct pipe_texture *templat);
 
 extern void
 brw_texture_release(struct pipe_context *pipe, struct pipe_texture **pt);
diff --git a/src/mesa/pipe/p_context.h b/src/mesa/pipe/p_context.h
index 0dda06c53b..92a1cd70c4 100644
--- a/src/mesa/pipe/p_context.h
+++ b/src/mesa/pipe/p_context.h
@@ -199,8 +199,8 @@ struct pipe_context {
    /*
     * Texture functions
     */
-   void (*texture_create)(struct pipe_context *pipe,
-			  struct pipe_texture **pt);
+   struct pipe_texture * (*texture_create)(struct pipe_context *pipe,
+                                           const struct pipe_texture *templat);
 
    void (*texture_release)(struct pipe_context *pipe,
 			   struct pipe_texture **pt);
diff --git a/src/mesa/pipe/softpipe/sp_texture.c b/src/mesa/pipe/softpipe/sp_texture.c
index 172234843d..fd2cc3dbbb 100644
--- a/src/mesa/pipe/softpipe/sp_texture.c
+++ b/src/mesa/pipe/softpipe/sp_texture.c
@@ -79,31 +79,30 @@ softpipe_texture_layout(struct softpipe_texture * spt)
 }
 
 
-void
-softpipe_texture_create(struct pipe_context *pipe, struct pipe_texture **pt)
+struct pipe_texture *
+softpipe_texture_create(struct pipe_context *pipe,
+                        const struct pipe_texture *templat)
 {
-   struct softpipe_texture *spt = REALLOC(*pt, sizeof(struct pipe_texture),
-					  sizeof(struct softpipe_texture));
-
-   if (spt) {
-      memset(&spt->base + 1, 0,
-	     sizeof(struct softpipe_texture) - sizeof(struct pipe_texture));
+   struct softpipe_texture *spt = CALLOC_STRUCT(softpipe_texture);
+   if (!spt)
+      return NULL;
 
-      softpipe_texture_layout(spt);
+   spt->base = *templat;
 
-      spt->buffer = pipe->winsys->buffer_create(pipe->winsys, 32,
-                                                PIPE_BUFFER_USAGE_PIXEL,
-                                                spt->buffer_size);
+   softpipe_texture_layout(spt);
 
-      if (!spt->buffer) {
-	 FREE(spt);
-	 spt = NULL;
-      }
+   spt->buffer = pipe->winsys->buffer_create(pipe->winsys, 32,
+                                             PIPE_BUFFER_USAGE_PIXEL,
+                                             spt->buffer_size);
+   if (!spt->buffer) {
+      FREE(spt);
+      return NULL;
    }
 
-   *pt = &spt->base;
+   return &spt->base;
 }
 
+
 void
 softpipe_texture_release(struct pipe_context *pipe, struct pipe_texture **pt)
 {
diff --git a/src/mesa/pipe/softpipe/sp_texture.h b/src/mesa/pipe/softpipe/sp_texture.h
index c6cf370351..fa646c0de9 100644
--- a/src/mesa/pipe/softpipe/sp_texture.h
+++ b/src/mesa/pipe/softpipe/sp_texture.h
@@ -55,8 +55,9 @@ softpipe_texture(struct pipe_texture *pt)
 
 
-extern void
-softpipe_texture_create(struct pipe_context *pipe, struct pipe_texture **pt);
+extern struct pipe_texture *
+softpipe_texture_create(struct pipe_context *pipe,
+                        const struct pipe_texture *templat);
 
 extern void
 softpipe_texture_release(struct pipe_context *pipe, struct pipe_texture **pt);
diff --git a/src/mesa/state_tracker/st_texture.c b/src/mesa/state_tracker/st_texture.c
index 741f36c2a7..844a9f80d8 100644
--- a/src/mesa/state_tracker/st_texture.c
+++ b/src/mesa/state_tracker/st_texture.c
@@ -74,7 +74,7 @@ st_texture_create(struct st_context *st,
 		  GLuint depth0,
 		  GLuint compress_byte)
 {
-   struct pipe_texture *pt = CALLOC_STRUCT(pipe_texture);
+   struct pipe_texture pt = {0};  /* zeroed: drivers copy the whole template */
 
    assert(target <= PIPE_TEXTURE_CUBE);
 
@@ -82,25 +82,20 @@ st_texture_create(struct st_context *st,
        _mesa_lookup_enum_by_nr(target),
        _mesa_lookup_enum_by_nr(format), first_level, last_level);
 
-   if (!pt)
-      return NULL;
-
    assert(format);
 
-   pt->target = target;
-   pt->format = format;
-   pt->first_level = first_level;
-   pt->last_level = last_level;
-   pt->width[0] = width0;
-   pt->height[0] = height0;
-   pt->depth[0] = depth0;
-   pt->compressed = compress_byte ? 1 : 0;
-   pt->cpp = pt->compressed ? compress_byte : st_sizeof_format(format);
-   pt->refcount = 1; 
-
-   st->pipe->texture_create(st->pipe, &pt);
-
-   return pt;
+   pt.target = target;
+   pt.format = format;
+   pt.first_level = first_level;
+   pt.last_level = last_level;
+   pt.width[0] = width0;
+   pt.height[0] = height0;
+   pt.depth[0] = depth0;
+   pt.compressed = compress_byte ? 1 : 0;
+   pt.cpp = pt.compressed ? compress_byte : st_sizeof_format(format);
+   pt.refcount = 1; 
+
+   return st->pipe->texture_create(st->pipe, &pt);
 }
 
 
-- 
cgit v1.2.3


From c0235d0a24da82304f7f23936c71032c0a9a7ce1 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Wed, 6 Feb 2008 09:35:33 -0700
Subject: Cell: use mem_dup()

---
 src/mesa/pipe/cell/ppu/cell_state_blend.c   | 9 ++-------
 src/mesa/pipe/cell/ppu/cell_state_sampler.c | 4 +---
 2 files changed, 3 insertions(+), 10 deletions(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/ppu/cell_state_blend.c b/src/mesa/pipe/cell/ppu/cell_state_blend.c
index 2c19aa3971..4fc60548c8 100644
--- a/src/mesa/pipe/cell/ppu/cell_state_blend.c
+++ b/src/mesa/pipe/cell/ppu/cell_state_blend.c
@@ -39,9 +39,7 @@ void *
 cell_create_blend_state(struct pipe_context *pipe,
                         const struct pipe_blend_state *blend)
 {
-   struct pipe_blend_state *state = MALLOC(sizeof(struct pipe_blend_state));
-   memcpy(state, blend, sizeof(struct pipe_blend_state));
-   return state;
+   return mem_dup(blend, sizeof(*blend));
 }
 
 
@@ -85,10 +83,7 @@ void *
 cell_create_depth_stencil_alpha_state(struct pipe_context *pipe,
                  const struct pipe_depth_stencil_alpha_state *depth_stencil)
 {
-   struct pipe_depth_stencil_alpha_state *state =
-      MALLOC(sizeof(struct pipe_depth_stencil_alpha_state));
-   memcpy(state, depth_stencil, sizeof(struct pipe_depth_stencil_alpha_state));
-   return state;
+   return mem_dup(depth_stencil, sizeof(*depth_stencil));
 }
 
 
diff --git a/src/mesa/pipe/cell/ppu/cell_state_sampler.c b/src/mesa/pipe/cell/ppu/cell_state_sampler.c
index 317f7603bb..ade6cc8338 100644
--- a/src/mesa/pipe/cell/ppu/cell_state_sampler.c
+++ b/src/mesa/pipe/cell/ppu/cell_state_sampler.c
@@ -40,9 +40,7 @@ void *
 cell_create_sampler_state(struct pipe_context *pipe,
                           const struct pipe_sampler_state *sampler)
 {
-   struct pipe_sampler_state *state = MALLOC( sizeof(struct pipe_sampler_state) );
-   memcpy(state, sampler, sizeof(struct pipe_sampler_state));
-   return state;
+   return mem_dup(sampler, sizeof(*sampler));
 }
 
 void
-- 
cgit v1.2.3


From c4e0d725dc9f18aed2babed344bb4e42df9e481f Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Wed, 6 Feb 2008 09:48:56 -0700
Subject: Cell: silence unused var warnings

---
 src/mesa/pipe/cell/ppu/cell_state_fs.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'src/mesa/pipe/cell')

diff --git a/src/mesa/pipe/cell/ppu/cell_state_fs.c b/src/mesa/pipe/cell/ppu/cell_state_fs.c
index 81c2ac14dd..96a52273b0 100644
--- a/src/mesa/pipe/cell/ppu/cell_state_fs.c
+++ b/src/mesa/pipe/cell/ppu/cell_state_fs.c
@@ -45,7 +45,7 @@ void *
 cell_create_fs_state(struct pipe_context *pipe,
                      const struct pipe_shader_state *templ)
 {
-   struct cell_context *cell = cell_context(pipe);
+   /*struct cell_context *cell = cell_context(pipe);*/
    struct cell_fragment_shader_state *state;
 
    state = CALLOC_STRUCT(cell_fragment_shader_state);
@@ -94,8 +94,6 @@ cell_bind_fs_state(struct pipe_context *pipe, void *fs)
 void
 cell_delete_fs_state(struct pipe_context *pipe, void *fs)
 {
-   struct cell_context *cell = cell_context(pipe);
-
    struct cell_fragment_shader_state *state =
       (struct cell_fragment_shader_state *) fs;
 
-- 
cgit v1.2.3