8 files changed, 138 insertions, 141 deletions
diff --git a/src/gallium/drivers/cell/ppu/cell_draw_arrays.c b/src/gallium/drivers/cell/ppu/cell_draw_arrays.c
index 67949b73dd..644496db40 100644
--- a/src/gallium/drivers/cell/ppu/cell_draw_arrays.c
+++ b/src/gallium/drivers/cell/ppu/cell_draw_arrays.c
@@ -51,7 +51,7 @@ cell_map_constant_buffers(struct cell_context *sp)
    struct pipe_winsys *ws = sp->pipe.winsys;
    uint i;
    for (i = 0; i < 2; i++) {
-      if (sp->constants[i].size) {
+      if (sp->constants[i].buffer && sp->constants[i].buffer->size) {
          sp->mapped_constants[i] = ws->buffer_map(ws, sp->constants[i].buffer,
                                                    PIPE_BUFFER_USAGE_CPU_READ);
          cell_flush_buffer_range(sp, sp->mapped_constants[i], 
@@ -61,7 +61,7 @@ cell_map_constant_buffers(struct cell_context *sp)
 
    draw_set_mapped_constant_buffer(sp->draw,
                                    sp->mapped_constants[PIPE_SHADER_VERTEX],
-                                   sp->constants[PIPE_SHADER_VERTEX].size);
+                                   sp->constants[PIPE_SHADER_VERTEX].buffer->size);
 }
 
 static void
@@ -70,7 +70,7 @@ cell_unmap_constant_buffers(struct cell_context *sp)
    struct pipe_winsys *ws = sp->pipe.winsys;
    uint i;
    for (i = 0; i < 2; i++) {
-      if (sp->constants[i].size)
+      if (sp->constants[i].buffer && sp->constants[i].buffer->size)
          ws->buffer_unmap(ws, sp->constants[i].buffer);
       sp->mapped_constants[i] = NULL;
    }
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
index 9bdc71b676..66d4b3b6a3 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -161,7 +161,7 @@ gen_alpha_test(const struct pipe_depth_stencil_alpha_state *dsa,
    if ((dsa->alpha.func != PIPE_FUNC_NEVER) &&
        (dsa->alpha.func != PIPE_FUNC_ALWAYS)) {
       /* load/splat the alpha reference float value */
-      spe_load_float(f, ref_reg, dsa->alpha.ref);
+      spe_load_float(f, ref_reg, dsa->alpha.ref_value);
    }
 
    /* emit code to do the alpha comparison, updating 'mask' */
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index 39b85faeb8..ff529fe22c 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -239,7 +239,7 @@ cell_emit_state(struct cell_context *cell)
 
    if (cell->dirty & (CELL_NEW_FS_CONSTANTS)) {
       const uint shader = PIPE_SHADER_FRAGMENT;
-      const uint num_const = cell->constants[shader].size / sizeof(float);
+      const uint num_const = cell->constants[shader].buffer->size / sizeof(float);
       uint i, j;
       float *buf = cell_batch_alloc16(cell, ROUNDUP16(32 + num_const * sizeof(float)));
       uint32_t *ibuf = (uint32_t *) buf;
diff --git a/src/gallium/drivers/cell/ppu/cell_state_shader.c b/src/gallium/drivers/cell/ppu/cell_state_shader.c
index 990f23e170..bf517ea563 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_shader.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_shader.c
@@ -186,7 +186,6 @@ cell_set_constant_buffer(struct pipe_context *pipe,
                          const struct pipe_constant_buffer *buf)
 {
    struct cell_context *cell = cell_context(pipe);
-   struct pipe_winsys *ws = pipe->winsys;
 
    assert(shader < PIPE_SHADER_TYPES);
    assert(index == 0);
@@ -197,7 +196,6 @@ cell_set_constant_buffer(struct pipe_context *pipe,
    pipe_buffer_reference(pipe->screen,
                          &cell->constants[shader].buffer,
                          buf->buffer);
-   cell->constants[shader].size = buf->size;
 
    if (shader == PIPE_SHADER_VERTEX)
       cell->dirty |= CELL_NEW_VS_CONSTANTS;
diff --git a/src/gallium/drivers/cell/ppu/cell_texture.c b/src/gallium/drivers/cell/ppu/cell_texture.c
index 4f16e2c6af..9ba995ab7d 100644
--- a/src/gallium/drivers/cell/ppu/cell_texture.c
+++ b/src/gallium/drivers/cell/ppu/cell_texture.c
@@ -307,9 +307,8 @@ cell_twiddle_texture(struct pipe_screen *screen,
    const uint texHeight = ct->base.height[level];
    const uint bufWidth = align(texWidth, TILE_SIZE);
    const uint bufHeight = align(texHeight, TILE_SIZE);
-   const void *map = pipe_buffer_map(screen, surface->buffer,
-                                     PIPE_BUFFER_USAGE_CPU_READ);
-   const uint *src = (const uint *) ((const ubyte *) map + surface->offset);
+   const void *map = screen->surface_map(screen, surface, PIPE_BUFFER_USAGE_CPU_READ);
+   const uint *src = (const uint *) map;
 
    switch (ct->base.format) {
    case PIPE_FORMAT_A8R8G8B8_UNORM:
@@ -324,12 +323,12 @@ cell_twiddle_texture(struct pipe_screen *screen,
             /* allocate buffer for tiled data now */
             struct pipe_winsys *ws = screen->winsys;
             uint bytes = bufWidth * bufHeight * 4 * numFaces;
-            ct->tiled_buffer[level] = ws->buffer_create(ws, 16,
-                                                         PIPE_BUFFER_USAGE_PIXEL,
-                                                         bytes);
+            ct->tiled_buffer[level] =
+               ws->buffer_create(ws, 16, PIPE_BUFFER_USAGE_PIXEL, bytes);
             /* and map it */
-            ct->tiled_mapped[level] = ws->buffer_map(ws, ct->tiled_buffer[level],
-                                                      PIPE_BUFFER_USAGE_GPU_READ);
+            ct->tiled_mapped[level] =
+               ws->buffer_map(ws, ct->tiled_buffer[level],
+                              PIPE_BUFFER_USAGE_GPU_READ);
          }
          dst = (uint *) ((ubyte *) ct->tiled_mapped[level] + offset);
 
@@ -338,11 +337,11 @@ cell_twiddle_texture(struct pipe_screen *screen,
       }
       break;
    default:
-      printf("Cell: twiddle unsupported texture format %s\n", pf_name(ct->base.format));
-      ;
+      printf("Cell: twiddle unsupported texture format %s\n",
+             pf_name(ct->base.format));
    }
 
-   pipe_buffer_unmap(screen, surface->buffer);
+   screen->surface_unmap(screen, surface);
 }
 
 
@@ -357,8 +356,7 @@ cell_untwiddle_texture(struct pipe_screen *screen,
    const uint level = surface->level;
    const uint texWidth = ct->base.width[level];
    const uint texHeight = ct->base.height[level];
-   const void *map = pipe_buffer_map(screen, surface->buffer,
-                                     PIPE_BUFFER_USAGE_CPU_READ);
+   const void *map = screen->surface_map(screen, surface, PIPE_BUFFER_USAGE_CPU_READ);
    const uint *src = (const uint *) ((const ubyte *) map + surface->offset);
 
    switch (ct->base.format) {
@@ -384,11 +382,12 @@ cell_untwiddle_texture(struct pipe_screen *screen,
    default:
       {
          ct->untiled_data[level] = NULL;
-         printf("Cell: untwiddle unsupported texture format %s\n", pf_name(ct->base.format));
+         printf("Cell: untwiddle unsupported texture format %s\n",
+                pf_name(ct->base.format));
       }
    }
 
-   pipe_buffer_unmap(screen, surface->buffer);
+   screen->surface_unmap(screen, surface);
 }
 
 
@@ -398,15 +397,13 @@ cell_get_tex_surface(struct pipe_screen *screen,
                      unsigned face, unsigned level, unsigned zslice,
                      unsigned usage)
 {
-   struct pipe_winsys *ws = screen->winsys;
    struct cell_texture *ct = cell_texture(pt);
    struct pipe_surface *ps;
 
-   ps = ws->surface_alloc(ws);
+   ps = CALLOC_STRUCT(pipe_surface);
    if (ps) {
-      assert(ps->refcount);
-      assert(ps->winsys);
-      pipe_buffer_reference(screen, &ps->buffer, ct->buffer);
+      ps->refcount = 1;
+      pipe_texture_reference(&ps->texture, pt);
       ps->format = pt->format;
       ps->block = pt->block;
       ps->width = pt->width[level];
@@ -425,9 +422,9 @@ cell_get_tex_surface(struct pipe_screen *screen,
       ps->zslice = zslice;
 
       if (pt->target == PIPE_TEXTURE_CUBE || pt->target == PIPE_TEXTURE_3D) {
-	         ps->offset += ((pt->target == PIPE_TEXTURE_CUBE) ? face : zslice) *
-		      ps->nblocksy *
-		      ps->stride;
+         ps->offset += ((pt->target == PIPE_TEXTURE_CUBE) ? face : zslice) *
+            ps->nblocksy *
+            ps->stride;
       }
       else {
          assert(face == 0);
@@ -449,18 +446,27 @@ cell_tex_surface_release(struct pipe_screen *screen,
 {
    struct cell_texture *ct = cell_texture((*s)->texture);
    const uint level = (*s)->level;
+   struct pipe_surface *surf = *s;
 
-   if (((*s)->usage & PIPE_BUFFER_USAGE_CPU_READ) && (ct->untiled_data[level]))
+   if ((surf->usage & PIPE_BUFFER_USAGE_CPU_READ) && (ct->untiled_data[level]))
    {
       align_free(ct->untiled_data[level]);
       ct->untiled_data[level] = NULL;
    }
 
-   /* XXX if done rendering to teximage, re-tile */
+   if ((ct->base.tex_usage & PIPE_TEXTURE_USAGE_SAMPLER) &&
+       (surf->usage & PIPE_BUFFER_USAGE_CPU_WRITE)) {
+      /* convert from linear to tiled layout */
+      cell_twiddle_texture(screen, surf);
+   }
 
-   pipe_texture_reference(&(*s)->texture, NULL); 
+   /* XXX if done rendering to teximage, re-tile */
 
-   screen->winsys->surface_release(screen->winsys, s);
+   if (--surf->refcount == 0) {
+      pipe_texture_reference(&surf->texture, NULL);
+      FREE(surf);
+   }
+   *s = NULL;
 }
 
 
@@ -475,17 +481,20 @@ cell_surface_map(struct pipe_screen *screen,
 
    assert(ct);
 
+#if 0
    if (flags & ~surface->usage) {
       assert(0);
       return NULL;
    }
+#endif
 
-   map = pipe_buffer_map( screen, surface->buffer, flags );
-   if (map == NULL)
+   map = pipe_buffer_map( screen, ct->buffer, flags );
+   if (map == NULL) {
       return NULL;
-   else
-   {
-      if ((surface->usage & PIPE_BUFFER_USAGE_CPU_READ) && (ct->untiled_data[level])) {
+   }
+   else {
+      if ((surface->usage & PIPE_BUFFER_USAGE_CPU_READ) &&
+          (ct->untiled_data[level])) {
          return (void *) ((ubyte *) ct->untiled_data[level] + surface->offset);
       }
       else {
@@ -503,13 +512,7 @@ cell_surface_unmap(struct pipe_screen *screen,
 
    assert(ct);
 
-   if ((ct->base.tex_usage & PIPE_TEXTURE_USAGE_SAMPLER) &&
-       (surface->usage & PIPE_BUFFER_USAGE_CPU_WRITE)) {
-      /* convert from linear to tiled layout */
-      cell_twiddle_texture(screen, surface);
-   }
-
-   pipe_buffer_unmap( screen, surface->buffer );
+   pipe_buffer_unmap( screen, ct->buffer );
 }
 
 
diff --git a/src/gallium/drivers/cell/spu/Makefile b/src/gallium/drivers/cell/spu/Makefile
index 116453b79c..3cc52301da 100644
--- a/src/gallium/drivers/cell/spu/Makefile
+++ b/src/gallium/drivers/cell/spu/Makefile
@@ -33,9 +33,10 @@ OLD_SOURCES = \
 	spu_vertex_shader.c
 
 
-SPU_OBJECTS = $(SOURCES:.c=.o) \
+SPU_OBJECTS = $(SOURCES:.c=.o)
+
+SPU_ASM_OUT = $(SOURCES:.c=.s)
 
-SPU_ASM_OUT = $(SOURCES:.c=.s) \
 
 INCLUDE_DIRS = \
 	-I$(TOP)/src/mesa \
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
index 683664e8a4..eba9f95cf1 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
@@ -85,7 +85,7 @@ spu_fallback_fragment_ops(uint x, uint y,
     * Do alpha test
     */
    if (spu.depth_stencil_alpha.alpha.enabled) {
-      vector float ref = spu_splats(spu.depth_stencil_alpha.alpha.ref);
+      vector float ref = spu_splats(spu.depth_stencil_alpha.alpha.ref_value);
       vector unsigned int amask;
 
       switch (spu.depth_stencil_alpha.alpha.func) {
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 0d9fcb9997..d727268475 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -29,7 +29,6 @@
  * Triangle rendering within a tile.
  */
 
-#include <transpose_matrix4x4.h>
 #include "pipe/p_compiler.h"
 #include "pipe/p_format.h"
 #include "util/u_math.h"
@@ -71,6 +70,12 @@ struct vertex_header {
 #define MASK_ALL          0xf
 
 
+#define CHAN0 0
+#define CHAN1 1
+#define CHAN2 2
+#define CHAN3 3
+
+
 #define DEBUG_VERTS 0
 
 /**
@@ -144,99 +149,97 @@ struct setup_stage {
 static struct setup_stage setup;
 
 
-/**
- * Evaluate attribute coefficients (plane equations) to compute
- * attribute values for the four fragments in a quad.
- * Eg: four colors will be computed (in AoS format).
- */
-static INLINE void
-eval_coeff(uint slot, float x, float y, vector float w, vector float result[4])
+static INLINE vector float
+splatx(vector float v)
 {
-   switch (spu.vertex_info.attrib[slot].interp_mode) {
-   case INTERP_CONSTANT:
-      result[QUAD_TOP_LEFT] =
-      result[QUAD_TOP_RIGHT] =
-      result[QUAD_BOTTOM_LEFT] =
-      result[QUAD_BOTTOM_RIGHT] = setup.coef[slot].a0;
-      break;
-   case INTERP_LINEAR:
-      {
-         vector float dadx = setup.coef[slot].dadx;
-         vector float dady = setup.coef[slot].dady;
-         vector float topLeft =
-            spu_add(setup.coef[slot].a0,
-                    spu_add(spu_mul(spu_splats(x), dadx),
-                            spu_mul(spu_splats(y), dady)));
-
-         result[QUAD_TOP_LEFT] = topLeft;
-         result[QUAD_TOP_RIGHT] = spu_add(topLeft, dadx);
-         result[QUAD_BOTTOM_LEFT] = spu_add(topLeft, dady);
-         result[QUAD_BOTTOM_RIGHT] = spu_add(spu_add(topLeft, dadx), dady);
-      }
-      break;
-   case INTERP_PERSPECTIVE:
-      {
-         vector float dadx = setup.coef[slot].dadx;
-         vector float dady = setup.coef[slot].dady;
-         vector float topLeft =
-            spu_add(setup.coef[slot].a0,
-                    spu_add(spu_mul(spu_splats(x), dadx),
-                            spu_mul(spu_splats(y), dady)));
-
-         vector float wInv = spu_re(w);  /* 1.0 / w */
-
-         result[QUAD_TOP_LEFT] = spu_mul(topLeft, wInv);
-         result[QUAD_TOP_RIGHT] = spu_mul(spu_add(topLeft, dadx), wInv);
-         result[QUAD_BOTTOM_LEFT] = spu_mul(spu_add(topLeft, dady), wInv);
-         result[QUAD_BOTTOM_RIGHT] = spu_mul(spu_add(spu_add(topLeft, dadx), dady), wInv);
-      }
-      break;
-   case INTERP_POS:
-   case INTERP_NONE:
-      break;
-   default:
-      ASSERT(0);
-   }
+   return spu_splats(spu_extract(v, CHAN0));
 }
 
-
-/**
- * As above, but return 4 vectors in SOA format.
- * XXX this will all be re-written someday.
- */
-static INLINE void
-eval_coeff_soa(uint slot, float x, float y, vector float w, vector float result[4])
+static INLINE vector float
+splaty(vector float v)
 {
-   eval_coeff(slot, x, y, w, result);
-   _transpose_matrix4x4(result, result);
+   return spu_splats(spu_extract(v, CHAN1));
 }
 
+static INLINE vector float
+splatz(vector float v)
+{
+   return spu_splats(spu_extract(v, CHAN2));
+}
 
-/** Evalute coefficients to get Z for four pixels in a quad */
 static INLINE vector float
-eval_z(float x, float y)
+splatw(vector float v)
 {
-   const uint slot = 0;
-   const float dzdx = spu_extract(setup.coef[slot].dadx, 2);
-   const float dzdy = spu_extract(setup.coef[slot].dady, 2);
-   const float topLeft = spu_extract(setup.coef[slot].a0, 2) + x * dzdx + y * dzdy;
-   const vector float topLeftv = spu_splats(topLeft);
-   const vector float derivs = (vector float) { 0.0, dzdx, dzdy, dzdx + dzdy };
-   return spu_add(topLeftv, derivs);
+   return spu_splats(spu_extract(v, CHAN3));
 }
 
 
-/** Evalute coefficients to get W for four pixels in a quad */
-static INLINE vector float
-eval_w(float x, float y)
+/**
+ * Setup fragment shader inputs by evaluating triangle's vertex
+ * attribute coefficient info.
+ * \param x  quad x pos
+ * \param y  quad y pos
+ * \param fragZ  returns quad Z values
+ * \param fragInputs  returns fragment program inputs
+ * Note: this code could be incorporated into the fragment program
+ * itself to avoid the loop and switch.
+ */
+static void
+eval_inputs(float x, float y, vector float *fragZ, vector float fragInputs[])
 {
-   const uint slot = 0;
-   const float dwdx = spu_extract(setup.coef[slot].dadx, 3);
-   const float dwdy = spu_extract(setup.coef[slot].dady, 3);
-   const float topLeft = spu_extract(setup.coef[slot].a0, 3) + x * dwdx + y * dwdy;
-   const vector float topLeftv = spu_splats(topLeft);
-   const vector float derivs = (vector float) { 0.0, dwdx, dwdy, dwdx + dwdy };
-   return spu_add(topLeftv, derivs);
+   static const vector float deltaX = (const vector float) {0, 1, 0, 1};
+   static const vector float deltaY = (const vector float) {0, 0, 1, 1};
+
+   const uint posSlot = 0;
+   const vector float pos = setup.coef[posSlot].a0;
+   const vector float dposdx = setup.coef[posSlot].dadx;
+   const vector float dposdy = setup.coef[posSlot].dady;
+   const vector float fragX = spu_splats(x) + deltaX;
+   const vector float fragY = spu_splats(y) + deltaY;
+   vector float fragW, wInv;
+   uint i;
+
+   *fragZ = splatz(pos) + fragX * splatz(dposdx) + fragY * splatz(dposdy);
+   fragW =  splatw(pos) + fragX * splatw(dposdx) + fragY * splatw(dposdy);
+   wInv = spu_re(fragW);  /* 1 / w */
+
+   /* loop over fragment program inputs */
+   for (i = 0; i < spu.vertex_info.num_attribs; i++) {
+      uint attr = i + 1;
+      enum interp_mode interp = spu.vertex_info.attrib[attr].interp_mode;
+
+      /* constant term */
+      vector float a0 = setup.coef[attr].a0;
+      vector float r0 = splatx(a0);
+      vector float r1 = splaty(a0);
+      vector float r2 = splatz(a0);
+      vector float r3 = splatw(a0);
+
+      if (interp == INTERP_LINEAR || interp == INTERP_PERSPECTIVE) {
+         /* linear term */
+         vector float dadx = setup.coef[attr].dadx;
+         vector float dady = setup.coef[attr].dady;
+         /* Use SPU intrinsics here to get slightly better code.
+          * originally: r0 += fragX * splatx(dadx) + fragY * splatx(dady);
+          */
+         r0 = spu_madd(fragX, splatx(dadx), spu_madd(fragY, splatx(dady), r0));
+         r1 = spu_madd(fragX, splaty(dadx), spu_madd(fragY, splaty(dady), r1));
+         r2 = spu_madd(fragX, splatz(dadx), spu_madd(fragY, splatz(dady), r2));
+         r3 = spu_madd(fragX, splatw(dadx), spu_madd(fragY, splatw(dady), r3));
+         if (interp == INTERP_PERSPECTIVE) {
+            /* perspective term */
+            r0 *= wInv;
+            r1 *= wInv;
+            r2 *= wInv;
+            r3 *= wInv;
+         }
+      }
+      fragInputs[CHAN0] = r0;
+      fragInputs[CHAN1] = r1;
+      fragInputs[CHAN2] = r2;
+      fragInputs[CHAN3] = r3;
+      fragInputs += 4;
+   }
 }
 
 
@@ -262,19 +265,11 @@ emit_quad( int x, int y, mask_t mask)
           * Run fragment shader, execute per-fragment ops, update fb/tile.
           */
          vector float inputs[4*4], outputs[2*4];
-         vector float fragZ = eval_z((float) x, (float) y);
-         vector float fragW = eval_w((float) x, (float) y);
          vector unsigned int kill_mask;
+         vector float fragZ;
+
+         eval_inputs((float) x, (float) y, &fragZ, inputs);
 
-         /* setup inputs */
-#if 0
-         eval_coeff_soa(1, (float) x, (float) y, fragW, inputs);
-#else
-         uint i;
-         for (i = 0; i < spu.vertex_info.num_attribs; i++) {
-            eval_coeff_soa(i+1, (float) x, (float) y, fragW, inputs + i * 4);
-         }
-#endif
          ASSERT(spu.fragment_program);
          ASSERT(spu.fragment_ops);