From 582ca6e4180e45655ea5f85ac1c823a665efad47 Mon Sep 17 00:00:00 2001
From: Jonathan White <jwhite@tungstengraphics.com>
Date: Mon, 27 Oct 2008 16:29:20 -0600
Subject: cell: Added support for untwiddling textures during glReadPixels.  
 This allows glReadPixels to work correctly on cell now and makes conformance
 tests that use pixel compares useable.

---
 src/gallium/drivers/cell/ppu/cell_texture.c | 158 ++++++++++++++++++++++++++--
 src/gallium/drivers/cell/ppu/cell_texture.h |   1 +
 2 files changed, 152 insertions(+), 7 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/cell/ppu/cell_texture.c b/src/gallium/drivers/cell/ppu/cell_texture.c
index 9ac2f3bbb9..8ae4439f6c 100644
--- a/src/gallium/drivers/cell/ppu/cell_texture.c
+++ b/src/gallium/drivers/cell/ppu/cell_texture.c
@@ -41,7 +41,6 @@
 #include "cell_state.h"
 #include "cell_texture.h"
 
-
 /* Simple, maximally packed layout.
  */
 
@@ -210,6 +209,87 @@ twiddle_image_uint(uint w, uint h, uint tile_size, uint *dst,
    }
 }
 
+/**
+ * For Cell.  Rearrange one TILE_SIZE x TILE_SIZE tile whose 2x2 quads are
+ * stored as four consecutive uints in tileIn:
+ *  +--+--+--+--+
+ *  |p0|p1|p2|p3|....
+ *  +--+--+--+--+
+ * into a row-major tile in tileOut, each quad occupying a 2x2 block:
+ *  +--+--+
+ *  |p0|p1|....
+ *  +--+--+
+ *  |p2|p3|
+ *  +--+--+
+ */
+static void
+twiddle_tile(const uint *tileIn, uint *tileOut)
+{
+   int y, x;
+
+   for (y = 0; y < TILE_SIZE; y+=2) {
+      for (x = 0; x < TILE_SIZE; x+=2) {
+         int k = 4 * (y/2 * TILE_SIZE/2 + x/2); /* index of this quad's p0 */
+         tileOut[y * TILE_SIZE + (x + 0)] = tileIn[k]; /* p0 */
+         tileOut[y * TILE_SIZE + (x + 1)] = tileIn[k+1]; /* p1 */
+         tileOut[(y + 1) * TILE_SIZE + (x + 0)] = tileIn[k+2]; /* p2 */
+         tileOut[(y + 1) * TILE_SIZE + (x + 1)] = tileIn[k+3]; /* p3 */
+      }
+   }
+}
+
+/**
+ * Convert image from tiled layout to linear layout.  4-byte pixels.
+ * Despite its name, src_stride is the row stride (in bytes) of dst.
+ */
+static void
+untwiddle_image_uint(uint w, uint h, uint tile_size, uint *dst,
+                     uint src_stride, const uint *src)
+{
+   const uint tile_size2 = tile_size * tile_size;
+   const uint h_t = (h + tile_size - 1) / tile_size;
+   const uint w_t = (w + tile_size - 1) / tile_size;
+   uint *tile_buf;
+   uint it, jt;  /* tile counters */
+   uint i, j;    /* intra-tile counters */
+
+   src_stride /= 4; /* convert from bytes to pixels */
+   ASSERT(tile_size == TILE_SIZE); /* twiddle_tile() always fills TILE_SIZE^2 */
+   tile_buf = align_malloc(tile_size2 * 4, 16);
+   if (!tile_buf) return; /* out of memory: leave dst unconverted */
+
+   /* loop over src tiles */
+   for (it = 0; it < h_t; it++) {
+      for (jt = 0; jt < w_t; jt++) {
+         /* start of src tile: */
+         const uint *tsrc = src + (it * w_t + jt) * tile_size2;
+         twiddle_tile(tsrc, tile_buf); /* unpack quads into row-major tile */
+         tsrc = tile_buf;
+
+         /* compute size of this tile (may be smaller than tile_size) */
+         /* XXX note: a compiler bug was found here. That's why the code
+          * looks as it does.
+          */
+         uint tile_width = w - jt * tile_size;
+         tile_width = MIN2(tile_width, tile_size);
+         uint tile_height = h - it * tile_size;
+         tile_height = MIN2(tile_height, tile_size);
+
+         /* loop over texels in the tile */
+         for (i = 0; i < tile_height; i++) {
+            for (j = 0; j < tile_width; j++) {
+               uint dsti = it * tile_size + i;
+               uint dstj = jt * tile_size + j;
+               ASSERT(dsti < h);
+               ASSERT(dstj < w);
+               dst[dsti * src_stride + dstj] = tsrc[i * tile_size + j];
+            }
+         }
+      }
+   }
+
+   align_free(tile_buf);
+}
 
 /**
  * Convert linear texture image data to tiled format for SPU usage.
@@ -260,6 +340,47 @@ cell_twiddle_texture(struct pipe_screen *screen,
    pipe_buffer_unmap(screen, surface->buffer);
 }
 
+/**
+ * Untile SPU texture data into ct->untiled_data[level] for linear app access.
+ */
+static void
+cell_untwiddle_texture(struct pipe_screen *screen,
+                     struct pipe_surface *surface)
+{
+   struct cell_texture *ct = cell_texture(surface->texture);
+   const uint level = surface->level;
+   const uint texWidth = ct->base.width[level];
+   const uint texHeight = ct->base.height[level];
+   const void *map = pipe_buffer_map(screen, surface->buffer,
+                                     PIPE_BUFFER_USAGE_CPU_READ); /* NOTE(review): map is not NULL-checked */
+   const uint *src = (const uint *) ((const ubyte *) map + surface->offset); /* tiled texels of this surface */
+
+   switch (ct->base.format) {
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
+      {
+         int numFaces = ct->base.target == PIPE_TEXTURE_CUBE ? 6 : 1;
+         int offset = surface->stride * texHeight * 4 * surface->face;
+         uint *dst; /* start of this face within the linear copy */
+
+         if (!ct->untiled_data[level]) { /* first use: allocate room for all faces */
+            ct->untiled_data[level] = /* NOTE(review): untwiddle_image_uint() treats stride as bytes, so "* 4" may over-size; result unchecked */
+               align_malloc(surface->stride * texHeight * 4 * numFaces, 16);
+         }
+
+         dst = (uint *) ((ubyte *) ct->untiled_data[level] + offset);
+
+         untwiddle_image_uint(texWidth, texHeight, TILE_SIZE, dst,
+                              surface->stride, src); /* stride is passed in bytes */
+      }
+      break;
+   default: /* any other format is only reported; nothing is converted */
+      printf("Cell: untwiddle unsupported texture format\n");
+      ;
+   }
+
+   pipe_buffer_unmap(screen, surface->buffer);
+}
+
 
 static struct pipe_surface *
 cell_get_tex_surface(struct pipe_screen *screen,
@@ -294,13 +415,18 @@ cell_get_tex_surface(struct pipe_screen *screen,
       ps->zslice = zslice;
 
       if (pt->target == PIPE_TEXTURE_CUBE || pt->target == PIPE_TEXTURE_3D) {
-	 ps->offset += ((pt->target == PIPE_TEXTURE_CUBE) ? face : zslice) *
-		       ps->nblocksy *
-		       ps->stride;
+	         ps->offset += ((pt->target == PIPE_TEXTURE_CUBE) ? face : zslice) *
+		      ps->nblocksy *
+		      ps->stride;
       }
       else {
-	 assert(face == 0);
-	 assert(zslice == 0);
+         assert(face == 0);
+         assert(zslice == 0);
+      }
+
+      if (ps->usage & PIPE_BUFFER_USAGE_CPU_READ) {
+         /* convert from tiled to linear layout */
+         cell_untwiddle_texture(screen, ps);
       }
    }
    return ps;
@@ -311,6 +437,13 @@ static void
 cell_tex_surface_release(struct pipe_screen *screen, 
                          struct pipe_surface **s)
 {
+   struct cell_texture *ct = cell_texture((*s)->texture);
+   const uint level = (*s)->level;
+
+   if ((*s)->usage & PIPE_BUFFER_USAGE_CPU_READ) {
+      align_free(ct->untiled_data[level]);
+   }
+
    /* XXX if done rendering to teximage, re-tile */
 
    pipe_texture_reference(&(*s)->texture, NULL); 
@@ -325,6 +458,10 @@ cell_surface_map(struct pipe_screen *screen,
                  unsigned flags)
 {
    ubyte *map;
+   struct cell_texture *ct = cell_texture(surface->texture);
+   const uint level = surface->level;
+
+   assert(ct);
 
    if (flags & ~surface->usage) {
       assert(0);
@@ -335,7 +472,14 @@ cell_surface_map(struct pipe_screen *screen,
    if (map == NULL)
       return NULL;
    else
-      return (void *) (map + surface->offset);
+   {
+      if (surface->usage & PIPE_BUFFER_USAGE_CPU_READ) {
+         return (void *) ((ubyte *) ct->untiled_data[level] + surface->offset);
+      }
+      else {
+         return (void *) (map + surface->offset);
+      }
+   }
 }
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_texture.h b/src/gallium/drivers/cell/ppu/cell_texture.h
index 2f5fe0dd1b..7018b0c9bf 100644
--- a/src/gallium/drivers/cell/ppu/cell_texture.h
+++ b/src/gallium/drivers/cell/ppu/cell_texture.h
@@ -52,6 +52,7 @@ struct cell_texture
    struct pipe_buffer *tiled_buffer[CELL_MAX_TEXTURE_LEVELS];
    /** Mapped, tiled texture data */
    void *tiled_mapped[CELL_MAX_TEXTURE_LEVELS];
+   void *untiled_data[CELL_MAX_TEXTURE_LEVELS];
 };
 
 
-- 
cgit v1.2.3


From 604be5561f17042f61db42b31caf4d720cf66389 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Mon, 27 Oct 2008 15:36:25 -0600
Subject: gallium: ppc: use a src register cache to avoid redundant loads

---
 src/gallium/auxiliary/tgsi/tgsi_ppc.c | 300 +++++++++++++++++++++++-----------
 1 file changed, 204 insertions(+), 96 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.c b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
index 9ad7ecd7cf..000ddba935 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ppc.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
@@ -72,11 +72,14 @@ const float ppc_builtin_constants[] ALIGN16_ATTRIB = {
 #define CHAN_Z 2
 #define CHAN_W 3
 
-#define TEMP_ONE_I   TGSI_EXEC_TEMP_ONE_I
-#define TEMP_ONE_C   TGSI_EXEC_TEMP_ONE_C
 
-#define TEMP_R0   TGSI_EXEC_TEMP_R0
-#define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
+
+struct reg_chan_vec /* NOTE(review): not referenced in the visible hunks; presumably one src-cache entry */
+{
+   struct tgsi_full_src_register src; /* TGSI source register */
+   uint chan; /* channel index -- presumably CHAN_X..CHAN_W, confirm */
+   uint vec; /* presumably a PPC vector register index, confirm */
+};
 
 
 /**
@@ -94,6 +97,18 @@ struct gen_context
 
    int one_vec;       /**< vector register with {1.0, 1.0, 1.0, 1.0} */
    int bit31_vec;     /**< vector register with {1<<31, 1<<31, 1<<31, 1<<31} */
+
+
+   /**
+    * Cache of src registers.
+    * This is used to avoid redundant load instructions.
+    */
+   struct {
+      struct tgsi_full_src_register src;
+      uint chan;
+      uint vec;
+   } regs[12];  /* 3 src regs, 4 channels */
+   uint num_regs;
 };
 
 
@@ -261,8 +276,83 @@ emit_fetch(struct gen_context *gen,
    }
 }
 
-#define FETCH( GEN, INST, DST_VEC, SRC_REG, CHAN ) \
-   emit_fetch( GEN, DST_VEC, &(INST).FullSrcRegisters[SRC_REG], CHAN )
+
+
+/**
+ * Test if two TGSI src register/channel pairs refer to the same operand:
+ * same file, index, extended swizzle and sign mode.  Avoids redundant loads.
+ */
+static boolean
+equal_src_locs(const struct tgsi_full_src_register *a, uint chan_a,
+               const struct tgsi_full_src_register *b, uint chan_b)
+{
+   int swz_a, swz_b; /* extended swizzle of each operand's channel */
+   int sign_a, sign_b; /* sign mode of each operand's channel */
+   if (a->SrcRegister.File != b->SrcRegister.File)
+      return FALSE;
+   if (a->SrcRegister.Index != b->SrcRegister.Index)
+      return FALSE;
+   swz_a = tgsi_util_get_full_src_register_extswizzle(a, chan_a);
+   swz_b = tgsi_util_get_full_src_register_extswizzle(b, chan_b);
+   if (swz_a != swz_b)
+      return FALSE;
+   sign_a = tgsi_util_get_full_src_register_sign_mode(a, chan_a);
+   sign_b = tgsi_util_get_full_src_register_sign_mode(b, chan_b);
+   if (sign_a != sign_b)
+      return FALSE;
+   return TRUE; /* only the four properties above are compared */
+}
+
+
+/**
+ * Given a TGSI src register and channel index, return the PPC vector
+ * register containing the value.  A per-instruction cache (gen->regs[])
+ * avoids re-loading the same operand; release_src_vecs() empties it.
+ * \return index of PPC vector register with the desired src operand
+ */
+static int
+get_src_vec(struct gen_context *gen,
+            struct tgsi_full_instruction *inst, int src_reg, uint chan)
+{
+   const struct tgsi_full_src_register *src =
+      &inst->FullSrcRegisters[src_reg];
+   int vec;
+   uint i;
+
+   /* check the cache */
+   for (i = 0; i < gen->num_regs; i++) {
+      if (equal_src_locs(&gen->regs[i].src, gen->regs[i].chan, src, chan)) {
+         /* cache hit */
+         return gen->regs[i].vec;
+      }
+   }
+
+   /* cache miss: the new entry must fit, so check before writing it */
+   assert(gen->num_regs < Elements(gen->regs));
+
+   /* allocate new vec reg and emit fetch/load code */
+   vec = ppc_allocate_vec_register(gen->f);
+   gen->regs[gen->num_regs].src = *src;
+   gen->regs[gen->num_regs].chan = chan;
+   gen->regs[gen->num_regs].vec = vec;
+   gen->num_regs++;
+   emit_fetch(gen, vec, src, chan);
+   return vec;
+}
+
+
+/**
+ * Clear the src operand cache.  To be called at the end of each emit function.
+ */
+static void
+release_src_vecs(struct gen_context *gen)
+{
+   uint i;
+   for (i = 0; i < gen->num_regs; i++) {
+      ppc_release_vec_register(gen->f, gen->regs[i].vec); /* release each cached vec reg */
+   }
+   gen->num_regs = 0; /* cache is now empty */
+}
 
 
@@ -333,11 +423,10 @@ emit_store(struct gen_context *gen,
 static void
 emit_scalar_unaryop(struct gen_context *gen, struct tgsi_full_instruction *inst)
 {
-   int v0 = ppc_allocate_vec_register(gen->f);
-   int v1 = ppc_allocate_vec_register(gen->f);
+   int v0, v1 = ppc_allocate_vec_register(gen->f);
    uint chan_index;
 
-   FETCH(gen, *inst, v0, 0, CHAN_X);
+   v0 = get_src_vec(gen, inst, 0, CHAN_X);
 
    switch (inst->Instruction.Opcode) {
    case TGSI_OPCODE_RSQ:
@@ -355,7 +444,8 @@ emit_scalar_unaryop(struct gen_context *gen, struct tgsi_full_instruction *inst)
    FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
       STORE(gen, *inst, v1, 0, chan_index);
    }
-   ppc_release_vec_register(gen->f, v0);
+
+   release_src_vecs(gen);
    ppc_release_vec_register(gen->f, v1);
 }
 
@@ -363,10 +453,9 @@ emit_scalar_unaryop(struct gen_context *gen, struct tgsi_full_instruction *inst)
 static void
 emit_unaryop(struct gen_context *gen, struct tgsi_full_instruction *inst)
 {
-   int v0 = ppc_allocate_vec_register(gen->f);
    uint chan_index;
    FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
-      FETCH(gen, *inst, 0, 0, chan_index);   /* v0 = srcreg[0] */
+      int v0 = get_src_vec(gen, inst, 0, chan_index);   /* v0 = srcreg[0] */
       switch (inst->Instruction.Opcode) {
       case TGSI_OPCODE_ABS:
          /* turn off the most significant bit of each vector float word */
@@ -404,20 +493,30 @@ emit_unaryop(struct gen_context *gen, struct tgsi_full_instruction *inst)
       }
       STORE(gen, *inst, v0, 0, chan_index);   /* store v0 */
    }
-   ppc_release_vec_register(gen->f, v0);
+
+   release_src_vecs(gen);
 }
 
 
 static void
 emit_binop(struct gen_context *gen, struct tgsi_full_instruction *inst)
 {
-   int v0 = ppc_allocate_vec_register(gen->f);
-   int v1 = ppc_allocate_vec_register(gen->f);
-   int v2 = ppc_allocate_vec_register(gen->f);
-   uint chan_index;
-   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
-      FETCH(gen, *inst, v0, 0, chan_index);   /* v0 = srcreg[0] */
-      FETCH(gen, *inst, v1, 1, chan_index);   /* v1 = srcreg[1] */
+   int v2, zero_vec = -1;
+   uint chan;
+
+   if (inst->Instruction.Opcode == TGSI_OPCODE_MUL) {
+      zero_vec = ppc_allocate_vec_register(gen->f);
+      ppc_vzero(gen->f, zero_vec);
+   }
+
+   v2 = ppc_allocate_vec_register(gen->f);
+
+   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan) {
+      /* fetch src operands */
+      int v0 = get_src_vec(gen, inst, 0, chan);
+      int v1 = get_src_vec(gen, inst, 1, chan);
+
+      /* emit binop */
       switch (inst->Instruction.Opcode) {
       case TGSI_OPCODE_ADD:
          ppc_vaddfp(gen->f, v2, v0, v1);
@@ -426,8 +525,7 @@ emit_binop(struct gen_context *gen, struct tgsi_full_instruction *inst)
          ppc_vsubfp(gen->f, v2, v0, v1);
          break;
       case TGSI_OPCODE_MUL:
-         ppc_vxor(gen->f, v2, v2, v2);        /* v2 = {0, 0, 0, 0} */
-         ppc_vmaddfp(gen->f, v2, v0, v1, v2); /* v2 = v0 * v1 + v0 */
+         ppc_vmaddfp(gen->f, v2, v0, v1, zero_vec);
          break;
       case TGSI_OPCODE_MIN:
          ppc_vminfp(gen->f, v2, v0, v1);
@@ -438,11 +536,54 @@ emit_binop(struct gen_context *gen, struct tgsi_full_instruction *inst)
       default:
          assert(0);
       }
-      STORE(gen, *inst, v2, 0, chan_index);   /* store v2 */
+
+      /* store v2 */
+      STORE(gen, *inst, v2, 0, chan);
    }
-   ppc_release_vec_register(gen->f, v0);
-   ppc_release_vec_register(gen->f, v1);
+
    ppc_release_vec_register(gen->f, v2);
+
+   if (inst->Instruction.Opcode == TGSI_OPCODE_MUL)
+      ppc_release_vec_register(gen->f, zero_vec);
+
+   release_src_vecs(gen);
+}
+
+
+static void
+emit_triop(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+   int v3; /* scratch vec reg holding each channel's result */
+   uint chan;
+
+   v3 = ppc_allocate_vec_register(gen->f);
+
+   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan) {
+      /* fetch src operands (looked up in, or added to, the src cache) */
+      int v0 = get_src_vec(gen, inst, 0, chan);
+      int v1 = get_src_vec(gen, inst, 1, chan);
+      int v2 = get_src_vec(gen, inst, 2, chan);
+
+      /* emit ALU */
+      switch (inst->Instruction.Opcode) {
+      case TGSI_OPCODE_MAD:
+         ppc_vmaddfp(gen->f, v3, v0, v1, v2);   /* v3 = v0 * v1 + v2 */
+         break;
+      case TGSI_OPCODE_LRP:
+         ppc_vsubfp(gen->f, v3, v1, v2);        /* v3 = v1 - v2 */
+         ppc_vmaddfp(gen->f, v3, v0, v3, v2);   /* v3 = v0 * v3 + v2 */
+         break;
+      default: /* only MAD and LRP are handled here */
+         assert(0);
+      }
+
+      /* store v3 */
+      STORE(gen, *inst, v3, 0, chan);
+   }
+
+   ppc_release_vec_register(gen->f, v3);
+
+   release_src_vecs(gen); /* release this instruction's cached operands */
 }
 
 
@@ -452,16 +593,17 @@ emit_binop(struct gen_context *gen, struct tgsi_full_instruction *inst)
 static void
 emit_inequality(struct gen_context *gen, struct tgsi_full_instruction *inst)
 {
-   int v0 = ppc_allocate_vec_register(gen->f);
-   int v1 = ppc_allocate_vec_register(gen->f);
-   int v2 = ppc_allocate_vec_register(gen->f);
-   uint chan_index;
-   boolean complement = FALSE;
+   int v2;
+   uint chan;
    int one_vec = gen_one_vec(gen);
 
-   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
-      FETCH(gen, *inst, v0, 0, chan_index);   /* v0 = srcreg[0] */
-      FETCH(gen, *inst, v1, 1, chan_index);   /* v1 = srcreg[1] */
+   v2 = ppc_allocate_vec_register(gen->f);
+
+   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan) {
+      /* fetch src operands */
+      int v0 = get_src_vec(gen, inst, 0, chan);
+      int v1 = get_src_vec(gen, inst, 1, chan);
+      boolean complement = FALSE;
 
       switch (inst->Instruction.Opcode) {
       case TGSI_OPCODE_SNE:
@@ -495,89 +637,58 @@ emit_inequality(struct gen_context *gen, struct tgsi_full_instruction *inst)
       else
          ppc_vand(gen->f, v2, one_vec, v2);     /* v2 = one_vec & v2 */
 
-      STORE(gen, *inst, v2, 0, chan_index);   /* store v2 */
+      /* store v2 */
+      STORE(gen, *inst, v2, 0, chan);
    }
 
-   ppc_release_vec_register(gen->f, v0);
-   ppc_release_vec_register(gen->f, v1);
    ppc_release_vec_register(gen->f, v2);
+
+   release_src_vecs(gen);
 }
 
 
 static void
 emit_dotprod(struct gen_context *gen, struct tgsi_full_instruction *inst)
 {
-   int v0 = ppc_allocate_vec_register(gen->f);
-   int v1 = ppc_allocate_vec_register(gen->f);
-   int v2 = ppc_allocate_vec_register(gen->f);
+   int v0, v1, v2;
    uint chan_index;
 
+   v2 = ppc_allocate_vec_register(gen->f);
+
    ppc_vxor(gen->f, v2, v2, v2);           /* v2 = {0, 0, 0, 0} */
 
-   FETCH(gen, *inst, v0, 0, CHAN_X);       /* v0 = src0.XXXX */
-   FETCH(gen, *inst, v1, 1, CHAN_X);       /* v1 = src1.XXXX */
+   v0 = get_src_vec(gen, inst, 0, CHAN_X); /* v0 = src0.XXXX */
+   v1 = get_src_vec(gen, inst, 1, CHAN_X); /* v1 = src1.XXXX */
    ppc_vmaddfp(gen->f, v2, v0, v1, v2);    /* v2 = v0 * v1 + v2 */
 
-   FETCH(gen, *inst, v0, 0, CHAN_Y);       /* v0 = src0.YYYY */
-   FETCH(gen, *inst, v1, 1, CHAN_Y);       /* v1 = src1.YYYY */
+   v0 = get_src_vec(gen, inst, 0, CHAN_Y); /* v0 = src0.YYYY */
+   v1 = get_src_vec(gen, inst, 1, CHAN_Y); /* v1 = src1.YYYY */
    ppc_vmaddfp(gen->f, v2, v0, v1, v2);    /* v2 = v0 * v1 + v2 */
 
-   FETCH(gen, *inst, v0, 0, CHAN_Z);       /* v0 = src0.ZZZZ */
-   FETCH(gen, *inst, v1, 1, CHAN_Z);       /* v1 = src1.ZZZZ */
+   v0 = get_src_vec(gen, inst, 0, CHAN_Z); /* v0 = src0.ZZZZ */
+   v1 = get_src_vec(gen, inst, 1, CHAN_Z); /* v1 = src1.ZZZZ */
    ppc_vmaddfp(gen->f, v2, v0, v1, v2);    /* v2 = v0 * v1 + v2 */
 
    if (inst->Instruction.Opcode == TGSI_OPCODE_DP4) {
-      FETCH(gen, *inst, v0, 0, CHAN_W);    /* v0 = src0.WWWW */
-      FETCH(gen, *inst, v1, 1, CHAN_W);    /* v1 = src1.WWWW */
-      ppc_vmaddfp(gen->f, v2, v0, v1, v2); /* v2 = v0 * v1 + v2 */
+      v0 = get_src_vec(gen, inst, 0, CHAN_W); /* v0 = src0.WWWW */
+      v1 = get_src_vec(gen, inst, 1, CHAN_W); /* v1 = src1.WWWW */
+      ppc_vmaddfp(gen->f, v2, v0, v1, v2);    /* v2 = v0 * v1 + v2 */
    }
    else if (inst->Instruction.Opcode == TGSI_OPCODE_DPH) {
-      FETCH(gen, *inst, v1, 1, CHAN_W);    /* v1 = src1.WWWW */
-      ppc_vaddfp(gen->f, v2, v2, v1);      /* v2 = v2 + v1 */
+      v1 = get_src_vec(gen, inst, 1, CHAN_W); /* v1 = src1.WWWW */
+      ppc_vaddfp(gen->f, v2, v2, v1);         /* v2 = v2 + v1 */
    }
 
    FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
       STORE(gen, *inst, v2, 0, chan_index);  /* store v2 */
    }
-   ppc_release_vec_register(gen->f, v0);
-   ppc_release_vec_register(gen->f, v1);
-   ppc_release_vec_register(gen->f, v2);
-}
 
+   release_src_vecs(gen);
 
-static void
-emit_triop(struct gen_context *gen, struct tgsi_full_instruction *inst)
-{
-   int v0 = ppc_allocate_vec_register(gen->f);
-   int v1 = ppc_allocate_vec_register(gen->f);
-   int v2 = ppc_allocate_vec_register(gen->f);
-   int v3 = ppc_allocate_vec_register(gen->f);
-   uint chan_index;
-   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
-      FETCH(gen, *inst, v0, 0, chan_index);   /* v0 = srcreg[0] */
-      FETCH(gen, *inst, v1, 1, chan_index);   /* v1 = srcreg[1] */
-      FETCH(gen, *inst, v2, 2, chan_index);   /* v2 = srcreg[2] */
-      switch (inst->Instruction.Opcode) {
-      case TGSI_OPCODE_MAD:
-         ppc_vmaddfp(gen->f, v3, v0, v1, v2);   /* v3 = v0 * v1 + v2 */
-         break;
-      case TGSI_OPCODE_LRP:
-         ppc_vsubfp(gen->f, v3, v1, v2);        /* v3 = v1 - v2 */
-         ppc_vmaddfp(gen->f, v3, v0, v3, v2);   /* v3 = v0 * v3 + v2 */
-         break;
-      default:
-         assert(0);
-      }
-      STORE(gen, *inst, v3, 0, chan_index);   /* store v3 */
-   }
-   ppc_release_vec_register(gen->f, v0);
-   ppc_release_vec_register(gen->f, v1);
    ppc_release_vec_register(gen->f, v2);
-   ppc_release_vec_register(gen->f, v3);
 }
 
 
-
 /** Approximation for vr = pow(va, vb) */
 static void
 ppc_vec_pow(struct ppc_function *f, int vr, int va, int vb)
@@ -610,10 +721,10 @@ emit_lit(struct gen_context *gen, struct tgsi_full_instruction *inst)
    /* Compute Y, Z */
    if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y) ||
        IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
-      int x_vec = ppc_allocate_vec_register(gen->f);
+      int x_vec;
       int zero_vec = ppc_allocate_vec_register(gen->f);
 
-      FETCH(gen, *inst, x_vec, 0, CHAN_X);        /* x_vec = src[0].x */
+      x_vec = get_src_vec(gen, inst, 0, CHAN_X);  /* x_vec = src[0].x */
 
       ppc_vzero(gen->f, zero_vec);                /* zero = {0,0,0,0} */
       ppc_vmaxfp(gen->f, x_vec, x_vec, zero_vec); /* x_vec = max(x_vec, 0) */
@@ -623,18 +734,16 @@ emit_lit(struct gen_context *gen, struct tgsi_full_instruction *inst)
       }
 
       if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
-         int y_vec = ppc_allocate_vec_register(gen->f);
-         int z_vec = ppc_allocate_vec_register(gen->f);
-         int w_vec = ppc_allocate_vec_register(gen->f);
+         int y_vec, z_vec, w_vec;
          int pow_vec = ppc_allocate_vec_register(gen->f);
          int pos_vec = ppc_allocate_vec_register(gen->f);
          int p128_vec = ppc_allocate_vec_register(gen->f);
          int n128_vec = ppc_allocate_vec_register(gen->f);
 
-         FETCH(gen, *inst, y_vec, 0, CHAN_Y);        /* y_vec = src[0].y */
+         y_vec = get_src_vec(gen, inst, 0, CHAN_Y);  /* y_vec = src[0].y */
          ppc_vmaxfp(gen->f, y_vec, y_vec, zero_vec); /* y_vec = max(y_vec, 0) */
 
-         FETCH(gen, *inst, w_vec, 0, CHAN_W);        /* w_vec = src[0].w */
+         w_vec = get_src_vec(gen, inst, 0, CHAN_W);  /* w_vec = src[0].w */
 
          /* clamp Y to [-128, 128] */
          load_constant_vec(gen, p128_vec, 128.0f);
@@ -653,16 +762,12 @@ emit_lit(struct gen_context *gen, struct tgsi_full_instruction *inst)
 
          STORE(gen, *inst, z_vec, 0, CHAN_Z);             /* store Z */
 
-         ppc_release_vec_register(gen->f, y_vec);
-         ppc_release_vec_register(gen->f, z_vec);
-         ppc_release_vec_register(gen->f, w_vec);
          ppc_release_vec_register(gen->f, pow_vec);
          ppc_release_vec_register(gen->f, pos_vec);
          ppc_release_vec_register(gen->f, p128_vec);
          ppc_release_vec_register(gen->f, n128_vec);
       }
 
-      ppc_release_vec_register(gen->f, x_vec);
       ppc_release_vec_register(gen->f, zero_vec);
    }
 
@@ -670,6 +775,8 @@ emit_lit(struct gen_context *gen, struct tgsi_full_instruction *inst)
    if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W)) {
       STORE(gen, *inst, one_vec, 0, CHAN_W);
    }
+
+   release_src_vecs(gen);
 }
 
 
@@ -723,11 +830,10 @@ emit_instruction(struct gen_context *gen,
    default:
       return 0;
    }
-
-   
    return 1;
 }
 
+
 static void
 emit_declaration(
    struct ppc_function *func,
@@ -805,6 +911,7 @@ emit_epilogue(struct ppc_function *func)
 {
    ppc_return(func);
    /* XXX restore prev stack frame */
+   debug_printf("PPC: Emitted %u instructions\n", func->num_inst);
 }
 
 
@@ -839,6 +946,7 @@ tgsi_emit_ppc(const struct tgsi_token *tokens,
 
    util_init_math();
 
+   memset(&gen, 0, sizeof(gen));
    gen.f = func;
    gen.inputs_reg = ppc_reserve_register(func, 3);   /* first function param */
    gen.outputs_reg = ppc_reserve_register(func, 4);  /* second function param */
-- 
cgit v1.2.3


From a1754424b6597219f436091dec1de4713719c4b8 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Mon, 27 Oct 2008 15:58:00 -0600
Subject: gallium: ppc: emit fewer 'li' instructions prior to vector
 loads/stores

---
 src/gallium/auxiliary/tgsi/tgsi_ppc.c | 106 ++++++++++++++++++++++++----------
 1 file changed, 75 insertions(+), 31 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.c b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
index 000ddba935..06b7a41b1b 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ppc.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
@@ -95,6 +95,9 @@ struct gen_context
    int const_reg;     /**< GP register pointing to constants buffer */
    int builtins_reg;  /**< GP register pointint to built-in constants */
 
+   int offset_reg;    /**< used to reduce redundant li instructions */
+   int offset_value;
+
    int one_vec;       /**< vector register with {1.0, 1.0, 1.0, 1.0} */
    int bit31_vec;     /**< vector register with {1<<31, 1<<31, 1<<31, 1<<31} */
 
@@ -112,6 +115,70 @@ struct gen_context
 };
 
 
+/**
+ * Initialize code generation context: zero it and reserve GP registers 3..8.
+ */
+static void
+init_gen_context(struct gen_context *gen, struct ppc_function *func)
+{
+   memset(gen, 0, sizeof(*gen)); /* also leaves the src cache empty (num_regs = 0) */
+   gen->f = func;
+   gen->inputs_reg = ppc_reserve_register(func, 3);   /* first function param */
+   gen->outputs_reg = ppc_reserve_register(func, 4);  /* second function param */
+   gen->temps_reg = ppc_reserve_register(func, 5);    /* ... */
+   gen->immed_reg = ppc_reserve_register(func, 6);
+   gen->const_reg = ppc_reserve_register(func, 7);
+   gen->builtins_reg = ppc_reserve_register(func, 8);
+   gen->one_vec = -1; /* presumably "not allocated yet" -- confirm */
+   gen->bit31_vec = -1; /* presumably "not allocated yet" -- confirm */
+   gen->offset_reg = -1; /* allocated on demand by emit_li_offset() */
+   gen->offset_value = -9999999; /* sentinel: forces the first 'li' to be emitted */
+}
+
+
+/**
+ * All PPC vector load/store instructions form an effective address
+ * by adding the contents of two registers.  For example:
+ *    lvx v2,r8,r9   # v2 = memory[r8 + r9]
+ *    stvx v2,r8,r9  # memory[r8 + r9] = v2;
+ * So our lvx/stvx instructions are typically preceded by an 'li' instruction
+ * to load r9 (above) with an immediate (an offset).
+ * This code emits that 'li' instruction, but only if the offset differs from
+ * the previous 'li' (this seems to save about 10% in the instruction count).
+ * An 'li' must be emitted unconditionally at the top of basic blocks (such
+ * as inside loops); reset_li_offset() forces that.
+ * \return the GP register that holds the offset
+ */
+static int
+emit_li_offset(struct gen_context *gen, int offset)
+{
+   if (gen->offset_reg <= 0) { /* NOTE(review): initial value is -1; "<= 0" also treats register 0 as unset -- confirm */
+      /* allocate a GP register for storing load/store offset */
+      gen->offset_reg = ppc_allocate_register(gen->f);
+   }
+
+   /* emit new 'li' if offset is changing (a negative cached value means none is valid) */
+   if (gen->offset_value < 0 || gen->offset_value != offset) {
+      gen->offset_value = offset;
+      ppc_li(gen->f, gen->offset_reg, offset);
+   }
+
+   return gen->offset_reg;
+}
+
+
+/**
+ * Forces subsequent emit_li_offset() calls to emit an 'li'.
+ * To be called at the top of basic blocks.
+ */
+static void
+reset_li_offset(struct gen_context *gen)
+{
+   gen->offset_value = -9999999; /* sentinel: no offset currently loaded */
+}
+
+
+
 /**
  * Load the given vector register with {value, value, value, value}.
  * The value must be in the ppu_builtin_constants[] array.
@@ -124,10 +191,9 @@ load_constant_vec(struct gen_context *gen, int dst_vec, float value)
    uint pos;
    for (pos = 0; pos < Elements(ppc_builtin_constants); pos++) {
       if (ppc_builtin_constants[pos] == value) {
-         int offset_reg = ppc_allocate_register(gen->f);
          int offset = pos * 4;
+         int offset_reg = emit_li_offset(gen, offset);
 
-         ppc_li(gen->f, offset_reg, offset);
          /* Load 4-byte word into vector register.
           * The vector slot depends on the effective address we load from.
           * We know that our builtins start at a 16-byte boundary so we
@@ -137,7 +203,6 @@ load_constant_vec(struct gen_context *gen, int dst_vec, float value)
          ppc_lvewx(gen->f, dst_vec, gen->builtins_reg, offset_reg);
          /* splat word[pos % 4] across the vector reg */
          ppc_vspltw(gen->f, dst_vec, dst_vec, pos % 4);
-         ppc_release_register(gen->f, offset_reg);
          return;
       }
    }
@@ -192,36 +257,29 @@ emit_fetch(struct gen_context *gen,
       switch (reg->SrcRegister.File) {
       case TGSI_FILE_INPUT:
          {
-            int offset_reg = ppc_allocate_register(gen->f);
             int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16;
-            ppc_li(gen->f, offset_reg, offset);
+            int offset_reg = emit_li_offset(gen, offset);
             ppc_lvx(gen->f, dst_vec, gen->inputs_reg, offset_reg);
-            ppc_release_register(gen->f, offset_reg);
          }
          break;
       case TGSI_FILE_TEMPORARY:
          {
-            int offset_reg = ppc_allocate_register(gen->f);
             int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16;
-            ppc_li(gen->f, offset_reg, offset);
+            int offset_reg = emit_li_offset(gen, offset);
             ppc_lvx(gen->f, dst_vec, gen->temps_reg, offset_reg);
-            ppc_release_register(gen->f, offset_reg);
          }
          break;
       case TGSI_FILE_IMMEDIATE:
          {
-            int offset_reg = ppc_allocate_register(gen->f);
             int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16;
-            ppc_li(gen->f, offset_reg, offset);
+            int offset_reg = emit_li_offset(gen, offset);
             ppc_lvx(gen->f, dst_vec, gen->immed_reg, offset_reg);
-            ppc_release_register(gen->f, offset_reg);
          }
          break;
       case TGSI_FILE_CONSTANT:
          {
-            int offset_reg = ppc_allocate_register(gen->f);
             int offset = (reg->SrcRegister.Index * 4 + swizzle) * 4;
-            ppc_li(gen->f, offset_reg, offset);
+            int offset_reg = emit_li_offset(gen, offset);
             /* Load 4-byte word into vector register.
              * The vector slot depends on the effective address we load from.
              * We know that our constants start at a 16-byte boundary so we
@@ -231,7 +289,6 @@ emit_fetch(struct gen_context *gen,
             ppc_lvewx(gen->f, dst_vec, gen->const_reg, offset_reg);
             /* splat word[swizzle] across the vector reg */
             ppc_vspltw(gen->f, dst_vec, dst_vec, swizzle);
-            ppc_release_register(gen->f, offset_reg);
          }
          break;
       default:
@@ -369,20 +426,16 @@ emit_store(struct gen_context *gen,
    switch (reg->DstRegister.File) {
    case TGSI_FILE_OUTPUT:
       {
-         int offset_reg = ppc_allocate_register(gen->f);
          int offset = (reg->DstRegister.Index * 4 + chan_index) * 16;
-         ppc_li(gen->f, offset_reg, offset);
+         int offset_reg = emit_li_offset(gen, offset);
          ppc_stvx(gen->f, src_vec, gen->outputs_reg, offset_reg);
-         ppc_release_register(gen->f, offset_reg);
       }
       break;
    case TGSI_FILE_TEMPORARY:
       {
-         int offset_reg = ppc_allocate_register(gen->f);
          int offset = (reg->DstRegister.Index * 4 + chan_index) * 16;
-         ppc_li(gen->f, offset_reg, offset);
+         int offset_reg = emit_li_offset(gen, offset);
          ppc_stvx(gen->f, src_vec, gen->temps_reg, offset_reg);
-         ppc_release_register(gen->f, offset_reg);
       }
       break;
 #if 0
@@ -946,16 +999,7 @@ tgsi_emit_ppc(const struct tgsi_token *tokens,
 
    util_init_math();
 
-   memset(&gen, 0, sizeof(gen));
-   gen.f = func;
-   gen.inputs_reg = ppc_reserve_register(func, 3);   /* first function param */
-   gen.outputs_reg = ppc_reserve_register(func, 4);  /* second function param */
-   gen.temps_reg = ppc_reserve_register(func, 5);    /* ... */
-   gen.immed_reg = ppc_reserve_register(func, 6);
-   gen.const_reg = ppc_reserve_register(func, 7);
-   gen.builtins_reg = ppc_reserve_register(func, 8);
-   gen.one_vec = -1;
-   gen.bit31_vec = -1;
+   init_gen_context(&gen, func);
 
    emit_prologue(func);
 
-- 
cgit v1.2.3


From d01324eb78da2d501ce33e2792713225090c84cd Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Mon, 27 Oct 2008 18:25:33 -0600
Subject: cell: fix some problems when displaying to a
 PIPE_FORMAT_B8G8R8A8_UNORM screen

---
 src/gallium/drivers/cell/ppu/cell_texture.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/cell/ppu/cell_texture.c b/src/gallium/drivers/cell/ppu/cell_texture.c
index 8ae4439f6c..7734381c7e 100644
--- a/src/gallium/drivers/cell/ppu/cell_texture.c
+++ b/src/gallium/drivers/cell/ppu/cell_texture.c
@@ -310,6 +310,7 @@ cell_twiddle_texture(struct pipe_screen *screen,
 
    switch (ct->base.format) {
    case PIPE_FORMAT_A8R8G8B8_UNORM:
+   case PIPE_FORMAT_B8G8R8A8_UNORM:
       {
          int numFaces = ct->base.target == PIPE_TEXTURE_CUBE ? 6 : 1;
          int offset = bufWidth * bufHeight * 4 * surface->face;
@@ -357,6 +358,7 @@ cell_untwiddle_texture(struct pipe_screen *screen,
 
    switch (ct->base.format) {
    case PIPE_FORMAT_A8R8G8B8_UNORM:
+   case PIPE_FORMAT_B8G8R8A8_UNORM:
       {
          int numFaces = ct->base.target == PIPE_TEXTURE_CUBE ? 6 : 1;
          int offset = surface->stride * texHeight * 4 * surface->face;
@@ -442,6 +444,7 @@ cell_tex_surface_release(struct pipe_screen *screen,
 
    if ((*s)->usage & PIPE_BUFFER_USAGE_CPU_READ) {
       align_free(ct->untiled_data[level]);
+      ct->untiled_data[level] = NULL;
    }
 
    /* XXX if done rendering to teximage, re-tile */
-- 
cgit v1.2.3


From 52e6fbb655f138f70670abdd365258873a78dabf Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Tue, 28 Oct 2008 16:28:56 +0000
Subject: gallium: recognize DEBUG as well as DBG for debugging

---
 src/gallium/include/pipe/p_debug.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src/gallium')

diff --git a/src/gallium/include/pipe/p_debug.h b/src/gallium/include/pipe/p_debug.h
index cb6196aa9f..3b00fb9aa8 100644
--- a/src/gallium/include/pipe/p_debug.h
+++ b/src/gallium/include/pipe/p_debug.h
@@ -49,7 +49,7 @@ extern "C" {
 #endif
 
 
-#ifdef DBG
+#if defined(DBG) || defined(DEBUG)
 #ifndef DEBUG
 #define DEBUG 1
 #endif
-- 
cgit v1.2.3


From 57487590871d523dd6044ad214dafde04dd799f0 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Tue, 28 Oct 2008 12:41:47 -0600
Subject: cell: don't include libmisc.h

Doesn't seem to be needed and fixes compilation with SDK 3.1 beta.
---
 src/gallium/drivers/cell/ppu/cell_spu.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/cell/ppu/cell_spu.h b/src/gallium/drivers/cell/ppu/cell_spu.h
index b633880c25..c93958a9ed 100644
--- a/src/gallium/drivers/cell/ppu/cell_spu.h
+++ b/src/gallium/drivers/cell/ppu/cell_spu.h
@@ -30,7 +30,6 @@
 
 
 #include <libspe2.h>
-#include <libmisc.h>
 #include <pthread.h>
 #include "cell/common.h"
 
-- 
cgit v1.2.3


From c46583416a749f2e7f76a1eaadb54a8b9e76fb11 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Tue, 28 Oct 2008 13:17:48 -0600
Subject: gallium: use some PPC vec registers to store TGSI temps

This could be a lot better, but already makes for better code.
---
 src/gallium/auxiliary/tgsi/tgsi_ppc.c | 184 ++++++++++++++++++++++------------
 1 file changed, 122 insertions(+), 62 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.c b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
index 06b7a41b1b..0de9b972b4 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ppc.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
@@ -40,6 +40,7 @@
 #include "util/u_sse.h"
 #include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_util.h"
+#include "tgsi_dump.h"
 #include "tgsi_exec.h"
 #include "tgsi_ppc.h"
 #include "rtasm/rtasm_ppc.h"
@@ -73,6 +74,12 @@ const float ppc_builtin_constants[] ALIGN16_ATTRIB = {
 #define CHAN_W 3
 
 
+/**
+ * How many TGSI temps should be implemented with real PPC vector registers
+ * rather than memory.
+ */
+#define MAX_PPC_TEMPS 4
+
 
 struct reg_chan_vec
 {
@@ -101,6 +108,12 @@ struct gen_context
    int one_vec;       /**< vector register with {1.0, 1.0, 1.0, 1.0} */
    int bit31_vec;     /**< vector register with {1<<31, 1<<31, 1<<31, 1<<31} */
 
+   /**
+    * Map TGSI temps to PPC vector temps.
+    * We have 32 PPC vector regs.  Use 16 of them for storing 4 TGSI temps.
+    * XXX currently only do this for TGSI temps [0..MAX_PPC_TEMPS-1].
+    */
+   int temps_map[MAX_PPC_TEMPS][4];
 
    /**
     * Cache of src registers.
@@ -121,6 +134,8 @@ struct gen_context
 static void
 init_gen_context(struct gen_context *gen, struct ppc_function *func)
 {
+   uint i;
+
    memset(gen, 0, sizeof(*gen));
    gen->f = func;
    gen->inputs_reg = ppc_reserve_register(func, 3);   /* first function param */
@@ -133,6 +148,12 @@ init_gen_context(struct gen_context *gen, struct ppc_function *func)
    gen->bit31_vec = -1;
    gen->offset_reg = -1;
    gen->offset_value = -9999999;
+   for (i = 0; i < MAX_PPC_TEMPS; i++) {
+      gen->temps_map[i][0] = ppc_allocate_vec_register(gen->f);
+      gen->temps_map[i][1] = ppc_allocate_vec_register(gen->f);
+      gen->temps_map[i][2] = ppc_allocate_vec_register(gen->f);
+      gen->temps_map[i][3] = ppc_allocate_vec_register(gen->f);
+   }
 }
 
 
@@ -171,7 +192,7 @@ emit_li_offset(struct gen_context *gen, int offset)
  * Forces subsequent emit_li_offset() calls to emit an 'li'.
  * To be called at the top of basic blocks.
  */
-static int
+static void
 reset_li_offset(struct gen_context *gen)
 {
    gen->offset_value = -9999999;
@@ -239,15 +260,15 @@ gen_get_bit31_vec(struct gen_context *gen)
 
 
 /**
- * Register fetch, put result in 'dst_vec'.
+ * Register fetch.  Return PPC vector register with result.
  */
-static void
+static int
 emit_fetch(struct gen_context *gen,
-           unsigned dst_vec,
            const struct tgsi_full_src_register *reg,
            const unsigned chan_index)
 {
    uint swizzle = tgsi_util_get_full_src_register_extswizzle(reg, chan_index);
+   int dst_vec = -1;
 
    switch (swizzle) {
    case TGSI_EXTSWIZZLE_X:
@@ -259,13 +280,20 @@ emit_fetch(struct gen_context *gen,
          {
             int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16;
             int offset_reg = emit_li_offset(gen, offset);
+            dst_vec = ppc_allocate_vec_register(gen->f);
             ppc_lvx(gen->f, dst_vec, gen->inputs_reg, offset_reg);
          }
          break;
       case TGSI_FILE_TEMPORARY:
-         {
+         if (reg->SrcRegister.Index < MAX_PPC_TEMPS) {
+            /* use PPC vec register */
+            dst_vec = gen->temps_map[reg->SrcRegister.Index][swizzle];
+         }
+         else {
+            /* use memory-based temp register "file" */
             int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16;
             int offset_reg = emit_li_offset(gen, offset);
+            dst_vec = ppc_allocate_vec_register(gen->f);
             ppc_lvx(gen->f, dst_vec, gen->temps_reg, offset_reg);
          }
          break;
@@ -273,6 +301,7 @@ emit_fetch(struct gen_context *gen,
          {
             int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16;
             int offset_reg = emit_li_offset(gen, offset);
+            dst_vec = ppc_allocate_vec_register(gen->f);
             ppc_lvx(gen->f, dst_vec, gen->immed_reg, offset_reg);
          }
          break;
@@ -280,6 +309,7 @@ emit_fetch(struct gen_context *gen,
          {
             int offset = (reg->SrcRegister.Index * 4 + swizzle) * 4;
             int offset_reg = emit_li_offset(gen, offset);
+            dst_vec = ppc_allocate_vec_register(gen->f);
             /* Load 4-byte word into vector register.
              * The vector slot depends on the effective address we load from.
              * We know that our constants start at a 16-byte boundary so we
@@ -301,6 +331,7 @@ emit_fetch(struct gen_context *gen,
    case TGSI_EXTSWIZZLE_ONE:
       {
          int one_vec = gen_one_vec(gen);
+         dst_vec = ppc_allocate_vec_register(gen->f);
          ppc_vmove(gen->f, dst_vec, one_vec);
       }
       break;
@@ -308,6 +339,8 @@ emit_fetch(struct gen_context *gen,
       assert( 0 );
    }
 
+   assert(dst_vec >= 0);
+
    {
       uint sign_op = tgsi_util_get_full_src_register_sign_mode(reg, chan_index);
       if (sign_op != TGSI_UTIL_SIGN_KEEP) {
@@ -331,6 +364,8 @@ emit_fetch(struct gen_context *gen,
          }
       }
    }
+
+   return dst_vec;
 }
 
 
@@ -380,20 +415,22 @@ get_src_vec(struct gen_context *gen,
    for (i = 0; i < gen->num_regs; i++) {
       if (equal_src_locs(&gen->regs[i].src, gen->regs[i].chan, src, chan)) {
          /* cache hit */
+         assert(gen->regs[i].vec >= 0);
          return gen->regs[i].vec;
       }
    }
 
    /* cache miss: allocate new vec reg and emit fetch/load code */
-   vec = ppc_allocate_vec_register(gen->f);
+   vec = emit_fetch(gen, src, chan);
    gen->regs[gen->num_regs].src = *src;
    gen->regs[gen->num_regs].chan = chan;
    gen->regs[gen->num_regs].vec = vec;
    gen->num_regs++;
-   emit_fetch(gen, vec, src, chan);
 
    assert(gen->num_regs <= Elements(gen->regs));
 
+   assert(vec >= 0);
+
    return vec;
 }
 
@@ -406,23 +443,48 @@ release_src_vecs(struct gen_context *gen)
 {
    uint i;
    for (i = 0; i < gen->num_regs; i++) {
-      ppc_release_vec_register(gen->f, gen->regs[i].vec);
+      const const struct tgsi_full_src_register src = gen->regs[i].src;
+      if (!(src.SrcRegister.File == TGSI_FILE_TEMPORARY &&
+            src.SrcRegister.Index < MAX_PPC_TEMPS)) {
+         ppc_release_vec_register(gen->f, gen->regs[i].vec);
+      }
    }
    gen->num_regs = 0;
 }
 
 
+static int
+get_dst_vec(struct gen_context *gen, 
+            const struct tgsi_full_instruction *inst,
+            unsigned chan_index)
+{
+   const struct tgsi_full_dst_register *reg = &inst->FullDstRegisters[0];
+
+   if (reg->DstRegister.File == TGSI_FILE_TEMPORARY &&
+       reg->DstRegister.Index < MAX_PPC_TEMPS) {
+      int vec = gen->temps_map[reg->DstRegister.Index][chan_index];
+      return vec;
+   }
+   else {
+      return ppc_allocate_vec_register(gen->f);
+   }
+}
+
+
 /**
  * Register store.  Store 'src_vec' at location indicated by 'reg'.
+ * \param free_vec  Should the src_vec be released when done?
  */
 static void
 emit_store(struct gen_context *gen,
-           unsigned src_vec,
-           const struct tgsi_full_dst_register *reg,
+           int src_vec,
            const struct tgsi_full_instruction *inst,
-           unsigned chan_index)
+           unsigned chan_index,
+           boolean free_vec)
 {
+   const struct tgsi_full_dst_register *reg = &inst->FullDstRegisters[0];
+
    switch (reg->DstRegister.File) {
    case TGSI_FILE_OUTPUT:
       {
@@ -432,7 +494,15 @@ emit_store(struct gen_context *gen,
       }
       break;
    case TGSI_FILE_TEMPORARY:
-      {
+      if (reg->DstRegister.Index < MAX_PPC_TEMPS) {
+         if (!free_vec) {
+            int dst_vec = gen->temps_map[reg->DstRegister.Index][chan_index];
+            if (dst_vec != src_vec)
+               ppc_vmove(gen->f, dst_vec, src_vec);
+         }
+         free_vec = FALSE;
+      }
+      else {
          int offset = (reg->DstRegister.Index * 4 + chan_index) * 16;
          int offset_reg = emit_li_offset(gen, offset);
          ppc_stvx(gen->f, src_vec, gen->temps_reg, offset_reg);
@@ -465,21 +535,20 @@ emit_store(struct gen_context *gen,
       break;
    }
 #endif
-}
-
-
-#define STORE( GEN, INST, XMM, INDEX, CHAN )\
-   emit_store( GEN, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )
 
+   if (free_vec)
+      ppc_release_vec_register(gen->f, src_vec);
+}
 
 
 static void
 emit_scalar_unaryop(struct gen_context *gen, struct tgsi_full_instruction *inst)
 {
-   int v0, v1 = ppc_allocate_vec_register(gen->f);
+   int v0, v1;
    uint chan_index;
 
    v0 = get_src_vec(gen, inst, 0, CHAN_X);
+   v1 = ppc_allocate_vec_register(gen->f);
 
    switch (inst->Instruction.Opcode) {
    case TGSI_OPCODE_RSQ:
@@ -495,7 +564,7 @@ emit_scalar_unaryop(struct gen_context *gen, struct tgsi_full_instruction *inst)
    }
 
    FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-      STORE(gen, *inst, v1, 0, chan_index);
+      emit_store(gen, v1, inst, chan_index, FALSE);
    }
 
    release_src_vecs(gen);
@@ -509,42 +578,37 @@ emit_unaryop(struct gen_context *gen, struct tgsi_full_instruction *inst)
    uint chan_index;
    FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
       int v0 = get_src_vec(gen, inst, 0, chan_index);   /* v0 = srcreg[0] */
+      int v1 = get_dst_vec(gen, inst, chan_index);
       switch (inst->Instruction.Opcode) {
       case TGSI_OPCODE_ABS:
          /* turn off the most significant bit of each vector float word */
          {
-            int v1 = ppc_allocate_vec_register(gen->f);
-            ppc_vspltisw(gen->f, v1, -1);  /* v1 = {-1, -1, -1, -1} */
-            ppc_vslw(gen->f, v1, v1, v1);  /* v1 = {1<<31, 1<<31, 1<<31, 1<<31} */
-            ppc_vandc(gen->f, v0, v0, v1); /* v0 = v0 & ~v1 */
-            ppc_release_vec_register(gen->f, v1);
+            int bit31_vec = gen_get_bit31_vec(gen);
+            ppc_vandc(gen->f, v1, v0, bit31_vec); /* v1 = v0 & ~bit31 */
          }
          break;
       case TGSI_OPCODE_FLOOR:
-         ppc_vrfim(gen->f, v0, v0);         /* v0 = floor(v0) */
+         ppc_vrfim(gen->f, v1, v0);         /* v1 = floor(v0) */
          break;
       case TGSI_OPCODE_FRAC:
-         {
-            int v1 = ppc_allocate_vec_register(gen->f);
-            ppc_vrfim(gen->f, v1, v0);         /* v1 = floor(v0) */
-            ppc_vsubfp(gen->f, v0, v0, v1);    /* v0 = v0 - v1 */
-            ppc_release_vec_register(gen->f, v1);
-         }
+         ppc_vrfim(gen->f, v1, v0);      /* tmp = floor(v0) */
+         ppc_vsubfp(gen->f, v1, v0, v1); /* v1 = v0 - v1 */
          break;
       case TGSI_OPCODE_EXPBASE2:
-         ppc_vexptefp(gen->f, v0, v0);      /* v0 = 2^v0 */
+         ppc_vexptefp(gen->f, v1, v0);     /* v1 = 2^v0 */
          break;
       case TGSI_OPCODE_LOGBASE2:
          /* XXX this may be broken! */
-         ppc_vlogefp(gen->f, v0, v0);      /* v0 = log2(v0) */
+         ppc_vlogefp(gen->f, v1, v0);      /* v1 = log2(v0) */
          break;
       case TGSI_OPCODE_MOV:
-         /* nothing */
+         if (v0 != v1)
+            ppc_vmove(gen->f, v1, v0);
          break;
       default:
          assert(0);
       }
-      STORE(gen, *inst, v0, 0, chan_index);   /* store v0 */
+      emit_store(gen, v1, inst, chan_index, TRUE);  /* store v0 */
    }
 
    release_src_vecs(gen);
@@ -554,7 +618,7 @@ emit_unaryop(struct gen_context *gen, struct tgsi_full_instruction *inst)
 static void
 emit_binop(struct gen_context *gen, struct tgsi_full_instruction *inst)
 {
-   int v2, zero_vec = -1;
+   int zero_vec = -1;
    uint chan;
 
    if (inst->Instruction.Opcode == TGSI_OPCODE_MUL) {
@@ -562,12 +626,11 @@ emit_binop(struct gen_context *gen, struct tgsi_full_instruction *inst)
       ppc_vzero(gen->f, zero_vec);
    }
 
-   v2 = ppc_allocate_vec_register(gen->f);
-
    FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan) {
       /* fetch src operands */
       int v0 = get_src_vec(gen, inst, 0, chan);
       int v1 = get_src_vec(gen, inst, 1, chan);
+      int v2 = get_dst_vec(gen, inst, chan);
 
       /* emit binop */
       switch (inst->Instruction.Opcode) {
@@ -591,11 +654,9 @@ emit_binop(struct gen_context *gen, struct tgsi_full_instruction *inst)
       }
 
       /* store v2 */
-      STORE(gen, *inst, v2, 0, chan);
+      emit_store(gen, v2, inst, chan, TRUE);
    }
 
-   ppc_release_vec_register(gen->f, v2);
-
    if (inst->Instruction.Opcode == TGSI_OPCODE_MUL)
       ppc_release_vec_register(gen->f, zero_vec);
 
@@ -606,16 +667,14 @@ emit_binop(struct gen_context *gen, struct tgsi_full_instruction *inst)
 static void
 emit_triop(struct gen_context *gen, struct tgsi_full_instruction *inst)
 {
-   int v3;
    uint chan;
 
-   v3 = ppc_allocate_vec_register(gen->f);
-
    FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan) {
       /* fetch src operands */
       int v0 = get_src_vec(gen, inst, 0, chan);
       int v1 = get_src_vec(gen, inst, 1, chan);
       int v2 = get_src_vec(gen, inst, 2, chan);
+      int v3 = get_dst_vec(gen, inst, chan);
 
       /* emit ALU */
       switch (inst->Instruction.Opcode) {
@@ -631,11 +690,9 @@ emit_triop(struct gen_context *gen, struct tgsi_full_instruction *inst)
       }
 
       /* store v3 */
-      STORE(gen, *inst, v3, 0, chan);
+      emit_store(gen, v3, inst, chan, TRUE);
    }
 
-   ppc_release_vec_register(gen->f, v3);
-
    release_src_vecs(gen);
 }
 
@@ -646,16 +703,14 @@ emit_triop(struct gen_context *gen, struct tgsi_full_instruction *inst)
 static void
 emit_inequality(struct gen_context *gen, struct tgsi_full_instruction *inst)
 {
-   int v2;
    uint chan;
    int one_vec = gen_one_vec(gen);
 
-   v2 = ppc_allocate_vec_register(gen->f);
-
    FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan) {
       /* fetch src operands */
       int v0 = get_src_vec(gen, inst, 0, chan);
       int v1 = get_src_vec(gen, inst, 1, chan);
+      int v2 = get_dst_vec(gen, inst, chan);
       boolean complement = FALSE;
 
       switch (inst->Instruction.Opcode) {
@@ -691,11 +746,9 @@ emit_inequality(struct gen_context *gen, struct tgsi_full_instruction *inst)
          ppc_vand(gen->f, v2, one_vec, v2);     /* v2 = one_vec & v2 */
 
       /* store v2 */
-      STORE(gen, *inst, v2, 0, chan);
+      emit_store(gen, v2, inst, chan, TRUE);
    }
 
-   ppc_release_vec_register(gen->f, v2);
-
    release_src_vecs(gen);
 }
 
@@ -733,7 +786,7 @@ emit_dotprod(struct gen_context *gen, struct tgsi_full_instruction *inst)
    }
 
    FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
-      STORE(gen, *inst, v2, 0, chan_index);  /* store v2 */
+      emit_store(gen, v2, inst, chan_index, FALSE);  /* store v2, free v2 later */
    }
 
    release_src_vecs(gen);
@@ -768,7 +821,7 @@ emit_lit(struct gen_context *gen, struct tgsi_full_instruction *inst)
 
    /* Compute X */
    if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
-      STORE(gen, *inst, one_vec, 0, CHAN_X);
+      emit_store(gen, one_vec, inst, CHAN_X, FALSE);
    }
 
    /* Compute Y, Z */
@@ -783,11 +836,12 @@ emit_lit(struct gen_context *gen, struct tgsi_full_instruction *inst)
       ppc_vmaxfp(gen->f, x_vec, x_vec, zero_vec); /* x_vec = max(x_vec, 0) */
 
       if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
-         STORE(gen, *inst, x_vec, 0, CHAN_Y);        /* store Y */
+         emit_store(gen, x_vec, inst, CHAN_Y, FALSE);
       }
 
       if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
-         int y_vec, z_vec, w_vec;
+         int y_vec, w_vec;
+         int z_vec = ppc_allocate_vec_register(gen->f);
          int pow_vec = ppc_allocate_vec_register(gen->f);
          int pos_vec = ppc_allocate_vec_register(gen->f);
          int p128_vec = ppc_allocate_vec_register(gen->f);
@@ -798,11 +852,11 @@ emit_lit(struct gen_context *gen, struct tgsi_full_instruction *inst)
 
          w_vec = get_src_vec(gen, inst, 0, CHAN_W);  /* w_vec = src[0].w */
 
-         /* clamp Y to [-128, 128] */
+         /* clamp W to [-128, 128] */
          load_constant_vec(gen, p128_vec, 128.0f);
          load_constant_vec(gen, n128_vec, -128.0f);
-         ppc_vmaxfp(gen->f, y_vec, y_vec, n128_vec); /* y = max(y, -128) */
-         ppc_vminfp(gen->f, y_vec, y_vec, p128_vec); /* y = min(y, 128) */
+         ppc_vmaxfp(gen->f, w_vec, w_vec, n128_vec); /* w = max(w, -128) */
+         ppc_vminfp(gen->f, w_vec, w_vec, p128_vec); /* w = min(w, 128) */
 
          /* if temp.x > 0
           *    z = pow(tmp.y, tmp.w)
@@ -813,8 +867,9 @@ emit_lit(struct gen_context *gen, struct tgsi_full_instruction *inst)
          ppc_vcmpgtfpx(gen->f, pos_vec, x_vec, zero_vec); /* pos = x > 0 */
          ppc_vand(gen->f, z_vec, pow_vec, pos_vec);       /* z = pow & pos */
 
-         STORE(gen, *inst, z_vec, 0, CHAN_Z);             /* store Z */
+         emit_store(gen, z_vec, inst, CHAN_Z, FALSE);
 
+         ppc_release_vec_register(gen->f, z_vec);
          ppc_release_vec_register(gen->f, pow_vec);
          ppc_release_vec_register(gen->f, pos_vec);
          ppc_release_vec_register(gen->f, p128_vec);
@@ -826,7 +881,7 @@ emit_lit(struct gen_context *gen, struct tgsi_full_instruction *inst)
 
    /* Compute W */
    if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W)) {
-      STORE(gen, *inst, one_vec, 0, CHAN_W);
+      emit_store(gen, one_vec, inst, CHAN_W, FALSE);
    }
 
    release_src_vecs(gen);
@@ -997,6 +1052,11 @@ tgsi_emit_ppc(const struct tgsi_token *tokens,
    if (!use_ppc_asm)
       return FALSE;
 
+   if (0) {
+      debug_printf("\n********* TGSI->PPC ********\n");
+      tgsi_dump(tokens, 0);
+   }
+
    util_init_math();
 
    init_gen_context(&gen, func);
-- 
cgit v1.2.3


From db680ac0e3697ecc2c2dbd5f22c4c2fdb136b62c Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Tue, 28 Oct 2008 14:03:51 -0600
Subject: cell: fix a number of fence issues

Plus add assertions to check status, alignment, etc.
---
 src/gallium/drivers/cell/ppu/cell_batch.c   | 19 ++++++++++++++++---
 src/gallium/drivers/cell/ppu/cell_context.h |  2 +-
 src/gallium/drivers/cell/ppu/cell_fence.c   | 14 ++++++++++++--
 src/gallium/drivers/cell/spu/spu_command.c  |  2 +-
 4 files changed, 30 insertions(+), 7 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/cell/ppu/cell_batch.c b/src/gallium/drivers/cell/ppu/cell_batch.c
index 448b723d85..962775cd33 100644
--- a/src/gallium/drivers/cell/ppu/cell_batch.c
+++ b/src/gallium/drivers/cell/ppu/cell_batch.c
@@ -100,12 +100,23 @@ emit_fence(struct cell_context *cell)
    const uint batch = cell->cur_batch;
    const uint size = cell->buffer_size[batch];
    struct cell_command_fence *fence_cmd;
+   struct cell_fence *fence = &cell->fenced_buffers[batch].fence;
+   uint i;
+
+   /* set fence status to emitted, not yet signalled */
+   for (i = 0; i < cell->num_spus; i++) {
+      fence->status[i][0] = CELL_FENCE_EMITTED;
+   }
 
    ASSERT(size + sizeof(struct cell_command_fence) <= CELL_BUFFER_SIZE);
 
    fence_cmd = (struct cell_command_fence *) (cell->buffer[batch] + size);
    fence_cmd->opcode = CELL_CMD_FENCE;
-   fence_cmd->fence = &cell->fenced_buffers[batch].fence;
+   fence_cmd->fence = fence;
+
+   /* update batch buffer size */
+   cell->buffer_size[batch] = size + sizeof(struct cell_command_fence);
+   assert(sizeof(struct cell_command_fence) % 8 == 0);
 }
 
 
@@ -119,7 +130,7 @@ cell_batch_flush(struct cell_context *cell)
 {
    static boolean flushing = FALSE;
    uint batch = cell->cur_batch;
-   const uint size = cell->buffer_size[batch];
+   uint size = cell->buffer_size[batch];
    uint spu, cmd_word;
 
    assert(!flushing);
@@ -130,8 +141,10 @@ cell_batch_flush(struct cell_context *cell)
    /* Before we use this batch buffer, make sure any fenced texture buffers
     * are released.
     */
-   if (cell->fenced_buffers[batch].head)
+   if (cell->fenced_buffers[batch].head) {
       emit_fence(cell);
+      size = cell->buffer_size[batch];
+   }
 
    flushing = TRUE;
 
diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h
index 4491ae8cdf..eb1397bb3f 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.h
+++ b/src/gallium/drivers/cell/ppu/cell_context.h
@@ -89,7 +89,7 @@ struct cell_buffer_node;
  */
 struct cell_buffer_list
 {
-   struct cell_fence fence;
+   struct cell_fence fence ALIGN16_ATTRIB;
    struct cell_buffer_node *head;
 };
 
diff --git a/src/gallium/drivers/cell/ppu/cell_fence.c b/src/gallium/drivers/cell/ppu/cell_fence.c
index ffb3bea12b..867b5dcaa0 100644
--- a/src/gallium/drivers/cell/ppu/cell_fence.c
+++ b/src/gallium/drivers/cell/ppu/cell_fence.c
@@ -38,6 +38,7 @@ void
 cell_fence_init(struct cell_fence *fence)
 {
    uint i;
+   ASSERT_ALIGN16(fence->status);
    for (i = 0; i < CELL_MAX_SPUS; i++) {
       fence->status[i][0] = CELL_FENCE_IDLE;
    }
@@ -50,9 +51,9 @@ cell_fence_signalled(const struct cell_context *cell,
 {
    uint i;
    for (i = 0; i < cell->num_spus; i++) {
-      //ASSERT(fence->status[i][0] != CELL_FENCE_IDLE);
-      if (fence->status[i][0] == CELL_FENCE_EMITTED)
+      if (fence->status[i][0] != CELL_FENCE_SIGNALLED)
          return FALSE;
+      /*assert(fence->status[i][0] == CELL_FENCE_EMITTED);*/
    }
    return TRUE;
 }
@@ -65,6 +66,15 @@ cell_fence_finish(const struct cell_context *cell,
    while (!cell_fence_signalled(cell, fence)) {
       usleep(10);
    }
+
+#ifdef DEBUG
+   {
+      uint i;
+      for (i = 0; i < cell->num_spus; i++) {
+         assert(fence->status[i][0] == CELL_FENCE_SIGNALLED);
+      }
+   }
+#endif
 }
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index a6ed29ea63..63818d4c46 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -107,7 +107,7 @@ cmd_fence(struct cell_command_fence *fence_cmd)
                                               CELL_FENCE_SIGNALLED};
    uint *dst = (uint *) fence_cmd->fence;
    dst += 4 * spu.init.id;  /* main store/memory address, not local store */
-
+   ASSERT_ALIGN16(dst);
    mfc_put((void *) &status,    /* src in local memory */
            (unsigned int) dst,  /* dst in main memory */
            sizeof(status),      /* size */
-- 
cgit v1.2.3


From f4e9526addc617dc78af9b1af781ffe09ce62504 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Tue, 28 Oct 2008 18:21:03 -0600
Subject: gallium: ppc: don't replicate/smear immediate values, use vspltw
 instruction as with constants

---
 src/gallium/auxiliary/draw/draw_vs_ppc.c |  8 ++++----
 src/gallium/auxiliary/tgsi/tgsi_ppc.c    | 22 +++++++++++++---------
 2 files changed, 17 insertions(+), 13 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/draw/draw_vs_ppc.c b/src/gallium/auxiliary/draw/draw_vs_ppc.c
index 8eff6d4fda..ff40263400 100644
--- a/src/gallium/auxiliary/draw/draw_vs_ppc.c
+++ b/src/gallium/auxiliary/draw/draw_vs_ppc.c
@@ -54,7 +54,7 @@
 typedef void (PIPE_CDECL *codegen_function) (float (*inputs)[4][4],
                                              float (*outputs)[4][4],
                                              float (*temps)[4][4],
-                                             float (*immeds)[4][4],
+                                             float (*immeds)[4],
                                              float (*consts)[4],
                                              const float *builtins);
 
@@ -151,7 +151,7 @@ vs_ppc_run_linear( struct draw_vertex_shader *base,
                    output_stride );
 #else
       shader->func(inputs_soa, outputs_soa, temps_soa,
-		   (float (*)[4][4]) shader->base.immediates,
+		   (float (*)[4]) shader->base.immediates,
 		   (float (*)[4]) constants,
                    ppc_builtin_constants);
 
@@ -227,7 +227,7 @@ draw_create_vs_ppc(struct draw_context *draw,
    vs->base.run_linear = vs_ppc_run_linear;
    vs->base.delete = vs_ppc_delete;
    
-   vs->base.immediates = align_malloc(TGSI_EXEC_NUM_IMMEDIATES * 4 * 4 *
+   vs->base.immediates = align_malloc(TGSI_EXEC_NUM_IMMEDIATES * 4 *
                                       sizeof(float), 16);
 
    vs->machine = &draw->vs.machine;
@@ -236,7 +236,7 @@ draw_create_vs_ppc(struct draw_context *draw,
 
    if (!tgsi_emit_ppc( (struct tgsi_token *) vs->base.state.tokens,
 			&vs->ppc_program, 
-                        (float (*)[4])vs->base.immediates, 
+                       (float (*)[4]) vs->base.immediates, 
                         TRUE )) 
       goto fail;
       
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.c b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
index 0de9b972b4..dd574ac02a 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ppc.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
@@ -299,10 +299,18 @@ emit_fetch(struct gen_context *gen,
          break;
       case TGSI_FILE_IMMEDIATE:
          {
-            int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16;
+            int offset = (reg->SrcRegister.Index * 4 + swizzle) * 4;
             int offset_reg = emit_li_offset(gen, offset);
             dst_vec = ppc_allocate_vec_register(gen->f);
-            ppc_lvx(gen->f, dst_vec, gen->immed_reg, offset_reg);
+            /* Load 4-byte word into vector register.
+             * The vector slot depends on the effective address we load from.
+             * We know that our immediates start at a 16-byte boundary so we
+             * know that 'swizzle' tells us which vector slot will have the
+             * loaded word.  The other vector slots will be undefined.
+             */
+            ppc_lvewx(gen->f, dst_vec, gen->immed_reg, offset_reg);
+            /* splat word[swizzle] across the vector reg */
+            ppc_vspltw(gen->f, dst_vec, dst_vec, swizzle);
          }
          break;
       case TGSI_FILE_CONSTANT:
@@ -1095,14 +1103,10 @@ tgsi_emit_ppc(const struct tgsi_token *tokens,
             assert(size <= 4);
             assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES);
             for (i = 0; i < size; i++) {
-               const float value =
-                  parse.FullToken.FullImmediate.u.ImmediateFloat32[i].Float;
-               imm[num_immediates * 4 + 0] = 
-               imm[num_immediates * 4 + 1] = 
-               imm[num_immediates * 4 + 2] = 
-               imm[num_immediates * 4 + 3] = value;
-               num_immediates++;
+               immediates[num_immediates][i] =
+		  parse.FullToken.FullImmediate.u.ImmediateFloat32[i].Float;
             }
+            num_immediates++;
          }
          break;
 
-- 
cgit v1.2.3


From a045b92511eb43ff89e9c0536464af7866956168 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Tue, 28 Oct 2008 18:22:14 -0600
Subject: gallium: remove old code

---
 src/gallium/auxiliary/draw/draw_vs_ppc.c | 29 -----------------------------
 1 file changed, 29 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/draw/draw_vs_ppc.c b/src/gallium/auxiliary/draw/draw_vs_ppc.c
index ff40263400..19f6c4ee5b 100644
--- a/src/gallium/auxiliary/draw/draw_vs_ppc.c
+++ b/src/gallium/auxiliary/draw/draw_vs_ppc.c
@@ -58,19 +58,6 @@ typedef void (PIPE_CDECL *codegen_function) (float (*inputs)[4][4],
                                              float (*consts)[4],
                                              const float *builtins);
 
-#if 0
-   const struct tgsi_exec_vector *input,
-   struct tgsi_exec_vector *output,
-   float (*constant)[4],        /* 3 */
-   struct tgsi_exec_vector *temporary, /* 4 */
-   float (*immediates)[4],      /* 5 */
-   const float (*aos_input)[4], /* 6 */
-   uint num_inputs,             /* 7 */
-   uint input_stride,           /* 8 */
-   float (*aos_output)[4],      /* 9 */
-   uint num_outputs,            /* 10 */
-   uint output_stride );        /* 11 */
-#endif
 
 struct draw_ppc_vertex_shader {
    struct draw_vertex_shader base;
@@ -137,27 +124,11 @@ vs_ppc_run_linear( struct draw_vertex_shader *base,
 
       /* run compiled shader
        */
-#if 0
-      shader->func(machine->Inputs,
-		   machine->Outputs,
-		   (float (*)[4])constants,
-		   machine->Temps,
-		   (float (*)[4])shader->base.immediates,
-                   input,
-                   base->info.num_inputs,
-                   input_stride,
-                   output,
-                   base->info.num_outputs,
-                   output_stride );
-#else
       shader->func(inputs_soa, outputs_soa, temps_soa,
 		   (float (*)[4]) shader->base.immediates,
 		   (float (*)[4]) constants,
                    ppc_builtin_constants);
 
-      /*output[0][0] = input[0][0] * 0.5;*/
-#endif
-
       /* convert (up to) four output verts from SoA back to AoS format */
       for (attr = 0; attr < base->info.num_outputs; attr++) {
          float *vOut = (float *) output;
-- 
cgit v1.2.3


From 5db0372b3cffec9b5c28699a580da77dcfbd938d Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Tue, 28 Oct 2008 18:57:54 -0600
Subject: gallium: ppc: implement TGSI_OPCODE_LOG/EXP

---
 src/gallium/auxiliary/tgsi/tgsi_ppc.c | 111 +++++++++++++++++++++++++++++++++-
 1 file changed, 110 insertions(+), 1 deletion(-)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.c b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
index dd574ac02a..e64fb5ac91 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ppc.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
@@ -896,6 +896,110 @@ emit_lit(struct gen_context *gen, struct tgsi_full_instruction *inst)
 }
 
 
+static void
+emit_exp(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+   const int one_vec = gen_one_vec(gen);   /* stored to W below */
+   int src_vec;
+   /* EXP: X = 2^floor(s), Y = s - floor(s), Z = 2^s, W = 1.0, where s = src0.x */
+   /* get src arg: only the X channel of source 0 is read */
+   src_vec = get_src_vec(gen, inst, 0, CHAN_X);
+
+   /* Compute X = 2^floor(src) */
+   if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
+      int dst_vec = get_dst_vec(gen, inst, CHAN_X);
+      int tmp_vec = ppc_allocate_vec_register(gen->f);
+      ppc_vrfim(gen->f, tmp_vec, src_vec);             /* tmp = floor(src); */
+      ppc_vexptefp(gen->f, dst_vec, tmp_vec);          /* dst = 2 ^ tmp */
+      emit_store(gen, dst_vec, inst, CHAN_X, TRUE);
+      ppc_release_vec_register(gen->f, tmp_vec);
+   }
+
+   /* Compute Y = src - floor(src), i.e. the fractional part of src */
+   if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
+      int dst_vec = get_dst_vec(gen, inst, CHAN_Y);
+      int tmp_vec = ppc_allocate_vec_register(gen->f);
+      ppc_vrfim(gen->f, tmp_vec, src_vec);             /* tmp = floor(src); */
+      ppc_vsubfp(gen->f, dst_vec, src_vec, tmp_vec);   /* dst = src - tmp */
+      emit_store(gen, dst_vec, inst, CHAN_Y, TRUE);
+      ppc_release_vec_register(gen->f, tmp_vec);
+   }
+
+   /* Compute Z = RoughApprox2ToX(src); no floor is applied on this path */
+   if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
+      int dst_vec = get_dst_vec(gen, inst, CHAN_Z);
+      ppc_vexptefp(gen->f, dst_vec, src_vec);          /* dst = 2 ^ src */
+      emit_store(gen, dst_vec, inst, CHAN_Z, TRUE);
+   }
+
+   /* Compute W = 1.0: one_vec is stored directly, no dst register is fetched */
+   if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W)) {
+      emit_store(gen, one_vec, inst, CHAN_W, FALSE);
+   }
+
+   release_src_vecs(gen);
+}
+
+
+static void
+emit_log(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+   const int bit31_vec = gen_get_bit31_vec(gen);
+   const int one_vec = gen_one_vec(gen);
+   int src_vec, abs_vec;
+   /* LOG: X = floor(log2(a)), Y = a / 2^X, Z = log2(a), W = 1.0, a = |src0.x| */
+   /* get src arg: only the X channel of source 0 is read */
+   src_vec = get_src_vec(gen, inst, 0, CHAN_X);
+
+   /* compute abs(src) */
+   abs_vec = ppc_allocate_vec_register(gen->f);
+   ppc_vandc(gen->f, abs_vec, src_vec, bit31_vec);     /* abs = src & ~bit31 */
+
+   if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) ||
+       IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
+      /* X and Y share tmp, so this runs when either one is write-enabled */
+      /* compute tmp = floor(log2(abs)) */
+      int tmp_vec = ppc_allocate_vec_register(gen->f);
+      ppc_vlogefp(gen->f, tmp_vec, abs_vec);           /* tmp = log2(abs) */
+      ppc_vrfim(gen->f, tmp_vec, tmp_vec);             /* tmp = floor(tmp); */
+
+      /* Compute X = tmp (stored before the Y path overwrites tmp) */
+      if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
+         emit_store(gen, tmp_vec, inst, CHAN_X, FALSE);
+      }
+
+      /* Compute Y = abs / 2^tmp */
+      if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
+         const int zero_vec = ppc_allocate_vec_register(gen->f);
+         ppc_vzero(gen->f, zero_vec);
+         ppc_vexptefp(gen->f, tmp_vec, tmp_vec);       /* tmp = 2 ^ tmp */
+         ppc_vrefp(gen->f, tmp_vec, tmp_vec);          /* tmp = 1 / tmp */
+         /* tmp = abs * tmp + zero */
+         ppc_vmaddfp(gen->f, tmp_vec, abs_vec, tmp_vec, zero_vec);
+         emit_store(gen, tmp_vec, inst, CHAN_Y, FALSE);
+         ppc_release_vec_register(gen->f, zero_vec);
+      }
+
+      ppc_release_vec_register(gen->f, tmp_vec);
+   }
+
+   /* Compute Z = RoughApproxLog2(abs) */
+   if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
+      int dst_vec = get_dst_vec(gen, inst, CHAN_Z);
+      ppc_vlogefp(gen->f, dst_vec, abs_vec);           /* dst = log2(abs) */
+      emit_store(gen, dst_vec, inst, CHAN_Z, TRUE);
+   }
+
+   /* Compute W = 1.0 */
+   if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W)) {
+      emit_store(gen, one_vec, inst, CHAN_W, FALSE);
+   }
+
+   ppc_release_vec_register(gen->f, abs_vec);
+   release_src_vecs(gen);
+}
+
+
 static int
 emit_instruction(struct gen_context *gen,
                  struct tgsi_full_instruction *inst)
@@ -940,6 +1044,12 @@ emit_instruction(struct gen_context *gen,
    case TGSI_OPCODE_LIT:
       emit_lit(gen, inst);
       break;
+   case TGSI_OPCODE_LOG:
+      emit_log(gen, inst);
+      break;
+   case TGSI_OPCODE_EXP:
+      emit_exp(gen, inst);
+      break;
    case TGSI_OPCODE_END:
       /* normal end */
       return 1;
@@ -1098,7 +1208,6 @@ tgsi_emit_ppc(const struct tgsi_token *tokens,
          /* splat each immediate component into a float[4] vector for SoA */
          {
             const uint size = parse.FullToken.FullImmediate.Immediate.Size - 1;
-            float *imm = (float *) immediates;
             uint i;
             assert(size <= 4);
             assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES);
-- 
cgit v1.2.3


From 7640264064c2cbc9922f7f3df51f7caa7b449e8e Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 29 Oct 2008 11:03:51 -0600
Subject: gallium: added ppc_vnmsubfp()

---
 src/gallium/auxiliary/rtasm/rtasm_ppc.c | 7 +++++++
 src/gallium/auxiliary/rtasm/rtasm_ppc.h | 6 +++++-
 2 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.c b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
index 7dd8263749..a90b5587b0 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
@@ -505,6 +505,13 @@ ppc_vmaddfp(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC)
    emit_va(p, 46, vD, vA, vC, vB); /* note arg order */
 }
 
+/** vector float negative mult subtract: vD = vA - vB * vC */
+void
+ppc_vnmsubfp(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC)
+{
+   emit_va(p, 47, vD, vB, vA, vC); /* note arg order: vB is passed before vA */
+}
+
 /** vector float compare greater than */
 void
 ppc_vcmpgtfpx(struct ppc_function *p, uint vD, uint vA, uint vB)
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.h b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
index f938d8d759..561e139bce 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
@@ -97,10 +97,14 @@ ppc_vminfp(struct ppc_function *p, uint vD, uint vA, uint vB);
 extern void
 ppc_vmaxfp(struct ppc_function *p, uint vD, uint vA, uint vB);
 
-/** vector float mult add */
+/** vector float mult add: vD = vA * vB + vC */
 extern void
 ppc_vmaddfp(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC);
 
+/** vector float negative mult subtract: vD = vA - vB * vC */
+extern void
+ppc_vnmsubfp(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC);
+
 /** vector float compare greater than */
 extern void
 ppc_vcmpgtfpx(struct ppc_function *p, uint vD, uint vA, uint vB);
-- 
cgit v1.2.3


From 75b92764a7820558fb2b6cd27a2ab0487ef2f9ba Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 29 Oct 2008 11:04:05 -0600
Subject: gallium: clean-ups

---
 src/gallium/auxiliary/draw/draw_vs_ppc.c | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/draw/draw_vs_ppc.c b/src/gallium/auxiliary/draw/draw_vs_ppc.c
index 19f6c4ee5b..d720c7bbd5 100644
--- a/src/gallium/auxiliary/draw/draw_vs_ppc.c
+++ b/src/gallium/auxiliary/draw/draw_vs_ppc.c
@@ -64,8 +64,6 @@ struct draw_ppc_vertex_shader {
    struct ppc_function ppc_program;
 
    codegen_function func;
-   
-   struct tgsi_exec_machine *machine;
 };
 
 
@@ -73,11 +71,12 @@ static void
 vs_ppc_prepare( struct draw_vertex_shader *base,
 		struct draw_context *draw )
 {
+   /* nothing */
 }
 
 
-
-/* Simplified vertex shader interface for the pt paths.  Given the
+/**
+ * Simplified vertex shader interface for the pt paths.  Given the
  * complexity of code-generating all the above operations together,
  * it's time to try doing all the other stuff separately.
  */
@@ -91,7 +90,6 @@ vs_ppc_run_linear( struct draw_vertex_shader *base,
 		   unsigned output_stride )
 {
    struct draw_ppc_vertex_shader *shader = (struct draw_ppc_vertex_shader *)base;
-   struct tgsi_exec_machine *machine = shader->machine;
    unsigned int i;
 
 #define MAX_VERTICES 4
@@ -154,8 +152,6 @@ vs_ppc_run_linear( struct draw_vertex_shader *base,
 }
 
 
-
-
 static void
 vs_ppc_delete( struct draw_vertex_shader *base )
 {
@@ -172,7 +168,7 @@ vs_ppc_delete( struct draw_vertex_shader *base )
 
 struct draw_vertex_shader *
 draw_create_vs_ppc(struct draw_context *draw,
-                          const struct pipe_shader_state *templ)
+                   const struct pipe_shader_state *templ)
 {
    struct draw_ppc_vertex_shader *vs;
 
@@ -201,8 +197,6 @@ draw_create_vs_ppc(struct draw_context *draw,
    vs->base.immediates = align_malloc(TGSI_EXEC_NUM_IMMEDIATES * 4 *
                                       sizeof(float), 16);
 
-   vs->machine = &draw->vs.machine;
-   
    ppc_init_func( &vs->ppc_program, 2000 ); /* XXX fix limit */
 
    if (!tgsi_emit_ppc( (struct tgsi_token *) vs->base.state.tokens,
-- 
cgit v1.2.3


From 4e1c33700d8885c91d8a1db4cbaefa1ff9f1b5fc Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 29 Oct 2008 11:05:34 -0600
Subject: gallium: added PPC support for SWZ, XPD, POW

That's the last of the ARB_v_p opcodes, except for ARL.
---
 src/gallium/auxiliary/tgsi/tgsi_ppc.c | 86 +++++++++++++++++++++++++++++++++++
 1 file changed, 86 insertions(+)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.c b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
index e64fb5ac91..5d13070922 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ppc.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
@@ -610,6 +610,7 @@ emit_unaryop(struct gen_context *gen, struct tgsi_full_instruction *inst)
          ppc_vlogefp(gen->f, v1, v0);      /* v1 = log2(v0) */
          break;
       case TGSI_OPCODE_MOV:
+      case TGSI_OPCODE_SWZ:
          if (v0 != v1)
             ppc_vmove(gen->f, v1, v0);
          break;
@@ -1000,12 +1001,91 @@ emit_log(struct gen_context *gen, struct tgsi_full_instruction *inst)
 }
 
 
+static void
+emit_pow(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+   int s0_vec = get_src_vec(gen, inst, 0, CHAN_X);
+   int s1_vec = get_src_vec(gen, inst, 1, CHAN_X);
+   int pow_vec = ppc_allocate_vec_register(gen->f);
+   int chan;
+   /* POW is scalar here: only the X channels of src0 and src1 are read */
+   ppc_vec_pow(gen->f, pow_vec, s0_vec, s1_vec); /* presumably s0 ^ s1; helper not shown */
+   /* the single result in pow_vec is stored to every write-enabled dst channel */
+   FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan) {
+      emit_store(gen, pow_vec, inst, chan, FALSE);
+   }
+
+   ppc_release_vec_register(gen->f, pow_vec);
+
+   release_src_vecs(gen);
+}
+
+
+static void
+emit_xpd(struct gen_context *gen, struct tgsi_full_instruction *inst)
+{
+   int x0_vec, y0_vec, z0_vec;
+   int x1_vec, y1_vec, z1_vec;
+   int zero_vec, tmp_vec;
+   /* XPD: dst.xyz = src0.xyz cross src1.xyz; W is left unwritten */
+
+   zero_vec = ppc_allocate_vec_register(gen->f);
+   ppc_vzero(gen->f, zero_vec);
+
+   /* tmp_vec is the only scratch register; it holds each channel's result in turn */
+   tmp_vec = ppc_allocate_vec_register(gen->f);
+
+   if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y) ||
+       IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
+      x0_vec = get_src_vec(gen, inst, 0, CHAN_X);
+      x1_vec = get_src_vec(gen, inst, 1, CHAN_X);
+   }
+   if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) ||
+       IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
+      y0_vec = get_src_vec(gen, inst, 0, CHAN_Y);
+      y1_vec = get_src_vec(gen, inst, 1, CHAN_Y);
+   }
+   if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) ||
+       IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
+      z0_vec = get_src_vec(gen, inst, 0, CHAN_Z);
+      z1_vec = get_src_vec(gen, inst, 1, CHAN_Z);
+   }
+
+   IF_IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) {
+      /* tmp = y0 * z1 */
+      ppc_vmaddfp(gen->f, tmp_vec, y0_vec, z1_vec, zero_vec);
+      /* tmp = tmp - z0 * y1*/
+      ppc_vnmsubfp(gen->f, tmp_vec, tmp_vec, z0_vec, y1_vec);
+      emit_store(gen, tmp_vec, inst, CHAN_X, FALSE);
+   }
+   IF_IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y) {
+      /* tmp = z0 * x1 */
+      ppc_vmaddfp(gen->f, tmp_vec, z0_vec, x1_vec, zero_vec);
+      /* tmp = tmp - x0 * z1 */
+      ppc_vnmsubfp(gen->f, tmp_vec, tmp_vec, x0_vec, z1_vec);
+      emit_store(gen, tmp_vec, inst, CHAN_Y, FALSE);
+   }
+   IF_IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z) {
+      /* tmp = x0 * y1 */
+      ppc_vmaddfp(gen->f, tmp_vec, x0_vec, y1_vec, zero_vec);
+      /* tmp = tmp - y0 * x1 */
+      ppc_vnmsubfp(gen->f, tmp_vec, tmp_vec, y0_vec, x1_vec);
+      emit_store(gen, tmp_vec, inst, CHAN_Z, FALSE);
+   }
+   /* W is undefined */
+
+   ppc_release_vec_register(gen->f, tmp_vec);
+   ppc_release_vec_register(gen->f, zero_vec);
+   release_src_vecs(gen);
+}
+
 static int
 emit_instruction(struct gen_context *gen,
                  struct tgsi_full_instruction *inst)
 {
    switch (inst->Instruction.Opcode) {
    case TGSI_OPCODE_MOV:
+   case TGSI_OPCODE_SWZ:
    case TGSI_OPCODE_ABS:
    case TGSI_OPCODE_FLOOR:
    case TGSI_OPCODE_FRAC:
@@ -1050,6 +1130,12 @@ emit_instruction(struct gen_context *gen,
    case TGSI_OPCODE_EXP:
       emit_exp(gen, inst);
       break;
+   case TGSI_OPCODE_POW:
+      emit_pow(gen, inst);
+      break;
+   case TGSI_OPCODE_XPD:
+      emit_xpd(gen, inst);
+      break;
    case TGSI_OPCODE_END:
       /* normal end */
       return 1;
-- 
cgit v1.2.3


From 8b3af5c5d6fe100707da0d9dcc42500921792638 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 29 Oct 2008 12:12:30 -0600
Subject: cell: use simd utilities for pow, exp2, log2

---
 src/gallium/drivers/cell/spu/spu_funcs.c | 28 ++++++----------------------
 1 file changed, 6 insertions(+), 22 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/cell/spu/spu_funcs.c b/src/gallium/drivers/cell/spu/spu_funcs.c
index 3534b35000..ff3d609d25 100644
--- a/src/gallium/drivers/cell/spu/spu_funcs.c
+++ b/src/gallium/drivers/cell/spu/spu_funcs.c
@@ -38,7 +38,9 @@
 #include <math.h>
 #include <cos14_v.h>
 #include <sin14_v.h>
-#include <transpose_matrix4x4.h>
+#include <simdmath/exp2f4.h>
+#include <simdmath/log2f4.h>
+#include <simdmath/powf4.h>
 
 #include "cell/common.h"
 #include "spu_main.h"
@@ -68,37 +70,19 @@ spu_sin(vector float x)
 static vector float
 spu_pow(vector float x, vector float y)
 {
-   float z0 = powf(spu_extract(x,0), spu_extract(y,0));
-   float z1 = powf(spu_extract(x,1), spu_extract(y,1));
-   float z2 = powf(spu_extract(x,2), spu_extract(y,2));
-   float z3 = powf(spu_extract(x,3), spu_extract(y,3));
-   return (vector float) {z0, z1, z2, z3};
+   return _powf4(x, y);
 }
 
 static vector float
 spu_exp2(vector float x)
 {
-   float z0 = powf(2.0f, spu_extract(x,0));
-   float z1 = powf(2.0f, spu_extract(x,1));
-   float z2 = powf(2.0f, spu_extract(x,2));
-   float z3 = powf(2.0f, spu_extract(x,3));
-   return (vector float) {z0, z1, z2, z3};
+   return _exp2f4(x);
 }
 
 static vector float
 spu_log2(vector float x)
 {
-   /*
-    * log_base_2(x) = log(x) / log(2)
-    * 1.442695 = 1/log(2).
-    */
-   static const vector float k = {1.442695F, 1.442695F, 1.442695F, 1.442695F};
-   float z0 = logf(spu_extract(x,0));
-   float z1 = logf(spu_extract(x,1));
-   float z2 = logf(spu_extract(x,2));
-   float z3 = logf(spu_extract(x,3));
-   vector float v = (vector float) {z0, z1, z2, z3};
-   return spu_mul(v, k);
+   return _log2f4(x);
 }
 
 
-- 
cgit v1.2.3


From 1f7a323a138e6cc43b1192022b071c606a5ee6f4 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 29 Oct 2008 12:14:11 -0600
Subject: cell: add scalar param to emit_function_call() to indicate scalar
 function calls

Scalar calls only use the X component of the src regs and smear the
result across the dest register's X/Y/Z/W.
---
 src/gallium/drivers/cell/ppu/cell_gen_fp.c | 103 +++++++++++++++++++----------
 1 file changed, 69 insertions(+), 34 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index d4d644d6e8..5c41b264ac 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -1303,60 +1303,91 @@ lookup_function(struct cell_context *cell, const char *funcname)
 /**
  * Emit code to call a SPU function.
  * Used to implement instructions like SIN/COS/POW/TEX/etc.
+ * If scalar, only the X components of the src regs are used, and the
+ * result is replicated across the dest register's XYZW components.
  */
 static boolean
 emit_function_call(struct codegen *gen,
                    const struct tgsi_full_instruction *inst,
-                   char *funcname, uint num_args)
+                   char *funcname, uint num_args, boolean scalar)
 {
    const uint addr = lookup_function(gen->cell, funcname);
    char comment[100];
-   int ch;
+   int s_regs[3];
+   int func_called = FALSE;
+   uint a, ch;
+   int retval_reg = -1;
 
    assert(num_args <= 3);
 
    snprintf(comment, sizeof(comment), "CALL %s:", funcname);
    spe_comment(gen->f, -4, comment);
 
+   if (scalar) {
+      for (a = 0; a < num_args; a++) {
+         s_regs[a] = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[a]);
+      }
+      /* we'll call the function, put the return value in this register,
+       * then replicate it across all write-enabled components in d_reg.
+       */
+      retval_reg = spe_allocate_available_register(gen->f);
+   }
+
    for (ch = 0; ch < 4; ch++) {
       if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
-         int s_regs[3], d_reg;
+         int d_reg;
          ubyte usedRegs[SPE_NUM_REGS];
-         uint a, i, numUsed;
+         uint i, numUsed;
 
-         for (a = 0; a < num_args; a++) {
-            s_regs[a] = get_src_reg(gen, ch, &inst->FullSrcRegisters[a]);
+         if (!scalar) {
+            for (a = 0; a < num_args; a++) {
+               s_regs[a] = get_src_reg(gen, ch, &inst->FullSrcRegisters[a]);
+            }
          }
-         d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
 
-         numUsed = spe_get_registers_used(gen->f, usedRegs);
-         assert(numUsed < gen->frame_size / 16 - 2);
+         d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
 
-         /* save registers to stack */
-         for (i = 0; i < numUsed; i++) {
-            uint reg = usedRegs[i];
-            int offset = 2 + i;
-            spe_stqd(gen->f, reg, SPE_REG_SP, 16 * offset);
-         }
+         if (!scalar || !func_called) {
+            /* for a scalar function, we'll really only call the function once */
 
-         /* setup function arguments */
-         for (a = 0; a < num_args; a++) {
-            spe_move(gen->f, 3 + a, s_regs[a]);
-         }
+            numUsed = spe_get_registers_used(gen->f, usedRegs);
+            assert(numUsed < gen->frame_size / 16 - 2);
 
-         /* branch to function, save return addr */
-         spe_brasl(gen->f, SPE_REG_RA, addr);
+            /* save registers to stack */
+            for (i = 0; i < numUsed; i++) {
+               uint reg = usedRegs[i];
+               int offset = 2 + i;
+               spe_stqd(gen->f, reg, SPE_REG_SP, 16 * offset);
+            }
 
-         /* save function's return value */
-         spe_move(gen->f, d_reg, 3);
+            /* setup function arguments */
+            for (a = 0; a < num_args; a++) {
+               spe_move(gen->f, 3 + a, s_regs[a]);
+            }
 
-         /* restore registers from stack */
-         for (i = 0; i < numUsed; i++) {
-            uint reg = usedRegs[i];
-            if (reg != d_reg) {
-               int offset = 2 + i;
-               spe_lqd(gen->f, reg, SPE_REG_SP, 16 * offset);
+            /* branch to function, save return addr */
+            spe_brasl(gen->f, SPE_REG_RA, addr);
+
+            /* save function's return value */
+            if (scalar)
+               spe_move(gen->f, retval_reg, 3);
+            else
+               spe_move(gen->f, d_reg, 3);
+
+            /* restore registers from stack */
+            for (i = 0; i < numUsed; i++) {
+               uint reg = usedRegs[i];
+               if (reg != d_reg && reg != retval_reg) {
+                  int offset = 2 + i;
+                  spe_lqd(gen->f, reg, SPE_REG_SP, 16 * offset);
+               }
             }
+
+            func_called = TRUE;
+         }
+
+         if (scalar) {
+            spe_move(gen->f, d_reg, retval_reg);
          }
 
          store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
@@ -1364,6 +1395,10 @@ emit_function_call(struct codegen *gen,
       }
    }
 
+   if (scalar) {
+      spe_release_register(gen->f, retval_reg);
+   }
+
    return true;
 }
 
@@ -1770,15 +1805,15 @@ emit_instruction(struct codegen *gen,
       return emit_END(gen);
 
    case TGSI_OPCODE_COS:
-      return emit_function_call(gen, inst, "spu_cos", 1);
+      return emit_function_call(gen, inst, "spu_cos", 1, TRUE);
    case TGSI_OPCODE_SIN:
-      return emit_function_call(gen, inst, "spu_sin", 1);
+      return emit_function_call(gen, inst, "spu_sin", 1, TRUE);
    case TGSI_OPCODE_POW:
-      return emit_function_call(gen, inst, "spu_pow", 2);
+      return emit_function_call(gen, inst, "spu_pow", 2, TRUE);
    case TGSI_OPCODE_EXPBASE2:
-      return emit_function_call(gen, inst, "spu_exp2", 1);
+      return emit_function_call(gen, inst, "spu_exp2", 1, TRUE);
    case TGSI_OPCODE_LOGBASE2:
-      return emit_function_call(gen, inst, "spu_log2", 1);
+      return emit_function_call(gen, inst, "spu_log2", 1, TRUE);
    case TGSI_OPCODE_TEX:
       /* fall-through for now */
    case TGSI_OPCODE_TXD:
-- 
cgit v1.2.3


From 09570d2e737a4c9f3f24edd78af3b897ee261733 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 29 Oct 2008 14:08:13 -0600
Subject: gallium: test for PIPE_OS_LINUX instead of __linux__

---
 src/gallium/auxiliary/rtasm/rtasm_execmem.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_execmem.c b/src/gallium/auxiliary/rtasm/rtasm_execmem.c
index 19087589a8..864bd4d3fe 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_execmem.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_execmem.c
@@ -38,12 +38,13 @@
 #include "rtasm_execmem.h"
 
 
-#if defined(__linux__)
+#if defined(PIPE_OS_LINUX)
+
 
 /*
  * Allocate a large block of memory which can hold code then dole it out
  * in pieces by means of the generic memory manager code.
-*/
+ */
 
 #include <unistd.h>
 #include <sys/mman.h>
@@ -113,7 +114,7 @@ rtasm_exec_free(void *addr)
 }
 
 
-#else
+#else /* PIPE_OS_LINUX */
 
 /*
  * Just use regular memory.
@@ -133,4 +134,4 @@ rtasm_exec_free(void *addr)
 }
 
 
-#endif
+#endif /* PIPE_OS_LINUX */
-- 
cgit v1.2.3


From 3ad56968f09397a8dd417eae025b9506efaf8414 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 29 Oct 2008 14:19:12 -0600
Subject: gallium: prefix memory manager functions with u_ to differentiate
 from functions in mesa/main/mm.c

---
 src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c | 10 +++++-----
 src/gallium/auxiliary/rtasm/rtasm_execmem.c     |  8 ++++----
 src/gallium/auxiliary/util/u_mm.c               | 12 ++++++------
 src/gallium/auxiliary/util/u_mm.h               | 12 ++++++------
 4 files changed, 21 insertions(+), 21 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c
index fe80ca30ee..6e10cf1806 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c
@@ -100,7 +100,7 @@ mm_buffer_destroy(struct pb_buffer *buf)
    assert(buf->base.refcount == 0);
    
    pipe_mutex_lock(mm->mutex);
-   mmFreeMem(mm_buf->block);
+   u_mmFreeMem(mm_buf->block);
    FREE(buf);
    pipe_mutex_unlock(mm->mutex);
 }
@@ -175,14 +175,14 @@ mm_bufmgr_create_buffer(struct pb_manager *mgr,
    
    mm_buf->mgr = mm;
    
-   mm_buf->block = mmAllocMem(mm->heap, size, mm->align2, 0);
+   mm_buf->block = u_mmAllocMem(mm->heap, size, mm->align2, 0);
    if(!mm_buf->block) {
       debug_printf("warning: heap full\n");
 #if 0
       mmDumpMemInfo(mm->heap);
 #endif
       
-      mm_buf->block = mmAllocMem(mm->heap, size, mm->align2, 0);
+      mm_buf->block = u_mmAllocMem(mm->heap, size, mm->align2, 0);
       if(!mm_buf->block) {
          FREE(mm_buf);
          pipe_mutex_unlock(mm->mutex);
@@ -213,7 +213,7 @@ mm_bufmgr_destroy(struct pb_manager *mgr)
    
    pipe_mutex_lock(mm->mutex);
 
-   mmDestroy(mm->heap);
+   u_mmDestroy(mm->heap);
    
    pb_unmap(mm->buffer);
    pb_reference(&mm->buffer, NULL);
@@ -254,7 +254,7 @@ mm_bufmgr_create_from_buffer(struct pb_buffer *buffer,
    if(!mm->map)
       goto failure;
 
-   mm->heap = mmInit(0, size); 
+   mm->heap = u_mmInit(0, size); 
    if (!mm->heap)
       goto failure;
 
diff --git a/src/gallium/auxiliary/rtasm/rtasm_execmem.c b/src/gallium/auxiliary/rtasm/rtasm_execmem.c
index 864bd4d3fe..df353633e8 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_execmem.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_execmem.c
@@ -63,7 +63,7 @@ static void
 init_heap(void)
 {
    if (!exec_heap)
-      exec_heap = mmInit( 0, EXEC_HEAP_SIZE );
+      exec_heap = u_mmInit( 0, EXEC_HEAP_SIZE );
    
    if (!exec_mem)
       exec_mem = (unsigned char *) mmap(0, EXEC_HEAP_SIZE, 
@@ -84,7 +84,7 @@ rtasm_exec_malloc(size_t size)
 
    if (exec_heap) {
       size = (size + 31) & ~31;
-      block = mmAllocMem( exec_heap, size, 32, 0 );
+      block = u_mmAllocMem( exec_heap, size, 32, 0 );
    }
 
    if (block)
@@ -104,10 +104,10 @@ rtasm_exec_free(void *addr)
    pipe_mutex_lock(exec_mutex);
 
    if (exec_heap) {
-      struct mem_block *block = mmFindBlock(exec_heap, (unsigned char *)addr - exec_mem);
+      struct mem_block *block = u_mmFindBlock(exec_heap, (unsigned char *)addr - exec_mem);
    
       if (block)
-	 mmFreeMem(block);
+	 u_mmFreeMem(block);
    }
 
    pipe_mutex_unlock(exec_mutex);
diff --git a/src/gallium/auxiliary/util/u_mm.c b/src/gallium/auxiliary/util/u_mm.c
index 0f51dd5977..592ace00fc 100644
--- a/src/gallium/auxiliary/util/u_mm.c
+++ b/src/gallium/auxiliary/util/u_mm.c
@@ -31,7 +31,7 @@
 
 
 void
-mmDumpMemInfo(const struct mem_block *heap)
+u_mmDumpMemInfo(const struct mem_block *heap)
 {
    debug_printf("Memory heap %p:\n", (void *)heap);
    if (heap == 0) {
@@ -58,7 +58,7 @@ mmDumpMemInfo(const struct mem_block *heap)
 }
 
 struct mem_block *
-mmInit(int ofs, int size)
+u_mmInit(int ofs, int size)
 {
    struct mem_block *heap, *block;
   
@@ -165,7 +165,7 @@ SliceBlock(struct mem_block *p,
 
 
 struct mem_block *
-mmAllocMem(struct mem_block *heap, int size, int align2, int startSearch)
+u_mmAllocMem(struct mem_block *heap, int size, int align2, int startSearch)
 {
    struct mem_block *p;
    const int mask = (1 << align2)-1;
@@ -198,7 +198,7 @@ mmAllocMem(struct mem_block *heap, int size, int align2, int startSearch)
 
 
 struct mem_block *
-mmFindBlock(struct mem_block *heap, int start)
+u_mmFindBlock(struct mem_block *heap, int start)
 {
    struct mem_block *p;
 
@@ -237,7 +237,7 @@ Join2Blocks(struct mem_block *p)
 }
 
 int
-mmFreeMem(struct mem_block *b)
+u_mmFreeMem(struct mem_block *b)
 {
    if (!b)
       return 0;
@@ -266,7 +266,7 @@ mmFreeMem(struct mem_block *b)
 
 
 void
-mmDestroy(struct mem_block *heap)
+u_mmDestroy(struct mem_block *heap)
 {
    struct mem_block *p;
 
diff --git a/src/gallium/auxiliary/util/u_mm.h b/src/gallium/auxiliary/util/u_mm.h
index b226b101cb..ce20e48763 100644
--- a/src/gallium/auxiliary/util/u_mm.h
+++ b/src/gallium/auxiliary/util/u_mm.h
@@ -49,7 +49,7 @@ struct mem_block {
  * input: total size in bytes
  * return: a heap pointer if OK, NULL if error
  */
-extern struct mem_block *mmInit(int ofs, int size);
+extern struct mem_block *u_mmInit(int ofs, int size);
 
 /**
  * Allocate 'size' bytes with 2^align2 bytes alignment,
@@ -61,7 +61,7 @@ extern struct mem_block *mmInit(int ofs, int size);
  *		startSearch = linear offset from start of heap to begin search
  * return: pointer to the allocated block, 0 if error
  */
-extern struct mem_block *mmAllocMem(struct mem_block *heap, int size, int align2, 
+extern struct mem_block *u_mmAllocMem(struct mem_block *heap, int size, int align2, 
                             int startSearch);
 
 /**
@@ -69,23 +69,23 @@ extern struct mem_block *mmAllocMem(struct mem_block *heap, int size, int align2
  * input: pointer to a block
  * return: 0 if OK, -1 if error
  */
-extern int mmFreeMem(struct mem_block *b);
+extern int u_mmFreeMem(struct mem_block *b);
 
 /**
  * Free block starts at offset
  * input: pointer to a heap, start offset
  * return: pointer to a block
  */
-extern struct mem_block *mmFindBlock(struct mem_block *heap, int start);
+extern struct mem_block *u_mmFindBlock(struct mem_block *heap, int start);
 
 /**
  * destroy MM
  */
-extern void mmDestroy(struct mem_block *mmInit);
+extern void u_mmDestroy(struct mem_block *mmInit);
 
 /**
  * For debuging purpose.
  */
-extern void mmDumpMemInfo(const struct mem_block *mmInit);
+extern void u_mmDumpMemInfo(const struct mem_block *mmInit);
 
 #endif
-- 
cgit v1.2.3


From 8828d52348d81e1b9ec985200a430554873b5f4e Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 29 Oct 2008 14:28:57 -0600
Subject: gallium: fix alignment parameter passed to u_mmAllocMem()

Was 32, now 5.  The param is expressed as a power of two exponent.
The net effect is that the alignment was a no-op on X86 but on PPC we
always got the same memory address every time rtasm_exec_malloc() was called.
---
 src/gallium/auxiliary/rtasm/rtasm_execmem.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_execmem.c b/src/gallium/auxiliary/rtasm/rtasm_execmem.c
index df353633e8..be7433baf8 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_execmem.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_execmem.c
@@ -83,8 +83,8 @@ rtasm_exec_malloc(size_t size)
    init_heap();
 
    if (exec_heap) {
-      size = (size + 31) & ~31;
-      block = u_mmAllocMem( exec_heap, size, 32, 0 );
+      size = (size + 31) & ~31;  /* next multiple of 32 bytes */
+      block = u_mmAllocMem( exec_heap, size, 5, 0 ); /* 5 -> 32-byte alignment */
    }
 
    if (block)
-- 
cgit v1.2.3


From 8160cb4935151a12588acbe546f00ce8d77bda91 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 29 Oct 2008 14:55:02 -0600
Subject: gallium: fix alignment parameter passed to u_mmAllocMem()

Was 32, now 5.  The param is expressed as a power of two exponent.
The net effect is that the alignment was a no-op on X86 but on PPC we
always got the same memory address every time rtasm_exec_malloc() was called.
---
 src/gallium/auxiliary/rtasm/rtasm_execmem.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_execmem.c b/src/gallium/auxiliary/rtasm/rtasm_execmem.c
index 19087589a8..bb3b1a4c25 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_execmem.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_execmem.c
@@ -82,8 +82,8 @@ rtasm_exec_malloc(size_t size)
    init_heap();
 
    if (exec_heap) {
-      size = (size + 31) & ~31;
-      block = mmAllocMem( exec_heap, size, 32, 0 );
+      size = (size + 31) & ~31;  /* next multiple of 32 bytes */
+      block = u_mmAllocMem( exec_heap, size, 5, 0 ); /* 5 -> 32-byte alignment */
    }
 
    if (block)
-- 
cgit v1.2.3


From a5d920297a2affe34c535d30a2c49588f92f69ad Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 29 Oct 2008 16:26:10 -0600
Subject: gallium: use execmem for PPC code, grow instruction buffer as needed

---
 src/gallium/auxiliary/rtasm/rtasm_ppc.c | 70 +++++++++++++++++++++++----------
 src/gallium/auxiliary/rtasm/rtasm_ppc.h |  1 +
 src/gallium/auxiliary/tgsi/tgsi_ppc.c   |  8 ++++
 3 files changed, 58 insertions(+), 21 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.c b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
index a90b5587b0..e73ed71a0b 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
@@ -38,6 +38,7 @@
 #include <stdio.h>
 #include "util/u_memory.h"
 #include "pipe/p_debug.h"
+#include "rtasm_execmem.h"
 #include "rtasm_ppc.h"
 
 
@@ -46,9 +47,9 @@ ppc_init_func(struct ppc_function *p, unsigned max_inst)
 {
    uint i;
 
-   p->store = align_malloc(max_inst * PPC_INST_SIZE, 16);
    p->num_inst = 0;
-   p->max_inst = max_inst;
+   p->max_inst = 100; /* first guess at buffer size */
+   p->store = rtasm_exec_malloc(p->max_inst * PPC_INST_SIZE);
    p->reg_used = 0x0;
    p->fp_used = 0x0;
    p->vec_used = 0x0;
@@ -66,12 +67,19 @@ ppc_release_func(struct ppc_function *p)
 {
    assert(p->num_inst <= p->max_inst);
    if (p->store != NULL) {
-      align_free(p->store);
+      rtasm_exec_free(p->store);
    }
    p->store = NULL;
 }
 
 
+uint
+ppc_num_instructions(const struct ppc_function *p)
+{
+   return p->num_inst;
+}
+
+
 void (*ppc_get_func(struct ppc_function *p))(void)
 {
 #if 0
@@ -202,6 +210,35 @@ ppc_release_vec_register(struct ppc_function *p, int reg)
 }
 
 
+/**
+ * Append instruction to instruction buffer.  Grow buffer if out of room.
+ */
+static void
+emit_instruction(struct ppc_function *p, uint32_t inst_bits)
+{
+   if (!p->store)
+      return;  /* out of memory, drop the instruction */
+
+   if (p->num_inst == p->max_inst) {
+      /* allocate larger buffer */
+      uint32_t *newbuf;
+      p->max_inst *= 2;  /* 2x larger */
+      newbuf = rtasm_exec_malloc(p->max_inst * PPC_INST_SIZE);
+      if (newbuf) {
+         memcpy(newbuf, p->store, p->num_inst * PPC_INST_SIZE);
+      }
+      rtasm_exec_free(p->store);
+      p->store = newbuf;
+      if (!p->store) {
+         /* out of memory */
+         p->num_inst = 0;
+         return;
+      }
+   }
+
+   p->store[p->num_inst++] = inst_bits;
+}
+
 
 union vx_inst {
    uint32_t bits;
@@ -223,8 +260,7 @@ emit_vx(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB)
    inst.inst.vA = vA;
    inst.inst.vB = vB;
    inst.inst.op2 = op2;
-   p->store[p->num_inst++] = inst.bits;
-   assert(p->num_inst <= p->max_inst);
+   emit_instruction(p, inst.bits);
 };
 
 
@@ -250,8 +286,7 @@ emit_vxr(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB)
    inst.inst.vB = vB;
    inst.inst.rC = 0;
    inst.inst.op2 = op2;
-   p->store[p->num_inst++] = inst.bits;
-   assert(p->num_inst <= p->max_inst);
+   emit_instruction(p, inst.bits);
 };
 
 
@@ -277,8 +312,7 @@ emit_va(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB, uint vC)
    inst.inst.vB = vB;
    inst.inst.vC = vC;
    inst.inst.op2 = op2;
-   p->store[p->num_inst++] = inst.bits;
-   assert(p->num_inst <= p->max_inst);
+   emit_instruction(p, inst.bits);
 };
 
 
@@ -300,8 +334,7 @@ emit_i(struct ppc_function *p, uint op, uint li, uint aa, uint lk)
    inst.inst.li = li;
    inst.inst.aa = aa;
    inst.inst.lk = lk;
-   p->store[p->num_inst++] = inst.bits;
-   assert(p->num_inst <= p->max_inst);
+   emit_instruction(p, inst.bits);
 }
 
 
@@ -330,8 +363,7 @@ emit_xl(struct ppc_function *p, uint op, uint bo, uint bi, uint bh,
    inst.inst.bh = bh;
    inst.inst.op2 = op2;
    inst.inst.lk = lk;
-   p->store[p->num_inst++] = inst.bits;
-   assert(p->num_inst <= p->max_inst);
+   emit_instruction(p, inst.bits);
 }
 
 static INLINE void
@@ -373,8 +405,7 @@ emit_x(struct ppc_function *p, uint op, uint vrs, uint ra, uint rb, uint op2)
    inst.inst.rb = rb;
    inst.inst.op2 = op2;
    inst.inst.unused = 0x0;
-   p->store[p->num_inst++] = inst.bits;
-   assert(p->num_inst <= p->max_inst);
+   emit_instruction(p, inst.bits);
 }
 
 
@@ -398,8 +429,7 @@ emit_d(struct ppc_function *p, uint op, uint rt, uint ra, int si)
    inst.inst.rt = rt;
    inst.inst.ra = ra;
    inst.inst.si = (unsigned) (si & 0xffff);
-   p->store[p->num_inst++] = inst.bits;
-   assert(p->num_inst <= p->max_inst);
+   emit_instruction(p, inst.bits);
 };
 
 
@@ -428,8 +458,7 @@ emit_a(struct ppc_function *p, uint op, uint frt, uint fra, uint frb, uint op2,
    inst.inst.unused = 0x0;
    inst.inst.op2 = op2;
    inst.inst.rc = rc;
-   p->store[p->num_inst++] = inst.bits;
-   assert(p->num_inst <= p->max_inst);
+   emit_instruction(p, inst.bits);
 };
 
 
@@ -458,8 +487,7 @@ emit_xo(struct ppc_function *p, uint op, uint rt, uint ra, uint rb, uint oe,
    inst.inst.oe = oe;
    inst.inst.op2 = op2;
    inst.inst.rc = rc;
-   p->store[p->num_inst++] = inst.bits;
-   assert(p->num_inst <= p->max_inst);
+   emit_instruction(p, inst.bits);
 }
 
 
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.h b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
index 561e139bce..d0477dec94 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
@@ -64,6 +64,7 @@ struct ppc_function
 
 extern void ppc_init_func(struct ppc_function *p, unsigned max_inst);
 extern void ppc_release_func(struct ppc_function *p);
+extern uint ppc_num_instructions(const struct ppc_function *p);
 extern void (*ppc_get_func( struct ppc_function *p ))( void );
 extern void ppc_dump_func(const struct ppc_function *p);
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.c b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
index 5d13070922..a92b1902e3 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ppc.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
@@ -1315,6 +1315,14 @@ tgsi_emit_ppc(const struct tgsi_token *tokens,
 
    tgsi_parse_free( &parse );
 
+   if (ppc_num_instructions(func) == 0) {
+      /* ran out of memory for instructions */
+      ok = FALSE;
+   }
+
+   if (!ok)
+      debug_printf("TGSI->PPC translation failed\n");
+
    return ok;
 }
 
-- 
cgit v1.2.3


From 725ba94ce5701aa8690c7ab2ea792dda86cbbe7a Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 29 Oct 2008 16:35:59 -0600
Subject: gallium: no longer pass max_inst to ppc_init_func()

---
 src/gallium/auxiliary/draw/draw_vs_ppc.c | 2 +-
 src/gallium/auxiliary/rtasm/rtasm_ppc.c  | 2 +-
 src/gallium/auxiliary/rtasm/rtasm_ppc.h  | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/draw/draw_vs_ppc.c b/src/gallium/auxiliary/draw/draw_vs_ppc.c
index d720c7bbd5..8b75136144 100644
--- a/src/gallium/auxiliary/draw/draw_vs_ppc.c
+++ b/src/gallium/auxiliary/draw/draw_vs_ppc.c
@@ -197,7 +197,7 @@ draw_create_vs_ppc(struct draw_context *draw,
    vs->base.immediates = align_malloc(TGSI_EXEC_NUM_IMMEDIATES * 4 *
                                       sizeof(float), 16);
 
-   ppc_init_func( &vs->ppc_program, 2000 ); /* XXX fix limit */
+   ppc_init_func( &vs->ppc_program );
 
    if (!tgsi_emit_ppc( (struct tgsi_token *) vs->base.state.tokens,
 			&vs->ppc_program, 
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.c b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
index e73ed71a0b..6d11263be8 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
@@ -43,7 +43,7 @@
 
 
 void
-ppc_init_func(struct ppc_function *p, unsigned max_inst)
+ppc_init_func(struct ppc_function *p)
 {
    uint i;
 
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.h b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
index d0477dec94..afb4704c39 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
@@ -62,7 +62,7 @@ struct ppc_function
 
 
-extern void ppc_init_func(struct ppc_function *p, unsigned max_inst);
+extern void ppc_init_func(struct ppc_function *p);
 extern void ppc_release_func(struct ppc_function *p);
 extern uint ppc_num_instructions(const struct ppc_function *p);
 extern void (*ppc_get_func( struct ppc_function *p ))( void );
-- 
cgit v1.2.3


From f952aac1da432336f330122cacc30a87f52b4101 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 29 Oct 2008 16:56:28 -0600
Subject: gallium: grow SPE instruction buffer as needed

---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c | 57 +++++++++++++++++++++--------
 1 file changed, 41 insertions(+), 16 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index dea1aed032..f8568f690b 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -185,6 +185,34 @@ reg_name(int reg)
 }
 
 
+static void
+emit_instruction(struct spe_function *p, uint32_t inst_bits)
+{
+   if (!p->store)
+      return;  /* out of memory, drop the instruction */
+
+   if (p->num_inst == p->max_inst) {
+      /* allocate larger buffer */
+      uint32_t *newbuf;
+      p->max_inst *= 2;  /* 2x larger */
+      newbuf = align_malloc(p->max_inst * SPE_INST_SIZE, 16);
+      if (newbuf) {
+         memcpy(newbuf, p->store, p->num_inst * SPE_INST_SIZE);
+      }
+      align_free(p->store);
+      p->store = newbuf;
+      if (!p->store) {
+         /* out of memory */
+         p->num_inst = 0;
+         return;
+      }
+   }
+
+   p->store[p->num_inst++] = inst_bits;
+}
+
+
+
 static void emit_RR(struct spe_function *p, unsigned op, unsigned rT,
 		    unsigned rA, unsigned rB, const char *name)
 {
@@ -193,8 +221,7 @@ static void emit_RR(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.rB = rB;
     inst.inst.rA = rA;
     inst.inst.rT = rT;
-    p->store[p->num_inst++] = inst.bits;
-    assert(p->num_inst <= p->max_inst);
+    emit_instruction(p, inst.bits);
     if (p->print) {
        indent(p);
        printf("%s\t%s, %s, %s\n",
@@ -212,8 +239,7 @@ static void emit_RRR(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.rB = rB;
     inst.inst.rA = rA;
     inst.inst.rC = rC;
-    p->store[p->num_inst++] = inst.bits;
-    assert(p->num_inst <= p->max_inst);
+    emit_instruction(p, inst.bits);
     if (p->print) {
        indent(p);
        printf("%s\t%s, %s, %s, %s\n", rem_prefix(name), reg_name(rT),
@@ -230,8 +256,7 @@ static void emit_RI7(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.i7 = imm;
     inst.inst.rA = rA;
     inst.inst.rT = rT;
-    p->store[p->num_inst++] = inst.bits;
-    assert(p->num_inst <= p->max_inst);
+    emit_instruction(p, inst.bits);
     if (p->print) {
        indent(p);
        printf("%s\t%s, %s, 0x%x\n",
@@ -249,8 +274,7 @@ static void emit_RI8(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.i8 = imm;
     inst.inst.rA = rA;
     inst.inst.rT = rT;
-    p->store[p->num_inst++] = inst.bits;
-    assert(p->num_inst <= p->max_inst);
+    emit_instruction(p, inst.bits);
     if (p->print) {
        indent(p);
        printf("%s\t%s, %s, 0x%x\n",
@@ -268,8 +292,7 @@ static void emit_RI10(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.i10 = imm;
     inst.inst.rA = rA;
     inst.inst.rT = rT;
-    p->store[p->num_inst++] = inst.bits;
-    assert(p->num_inst <= p->max_inst);
+    emit_instruction(p, inst.bits);
     if (p->print) {
        indent(p);
        printf("%s\t%s, %s, 0x%x\n",
@@ -295,8 +318,7 @@ static void emit_RI16(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.op = op;
     inst.inst.i16 = imm;
     inst.inst.rT = rT;
-    p->store[p->num_inst++] = inst.bits;
-    assert(p->num_inst <= p->max_inst);
+    emit_instruction(p, inst.bits);
     if (p->print) {
        indent(p);
        printf("%s\t%s, 0x%x\n", rem_prefix(name), reg_name(rT), imm);
@@ -311,8 +333,7 @@ static void emit_RI18(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.op = op;
     inst.inst.i18 = imm;
     inst.inst.rT = rT;
-    p->store[p->num_inst++] = inst.bits;
-    assert(p->num_inst <= p->max_inst);
+    emit_instruction(p, inst.bits);
     if (p->print) {
        indent(p);
        printf("%s\t%s, 0x%x\n", rem_prefix(name), reg_name(rT), imm);
@@ -394,15 +415,19 @@ void _name (struct spe_function *p, int imm) \
 
 /**
  * Initialize an spe_function.
- * \param code_size  size of instruction buffer to allocate, in bytes.
+ * \param code_size  initial size of instruction buffer to allocate, in bytes.
+ *                   If zero, use a default.
  */
 void spe_init_func(struct spe_function *p, unsigned code_size)
 {
     unsigned int i;
 
-    p->store = align_malloc(code_size, 16);
+    if (!code_size)
+       code_size = 64;
+
     p->num_inst = 0;
     p->max_inst = code_size / SPE_INST_SIZE;
+    p->store = align_malloc(code_size, 16);
 
     p->set_count = 0;
     memset(p->regs, 0, SPE_NUM_REGS * sizeof(p->regs[0]));
-- 
cgit v1.2.3


From 7d7f0f170692962cf57d6893428f3a18f590c060 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 29 Oct 2008 17:02:30 -0600
Subject: gallium: fix copy&paste bug

---
 src/gallium/auxiliary/rtasm/rtasm_execmem.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_execmem.c b/src/gallium/auxiliary/rtasm/rtasm_execmem.c
index bb3b1a4c25..f16191cb61 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_execmem.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_execmem.c
@@ -83,7 +83,7 @@ rtasm_exec_malloc(size_t size)
 
    if (exec_heap) {
       size = (size + 31) & ~31;  /* next multiple of 32 bytes */
-      block = u_mmAllocMem( exec_heap, size, 5, 0 ); /* 5 -> 32-byte alignment */
+      block = mmAllocMem( exec_heap, size, 5, 0 ); /* 5 -> 32-byte alignment */
    }
 
    if (block)
-- 
cgit v1.2.3


From 766cb95a4564c48f35b5180155ab40320a68e371 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 29 Oct 2008 17:02:56 -0600
Subject: gallium: new sanity assertions in mmAllocMem()

---
 src/gallium/auxiliary/util/u_mm.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/util/u_mm.c b/src/gallium/auxiliary/util/u_mm.c
index 0f51dd5977..01dd67c810 100644
--- a/src/gallium/auxiliary/util/u_mm.c
+++ b/src/gallium/auxiliary/util/u_mm.c
@@ -172,6 +172,10 @@ mmAllocMem(struct mem_block *heap, int size, int align2, int startSearch)
    int startofs = 0;
    int endofs;
 
+   assert(size >= 0);
+   assert(align2 >= 0);
+   assert(align2 <= 12); /* sanity check, 2^12 (4KB) enough? */
+
    if (!heap || align2 < 0 || size <= 0)
       return NULL;
 
-- 
cgit v1.2.3


From 157ddc14183807834068687f02c67b66acf9effa Mon Sep 17 00:00:00 2001
From: Jonathan White <jwhite@tungstengraphics.com>
Date: Thu, 30 Oct 2008 11:22:20 -0600
Subject: cell:  Added check for PIPE_FLUSH_RENDER_CACHE to cell_flush to fix
 black blocks during st_readpixels due to a flush wait not happening in order
 to allow any previous rendering to complete.

---
 src/gallium/drivers/cell/ppu/cell_flush.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/cell/ppu/cell_flush.c b/src/gallium/drivers/cell/ppu/cell_flush.c
index 6596b72010..a64967b4b9 100644
--- a/src/gallium/drivers/cell/ppu/cell_flush.c
+++ b/src/gallium/drivers/cell/ppu/cell_flush.c
@@ -49,7 +49,7 @@ cell_flush(struct pipe_context *pipe, unsigned flags,
       flags |= CELL_FLUSH_WAIT;
    }
 
-   if (flags & PIPE_FLUSH_SWAPBUFFERS)
+   if (flags & (PIPE_FLUSH_SWAPBUFFERS | PIPE_FLUSH_RENDER_CACHE))
       flags |= CELL_FLUSH_WAIT;
 
    draw_flush( cell->draw );
-- 
cgit v1.2.3


From 711f8a1dd94e2e1e715615d947e03015ef972326 Mon Sep 17 00:00:00 2001
From: Robert Ellison <papillo@tungstengraphics.com>
Date: Thu, 30 Oct 2008 15:24:23 -0600
Subject: CELL: stencil bug fixes

Two definite bugs in stenciling were fixed.

The first, reversed registers in the generated Select Bytes (selb)
instruction, caused the stenciling INCR and DECR operations to
fail dramatically, putting new values in where old values were
supposed to be and vice versa.

The second caused stencil tiles to not be read and written from
main memory by the SPUs.  A per-spu flag, spu.read_depth, was used
to indicate whether the SPU should be reading depth tiles, and was set
only when depth was enabled.  A second flag, spu.read_stencil, was
set when stenciling was enabled, but never referenced.

As stenciling and depth are in the same tiles on the Cell, and there
is no corresponding TAG_WRITE_TILE_STENCIL to complement
TAG_WRITE_TILE_COLOR and TAG_WRITE_TILE_Z, I fixed this by
eliminating the unused "spu.read_stencil", renaming "spu.read_depth"
to "spu.read_depth_stencil", and setting it if either stenciling or
depth is enabled.

I also added an optimization to the fragment ops generation code
that avoids calculating stencil values and/or the stencil writemask
when the stencil operations are all KEEP.
---
 progs/trivial/tri-stencil.c                      | 13 ++++++++++--
 src/gallium/drivers/cell/ppu/cell_gen_fragment.c | 25 ++++++++++++++++++------
 src/gallium/drivers/cell/spu/spu_command.c       |  3 +--
 src/gallium/drivers/cell/spu/spu_main.h          |  3 +--
 src/gallium/drivers/cell/spu/spu_render.c        |  4 ++--
 src/gallium/drivers/cell/spu/spu_tri.c           |  2 +-
 6 files changed, 35 insertions(+), 15 deletions(-)

(limited to 'src/gallium')

diff --git a/progs/trivial/tri-stencil.c b/progs/trivial/tri-stencil.c
index 5edbef26ce..7686e16aef 100644
--- a/progs/trivial/tri-stencil.c
+++ b/progs/trivial/tri-stencil.c
@@ -49,7 +49,15 @@ static void Key(unsigned char key, int x, int y)
 
     switch (key) {
       case 27:
+        printf("Exiting...\n");
 	exit(1);
+      case 'r':
+        printf("Redisplaying...\n");
+        glutPostRedisplay();
+        break;
+      default:
+        printf("No such key '%c'...\n", key);
+        break;
     }
 }
 
@@ -89,7 +97,7 @@ static void Draw(void)
    glEnd();
 #endif
 
-#if 0
+#if 1
    glStencilFunc(GL_EQUAL, 1, 1);
    glStencilOp(GL_KEEP, GL_KEEP, GL_KEEP);
 
@@ -130,7 +138,8 @@ int main(int argc, char **argv)
 	exit(1);
     }
 
-    glutInitWindowPosition(0, 0); glutInitWindowSize( 300, 300);
+    glutInitWindowPosition(0, 0); 
+    glutInitWindowSize( 300, 300);
 
     type = GLUT_RGB | GLUT_SINGLE | GLUT_DEPTH | GLUT_STENCIL;
     glutInitDisplayMode(type);
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
index 4e1e53ecdc..8e4dd82404 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -1282,7 +1282,7 @@ gen_stencil_values(struct spe_function *f, unsigned int stencil_op,
       /* Add Word Immediate computes rT = rA + 10-bit signed immediate */
       spe_ai(f, newS_reg, fbS_reg, 1);
       /* Select from the current value or the new value based on the equality test */
-      spe_selb(f, newS_reg, fbS_reg, newS_reg, equals_reg);
+      spe_selb(f, newS_reg, newS_reg, fbS_reg, equals_reg);
 
       spe_release_register(f, equals_reg);
       break;
@@ -1295,7 +1295,7 @@ gen_stencil_values(struct spe_function *f, unsigned int stencil_op,
       /* Add Word Immediate with a (-1) value works */
       spe_ai(f, newS_reg, fbS_reg, -1);
       /* Select from the current value or the new value based on the equality test */
-      spe_selb(f, newS_reg, fbS_reg, newS_reg, equals_reg);
+      spe_selb(f, newS_reg, newS_reg, fbS_reg, equals_reg);
 
       spe_release_register(f, equals_reg);
       break;
@@ -1534,15 +1534,28 @@ gen_stencil_depth_test(struct spe_function *f,
     * meaning that we have to calculate the stencil values but do not
     * need to mask them), we can avoid generating code.  Don't forget
     * that we need to consider backfacing stencil, if enabled.
+    *
+    * Note that if the backface stencil is *not* enabled, the backface
+    * stencil will have the same values as the frontface stencil.
     */
-   if (dsa->stencil[0].write_mask == 0x0 && (!dsa->stencil[1].enabled || dsa->stencil[1].write_mask == 0x00)) {
-      /* Trivial: don't need to calculate stencil values, and don't need to 
-       * write them back to the framebuffer.
+   if (dsa->stencil[0].fail_op == PIPE_STENCIL_OP_KEEP &&
+       dsa->stencil[0].zfail_op == PIPE_STENCIL_OP_KEEP &&
+       dsa->stencil[0].zpass_op == PIPE_STENCIL_OP_KEEP &&
+       dsa->stencil[1].fail_op == PIPE_STENCIL_OP_KEEP &&
+       dsa->stencil[1].zfail_op == PIPE_STENCIL_OP_KEEP &&
+       dsa->stencil[1].zpass_op == PIPE_STENCIL_OP_KEEP) {
+       /* No changes to any stencil values */
+       need_to_calculate_stencil_values = false;
+       need_to_writemask_stencil_values = false;
+    }
+    else if (dsa->stencil[0].write_mask == 0x0 && dsa->stencil[1].write_mask == 0x0) {
+      /* All changes are writemasked out, so no need to calculate
+       * what those changes might be, and no need to write anything back.
        */
       need_to_calculate_stencil_values = false;
       need_to_writemask_stencil_values = false;
    }
-   else if (dsa->stencil[0].write_mask == 0xff && (!dsa->stencil[1].enabled || dsa->stencil[1].write_mask == 0xff)) {
+   else if (dsa->stencil[0].write_mask == 0xff && dsa->stencil[1].write_mask == 0xff) {
       /* Still trivial, but a little less so.  We need to write the stencil
        * values, but we don't need to mask them.
        */
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index 63818d4c46..d726622d94 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -244,8 +244,7 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
       }
    }
 
-   spu.read_depth = spu.depth_stencil_alpha.depth.enabled;
-   spu.read_stencil = spu.depth_stencil_alpha.stencil[0].enabled;
+   spu.read_depth_stencil = (spu.depth_stencil_alpha.depth.enabled || spu.depth_stencil_alpha.stencil[0].enabled);
 }
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 668af10be2..692790c9f3 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -160,8 +160,7 @@ struct spu_global
    tile_t ztile ALIGN16_ATTRIB;
 
    /** Read depth/stencil tiles? */
-   boolean read_depth;
-   boolean read_stencil;
+   boolean read_depth_stencil;
 
    /** Current tiles' status */
    ubyte cur_ctile_status, cur_ztile_status;
diff --git a/src/gallium/drivers/cell/spu/spu_render.c b/src/gallium/drivers/cell/spu/spu_render.c
index 5515bb55c9..7c225e2f27 100644
--- a/src/gallium/drivers/cell/spu/spu_render.c
+++ b/src/gallium/drivers/cell/spu/spu_render.c
@@ -98,7 +98,7 @@ my_tile(uint tx, uint ty)
 static INLINE void
 get_cz_tiles(uint tx, uint ty)
 {
-   if (spu.read_depth) {
+   if (spu.read_depth_stencil) {
       if (spu.cur_ztile_status != TILE_STATUS_CLEAR) {
          //printf("SPU %u: getting Z tile %u, %u\n", spu.init.id, tx, ty);
          get_tile(tx, ty, &spu.ztile, TAG_READ_TILE_Z, 1);
@@ -153,7 +153,7 @@ static INLINE void
 wait_put_cz_tiles(void)
 {
    wait_on_mask(1 << TAG_WRITE_TILE_COLOR);
-   if (spu.read_depth) {
+   if (spu.read_depth_stencil) {
       wait_on_mask(1 << TAG_WRITE_TILE_Z);
    }
 }
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 4caf7d6b61..5f908159bb 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -369,7 +369,7 @@ flush_spans(void)
    }
    ASSERT(spu.cur_ctile_status != TILE_STATUS_DEFINED);
 
-   if (spu.read_depth) {
+   if (spu.read_depth_stencil) {
       if (spu.cur_ztile_status == TILE_STATUS_GETTING) {
          /* wait for mfc_get() to complete */
          //printf("SPU: %u: waiting for ztile\n", spu.init.id);
-- 
cgit v1.2.3


From 443e102fdc8084dd2c73549c83de10524eb94b31 Mon Sep 17 00:00:00 2001
From: Jonathan White <jwhite@tungstengraphics.com>
Date: Thu, 30 Oct 2008 15:53:12 -0600
Subject: cell: Protected use of non-initialized untile buffers

---
 src/gallium/drivers/cell/ppu/cell_texture.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/cell/ppu/cell_texture.c b/src/gallium/drivers/cell/ppu/cell_texture.c
index 7734381c7e..28161d166e 100644
--- a/src/gallium/drivers/cell/ppu/cell_texture.c
+++ b/src/gallium/drivers/cell/ppu/cell_texture.c
@@ -376,8 +376,10 @@ cell_untwiddle_texture(struct pipe_screen *screen,
       }
       break;
    default:
-      printf("Cell: untwiddle unsupported texture format\n");
-      ;
+      {
+         ct->untiled_data[level] = NULL;
+         printf("Cell: untwiddle unsupported texture format\n");
+      }
    }
 
    pipe_buffer_unmap(screen, surface->buffer);
@@ -442,7 +444,8 @@ cell_tex_surface_release(struct pipe_screen *screen,
    struct cell_texture *ct = cell_texture((*s)->texture);
    const uint level = (*s)->level;
 
-   if ((*s)->usage & PIPE_BUFFER_USAGE_CPU_READ) {
+   if (((*s)->usage & PIPE_BUFFER_USAGE_CPU_READ) && (ct->untiled_data[level]))
+   {
       align_free(ct->untiled_data[level]);
       ct->untiled_data[level] = NULL;
    }
@@ -476,7 +479,7 @@ cell_surface_map(struct pipe_screen *screen,
       return NULL;
    else
    {
-      if (surface->usage & PIPE_BUFFER_USAGE_CPU_READ) {
+      if ((surface->usage & PIPE_BUFFER_USAGE_CPU_READ) && (ct->untiled_data[level])) {
          return (void *) ((ubyte *) ct->untiled_data[level] + surface->offset);
       }
       else {
-- 
cgit v1.2.3


From b81a7dc2d8ba09b48d5022cf9ff65f2fad890e11 Mon Sep 17 00:00:00 2001
From: Stephane Marchesin <marchesin@icps.u-strasbg.fr>
Date: Thu, 30 Oct 2008 23:52:59 +0100
Subject: gallivm: replace the temp parameters of the JIT function with
 alloca'ed temps. This avoids useless writes of temporary results.

---
 src/gallium/auxiliary/gallivm/gallivm_cpu.cpp |  6 ++--
 src/gallium/auxiliary/gallivm/storagesoa.cpp  | 44 +++++++++++++++++++--------
 src/gallium/auxiliary/gallivm/storagesoa.h    | 10 +++---
 src/gallium/auxiliary/gallivm/tgsitollvm.cpp  | 11 ++-----
 4 files changed, 41 insertions(+), 30 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp b/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp
index 3a2f2878a3..93a9748bdb 100644
--- a/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp
+++ b/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp
@@ -179,8 +179,7 @@ struct gallivm_cpu_engine * gallivm_global_cpu_engine()
 
 typedef void (*vertex_shader_runner)(void *ainputs,
                                      void *dests,
-                                     float (*aconsts)[4],
-                                     void *temps);
+                                     float (*aconsts)[4]);
 
 #define MAX_TGSI_VERTICES 4
 /*!
@@ -223,8 +222,7 @@ int gallivm_cpu_vs_exec(struct gallivm_prog *prog,
       /* run shader */
       runner(machine->Inputs,
              machine->Outputs,
-             (float (*)[4]) constants,
-             machine->Temps);
+             (float (*)[4]) constants);
 
       /* Unswizzle all output results
        */
diff --git a/src/gallium/auxiliary/gallivm/storagesoa.cpp b/src/gallium/auxiliary/gallivm/storagesoa.cpp
index 4fc075cf6d..e1e5cabcf5 100644
--- a/src/gallium/auxiliary/gallivm/storagesoa.cpp
+++ b/src/gallium/auxiliary/gallivm/storagesoa.cpp
@@ -48,13 +48,11 @@ using namespace llvm;
 StorageSoa::StorageSoa(llvm::BasicBlock *block,
                        llvm::Value *input,
                        llvm::Value *output,
-                       llvm::Value *consts,
-                       llvm::Value *temps)
+                       llvm::Value *consts)
    : m_block(block),
      m_input(input),
      m_output(output),
      m_consts(consts),
-     m_temps(temps),
      m_immediates(0),
      m_idx(0)
 {
@@ -169,7 +167,7 @@ std::vector<llvm::Value*> StorageSoa::constElement(llvm::IRBuilder<>* m_builder,
 {
    llvm::Value* res;
    std::vector<llvm::Value*> res2(4);
-   llvm::Value *xChannel, *yChannel, *zChannel, *wChannel;
+   llvm::Value *xChannel;
 
    xChannel = elementPointer(m_consts, idx, 0);
 
@@ -195,14 +193,15 @@ std::vector<llvm::Value*> StorageSoa::outputElement(llvm::Value *idx)
    return res;
 }
 
-std::vector<llvm::Value*> StorageSoa::tempElement(llvm::Value *idx)
+std::vector<llvm::Value*> StorageSoa::tempElement(llvm::IRBuilder<>* m_builder, int idx)
 {
    std::vector<llvm::Value*> res(4);
+   llvm::Value *temp = m_temps[idx];
 
-   res[0] = element(m_temps, idx, 0);
-   res[1] = element(m_temps, idx, 1);
-   res[2] = element(m_temps, idx, 2);
-   res[3] = element(m_temps, idx, 3);
+   res[0] = element(temp, constantInt(0), 0);
+   res[1] = element(temp, constantInt(0), 1);
+   res[2] = element(temp, constantInt(0), 2);
+   res[3] = element(temp, constantInt(0), 3);
 
    return res;
 }
@@ -326,7 +325,7 @@ std::vector<llvm::Value*> StorageSoa::load(enum tgsi_file_type type, int idx, in
       val = outputElement(realIndex);
       break;
    case TGSI_FILE_TEMPORARY:
-      val = tempElement(realIndex);
+      val = tempElement(m_builder, idx);
       break;
    case TGSI_FILE_CONSTANT:
       val = constElement(m_builder, realIndex);
@@ -355,19 +354,39 @@ std::vector<llvm::Value*> StorageSoa::load(enum tgsi_file_type type, int idx, in
    return res;
 }
 
+llvm::Value * StorageSoa::allocaTemp(llvm::IRBuilder<>* m_builder)
+{
+   VectorType *vector   = VectorType::get(Type::FloatTy, 4);
+   ArrayType  *vecArray = ArrayType::get(vector, 4);
+   AllocaInst *alloca = new AllocaInst(vecArray, "temp",
+                                       m_builder->GetInsertBlock());
+
+   return alloca;
+}
+
+
 void StorageSoa::store(enum tgsi_file_type type, int idx, const std::vector<llvm::Value*> &val,
-                       int mask)
+                       int mask, llvm::IRBuilder<>* m_builder)
 {
    llvm::Value *out = 0;
+   llvm::Value *realIndex = 0;
    switch(type) {
    case TGSI_FILE_OUTPUT:
       out = m_output;
+      realIndex = constantInt(idx);
       break;
    case TGSI_FILE_TEMPORARY:
-      out = m_temps;
+      // if that temp doesn't already exist, alloca it
+      if (m_temps.find(idx) == m_temps.end())
+         m_temps[idx] = allocaTemp(m_builder);
+
+      out = m_temps[idx];
+
+      realIndex = constantInt(0);
       break;
    case TGSI_FILE_INPUT:
       out = m_input;
+      realIndex = constantInt(idx);
       break;
    case TGSI_FILE_ADDRESS: {
       llvm::Value *addr = m_addresses[idx];
@@ -385,7 +404,6 @@ void StorageSoa::store(enum tgsi_file_type type, int idx, const std::vector<llvm
       assert(0);
       break;
    }
-   llvm::Value *realIndex = constantInt(idx);
    if ((mask & TGSI_WRITEMASK_X)) {
       llvm::Value *xChannel = elementPointer(out, realIndex, 0);
       new StoreInst(val[0], xChannel, false, m_block);
diff --git a/src/gallium/auxiliary/gallivm/storagesoa.h b/src/gallium/auxiliary/gallivm/storagesoa.h
index f21ca6ec43..56886f85e7 100644
--- a/src/gallium/auxiliary/gallivm/storagesoa.h
+++ b/src/gallium/auxiliary/gallivm/storagesoa.h
@@ -52,14 +52,13 @@ public:
    StorageSoa(llvm::BasicBlock *block,
               llvm::Value *input,
               llvm::Value *output,
-              llvm::Value *consts,
-              llvm::Value *temps);
+              llvm::Value *consts);
 
 
    std::vector<llvm::Value*> load(enum tgsi_file_type type, int idx, int swizzle, 
                                   llvm::IRBuilder<>* m_builder, llvm::Value *indIdx =0);
    void store(enum tgsi_file_type type, int idx, const std::vector<llvm::Value*> &val,
-              int mask);
+              int mask, llvm::IRBuilder<>* m_builder);
 
    void addImmediate(float *vec);
    void declareImmediates();
@@ -84,7 +83,7 @@ private:
    llvm::Value* unpackConstElement(llvm::IRBuilder<>* m_builder, llvm::Value *indIdx, int cc);
    std::vector<llvm::Value*> constElement(llvm::IRBuilder<>* m_builder, llvm::Value *indIdx);
    std::vector<llvm::Value*> outputElement(llvm::Value *indIdx);
-   std::vector<llvm::Value*> tempElement(llvm::Value *indIdx);
+   std::vector<llvm::Value*> tempElement(llvm::IRBuilder<>* m_builder, int idx);
    std::vector<llvm::Value*> immediateElement(llvm::Value *indIdx);
 private:
    llvm::BasicBlock *m_block;
@@ -92,12 +91,13 @@ private:
    llvm::Value *m_input;
    llvm::Value *m_output;
    llvm::Value *m_consts;
-   llvm::Value *m_temps;
+   std::map<int, llvm::Value*> m_temps;
    llvm::GlobalVariable *m_immediates;
 
    std::map<int, llvm::Value*> m_addresses;
 
    std::vector<std::vector<float> > m_immediatesToFlush;
+   llvm::Value * allocaTemp(llvm::IRBuilder<>* m_builder);
 
    mutable std::map<int, llvm::ConstantInt*> m_constInts;
    mutable char        m_name[32];
diff --git a/src/gallium/auxiliary/gallivm/tgsitollvm.cpp b/src/gallium/auxiliary/gallivm/tgsitollvm.cpp
index 1191a6cae9..c11b88af9e 100644
--- a/src/gallium/auxiliary/gallivm/tgsitollvm.cpp
+++ b/src/gallium/auxiliary/gallivm/tgsitollvm.cpp
@@ -53,7 +53,6 @@ static inline FunctionType *vertexShaderFunctionType()
    // [4 x <4 x float>] inputs,
    // [4 x <4 x float>] output,
    // [4 x [1 x float]] consts,
-   // [4 x <4 x float>] temps
 
    std::vector<const Type*> funcArgs;
    VectorType *vectorType = VectorType::get(Type::FloatTy, 4);
@@ -67,7 +66,6 @@ static inline FunctionType *vertexShaderFunctionType()
    funcArgs.push_back(vectorArrayPtr);//inputs
    funcArgs.push_back(vectorArrayPtr);//output
    funcArgs.push_back(constsArrayPtr);//consts
-   funcArgs.push_back(vectorArrayPtr);//temps
 
    FunctionType *functionType = FunctionType::get(
       /*Result=*/Type::VoidTy,
@@ -246,7 +244,6 @@ translate_instruction(llvm::Module *module,
          val = storage->constElement(src->SrcRegister.Index, indIdx);
       } else if (src->SrcRegister.File == TGSI_FILE_INPUT) {
          val = storage->inputElement(src->SrcRegister.Index, indIdx);
-      // FIXME we should not be generating elements for temporaries, this creates useless memory writes
       } else if (src->SrcRegister.File == TGSI_FILE_TEMPORARY) {
          val = storage->tempElement(src->SrcRegister.Index);
       } else if (src->SrcRegister.File == TGSI_FILE_OUTPUT) {
@@ -677,7 +674,6 @@ translate_instruction(llvm::Module *module,
 
       if (dst->DstRegister.File == TGSI_FILE_OUTPUT) {
          storage->setOutputElement(dst->DstRegister.Index, out, dst->DstRegister.WriteMask);
-      // FIXME we should not be generating elements for temporaries, this creates useless memory writes
       } else if (dst->DstRegister.File == TGSI_FILE_TEMPORARY) {
          storage->setTempElement(dst->DstRegister.Index, out, dst->DstRegister.WriteMask);
       } else if (dst->DstRegister.File == TGSI_FILE_ADDRESS) {
@@ -1027,7 +1023,8 @@ translate_instructionir(llvm::Module *module,
    for (int i = 0; i < inst->Instruction.NumDstRegs; ++i) {
       struct tgsi_full_dst_register *dst = &inst->FullDstRegisters[i];
       storage->store((enum tgsi_file_type)dst->DstRegister.File,
-                     dst->DstRegister.Index, out, dst->DstRegister.WriteMask);
+                     dst->DstRegister.Index, out, dst->DstRegister.WriteMask,
+		     instr->getIRBuilder() );
    }
 }
 
@@ -1122,8 +1119,6 @@ llvm::Module * tgsi_to_llvmir(struct gallivm_ir *ir,
    output->setName("outputs");
    Value *consts = args++;
    consts->setName("consts");
-   Value *temps = args++;
-   temps->setName("temps");
 
    BasicBlock *label_entry = BasicBlock::Create("entry", shader, 0);
 
@@ -1132,7 +1127,7 @@ llvm::Module * tgsi_to_llvmir(struct gallivm_ir *ir,
    fi = tgsi_default_full_instruction();
    fd = tgsi_default_full_declaration();
 
-   StorageSoa storage(label_entry, input, output, consts, temps);
+   StorageSoa storage(label_entry, input, output, consts);
    InstructionsSoa instr(mod, shader, label_entry, &storage);
 
    while(!tgsi_parse_end_of_tokens(&parse)) {
-- 
cgit v1.2.3


From 14e1505cce24ee294cb98683504cc4537c20f34a Mon Sep 17 00:00:00 2001
From: Robert Ellison <papillo@tungstengraphics.com>
Date: Thu, 30 Oct 2008 21:31:07 -0600
Subject: CELL: fix use of stencil value mask

The Cell stencil tests were completely ignoring the stencil value mask.
Now the original code paths are still used if the stencil value mask
is all 1s; but code to use the mask for the stencil value and reference
value comparisons is now emitted if the mask is not all 1s.
---
 src/gallium/drivers/cell/ppu/cell_gen_fragment.c | 154 ++++++++++++++++-------
 1 file changed, 112 insertions(+), 42 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
index 8e4dd82404..6e2a5d2980 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -1141,13 +1141,17 @@ gen_colormask(struct spe_function *f,
  * access to the Compare Immediate instructions where we don't in 
  * gen_depth_test(), which is what makes us very different.
  *
+ * There's some added complexity if state->value_mask is not all 1s:
+ * then the stencil and reference values must both be masked first.
+ *
  * The return value in the stencil_pass_reg is a bitmask of valid
  * fragments that also passed the stencil test.  The bitmask of valid
- * fragments that failed would be found in (mask_reg & ~stencil_pass_reg).
+ * fragments that failed would be found in (fragment_mask_reg & ~stencil_pass_reg).
  */
 static void
 gen_stencil_test(struct spe_function *f, const struct pipe_stencil_state *state, 
-                 unsigned int mask_reg, unsigned int fbS_reg, 
+                 unsigned int stencil_max_value,
+                 unsigned int fragment_mask_reg, unsigned int fbS_reg, 
                  unsigned int stencil_pass_reg)
 {
    /* Generate code that puts the set of passing fragments into the stencil_pass_reg
@@ -1155,68 +1159,134 @@ gen_stencil_test(struct spe_function *f, const struct pipe_stencil_state *state,
     */
    switch (state->func) {
    case PIPE_FUNC_EQUAL:
-      /* stencil_pass = mask & (s == reference) */
-      spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
-      spe_and(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+      if (state->value_mask == stencil_max_value) {
+         /* stencil_pass = fragment_mask & (s == reference) */
+         spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+         spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+      }
+      else {
+         /* stencil_pass = fragment_mask & ((s&mask) == (reference&mask)) */
+         unsigned int tmp_masked_stencil = spe_allocate_available_register(f);
+         spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask);
+         spe_compare_equal_uint(f, stencil_pass_reg, tmp_masked_stencil, state->value_mask & state->ref_value);
+         spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+         spe_release_register(f, tmp_masked_stencil);
+      }
       break;
 
    case PIPE_FUNC_NOTEQUAL:
-      /* stencil_pass = mask & ~(s == reference) */
-      spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
-      spe_andc(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+      if (state->value_mask == stencil_max_value) {
+         /* stencil_pass = fragment_mask & ~(s == reference) */
+         spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+         spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+      }
+      else {
+         /* stencil_pass = fragment_mask & ~((s&mask) == (reference&mask)) */
+         unsigned int tmp_masked_stencil = spe_allocate_available_register(f);
+         spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask);
+         spe_compare_equal_uint(f, stencil_pass_reg, tmp_masked_stencil, state->value_mask & state->ref_value);
+         spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+         spe_release_register(f, tmp_masked_stencil);
+      }
       break;
 
    case PIPE_FUNC_GREATER:
-      /* stencil_pass = mask & (s > reference) */
-      spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
-      spe_and(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+      if (state->value_mask == stencil_max_value) {
+         /* stencil_pass = fragment_mask & (s > reference) */
+         spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+         spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+      }
+      else {
+         /* stencil_pass = fragment_mask & ((s&mask) > (reference&mask)) */
+         unsigned int tmp_masked_stencil = spe_allocate_available_register(f);
+         spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask);
+         spe_compare_greater_uint(f, stencil_pass_reg, tmp_masked_stencil, state->value_mask & state->ref_value);
+         spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+         spe_release_register(f, tmp_masked_stencil);
+      }
       break;
 
-   case PIPE_FUNC_LESS: {
-      /* stencil_pass = mask & (reference > s) */
-      /* There's no convenient Compare Less Than Immediate instruction, so
-       * we'll have to do this one the harder way, by loading a register and 
-       * comparing directly.  Compare Logical Greater Than Word (clgt) 
-       * treats its operands as unsigned - no sign extension.
-       */
-      unsigned int tmp_reg = spe_allocate_available_register(f);
-      spe_load_uint(f, tmp_reg, state->ref_value);
-      spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg);
-      spe_and(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
-      spe_release_register(f, tmp_reg);
+   case PIPE_FUNC_LESS:
+      if (state->value_mask == stencil_max_value) {
+         /* stencil_pass = fragment_mask & (reference > s) */
+         /* There's no convenient Compare Less Than Immediate instruction, so
+          * we'll have to do this one the harder way, by loading a register and 
+          * comparing directly.  Compare Logical Greater Than Word (clgt) 
+          * treats its operands as unsigned - no sign extension.
+          */
+         unsigned int tmp_reg = spe_allocate_available_register(f);
+         spe_load_uint(f, tmp_reg, state->ref_value);
+         spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg);
+         spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+         spe_release_register(f, tmp_reg);
+      }
+      else {
+         /* stencil_pass = fragment_mask & ((reference&mask) > (s&mask)) */
+         unsigned int tmp_reg = spe_allocate_available_register(f);
+         unsigned int tmp_masked_stencil = spe_allocate_available_register(f);
+         spe_load_uint(f, tmp_reg, state->value_mask & state->ref_value);
+         spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask);
+         spe_clgt(f, stencil_pass_reg, tmp_reg, tmp_masked_stencil);
+         spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+         spe_release_register(f, tmp_reg);
+         spe_release_register(f, tmp_masked_stencil);
+      }
       break;
-   }
 
    case PIPE_FUNC_LEQUAL:
-      /* stencil_pass = mask & (s <= reference) = mask & ~(s > reference) */
-      spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
-      spe_andc(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+      if (state->value_mask == stencil_max_value) {
+         /* stencil_pass = fragment_mask & (s <= reference) 
+          *              = fragment_mask & ~(s > reference) */
+         spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+         spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+      }
+      else {
+         /* stencil_pass = fragment_mask & ~((s&mask) > (reference&mask)) */
+         unsigned int tmp_masked_stencil = spe_allocate_available_register(f);
+         spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask);
+         spe_compare_greater_uint(f, stencil_pass_reg, tmp_masked_stencil, state->value_mask & state->ref_value);
+         spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+         spe_release_register(f, tmp_masked_stencil);
+      }
       break;
 
-   case PIPE_FUNC_GEQUAL: {
-      /* stencil_pass = mask & (s >= reference) = mask & ~(reference > s) */
-      /* As above, we have to do this by loading a register */
-      unsigned int tmp_reg = spe_allocate_available_register(f);
-      spe_load_uint(f, tmp_reg, state->ref_value);
-      spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg);
-      spe_andc(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
-      spe_release_register(f, tmp_reg);
+   case PIPE_FUNC_GEQUAL:
+      if (state->value_mask == stencil_max_value) {
+         /* stencil_pass = fragment_mask & (s >= reference)
+          *              = fragment_mask & ~(reference > s) */
+         /* As above, we have to do this by loading a register */
+         unsigned int tmp_reg = spe_allocate_available_register(f);
+         spe_load_uint(f, tmp_reg, state->ref_value);
+         spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg);
+         spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+         spe_release_register(f, tmp_reg);
+      }
+      else {
+         /* stencil_pass = fragment_mask & ~((reference&mask) > (s&mask)) */
+         unsigned int tmp_reg = spe_allocate_available_register(f);
+         unsigned int tmp_masked_stencil = spe_allocate_available_register(f);
+         spe_load_uint(f, tmp_reg, state->ref_value & state->value_mask);
+         spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask);
+         spe_clgt(f, stencil_pass_reg, tmp_reg, tmp_masked_stencil);
+         spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
+         spe_release_register(f, tmp_reg);
+         spe_release_register(f, tmp_masked_stencil);
+      }
       break;
-   }
 
    case PIPE_FUNC_NEVER:
-      /* stencil_pass = mask & 0 = 0 */
+      /* stencil_pass = fragment_mask & 0 = 0 */
       spe_load_uint(f, stencil_pass_reg, 0);
       break;
 
    case PIPE_FUNC_ALWAYS:
-      /* stencil_pass = mask & 1 = mask */
-      spe_move(f, stencil_pass_reg, mask_reg);
+      /* stencil_pass = fragment_mask & 1 = fragment_mask */
+      spe_move(f, stencil_pass_reg, fragment_mask_reg);
       break;
    }
 
    /* The fragments that passed the stencil test are now in stencil_pass_reg.
-    * The fragments that failed would be (mask_reg & ~stencil_pass_reg).
+    * The fragments that failed would be (fragment_mask_reg & ~stencil_pass_reg).
     */
 }
 
@@ -1596,7 +1666,7 @@ gen_stencil_depth_test(struct spe_function *f,
     */
    spe_comment(f, 0, "Running basic stencil test");
    stencil_pass_reg = spe_allocate_available_register(f);
-   gen_stencil_test(f, &dsa->stencil[0], mask_reg, fbS_reg, stencil_pass_reg);
+   gen_stencil_test(f, &dsa->stencil[0], 0xff, mask_reg, fbS_reg, stencil_pass_reg);
 
    /* If two-sided stenciling is on, generate code to run the stencil
     * test on the backfacing stencil as well, and combine the two results
@@ -1605,7 +1675,7 @@ gen_stencil_depth_test(struct spe_function *f,
    if (dsa->stencil[1].enabled) {
       unsigned int temp_reg = spe_allocate_available_register(f);
       spe_comment(f, 0, "Running backface stencil test");
-      gen_stencil_test(f, &dsa->stencil[1], mask_reg, fbS_reg, temp_reg);
+      gen_stencil_test(f, &dsa->stencil[1], 0xff, mask_reg, fbS_reg, temp_reg);
       spe_selb(f, stencil_pass_reg, stencil_pass_reg, temp_reg, facing_reg);
       spe_release_register(f, temp_reg);
    }
-- 
cgit v1.2.3


From 82e1026c30dd416231df66daf9b2f28bfc1f1cd6 Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Sat, 18 Oct 2008 13:31:00 +0900
Subject: gallium: Fix msvc warning.

---
 src/gallium/auxiliary/util/u_tile.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/util/u_tile.c b/src/gallium/auxiliary/util/u_tile.c
index 853c503f4f..00c12badf2 100644
--- a/src/gallium/auxiliary/util/u_tile.c
+++ b/src/gallium/auxiliary/util/u_tile.c
@@ -504,7 +504,7 @@ a8_put_tile_rgba(ubyte *dst,
       for (j = 0; j < w; j++, pRow += 4) {
          unsigned a;
          a = float_to_ubyte(pRow[3]);
-         *dst++ = a;
+         *dst++ = (ubyte) a;
       }
       p += src_stride;
    }
-- 
cgit v1.2.3


From 467c4760b337a541c7af27f1ed3bd5c4ecba316f Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Tue, 28 Oct 2008 16:10:55 +0900
Subject: gallium: Ensure refcounts of live objects are never zero.

---
 src/gallium/include/pipe/p_inlines.h | 30 ++++++++++++++++++------------
 1 file changed, 18 insertions(+), 12 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/include/pipe/p_inlines.h b/src/gallium/include/pipe/p_inlines.h
index d70de8e301..5e79b7f485 100644
--- a/src/gallium/include/pipe/p_inlines.h
+++ b/src/gallium/include/pipe/p_inlines.h
@@ -82,11 +82,14 @@ static INLINE void
 pipe_surface_reference(struct pipe_surface **ptr, struct pipe_surface *surf)
 {
    /* bump the refcount first */
-   if (surf) 
+   if (surf) {
+      assert(surf->refcount);
       surf->refcount++;
+   }
 
    if (*ptr) {
-
+      assert((*ptr)->refcount);
+      
       /* There are currently two sorts of surfaces... This needs to be
        * fixed so that all surfaces are views into a texture.
        */
@@ -113,11 +116,16 @@ winsys_buffer_reference(struct pipe_winsys *winsys,
 		      struct pipe_buffer **ptr,
 		      struct pipe_buffer *buf)
 {
-   if (buf) 
+   if (buf) {
+      assert(buf->refcount);
       buf->refcount++;
+   }
 
-   if (*ptr && --(*ptr)->refcount == 0)
-      winsys->buffer_destroy( winsys, *ptr );
+   if (*ptr) {
+      assert((*ptr)->refcount);
+      if(--(*ptr)->refcount == 0)
+         winsys->buffer_destroy( winsys, *ptr );
+   }
 
    *ptr = buf;
 }
@@ -133,12 +141,15 @@ pipe_texture_reference(struct pipe_texture **ptr,
 {
    assert(ptr);
 
-   if (pt) 
+   if (pt) { 
+      assert(pt->refcount);
       pt->refcount++;
+   }
 
    if (*ptr) {
       struct pipe_screen *screen = (*ptr)->screen;
       assert(screen);
+      assert((*ptr)->refcount);
       screen->texture_release(screen, ptr);
 
       assert(!*ptr);
@@ -154,6 +165,7 @@ pipe_texture_release(struct pipe_texture **ptr)
    struct pipe_screen *screen;
    assert(ptr);
    screen = (*ptr)->screen;
+   assert((*ptr)->refcount);
    screen->texture_release(screen, ptr);
    *ptr = NULL;
 }
@@ -176,12 +188,6 @@ pipe_user_buffer_create( struct pipe_screen *screen, void *ptr, unsigned size )
    return screen->winsys->user_buffer_create(screen->winsys, ptr, size);
 }
 
-static INLINE void
-pipe_buffer_destroy( struct pipe_screen *screen, struct pipe_buffer *buf )
-{
-   screen->winsys->buffer_destroy(screen->winsys, buf);
-}
-
 static INLINE void *
 pipe_buffer_map(struct pipe_screen *screen,
                 struct pipe_buffer *buf,
-- 
cgit v1.2.3


From 28a2edb7389107cd46eb382a44d339dd7972310a Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Tue, 28 Oct 2008 16:11:09 +0900
Subject: pipebuffer: Ensure refcounts of live buffer objects are never zero.

---
 src/gallium/auxiliary/pipebuffer/pb_buffer.h        | 15 ++++++++++++---
 src/gallium/auxiliary/pipebuffer/pb_bufmgr_fenced.c |  3 +--
 2 files changed, 13 insertions(+), 5 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/pipebuffer/pb_buffer.h b/src/gallium/auxiliary/pipebuffer/pb_buffer.h
index 8505d333bd..19db8a6a91 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_buffer.h
+++ b/src/gallium/auxiliary/pipebuffer/pb_buffer.h
@@ -177,12 +177,16 @@ pb_get_base_buffer( struct pb_buffer *buf,
 }
 
 
+/**
+ * Don't call this directly. Use pb_reference instead.
+ */
 static INLINE void 
 pb_destroy(struct pb_buffer *buf)
 {
    assert(buf);
    if(!buf)
       return;
+   assert(buf->base.refcount == 0);
    buf->vtbl->destroy(buf);
 }
 
@@ -193,11 +197,16 @@ static INLINE void
 pb_reference(struct pb_buffer **dst,
              struct pb_buffer *src)
 {
-   if (src) 
+   if (src) {
+      assert(src->base.refcount);
       src->base.refcount++;
+   }
 
-   if (*dst && --(*dst)->base.refcount == 0)
-      pb_destroy( *dst );
+   if (*dst) {
+      assert((*dst)->base.refcount);
+      if(--(*dst)->base.refcount == 0)
+         pb_destroy( *dst );
+   }
 
    *dst = src;
 }
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_fenced.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_fenced.c
index 633ee70a75..e2594ea236 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_fenced.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_fenced.c
@@ -86,8 +86,7 @@ fenced_bufmgr_create_buffer(struct pb_manager *mgr,
    
    fenced_buf = fenced_buffer_create(fenced_mgr->fenced_list, buf);
    if(!fenced_buf) {
-      assert(buf->base.refcount == 1);
-      pb_destroy(buf);
+      pb_reference(&buf, NULL);
    }
    
    return fenced_buf;
-- 
cgit v1.2.3


From 95d108416c27f45f4de1178abbe6797cd128ef6a Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Fri, 31 Oct 2008 19:50:43 +0900
Subject: gallium: Fix typo.

---
 src/gallium/auxiliary/util/u_time.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/util/u_time.c b/src/gallium/auxiliary/util/u_time.c
index bf7d1d1c8d..57b80e5604 100644
--- a/src/gallium/auxiliary/util/u_time.c
+++ b/src/gallium/auxiliary/util/u_time.c
@@ -200,7 +200,7 @@ util_time_timeout(const struct util_time *start,
 }
 
 
-#if defined(PIPE_SUBSYSYEM_WINDOWS_DISPLAY)
+#if defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY)
 void util_time_sleep(unsigned usecs)
 {
    LONGLONG start, curr, end;
-- 
cgit v1.2.3


From bdf24007cae9ce485ef123e935eb87c7cba4e0e5 Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Mon, 3 Nov 2008 20:50:14 +0900
Subject: gallium: WinCE portability fixes.

---
 src/gallium/auxiliary/util/p_debug.c | 39 +++++++++++++++++++++++++++++++++++-
 src/gallium/auxiliary/util/u_math.h  |  2 +-
 2 files changed, 39 insertions(+), 2 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/util/p_debug.c b/src/gallium/auxiliary/util/p_debug.c
index 3ed8bdfdf3..a1a51d7ef2 100644
--- a/src/gallium/auxiliary/util/p_debug.c
+++ b/src/gallium/auxiliary/util/p_debug.c
@@ -36,6 +36,13 @@
 #include <windows.h>
 #include <winddi.h>
 
+#elif defined(PIPE_SUBSYSTEM_WINDOWS_CE)
+
+#include <stdio.h> 
+#include <stdlib.h> 
+#include <windows.h> 
+#include <types.h> 
+
 #elif defined(PIPE_SUBSYSTEM_WINDOWS_USER)
 
 #ifndef WIN32_LEAN_AND_MEAN
@@ -98,7 +105,35 @@ void _debug_vprintf(const char *format, va_list ap)
       OutputDebugStringA(buf);
       buf[0] = '\0';
    }
-#elif defined(PIPE_SUBSYSTEM_WINDOWS_CE) || defined(PIPE_SUBSYSTEM_WINDOWS_MINIPORT) 
+#elif defined(PIPE_SUBSYSTEM_WINDOWS_CE)
+   wchar_t *wide_format;
+   long wide_str_len;   
+   char buf[512];   
+   int ret;   
+#if (_WIN32_WCE < 600)
+   ret = vsprintf(buf, format, ap);   
+   if(ret < 0){   
+       sprintf(buf, "Cant handle debug print!");   
+       ret = 25;
+   }
+#else
+   ret = vsprintf_s(buf, 512, format, ap);   
+   if(ret < 0){   
+       sprintf_s(buf, 512, "Cant handle debug print!");   
+       ret = 25;
+   }
+#endif
+   buf[ret] = '\0';   
+   /* buf is a formatted narrow string - convert it to wchar_t for printing */
+   wide_str_len = MultiByteToWideChar(CP_ACP, 0, (const char *) buf, -1, NULL, 0);   
+   wide_format = (wchar_t *) malloc((wide_str_len+1) * sizeof(wchar_t));   
+   if (wide_format) {   
+      MultiByteToWideChar(CP_ACP, 0, (const char *) buf, -1,   
+            wide_format, wide_str_len);   
+      NKDbgPrintfW(wide_format, wide_format);   
+      free(wide_format);   
+   } 
+#elif defined(PIPE_SUBSYSTEM_WINDOWS_MINIPORT)
    /* TODO */
 #else /* !PIPE_SUBSYSTEM_WINDOWS */
    vfprintf(stderr, format, ap);
@@ -637,6 +672,7 @@ void
 debug_dump_surface_bmp(const char *filename,
                        struct pipe_surface *surface)
 {
+#ifndef PIPE_SUBSYSTEM_WINDOWS_MINIPORT
    struct util_stream *stream;
    unsigned surface_usage;
    struct bmp_file_header bmfh;
@@ -703,6 +739,7 @@ error2:
    FREE(rgba);
 error1:
    ;
+#endif
 }
 
 #endif
diff --git a/src/gallium/auxiliary/util/u_math.h b/src/gallium/auxiliary/util/u_math.h
index be7303e550..d2eaa2e7f7 100644
--- a/src/gallium/auxiliary/util/u_math.h
+++ b/src/gallium/auxiliary/util/u_math.h
@@ -68,7 +68,7 @@ __inline double ceil(double val)
    return ceil_val;
 }
 
-#ifndef PIPE_SUBSYSTEM_WINDOWS_CE
+#ifndef PIPE_SUBSYSTEM_WINDOWS_CE_OGL
 __inline double floor(double val)
 {
    double floor_val;
-- 
cgit v1.2.3


From 7b42a5d634d32c3f15f3a3535b2b9328dfca49bf Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Sat, 25 Oct 2008 03:35:01 +0900
Subject: gallium: Read from PIPE_FORMAT_Z32_FLOAT.

Mainly for debugging purposes for now.
---
 src/gallium/auxiliary/util/u_tile.c | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/util/u_tile.c b/src/gallium/auxiliary/util/u_tile.c
index 00c12badf2..fa683b6774 100644
--- a/src/gallium/auxiliary/util/u_tile.c
+++ b/src/gallium/auxiliary/util/u_tile.c
@@ -769,6 +769,32 @@ z24s8_get_tile_rgba(const unsigned *src,
 }
 
 
+/*** PIPE_FORMAT_Z32_FLOAT ***/
+
+/**
+ * Return each Z value as four floats (copied as-is; no clamping to [0,1]).
+ */
+static void
+z32f_get_tile_rgba(const float *src,
+                   unsigned w, unsigned h,
+                   float *p,
+                   unsigned dst_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      float *pRow = p;
+      for (j = 0; j < w; j++, pRow += 4) {
+         pRow[0] =
+         pRow[1] =
+         pRow[2] =
+         pRow[3] = *src++;
+      }
+      p += dst_stride;
+   }
+}
+
+
 /*** PIPE_FORMAT_YCBCR / PIPE_FORMAT_YCBCR_REV ***/
 
 /**
@@ -913,6 +939,9 @@ pipe_tile_raw_to_rgba(enum pipe_format format,
    case PIPE_FORMAT_Z24S8_UNORM:
       z24s8_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride);
       break;
+   case PIPE_FORMAT_Z32_FLOAT:
+      z32f_get_tile_rgba((float *) src, w, h, dst, dst_stride);
+      break;
    case PIPE_FORMAT_YCBCR:
       ycbcr_get_tile_rgba((ushort *) src, w, h, dst, dst_stride, FALSE);
       break;
-- 
cgit v1.2.3


From 95438727ddc4012d6e2db843d7173607b2a23b56 Mon Sep 17 00:00:00 2001
From: Michal Krol <michal@tungstengraphics.com>
Date: Tue, 26 Aug 2008 17:40:24 +0200
Subject: gallium: Silence compiler warnings on Windows.

---
 src/gallium/auxiliary/util/u_tile.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/util/u_tile.c b/src/gallium/auxiliary/util/u_tile.c
index fa683b6774..32f6b072a0 100644
--- a/src/gallium/auxiliary/util/u_tile.c
+++ b/src/gallium/auxiliary/util/u_tile.c
@@ -460,7 +460,7 @@ l8_put_tile_rgba(ubyte *dst,
       for (j = 0; j < w; j++, pRow += 4) {
          unsigned r;
          r = float_to_ubyte(pRow[0]);
-         *dst++ = r;
+         *dst++ = (ubyte) r;
       }
       p += src_stride;
    }
@@ -634,7 +634,7 @@ i8_put_tile_rgba(ubyte *dst,
       for (j = 0; j < w; j++, pRow += 4) {
          unsigned r;
          r = float_to_ubyte(pRow[0]);
-         *dst++ = r;
+         *dst++ = (ubyte) r;
       }
       p += src_stride;
    }
-- 
cgit v1.2.3


From 502974b345dae8a3ca641083b4df5183b04ca825 Mon Sep 17 00:00:00 2001
From: michal <michal@quad.(none)>
Date: Wed, 5 Nov 2008 11:48:56 +0100
Subject: tgsi: Implement OPCODE_TRUNC.

---
 src/gallium/auxiliary/tgsi/tgsi_sse2.c | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
index 4681b29f52..c115956c5d 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_sse2.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
@@ -657,6 +657,17 @@ emit_f2it(
       make_xmm( xmm ) );
 }
 
+static void
+emit_i2f(
+   struct x86_function *func,
+   unsigned xmm )
+{
+   sse2_cvtdq2ps(
+      func,
+      make_xmm( xmm ),
+      make_xmm( xmm ) );
+}
+
 static void PIPE_CDECL
 flr4f(
    float *store )
@@ -1967,7 +1978,12 @@ emit_instruction(
       break;
 
    case TGSI_OPCODE_TRUNC:
-      return 0;
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         emit_f2it( func, 0 );
+         emit_i2f( func, 0 );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
       break;
 
    case TGSI_OPCODE_SHL:
-- 
cgit v1.2.3


From 5a0299875c7a4a9a0cb2cf55777c92c1b17d528b Mon Sep 17 00:00:00 2001
From: michal <michal@quad.(none)>
Date: Wed, 5 Nov 2008 11:58:11 +0100
Subject: draw: Implement TGSI_OPCODE_TRUNC.

---
 src/gallium/auxiliary/draw/draw_vs_aos.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c
index 87232865e2..a6880685db 100644
--- a/src/gallium/auxiliary/draw/draw_vs_aos.c
+++ b/src/gallium/auxiliary/draw/draw_vs_aos.c
@@ -1632,6 +1632,17 @@ static boolean emit_SUB( struct aos_compilation *cp, const struct tgsi_full_inst
    return TRUE;
 }
 
+static boolean emit_TRUNC( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
+{
+   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
+   struct x86_reg tmp0 = aos_get_xmm_reg(cp);
+
+   sse2_cvttps2dq(cp->func, tmp0, arg0);
+   sse2_cvtdq2ps(cp->func, tmp0, tmp0);
+
+   store_dest(cp, &op->FullDstRegisters[0], tmp0);
+   return TRUE;
+}
 
 static boolean emit_XPD( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) 
 {
@@ -1770,6 +1781,9 @@ emit_instruction( struct aos_compilation *cp,
    case TGSI_OPCODE_SIN:
       return emit_SIN(cp, inst);
 
+   case TGSI_OPCODE_TRUNC:
+      return emit_TRUNC(cp, inst);
+
    case TGSI_OPCODE_END:
       return TRUE;
 
-- 
cgit v1.2.3


From de2ace201fe26d36a2a75211a7d8447940a47fbe Mon Sep 17 00:00:00 2001
From: michal <michal@quad.(none)>
Date: Wed, 5 Nov 2008 11:48:56 +0100
Subject: tgsi: Implement OPCODE_TRUNC.

---
 src/gallium/auxiliary/tgsi/tgsi_sse2.c | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
index f79170b9d6..47e52c8424 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_sse2.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
@@ -784,6 +784,17 @@ emit_f2it(
       make_xmm( xmm ) );
 }
 
+static void
+emit_i2f(
+   struct x86_function *func,
+   unsigned xmm )
+{
+   sse2_cvtdq2ps(
+      func,
+      make_xmm( xmm ),
+      make_xmm( xmm ) );
+}
+
 static void PIPE_CDECL
 flr4f(
    float *store )
@@ -2104,7 +2115,12 @@ emit_instruction(
       break;
 
    case TGSI_OPCODE_TRUNC:
-      return 0;
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         emit_f2it( func, 0 );
+         emit_i2f( func, 0 );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
       break;
 
    case TGSI_OPCODE_SHL:
-- 
cgit v1.2.3


From 7115b79b77e541f3eb81db00f6f0c34a0f224feb Mon Sep 17 00:00:00 2001
From: michal <michal@quad.(none)>
Date: Wed, 5 Nov 2008 11:58:11 +0100
Subject: draw: Implement TGSI_OPCODE_TRUNC.

---
 src/gallium/auxiliary/draw/draw_vs_aos.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c
index 87232865e2..a6880685db 100644
--- a/src/gallium/auxiliary/draw/draw_vs_aos.c
+++ b/src/gallium/auxiliary/draw/draw_vs_aos.c
@@ -1632,6 +1632,17 @@ static boolean emit_SUB( struct aos_compilation *cp, const struct tgsi_full_inst
    return TRUE;
 }
 
+static boolean emit_TRUNC( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
+{
+   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
+   struct x86_reg tmp0 = aos_get_xmm_reg(cp);
+
+   sse2_cvttps2dq(cp->func, tmp0, arg0);
+   sse2_cvtdq2ps(cp->func, tmp0, tmp0);
+
+   store_dest(cp, &op->FullDstRegisters[0], tmp0);
+   return TRUE;
+}
 
 static boolean emit_XPD( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) 
 {
@@ -1770,6 +1781,9 @@ emit_instruction( struct aos_compilation *cp,
    case TGSI_OPCODE_SIN:
       return emit_SIN(cp, inst);
 
+   case TGSI_OPCODE_TRUNC:
+      return emit_TRUNC(cp, inst);
+
    case TGSI_OPCODE_END:
       return TRUE;
 
-- 
cgit v1.2.3


From 64a9908816a95849557678c8cab6071aa086f7e2 Mon Sep 17 00:00:00 2001
From: Jakob Bornecrantz <jakob@tungstengraphics.com>
Date: Wed, 5 Nov 2008 16:49:48 +0100
Subject: i915: Remove faulty assert

---
 src/gallium/winsys/drm/intel/dri/intel_swapbuffers.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'src/gallium')

diff --git a/src/gallium/winsys/drm/intel/dri/intel_swapbuffers.c b/src/gallium/winsys/drm/intel/dri/intel_swapbuffers.c
index 8a18bfd9a4..34ad7eebe1 100644
--- a/src/gallium/winsys/drm/intel/dri/intel_swapbuffers.c
+++ b/src/gallium/winsys/drm/intel/dri/intel_swapbuffers.c
@@ -94,7 +94,6 @@ intelDisplaySurface(__DRIdrawablePrivate *dPriv,
       int i;
 
       ASSERT(surf->buffer);
-      ASSERT(surf->cpp == cpp);
 
       DBG(SWAP, "screen pitch %d  src surface pitch %d\n",
 	  pitch, surf->stride);
-- 
cgit v1.2.3


From fc3b361191c35d2b0b072c08e39b1e5b26d7e2a6 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 5 Nov 2008 08:57:11 -0700
Subject: gallium: disable some debug output

---
 src/gallium/auxiliary/draw/draw_vs_aos.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c
index a6880685db..6141ba9cbf 100644
--- a/src/gallium/auxiliary/draw/draw_vs_aos.c
+++ b/src/gallium/auxiliary/draw/draw_vs_aos.c
@@ -2190,7 +2190,8 @@ static struct draw_vs_varient *varient_aos_sse( struct draw_vertex_shader *vs,
    if (!vaos->buffer)
       goto fail;
 
-   debug_printf("nr_vb: %d const: %x\n", vaos->nr_vb, vaos->base.key.const_vbuffers);
+   if (0)
+      debug_printf("nr_vb: %d const: %x\n", vaos->nr_vb, vaos->base.key.const_vbuffers);
 
 #if 0
    tgsi_dump(vs->state.tokens, 0);
-- 
cgit v1.2.3


From 05a17f83b0a6549fde41540f9075505e81ab08d3 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 5 Nov 2008 08:58:40 -0700
Subject: gallium: added some debug code (disabled)

---
 src/gallium/auxiliary/draw/draw_pt.c | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/draw/draw_pt.c b/src/gallium/auxiliary/draw/draw_pt.c
index 87ec6ae20c..b98c0a0ecf 100644
--- a/src/gallium/auxiliary/draw/draw_pt.c
+++ b/src/gallium/auxiliary/draw/draw_pt.c
@@ -33,6 +33,8 @@
 #include "draw/draw_context.h"
 #include "draw/draw_private.h"
 #include "draw/draw_pt.h"
+#include "draw/draw_vs.h"
+#include "tgsi/tgsi_dump.h"
 
 static unsigned trim( unsigned count, unsigned first, unsigned incr )
 {
@@ -195,6 +197,28 @@ draw_arrays(struct draw_context *draw, unsigned prim,
       draw->reduced_prim = reduced_prim;
    }
 
+#if 0
+   {
+      int i;
+      debug_printf("draw_arrays(prim=%u start=%u count=%u):\n",
+                   prim, start, count);
+      tgsi_dump(draw->vs.vertex_shader->state.tokens, 0);
+      debug_printf("Elements:\n");
+      for (i = 0; i < draw->pt.nr_vertex_elements; i++) {
+         debug_printf("  format=%s comps=%u\n",
+                      pf_name(draw->pt.vertex_element[i].src_format),
+                      draw->pt.vertex_element[i].nr_components);
+      }
+      debug_printf("Buffers:\n");
+      for (i = 0; i < draw->pt.nr_vertex_buffers; i++) {
+         debug_printf("  pitch=%u offset=%u ptr=%p\n",
+                      draw->pt.vertex_buffer[i].pitch,
+                      draw->pt.vertex_buffer[i].buffer_offset,
+                      draw->pt.user.vbuffer[i]);
+      }
+   }
+#endif
+
    /* drawing done here: */
    draw_pt_arrays(draw, prim, start, count);
 }
-- 
cgit v1.2.3


From a137f03c56688c190f3542fb6b7c9a4ff4c80cff Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 5 Nov 2008 13:55:56 -0700
Subject: gallium: added some sanity check assertions for constant buffer
 indexing

---
 src/gallium/auxiliary/tgsi/tgsi_exec.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c
index df002939c6..ea5a44fb8a 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
@@ -958,6 +958,10 @@ fetch_src_file_channel(
       switch( file ) {
       case TGSI_FILE_CONSTANT:
          assert(mach->Consts);
+         assert(index->i[0] >= 0);
+         assert(index->i[1] >= 0);
+         assert(index->i[2] >= 0);
+         assert(index->i[3] >= 0);
          chan->f[0] = mach->Consts[index->i[0]][swizzle];
          chan->f[1] = mach->Consts[index->i[1]][swizzle];
          chan->f[2] = mach->Consts[index->i[2]][swizzle];
-- 
cgit v1.2.3


From 03c0ce4c61fd970509d605fe78166e828fc1df57 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 5 Nov 2008 13:56:20 -0700
Subject: gallium: added tgsi_set_exec_mask()

---
 src/gallium/auxiliary/tgsi/tgsi_exec.h | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.h b/src/gallium/auxiliary/tgsi/tgsi_exec.h
index c4e649e69c..fc40a25e09 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.h
@@ -165,6 +165,10 @@ struct tgsi_exec_labels
 #define TGSI_EXEC_TEMP_HALF_I       (TGSI_EXEC_NUM_TEMPS + 3)
 #define TGSI_EXEC_TEMP_HALF_C       1
 
+/* execution mask, each value is either 0 or ~0 */
+#define TGSI_EXEC_MASK_I            (TGSI_EXEC_NUM_TEMPS + 3)
+#define TGSI_EXEC_MASK_C            2
+
 #define TGSI_EXEC_TEMP_R0           (TGSI_EXEC_NUM_TEMPS + 4)
 
 #define TGSI_EXEC_TEMP_ADDR         (TGSI_EXEC_NUM_TEMPS + 5)
@@ -265,6 +269,27 @@ void
 tgsi_exec_machine_free_data(struct tgsi_exec_machine *mach);
 
 
+static INLINE void
+tgsi_set_kill_mask(struct tgsi_exec_machine *mach, unsigned mask)
+{
+   mach->Temps[TGSI_EXEC_TEMP_KILMASK_I].xyzw[TGSI_EXEC_TEMP_KILMASK_C].u[0] =
+      mask;
+}
+
+
+/** Set execution mask values prior to executing the shader */
+static INLINE void
+tgsi_set_exec_mask(struct tgsi_exec_machine *mach,
+                   boolean ch0, boolean ch1, boolean ch2, boolean ch3)
+{
+   int *mask = mach->Temps[TGSI_EXEC_MASK_I].xyzw[TGSI_EXEC_MASK_C].i;
+   mask[0] = ch0 ? ~0 : 0;
+   mask[1] = ch1 ? ~0 : 0;
+   mask[2] = ch2 ? ~0 : 0;
+   mask[3] = ch3 ? ~0 : 0;
+}
+
+
 #if defined __cplusplus
 } /* extern "C" */
 #endif
-- 
cgit v1.2.3


From f0debbb0bb951bfc6dc0ae467564b3b1230324cf Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 5 Nov 2008 14:02:07 -0700
Subject: gallium: call tgsi_set_exec_mask() and use exec mask in SSE ARL code

This prevents vertex shaders from referencing invalid memory locations when
the shader is operating on fewer than four vertices or fragments.
---
 src/gallium/auxiliary/draw/draw_vs_exec.c |  6 ++++++
 src/gallium/auxiliary/draw/draw_vs_sse.c  | 14 +++++++++++++
 src/gallium/auxiliary/tgsi/tgsi_sse2.c    | 35 ++++++++++++++++++++++++++++---
 src/gallium/drivers/softpipe/sp_fs_sse.c  |  3 ++-
 4 files changed, 54 insertions(+), 4 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/draw/draw_vs_exec.c b/src/gallium/auxiliary/draw/draw_vs_exec.c
index 44563803f9..82d27d4493 100644
--- a/src/gallium/auxiliary/draw/draw_vs_exec.c
+++ b/src/gallium/auxiliary/draw/draw_vs_exec.c
@@ -120,6 +120,12 @@ vs_exec_run_linear( struct draw_vertex_shader *shader,
 	 input = (const float (*)[4])((const char *)input + input_stride);
       } 
 
+      tgsi_set_exec_mask(machine,
+                         1,
+                         max_vertices > 1,
+                         max_vertices > 2,
+                         max_vertices > 3);
+
       /* run interpreter */
       tgsi_exec_machine_run( machine );
 
diff --git a/src/gallium/auxiliary/draw/draw_vs_sse.c b/src/gallium/auxiliary/draw/draw_vs_sse.c
index 0efabd9de8..77ba5152f9 100644
--- a/src/gallium/auxiliary/draw/draw_vs_sse.c
+++ b/src/gallium/auxiliary/draw/draw_vs_sse.c
@@ -99,9 +99,23 @@ vs_sse_run_linear( struct draw_vertex_shader *base,
    struct tgsi_exec_machine *machine = shader->machine;
    unsigned int i;
 
+   /* By default, execute all channels.  XXX move this inside the loop
+    * below when we support shader conditionals/loops.
+    */
+   tgsi_set_exec_mask(machine, 1, 1, 1, 1);
+
    for (i = 0; i < count; i += MAX_TGSI_VERTICES) {
       unsigned int max_vertices = MIN2(MAX_TGSI_VERTICES, count - i);
 
+      if (max_vertices < 4) {
+         /* disable the unused execution channels */
+         tgsi_set_exec_mask(machine,
+                            1,
+                            max_vertices > 1,
+                            max_vertices > 2,
+                            0);
+      }
+
       /* run compiled shader
        */
       shader->func(machine->Inputs,
diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
index c115956c5d..4d59106dbf 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_sse2.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
@@ -69,6 +69,9 @@
 
 #define TEMP_R0   TGSI_EXEC_TEMP_R0
 #define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
+#define TEMP_EXEC_MASK_I TGSI_EXEC_MASK_I
+#define TEMP_EXEC_MASK_C TGSI_EXEC_MASK_C
+
 
 /**
  * X86 utility functions.
@@ -230,6 +233,9 @@ emit_const(
    int indirectIndex )
 {
    if (indirect) {
+      /* 'vec' is the offset from the address register's value.
+       * We're loading CONST[ADDR+vec] into an xmm register.
+       */
       struct x86_reg r0 = get_input_base();
       struct x86_reg r1 = get_output_base();
       uint i;
@@ -240,18 +246,40 @@ emit_const(
       x86_push( func, r0 );
       x86_push( func, r1 );
 
+      /*
+       * Loop over the four pixels or vertices in the quad.
+       * Get the value of the address (offset) register for pixel/vertex[i],
+       * add it to the src offset and index into the constant buffer.
+       * Note that we're working on SOA data.
+       * If any of the pixel/vertex execution channels are unused their
+       * values will be garbage.  It's very important that we don't use
+       * those garbage values as indexes into the constant buffer since
+       * that'll cause segfaults.
+       * The solution is to bitwise-AND the offset with the execution mask
+       * register whose values are either 0 or ~0.
+       * The caller must setup the execution mask register to indicate
+       * which channels are valid/alive before running the shader.
+       * The execution mask will also figure into loops and conditionals
+       * someday.
+       */
       for (i = 0; i < QUAD_SIZE; i++) {
-         x86_lea( func, r0, get_const( vec, chan ) );
+         /* r1 = address register[i] */
          x86_mov( func, r1, x86_make_disp( get_temp( TEMP_ADDR, CHAN_X ), i * 4 ) );
+         /* r0 = execution mask[i] */
+         x86_mov( func, r0, x86_make_disp( get_temp( TEMP_EXEC_MASK_I, TEMP_EXEC_MASK_C ), i * 4 ) );
+         /* r1 = r1 & r0 */
+         x86_and( func, r1, r0 );
+         /* r0 = 'vec', the offset */
+         x86_lea( func, r0, get_const( vec, chan ) );
 
-         /* Quick hack to multiply by 16 -- need to add SHL to rtasm.
+         /* Quick hack to multiply r1 by 16 -- need to add SHL to rtasm.
           */
          x86_add( func, r1, r1 );
          x86_add( func, r1, r1 );
          x86_add( func, r1, r1 );
          x86_add( func, r1, r1 );
 
-         x86_add( func, r0, r1 );
+         x86_add( func, r0, r1 );  /* r0 = r0 + r1 */
          x86_mov( func, r1, x86_deref( r0 ) );
          x86_mov( func, x86_make_disp( get_temp( TEMP_R0, CHAN_X ), i * 4 ), r1 );
       }
@@ -265,6 +293,7 @@ emit_const(
          get_temp( TEMP_R0, CHAN_X ) );
    }
    else {
+      /* 'vec' is the index into the src register file, such as TEMP[vec] */
       assert( vec >= 0 );
 
       sse_movss(
diff --git a/src/gallium/drivers/softpipe/sp_fs_sse.c b/src/gallium/drivers/softpipe/sp_fs_sse.c
index 496ed43df2..50eb2c07bc 100644
--- a/src/gallium/drivers/softpipe/sp_fs_sse.c
+++ b/src/gallium/drivers/softpipe/sp_fs_sse.c
@@ -92,7 +92,8 @@ fs_sse_run( const struct sp_fragment_shader *base,
 		       machine->Temps);
 
    /* init kill mask */
-   machine->Temps[TGSI_EXEC_TEMP_KILMASK_I].xyzw[TGSI_EXEC_TEMP_KILMASK_C].u[0] = 0x0;
+   tgsi_set_kill_mask(machine, 0x0);
+   tgsi_set_exec_mask(machine, 1, 1, 1, 1);
 
    shader->func( machine->Inputs,
 		 machine->Outputs,
-- 
cgit v1.2.3


From cbce12b5404846520bb776f73885f0ea99a13124 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 5 Nov 2008 17:14:00 -0700
Subject: gallium: s/mmDestroy/u_mmDestroy/

---
 src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c
index 6e10cf1806..a976d3041a 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c
@@ -262,7 +262,7 @@ mm_bufmgr_create_from_buffer(struct pb_buffer *buffer,
    
 failure:
 if(mm->heap)
-   mmDestroy(mm->heap);
+   u_mmDestroy(mm->heap);
    if(mm->map)
       pb_unmap(mm->buffer);
    if(mm)
-- 
cgit v1.2.3


From 88360913a730795d031b2ff20fe50d438ef1c151 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 5 Nov 2008 17:20:35 -0700
Subject: cell: minor reformatting, var renaming

---
 src/gallium/drivers/cell/ppu/cell_texture.c | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/cell/ppu/cell_texture.c b/src/gallium/drivers/cell/ppu/cell_texture.c
index 28161d166e..ae88d06912 100644
--- a/src/gallium/drivers/cell/ppu/cell_texture.c
+++ b/src/gallium/drivers/cell/ppu/cell_texture.c
@@ -28,6 +28,7 @@
   * Authors:
   *   Keith Whitwell <keith@tungstengraphics.com>
   *   Michel Dänzer <michel@tungstengraphics.com>
+  *   Brian Paul
   */
 
 #include "pipe/p_context.h"
@@ -41,10 +42,10 @@
 #include "cell_state.h"
 #include "cell_texture.h"
 
-/* Simple, maximally packed layout.
- */
 
-static unsigned minify( unsigned d )
+
+static unsigned
+minify(unsigned d)
 {
    return MAX2(1, d>>1);
 }
@@ -209,6 +210,7 @@ twiddle_image_uint(uint w, uint h, uint tile_size, uint *dst,
    }
 }
 
+
 /**
  * For Cell.  Basically, rearrange the pixels/quads from this layout:
  *  +--+--+--+--+
@@ -238,22 +240,22 @@ twiddle_tile(const uint *tileIn, uint *tileOut)
    }
 }
 
+
 /**
  * Convert image from tiled layout to linear layout.  4-byte pixels.
  */
 static void
 untwiddle_image_uint(uint w, uint h, uint tile_size, uint *dst,
-                     uint src_stride, const uint *src)
+                     uint dst_stride, const uint *src)
 {
    const uint tile_size2 = tile_size * tile_size;
    const uint h_t = (h + tile_size - 1) / tile_size;
    const uint w_t = (w + tile_size - 1) / tile_size;
    uint *tile_buf;
-
    uint it, jt;  /* tile counters */
    uint i, j;    /* intra-tile counters */
 
-   src_stride /= 4; /* convert from bytes to pixels */
+   dst_stride /= 4; /* convert from bytes to pixels */
 
    tile_buf = align_malloc(tile_size * tile_size * 4, 16);
    
@@ -282,7 +284,7 @@ untwiddle_image_uint(uint w, uint h, uint tile_size, uint *dst,
                uint dstj = jt * tile_size + j;
                ASSERT(dsti < h);
                ASSERT(dstj < w);
-               dst[dsti * src_stride + dstj] = tsrc[i * tile_size + j];
+               dst[dsti * dst_stride + dstj] = tsrc[i * tile_size + j];
             }
          }
       }
@@ -291,6 +293,7 @@ untwiddle_image_uint(uint w, uint h, uint tile_size, uint *dst,
    align_free(tile_buf);
 }
 
+
 /**
  * Convert linear texture image data to tiled format for SPU usage.
  */
@@ -341,6 +344,7 @@ cell_twiddle_texture(struct pipe_screen *screen,
    pipe_buffer_unmap(screen, surface->buffer);
 }
 
+
 /**
  * Convert SPU tiled texture image data to linear format for app usage.
  */
-- 
cgit v1.2.3


From 639a2b0ec853eda49e3e7150b2ed7f8f40d101af Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Wed, 5 Nov 2008 19:26:20 -0700
Subject: gallium: don't range check tgsi register index for indirect accesses

Fixes progs/vp/arl.txt test.
---
 src/gallium/auxiliary/tgsi/tgsi_sanity.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/tgsi/tgsi_sanity.c b/src/gallium/auxiliary/tgsi/tgsi_sanity.c
index 11659247c0..bc7b941b78 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_sanity.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_sanity.c
@@ -153,17 +153,21 @@ check_register_usage(
    if (!check_file_name( ctx, file ))
       return FALSE;
 
-   if (index < 0 || index > MAX_REGISTERS) {
-      report_error( ctx, "%s[%i]: Invalid index %s", file_names[file], index, name );
-      return FALSE;
-   }
-
    if (indirect_access) {
+      /* Note that 'index' is an offset relative to the value of the
+       * address register.  No range checking done here.
+       */
       if (!is_any_register_declared( ctx, file ))
          report_error( ctx, "%s: Undeclared %s register", file_names[file], name );
       ctx->regs_ind_used[file] = TRUE;
    }
    else {
+      if (index < 0 || index > MAX_REGISTERS) {
+         report_error( ctx, "%s[%i]: Invalid index %s",
+                       file_names[file], index, name );
+         return FALSE;
+      }
+
       if (!is_register_declared( ctx, file, index ))
          report_error( ctx, "%s[%d]: Undeclared %s register", file_names[file], index, name );
       ctx->regs_used[file][index / BITS_IN_REG_FLAG] |= (1 << (index % BITS_IN_REG_FLAG));
-- 
cgit v1.2.3


From 5b2b064a5c1328449e3eb8179afc2ba366f18ae6 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Wed, 5 Nov 2008 20:04:49 -0700
Subject: gallium: check execution mask in indirect register loads

Zero-out the index for disabled execution channels to avoid using potential
garbage values (thus avoiding bad array indexing).
---
 src/gallium/auxiliary/tgsi/tgsi_exec.c | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c
index ea5a44fb8a..53e92b96ae 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
@@ -1045,12 +1045,16 @@ fetch_source(
    if (reg->SrcRegister.Indirect) {
       union tgsi_exec_channel index2;
       union tgsi_exec_channel indir_index;
+      const uint execmask = mach->ExecMask;
+      uint i;
 
+      /* which address register (always zero now) */
       index2.i[0] =
       index2.i[1] =
       index2.i[2] =
       index2.i[3] = reg->SrcRegisterInd.Index;
 
+      /* get current value of address register[swizzle] */
       swizzle = tgsi_util_get_src_register_swizzle( &reg->SrcRegisterInd, CHAN_X );
       fetch_src_file_channel(
          mach,
@@ -1059,10 +1063,19 @@ fetch_source(
          &index2,
          &indir_index );
 
+      /* add value of address register to the offset */
       index.i[0] += indir_index.i[0];
       index.i[1] += indir_index.i[1];
       index.i[2] += indir_index.i[2];
       index.i[3] += indir_index.i[3];
+
+      /* for disabled execution channels, zero-out the index to
+       * avoid using a potential garbage value.
+       */
+      for (i = 0; i < QUAD_SIZE; i++) {
+         if ((execmask & (1 << i)) == 0)
+            index.i[i] = 0;
+      }
    }
 
    if( reg->SrcRegister.Dimension ) {
@@ -1091,6 +1104,8 @@ fetch_source(
       if (reg->SrcRegisterDim.Indirect) {
          union tgsi_exec_channel index2;
          union tgsi_exec_channel indir_index;
+         const uint execmask = mach->ExecMask;
+         uint i;
 
          index2.i[0] =
          index2.i[1] =
@@ -1109,6 +1124,14 @@ fetch_source(
          index.i[1] += indir_index.i[1];
          index.i[2] += indir_index.i[2];
          index.i[3] += indir_index.i[3];
+
+         /* for disabled execution channels, zero-out the index to
+          * avoid using a potential garbage value.
+          */
+         for (i = 0; i < QUAD_SIZE; i++) {
+            if ((execmask & (1 << i)) == 0)
+               index.i[i] = 0;
+         }
       }
    }
 
-- 
cgit v1.2.3


From d177c9ddda2c452cf7d6696d89cf4458ef986f98 Mon Sep 17 00:00:00 2001
From: Zack Rusin <zack@tungstengraphics.com>
Date: Thu, 6 Nov 2008 16:07:28 -0500
Subject: gallium: actually flip the coordinates

---
 src/gallium/auxiliary/util/u_rect.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/util/u_rect.c b/src/gallium/auxiliary/util/u_rect.c
index f5619ef791..30f32413d7 100644
--- a/src/gallium/auxiliary/util/u_rect.c
+++ b/src/gallium/auxiliary/util/u_rect.c
@@ -222,7 +222,8 @@ util_surface_copy(struct pipe_context *pipe,
                      w, h,
                      src_map,
                      do_flip ? -(int) src->stride : src->stride,
-                     src_x, src_y);
+                     src_x,
+                     do_flip ? w - src_y : src_y);
    }
 
    pipe->screen->surface_unmap(pipe->screen, src);
-- 
cgit v1.2.3


From 93fd5e150ba2a86b51816b60bf5faf1da34803b7 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 6 Nov 2008 14:56:59 -0700
Subject: softpipe: debug code (disabled)

---
 src/gallium/drivers/softpipe/sp_quad_output.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/softpipe/sp_quad_output.c b/src/gallium/drivers/softpipe/sp_quad_output.c
index d05e12d1d9..b7aac7f84a 100644
--- a/src/gallium/drivers/softpipe/sp_quad_output.c
+++ b/src/gallium/drivers/softpipe/sp_quad_output.c
@@ -64,6 +64,14 @@ output_quad(struct quad_stage *qs, struct quad_header *quad)
             for (i = 0; i < 4; i++) { /* loop over color chans */
                tile->data.color[y][x][i] = quadColor[i][j];
             }
+            if (0) {
+               debug_printf("sp write pixel %d,%d: %g, %g, %g\n",
+                            quad->input.x0 + x,
+                            quad->input.y0 + y,
+                            quadColor[0][j],
+                            quadColor[1][j],
+                            quadColor[2][j]);
+            }
          }
       }
    }
-- 
cgit v1.2.3


From 6c3e7365d5245cfad597cd69e2f8f689e62546b9 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 6 Nov 2008 14:57:20 -0700
Subject: gallium: debug code to print vertex array data (disabled)

---
 src/gallium/auxiliary/draw/draw_pt.c | 89 ++++++++++++++++++++++++++++++++++++
 1 file changed, 89 insertions(+)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/draw/draw_pt.c b/src/gallium/auxiliary/draw/draw_pt.c
index b98c0a0ecf..3c175f31d8 100644
--- a/src/gallium/auxiliary/draw/draw_pt.c
+++ b/src/gallium/auxiliary/draw/draw_pt.c
@@ -178,6 +178,92 @@ void draw_pt_destroy( struct draw_context *draw )
 }
 
 
+/**
+ * Debug- print the first 'count' vertices.
+ */
+static void
+draw_print_arrays(struct draw_context *draw, uint prim, int start, uint count)
+{
+   uint i;
+
+   debug_printf("Draw arrays(prim = %u, start = %u, count = %u)\n",
+                prim, start, count);
+
+   for (i = 0; i < count; i++) {
+      uint ii, j;
+
+      if (draw->pt.user.elts) {
+         /* indexed arrays */
+         switch (draw->pt.user.eltSize) {
+         case 1:
+            {
+               const ubyte *elem = (const ubyte *) draw->pt.user.elts;
+               ii = elem[start + i];
+            }
+            break;
+         case 2:
+            {
+               const ushort *elem = (const ushort *) draw->pt.user.elts;
+               ii = elem[start + i];
+            }
+            break;
+         case 4:
+            {
+               const uint *elem = (const uint *) draw->pt.user.elts;
+               ii = elem[start + i];
+            }
+            break;
+         default:
+            assert(0);
+         }
+         debug_printf("Element[%u + %u] -> Vertex %u:\n", start, i, ii);
+      }
+      else {
+         /* non-indexed arrays */
+         ii = start + i;
+         debug_printf("Vertex %u:\n", ii);
+      }
+
+      for (j = 0; j < draw->pt.nr_vertex_elements; j++) {
+         uint buf = draw->pt.vertex_element[j].vertex_buffer_index;
+         ubyte *ptr = (ubyte *) draw->pt.user.vbuffer[buf];
+         ptr += draw->pt.vertex_buffer[buf].pitch * ii;
+         ptr += draw->pt.vertex_element[j].src_offset;
+
+         debug_printf("  Attr %u: ", j);
+         switch (draw->pt.vertex_element[j].src_format) {
+         case PIPE_FORMAT_R32_FLOAT:
+            {
+               float *v = (float *) ptr;
+               debug_printf("%f  @ %p\n", v[0], (void *) v);
+            }
+            break;
+         case PIPE_FORMAT_R32G32_FLOAT:
+            {
+               float *v = (float *) ptr;
+               debug_printf("%f %f  @ %p\n", v[0], v[1], (void *) v);
+            }
+            break;
+         case PIPE_FORMAT_R32G32B32_FLOAT:
+            {
+               float *v = (float *) ptr;
+               debug_printf("%f %f %f  @ %p\n", v[0], v[1], v[2], (void *) v);
+            }
+            break;
+         case PIPE_FORMAT_R32G32B32A32_FLOAT:
+            {
+               float *v = (float *) ptr;
+               debug_printf("%f %f %f %f  @ %p\n", v[0], v[1], v[2], v[3],
+                            (void *) v);
+            }
+            break;
+         default:
+            debug_printf("other format (fix me)\n");
+            ;
+         }
+      }
+   }
+}
 
 
 /**
@@ -197,6 +283,9 @@ draw_arrays(struct draw_context *draw, unsigned prim,
       draw->reduced_prim = reduced_prim;
    }
 
+   if (0)
+      draw_print_arrays(draw, prim, start, MIN2(count, 20));
+
 #if 0
    {
       int i;
-- 
cgit v1.2.3


From bb8a9ce705f309a3b38d10c61c3865db79a0f71c Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Thu, 6 Nov 2008 19:24:47 -0700
Subject: gallium: implement TGSI_OPCODE_NRM/NRM4 in tgsi_exec.c

---
 src/gallium/auxiliary/tgsi/tgsi_exec.c | 59 +++++++++++++++++++++++++++++++++-
 1 file changed, 58 insertions(+), 1 deletion(-)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c
index 53e92b96ae..41dffc3dba 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
@@ -2462,7 +2462,64 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_NRM:
-      assert (0);
+      /* 3-component vector normalize */
+      {
+         union tgsi_exec_channel tmp, dot;
+
+         /* tmp = dp3(src0, src0): */
+         FETCH( &r[0], 0, CHAN_X );
+         micro_mul( &tmp, &r[0], &r[0] );
+
+         FETCH( &r[1], 0, CHAN_Y );
+         micro_mul( &dot, &r[1], &r[1] );
+         micro_add( &tmp, &tmp, &dot );
+
+         FETCH( &r[2], 0, CHAN_Z );
+         micro_mul( &dot, &r[2], &r[2] );
+         micro_add( &tmp, &tmp, &dot );
+
+         /* tmp = 1 / tmp */
+         micro_div( &tmp, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &tmp );
+
+         /* note: w channel is undefined */
+         FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+            /* chan = chan * tmp */
+            micro_mul( &r[chan_index], &tmp, &r[chan_index] );
+            STORE( &r[chan_index], 0, chan_index );
+         }
+      }
+      break;
+
+   case TGSI_OPCODE_NRM4:
+      /* 4-component vector normalize */
+      {
+         union tgsi_exec_channel tmp, dot;
+
+         /* tmp = dp4(src0, src0): */
+         FETCH( &r[0], 0, CHAN_X );
+         micro_mul( &tmp, &r[0], &r[0] );
+
+         FETCH( &r[1], 0, CHAN_Y );
+         micro_mul( &dot, &r[1], &r[1] );
+         micro_add( &tmp, &tmp, &dot );
+
+         FETCH( &r[2], 0, CHAN_Z );
+         micro_mul( &dot, &r[2], &r[2] );
+         micro_add( &tmp, &tmp, &dot );
+
+         FETCH( &r[3], 0, CHAN_W );
+         micro_mul( &dot, &r[3], &r[3] );
+         micro_add( &tmp, &tmp, &dot );
+
+         /* tmp = 1 / tmp */
+         micro_div( &tmp, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &tmp );
+
+         FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+            /* chan = chan * tmp */
+            micro_mul( &r[chan_index], &tmp, &r[chan_index] );
+            STORE( &r[chan_index], 0, chan_index );
+         }
+      }
       break;
 
    case TGSI_OPCODE_DIV:
-- 
cgit v1.2.3


From b493fdd7e333b9a94176a603009643326a538690 Mon Sep 17 00:00:00 2001
From: Robert Ellison <papillo@tungstengraphics.com>
Date: Fri, 7 Nov 2008 11:29:07 -0700
Subject: CELL: fix several stencil problems

This small set of changes repairs several different stenciling problems;
now redbook/stencil also runs correctly (and maybe others - I haven't
checked everything yet).

- The number of instructions that had been allocated for fragment ops
  used to be 64 (in cell/common.h).  With complicated stencil use, we
  managed to get up to 93, which caused a segfault before we noticed
  we'd overrun our memory buffer.  It's now been bumped to 128,
  which should be enough for even complicated stencil and fragment op
  usage.

- The status of cell surfaces never changed beyond the initial
  PIPE_SURFACE_STATUS_UNDEFINED.  When a user called glClear()
  to clear just the Z buffer (but not the stencil buffer), this caused
  the check_clear_depth_with_quad() function to return false (because
  the surface status was believed to be undefined), and so the device
  was instructed to clear the whole buffer (including the stencil buffer),
  instead of correctly using a quad to clear just the depth, leaving the
  stencil alone.

  This has been fixed similarly to the way the i915 driver handles
  the surface status: during cell_clear_surface(), the status is
  set to PIPE_SURFACE_STATUS_DEFINED.  Then a partial buffer clear is
  handled with a quad, as expected.  Note that we are *not* using
  PIPE_SURFACE_STATUS_CLEAR (also similar to the i915); technically,
  we should be setting the surface status to CLEAR on a clear, and
  to DEFINED when we actually draw something (say on cell_vbuf_draw()),
  but it's difficult to figure out exactly which surfaces are affected
  by a cell_vbuf_draw(), so for now we're doing the easy thing.

- The fragment ops handling was very clever about only pulling out the
  parts of the Z/stencil buffer that it needed for calculations;
  but this failed when only part of the buffer was written, because
  the part that was never pulled out was inadvertently cleared.

  Now all the data from the combined Z/stencil buffer is pulled out,
  just so the proper values can be recombined later and written back
  to the buffer correctly.  As a bonus, the fragment op code generation
  is simplified.
---
 src/gallium/drivers/cell/common.h                |   2 +-
 src/gallium/drivers/cell/ppu/cell_clear.c        |  13 ++
 src/gallium/drivers/cell/ppu/cell_gen_fragment.c | 153 ++++++++++-------------
 3 files changed, 80 insertions(+), 88 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index 23fb0b0831..87488ea2d7 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -122,7 +122,7 @@
 #define CELL_DEBUG_CACHE                (1 << 6)
 
 /** Max instructions for doing per-fragment operations */
-#define SPU_MAX_FRAGMENT_OPS_INSTS 64
+#define SPU_MAX_FRAGMENT_OPS_INSTS 128
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_clear.c b/src/gallium/drivers/cell/ppu/cell_clear.c
index c9c0c721bb..037635e466 100644
--- a/src/gallium/drivers/cell/ppu/cell_clear.c
+++ b/src/gallium/drivers/cell/ppu/cell_clear.c
@@ -106,4 +106,17 @@ cell_clear_surface(struct pipe_context *pipe, struct pipe_surface *ps,
       clr->surface = surfIndex;
       clr->value = clearValue;
    }
+
+   /* Technically, the surface's contents are now known and cleared,
+    * so we could set the status to PIPE_SURFACE_STATUS_CLEAR.  But
+    * it turns out it's quite painful to recognize when any particular
+    * surface goes from PIPE_SURFACE_STATUS_CLEAR to 
+    * PIPE_SURFACE_STATUS_DEFINED (i.e. with known contents), because
+    * the drawing commands could be operating on numerous draw buffers,
+    * which we'd have to iterate through to set all their stati...
+    * For now, we cheat a bit and set the surface's status to DEFINED
+    * right here.  Later we should revisit this and set the status to
+    * CLEAR here, and find a better place to set the status to DEFINED.
+    */
+   ps->status = PIPE_SURFACE_STATUS_DEFINED;
 }
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
index 6e2a5d2980..d9c3ff3f4d 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -1997,81 +1997,79 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
        * Z and/or stencil.  We'll also convert the incoming fragment Z
        * value in fragZ_reg from a floating point value in [0.0..1.0] to
        * an unsigned integer value with the appropriate resolution.
+       * Note that even if depth or stencil is *not* enabled, if it's
+       * present in the buffer, we pull it out and put it back later;
+       * otherwise, we can inadvertently destroy the contents of
+       * buffers we're not supposed to touch (e.g., if the user is
+       * clearing the depth buffer but not the stencil buffer, a
+       * quad of constant depth is drawn over the surface; the stencil
+       * buffer must be maintained).
        */
       switch(zs_format) {
 
          case PIPE_FORMAT_S8Z24_UNORM: /* fall through */
          case PIPE_FORMAT_X8Z24_UNORM:
-            if (dsa->depth.enabled) {
-               /* We need the Z part at least */
-               setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
-               /* four 24-bit Z values in the low-order bits */
-               spe_and_uint(f, fbZ_reg, fbZS_reg, 0x00ffffff);
-
-               /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
-                * to a 24-bit unsigned integer
-                */
-               spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
-               spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
-            }
-            if (dsa->stencil[0].enabled) {
-               setup_optional_register(f, &fbS_reg_set, &fbS_reg);
-               /* four 8-bit Z values in the high-order bits */
-               spe_rotmi(f, fbS_reg, fbZS_reg, -24);
-            }
-            break;
+            /* Pull out both Z and stencil */
+            setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
+            setup_optional_register(f, &fbS_reg_set, &fbS_reg);
+
+            /* four 24-bit Z values in the low-order bits */
+            spe_and_uint(f, fbZ_reg, fbZS_reg, 0x00ffffff);
+
+            /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
+             * to a 24-bit unsigned integer
+             */
+            spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+            spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
+
+            /* four 8-bit stencil values in the high-order bits */
+            spe_rotmi(f, fbS_reg, fbZS_reg, -24);
+         break;
 
          case PIPE_FORMAT_Z24S8_UNORM: /* fall through */
          case PIPE_FORMAT_Z24X8_UNORM:
-            if (dsa->depth.enabled) {
-               setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
-               /* shift by 8 to get the upper 24-bit values */
-               spe_rotmi(f, fbS_reg, fbZS_reg, -8);
-
-               /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
-                * to a 24-bit unsigned integer
-                */
-               spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
-               spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
-            }
-            if (dsa->stencil[0].enabled) {
-               setup_optional_register(f, &fbS_reg_set, &fbS_reg);
-               /* 8-bit stencil in the low-order bits - mask them out */
-               spe_and_uint(f, fbS_reg, fbZS_reg, 0x000000ff);
-            }
-            break;
+            setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
+            setup_optional_register(f, &fbS_reg_set, &fbS_reg);
+
+            /* shift by 8 to get the upper 24-bit values */
+            spe_rotmi(f, fbS_reg, fbZS_reg, -8);
+
+            /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
+             * to a 24-bit unsigned integer
+             */
+            spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+            spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
+
+            /* 8-bit stencil in the low-order bits - mask them out */
+            spe_and_uint(f, fbS_reg, fbZS_reg, 0x000000ff);
+         break;
 
          case PIPE_FORMAT_Z32_UNORM:
-            if (dsa->depth.enabled) {
-               setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
-               /* Copy over 4 32-bit values */
-               spe_move(f, fbZ_reg, fbZS_reg);
-
-               /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
-                * to a 32-bit unsigned integer
-                */
-               spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
-            }
+            setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
+            /* Copy over 4 32-bit values */
+            spe_move(f, fbZ_reg, fbZS_reg);
+
+            /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
+             * to a 32-bit unsigned integer
+             */
+            spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
             /* No stencil, so can't do anything there */
-            break;
+         break;
 
          case PIPE_FORMAT_Z16_UNORM:
-            if (dsa->depth.enabled) {
-               /* XXX Not sure this is correct, but it was here before, so we're
-                * going with it for now
-                */
-               setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
-               /* Copy over 4 32-bit values */
-               spe_move(f, fbZ_reg, fbZS_reg);
-
-               /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
-                * to a 16-bit unsigned integer
-                */
-               spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
-               spe_rotmi(f, fragZ_reg, fragZ_reg, -16);
-            }
+            /* XXX Not sure this is correct, but it was here before, so we're
+             * going with it for now
+             */
+            setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
+            /* Copy over 4 32-bit values */
+            spe_move(f, fbZ_reg, fbZS_reg);
+
+            /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
+             * to a 16-bit unsigned integer
+             */
+            spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+            spe_rotmi(f, fragZ_reg, fragZ_reg, -16);
             /* No stencil */
-            break;
 
          default:
             ASSERT(0); /* invalid format */
@@ -2118,39 +2116,19 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
          spe_comment(f, 0, "Store quad's depth/stencil values in tile");
          if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
              zs_format == PIPE_FORMAT_X8Z24_UNORM) {
-            if (fbS_reg_set && fbZ_reg_set) {
-               spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */
-               spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
-            }
-            else if (fbS_reg_set) {
-               spe_shli(f, fbZS_reg, fbS_reg, 24); /* fbS = fbS << 24 */
-            }
-            else {
-               spe_move(f, fbZS_reg, fbZ_reg);
-            }
+            spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */
+            spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
          }
          else if (zs_format == PIPE_FORMAT_Z24S8_UNORM ||
                   zs_format == PIPE_FORMAT_Z24X8_UNORM) {
-            if (fbS_reg_set && fbZ_reg_set) {
-               spe_shli(f, fbZ_reg, fbZ_reg, 8); /* fbZ = fbZ << 8 */
-               spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
-            }
-            else if (fbS_reg_set) {
-               spe_move(f, fbZS_reg, fbS_reg);
-            }
-            else {
-               spe_shli(f, fbZ_reg, fbZ_reg, 8); /* fbZ = fbZ << 8 */
-            }
+            spe_shli(f, fbZ_reg, fbZ_reg, 8); /* fbZ = fbZ << 8 */
+            spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
          }
          else if (zs_format == PIPE_FORMAT_Z32_UNORM) {
-            if (fbZ_reg_set) {
-               spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
-            }
+            spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
          }
          else if (zs_format == PIPE_FORMAT_Z16_UNORM) {
-            if (fbZ_reg_set) {
-               spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
-            }
+            spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
          }
          else if (zs_format == PIPE_FORMAT_S8_UNORM) {
             ASSERT(0);   /* XXX to do */
@@ -2163,6 +2141,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
          spe_stqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
       }
 
+      /* Don't need these any more */
       release_optional_register(f, &fbZ_reg_set, fbZ_reg);
       release_optional_register(f, &fbS_reg_set, fbS_reg);
    }
-- 
cgit v1.2.3


From cf9836cf09790de70732963ea571b83719c0c03c Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 7 Nov 2008 13:02:43 -0700
Subject: gallium: implement TGSI_OPCODE_DP2A, add sqrt to NRM3/NRM4

---
 src/gallium/auxiliary/tgsi/tgsi_exec.c | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c
index 8d135f8777..1da04ab7e0 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
@@ -2034,7 +2034,21 @@ exec_instruction(
 
    case TGSI_OPCODE_DOT2ADD:
       /* TGSI_OPCODE_DP2A */
-      assert (0);
+      FETCH( &r[0], 0, CHAN_X );
+      FETCH( &r[1], 1, CHAN_X );
+      micro_mul( &r[0], &r[0], &r[1] );
+
+      FETCH( &r[1], 0, CHAN_Y );
+      FETCH( &r[2], 1, CHAN_Y );
+      micro_mul( &r[1], &r[1], &r[2] );
+      micro_add( &r[0], &r[0], &r[1] );
+
+      FETCH( &r[2], 2, CHAN_X );
+      micro_add( &r[0], &r[0], &r[2] );
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( &r[0], 0, chan_index );
+      }
       break;
 
    case TGSI_OPCODE_INDEX:
@@ -2479,7 +2493,8 @@ exec_instruction(
          micro_mul( &dot, &r[2], &r[2] );
          micro_add( &tmp, &tmp, &dot );
 
-         /* tmp = 1 / tmp */
+         /* tmp = 1 / sqrt(tmp) */
+         micro_sqrt( &tmp, &tmp );
          micro_div( &tmp, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &tmp );
 
          /* note: w channel is undefined */
@@ -2512,7 +2527,8 @@ exec_instruction(
          micro_mul( &dot, &r[3], &r[3] );
          micro_add( &tmp, &tmp, &dot );
 
-         /* tmp = 1 / tmp */
+         /* tmp = 1 / sqrt(tmp) */
+         micro_sqrt( &tmp, &tmp );
          micro_div( &tmp, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &tmp );
 
          FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-- 
cgit v1.2.3


From a52a6d7bcdaa47604151b9af07ebcd394316e784 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 7 Nov 2008 13:03:07 -0700
Subject: gallium: added SSE for DP2, DP2A

---
 src/gallium/auxiliary/tgsi/tgsi_sse2.c | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
index 3df0c5db3f..b8e4ab6d83 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_sse2.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
@@ -1742,7 +1742,18 @@ emit_instruction(
 
    case TGSI_OPCODE_DOT2ADD:
    /* TGSI_OPCODE_DP2A */
-      return 0;
+      FETCH( func, *inst, 0, 0, CHAN_X );  /* xmm0 = src[0].x */
+      FETCH( func, *inst, 1, 1, CHAN_X );  /* xmm1 = src[1].x */
+      emit_mul( func, 0, 1 );              /* xmm0 = xmm0 * xmm1 */
+      FETCH( func, *inst, 1, 0, CHAN_Y );  /* xmm1 = src[0].y */
+      FETCH( func, *inst, 2, 1, CHAN_Y );  /* xmm2 = src[1].y */
+      emit_mul( func, 1, 2 );              /* xmm1 = xmm1 * xmm2 */
+      emit_add( func, 0, 1 );              /* xmm0 = xmm0 + xmm1 */
+      FETCH( func, *inst, 1, 2, CHAN_X );  /* xmm1 = src[2].x */
+      emit_add( func, 0, 1 );              /* xmm0 = xmm0 + xmm1 */
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( func, *inst, 0, 0, chan_index );  /* dest[ch] = xmm0 */
+      }
       break;
 
    case TGSI_OPCODE_INDEX:
@@ -2084,7 +2095,16 @@ emit_instruction(
       break;
 
    case TGSI_OPCODE_DP2:
-      return 0;
+      FETCH( func, *inst, 0, 0, CHAN_X );  /* xmm0 = src[0].x */
+      FETCH( func, *inst, 1, 1, CHAN_X );  /* xmm1 = src[1].x */
+      emit_mul( func, 0, 1 );              /* xmm0 = xmm0 * xmm1 */
+      FETCH( func, *inst, 1, 0, CHAN_Y );  /* xmm1 = src[0].y */
+      FETCH( func, *inst, 2, 1, CHAN_Y );  /* xmm2 = src[1].y */
+      emit_mul( func, 1, 2 );              /* xmm1 = xmm1 * xmm2 */
+      emit_add( func, 0, 1 );              /* xmm0 = xmm0 + xmm1 */
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( func, *inst, 0, 0, chan_index );  /* dest[ch] = xmm0 */
+      }
       break;
 
    case TGSI_OPCODE_TXL:
-- 
cgit v1.2.3


From a58dbf34ca88656739a8f8e5f4259e760365c9d0 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Sat, 8 Nov 2008 10:29:23 -0700
Subject: gallium: implement SSE codegen for TGSI_OPCODE_NRM/NRM4

---
 src/gallium/auxiliary/tgsi/tgsi_sse2.c | 34 +++++++++++++++++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
index b8e4ab6d83..3ce2c1c27b 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_sse2.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
@@ -2087,7 +2087,39 @@ emit_instruction(
       break;
 
    case TGSI_OPCODE_NRM:
-      return 0;
+      /* fall-through */
+   case TGSI_OPCODE_NRM4:
+      /* 3 or 4-component normalization */
+      {
+         uint dims = (inst->Instruction.Opcode == TGSI_OPCODE_NRM) ? 3 : 4;
+         /* note: cannot use xmm regs 2/3 here (see emit_rsqrt() above) */
+         FETCH( func, *inst, 4, 0, CHAN_X );    /* xmm4 = src[0].x */
+         FETCH( func, *inst, 5, 0, CHAN_Y );    /* xmm5 = src[0].y */
+         FETCH( func, *inst, 6, 0, CHAN_Z );    /* xmm6 = src[0].z */
+         if (dims == 4) {
+            FETCH( func, *inst, 7, 0, CHAN_W ); /* xmm7 = src[0].w */
+         }
+         emit_MOV( func, 0, 4 );                /* xmm0 = xmm4 */
+         emit_mul( func, 0, 4 );                /* xmm0 *= xmm4 */
+         emit_MOV( func, 1, 5 );                /* xmm1 = xmm5 */
+         emit_mul( func, 1, 5 );                /* xmm1 *= xmm5 */
+         emit_add( func, 0, 1 );                /* xmm0 += xmm1 */
+         emit_MOV( func, 1, 6 );                /* xmm1 = xmm6 */
+         emit_mul( func, 1, 6 );                /* xmm1 *= xmm6 */
+         emit_add( func, 0, 1 );                /* xmm0 += xmm1 */
+         if (dims == 4) {
+            emit_MOV( func, 1, 7 );             /* xmm1 = xmm7 */
+            emit_mul( func, 1, 7 );             /* xmm1 *= xmm7 */
+            emit_add( func, 0, 0 );             /* xmm0 += xmm1 */
+         }
+         emit_rsqrt( func, 1, 0 );              /* xmm1 = 1/sqrt(xmm0) */
+         FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+            if (chan_index < dims) {
+               emit_mul( func, 4+chan_index, 1); /* xmm[4+ch] *= xmm1 */
+               STORE( func, *inst, 4+chan_index, 0, chan_index );
+            }
+         }
+      }
       break;
 
    case TGSI_OPCODE_DIV:
-- 
cgit v1.2.3


From 399da3a337932c6074a69ac73e711138271308eb Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Sun, 9 Nov 2008 09:36:22 -0700
Subject: gallium: use PIPE_ARCH_SSE to protect use of SSE intrinsics only

This allows us to use SSE codegen with debug builds again.
When PIPE_ARCH_SSE is set (w/ gcc -msse -msse2) we will also use the
gcc SSE intrinsic functions.
---
 src/gallium/auxiliary/draw/draw_vs_sse.c |  2 +-
 src/gallium/auxiliary/tgsi/tgsi_sse2.c   | 42 +++++++++++++++++++++++++-------
 src/gallium/drivers/softpipe/sp_fs_sse.c |  2 +-
 3 files changed, 35 insertions(+), 11 deletions(-)

(limited to 'src/gallium')

diff --git a/src/gallium/auxiliary/draw/draw_vs_sse.c b/src/gallium/auxiliary/draw/draw_vs_sse.c
index 0e2036f12a..77ba5152f9 100644
--- a/src/gallium/auxiliary/draw/draw_vs_sse.c
+++ b/src/gallium/auxiliary/draw/draw_vs_sse.c
@@ -37,7 +37,7 @@
 
 #include "draw_vs.h"
 
-#if defined(PIPE_ARCH_X86) && defined(PIPE_ARCH_SSE)
+#if defined(PIPE_ARCH_X86)
 
 #include "pipe/p_shader_tokens.h"
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
index 3ce2c1c27b..f93db18114 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_sse2.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
@@ -27,12 +27,14 @@
 
 #include "pipe/p_config.h"
 
-#if defined(PIPE_ARCH_X86) && defined(PIPE_ARCH_SSE)
+#if defined(PIPE_ARCH_X86)
 
 #include "pipe/p_debug.h"
 #include "pipe/p_shader_tokens.h"
 #include "util/u_math.h"
+#if defined(PIPE_ARCH_SSE)
 #include "util/u_sse.h"
+#endif
 #include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_util.h"
 #include "tgsi_exec.h"
@@ -627,6 +629,9 @@ emit_func_call_dst_src(
       code );
 }
 
+
+#if defined(PIPE_ARCH_SSE)
+
 /*
  * Fast SSE2 implementation of special math functions.
  */
@@ -678,6 +683,7 @@ exp2f4(__m128 x)
    return _mm_mul_ps(expipart, expfpart);
 }
 
+
 /**
  * See http://www.devmaster.net/forums/showthread.php?p=43580
  */
@@ -720,12 +726,16 @@ log2f4(__m128 x)
    return _mm_add_ps(logmant, exp);
 }
 
+
 static INLINE __m128
 powf4(__m128 x, __m128 y)
 {
    return exp2f4(_mm_mul_ps(log2f4(x), y));
 }
 
+#endif /* PIPE_ARCH_SSE */
+
+
 
 /**
  * Low-level instruction translators.
@@ -780,13 +790,20 @@ emit_cos(
 }
 
 static void PIPE_CDECL
-#if defined(PIPE_CC_GCC)
+#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
 __attribute__((force_align_arg_pointer))
 #endif
 ex24f(
    float *store )
 {
+#if defined(PIPE_ARCH_SSE)
    _mm_store_ps(&store[0], exp2f4( _mm_load_ps(&store[0]) ));
+#else
+   store[0] = util_fast_exp2( store[0] );
+   store[1] = util_fast_exp2( store[1] );
+   store[2] = util_fast_exp2( store[2] );
+   store[3] = util_fast_exp2( store[3] );
+#endif
 }
 
 static void
@@ -871,13 +888,20 @@ emit_frc(
 }
 
 static void PIPE_CDECL
-#if defined(PIPE_CC_GCC)
+#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
 __attribute__((force_align_arg_pointer))
 #endif
 lg24f(
    float *store )
 {
+#if defined(PIPE_ARCH_SSE)
    _mm_store_ps(&store[0], log2f4( _mm_load_ps(&store[0]) ));
+#else
+   store[0] = util_fast_log2( store[0] );
+   store[1] = util_fast_log2( store[1] );
+   store[2] = util_fast_log2( store[2] );
+   store[3] = util_fast_log2( store[3] );
+#endif
 }
 
 static void
@@ -930,19 +954,19 @@ emit_neg(
 }
 
 static void PIPE_CDECL
-#if defined(PIPE_CC_GCC)
+#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
 __attribute__((force_align_arg_pointer))
 #endif
 pow4f(
    float *store )
 {
-#if 1
+#if defined(PIPE_ARCH_SSE)
    _mm_store_ps(&store[0], powf4( _mm_load_ps(&store[0]), _mm_load_ps(&store[4]) ));
 #else
-   store[0] = powf( store[0], store[4] );
-   store[1] = powf( store[1], store[5] );
-   store[2] = powf( store[2], store[6] );
-   store[3] = powf( store[3], store[7] );
+   store[0] = util_fast_pow( store[0], store[4] );
+   store[1] = util_fast_pow( store[1], store[5] );
+   store[2] = util_fast_pow( store[2], store[6] );
+   store[3] = util_fast_pow( store[3], store[7] );
 #endif
 }
 
diff --git a/src/gallium/drivers/softpipe/sp_fs_sse.c b/src/gallium/drivers/softpipe/sp_fs_sse.c
index 8aa597f633..31908a517b 100644
--- a/src/gallium/drivers/softpipe/sp_fs_sse.c
+++ b/src/gallium/drivers/softpipe/sp_fs_sse.c
@@ -40,7 +40,7 @@
 #include "tgsi/tgsi_sse2.h"
 
 
-#if defined(PIPE_ARCH_X86) && defined(PIPE_ARCH_SSE)
+#if defined(PIPE_ARCH_X86)
 
 #include "rtasm/rtasm_x86sse.h"
 
-- 
cgit v1.2.3