cell: Implement code-gen for logic op

This also implements code-gen for the float-to-packed color conversion. The conversion is currently hardcoded for A8R8G8B8, but that can easily be fixed as soon as other color depths are supported by the Cell driver.
author: Ian Romanick <idr@us.ibm.com> 2008-03-26 10:45:32 -0700
committer: Ian Romanick <idr@us.ibm.com> 2008-03-26 10:47:17 -0700
commit: 92126cea846959bb2152905a7712753d1114bd6b (patch)
tree: 23e2288011423533f0d8f554c254b2e50f0cae4f /src/gallium/drivers/cell/spu
parent: 1ecb2e4a7a5881d5a98679b421d78fd11c729ebc (diff)
3 files changed, 60 insertions, 27 deletions
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index 0a490ab277..fccff01e10 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -64,6 +64,9 @@ static unsigned char depth_stencil_code_buffer[4 * 64]
 static unsigned char fb_blend_code_buffer[4 * 64]
     ALIGN16_ATTRIB;
 
+static unsigned char logicop_code_buffer[4 * 64]
+    ALIGN16_ATTRIB;
+
 
 /**
  * Tell the PPU that this SPU has finished copying a buffer to
@@ -513,6 +516,22 @@ cmd_batch(uint opcode)
          pos += (1 + ROUNDUP8(sizeof(struct cell_attribute_fetch_code)) / 8);
          break;
       }
+      case CELL_CMD_STATE_LOGICOP: {
+         struct cell_command_logicop *code =
+             (struct cell_command_logicop *) &buffer[pos+1];
+
+              mfc_get(logicop_code_buffer,
+                      (unsigned int) code->base,  /* src */
+                      code->size,
+                      TAG_BATCH_BUFFER,
+                      0, /* tid */
+                      0  /* rid */);
+         wait_on_mask(1 << TAG_BATCH_BUFFER);
+
+	 spu.logicop = (logicop_func) logicop_code_buffer;
+         pos += (1 + ROUNDUP8(sizeof(struct cell_command_logicop)) / 8);
+         break;
+      }
       case CELL_CMD_FLUSH_BUFFER_RANGE: {
 	 struct cell_buffer_range *br = (struct cell_buffer_range *)
 	     &buffer[pos+1];
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 49f5d99674..c20452931a 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -77,9 +77,14 @@ struct spu_blend_results {
 typedef struct spu_blend_results (*blend_func)(
     qword frag_r, qword frag_g, qword frag_b, qword frag_a,
     qword pixel_r, qword pixel_g, qword pixel_b, qword pixel_a,
-    qword const_r, qword const_g, qword const_b, qword const_a,
+    qword const_r, qword const_g, qword const_b, qword const_a);
+
+typedef struct spu_blend_results (*logicop_func)(
+    qword pixel_r, qword pixel_g, qword pixel_b, qword pixel_a,
+    qword frag_r, qword frag_g, qword frag_b, qword frag_a,
     qword frag_mask);
 
+
 struct spu_framebuffer {
    void *color_start;              /**< addr of color surface in main memory */
    void *depth_start;              /**< addr of depth surface in main memory */
@@ -111,6 +116,8 @@ struct spu_global
    blend_func blend;
    qword const_blend_color[4] ALIGN16_ATTRIB;
 
+   logicop_func logicop;
+
    struct pipe_sampler_state sampler[PIPE_MAX_SAMPLERS];
    struct cell_command_texture texture;
 
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index e6a1ce01df..95c629a8aa 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -305,7 +305,6 @@ emit_quad( int x, int y, mask_t mask )
    if (spu_extract(spu_orx(mask), 0)) {
       const int ix = x - setup.cliprect_minx;
       const int iy = y - setup.cliprect_miny;
-      const vector unsigned char shuffle = spu.color_shuffle;
       vector float colors[4];
 
       spu.cur_ctile_status = TILE_STATUS_DIRTY;
@@ -330,45 +329,53 @@ emit_quad( int x, int y, mask_t mask )
       }
 
 
+      /* Convert fragment data from AoS to SoA format.
+       */
+      qword soa_frag[4];
+      _transpose_matrix4x4((vec_float4 *) soa_frag, colors);
+
       /* Read the current framebuffer values.
-       *
-       * Ignore read_fb for now.  In the future we can use this to avoid
-       * reading the framebuffer if read_fb is false and the fragment mask is
-       * all 0xffffffff.  This is the common case, so it is probably worth
-       * the effort.  We'll have to profile to determine whether or not the
-       * extra conditional branches hurt overall performance.
        */
-      vec_float4 aos_pix[4] = {
-         spu_unpack_A8R8G8B8(spu.ctile.ui[iy+0][ix+0]),
-         spu_unpack_A8R8G8B8(spu.ctile.ui[iy+0][ix+1]),
-         spu_unpack_A8R8G8B8(spu.ctile.ui[iy+1][ix+0]),
-         spu_unpack_A8R8G8B8(spu.ctile.ui[iy+1][ix+1]),
+      const qword pix[4] = {
+         (qword) spu_splats(spu.ctile.ui[iy+0][ix+0]),
+         (qword) spu_splats(spu.ctile.ui[iy+0][ix+1]),
+         (qword) spu_splats(spu.ctile.ui[iy+1][ix+0]),
+         (qword) spu_splats(spu.ctile.ui[iy+1][ix+1]),
       };
 
       qword soa_pix[4];
-      qword soa_frag[4];
 
-      /* Convert pixel and fragment data from AoS to SoA format.
-       */
-      _transpose_matrix4x4((vec_float4 *) soa_pix, aos_pix);
-      _transpose_matrix4x4((vec_float4 *) soa_frag, colors);
+      if (spu.read_fb) {
+         /* Convert pixel data from AoS to SoA format.
+          */
+         vec_float4 aos_pix[4] = {
+            spu_unpack_A8R8G8B8(spu.ctile.ui[iy+0][ix+0]),
+            spu_unpack_A8R8G8B8(spu.ctile.ui[iy+0][ix+1]),
+            spu_unpack_A8R8G8B8(spu.ctile.ui[iy+1][ix+0]),
+            spu_unpack_A8R8G8B8(spu.ctile.ui[iy+1][ix+1]),
+         };
+
+         _transpose_matrix4x4((vec_float4 *) soa_pix, aos_pix);
+      }
 
-      const struct spu_blend_results result =
+
+      struct spu_blend_results result =
           (*spu.blend)(soa_frag[0], soa_frag[1], soa_frag[2], soa_frag[3],
                        soa_pix[0], soa_pix[1], soa_pix[2], soa_pix[3],
                        spu.const_blend_color[0], spu.const_blend_color[1],
-                       spu.const_blend_color[2], spu.const_blend_color[3],
-                       (qword) mask);
+                       spu.const_blend_color[2], spu.const_blend_color[3]);
 
 
       /* Convert final pixel data from SoA to AoS format.
        */
-      _transpose_matrix4x4(aos_pix, (const vec_float4 *) &result);
-
-      spu.ctile.ui[iy+0][ix+0] = spu_pack_color_shuffle(aos_pix[0], shuffle);
-      spu.ctile.ui[iy+0][ix+1] = spu_pack_color_shuffle(aos_pix[1], shuffle);
-      spu.ctile.ui[iy+1][ix+0] = spu_pack_color_shuffle(aos_pix[2], shuffle);
-      spu.ctile.ui[iy+1][ix+1] = spu_pack_color_shuffle(aos_pix[3], shuffle);
+      result = (*spu.logicop)(pix[0], pix[1], pix[2], pix[3],
+                              result.r, result.g, result.b, result.a,
+                              (qword) mask);
+
+      spu.ctile.ui[iy+0][ix+0] = spu_extract((vec_uint4) result.r, 0);
+      spu.ctile.ui[iy+0][ix+1] = spu_extract((vec_uint4) result.g, 0);
+      spu.ctile.ui[iy+1][ix+0] = spu_extract((vec_uint4) result.b, 0);
+      spu.ctile.ui[iy+1][ix+1] = spu_extract((vec_uint4) result.a, 0);
    }
 #endif
 }
author	Ian Romanick <idr@us.ibm.com>	2008-03-26 10:45:32 -0700
committer	Ian Romanick <idr@us.ibm.com>	2008-03-26 10:47:17 -0700
commit	92126cea846959bb2152905a7712753d1114bd6b (patch)
tree	23e2288011423533f0d8f554c254b2e50f0cae4f /src/gallium/drivers/cell/spu
parent	1ecb2e4a7a5881d5a98679b421d78fd11c729ebc (diff)