From 217d37940771dd02ff1aa365105eca2c7a09d623 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Wed, 2 Apr 2008 14:01:42 -0600
Subject: cell: minor texture improvements

Precompute tiles_per_row once in cmd_state_texture() instead of recomputing
width / TILE_SIZE on every texel fetch.  Use ushort multiplies in a few
places.  Add comments.
---
 src/gallium/drivers/cell/spu/spu_main.c    |  2 ++
 src/gallium/drivers/cell/spu/spu_main.h    |  3 ++-
 src/gallium/drivers/cell/spu/spu_texture.c | 32 ++++++++++++++++++++----------
 3 files changed, 26 insertions(+), 11 deletions(-)

(limited to 'src/gallium/drivers')

diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index 1ab1c40379..e04ffeb9b1 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -343,6 +343,8 @@ cmd_state_texture(const struct cell_command_texture *texture)
    spu.texture[unit].width = width;
    spu.texture[unit].height = height;
 
+   spu.texture[unit].tiles_per_row = width / TILE_SIZE;
+
    spu.texture[unit].tex_size = (vector float) { width, height, 0.0, 0.0};
    spu.texture[unit].tex_size_mask = (vector unsigned int)
          { width - 1, height - 1, 0, 0 };
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index e9e39cbeab..e962e1426c 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -105,7 +105,8 @@ struct spu_framebuffer {
 struct spu_texture
 {
    void *start;
-   uint width, height;
+   ushort width, height;
+   ushort tiles_per_row;
    vector float tex_size;
    vector unsigned int tex_size_mask; /**< == int(size - 1) */
    vector unsigned int tex_size_x_mask; /**< == int(size - 1) */
diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index ceab246980..e9a2754e57 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -51,22 +51,25 @@ invalidate_tex_cache(void)
 static uint
 get_texel(uint unit, vec_uint4 coordinate)
 {
+   const unsigned texture_ea = (uintptr_t) spu.texture[unit].start;
+   ushort x = spu_extract(coordinate, 0);
+   ushort y = spu_extract(coordinate, 1);
+   unsigned tile_offset = sizeof(tile_t)
+      * ((y / TILE_SIZE * spu.texture[unit].tiles_per_row) + (x / TILE_SIZE));
+   ushort texel_offset = (ushort) 4
+      * (ushort) (((ushort) (y % TILE_SIZE) * (ushort) TILE_SIZE) + (x % TILE_SIZE));
    vec_uint4 tmp;
-   unsigned x = spu_extract(coordinate, 0);
-   unsigned y = spu_extract(coordinate, 1);
-   const unsigned tiles_per_row = spu.texture[unit].width / TILE_SIZE;
-   unsigned tile_offset = sizeof(tile_t) * ((y / TILE_SIZE * tiles_per_row) 
-                                            + (x / TILE_SIZE));
-   unsigned texel_offset = 4 * (((y % TILE_SIZE) * TILE_SIZE)
-                                + (x % TILE_SIZE));
 
    spu_dcache_fetch_unaligned((qword *) & tmp,
-                              spu.texture[unit].start + tile_offset + texel_offset,
+                              texture_ea + tile_offset + texel_offset,
                               4);
    return spu_extract(tmp, 0);
 }
 
 
+/**
+ * Get four texels from locations (x[0], y[0]), (x[1], y[1]) ...
+ */
 static void
 get_four_texels(uint unit, vec_uint4 x, vec_uint4 y, vec_uint4 *texels)
 {
@@ -76,7 +79,7 @@ get_four_texels(uint unit, vec_uint4 x, vec_uint4 y, vec_uint4 *texels)
    const qword offset_x = si_andi((qword) x, 0x1f);
    const qword offset_y = si_andi((qword) y, 0x1f);
 
-   const qword tiles_per_row = (qword) spu_splats(spu.texture[unit].width / TILE_SIZE);
+   const qword tiles_per_row = (qword) spu_splats(spu.texture[unit].tiles_per_row);
    const qword tile_size = (qword) spu_splats(sizeof(tile_t));
 
    qword tile_offset = si_mpya((qword) tile_y, tiles_per_row, (qword) tile_x);
@@ -97,6 +100,7 @@ get_four_texels(uint unit, vec_uint4 x, vec_uint4 y, vec_uint4 *texels)
                               texture_ea + spu_extract(offset, 3), 4);
 }
 
+
 /**
  * Get texture sample at texcoord.
  * XXX this is extremely primitive for now.
@@ -126,17 +130,25 @@ sample_texture_bilinear(uint unit, vector float texcoord)
 
    vec_uint4 texels[4];
    
+   /* setup texcoords for quad:
+    *  +-----+-----+
+    *  |x0,y0|x1,y1|
+    *  +-----+-----+
+    *  |x2,y2|x3,y3|
+    *  +-----+-----+
+    */
    vec_uint4 x = spu_splats(spu_extract(itc, 0));
    vec_uint4 y = spu_splats(spu_extract(itc, 1));
-
    x = spu_add(x, offset_x);
    y = spu_add(y, offset_y);
 
+   /* GL_REPEAT wrap mode: */
    x = spu_and(x, spu.texture[unit].tex_size_x_mask);
    y = spu_and(y, spu.texture[unit].tex_size_y_mask);
 
    get_four_texels(unit, x, y, texels);
 
+   /* integer A8R8G8B8 to float texel conversion */
    vector float texel00 = spu_unpack_A8R8G8B8(spu_extract(texels[0], 0));
    vector float texel01 = spu_unpack_A8R8G8B8(spu_extract(texels[1], 0));
    vector float texel10 = spu_unpack_A8R8G8B8(spu_extract(texels[2], 0));
-- 
cgit v1.2.3


From a7504ad587ee8cbfa9958ad23321a691ce0823d3 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Wed, 2 Apr 2008 14:30:28 -0600
Subject: cell: added some comments/ideas about better texture sampling

---
 src/gallium/drivers/cell/spu/spu_texture.c | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

(limited to 'src/gallium/drivers')

diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index e9a2754e57..5051774f00 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -48,9 +48,16 @@ invalidate_tex_cache(void)
 }
 
 
+/**
+ * XXX look into getting texels for all four pixels in a quad at once.
+ */
 static uint
 get_texel(uint unit, vec_uint4 coordinate)
 {
+   /*
+    * XXX we could do the "/ TILE_SIZE" and "% TILE_SIZE" operations as
+    * SIMD since X and Y are already in a SIMD register.
+    */
    const unsigned texture_ea = (uintptr_t) spu.texture[unit].start;
    ushort x = spu_extract(coordinate, 0);
    ushort y = spu_extract(coordinate, 1);
@@ -69,6 +76,16 @@ get_texel(uint unit, vec_uint4 coordinate)
 
 /**
  * Get four texels from locations (x[0], y[0]), (x[1], y[1]) ...
+ *
+ * NOTE: in the typical case of bilinear filtering, the four texels
+ * are in a 2x2 group so we could get by with just two dcache fetches
+ * (two side-by-side texels per fetch).  But when bilinear filtering
+ * wraps around a texture edge, we'll probably need code like we have
+ * now.
+ * FURTHERMORE: since we're rasterizing a quad of 2x2 pixels at a time,
+ * it's quite likely that the four pixels in a quad will need some of the
+ * same texels.  So look into doing texture fetches for four pixels at
+ * a time.
  */
 static void
 get_four_texels(uint unit, vec_uint4 x, vec_uint4 y, vec_uint4 *texels)
@@ -103,7 +120,6 @@ get_four_texels(uint unit, vec_uint4 x, vec_uint4 y, vec_uint4 *texels)
 
 /**
  * Get texture sample at texcoord.
- * XXX this is extremely primitive for now.
  */
 vector float
 sample_texture_nearest(uint unit, vector float texcoord)
-- 
cgit v1.2.3