summary refs log tree commit diff
path: root/src
diff options
context:
space:
mode:
author Ian Romanick <idr@us.ibm.com> 2008-02-12 11:29:34 -0800
committer Ian Romanick <idr@us.ibm.com> 2008-02-14 10:08:48 -0800
commit dd07e154d26c2c3ec248b7143eb67b6b4410246a (patch)
tree 36104f1770d9ea5c07837e2f27b9780c068439a8 /src
parent 125451b9f024ea5845eb6c1b3056bc1f1995cc55 (diff)
Fetch routines convert and transpose all 4 vertices at once.
Diffstat (limited to 'src')
-rw-r--r-- src/mesa/pipe/cell/spu/spu_vertex_fetch.c 152
-rw-r--r-- src/mesa/pipe/cell/spu/spu_vertex_shader.h 2
2 files changed, 71 insertions, 83 deletions
diff --git a/src/mesa/pipe/cell/spu/spu_vertex_fetch.c b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
index cbd389435e..3bbf9b7be4 100644
--- a/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
+++ b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
@@ -100,7 +100,7 @@ fetch_unaligned(qword *dst, unsigned ea, unsigned size)
}
-#define CVT_32_FLOAT(q) (*q)
+#define CVT_32_FLOAT(q) (*(q))
static INLINE qword
CVT_64_FLOAT(const qword *qw)
@@ -242,85 +242,90 @@ CVT_32_SNORM(const qword *qw)
* This is probably needed/duplicated elsewhere, eg format
* conversion, texture sampling etc.
*/
-#define FETCH_ATTRIB( NAME, SZ, CVT ) \
-static qword \
-fetch_##NAME(const qword *qw) \
-{ \
- qword expanded = CVT(qw); \
- return si_selb(expanded, (qword) defaults, SZ); \
+#define FETCH_ATTRIB( NAME, SZ, CVT, N ) \
+static void \
+fetch_##NAME(qword *out, const qword *in) \
+{ \
+ qword tmp[4]; \
+ \
+ tmp[0] = si_selb(CVT(in + (0 * N)), (qword) defaults, SZ); \
+ tmp[1] = si_selb(CVT(in + (1 * N)), (qword) defaults, SZ); \
+ tmp[2] = si_selb(CVT(in + (2 * N)), (qword) defaults, SZ); \
+ tmp[3] = si_selb(CVT(in + (3 * N)), (qword) defaults, SZ); \
+ _transpose_matrix4x4((vec_float4 *) out, (vec_float4 *) tmp); \
}
-FETCH_ATTRIB( R64G64B64A64_FLOAT, SZ_4, CVT_64_FLOAT )
-FETCH_ATTRIB( R64G64B64_FLOAT, SZ_3, CVT_64_FLOAT )
-FETCH_ATTRIB( R64G64_FLOAT, SZ_2, CVT_64_FLOAT )
-FETCH_ATTRIB( R64_FLOAT, SZ_1, CVT_64_FLOAT )
+FETCH_ATTRIB( R64G64B64A64_FLOAT, SZ_4, CVT_64_FLOAT, 2 )
+FETCH_ATTRIB( R64G64B64_FLOAT, SZ_3, CVT_64_FLOAT, 2 )
+FETCH_ATTRIB( R64G64_FLOAT, SZ_2, CVT_64_FLOAT, 2 )
+FETCH_ATTRIB( R64_FLOAT, SZ_1, CVT_64_FLOAT, 2 )
-FETCH_ATTRIB( R32G32B32A32_FLOAT, SZ_4, CVT_32_FLOAT )
-FETCH_ATTRIB( R32G32B32_FLOAT, SZ_3, CVT_32_FLOAT )
-FETCH_ATTRIB( R32G32_FLOAT, SZ_2, CVT_32_FLOAT )
-FETCH_ATTRIB( R32_FLOAT, SZ_1, CVT_32_FLOAT )
+FETCH_ATTRIB( R32G32B32A32_FLOAT, SZ_4, CVT_32_FLOAT, 1 )
+FETCH_ATTRIB( R32G32B32_FLOAT, SZ_3, CVT_32_FLOAT, 1 )
+FETCH_ATTRIB( R32G32_FLOAT, SZ_2, CVT_32_FLOAT, 1 )
+FETCH_ATTRIB( R32_FLOAT, SZ_1, CVT_32_FLOAT, 1 )
-FETCH_ATTRIB( R32G32B32A32_USCALED, SZ_4, CVT_32_USCALED )
-FETCH_ATTRIB( R32G32B32_USCALED, SZ_3, CVT_32_USCALED )
-FETCH_ATTRIB( R32G32_USCALED, SZ_2, CVT_32_USCALED )
-FETCH_ATTRIB( R32_USCALED, SZ_1, CVT_32_USCALED )
+FETCH_ATTRIB( R32G32B32A32_USCALED, SZ_4, CVT_32_USCALED, 1 )
+FETCH_ATTRIB( R32G32B32_USCALED, SZ_3, CVT_32_USCALED, 1 )
+FETCH_ATTRIB( R32G32_USCALED, SZ_2, CVT_32_USCALED, 1 )
+FETCH_ATTRIB( R32_USCALED, SZ_1, CVT_32_USCALED, 1 )
-FETCH_ATTRIB( R32G32B32A32_SSCALED, SZ_4, CVT_32_SSCALED )
-FETCH_ATTRIB( R32G32B32_SSCALED, SZ_3, CVT_32_SSCALED )
-FETCH_ATTRIB( R32G32_SSCALED, SZ_2, CVT_32_SSCALED )
-FETCH_ATTRIB( R32_SSCALED, SZ_1, CVT_32_SSCALED )
+FETCH_ATTRIB( R32G32B32A32_SSCALED, SZ_4, CVT_32_SSCALED, 1 )
+FETCH_ATTRIB( R32G32B32_SSCALED, SZ_3, CVT_32_SSCALED, 1 )
+FETCH_ATTRIB( R32G32_SSCALED, SZ_2, CVT_32_SSCALED, 1 )
+FETCH_ATTRIB( R32_SSCALED, SZ_1, CVT_32_SSCALED, 1 )
-FETCH_ATTRIB( R32G32B32A32_UNORM, SZ_4, CVT_32_UNORM )
-FETCH_ATTRIB( R32G32B32_UNORM, SZ_3, CVT_32_UNORM )
-FETCH_ATTRIB( R32G32_UNORM, SZ_2, CVT_32_UNORM )
-FETCH_ATTRIB( R32_UNORM, SZ_1, CVT_32_UNORM )
+FETCH_ATTRIB( R32G32B32A32_UNORM, SZ_4, CVT_32_UNORM, 1 )
+FETCH_ATTRIB( R32G32B32_UNORM, SZ_3, CVT_32_UNORM, 1 )
+FETCH_ATTRIB( R32G32_UNORM, SZ_2, CVT_32_UNORM, 1 )
+FETCH_ATTRIB( R32_UNORM, SZ_1, CVT_32_UNORM, 1 )
-FETCH_ATTRIB( R32G32B32A32_SNORM, SZ_4, CVT_32_SNORM )
-FETCH_ATTRIB( R32G32B32_SNORM, SZ_3, CVT_32_SNORM )
-FETCH_ATTRIB( R32G32_SNORM, SZ_2, CVT_32_SNORM )
-FETCH_ATTRIB( R32_SNORM, SZ_1, CVT_32_SNORM )
+FETCH_ATTRIB( R32G32B32A32_SNORM, SZ_4, CVT_32_SNORM, 1 )
+FETCH_ATTRIB( R32G32B32_SNORM, SZ_3, CVT_32_SNORM, 1 )
+FETCH_ATTRIB( R32G32_SNORM, SZ_2, CVT_32_SNORM, 1 )
+FETCH_ATTRIB( R32_SNORM, SZ_1, CVT_32_SNORM, 1 )
-FETCH_ATTRIB( R16G16B16A16_USCALED, SZ_4, CVT_16_USCALED )
-FETCH_ATTRIB( R16G16B16_USCALED, SZ_3, CVT_16_USCALED )
-FETCH_ATTRIB( R16G16_USCALED, SZ_2, CVT_16_USCALED )
-FETCH_ATTRIB( R16_USCALED, SZ_1, CVT_16_USCALED )
+FETCH_ATTRIB( R16G16B16A16_USCALED, SZ_4, CVT_16_USCALED, 1 )
+FETCH_ATTRIB( R16G16B16_USCALED, SZ_3, CVT_16_USCALED, 1 )
+FETCH_ATTRIB( R16G16_USCALED, SZ_2, CVT_16_USCALED, 1 )
+FETCH_ATTRIB( R16_USCALED, SZ_1, CVT_16_USCALED, 1 )
-FETCH_ATTRIB( R16G16B16A16_SSCALED, SZ_4, CVT_16_SSCALED )
-FETCH_ATTRIB( R16G16B16_SSCALED, SZ_3, CVT_16_SSCALED )
-FETCH_ATTRIB( R16G16_SSCALED, SZ_2, CVT_16_SSCALED )
-FETCH_ATTRIB( R16_SSCALED, SZ_1, CVT_16_SSCALED )
+FETCH_ATTRIB( R16G16B16A16_SSCALED, SZ_4, CVT_16_SSCALED, 1 )
+FETCH_ATTRIB( R16G16B16_SSCALED, SZ_3, CVT_16_SSCALED, 1 )
+FETCH_ATTRIB( R16G16_SSCALED, SZ_2, CVT_16_SSCALED, 1 )
+FETCH_ATTRIB( R16_SSCALED, SZ_1, CVT_16_SSCALED, 1 )
-FETCH_ATTRIB( R16G16B16A16_UNORM, SZ_4, CVT_16_UNORM )
-FETCH_ATTRIB( R16G16B16_UNORM, SZ_3, CVT_16_UNORM )
-FETCH_ATTRIB( R16G16_UNORM, SZ_2, CVT_16_UNORM )
-FETCH_ATTRIB( R16_UNORM, SZ_1, CVT_16_UNORM )
+FETCH_ATTRIB( R16G16B16A16_UNORM, SZ_4, CVT_16_UNORM, 1 )
+FETCH_ATTRIB( R16G16B16_UNORM, SZ_3, CVT_16_UNORM, 1 )
+FETCH_ATTRIB( R16G16_UNORM, SZ_2, CVT_16_UNORM, 1 )
+FETCH_ATTRIB( R16_UNORM, SZ_1, CVT_16_UNORM, 1 )
-FETCH_ATTRIB( R16G16B16A16_SNORM, SZ_4, CVT_16_SNORM )
-FETCH_ATTRIB( R16G16B16_SNORM, SZ_3, CVT_16_SNORM )
-FETCH_ATTRIB( R16G16_SNORM, SZ_2, CVT_16_SNORM )
-FETCH_ATTRIB( R16_SNORM, SZ_1, CVT_16_SNORM )
+FETCH_ATTRIB( R16G16B16A16_SNORM, SZ_4, CVT_16_SNORM, 1 )
+FETCH_ATTRIB( R16G16B16_SNORM, SZ_3, CVT_16_SNORM, 1 )
+FETCH_ATTRIB( R16G16_SNORM, SZ_2, CVT_16_SNORM, 1 )
+FETCH_ATTRIB( R16_SNORM, SZ_1, CVT_16_SNORM, 1 )
-FETCH_ATTRIB( R8G8B8A8_USCALED, SZ_4, CVT_8_USCALED )
-FETCH_ATTRIB( R8G8B8_USCALED, SZ_3, CVT_8_USCALED )
-FETCH_ATTRIB( R8G8_USCALED, SZ_2, CVT_8_USCALED )
-FETCH_ATTRIB( R8_USCALED, SZ_1, CVT_8_USCALED )
+FETCH_ATTRIB( R8G8B8A8_USCALED, SZ_4, CVT_8_USCALED, 1 )
+FETCH_ATTRIB( R8G8B8_USCALED, SZ_3, CVT_8_USCALED, 1 )
+FETCH_ATTRIB( R8G8_USCALED, SZ_2, CVT_8_USCALED, 1 )
+FETCH_ATTRIB( R8_USCALED, SZ_1, CVT_8_USCALED, 1 )
-FETCH_ATTRIB( R8G8B8A8_SSCALED, SZ_4, CVT_8_SSCALED )
-FETCH_ATTRIB( R8G8B8_SSCALED, SZ_3, CVT_8_SSCALED )
-FETCH_ATTRIB( R8G8_SSCALED, SZ_2, CVT_8_SSCALED )
-FETCH_ATTRIB( R8_SSCALED, SZ_1, CVT_8_SSCALED )
+FETCH_ATTRIB( R8G8B8A8_SSCALED, SZ_4, CVT_8_SSCALED, 1 )
+FETCH_ATTRIB( R8G8B8_SSCALED, SZ_3, CVT_8_SSCALED, 1 )
+FETCH_ATTRIB( R8G8_SSCALED, SZ_2, CVT_8_SSCALED, 1 )
+FETCH_ATTRIB( R8_SSCALED, SZ_1, CVT_8_SSCALED, 1 )
-FETCH_ATTRIB( R8G8B8A8_UNORM, SZ_4, CVT_8_UNORM )
-FETCH_ATTRIB( R8G8B8_UNORM, SZ_3, CVT_8_UNORM )
-FETCH_ATTRIB( R8G8_UNORM, SZ_2, CVT_8_UNORM )
-FETCH_ATTRIB( R8_UNORM, SZ_1, CVT_8_UNORM )
+FETCH_ATTRIB( R8G8B8A8_UNORM, SZ_4, CVT_8_UNORM, 1 )
+FETCH_ATTRIB( R8G8B8_UNORM, SZ_3, CVT_8_UNORM, 1 )
+FETCH_ATTRIB( R8G8_UNORM, SZ_2, CVT_8_UNORM, 1 )
+FETCH_ATTRIB( R8_UNORM, SZ_1, CVT_8_UNORM, 1 )
-FETCH_ATTRIB( R8G8B8A8_SNORM, SZ_4, CVT_8_SNORM )
-FETCH_ATTRIB( R8G8B8_SNORM, SZ_3, CVT_8_SNORM )
-FETCH_ATTRIB( R8G8_SNORM, SZ_2, CVT_8_SNORM )
-FETCH_ATTRIB( R8_SNORM, SZ_1, CVT_8_SNORM )
+FETCH_ATTRIB( R8G8B8A8_SNORM, SZ_4, CVT_8_SNORM, 1 )
+FETCH_ATTRIB( R8G8B8_SNORM, SZ_3, CVT_8_SNORM, 1 )
+FETCH_ATTRIB( R8G8_SNORM, SZ_2, CVT_8_SNORM, 1 )
+FETCH_ATTRIB( R8_SNORM, SZ_1, CVT_8_SNORM, 1 )
-FETCH_ATTRIB( A8R8G8B8_UNORM, SZ_4, CVT_8_UNORM )
+FETCH_ATTRIB( A8R8G8B8_UNORM, SZ_4, CVT_8_UNORM, 1 )
@@ -584,7 +589,6 @@ static void generic_vertex_fetch(struct spu_vs_context *draw,
unsigned idx;
const unsigned bytes_per_entry = draw->vertex_fetch.size[attr];
const unsigned quads_per_entry = (bytes_per_entry + 15) / 16;
- qword p[4];
qword in[2 * 4];
@@ -609,23 +613,7 @@ static void generic_vertex_fetch(struct spu_vs_context *draw,
/* Convert all 4 vertices to vectors of float.
*/
- idx = 0;
- for (i = 0; i < 4; i++) {
- p[i] = (*fetch)(in + idx);
- idx += quads_per_entry;
- }
-
-
- /* Transpose/swizzle into vector-friendly format. Currently
- * assuming that all vertex shader inputs are float[4], but this
- * isn't true -- if the vertex shader only wants tex0.xy, we
- * could optimize for that.
- *
- * To do so fully without codegen would probably require an
- * excessive number of fetch functions, but we could at least
- * minimize the transpose step:
- */
- _transpose_matrix4x4(&machine->Inputs[attr].xyzw[0].q, p);
+ (*fetch)(&machine->Inputs[attr].xyzw[0].q, in);
}
}
diff --git a/src/mesa/pipe/cell/spu/spu_vertex_shader.h b/src/mesa/pipe/cell/spu/spu_vertex_shader.h
index ea044e841d..8b37a239a4 100644
--- a/src/mesa/pipe/cell/spu/spu_vertex_shader.h
+++ b/src/mesa/pipe/cell/spu/spu_vertex_shader.h
@@ -6,7 +6,7 @@
struct spu_vs_context;
-typedef qword (*spu_fetch_func)(const qword *qw);
+typedef void (*spu_fetch_func)(qword *out, const qword *in);
typedef void (*spu_full_fetch_func)( struct spu_vs_context *draw,
struct spu_exec_machine *machine,
const unsigned *elts,