summaryrefslogtreecommitdiff
path: root/src/gallium/auxiliary/translate
diff options
context:
space:
mode:
authorLuca Barbieri <luca@luca-barbieri.com>2010-08-10 09:51:20 +0200
committerLuca Barbieri <luca@luca-barbieri.com>2010-08-16 16:57:05 +0200
commitddcf028aa0a1bd6f79381164c8b1c3b816792e47 (patch)
tree47f5ff96aeaf826c9898af2b4ef694ad7d606c05 /src/gallium/auxiliary/translate
parent9271059b361128070c68b3d1a7982b4f9f151546 (diff)
translate_generic: use memcpy if possible (v3)
Changes in v3:
- If we can do a copy, don't try to get an emit func, as that can assert(0)

Changes in v2:
- Add comment regarding copy_size

When used in GPU drivers, translate can be used to simultaneously perform a gather operation and convert away from unsupported formats. In this use case, input and output formats will often be identical, so it clearly makes sense to use a memcpy. Instead, translate currently insists on converting to and from 32-bit floating point numbers. This is not only extremely expensive, but it also loses precision for 32/64-bit integers and 64-bit floating point numbers.

This patch changes translate_generic to just use memcpy if the formats are identical, non-blocked, and have an integral number of bytes per pixel (note that all sensible vertex formats are like this).
Diffstat (limited to 'src/gallium/auxiliary/translate')
-rw-r--r--src/gallium/auxiliary/translate/translate_generic.c108
1 files changed, 75 insertions, 33 deletions
diff --git a/src/gallium/auxiliary/translate/translate_generic.c b/src/gallium/auxiliary/translate/translate_generic.c
index 42cfd763e9..9d2653920d 100644
--- a/src/gallium/auxiliary/translate/translate_generic.c
+++ b/src/gallium/auxiliary/translate/translate_generic.c
@@ -64,6 +64,14 @@ struct translate_generic {
unsigned input_stride;
unsigned max_index;
+ /* this value is set to -1 if this is a normal element with output_format != input_format:
+ * in this case, u_format is used to do a full conversion
+ *
+ * this value is set to the format size in bytes if output_format == input_format or for 32-bit instance ids:
+ * in this case, memcpy is used to copy this amount of bytes
+ */
+ int copy_size;
+
} attrib[PIPE_MAX_ATTRIBS];
unsigned nr_attrib;
@@ -354,8 +362,6 @@ static emit_func get_emit_func( enum pipe_format format )
}
}
-
-
/**
* Fetch vertex attributes for 'count' vertices.
*/
@@ -380,9 +386,10 @@ static void PIPE_CDECL generic_run_elts( struct translate *translate,
float data[4];
char *dst = vert + tg->attrib[attr].output_offset;
- if (tg->attrib[attr].type == TRANSLATE_ELEMENT_NORMAL) {
+ if (tg->attrib[attr].type == TRANSLATE_ELEMENT_NORMAL) {
const uint8_t *src;
unsigned index;
+ int copy_size;
if (tg->attrib[attr].instance_divisor) {
index = instance_id / tg->attrib[attr].instance_divisor;
@@ -396,27 +403,34 @@ static void PIPE_CDECL generic_run_elts( struct translate *translate,
src = tg->attrib[attr].input_ptr +
tg->attrib[attr].input_stride * index;
- tg->attrib[attr].fetch( data, src, 0, 0 );
-
- if (0)
- debug_printf("Fetch elt attr %d from %p stride %d div %u max %u index %d: "
- " %f, %f, %f, %f \n",
- attr,
- tg->attrib[attr].input_ptr,
- tg->attrib[attr].input_stride,
- tg->attrib[attr].instance_divisor,
- tg->attrib[attr].max_index,
- index,
- data[0], data[1],data[2], data[3]);
+ copy_size = tg->attrib[attr].copy_size;
+ if(likely(copy_size >= 0))
+ memcpy(dst, src, copy_size);
+ else
+ {
+ tg->attrib[attr].fetch( data, src, 0, 0 );
+
+ if (0)
+ debug_printf("Fetch elt attr %d from %p stride %d div %u max %u index %d: "
+ " %f, %f, %f, %f \n",
+ attr,
+ tg->attrib[attr].input_ptr,
+ tg->attrib[attr].input_stride,
+ tg->attrib[attr].instance_divisor,
+ tg->attrib[attr].max_index,
+ index,
+ data[0], data[1],data[2], data[3]);
+ tg->attrib[attr].emit( data, dst );
+ }
} else {
- data[0] = (float)instance_id;
+ if(likely(tg->attrib[attr].copy_size >= 0))
+ memcpy(data, &instance_id, 4);
+ else
+ {
+ data[0] = (float)instance_id;
+ tg->attrib[attr].emit( data, dst );
+ }
}
-
- if (0)
- debug_printf("vert %d/%d attr %d: %f %f %f %f\n",
- i, elt, attr, data[0], data[1], data[2], data[3]);
-
- tg->attrib[attr].emit( data, dst );
}
vert += tg->translate.key.output_stride;
}
@@ -448,6 +462,7 @@ static void PIPE_CDECL generic_run( struct translate *translate,
if (tg->attrib[attr].type == TRANSLATE_ELEMENT_NORMAL) {
const uint8_t *src;
unsigned index;
+ int copy_size;
if (tg->attrib[attr].instance_divisor) {
index = instance_id / tg->attrib[attr].instance_divisor;
@@ -462,25 +477,33 @@ static void PIPE_CDECL generic_run( struct translate *translate,
src = tg->attrib[attr].input_ptr +
tg->attrib[attr].input_stride * index;
- tg->attrib[attr].fetch( data, src, 0, 0 );
+ copy_size = tg->attrib[attr].copy_size;
+ if(likely(copy_size >= 0))
+ memcpy(dst, src, copy_size);
+ else
+ {
+ tg->attrib[attr].fetch( data, src, 0, 0 );
- if (0)
- debug_printf("Fetch linear attr %d from %p stride %d index %d: "
+ if (0)
+ debug_printf("Fetch linear attr %d from %p stride %d index %d: "
" %f, %f, %f, %f \n",
attr,
tg->attrib[attr].input_ptr,
tg->attrib[attr].input_stride,
index,
data[0], data[1],data[2], data[3]);
+
+ tg->attrib[attr].emit( data, dst );
+ }
} else {
- data[0] = (float)instance_id;
+ if(likely(tg->attrib[attr].copy_size >= 0))
+ memcpy(data, &instance_id, 4);
+ else
+ {
+ data[0] = (float)instance_id;
+ tg->attrib[attr].emit( data, dst );
+ }
}
-
- if (0)
- debug_printf("vert %d attr %d: %f %f %f %f\n",
- i, attr, data[0], data[1], data[2], data[3]);
-
- tg->attrib[attr].emit( data, dst );
}
vert += tg->translate.key.output_stride;
@@ -544,9 +567,28 @@ struct translate *translate_generic_create( const struct translate_key *key )
tg->attrib[i].input_offset = key->element[i].input_offset;
tg->attrib[i].instance_divisor = key->element[i].instance_divisor;
- tg->attrib[i].emit = get_emit_func(key->element[i].output_format);
tg->attrib[i].output_offset = key->element[i].output_offset;
+ tg->attrib[i].copy_size = -1;
+ if (tg->attrib[i].type == TRANSLATE_ELEMENT_INSTANCE_ID)
+ {
+ if(key->element[i].output_format == PIPE_FORMAT_R32_USCALED
+ || key->element[i].output_format == PIPE_FORMAT_R32_SSCALED)
+ tg->attrib[i].copy_size = 4;
+ }
+ else
+ {
+ if(key->element[i].input_format == key->element[i].output_format
+ && format_desc->block.width == 1
+ && format_desc->block.height == 1
+ && !(format_desc->block.bits & 7))
+ tg->attrib[i].copy_size = format_desc->block.bits >> 3;
+ }
+
+ if(tg->attrib[i].copy_size < 0)
+ tg->attrib[i].emit = get_emit_func(key->element[i].output_format);
+ else
+ tg->attrib[i].emit = NULL;
}
tg->nr_attrib = key->nr_elements;