summaryrefslogtreecommitdiff
path: root/src/gallium/auxiliary/translate
diff options
context:
space:
mode:
Diffstat (limited to 'src/gallium/auxiliary/translate')
-rw-r--r--src/gallium/auxiliary/translate/translate.h18
-rw-r--r--src/gallium/auxiliary/translate/translate_generic.c40
-rw-r--r--src/gallium/auxiliary/translate/translate_sse.c199
3 files changed, 207 insertions, 50 deletions
diff --git a/src/gallium/auxiliary/translate/translate.h b/src/gallium/auxiliary/translate/translate.h
index 34526eb061..54ed2c1a4b 100644
--- a/src/gallium/auxiliary/translate/translate.h
+++ b/src/gallium/auxiliary/translate/translate.h
@@ -44,12 +44,19 @@
#include "pipe/p_format.h"
#include "pipe/p_state.h"
+enum translate_element_type {
+ TRANSLATE_ELEMENT_NORMAL,
+ TRANSLATE_ELEMENT_INSTANCE_ID
+};
+
struct translate_element
{
+ enum translate_element_type type;
enum pipe_format input_format;
enum pipe_format output_format;
unsigned input_buffer:8;
unsigned input_offset:24;
+ unsigned instance_divisor;
unsigned output_offset;
};
@@ -74,11 +81,13 @@ struct translate {
void (PIPE_CDECL *run_elts)( struct translate *,
const unsigned *elts,
unsigned count,
+ unsigned instance_id,
void *output_buffer);
void (PIPE_CDECL *run)( struct translate *,
unsigned start,
unsigned count,
+ unsigned instance_id,
void *output_buffer);
};
@@ -103,8 +112,13 @@ static INLINE int translate_keysize( const struct translate_key *key )
static INLINE int translate_key_compare( const struct translate_key *a,
const struct translate_key *b )
{
- int keysize = translate_keysize(a);
- return memcmp(a, b, keysize);
+ int keysize_a = translate_keysize(a);
+ int keysize_b = translate_keysize(b);
+
+ if (keysize_a != keysize_b) {
+ return keysize_a - keysize_b;
+ }
+ return memcmp(a, b, keysize_a);
}
diff --git a/src/gallium/auxiliary/translate/translate_generic.c b/src/gallium/auxiliary/translate/translate_generic.c
index 266e7ee81e..24727d4988 100644
--- a/src/gallium/auxiliary/translate/translate_generic.c
+++ b/src/gallium/auxiliary/translate/translate_generic.c
@@ -46,9 +46,12 @@ struct translate_generic {
struct translate translate;
struct {
+ enum translate_element_type type;
+
fetch_func fetch;
unsigned buffer;
unsigned input_offset;
+ unsigned instance_divisor;
emit_func emit;
unsigned output_offset;
@@ -568,6 +571,7 @@ static emit_func get_emit_func( enum pipe_format format )
static void PIPE_CDECL generic_run_elts( struct translate *translate,
const unsigned *elts,
unsigned count,
+ unsigned instance_id,
void *output_buffer )
{
struct translate_generic *tg = translate_generic(translate);
@@ -583,13 +587,20 @@ static void PIPE_CDECL generic_run_elts( struct translate *translate,
for (attr = 0; attr < nr_attrs; attr++) {
float data[4];
-
- const char *src = (tg->attrib[attr].input_ptr +
- tg->attrib[attr].input_stride * elt);
+ const char *src;
char *dst = (vert +
tg->attrib[attr].output_offset);
+ if (tg->attrib[attr].instance_divisor) {
+ src = tg->attrib[attr].input_ptr +
+ tg->attrib[attr].input_stride *
+ (instance_id / tg->attrib[attr].instance_divisor);
+ } else {
+ src = tg->attrib[attr].input_ptr +
+ tg->attrib[attr].input_stride * elt;
+ }
+
tg->attrib[attr].fetch( src, data );
if (0) debug_printf("vert %d/%d attr %d: %f %f %f %f\n",
@@ -607,6 +618,7 @@ static void PIPE_CDECL generic_run_elts( struct translate *translate,
static void PIPE_CDECL generic_run( struct translate *translate,
unsigned start,
unsigned count,
+ unsigned instance_id,
void *output_buffer )
{
struct translate_generic *tg = translate_generic(translate);
@@ -623,13 +635,25 @@ static void PIPE_CDECL generic_run( struct translate *translate,
for (attr = 0; attr < nr_attrs; attr++) {
float data[4];
- const char *src = (tg->attrib[attr].input_ptr +
- tg->attrib[attr].input_stride * elt);
-
char *dst = (vert +
tg->attrib[attr].output_offset);
- tg->attrib[attr].fetch( src, data );
+ if (tg->attrib[attr].type == TRANSLATE_ELEMENT_NORMAL) {
+ const char *src;
+
+ if (tg->attrib[attr].instance_divisor) {
+ src = tg->attrib[attr].input_ptr +
+ tg->attrib[attr].input_stride *
+ (instance_id / tg->attrib[attr].instance_divisor);
+ } else {
+ src = tg->attrib[attr].input_ptr +
+ tg->attrib[attr].input_stride * elt;
+ }
+
+ tg->attrib[attr].fetch( src, data );
+ } else {
+ data[0] = (float)instance_id;
+ }
if (0) debug_printf("vert %d attr %d: %f %f %f %f\n",
i, attr, data[0], data[1], data[2], data[3]);
@@ -683,10 +707,12 @@ struct translate *translate_generic_create( const struct translate_key *key )
tg->translate.run = generic_run;
for (i = 0; i < key->nr_elements; i++) {
+ tg->attrib[i].type = key->element[i].type;
tg->attrib[i].fetch = get_fetch_func(key->element[i].input_format);
tg->attrib[i].buffer = key->element[i].input_buffer;
tg->attrib[i].input_offset = key->element[i].input_offset;
+ tg->attrib[i].instance_divisor = key->element[i].instance_divisor;
tg->attrib[i].emit = get_emit_func(key->element[i].output_format);
tg->attrib[i].output_offset = key->element[i].output_offset;
diff --git a/src/gallium/auxiliary/translate/translate_sse.c b/src/gallium/auxiliary/translate/translate_sse.c
index b62db8d8f3..8e152a002a 100644
--- a/src/gallium/auxiliary/translate/translate_sse.c
+++ b/src/gallium/auxiliary/translate/translate_sse.c
@@ -49,19 +49,31 @@
typedef void (PIPE_CDECL *run_func)( struct translate *translate,
unsigned start,
unsigned count,
- void *output_buffer );
+ unsigned instance_id,
+ void *output_buffer,
+ float instance_id_float );
typedef void (PIPE_CDECL *run_elts_func)( struct translate *translate,
const unsigned *elts,
unsigned count,
- void *output_buffer );
+ unsigned instance_id,
+ void *output_buffer,
+ float instance_id_float );
struct translate_buffer {
const void *base_ptr;
unsigned stride;
- void *ptr; /* updated per vertex */
};
+struct translate_buffer_varient {
+ unsigned buffer_index;
+ unsigned instance_divisor;
+ void *ptr; /* updated either per vertex or per instance */
+};
+
+
+#define ELEMENT_BUFFER_INSTANCE_ID 1001
+
struct translate_sse {
struct translate translate;
@@ -81,6 +93,17 @@ struct translate_sse {
struct translate_buffer buffer[PIPE_MAX_ATTRIBS];
unsigned nr_buffers;
+ /* Multiple buffer varients can map to a single buffer. */
+ struct translate_buffer_varient buffer_varient[PIPE_MAX_ATTRIBS];
+ unsigned nr_buffer_varients;
+
+ /* Multiple elements can map to a single buffer varient. */
+ unsigned element_to_buffer_varient[PIPE_MAX_ATTRIBS];
+
+ boolean use_instancing;
+ unsigned instance_id;
+ float instance_id_float; /* XXX: needed while no integer support in TGSI */
+
run_func gen_run;
run_elts_func gen_run_elts;
@@ -359,32 +382,61 @@ static boolean init_inputs( struct translate_sse *p,
boolean linear )
{
unsigned i;
- if (linear) {
- for (i = 0; i < p->nr_buffers; i++) {
+ struct x86_reg instance_id = x86_make_disp(p->machine_EDX,
+ get_offset(p, &p->instance_id));
+
+ for (i = 0; i < p->nr_buffer_varients; i++) {
+ struct translate_buffer_varient *varient = &p->buffer_varient[i];
+ struct translate_buffer *buffer = &p->buffer[varient->buffer_index];
+
+ if (linear || varient->instance_divisor) {
struct x86_reg buf_stride = x86_make_disp(p->machine_EDX,
- get_offset(p, &p->buffer[i].stride));
+ get_offset(p, &buffer->stride));
struct x86_reg buf_ptr = x86_make_disp(p->machine_EDX,
- get_offset(p, &p->buffer[i].ptr));
+ get_offset(p, &varient->ptr));
struct x86_reg buf_base_ptr = x86_make_disp(p->machine_EDX,
- get_offset(p, &p->buffer[i].base_ptr));
+ get_offset(p, &buffer->base_ptr));
struct x86_reg elt = p->idx_EBX;
- struct x86_reg tmp = p->tmp_EAX;
-
+ struct x86_reg tmp_EAX = p->tmp_EAX;
/* Calculate pointer to first attrib:
+ * base_ptr + stride * index, where index depends on instance divisor
*/
- x86_mov(p->func, tmp, buf_stride);
- x86_imul(p->func, tmp, elt);
- x86_add(p->func, tmp, buf_base_ptr);
+ if (varient->instance_divisor) {
+ /* Our index is instance ID divided by instance divisor.
+ */
+ x86_mov(p->func, tmp_EAX, instance_id);
+
+ if (varient->instance_divisor != 1) {
+ struct x86_reg tmp_EDX = p->machine_EDX;
+ struct x86_reg tmp_ECX = p->outbuf_ECX;
+
+ /* TODO: Add x86_shr() to rtasm and use it whenever
+ * instance divisor is power of two.
+ */
+
+ x86_push(p->func, tmp_EDX);
+ x86_push(p->func, tmp_ECX);
+ x86_xor(p->func, tmp_EDX, tmp_EDX);
+ x86_mov_reg_imm(p->func, tmp_ECX, varient->instance_divisor);
+ x86_div(p->func, tmp_ECX); /* EAX = EDX:EAX / ECX */
+ x86_pop(p->func, tmp_ECX);
+ x86_pop(p->func, tmp_EDX);
+ }
+ } else {
+ x86_mov(p->func, tmp_EAX, elt);
+ }
+ x86_imul(p->func, tmp_EAX, buf_stride);
+ x86_add(p->func, tmp_EAX, buf_base_ptr);
/* In the linear case, keep the buffer pointer instead of the
* index number.
*/
- if (p->nr_buffers == 1)
- x86_mov( p->func, elt, tmp );
+ if (linear && p->nr_buffer_varients == 1)
+ x86_mov(p->func, elt, tmp_EAX);
else
- x86_mov( p->func, buf_ptr, tmp );
+ x86_mov(p->func, buf_ptr, tmp_EAX);
}
}
@@ -394,31 +446,36 @@ static boolean init_inputs( struct translate_sse *p,
static struct x86_reg get_buffer_ptr( struct translate_sse *p,
boolean linear,
- unsigned buf_idx,
+ unsigned var_idx,
struct x86_reg elt )
{
- if (linear && p->nr_buffers == 1) {
+ if (var_idx == ELEMENT_BUFFER_INSTANCE_ID) {
+ return x86_make_disp(p->machine_EDX,
+ get_offset(p, &p->instance_id_float));
+ }
+ if (linear && p->nr_buffer_varients == 1) {
return p->idx_EBX;
}
- else if (linear) {
+ else if (linear || p->buffer_varient[var_idx].instance_divisor) {
struct x86_reg ptr = p->tmp_EAX;
struct x86_reg buf_ptr =
x86_make_disp(p->machine_EDX,
- get_offset(p, &p->buffer[buf_idx].ptr));
+ get_offset(p, &p->buffer_varient[var_idx].ptr));
x86_mov(p->func, ptr, buf_ptr);
return ptr;
}
else {
struct x86_reg ptr = p->tmp_EAX;
+ const struct translate_buffer_varient *varient = &p->buffer_varient[var_idx];
struct x86_reg buf_stride =
x86_make_disp(p->machine_EDX,
- get_offset(p, &p->buffer[buf_idx].stride));
+ get_offset(p, &p->buffer[varient->buffer_index].stride));
struct x86_reg buf_base_ptr =
x86_make_disp(p->machine_EDX,
- get_offset(p, &p->buffer[buf_idx].base_ptr));
+ get_offset(p, &p->buffer[varient->buffer_index].base_ptr));
@@ -436,28 +493,33 @@ static struct x86_reg get_buffer_ptr( struct translate_sse *p,
static boolean incr_inputs( struct translate_sse *p,
boolean linear )
{
- if (linear && p->nr_buffers == 1) {
+ if (linear && p->nr_buffer_varients == 1) {
struct x86_reg stride = x86_make_disp(p->machine_EDX,
get_offset(p, &p->buffer[0].stride));
- x86_add(p->func, p->idx_EBX, stride);
- sse_prefetchnta(p->func, x86_make_disp(p->idx_EBX, 192));
+ if (p->buffer_varient[0].instance_divisor == 0) {
+ x86_add(p->func, p->idx_EBX, stride);
+ sse_prefetchnta(p->func, x86_make_disp(p->idx_EBX, 192));
+ }
}
else if (linear) {
unsigned i;
/* Is this worthwhile??
*/
- for (i = 0; i < p->nr_buffers; i++) {
+ for (i = 0; i < p->nr_buffer_varients; i++) {
+ struct translate_buffer_varient *varient = &p->buffer_varient[i];
struct x86_reg buf_ptr = x86_make_disp(p->machine_EDX,
- get_offset(p, &p->buffer[i].ptr));
+ get_offset(p, &varient->ptr));
struct x86_reg buf_stride = x86_make_disp(p->machine_EDX,
- get_offset(p, &p->buffer[i].stride));
+ get_offset(p, &p->buffer[varient->buffer_index].stride));
- x86_mov(p->func, p->tmp_EAX, buf_ptr);
- x86_add(p->func, p->tmp_EAX, buf_stride);
- if (i == 0) sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, 192));
- x86_mov(p->func, buf_ptr, p->tmp_EAX);
+ if (varient->instance_divisor == 0) {
+ x86_mov(p->func, p->tmp_EAX, buf_ptr);
+ x86_add(p->func, p->tmp_EAX, buf_stride);
+ if (i == 0) sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, 192));
+ x86_mov(p->func, buf_ptr, p->tmp_EAX);
+ }
}
}
else {
@@ -514,7 +576,26 @@ static boolean build_vertex_emit( struct translate_sse *p,
x86_mov(p->func, p->machine_EDX, x86_fn_arg(p->func, 1));
x86_mov(p->func, p->idx_EBX, x86_fn_arg(p->func, 2));
x86_mov(p->func, p->count_ESI, x86_fn_arg(p->func, 3));
- x86_mov(p->func, p->outbuf_ECX, x86_fn_arg(p->func, 4));
+ x86_mov(p->func, p->outbuf_ECX, x86_fn_arg(p->func, 5));
+
+ /* Load instance ID.
+ */
+ if (p->use_instancing) {
+ x86_mov(p->func,
+ p->tmp_EAX,
+ x86_fn_arg(p->func, 4));
+ x86_mov(p->func,
+ x86_make_disp(p->machine_EDX, get_offset(p, &p->instance_id)),
+ p->tmp_EAX);
+
+ /* XXX: temporary */
+ x86_mov(p->func,
+ p->tmp_EAX,
+ x86_fn_arg(p->func, 6));
+ x86_mov(p->func,
+ x86_make_disp(p->machine_EDX, get_offset(p, &p->instance_id_float)),
+ p->tmp_EAX);
+ }
/* Get vertex count, compare to zero
*/
@@ -531,17 +612,18 @@ static boolean build_vertex_emit( struct translate_sse *p,
label = x86_get_label(p->func);
{
struct x86_reg elt = linear ? p->idx_EBX : x86_deref(p->idx_EBX);
- int last_vb = -1;
+ int last_varient = -1;
struct x86_reg vb;
for (j = 0; j < p->translate.key.nr_elements; j++) {
const struct translate_element *a = &p->translate.key.element[j];
+ unsigned varient = p->element_to_buffer_varient[j];
/* Figure out source pointer address:
*/
- if (a->input_buffer != last_vb) {
- last_vb = a->input_buffer;
- vb = get_buffer_ptr(p, linear, a->input_buffer, elt);
+ if (varient != last_varient) {
+ last_varient = varient;
+ vb = get_buffer_ptr(p, linear, varient, elt);
}
if (!translate_attr( p, a,
@@ -624,6 +706,7 @@ static void translate_sse_release( struct translate *translate )
static void PIPE_CDECL translate_sse_run_elts( struct translate *translate,
const unsigned *elts,
unsigned count,
+ unsigned instance_id,
void *output_buffer )
{
struct translate_sse *p = (struct translate_sse *)translate;
@@ -631,12 +714,15 @@ static void PIPE_CDECL translate_sse_run_elts( struct translate *translate,
p->gen_run_elts( translate,
elts,
count,
- output_buffer );
+ instance_id,
+ output_buffer,
+ (float)instance_id );
}
static void PIPE_CDECL translate_sse_run( struct translate *translate,
unsigned start,
unsigned count,
+ unsigned instance_id,
void *output_buffer )
{
struct translate_sse *p = (struct translate_sse *)translate;
@@ -644,7 +730,9 @@ static void PIPE_CDECL translate_sse_run( struct translate *translate,
p->gen_run( translate,
start,
count,
- output_buffer );
+ instance_id,
+ output_buffer,
+ (float)instance_id);
}
@@ -666,8 +754,37 @@ struct translate *translate_sse2_create( const struct translate_key *key )
p->translate.run_elts = translate_sse_run_elts;
p->translate.run = translate_sse_run;
- for (i = 0; i < key->nr_elements; i++)
- p->nr_buffers = MAX2( p->nr_buffers, key->element[i].input_buffer + 1 );
+ for (i = 0; i < key->nr_elements; i++) {
+ if (key->element[i].type == TRANSLATE_ELEMENT_NORMAL) {
+ unsigned j;
+
+ p->nr_buffers = MAX2(p->nr_buffers, key->element[i].input_buffer + 1);
+
+ if (key->element[i].instance_divisor) {
+ p->use_instancing = TRUE;
+ }
+
+ /*
+ * Map vertex element to vertex buffer varient.
+ */
+ for (j = 0; j < p->nr_buffer_varients; j++) {
+ if (p->buffer_varient[j].buffer_index == key->element[i].input_buffer &&
+ p->buffer_varient[j].instance_divisor == key->element[i].instance_divisor) {
+ break;
+ }
+ }
+ if (j == p->nr_buffer_varients) {
+ p->buffer_varient[j].buffer_index = key->element[i].input_buffer;
+ p->buffer_varient[j].instance_divisor = key->element[i].instance_divisor;
+ p->nr_buffer_varients++;
+ }
+ p->element_to_buffer_varient[i] = j;
+ } else {
+ assert(key->element[i].type == TRANSLATE_ELEMENT_INSTANCE_ID);
+
+ p->element_to_buffer_varient[i] = ELEMENT_BUFFER_INSTANCE_ID;
+ }
+ }
if (0) debug_printf("nr_buffers: %d\n", p->nr_buffers);