summaryrefslogtreecommitdiff
path: root/src/gallium
diff options
context:
space:
mode:
Diffstat (limited to 'src/gallium')
-rw-r--r--src/gallium/auxiliary/draw/draw_context.c8
-rw-r--r--src/gallium/auxiliary/draw/draw_context.h3
-rw-r--r--src/gallium/auxiliary/draw/draw_pipe_vbuf.c4
-rw-r--r--src/gallium/auxiliary/draw/draw_private.h5
-rw-r--r--src/gallium/auxiliary/draw/draw_pt.c38
-rw-r--r--src/gallium/auxiliary/draw/draw_pt_emit.c4
-rw-r--r--src/gallium/auxiliary/draw/draw_pt_fetch_emit.c4
-rw-r--r--src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c43
-rw-r--r--src/gallium/auxiliary/draw/draw_vertex.c6
-rw-r--r--src/gallium/auxiliary/draw/draw_vertex.h44
-rw-r--r--src/gallium/auxiliary/draw/draw_vs.h4
-rw-r--r--src/gallium/auxiliary/draw/draw_vs_aos.c68
-rw-r--r--src/gallium/auxiliary/draw/draw_vs_aos.h21
-rw-r--r--src/gallium/auxiliary/draw/draw_vs_aos_io.c211
-rw-r--r--src/gallium/auxiliary/draw/draw_vs_llvm.c1
-rw-r--r--src/gallium/auxiliary/draw/draw_vs_sse.c2
-rw-r--r--src/gallium/auxiliary/draw/draw_vs_varient.c10
-rw-r--r--src/gallium/auxiliary/gallivm/gallivm_builtins.cpp252
-rw-r--r--src/gallium/auxiliary/gallivm/gallivm_cpu.cpp6
-rw-r--r--src/gallium/auxiliary/gallivm/instructions.cpp1225
-rw-r--r--src/gallium/auxiliary/gallivm/instructions.h33
-rw-r--r--src/gallium/auxiliary/gallivm/instructionssoa.cpp258
-rw-r--r--src/gallium/auxiliary/gallivm/instructionssoa.h5
-rw-r--r--src/gallium/auxiliary/gallivm/soabuiltins.c155
-rw-r--r--src/gallium/auxiliary/gallivm/storagesoa.cpp53
-rw-r--r--src/gallium/auxiliary/gallivm/storagesoa.h7
-rw-r--r--src/gallium/auxiliary/gallivm/tgsitollvm.cpp86
-rw-r--r--src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c382
-rw-r--r--src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h76
-rw-r--r--src/gallium/auxiliary/rtasm/rtasm_x86sse.c44
-rw-r--r--src/gallium/auxiliary/rtasm/rtasm_x86sse.h7
-rw-r--r--src/gallium/auxiliary/tgsi/tgsi_sse2.c6
-rw-r--r--src/gallium/auxiliary/util/Makefile1
-rw-r--r--src/gallium/auxiliary/util/SConscript5
-rw-r--r--src/gallium/auxiliary/util/u_keymap.c309
-rw-r--r--src/gallium/auxiliary/util/u_keymap.h68
-rw-r--r--src/gallium/auxiliary/util/u_math.c21
-rw-r--r--src/gallium/auxiliary/util/u_math.h112
-rw-r--r--src/gallium/auxiliary/util/u_sse.h7
-rw-r--r--src/gallium/drivers/cell/common.h12
-rw-r--r--src/gallium/drivers/cell/ppu/cell_context.c10
-rw-r--r--src/gallium/drivers/cell/ppu/cell_context.h17
-rw-r--r--src/gallium/drivers/cell/ppu/cell_gen_fp.c704
-rw-r--r--src/gallium/drivers/cell/ppu/cell_gen_fragment.c938
-rw-r--r--src/gallium/drivers/cell/ppu/cell_pipe_state.c2
-rw-r--r--src/gallium/drivers/cell/ppu/cell_render.c1
-rw-r--r--src/gallium/drivers/cell/ppu/cell_screen.c4
-rw-r--r--src/gallium/drivers/cell/ppu/cell_state.h5
-rw-r--r--src/gallium/drivers/cell/ppu/cell_state_emit.c140
-rw-r--r--src/gallium/drivers/cell/ppu/cell_state_shader.c5
-rw-r--r--src/gallium/drivers/cell/ppu/cell_texture.c300
-rw-r--r--src/gallium/drivers/cell/ppu/cell_texture.h14
-rw-r--r--src/gallium/drivers/cell/ppu/cell_vbuf.c1
-rw-r--r--src/gallium/drivers/cell/ppu/cell_vertex_fetch.c30
-rw-r--r--src/gallium/drivers/cell/spu/spu_colorpack.h49
-rw-r--r--src/gallium/drivers/cell/spu/spu_command.c170
-rw-r--r--src/gallium/drivers/cell/spu/spu_funcs.c93
-rw-r--r--src/gallium/drivers/cell/spu/spu_main.h48
-rw-r--r--src/gallium/drivers/cell/spu/spu_per_fragment_op.c19
-rw-r--r--src/gallium/drivers/cell/spu/spu_per_fragment_op.h3
-rw-r--r--src/gallium/drivers/cell/spu/spu_render.c4
-rw-r--r--src/gallium/drivers/cell/spu/spu_texture.c595
-rw-r--r--src/gallium/drivers/cell/spu/spu_texture.h34
-rw-r--r--src/gallium/drivers/cell/spu/spu_tri.c431
-rw-r--r--src/gallium/drivers/cell/spu/spu_tri.h2
-rw-r--r--src/gallium/drivers/i915simple/i915_prim_emit.c4
-rw-r--r--src/gallium/drivers/i915simple/i915_state_derived.c4
-rw-r--r--src/gallium/drivers/softpipe/sp_fs_sse.c2
-rw-r--r--src/gallium/drivers/softpipe/sp_setup.c12
-rw-r--r--src/gallium/include/pipe/p_config.h8
70 files changed, 5140 insertions, 2115 deletions
diff --git a/src/gallium/auxiliary/draw/draw_context.c b/src/gallium/auxiliary/draw/draw_context.c
index 78249054f2..b439bc4059 100644
--- a/src/gallium/auxiliary/draw/draw_context.c
+++ b/src/gallium/auxiliary/draw/draw_context.c
@@ -274,6 +274,14 @@ draw_enable_point_sprites(struct draw_context *draw, boolean enable)
}
+void
+draw_set_force_passthrough( struct draw_context *draw, boolean enable )
+{
+ draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE );
+ draw->force_passthrough = enable;
+}
+
+
/**
* Ask the draw module for the location/slot of the given vertex attribute in
* a post-transformed vertex.
diff --git a/src/gallium/auxiliary/draw/draw_context.h b/src/gallium/auxiliary/draw/draw_context.h
index 0ab3681b64..3eeb453531 100644
--- a/src/gallium/auxiliary/draw/draw_context.h
+++ b/src/gallium/auxiliary/draw/draw_context.h
@@ -160,6 +160,9 @@ void draw_set_render( struct draw_context *draw,
void draw_set_driver_clipping( struct draw_context *draw,
boolean bypass_clipping );
+void draw_set_force_passthrough( struct draw_context *draw,
+ boolean enable );
+
/*******************************************************************************
* Draw pipeline
*/
diff --git a/src/gallium/auxiliary/draw/draw_pipe_vbuf.c b/src/gallium/auxiliary/draw/draw_pipe_vbuf.c
index c0cf4269db..9825e116c3 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_vbuf.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_vbuf.c
@@ -231,9 +231,9 @@ vbuf_set_prim( struct vbuf_stage *vbuf, uint prim )
unsigned emit_sz = 0;
unsigned src_buffer = 0;
unsigned output_format;
- unsigned src_offset = (vbuf->vinfo->src_index[i] * 4 * sizeof(float) );
+ unsigned src_offset = (vbuf->vinfo->attrib[i].src_index * 4 * sizeof(float) );
- switch (vbuf->vinfo->emit[i]) {
+ switch (vbuf->vinfo->attrib[i].emit) {
case EMIT_4F:
output_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
emit_sz = 4 * sizeof(float);
diff --git a/src/gallium/auxiliary/draw/draw_private.h b/src/gallium/auxiliary/draw/draw_private.h
index 626a2e3e30..5d531146c5 100644
--- a/src/gallium/auxiliary/draw/draw_private.h
+++ b/src/gallium/auxiliary/draw/draw_private.h
@@ -163,12 +163,15 @@ struct draw_context
struct {
boolean bypass_clipping;
+ boolean bypass_vs;
} driver;
boolean flushing; /**< debugging/sanity */
boolean suspend_flushing; /**< internally set */
boolean bypass_clipping; /**< set if either api or driver bypass_clipping true */
+ boolean force_passthrough; /**< never clip or shade */
+
/* pipe state that we need: */
const struct pipe_rasterizer_state *rasterizer;
struct pipe_viewport_state viewport;
@@ -193,7 +196,7 @@ struct draw_context
const float (*aligned_constants)[4];
- float (*aligned_constant_storage)[4];
+ const float (*aligned_constant_storage)[4];
unsigned const_storage_size;
diff --git a/src/gallium/auxiliary/draw/draw_pt.c b/src/gallium/auxiliary/draw/draw_pt.c
index 669c11c993..87ec6ae20c 100644
--- a/src/gallium/auxiliary/draw/draw_pt.c
+++ b/src/gallium/auxiliary/draw/draw_pt.c
@@ -69,26 +69,26 @@ draw_pt_arrays(struct draw_context *draw,
return TRUE;
}
-
- if (!draw->render) {
- opt |= PT_PIPELINE;
- }
-
- if (draw_need_pipeline(draw,
- draw->rasterizer,
- prim)) {
- opt |= PT_PIPELINE;
- }
-
- if (!draw->bypass_clipping && !draw->pt.test_fse) {
- opt |= PT_CLIPTEST;
+ if (!draw->force_passthrough) {
+ if (!draw->render) {
+ opt |= PT_PIPELINE;
+ }
+
+ if (draw_need_pipeline(draw,
+ draw->rasterizer,
+ prim)) {
+ opt |= PT_PIPELINE;
+ }
+
+ if (!draw->bypass_clipping && !draw->pt.test_fse) {
+ opt |= PT_CLIPTEST;
+ }
+
+ if (!draw->rasterizer->bypass_vs) {
+ opt |= PT_SHADE;
+ }
}
-
- if (!draw->rasterizer->bypass_vs) {
- opt |= PT_SHADE;
- }
-
-
+
if (opt == 0)
middle = draw->pt.middle.fetch_emit;
else if (opt == PT_SHADE && !draw->pt.no_fse)
diff --git a/src/gallium/auxiliary/draw/draw_pt_emit.c b/src/gallium/auxiliary/draw/draw_pt_emit.c
index d4eca80588..d520b05869 100644
--- a/src/gallium/auxiliary/draw/draw_pt_emit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_emit.c
@@ -84,11 +84,11 @@ void draw_pt_emit_prepare( struct pt_emit *emit,
unsigned emit_sz = 0;
unsigned src_buffer = 0;
unsigned output_format;
- unsigned src_offset = (vinfo->src_index[i] * 4 * sizeof(float) );
+ unsigned src_offset = (vinfo->attrib[i].src_index * 4 * sizeof(float) );
- switch (vinfo->emit[i]) {
+ switch (vinfo->attrib[i].emit) {
case EMIT_4F:
output_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
emit_sz = 4 * sizeof(float);
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c b/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c
index 5a4db6cfe5..3966ad48ba 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c
@@ -121,7 +121,7 @@ static void fetch_emit_prepare( struct draw_pt_middle_end *middle,
memset(&key, 0, sizeof(key));
for (i = 0; i < vinfo->num_attribs; i++) {
- const struct pipe_vertex_element *src = &draw->pt.vertex_element[vinfo->src_index[i]];
+ const struct pipe_vertex_element *src = &draw->pt.vertex_element[vinfo->attrib[i].src_index];
unsigned emit_sz = 0;
unsigned input_format = src->src_format;
@@ -129,7 +129,7 @@ static void fetch_emit_prepare( struct draw_pt_middle_end *middle,
unsigned input_offset = src->src_offset;
unsigned output_format;
- switch (vinfo->emit[i]) {
+ switch (vinfo->attrib[i].emit) {
case EMIT_4F:
output_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
emit_sz = 4 * sizeof(float);
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c
index 73fc70c1bc..f7e6a1a8ee 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c
@@ -79,6 +79,7 @@ static void fse_prepare( struct draw_pt_middle_end *middle,
unsigned num_vs_inputs = draw->vs.vertex_shader->info.num_inputs;
const struct vertex_info *vinfo;
unsigned i;
+ unsigned nr_vbs = 0;
if (!draw->render->set_primitive( draw->render,
@@ -102,7 +103,7 @@ static void fse_prepare( struct draw_pt_middle_end *middle,
fse->key.viewport = !draw->identity_viewport;
fse->key.clip = !draw->bypass_clipping;
- fse->key.pad = 0;
+ fse->key.const_vbuffers = 0;
memset(fse->key.element, 0,
fse->key.nr_elements * sizeof(fse->key.element[0]));
@@ -116,16 +117,23 @@ static void fse_prepare( struct draw_pt_middle_end *middle,
*/
fse->key.element[i].in.buffer = src->vertex_buffer_index;
fse->key.element[i].in.offset = src->src_offset;
+ nr_vbs = MAX2(nr_vbs, src->vertex_buffer_index + 1);
}
+ for (i = 0; i < 5 && i < nr_vbs; i++) {
+ if (draw->pt.vertex_buffer[i].pitch == 0)
+ fse->key.const_vbuffers |= (1<<i);
+ }
+ if (0) debug_printf("%s: lookup const_vbuffers: %x\n", __FUNCTION__, fse->key.const_vbuffers);
+
{
unsigned dst_offset = 0;
for (i = 0; i < vinfo->num_attribs; i++) {
unsigned emit_sz = 0;
- switch (vinfo->emit[i]) {
+ switch (vinfo->attrib[i].emit) {
case EMIT_4F:
emit_sz = 4 * sizeof(float);
break;
@@ -153,8 +161,8 @@ static void fse_prepare( struct draw_pt_middle_end *middle,
* numbers, not to positions in the hw vertex description --
* that's handled by the output_offset field.
*/
- fse->key.element[i].out.format = vinfo->emit[i];
- fse->key.element[i].out.vs_output = vinfo->src_index[i];
+ fse->key.element[i].out.format = vinfo->attrib[i].emit;
+ fse->key.element[i].out.vs_output = vinfo->attrib[i].src_index;
fse->key.element[i].out.offset = dst_offset;
dst_offset += emit_sz;
@@ -162,13 +170,7 @@ static void fse_prepare( struct draw_pt_middle_end *middle,
}
}
-
- /* Would normally look up a vertex shader and peruse its list of
- * varients somehow. We omitted that step and put all the
- * hardcoded "shaders" into an array. We're just making the
- * assumption that this happens to be a matching shader... ie
- * you're running isosurf, aren't you?
- */
+
fse->active = draw_vs_lookup_varient( draw->vs.vertex_shader,
&fse->key );
@@ -177,18 +179,17 @@ static void fse_prepare( struct draw_pt_middle_end *middle,
return ;
}
+ if (0) debug_printf("%s: found const_vbuffers: %x\n", __FUNCTION__,
+ fse->active->key.const_vbuffers);
+
/* Now set buffer pointers:
*/
- for (i = 0; i < num_vs_inputs; i++) {
- unsigned buf = draw->pt.vertex_element[i].vertex_buffer_index;
-
- fse->active->set_input( fse->active,
- i,
-
- ((const ubyte *) draw->pt.user.vbuffer[buf] +
- draw->pt.vertex_buffer[buf].buffer_offset),
-
- draw->pt.vertex_buffer[buf].pitch );
+ for (i = 0; i < draw->pt.nr_vertex_buffers; i++) {
+ fse->active->set_buffer( fse->active,
+ i,
+ ((const ubyte *) draw->pt.user.vbuffer[i] +
+ draw->pt.vertex_buffer[i].buffer_offset),
+ draw->pt.vertex_buffer[i].pitch );
}
*max_vertices = (draw->render->max_vertex_buffer_bytes /
diff --git a/src/gallium/auxiliary/draw/draw_vertex.c b/src/gallium/auxiliary/draw/draw_vertex.c
index 1446f785c5..3214213e44 100644
--- a/src/gallium/auxiliary/draw/draw_vertex.c
+++ b/src/gallium/auxiliary/draw/draw_vertex.c
@@ -49,7 +49,7 @@ draw_compute_vertex_size(struct vertex_info *vinfo)
vinfo->size = 0;
for (i = 0; i < vinfo->num_attribs; i++) {
- switch (vinfo->emit[i]) {
+ switch (vinfo->attrib[i].emit) {
case EMIT_OMIT:
break;
case EMIT_4UB:
@@ -81,8 +81,8 @@ draw_dump_emitted_vertex(const struct vertex_info *vinfo, const uint8_t *data)
unsigned i, j;
for (i = 0; i < vinfo->num_attribs; i++) {
- j = vinfo->src_index[i];
- switch (vinfo->emit[i]) {
+ j = vinfo->attrib[i].src_index;
+ switch (vinfo->attrib[i].emit) {
case EMIT_OMIT:
debug_printf("EMIT_OMIT:");
break;
diff --git a/src/gallium/auxiliary/draw/draw_vertex.h b/src/gallium/auxiliary/draw/draw_vertex.h
index 16c65c4317..a943607d7e 100644
--- a/src/gallium/auxiliary/draw/draw_vertex.h
+++ b/src/gallium/auxiliary/draw/draw_vertex.h
@@ -75,12 +75,41 @@ struct vertex_info
{
uint num_attribs;
uint hwfmt[4]; /**< hardware format info for this format */
- enum interp_mode interp_mode[PIPE_MAX_SHADER_INPUTS];
- enum attrib_emit emit[PIPE_MAX_SHADER_INPUTS]; /**< EMIT_x */
- uint src_index[PIPE_MAX_SHADER_INPUTS]; /**< map to post-xform attribs */
uint size; /**< total vertex size in dwords */
+
+ /* Keep this small and at the end of the struct to allow quick
+ * memcmp() comparisons.
+ */
+ struct {
+ ubyte interp_mode:4; /**< INTERP_x */
+ ubyte emit:4; /**< EMIT_x */
+ ubyte src_index; /**< map to post-xform attribs */
+ } attrib[PIPE_MAX_SHADER_INPUTS];
};
+static INLINE int
+draw_vinfo_size( const struct vertex_info *a )
+{
+ return ((const char *)&a->attrib[a->num_attribs] -
+ (const char *)a);
+}
+
+static INLINE int
+draw_vinfo_compare( const struct vertex_info *a,
+ const struct vertex_info *b )
+{
+ unsigned sizea = draw_vinfo_size( a );
+ return memcmp( a, b, sizea );
+}
+
+static INLINE void
+draw_vinfo_copy( struct vertex_info *dst,
+ const struct vertex_info *src )
+{
+ unsigned size = draw_vinfo_size( src );
+ memcpy( dst, src, size );
+}
+
/**
@@ -91,14 +120,15 @@ struct vertex_info
*/
static INLINE uint
draw_emit_vertex_attr(struct vertex_info *vinfo,
- enum attrib_emit emit, enum interp_mode interp,
+ enum attrib_emit emit,
+ enum interp_mode interp, /* only used by softpipe??? */
uint src_index)
{
const uint n = vinfo->num_attribs;
assert(n < PIPE_MAX_SHADER_INPUTS);
- vinfo->emit[n] = emit;
- vinfo->interp_mode[n] = interp;
- vinfo->src_index[n] = src_index;
+ vinfo->attrib[n].emit = emit;
+ vinfo->attrib[n].interp_mode = interp;
+ vinfo->attrib[n].src_index = src_index;
vinfo->num_attribs++;
return n;
}
diff --git a/src/gallium/auxiliary/draw/draw_vs.h b/src/gallium/auxiliary/draw/draw_vs.h
index 45992d1986..68c24abad3 100644
--- a/src/gallium/auxiliary/draw/draw_vs.h
+++ b/src/gallium/auxiliary/draw/draw_vs.h
@@ -64,7 +64,7 @@ struct draw_vs_varient_key {
unsigned nr_outputs:8;
unsigned viewport:1;
unsigned clip:1;
- unsigned pad:5;
+ unsigned const_vbuffers:5;
struct draw_varient_element element[PIPE_MAX_ATTRIBS];
};
@@ -76,7 +76,7 @@ struct draw_vs_varient {
struct draw_vertex_shader *vs;
- void (*set_input)( struct draw_vs_varient *,
+ void (*set_buffer)( struct draw_vs_varient *,
unsigned i,
const void *ptr,
unsigned stride );
diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c
index a556477a76..87232865e2 100644
--- a/src/gallium/auxiliary/draw/draw_vs_aos.c
+++ b/src/gallium/auxiliary/draw/draw_vs_aos.c
@@ -92,9 +92,9 @@ struct x86_reg aos_get_x86( struct aos_compilation *cp,
assert(which_reg == 1);
offset = Offset(struct aos_machine, constants);
break;
- case X86_ATTRIBS:
+ case X86_BUFFERS:
assert(which_reg == 0);
- offset = Offset(struct aos_machine, attrib);
+ offset = Offset(struct aos_machine, buffer);
break;
default:
assert(0);
@@ -196,6 +196,18 @@ static void spill( struct aos_compilation *cp, unsigned idx )
}
+void aos_spill_all( struct aos_compilation *cp )
+{
+ unsigned i;
+
+ for (i = 0; i < 8; i++) {
+ if (cp->xmm[i].dirty)
+ spill(cp, i);
+ aos_release_xmm_reg(cp, i);
+ }
+}
+
+
static struct x86_reg get_xmm_writable( struct aos_compilation *cp,
struct x86_reg reg )
{
@@ -1939,6 +1951,11 @@ static boolean build_vertex_program( struct draw_vs_varient_aos_sse *varient,
save_fpu_state( &cp );
set_fpu_round_nearest( &cp );
+ aos_init_inputs( &cp, linear );
+
+ cp.x86_reg[0] = 0;
+ cp.x86_reg[1] = 0;
+
/* Note address for loop jump
*/
label = x86_get_label(cp.func);
@@ -2018,13 +2035,7 @@ static boolean build_vertex_program( struct draw_vs_varient_aos_sse *varient,
/* Incr index
*/
- if (linear) {
- x86_inc(cp.func, cp.idx_EBX);
- }
- else {
- x86_lea(cp.func, cp.idx_EBX, x86_make_disp(cp.idx_EBX, 4));
- }
-
+ aos_incr_inputs( &cp, linear );
}
/* decr count, loop if not zero
*/
@@ -2065,15 +2076,13 @@ static void vaos_set_buffer( struct draw_vs_varient *varient,
unsigned stride )
{
struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
- unsigned i;
- for (i = 0; i < vaos->base.key.nr_inputs; i++) {
- if (vaos->base.key.element[i].in.buffer == buf) {
- vaos->attrib[i].input_ptr = ((char *)ptr +
- vaos->base.key.element[i].in.offset);
- vaos->attrib[i].input_stride = stride;
- }
+ if (buf < vaos->nr_vb) {
+ vaos->buffer[buf].base_ptr = (char *)ptr;
+ vaos->buffer[buf].stride = stride;
}
+
+ if (0) debug_printf("%s %d/%d: %p %d\n", __FUNCTION__, buf, vaos->nr_vb, ptr, stride);
}
@@ -2086,10 +2095,12 @@ static void PIPE_CDECL vaos_run_elts( struct draw_vs_varient *varient,
struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
struct aos_machine *machine = vaos->draw->vs.aos_machine;
+ if (0) debug_printf("%s %d\n", __FUNCTION__, count);
+
machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size;
machine->constants = vaos->draw->vs.aligned_constants;
machine->immediates = vaos->base.vs->immediates;
- machine->attrib = vaos->attrib;
+ machine->buffer = vaos->buffer;
vaos->gen_run_elts( machine,
elts,
@@ -2105,10 +2116,13 @@ static void PIPE_CDECL vaos_run_linear( struct draw_vs_varient *varient,
struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
struct aos_machine *machine = vaos->draw->vs.aos_machine;
+ if (0) debug_printf("%s %d %d const: %x\n", __FUNCTION__, start, count,
+ vaos->base.key.const_vbuffers);
+
machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size;
machine->constants = vaos->draw->vs.aligned_constants;
machine->immediates = vaos->base.vs->immediates;
- machine->attrib = vaos->attrib;
+ machine->buffer = vaos->buffer;
vaos->gen_run_linear( machine,
start,
@@ -2127,7 +2141,7 @@ static void vaos_destroy( struct draw_vs_varient *varient )
{
struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
- FREE( vaos->attrib );
+ FREE( vaos->buffer );
x86_release_func( &vaos->func[0] );
x86_release_func( &vaos->func[1] );
@@ -2140,6 +2154,7 @@ static void vaos_destroy( struct draw_vs_varient *varient )
static struct draw_vs_varient *varient_aos_sse( struct draw_vertex_shader *vs,
const struct draw_vs_varient_key *key )
{
+ unsigned i;
struct draw_vs_varient_aos_sse *vaos = CALLOC_STRUCT(draw_vs_varient_aos_sse);
if (!vaos)
@@ -2147,17 +2162,22 @@ static struct draw_vs_varient *varient_aos_sse( struct draw_vertex_shader *vs,
vaos->base.key = *key;
vaos->base.vs = vs;
- vaos->base.set_input = vaos_set_buffer;
+ vaos->base.set_buffer = vaos_set_buffer;
vaos->base.destroy = vaos_destroy;
vaos->base.run_linear = vaos_run_linear;
vaos->base.run_elts = vaos_run_elts;
vaos->draw = vs->draw;
- vaos->attrib = MALLOC( key->nr_inputs * sizeof(vaos->attrib[0]) );
- if (!vaos->attrib)
+ for (i = 0; i < key->nr_inputs; i++)
+ vaos->nr_vb = MAX2( vaos->nr_vb, key->element[i].in.buffer + 1 );
+
+ vaos->buffer = MALLOC( vaos->nr_vb * sizeof(vaos->buffer[0]) );
+ if (!vaos->buffer)
goto fail;
+ debug_printf("nr_vb: %d const: %x\n", vaos->nr_vb, vaos->base.key.const_vbuffers);
+
#if 0
tgsi_dump(vs->state.tokens, 0);
#endif
@@ -2179,8 +2199,8 @@ static struct draw_vs_varient *varient_aos_sse( struct draw_vertex_shader *vs,
return &vaos->base;
fail:
- if (vaos && vaos->attrib)
- FREE(vaos->attrib);
+ if (vaos && vaos->buffer)
+ FREE(vaos->buffer);
if (vaos)
x86_release_func( &vaos->func[0] );
diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.h b/src/gallium/auxiliary/draw/draw_vs_aos.h
index 7fe6f79db0..264387517b 100644
--- a/src/gallium/auxiliary/draw/draw_vs_aos.h
+++ b/src/gallium/auxiliary/draw/draw_vs_aos.h
@@ -87,9 +87,10 @@ struct lit_info {
#define MAX_SHINE_TAB 4
#define MAX_LIT_INFO 16
-struct aos_attrib {
- const void *input_ptr;
- unsigned input_stride;
+struct aos_buffer {
+ const void *base_ptr;
+ unsigned stride;
+ void *ptr; /* updated per vertex */
};
@@ -123,7 +124,7 @@ struct aos_machine {
const float (*immediates)[4]; /* points to shader data */
const float (*constants)[4]; /* points to draw data */
- const struct aos_attrib *attrib; /* points to ? */
+ const struct aos_buffer *buffer; /* points to ? */
};
@@ -175,12 +176,15 @@ void aos_adopt_xmm_reg( struct aos_compilation *cp,
unsigned idx,
unsigned dirty );
+void aos_spill_all( struct aos_compilation *cp );
+
struct x86_reg aos_get_shader_reg( struct aos_compilation *cp,
unsigned file,
unsigned idx );
-boolean aos_fetch_inputs( struct aos_compilation *cp,
- boolean linear );
+boolean aos_init_inputs( struct aos_compilation *cp, boolean linear );
+boolean aos_fetch_inputs( struct aos_compilation *cp, boolean linear );
+boolean aos_incr_inputs( struct aos_compilation *cp, boolean linear );
boolean aos_emit_outputs( struct aos_compilation *cp );
@@ -210,7 +214,7 @@ do { \
#define X86_NULL 0
#define X86_IMMEDIATES 1
#define X86_CONSTANTS 2
-#define X86_ATTRIBS 3
+#define X86_BUFFERS 3
struct x86_reg aos_get_x86( struct aos_compilation *cp,
unsigned which_reg,
@@ -232,7 +236,8 @@ struct draw_vs_varient_aos_sse {
struct draw_vs_varient base;
struct draw_context *draw;
- struct aos_attrib *attrib;
+ struct aos_buffer *buffer;
+ unsigned nr_vb;
vaos_run_linear_func gen_run_linear;
vaos_run_elts_func gen_run_elts;
diff --git a/src/gallium/auxiliary/draw/draw_vs_aos_io.c b/src/gallium/auxiliary/draw/draw_vs_aos_io.c
index 26297c74f8..dd79bc799a 100644
--- a/src/gallium/auxiliary/draw/draw_vs_aos_io.c
+++ b/src/gallium/auxiliary/draw/draw_vs_aos_io.c
@@ -54,6 +54,7 @@ static void emit_load_R32G32B32( struct aos_compilation *cp,
struct x86_reg data,
struct x86_reg src_ptr )
{
+#if 1
sse_movss(cp->func, data, x86_make_disp(src_ptr, 8));
/* data = z ? ? ? */
sse_shufps(cp->func, data, aos_get_internal_xmm( cp, IMM_IDENTITY ), SHUF(X,Y,Z,W) );
@@ -62,6 +63,16 @@ static void emit_load_R32G32B32( struct aos_compilation *cp,
/* data = ? 0 z 1 */
sse_movlps(cp->func, data, src_ptr);
/* data = x y z 1 */
+#else
+ sse_movups(cp->func, data, src_ptr);
+ /* data = x y z ? */
+ sse2_pshufd(cp->func, data, data, SHUF(W,X,Y,Z) );
+ /* data = ? x y z */
+ sse_movss(cp->func, data, aos_get_internal_xmm( cp, IMM_ONES ) );
+ /* data = 1 x y z */
+ sse2_pshufd(cp->func, data, data, SHUF(Y,Z,W,X) );
+ /* data = x y z 1 */
+#endif
}
static void emit_load_R32G32( struct aos_compilation *cp,
@@ -95,28 +106,6 @@ static void emit_load_R8G8B8A8_UNORM( struct aos_compilation *cp,
-static void get_src_ptr( struct aos_compilation *cp,
- struct x86_reg src,
- struct x86_reg elt,
- unsigned a )
-{
- struct x86_reg attrib = x86_make_disp(aos_get_x86( cp, 0, X86_ATTRIBS ),
- a * sizeof(struct aos_attrib));
-
- struct x86_reg input_ptr = x86_make_disp(attrib,
- Offset(struct aos_attrib, input_ptr));
-
- struct x86_reg input_stride = x86_make_disp(attrib,
- Offset(struct aos_attrib, input_stride));
-
- /* Calculate pointer to current attrib:
- */
- x86_mov(cp->func, src, input_stride);
- x86_imul(cp->func, src, elt);
- x86_add(cp->func, src, input_ptr);
-}
-
-
/* Extended swizzles? Maybe later.
*/
static void emit_swizzle( struct aos_compilation *cp,
@@ -128,22 +117,60 @@ static void emit_swizzle( struct aos_compilation *cp,
}
+
+static boolean get_buffer_ptr( struct aos_compilation *cp,
+ boolean linear,
+ unsigned buf_idx,
+ struct x86_reg elt,
+ struct x86_reg ptr)
+{
+ struct x86_reg buf = x86_make_disp(aos_get_x86( cp, 0, X86_BUFFERS ),
+ buf_idx * sizeof(struct aos_buffer));
+
+ struct x86_reg buf_stride = x86_make_disp(buf,
+ Offset(struct aos_buffer, stride));
+ if (linear) {
+ struct x86_reg buf_ptr = x86_make_disp(buf,
+ Offset(struct aos_buffer, ptr));
+
+
+ /* Load this buffer's current vertex pointer, then advance the
+ * stored pointer by one stride for the next vertex:
+ */
+ x86_mov(cp->func, ptr, buf_ptr);
+ x86_mov(cp->func, elt, buf_stride);
+ x86_add(cp->func, elt, ptr);
+ if (buf_idx == 0) sse_prefetchnta(cp->func, x86_make_disp(elt, 192));
+ x86_mov(cp->func, buf_ptr, elt);
+ }
+ else {
+ struct x86_reg buf_base_ptr = x86_make_disp(buf,
+ Offset(struct aos_buffer, base_ptr));
+
+
+ /* Calculate pointer to current attrib:
+ */
+ x86_mov(cp->func, ptr, buf_stride);
+ x86_imul(cp->func, ptr, elt);
+ x86_add(cp->func, ptr, buf_base_ptr);
+ }
+
+ cp->insn_counter++;
+
+ return TRUE;
+}
+
+
static boolean load_input( struct aos_compilation *cp,
unsigned idx,
- boolean linear )
+ struct x86_reg bufptr )
{
unsigned format = cp->vaos->base.key.element[idx].in.format;
- struct x86_reg src = cp->tmp_EAX;
+ unsigned offset = cp->vaos->base.key.element[idx].in.offset;
struct x86_reg dataXMM = aos_get_xmm_reg(cp);
/* Figure out source pointer address:
*/
- get_src_ptr(cp,
- src,
- linear ? cp->idx_EBX : x86_deref(cp->idx_EBX),
- idx);
-
- src = x86_deref(src);
+ struct x86_reg src = x86_make_disp(bufptr, offset);
aos_adopt_xmm_reg( cp,
dataXMM,
@@ -179,20 +206,128 @@ static boolean load_input( struct aos_compilation *cp,
return TRUE;
}
-
-boolean aos_fetch_inputs( struct aos_compilation *cp, boolean linear )
+static boolean load_inputs( struct aos_compilation *cp,
+ unsigned buffer,
+ struct x86_reg ptr )
{
unsigned i;
-
+
for (i = 0; i < cp->vaos->base.key.nr_inputs; i++) {
- if (!load_input( cp, i, linear ))
- return FALSE;
- cp->insn_counter++;
+ if (cp->vaos->base.key.element[i].in.buffer == buffer) {
+
+ if (!load_input( cp, i, ptr ))
+ return FALSE;
+
+ cp->insn_counter++;
+ }
+ }
+
+ return TRUE;
+}
+
+boolean aos_init_inputs( struct aos_compilation *cp, boolean linear )
+{
+ unsigned i;
+ for (i = 0; i < cp->vaos->nr_vb; i++) {
+ struct x86_reg buf = x86_make_disp(aos_get_x86( cp, 0, X86_BUFFERS ),
+ i * sizeof(struct aos_buffer));
+
+ struct x86_reg buf_base_ptr = x86_make_disp(buf,
+ Offset(struct aos_buffer, base_ptr));
+
+ if (cp->vaos->base.key.const_vbuffers & (1<<i)) {
+ struct x86_reg ptr = cp->tmp_EAX;
+
+ x86_mov(cp->func, ptr, buf_base_ptr);
+
+ /* Load all inputs for this constant vertex buffer
+ */
+ load_inputs( cp, i, x86_deref(ptr) );
+
+ /* Then just force them out to aos_machine.input[]
+ */
+ aos_spill_all( cp );
+
+ }
+ else if (linear) {
+
+ struct x86_reg elt = cp->idx_EBX;
+ struct x86_reg ptr = cp->tmp_EAX;
+
+ struct x86_reg buf_stride = x86_make_disp(buf,
+ Offset(struct aos_buffer, stride));
+
+ struct x86_reg buf_ptr = x86_make_disp(buf,
+ Offset(struct aos_buffer, ptr));
+
+
+ /* Calculate pointer to current attrib:
+ */
+ x86_mov(cp->func, ptr, buf_stride);
+ x86_imul(cp->func, ptr, elt);
+ x86_add(cp->func, ptr, buf_base_ptr);
+
+
+ /* In the linear case, keep the buffer pointer instead of the
+ * index number.
+ */
+ if (cp->vaos->nr_vb == 1)
+ x86_mov( cp->func, elt, ptr );
+ else
+ x86_mov( cp->func, buf_ptr, ptr );
+
+ cp->insn_counter++;
+ }
+ }
+
+ return TRUE;
+}
+
+boolean aos_fetch_inputs( struct aos_compilation *cp, boolean linear )
+{
+ unsigned j;
+
+ for (j = 0; j < cp->vaos->nr_vb; j++) {
+ if (cp->vaos->base.key.const_vbuffers & (1<<j)) {
+ /* just retrieve pre-transformed input */
+ }
+ else if (linear && cp->vaos->nr_vb == 1) {
+ load_inputs( cp, 0, cp->idx_EBX );
+ }
+ else {
+ struct x86_reg elt = linear ? cp->idx_EBX : x86_deref(cp->idx_EBX);
+ struct x86_reg ptr = cp->tmp_EAX;
+
+ if (!get_buffer_ptr( cp, linear, j, elt, ptr ))
+ return FALSE;
+
+ if (!load_inputs( cp, j, ptr ))
+ return FALSE;
+ }
}
return TRUE;
}
+boolean aos_incr_inputs( struct aos_compilation *cp, boolean linear )
+{
+ if (linear && cp->vaos->nr_vb == 1) {
+ struct x86_reg stride = x86_make_disp(aos_get_x86( cp, 0, X86_BUFFERS ),
+ (0 * sizeof(struct aos_buffer) +
+ Offset(struct aos_buffer, stride)));
+
+ x86_add(cp->func, cp->idx_EBX, stride);
+ sse_prefetchnta(cp->func, x86_make_disp(cp->idx_EBX, 192));
+ }
+ else if (linear) {
+ /* Nothing to do */
+ }
+ else {
+ x86_lea(cp->func, cp->idx_EBX, x86_make_disp(cp->idx_EBX, 4));
+ }
+
+ return TRUE;
+}
@@ -203,7 +338,7 @@ static void emit_store_R32G32B32A32( struct aos_compilation *cp,
struct x86_reg dst_ptr,
struct x86_reg dataXMM )
{
- sse_movups(cp->func, dst_ptr, dataXMM);
+ sse_movaps(cp->func, dst_ptr, dataXMM);
}
static void emit_store_R32G32B32( struct aos_compilation *cp,
@@ -306,7 +441,7 @@ boolean aos_emit_outputs( struct aos_compilation *cp )
if (data.file != file_XMM) {
struct x86_reg tmp = aos_get_xmm_reg( cp );
- sse_movups(cp->func, tmp, data);
+ sse_movaps(cp->func, tmp, data);
data = tmp;
}
diff --git a/src/gallium/auxiliary/draw/draw_vs_llvm.c b/src/gallium/auxiliary/draw/draw_vs_llvm.c
index 2ce30b9a02..727977bc3a 100644
--- a/src/gallium/auxiliary/draw/draw_vs_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_vs_llvm.c
@@ -32,6 +32,7 @@
* Brian Paul
*/
+#include "util/u_memory.h"
#include "pipe/p_shader_tokens.h"
#include "draw_private.h"
#include "draw_context.h"
diff --git a/src/gallium/auxiliary/draw/draw_vs_sse.c b/src/gallium/auxiliary/draw/draw_vs_sse.c
index 0efabd9de8..b11ae31662 100644
--- a/src/gallium/auxiliary/draw/draw_vs_sse.c
+++ b/src/gallium/auxiliary/draw/draw_vs_sse.c
@@ -37,7 +37,7 @@
#include "draw_vs.h"
-#if defined(PIPE_ARCH_X86)
+#if defined(PIPE_ARCH_X86) && defined(PIPE_ARCH_SSE)
#include "pipe/p_shader_tokens.h"
diff --git a/src/gallium/auxiliary/draw/draw_vs_varient.c b/src/gallium/auxiliary/draw/draw_vs_varient.c
index 4daf05dae7..7ee567d478 100644
--- a/src/gallium/auxiliary/draw/draw_vs_varient.c
+++ b/src/gallium/auxiliary/draw/draw_vs_varient.c
@@ -64,10 +64,10 @@ struct draw_vs_varient_generic {
-static void vsvg_set_input( struct draw_vs_varient *varient,
- unsigned buffer,
- const void *ptr,
- unsigned stride )
+static void vsvg_set_buffer( struct draw_vs_varient *varient,
+ unsigned buffer,
+ const void *ptr,
+ unsigned stride )
{
struct draw_vs_varient_generic *vsvg = (struct draw_vs_varient_generic *)varient;
@@ -265,7 +265,7 @@ struct draw_vs_varient *draw_vs_varient_generic( struct draw_vertex_shader *vs,
vsvg->base.key = *key;
vsvg->base.vs = vs;
- vsvg->base.set_input = vsvg_set_input;
+ vsvg->base.set_buffer = vsvg_set_buffer;
vsvg->base.run_elts = vsvg_run_elts;
vsvg->base.run_linear = vsvg_run_linear;
vsvg->base.destroy = vsvg_destroy;
diff --git a/src/gallium/auxiliary/gallivm/gallivm_builtins.cpp b/src/gallium/auxiliary/gallivm/gallivm_builtins.cpp
index 0fc5c4ec5c..fcc5c05794 100644
--- a/src/gallium/auxiliary/gallivm/gallivm_builtins.cpp
+++ b/src/gallium/auxiliary/gallivm/gallivm_builtins.cpp
@@ -1,140 +1,140 @@
static const unsigned char llvm_builtins_data[] = {
-0x42,0x43,0xc0,0xde,0x21,0x0c,0x00,0x00,0x29,0x02,0x00,0x00,0x01,0x10,0x00,0x00,
+0x42,0x43,0xc0,0xde,0x21,0x0c,0x00,0x00,0x27,0x02,0x00,0x00,0x01,0x10,0x00,0x00,
0x10,0x00,0x00,0x00,0x07,0x81,0x23,0x91,0x41,0xc8,0x04,0x49,0x06,0x10,0x32,0x39,
0x92,0x01,0x84,0x0c,0x25,0x05,0x08,0x19,0x1e,0x04,0x8b,0x62,0x80,0x14,0x45,0x02,
0x42,0x92,0x0b,0x42,0xa4,0x10,0x32,0x14,0x38,0x08,0x18,0x49,0x0a,0x32,0x44,0x24,
0x48,0x0a,0x90,0x21,0x23,0x44,0x72,0x80,0x8c,0x14,0x21,0x86,0x0a,0x8a,0x0a,0x64,
-0x0c,0x1f,0x00,0x00,0x49,0x18,0x00,0x00,0x02,0x00,0x00,0x00,0x0b,0x04,0x00,0x0c,
-0x00,0x00,0x00,0x00,0x51,0x20,0x00,0x00,0x12,0x00,0x00,0x00,0x32,0x22,0x48,0x09,
-0x20,0x65,0x82,0x84,0x00,0x26,0x45,0x48,0x05,0x09,0x26,0x45,0xc6,0x05,0x42,0x52,
-0x26,0x08,0xae,0x19,0x80,0x61,0x04,0x02,0x98,0x23,0x00,0x83,0x29,0x80,0x21,0x00,
-0xb2,0x73,0x04,0x01,0x51,0x8a,0xf4,0x08,0x92,0xa4,0x39,0x47,0x80,0x50,0x2b,0x03,
-0x00,0xa0,0x08,0x21,0x5c,0x46,0x2b,0x44,0x08,0x21,0xd4,0x40,0x14,0x01,0x80,0x11,
-0x80,0x22,0x88,0x00,0x13,0xa2,0x74,0xb0,0x03,0x3c,0xb0,0x83,0x36,0x80,0x87,0x71,
-0x68,0x03,0x76,0x48,0x07,0x77,0xa8,0x07,0x7c,0x68,0x83,0x73,0x70,0x87,0x7a,0xd8,
-0x70,0x0f,0xe5,0xd0,0x06,0xf0,0xa0,0x07,0x73,0x20,0x07,0x7a,0x30,0x07,0x72,0xa0,
-0x07,0x73,0x20,0x07,0x6d,0x90,0x0e,0x71,0xa0,0x07,0x78,0xa0,0x07,0x78,0xd0,0x06,
-0xe9,0x80,0x07,0x7a,0x80,0x07,0x7a,0x80,0x07,0x6d,0x90,0x0e,0x71,0x60,0x07,0x7a,
-0x10,0x07,0x76,0xa0,0x07,0x71,0x60,0x07,0x6d,0x90,0x0e,0x73,0x20,0x07,0x7a,0x30,
-0x07,0x72,0xa0,0x07,0x73,0x20,0x07,0x6d,0x90,0x0e,0x76,0x40,0x07,0x7a,0x30,0x07,
-0x72,0xa0,0x07,0x76,0x40,0x07,0x6d,0x60,0x0e,0x73,0x20,0x07,0x7a,0x30,0x07,0x72,
-0xa0,0x07,0x73,0x20,0x07,0x6d,0x60,0x0e,0x76,0x40,0x07,0x7a,0x30,0x07,0x72,0xa0,
-0x07,0x76,0x40,0x07,0x6d,0x60,0x0f,0x76,0x40,0x07,0x7a,0x60,0x07,0x74,0xa0,0x07,
-0x76,0x40,0x07,0x6d,0x60,0x0f,0x71,0x20,0x07,0x78,0xa0,0x07,0x71,0x20,0x07,0x78,
-0xa0,0x07,0x71,0x20,0x07,0x78,0xd0,0x06,0xe1,0x00,0x07,0x7a,0x00,0x07,0x7a,0x60,
-0x07,0x74,0xd0,0x06,0xe6,0x80,0x07,0x70,0xa0,0x07,0x71,0x20,0x07,0x78,0xa0,0x07,
-0x71,0x20,0x07,0x78,0xa0,0xf3,0x40,0x88,0x04,0x32,0x32,0x02,0x04,0x20,0x76,0x46,
-0xfc,0x6c,0x48,0x92,0x00,0x40,0x00,0x00,0x00,0x00,0x0c,0x49,0x12,0x20,0x00,0x00,
-0x00,0x00,0x80,0x21,0x89,0x02,0x00,0x01,0x00,0x00,0x00,0x30,0x24,0x59,0x00,0x20,
-0x08,0x00,0x00,0x00,0x86,0x24,0x0a,0x00,0x04,0x00,0x00,0x00,0xc0,0x90,0x84,0x01,
-0x02,0x00,0x00,0x00,0x00,0x18,0x92,0x1c,0x40,0x00,0x00,0x00,0x00,0x00,0x43,0x12,
-0x05,0x00,0x02,0x00,0x00,0x00,0x60,0x48,0x72,0x00,0x01,0x00,0x00,0x00,0x00,0x0c,
-0x49,0x14,0x00,0x08,0x00,0x00,0x00,0x80,0x21,0x49,0x01,0x00,0x41,0x00,0x00,0x00,
-0x90,0x05,0x02,0x00,0x10,0x00,0x00,0x00,0x32,0x1e,0x98,0x10,0x19,0x11,0x4c,0x90,
+0x0c,0x1f,0x00,0x00,0x49,0x18,0x00,0x00,0x03,0x00,0x00,0x00,0x0b,0x84,0xff,0xff,
+0xff,0xff,0x1f,0xc0,0x00,0x00,0x00,0x00,0x51,0x20,0x00,0x00,0x12,0x00,0x00,0x00,
+0x32,0x22,0x48,0x09,0x20,0x65,0x82,0x84,0x00,0x26,0x45,0x48,0x05,0x09,0x26,0x45,
+0xc6,0x05,0x42,0x52,0x26,0x08,0xae,0x19,0x80,0x61,0x04,0x02,0x98,0x23,0x00,0x83,
+0x29,0x80,0x21,0x00,0xb2,0x73,0x04,0x01,0x51,0x8a,0xf4,0x08,0x92,0xa4,0x39,0x47,
+0x80,0x50,0x2b,0x03,0x00,0xa0,0x08,0x21,0x5c,0x46,0x2b,0x44,0x08,0x21,0xd4,0x40,
+0x14,0x01,0x80,0x11,0x80,0x22,0x88,0x00,0x13,0x30,0x7c,0xc0,0x03,0x3b,0xf8,0x05,
+0x3b,0xa0,0x83,0x36,0xa8,0x07,0x77,0x58,0x07,0x77,0x78,0x87,0x7b,0x70,0x87,0x36,
+0x60,0x87,0x74,0x70,0x87,0x7a,0xc0,0x87,0x36,0x38,0x07,0x77,0xa8,0x87,0x0d,0xf7,
+0x50,0x0e,0x6d,0x00,0x0f,0x7a,0x60,0x07,0x74,0xa0,0x07,0x76,0x40,0x07,0x7a,0x60,
+0x07,0x74,0xd0,0x06,0xe9,0x10,0x07,0x7a,0x80,0x07,0x7a,0x80,0x07,0x6d,0x90,0x0e,
+0x78,0xa0,0x07,0x78,0xa0,0x07,0x78,0xd0,0x06,0xe9,0x10,0x07,0x76,0xa0,0x07,0x71,
+0x60,0x07,0x7a,0x10,0x07,0x76,0xd0,0x06,0xe9,0x30,0x07,0x72,0xa0,0x07,0x73,0x20,
+0x07,0x7a,0x30,0x07,0x72,0xd0,0x06,0xe9,0x60,0x07,0x74,0xa0,0x07,0x76,0x40,0x07,
+0x7a,0x60,0x07,0x74,0xd0,0x06,0xe6,0x30,0x07,0x72,0xa0,0x07,0x73,0x20,0x07,0x7a,
+0x30,0x07,0x72,0xd0,0x06,0xe6,0x60,0x07,0x74,0xa0,0x07,0x76,0x40,0x07,0x7a,0x60,
+0x07,0x74,0xd0,0x06,0xf6,0x60,0x07,0x74,0xa0,0x07,0x76,0x40,0x07,0x7a,0x60,0x07,
+0x74,0xd0,0x06,0xf6,0x10,0x07,0x72,0x80,0x07,0x7a,0x10,0x07,0x72,0x80,0x07,0x7a,
+0x10,0x07,0x72,0x80,0x07,0x6d,0x10,0x0e,0x70,0xa0,0x07,0x70,0xa0,0x07,0x76,0x40,
+0x07,0x6d,0x60,0x0e,0x78,0x00,0x07,0x7a,0x10,0x07,0x72,0x80,0x07,0x7a,0x10,0x07,
+0x72,0x80,0x07,0x3a,0x0f,0x84,0x48,0x20,0x23,0x24,0x40,0x00,0x62,0x67,0x88,0x9f,
+0x19,0x92,0x24,0x00,0x10,0x04,0x00,0x00,0x00,0x43,0x92,0x04,0x08,0x00,0x00,0x00,
+0x00,0x60,0x48,0xa2,0x00,0x40,0x10,0x00,0x00,0x00,0x0c,0x49,0x16,0x00,0x08,0x02,
+0x00,0x00,0x80,0x21,0x89,0x02,0x00,0x41,0x00,0x00,0x00,0x30,0x24,0x61,0x80,0x00,
+0x00,0x00,0x00,0x00,0x86,0x24,0x07,0x10,0x00,0x00,0x00,0x00,0xc0,0x90,0x44,0x01,
+0x80,0x20,0x00,0x00,0x00,0x18,0x92,0x1c,0x40,0x00,0x00,0x00,0x00,0x00,0x43,0x12,
+0x05,0x00,0x82,0x00,0x00,0x00,0x60,0x48,0x52,0x00,0x40,0x10,0x00,0x00,0x00,0x64,
+0x81,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x32,0x1e,0x98,0x10,0x19,0x11,0x4c,0x90,
0x8c,0x09,0x26,0x47,0xc6,0x04,0x43,0x8a,0x8a,0x59,0x8b,0x43,0x50,0xd2,0x09,0x02,
0x81,0xd2,0x73,0x50,0xc9,0x0c,0x2a,0x99,0x41,0x25,0x33,0xa8,0x64,0x56,0x28,0x66,
0x2d,0x0e,0x41,0xcf,0x2a,0x15,0x04,0x4a,0xcf,0x41,0x25,0x33,0xa8,0x64,0x06,0x95,
0xcc,0xa0,0x92,0x59,0x01,0x00,0x00,0x00,0x53,0x82,0x26,0x0c,0x04,0x00,0x00,0x00,
0x22,0x00,0x00,0x00,0x13,0x04,0x41,0x2c,0x10,0x00,0x00,0x00,0x05,0x00,0x00,0x00,
0x04,0xc6,0x08,0x40,0x10,0x04,0xe1,0x70,0x18,0x23,0x00,0x41,0x10,0x84,0xc3,0x60,
-0x04,0x00,0x00,0x00,0x93,0x0c,0xce,0x43,0x4c,0x31,0x3c,0x8e,0x34,0xc9,0x30,0x41,
-0xc2,0x14,0x03,0x34,0x51,0x93,0x0c,0x4d,0x44,0x4c,0x31,0x44,0x8d,0x35,0x56,0x01,
-0x04,0xc3,0x55,0x21,0x16,0x0e,0x04,0x00,0x0f,0x00,0x00,0x00,0x46,0x41,0x08,0xcc,
-0x73,0x9b,0x05,0x21,0x30,0xcf,0x6e,0x18,0x84,0x00,0x2c,0x8b,0x35,0x04,0x80,0x39,
-0x04,0x81,0x5d,0x20,0x80,0x0f,0x0c,0x43,0xe4,0xd3,0x36,0x81,0x04,0x3e,0x30,0x0c,
-0x91,0x4f,0x5b,0x05,0x12,0xf8,0xc0,0x30,0x44,0x7e,0x7d,0x00,0x05,0xd1,0x4c,0x11,
-0x66,0x12,0x83,0xc0,0x3c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,
-0x2a,0x00,0x00,0x00,0x13,0x04,0x43,0x2c,0x10,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
+0x04,0x00,0x00,0x00,0xc3,0x0d,0xce,0x43,0x4c,0x37,0x3c,0x8e,0x34,0xdc,0x30,0x41,
+0xc2,0x74,0x03,0x34,0x51,0xc3,0x0d,0x4d,0x44,0x4c,0x37,0x44,0x8d,0x35,0x56,0x01,
+0x04,0xc3,0x55,0x21,0x16,0x0e,0x04,0x00,0x0f,0x00,0x00,0x00,0xd6,0x10,0x00,0xe6,
+0x10,0x04,0x76,0x81,0x00,0x3e,0x30,0x0c,0x91,0x4f,0x1b,0x05,0x21,0x30,0x8f,0x6d,
+0x13,0x48,0xe0,0x03,0xc3,0x10,0xf9,0xb4,0x55,0x20,0x81,0x0f,0x0c,0x43,0xe4,0xd7,
+0x66,0x41,0x08,0xcc,0xa3,0x1f,0x40,0x41,0x34,0x53,0x84,0x99,0xc4,0x20,0x30,0x8f,
+0x61,0x10,0x02,0xb0,0x2c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,
+0x27,0x00,0x00,0x00,0x13,0x04,0x43,0x2c,0x10,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
0x24,0x8a,0xa0,0x0c,0x46,0x00,0x4a,0x80,0xc2,0x1c,0x84,0x55,0x55,0xd6,0x1c,0x84,
0x45,0x51,0x16,0x81,0x19,0x80,0x11,0x80,0x31,0x02,0x10,0x04,0x41,0xfc,0x03,0x00,
-0x63,0x08,0x0d,0x34,0xc9,0x70,0x55,0xc2,0x2c,0x43,0x20,0x60,0x73,0x0c,0xd3,0x15,
+0x63,0x08,0x0d,0x34,0xdc,0x70,0x55,0xc2,0x2c,0x43,0x20,0x60,0x73,0x0c,0xd3,0x15,
0x8d,0x21,0x34,0xd1,0x18,0x42,0xf3,0x8c,0x55,0x00,0x81,0xa0,0x6d,0x73,0x0c,0x19,
-0xe7,0x60,0x87,0x52,0x38,0x10,0x00,0x00,0x13,0x00,0x00,0x00,0x17,0x60,0x20,0xc5,
-0x74,0x10,0x8d,0x65,0x14,0x13,0xf3,0xd4,0xb4,0x6d,0x14,0x13,0xf3,0xd4,0xb8,0x69,
-0x14,0x13,0xf3,0xd4,0xb6,0x75,0x14,0x13,0xf3,0xd4,0xba,0x35,0x0c,0x13,0xf3,0x9c,
-0x80,0xe4,0x36,0x48,0x81,0x10,0xc3,0x4a,0x4c,0x54,0xd4,0x6c,0x8b,0x23,0x28,0x76,
-0x41,0x4c,0xcc,0xa3,0x1b,0x07,0x21,0x00,0xcb,0x72,0x00,0x05,0xd1,0x4c,0x11,0x66,
-0x18,0x83,0xc0,0x3c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,
-0x81,0x00,0x00,0x00,0x13,0x04,0x4d,0x2c,0x10,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x24,0xca,0x60,0x04,0xa0,0x04,0x8a,0x80,0xc2,0x0c,0x00,0x91,0x11,0x00,0x00,0x00,
-0x63,0x08,0x4d,0x64,0x16,0xc1,0x49,0x86,0xab,0x22,0x66,0x19,0x02,0x01,0x1b,0x43,
-0x70,0xa2,0x59,0x82,0x61,0x0c,0xe1,0x89,0x66,0x09,0x86,0x81,0x0a,0x20,0x0b,0x34,
-0x61,0x8e,0x81,0xda,0xa2,0x31,0x84,0x46,0xb2,0x8e,0xe0,0x24,0x83,0x57,0x11,0xb3,
-0x0c,0x44,0xf1,0x8d,0x21,0x38,0xd2,0x2c,0x81,0x31,0x86,0xf0,0x48,0xb3,0x04,0xc6,
-0x40,0x05,0x00,0x06,0x44,0x18,0x14,0x73,0x0c,0x9c,0x18,0x48,0x63,0x08,0xcd,0x64,
-0x64,0x40,0x70,0x92,0xa1,0x0c,0x2a,0x62,0x96,0xe1,0x40,0xcc,0x60,0x0c,0xc1,0x99,
-0x66,0x09,0x92,0x31,0x84,0x67,0x9a,0x25,0x48,0x06,0x2a,0x80,0x33,0x38,0xd0,0x00,
-0x99,0x63,0x18,0x83,0x34,0x98,0xc6,0x10,0x1a,0xc8,0xd6,0x80,0xe0,0x24,0x03,0x1b,
-0x54,0xc4,0x2c,0x83,0xb2,0xb4,0xc1,0x18,0x82,0x03,0xcd,0x12,0x30,0x63,0x08,0x0f,
-0x34,0x4b,0xc0,0x0c,0x54,0x00,0x6e,0xa0,0xbc,0xc1,0x32,0xc7,0xa0,0x06,0x70,0x00,
-0x61,0x1c,0x84,0x03,0x01,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x76,0x52,0x4c,0xcc,
-0x73,0xd3,0x24,0x05,0x64,0xec,0xcd,0x8d,0xcc,0xe5,0x87,0x46,0xc6,0x50,0x8a,0x89,
-0x79,0xee,0xdb,0x54,0x8a,0x89,0x79,0xee,0xdd,0x1a,0x88,0x89,0x79,0x68,0x73,0x20,
-0x26,0xe6,0xa9,0xed,0x81,0x98,0x98,0xc7,0x36,0x0b,0x62,0x62,0x9e,0xdb,0x32,0x88,
-0x89,0x79,0x72,0xd3,0x20,0x26,0xe6,0xd9,0x8d,0x83,0x98,0x98,0xa7,0xb7,0x95,0x62,
-0x62,0x9e,0xbb,0x27,0x2d,0x20,0x63,0x6f,0x6e,0x64,0x2e,0x3a,0x34,0x35,0x56,0x62,
-0x08,0x4e,0x53,0xd9,0xba,0xb5,0x14,0x02,0xf3,0xe0,0xf5,0x25,0x2c,0x82,0xd3,0x0c,
-0xbe,0xe0,0x34,0xd3,0x8d,0x9b,0x88,0x21,0x38,0xcd,0x60,0xd7,0x24,0x01,0x63,0xec,
-0xcd,0x8d,0xcc,0x45,0x87,0x44,0x80,0x8c,0xbd,0xb9,0x91,0xb9,0xfc,0xc4,0xd0,0x90,
-0x02,0x8c,0xb1,0x37,0x37,0x32,0x97,0x1f,0x73,0x29,0x26,0xe6,0xc1,0x71,0x7b,0x29,
-0x26,0xe6,0xc1,0x77,0xfb,0x28,0x04,0xe6,0xa9,0x6f,0x52,0x01,0x32,0xf6,0xe6,0x46,
-0xe6,0xa2,0x13,0x73,0x63,0x18,0x83,0xc0,0x3c,0xb6,0x41,0x08,0x4e,0x33,0x58,0x47,
-0x31,0x31,0x4f,0x5d,0x1f,0xc3,0x22,0x38,0xcd,0xe0,0x0b,0x4e,0x33,0xe1,0xbc,0xa5,
-0x18,0x82,0xd3,0x0c,0x77,0x6e,0x20,0xc5,0xc4,0x3c,0xb5,0x4e,0x3a,0x40,0xc6,0xde,
-0xdc,0xc8,0x5c,0x7e,0x64,0x70,0x2c,0xa4,0x98,0x98,0xa7,0xee,0xed,0x82,0x10,0x9c,
-0xa6,0xba,0x81,0x44,0x70,0x9a,0xc1,0x17,0x9c,0x66,0x32,0x93,0x42,0x60,0x1e,0x7b,
-0xb7,0x98,0x62,0x62,0x9e,0xbc,0x36,0x16,0x43,0x70,0x9a,0x0a,0xa7,0x6d,0xa4,0x98,
-0x98,0xc7,0xbe,0x8d,0xa4,0x98,0x98,0xc7,0xce,0x0d,0xc6,0x10,0x9c,0x66,0xc0,0x7b,
-0x12,0x02,0x32,0xf6,0xe6,0x46,0xe6,0xa2,0x33,0x13,0x73,0x06,0x8b,0xe0,0x34,0x83,
-0x2f,0x38,0xcd,0x64,0xd3,0xe6,0x61,0x08,0x4e,0x53,0xd5,0xf6,0x01,0x14,0x44,0x33,
-0x45,0x18,0x00,0x00,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,0x4a,0x00,0x00,0x00,
-0x13,0x04,0x41,0x2c,0x10,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x24,0xca,0x60,0x04,
-0xa0,0x04,0x8a,0x80,0xc2,0x0c,0x00,0xb9,0x61,0x0c,0x04,0x10,0x1e,0xe1,0x19,0xc6,
-0x40,0x02,0xe1,0x11,0x1e,0x00,0x00,0x00,0x63,0x08,0xcd,0x63,0x15,0xc1,0x31,0x84,
-0x06,0xb2,0x8b,0xe0,0x18,0x42,0x13,0x59,0x46,0x70,0x0c,0xa1,0x71,0x6c,0x23,0x38,
-0x16,0x02,0x04,0xc7,0x64,0x61,0x1a,0x37,0x16,0x01,0x04,0x48,0x35,0xc7,0x20,0x79,
-0xcf,0x58,0x04,0x10,0x20,0xd5,0x1c,0xc3,0x07,0x06,0xd0,0x58,0x04,0x10,0x20,0xd5,
-0x1c,0x43,0x18,0x88,0x41,0x34,0x16,0x01,0x04,0x48,0x35,0xc7,0x30,0x06,0x64,0xe0,
-0x98,0x47,0xd0,0xc0,0x80,0xa0,0x89,0x01,0x41,0x23,0x03,0x82,0x63,0x21,0x40,0x70,
-0x50,0x66,0x70,0x06,0x68,0x90,0x06,0x58,0x06,0xe1,0x40,0x00,0x25,0x00,0x00,0x00,
-0x56,0x52,0x4c,0xcc,0x73,0xd3,0x56,0x41,0x4c,0xcc,0x53,0xdb,0x05,0x31,0x31,0xcf,
-0x6d,0x19,0xc4,0xc4,0x3c,0xba,0x6d,0x10,0x13,0xf3,0xf4,0xd6,0x41,0x08,0xc0,0xb2,
-0x18,0x46,0x21,0x38,0x4d,0x85,0x9b,0x46,0x21,0x38,0x4d,0xb5,0x9b,0x8a,0x21,0x00,
-0xcb,0x82,0xdf,0x66,0x62,0x08,0x4e,0x53,0xdd,0xb7,0x9d,0x18,0x82,0xd3,0x54,0xb7,
-0x6e,0x28,0x86,0xe0,0x34,0xd5,0xdd,0xdb,0x47,0x31,0x31,0x4f,0x9d,0x9b,0x87,0x21,
-0x00,0xcb,0x52,0xdf,0x06,0x62,0x08,0xc0,0xb2,0xd4,0xbc,0x59,0x10,0x82,0xd3,0x54,
-0x96,0x62,0x08,0x4e,0x53,0xe1,0xb6,0x85,0x14,0x13,0xf3,0xd8,0xb4,0x8d,0x14,0x13,
-0xf3,0xd8,0xb9,0x89,0x18,0x02,0xb0,0x2c,0xf6,0x6d,0x24,0x86,0x00,0x2c,0x8b,0xcd,
-0x1b,0x87,0x21,0x38,0x4d,0x55,0xd3,0xd6,0x30,0x54,0xc0,0x72,0x00,0x05,0xd1,0x4c,
-0x11,0x06,0x00,0x00,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,0x19,0x00,0x00,0x00,
-0x13,0x04,0x41,0x2c,0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x24,0x4a,0x60,0x04,
-0x80,0xc2,0x0c,0x00,0x00,0x00,0x00,0x00,0x63,0x08,0xcd,0x33,0x16,0x01,0x04,0x48,
-0x34,0xc7,0x00,0x49,0xcf,0x58,0x04,0x10,0x28,0xd1,0x1c,0xc3,0x44,0x39,0x58,0x85,
-0x03,0x01,0x00,0x00,0x0a,0x00,0x00,0x00,0x16,0x41,0x4c,0xcc,0x63,0xdb,0x04,0x31,
-0x31,0x4f,0x6e,0x0d,0x43,0x05,0x2c,0x07,0x50,0x10,0xcd,0x14,0x61,0x56,0x41,0x4c,
-0xcc,0xd3,0x1b,0x45,0x21,0x00,0xcb,0xb2,0x9b,0x04,0x21,0x00,0xcb,0x02,0x00,0x00,
-0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,0x1b,0x00,0x00,0x00,0x13,0x04,0x41,0x2c,
-0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x24,0xca,0x60,0x04,0xa0,0x04,0x8a,0x80,
-0xc2,0x0c,0x00,0x00,0x63,0x08,0xcd,0x33,0x16,0x01,0x04,0xca,0x34,0xc7,0x20,0x51,
-0xcf,0x1c,0x43,0x45,0x41,0x73,0x0c,0x16,0x15,0xcd,0x31,0x5c,0x94,0x83,0x58,0x38,
-0x10,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x86,0x51,0x4c,0xcc,0x53,0xe7,0x76,0x51,
-0x4c,0xcc,0x53,0xdb,0x36,0x41,0x4c,0xcc,0x63,0x5b,0x05,0x31,0x31,0x8f,0x6e,0x0d,
-0x43,0x05,0x2c,0x66,0x41,0x4c,0xcc,0xd3,0x1f,0x40,0x41,0x34,0x53,0x84,0x19,0x05,
-0x21,0x00,0xcb,0x02,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,0x2f,0x00,0x00,0x00,
-0x13,0x04,0x45,0x2c,0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x24,0xca,0xa0,0x04,
-0x46,0x00,0x8a,0x80,0xc0,0x08,0x00,0x00,0x63,0x08,0x0d,0x34,0xc9,0x30,0x49,0xc4,
-0x2c,0x03,0x11,0x50,0x63,0x08,0xcd,0x33,0xc9,0x50,0x49,0xc4,0x2c,0x03,0x21,0x58,
-0x63,0x08,0x4d,0x34,0xc9,0x70,0x49,0xc4,0x2c,0x03,0x31,0x60,0x63,0x08,0x8d,0x33,
-0xc9,0x90,0x49,0x84,0x69,0x22,0x70,0xc3,0x27,0x1c,0x08,0x00,0x1a,0x00,0x00,0x00,
-0x96,0x51,0x4c,0xcc,0x53,0xdf,0x66,0x41,0x08,0xcc,0x83,0xdb,0x04,0x31,0x31,0x4f,
-0x6d,0x15,0xc4,0xc4,0x3c,0xb7,0x61,0x10,0x02,0xf3,0xf0,0x47,0x20,0xb9,0x0d,0x52,
-0x20,0xc4,0xb0,0x12,0x13,0x15,0x35,0xdb,0xe2,0x08,0x8a,0x5d,0x10,0x13,0xf3,0xec,
-0x37,0x90,0x2c,0x4e,0xf4,0x47,0x87,0x54,0xd7,0x17,0x70,0x2c,0x4e,0xf4,0x47,0x87,
-0x74,0x02,0xc8,0xe2,0x44,0x7f,0x74,0x48,0xb9,0x69,0x14,0x02,0xf3,0xd4,0xb8,0x6d,
-0x18,0x11,0x31,0x55,0xc0,0x62,0x0d,0x43,0x05,0x2c,0x07,0x50,0x10,0xcd,0x14,0x61,
-0x46,0x31,0x08,0xcc,0x03,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x71,0x20,0x00,0x00,
-0x12,0x00,0x00,0x00,0x66,0x40,0x54,0x82,0x23,0x59,0xc2,0x20,0x09,0x92,0x1d,0x18,
-0x4f,0x84,0x34,0x53,0x61,0x03,0xc4,0xe3,0x58,0x85,0x05,0x14,0xbe,0x34,0x45,0xb5,
-0x21,0x10,0x82,0x23,0x15,0x46,0x30,0x2c,0xc8,0x64,0x02,0x06,0xf0,0x3c,0x91,0x73,
-0x19,0x00,0xe1,0x4b,0x53,0x64,0x0a,0x84,0x84,0x34,0x85,0x31,0x10,0x0a,0xb2,0x3c,
-0x56,0x30,0x08,0xcc,0x63,0x0b,0x44,0x25,0x21,0x0d,0x00,0x00,0x00,0x00,0x00,0x00};
+0xe7,0x60,0x87,0x52,0x38,0x10,0x00,0x00,0x10,0x00,0x00,0x00,0x27,0x50,0x20,0x05,
+0xd1,0x0c,0x17,0x60,0x20,0xc5,0x74,0x10,0x8d,0x65,0x14,0x13,0xf3,0xd4,0xb4,0x6d,
+0x14,0x13,0xf3,0xd4,0xb8,0x69,0x14,0x13,0xf3,0xd4,0xb6,0x75,0x14,0x13,0xf3,0xd4,
+0xba,0x35,0x0c,0x13,0xf3,0xd8,0x05,0x31,0x31,0x8f,0x6e,0x1c,0x84,0x00,0x2c,0xcb,
+0x01,0x14,0x44,0x33,0x45,0x98,0x61,0x0c,0x02,0xf3,0x00,0x00,0x00,0x00,0x00,0x00,
+0x61,0x20,0x00,0x00,0x81,0x00,0x00,0x00,0x13,0x04,0x4d,0x2c,0x10,0x00,0x00,0x00,
+0x04,0x00,0x00,0x00,0x24,0xca,0x60,0x04,0xa0,0x04,0x8a,0x80,0xc2,0x0c,0x00,0x91,
+0x11,0x00,0x00,0x00,0x63,0x08,0x4d,0x64,0x16,0xc1,0xe1,0x86,0xab,0x22,0x66,0x19,
+0x02,0x01,0x1b,0x43,0x70,0xa2,0x59,0x82,0x61,0x0c,0xe1,0x89,0x66,0x09,0x86,0x81,
+0x0a,0x20,0x0b,0x34,0x61,0x8e,0x81,0xda,0xa2,0x31,0x84,0x46,0xb2,0x8e,0xe0,0x70,
+0x83,0x57,0x11,0xb3,0x0c,0x44,0xf1,0x8d,0x21,0x38,0xd2,0x2c,0x81,0x31,0x86,0xf0,
+0x48,0xb3,0x04,0xc6,0x40,0x05,0x00,0x06,0x44,0x18,0x14,0x73,0x0c,0x9c,0x18,0x48,
+0x63,0x08,0xcd,0x64,0x64,0x40,0x70,0xb8,0xa1,0x0c,0x2a,0x62,0x96,0xe1,0x40,0xcc,
+0x60,0x0c,0xc1,0x99,0x66,0x09,0x92,0x31,0x84,0x67,0x9a,0x25,0x48,0x06,0x2a,0x80,
+0x33,0x38,0xd0,0x00,0x99,0x63,0x18,0x83,0x34,0x98,0xc6,0x10,0x1a,0xc8,0xd6,0x80,
+0xe0,0x70,0x03,0x1b,0x54,0xc4,0x2c,0x83,0xb2,0xb4,0xc1,0x18,0x82,0x03,0xcd,0x12,
+0x30,0x63,0x08,0x0f,0x34,0x4b,0xc0,0x0c,0x54,0x00,0x6e,0xa0,0xbc,0xc1,0x32,0xc7,
+0xa0,0x06,0x70,0x00,0x61,0x1c,0x84,0x03,0x01,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
+0x76,0x52,0x4c,0xcc,0x73,0xd3,0x24,0x05,0x64,0xec,0xcd,0x8d,0xcc,0xe5,0x87,0x46,
+0xc6,0x50,0x8a,0x89,0x79,0xee,0xdb,0x54,0x8a,0x89,0x79,0xee,0xdd,0x1a,0x88,0x89,
+0x79,0x68,0x73,0x20,0x26,0xe6,0xa9,0xed,0x81,0x98,0x98,0xc7,0x36,0x0b,0x62,0x62,
+0x9e,0xdb,0x32,0x88,0x89,0x79,0x72,0xd3,0x20,0x26,0xe6,0xd9,0x8d,0x83,0x98,0x98,
+0xa7,0xb7,0x95,0x62,0x62,0x9e,0xbb,0x27,0x2d,0x20,0x63,0x6f,0x6e,0x64,0x2e,0x3a,
+0x34,0x35,0x56,0x62,0x08,0x4e,0x53,0xd9,0xba,0xb5,0x14,0x02,0xf3,0xe0,0xf5,0x25,
+0x2c,0x82,0xd3,0x0c,0xbe,0xe0,0x34,0xd3,0x8d,0x9b,0x88,0x21,0x38,0xcd,0x60,0xd7,
+0x24,0x01,0x63,0xec,0xcd,0x8d,0xcc,0x45,0x87,0x44,0x80,0x8c,0xbd,0xb9,0x91,0xb9,
+0xfc,0xc4,0xd0,0x90,0x02,0x8c,0xb1,0x37,0x37,0x32,0x97,0x1f,0x73,0x29,0x26,0xe6,
+0xc1,0x71,0x7b,0x29,0x26,0xe6,0xc1,0x77,0xfb,0x28,0x04,0xe6,0xa9,0x6f,0x52,0x01,
+0x32,0xf6,0xe6,0x46,0xe6,0xa2,0x13,0x73,0x63,0x18,0x83,0xc0,0x3c,0xb6,0x41,0x08,
+0x4e,0x33,0x58,0x47,0x31,0x31,0x4f,0x5d,0x1f,0xc3,0x22,0x38,0xcd,0xe0,0x0b,0x4e,
+0x33,0xe1,0xbc,0xa5,0x18,0x82,0xd3,0x0c,0x77,0x6e,0x20,0xc5,0xc4,0x3c,0xb5,0x4e,
+0x3a,0x40,0xc6,0xde,0xdc,0xc8,0x5c,0x7e,0x64,0x70,0x2c,0xa4,0x98,0x98,0xa7,0xee,
+0x6f,0x20,0x11,0x9c,0x66,0xf0,0x05,0xa7,0x99,0xec,0x82,0x10,0x9c,0xa6,0x32,0x93,
+0x42,0x60,0x1e,0x7b,0xb7,0x98,0x62,0x62,0x9e,0xbc,0x36,0x16,0x43,0x70,0x9a,0x0a,
+0xa7,0x6d,0xa4,0x98,0x98,0xc7,0xbe,0x8d,0xa4,0x98,0x98,0xc7,0xce,0x0d,0xc6,0x10,
+0x9c,0x66,0xc0,0x7b,0x12,0x02,0x32,0xf6,0xe6,0x46,0xe6,0xa2,0x33,0x13,0x73,0x06,
+0x8b,0xe0,0x34,0x83,0x2f,0x38,0xcd,0x64,0xd3,0x07,0x50,0x10,0xcd,0x14,0x61,0xe6,
+0x61,0x08,0x4e,0x53,0xd5,0x36,0x00,0x00,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,
+0x4a,0x00,0x00,0x00,0x13,0x04,0x41,0x2c,0x10,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
+0x24,0xca,0x60,0x04,0xa0,0x04,0x8a,0x80,0xc2,0x0c,0x00,0xb9,0x61,0x0c,0x04,0x10,
+0x1e,0xe1,0x19,0xc6,0x40,0x02,0xe1,0x11,0x1e,0x00,0x00,0x00,0x63,0x08,0xcd,0x63,
+0x15,0xc1,0x31,0x84,0x06,0xb2,0x8b,0xe0,0x18,0x42,0x13,0x59,0x46,0x70,0x0c,0xa1,
+0x71,0x6c,0x23,0x38,0x16,0x02,0x04,0xc7,0x64,0x61,0x1a,0x37,0x16,0x01,0x04,0x48,
+0x35,0xc7,0x20,0x79,0xcf,0x58,0x04,0x10,0x20,0xd5,0x1c,0xc3,0x07,0x06,0xd0,0x58,
+0x04,0x10,0x20,0xd5,0x1c,0x43,0x18,0x88,0x41,0x34,0x16,0x01,0x04,0x48,0x35,0xc7,
+0x30,0x06,0x64,0xe0,0x98,0x47,0xd0,0xc0,0x80,0xa0,0x89,0x01,0x41,0x23,0x03,0x82,
+0x63,0x21,0x40,0x70,0x50,0x66,0x70,0x06,0x68,0x90,0x06,0x58,0x06,0xe1,0x40,0x00,
+0x25,0x00,0x00,0x00,0x56,0x52,0x4c,0xcc,0x73,0xd3,0x56,0x41,0x4c,0xcc,0x53,0xdb,
+0x05,0x31,0x31,0xcf,0x6d,0x19,0xc4,0xc4,0x3c,0xba,0x6d,0x10,0x13,0xf3,0xf4,0xd6,
+0x41,0x08,0xc0,0xb2,0x18,0x46,0x21,0x38,0x4d,0x85,0x9b,0x46,0x21,0x38,0x4d,0xb5,
+0x9b,0x8a,0x21,0x00,0xcb,0x82,0xdf,0x66,0x62,0x08,0x4e,0x53,0xdd,0xb7,0x9d,0x18,
+0x82,0xd3,0x54,0xb7,0x6e,0x28,0x86,0xe0,0x34,0xd5,0xdd,0xdb,0x47,0x31,0x31,0x4f,
+0x9d,0x9b,0x87,0x21,0x00,0xcb,0x52,0xdf,0x06,0x62,0x08,0xc0,0xb2,0xd4,0xbc,0x59,
+0x10,0x82,0xd3,0x54,0x96,0x62,0x08,0x4e,0x53,0xe1,0xb6,0x85,0x14,0x13,0xf3,0xd8,
+0xb4,0x8d,0x14,0x13,0xf3,0xd8,0xb9,0x89,0x18,0x02,0xb0,0x2c,0xf6,0x6d,0x24,0x86,
+0x00,0x2c,0x8b,0xcd,0x1b,0x87,0x21,0x38,0x4d,0x55,0xd3,0xd6,0x30,0x54,0xc0,0x72,
+0x00,0x05,0xd1,0x4c,0x11,0x06,0x00,0x00,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,
+0x19,0x00,0x00,0x00,0x13,0x04,0x41,0x2c,0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
+0x24,0x4a,0x60,0x04,0x80,0xc2,0x0c,0x00,0x00,0x00,0x00,0x00,0x63,0x08,0xcd,0x33,
+0x16,0x01,0x04,0x48,0x34,0xc7,0x00,0x49,0xcf,0x58,0x04,0x10,0x28,0xd1,0x1c,0xc3,
+0x44,0x39,0x58,0x85,0x03,0x01,0x00,0x00,0x0a,0x00,0x00,0x00,0x26,0x41,0x08,0xc0,
+0xb2,0x18,0x45,0x21,0x00,0xcb,0xb2,0x5b,0x04,0x31,0x31,0x8f,0x6d,0x13,0xc4,0xc4,
+0x3c,0xb9,0x35,0x0c,0x15,0xb0,0x58,0x05,0x31,0x31,0x4f,0x7f,0x00,0x05,0xd1,0x4c,
+0x11,0x06,0x00,0x00,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,0x1b,0x00,0x00,0x00,
+0x13,0x04,0x41,0x2c,0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x24,0xca,0x60,0x04,
+0xa0,0x04,0x8a,0x80,0xc2,0x0c,0x00,0x00,0x63,0x08,0xcd,0x33,0x16,0x01,0x04,0xca,
+0x34,0xc7,0x20,0x51,0xcf,0x1c,0x43,0x45,0x41,0x73,0x0c,0x16,0x15,0xcd,0x31,0x5c,
+0x94,0x83,0x58,0x38,0x10,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x76,0x51,0x4c,0xcc,
+0x53,0xdb,0x86,0x51,0x4c,0xcc,0x53,0xe7,0x36,0x41,0x4c,0xcc,0x63,0x5b,0x05,0x31,
+0x31,0x8f,0x6e,0x16,0xc4,0xc4,0x3c,0xbd,0x51,0x10,0x02,0xb0,0x2c,0xd6,0x30,0x54,
+0xc0,0x72,0x00,0x05,0xd1,0x4c,0x11,0x06,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,
+0x2c,0x00,0x00,0x00,0x13,0x04,0x45,0x2c,0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
+0x24,0xca,0xa0,0x04,0x46,0x00,0x8a,0x80,0xc0,0x08,0x00,0x00,0x63,0x08,0x0d,0x34,
+0xdc,0x30,0x49,0xc4,0x2c,0x03,0x11,0x50,0x63,0x08,0xcd,0x33,0xdc,0x50,0x49,0xc4,
+0x2c,0x03,0x21,0x58,0x63,0x08,0x4d,0x34,0xdc,0x70,0x49,0xc4,0x2c,0x03,0x31,0x60,
+0x63,0x08,0x8d,0x33,0xdc,0x90,0x49,0x84,0x69,0x22,0x70,0xc3,0x27,0x1c,0x08,0x00,
+0x17,0x00,0x00,0x00,0x96,0x51,0x4c,0xcc,0x53,0xdf,0x66,0x41,0x08,0xcc,0x83,0xdb,
+0x04,0x31,0x31,0x4f,0x6d,0x15,0xc4,0xc4,0x3c,0xb7,0x61,0x10,0x02,0xf3,0xf0,0x76,
+0x41,0x4c,0xcc,0xb3,0x1f,0x81,0x11,0x11,0x13,0x15,0x35,0x37,0x90,0x2c,0x4e,0xf4,
+0x47,0x87,0x54,0xd7,0x17,0x70,0x2c,0x4e,0xf4,0x47,0x87,0x74,0x02,0xc8,0xe2,0x44,
+0x7f,0x74,0x48,0xb9,0x69,0x14,0x02,0xf3,0xd4,0xb8,0x6d,0x18,0x11,0x31,0x55,0xc0,
+0x62,0x0d,0x43,0x05,0x2c,0x07,0x50,0x10,0xcd,0x14,0x61,0x46,0x31,0x08,0xcc,0x03,
+0x00,0x00,0x00,0x00,0x71,0x20,0x00,0x00,0x12,0x00,0x00,0x00,0x66,0x40,0x54,0x82,
+0x23,0x19,0xc3,0xa0,0x20,0x8b,0x1d,0x18,0x4f,0x84,0x34,0x53,0x61,0x03,0xc4,0xe3,
+0x58,0x85,0x05,0x14,0xbe,0x34,0x45,0xb5,0x21,0x10,0x82,0x23,0x15,0x46,0x30,0x2c,
+0xc8,0x64,0x02,0x06,0xf0,0x3c,0x91,0x73,0x19,0x00,0xe1,0x4b,0x53,0x64,0x0a,0x84,
+0x84,0x34,0x85,0x25,0x0c,0x92,0x20,0x59,0xc1,0x20,0x30,0x8f,0x2d,0x10,0x95,0x84,
+0x34,0x00,0x00,0x00,0x00,0x00,0x00,0x00};
diff --git a/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp b/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp
index e64bfb1c6c..3a2f2878a3 100644
--- a/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp
+++ b/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp
@@ -46,6 +46,7 @@
#include "tgsi/tgsi_dump.h"
#include "util/u_memory.h"
+#include "util/u_math.h"
#include <llvm/Module.h>
#include <llvm/CallingConv.h>
@@ -157,8 +158,8 @@ void gallivm_cpu_jit_compile(struct gallivm_cpu_engine *cpu, struct gallivm_prog
llvm::ExistingModuleProvider *mp = new llvm::ExistingModuleProvider(mod);
llvm::ExecutionEngine *ee = cpu->engine;
assert(ee);
- /*FIXME : remove */
- ee->DisableLazyCompilation();
+ /*FIXME : why was this disabled ? we need it for pow/sqrt/... */
+ ee->DisableLazyCompilation(false);
ee->addModuleProvider(mp);
llvm::Function *func = func_for_shader(prog);
@@ -201,7 +202,6 @@ int gallivm_cpu_vs_exec(struct gallivm_prog *prog,
unsigned int i, j;
unsigned slot;
vertex_shader_runner runner = reinterpret_cast<vertex_shader_runner>(prog->function);
-
assert(runner);
for (i = 0; i < count; i += MAX_TGSI_VERTICES) {
diff --git a/src/gallium/auxiliary/gallivm/instructions.cpp b/src/gallium/auxiliary/gallivm/instructions.cpp
index a82dc30306..599975d5ad 100644
--- a/src/gallium/auxiliary/gallivm/instructions.cpp
+++ b/src/gallium/auxiliary/gallivm/instructions.cpp
@@ -83,6 +83,7 @@ Instructions::Instructions(llvm::Module *mod, llvm::Function *func, llvm::BasicB
m_llvmPow = 0;
m_llvmFloor = 0;
m_llvmFlog = 0;
+ m_llvmFexp = 0;
m_llvmLit = 0;
m_fmtPtr = 0;
@@ -92,194 +93,271 @@ Instructions::Instructions(llvm::Module *mod, llvm::Function *func, llvm::BasicB
m_mod = ParseBitcodeFile(buffer);
}
+llvm::BasicBlock * Instructions::currentBlock() const
+{
+ return m_builder.GetInsertBlock();
+}
+
+llvm::Value * Instructions::abs(llvm::Value *in)
+{
+ std::vector<llvm::Value*> vec = extractVector(in);
+ Value *xabs = callFAbs(vec[0]);
+ Value *yabs = callFAbs(vec[1]);
+ Value *zabs = callFAbs(vec[2]);
+ Value *wabs = callFAbs(vec[3]);
+ return vectorFromVals(xabs, yabs, zabs, wabs);
+}
+
llvm::Value * Instructions::add(llvm::Value *in1, llvm::Value *in2)
{
return m_builder.CreateAdd(in1, in2, name("add"));
}
-llvm::Value * Instructions::madd(llvm::Value *in1, llvm::Value *in2,
- llvm::Value *in3)
+llvm::Value * Instructions::arl(llvm::Value *in)
{
- Value *mulRes = mul(in1, in2);
- return add(mulRes, in3);
+ return floor(in);
}
-
-llvm::Value * Instructions::mul(llvm::Value *in1, llvm::Value *in2)
+
+void Instructions::beginLoop()
{
- return m_builder.CreateMul(in1, in2, name("mul"));
+ BasicBlock *begin = BasicBlock::Create(name("loop"), m_func,0);
+ BasicBlock *end = BasicBlock::Create(name("endloop"), m_func,0);
+
+ m_builder.CreateBr(begin);
+ Loop loop;
+ loop.begin = begin;
+ loop.end = end;
+ m_builder.SetInsertPoint(begin);
+ m_loopStack.push(loop);
}
-const char * Instructions::name(const char *prefix)
+void Instructions::bgnSub(unsigned label)
{
- ++m_idx;
- snprintf(m_name, 32, "%s%d", prefix, m_idx);
- return m_name;
+ llvm::Function *func = findFunction(label);
+
+ Function::arg_iterator args = func->arg_begin();
+ Value *ptr_INPUT = args++;
+ ptr_INPUT->setName("INPUT");
+ m_storage->pushArguments(ptr_INPUT);
+
+ llvm::BasicBlock *entry = BasicBlock::Create("entry", func, 0);
+
+ m_func = func;
+ m_builder.SetInsertPoint(entry);
}
-llvm::Value * Instructions::dp3(llvm::Value *in1, llvm::Value *in2)
+void Instructions::brk()
{
- Value *mulRes = mul(in1, in2);
- Value *x = m_builder.CreateExtractElement(mulRes,
- m_storage->constantInt(0),
- name("extractx"));
- Value *y = m_builder.CreateExtractElement(mulRes,
- m_storage->constantInt(1),
- name("extracty"));
- Value *z = m_builder.CreateExtractElement(mulRes,
- m_storage->constantInt(2),
- name("extractz"));
- Value *xy = m_builder.CreateAdd(x, y,name("xy"));
- Value *dot3 = m_builder.CreateAdd(xy, z, name("dot3"));
- return vectorFromVals(dot3, dot3, dot3, dot3);
+ assert(!m_loopStack.empty());
+ BasicBlock *unr = BasicBlock::Create(name("unreachable"), m_func,0);
+ m_builder.CreateBr(m_loopStack.top().end);
+ m_builder.SetInsertPoint(unr);
}
-llvm::Value *Instructions::callFSqrt(llvm::Value *val)
+void Instructions::cal(int label, llvm::Value *input)
{
- if (!m_llvmFSqrt) {
- // predeclare the intrinsic
- std::vector<const Type*> fsqrtArgs;
- fsqrtArgs.push_back(Type::FloatTy);
- PAListPtr fsqrtPal;
- FunctionType* fsqrtType = FunctionType::get(
- /*Result=*/Type::FloatTy,
- /*Params=*/fsqrtArgs,
- /*isVarArg=*/false);
- m_llvmFSqrt = Function::Create(
- /*Type=*/fsqrtType,
- /*Linkage=*/GlobalValue::ExternalLinkage,
- /*Name=*/"llvm.sqrt.f32", m_mod);
- m_llvmFSqrt->setCallingConv(CallingConv::C);
- m_llvmFSqrt->setParamAttrs(fsqrtPal);
- }
- CallInst *call = m_builder.CreateCall(m_llvmFSqrt, val,
- name("sqrt"));
- call->setCallingConv(CallingConv::C);
- call->setTailCall(false);
- return call;
+ std::vector<Value*> params;
+ params.push_back(input);
+ llvm::Function *func = findFunction(label);
+
+ m_builder.CreateCall(func, params.begin(), params.end());
}
-llvm::Value * Instructions::rsq(llvm::Value *in1)
+llvm::Value * Instructions::ceil(llvm::Value *in)
{
- Value *x = m_builder.CreateExtractElement(in1,
- m_storage->constantInt(0),
- name("extractx"));
- Value *abs = callFAbs(x);
- Value *sqrt = callFSqrt(abs);
-
- Value *rsqrt = m_builder.CreateFDiv(ConstantFP::get(APFloat(1.f)),
- sqrt,
- name("rsqrt"));
- return vectorFromVals(rsqrt, rsqrt, rsqrt, rsqrt);
+ std::vector<llvm::Value*> vec = extractVector(in);
+ return vectorFromVals(callCeil(vec[0]), callCeil(vec[1]),
+ callCeil(vec[2]), callCeil(vec[3]));
}
-llvm::Value * Instructions::vectorFromVals(llvm::Value *x, llvm::Value *y,
- llvm::Value *z, llvm::Value *w)
+llvm::Value * Instructions::clamp(llvm::Value *in1)
{
- Constant *const_vec = Constant::getNullValue(m_floatVecType);
- Value *res = m_builder.CreateInsertElement(const_vec, x,
- m_storage->constantInt(0),
- name("vecx"));
- res = m_builder.CreateInsertElement(res, y, m_storage->constantInt(1),
- name("vecxy"));
- res = m_builder.CreateInsertElement(res, z, m_storage->constantInt(2),
- name("vecxyz"));
- if (w)
- res = m_builder.CreateInsertElement(res, w, m_storage->constantInt(3),
- name("vecxyzw"));
- return res;
+ llvm::Value *zero = constVector(0.0f, 0.0f, 0.0f, 0.0f);
+ llvm::Value *one = constVector(1.0f, 1.0f, 1.0f, 1.0f);
+ return min( max(zero, in1), one);
}
-llvm::Value *Instructions::callFAbs(llvm::Value *val)
+llvm::Value * Instructions::cmp(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3)
{
- if (!m_llvmFAbs) {
- // predeclare the intrinsic
- std::vector<const Type*> fabsArgs;
- fabsArgs.push_back(Type::FloatTy);
- PAListPtr fabsPal;
- FunctionType* fabsType = FunctionType::get(
- /*Result=*/Type::FloatTy,
- /*Params=*/fabsArgs,
- /*isVarArg=*/false);
- m_llvmFAbs = Function::Create(
- /*Type=*/fabsType,
- /*Linkage=*/GlobalValue::ExternalLinkage,
- /*Name=*/"fabs", m_mod);
- m_llvmFAbs->setCallingConv(CallingConv::C);
- m_llvmFAbs->setParamAttrs(fabsPal);
- }
- CallInst *call = m_builder.CreateCall(m_llvmFAbs, val,
- name("fabs"));
- call->setCallingConv(CallingConv::C);
+ llvm::Function *func = m_mod->getFunction("cmp");
+ assert(func);
+
+ std::vector<Value*> params;
+ params.push_back(in1);
+ params.push_back(in2);
+ params.push_back(in3);
+ CallInst *call = m_builder.CreateCall(func, params.begin(), params.end(), name("cmpres"));
call->setTailCall(false);
return call;
}
-llvm::Value * Instructions::lit(llvm::Value *in)
+llvm::Value * Instructions::cnd(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3)
{
- if (!m_llvmLit) {
- m_llvmLit = m_mod->getFunction("lit");
- }
- CallInst *call = m_builder.CreateCall(m_llvmLit, in, name("litres"));
- call->setCallingConv(CallingConv::C);
- call->setTailCall(false);
- return call;
+ std::vector<llvm::Value*> vec1 = extractVector(in1);
+ std::vector<llvm::Value*> vec2 = extractVector(in2);
+ std::vector<llvm::Value*> vec3 = extractVector(in3);
+ Constant *half = ConstantFP::get(APFloat(0.5f));
+
+ Value *xcmp = m_builder.CreateFCmpOGT(vec1[0], half, name("xcmp"));
+ Value *selx = m_builder.CreateSelect(xcmp, vec2[0], vec3[0],
+ name("selx"));
+
+ Value *ycmp = m_builder.CreateFCmpOGT(vec1[1], half, name("ycmp"));
+ Value *sely = m_builder.CreateSelect(ycmp, vec2[1], vec3[1],
+ name("sely"));
+
+ Value *zcmp = m_builder.CreateFCmpOGT(vec1[2], half, name("zcmp"));
+ Value *selz = m_builder.CreateSelect(zcmp, vec2[2], vec3[2],
+ name("selz"));
+
+ Value *wcmp = m_builder.CreateFCmpOGT(vec1[3], half, name("wcmp"));
+ Value *selw = m_builder.CreateSelect(wcmp, vec2[3], vec3[3],
+ name("selw"));
+
+ return vectorFromVals(selx, sely, selz, selw);
}
-llvm::Value * Instructions::sub(llvm::Value *in1, llvm::Value *in2)
+llvm::Value * Instructions::cnd0(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3)
{
- Value *res = m_builder.CreateSub(in1, in2, name("sub"));
- return res;
+ std::vector<llvm::Value*> vec1 = extractVector(in1);
+ std::vector<llvm::Value*> vec2 = extractVector(in2);
+ std::vector<llvm::Value*> vec3 = extractVector(in3);
+ Constant *zero = Constant::getNullValue(Type::FloatTy);
+
+ Value *xcmp = m_builder.CreateFCmpOGE(vec1[0], zero, name("xcmp"));
+ Value *selx = m_builder.CreateSelect(xcmp, vec2[0], vec3[0],
+ name("selx"));
+
+ Value *ycmp = m_builder.CreateFCmpOGE(vec1[1], zero, name("ycmp"));
+ Value *sely = m_builder.CreateSelect(ycmp, vec2[1], vec3[1],
+ name("sely"));
+
+ Value *zcmp = m_builder.CreateFCmpOGE(vec1[2], zero, name("zcmp"));
+ Value *selz = m_builder.CreateSelect(zcmp, vec2[2], vec3[2],
+ name("selz"));
+
+ Value *wcmp = m_builder.CreateFCmpOGE(vec1[3], zero, name("wcmp"));
+ Value *selw = m_builder.CreateSelect(wcmp, vec2[3], vec3[3],
+ name("selw"));
+
+ return vectorFromVals(selx, sely, selz, selw);
}
-llvm::Value * Instructions::callPow(llvm::Value *val1, llvm::Value *val2)
+llvm::Value * Instructions::cos(llvm::Value *in)
{
- if (!m_llvmPow) {
- // predeclare the intrinsic
- std::vector<const Type*> powArgs;
- powArgs.push_back(Type::FloatTy);
- powArgs.push_back(Type::FloatTy);
- PAListPtr powPal;
- FunctionType* powType = FunctionType::get(
- /*Result=*/Type::FloatTy,
- /*Params=*/powArgs,
- /*isVarArg=*/false);
- m_llvmPow = Function::Create(
- /*Type=*/powType,
- /*Linkage=*/GlobalValue::ExternalLinkage,
- /*Name=*/"llvm.pow.f32", m_mod);
- m_llvmPow->setCallingConv(CallingConv::C);
- m_llvmPow->setParamAttrs(powPal);
- }
- std::vector<Value*> params;
- params.push_back(val1);
- params.push_back(val2);
- CallInst *call = m_builder.CreateCall(m_llvmPow, params.begin(), params.end(),
- name("pow"));
- call->setCallingConv(CallingConv::C);
+#if 0
+ llvm::Function *func = m_mod->getFunction("vcos");
+ assert(func);
+
+ CallInst *call = m_builder.CreateCall(func, in, name("cosres"));
call->setTailCall(false);
return call;
+#else
+ std::vector<llvm::Value*> elems = extractVector(in);
+ Function *func = m_mod->getFunction("cosf");
+ assert(func);
+ CallInst *cos = m_builder.CreateCall(func, elems[0], name("cosres"));
+ cos->setCallingConv(CallingConv::C);
+ cos->setTailCall(true);
+ return vectorFromVals(cos, cos, cos, cos);
+#endif
}
-llvm::Value * Instructions::pow(llvm::Value *in1, llvm::Value *in2)
+llvm::Value * Instructions::cross(llvm::Value *in1, llvm::Value *in2)
{
Value *x1 = m_builder.CreateExtractElement(in1,
m_storage->constantInt(0),
name("x1"));
+ Value *y1 = m_builder.CreateExtractElement(in1,
+ m_storage->constantInt(1),
+ name("y1"));
+ Value *z1 = m_builder.CreateExtractElement(in1,
+ m_storage->constantInt(2),
+ name("z1"));
+
Value *x2 = m_builder.CreateExtractElement(in2,
m_storage->constantInt(0),
name("x2"));
- llvm::Value *val = callPow(x1, x2);
- return vectorFromVals(val, val, val, val);
+ Value *y2 = m_builder.CreateExtractElement(in2,
+ m_storage->constantInt(1),
+ name("y2"));
+ Value *z2 = m_builder.CreateExtractElement(in2,
+ m_storage->constantInt(2),
+ name("z2"));
+ Value *y1z2 = mul(y1, z2);
+ Value *z1y2 = mul(z1, y2);
+
+ Value *z1x2 = mul(z1, x2);
+ Value *x1z2 = mul(x1, z2);
+
+ Value *x1y2 = mul(x1, y2);
+ Value *y1x2 = mul(y1, x2);
+
+ return vectorFromVals(sub(y1z2, z1y2), sub(z1x2, x1z2), sub(x1y2, y1x2));
}
-llvm::Value * Instructions::rcp(llvm::Value *in1)
+llvm::Value * Instructions::ddx(llvm::Value *in)
 {
- Value *x1 = m_builder.CreateExtractElement(in1,
- m_storage->constantInt(0),
- name("x1"));
- Value *res = m_builder.CreateFDiv(ConstantFP::get(APFloat(1.f)),
- x1, name("rcp"));
- return vectorFromVals(res, res, res, res);
+   // FIXME: the x derivative is not implemented. Debug builds stop at the
+   // assert; builds without asserts get a zero vector so that this
+   // non-void function never runs off its end.
+   (void)in;
+   assert(0);
+   return Constant::getNullValue(m_floatVecType);
+}
+
+// Placeholder for the y derivative of `in`; not implemented yet.
+llvm::Value * Instructions::ddy(llvm::Value *in)
+{
+   // FIXME: the y derivative is not implemented. Debug builds stop at the
+   // assert; builds without asserts get a zero vector so that this
+   // non-void function never runs off its end.
+   (void)in;
+   assert(0);
+   return Constant::getNullValue(m_floatVecType);
+}
+
+// Emits a floating point division of in1 by in2 and returns the result.
+llvm::Value * Instructions::div(llvm::Value *in1, llvm::Value *in2)
+{
+   Value *quotient = m_builder.CreateFDiv(in1, in2, name("div"));
+   return quotient;
+}
+
+// Emits in1.x*in2.x + in1.y*in2.y + in3.x and replicates that scalar into
+// all four lanes of the returned vector.
+llvm::Value * Instructions::dot2add(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3)
+{
+   Value *mulRes = mul(in1, in2);
+   Value *x = m_builder.CreateExtractElement(mulRes,
+                                             m_storage->constantInt(0),
+                                             name("extractx"));
+   Value *y = m_builder.CreateExtractElement(mulRes,
+                                             m_storage->constantInt(1),
+                                             name("extracty"));
+   // The scalar addend is the x lane of the third operand (DP2A adds
+   // src2.x); reading lane 2 here picked up an unrelated component.
+   Value *addend = m_builder.CreateExtractElement(in3,
+                                                  m_storage->constantInt(0),
+                                                  name("extractadd"));
+   Value *xy = m_builder.CreateAdd(x, y, name("xy"));
+   Value *dot2add = m_builder.CreateAdd(xy, addend, name("dot2add"));
+   return vectorFromVals(dot2add, dot2add, dot2add, dot2add);
+}
+
+// Two-component dot product: multiplies the operands with mul(), adds
+// lanes 0 and 1 of the product and replicates the sum into all four lanes.
+llvm::Value * Instructions::dp2(llvm::Value *in1, llvm::Value *in2)
+{
+   Value *products = mul(in1, in2);
+   Value *px = m_builder.CreateExtractElement(products,
+                                              m_storage->constantInt(0),
+                                              name("extractx"));
+   Value *py = m_builder.CreateExtractElement(products,
+                                              m_storage->constantInt(1),
+                                              name("extracty"));
+   Value *sum = m_builder.CreateAdd(px, py, name("xy"));
+   return vectorFromVals(sum, sum, sum, sum);
+}
+
+// Three-component dot product: multiplies the operands with mul(), adds
+// lanes 0, 1 and 2 of the product and replicates the sum into all lanes.
+llvm::Value * Instructions::dp3(llvm::Value *in1, llvm::Value *in2)
+{
+   Value *products = mul(in1, in2);
+   Value *px = m_builder.CreateExtractElement(products,
+                                              m_storage->constantInt(0),
+                                              name("extractx"));
+   Value *py = m_builder.CreateExtractElement(products,
+                                              m_storage->constantInt(1),
+                                              name("extracty"));
+   Value *pz = m_builder.CreateExtractElement(products,
+                                              m_storage->constantInt(2),
+                                              name("extractz"));
+   Value *partial = m_builder.CreateAdd(px, py, name("xy"));
+   Value *sum = m_builder.CreateAdd(partial, pz, name("dot3"));
+   return vectorFromVals(sum, sum, sum, sum);
 }
llvm::Value * Instructions::dp4(llvm::Value *in1, llvm::Value *in2)
@@ -321,6 +399,53 @@ llvm::Value * Instructions::dst(llvm::Value *in1, llvm::Value *in2)
ry, z, w);
}
+// Switches from the "then" part of the innermost open if to its "else"
+// part: the current block branches to a fresh "ifend" block, code
+// generation continues in the block that was on top of the if stack
+// (renamed to "ifelse"), and "ifend" replaces it on the stack.
+void Instructions::elseop()
+{
+   assert(!m_ifStack.empty());
+   BasicBlock *merge = BasicBlock::Create(name("ifend"), m_func, 0);
+   m_builder.CreateBr(merge);
+
+   BasicBlock *elseBlock = m_ifStack.top();
+   m_ifStack.pop();
+   m_builder.SetInsertPoint(elseBlock);
+   currentBlock()->setName(name("ifelse"));
+   m_ifStack.push(merge);
+}
+
+// Closes the innermost open if: branches to the block on top of the if
+// stack, continues code generation there and pops it.
+void Instructions::endif()
+{
+   assert(!m_ifStack.empty());
+   BasicBlock *merge = m_ifStack.top();
+   m_ifStack.pop();
+   m_builder.CreateBr(merge);
+   m_builder.SetInsertPoint(merge);
+}
+
+// Closes the innermost open loop: emits the back edge to the loop's begin
+// block, moves the loop's end block right after the current block and
+// continues code generation in that end block.
+void Instructions::endLoop()
+{
+   assert(!m_loopStack.empty());
+   // Work on a copy so the stack entry can be dropped straight away.
+   Loop finished = m_loopStack.top();
+   m_loopStack.pop();
+
+   m_builder.CreateBr(finished.begin);
+   finished.end->moveAfter(currentBlock());
+   m_builder.SetInsertPoint(finished.end);
+}
+
+// Terminates the code generated so far with a `ret void` at the
+// builder's current insert point.
+void Instructions::end()
+{
+ m_builder.CreateRetVoid();
+}
+
+// Leaves the function currently being generated: clears m_func and
+// resets the builder's insert point to null.
+void Instructions::endSub()
+{
+ m_func = 0;
+ m_builder.SetInsertPoint(0);
+}
+
+// Applies callFExp() to each of the four components of `in` and packs the
+// results into a new vector.
+llvm::Value * Instructions::exp(llvm::Value *in)
+{
+   std::vector<llvm::Value*> vec = extractVector(in);
+   // Emit the calls as separate statements: the evaluation order of
+   // function arguments is unspecified, so nesting them in one argument
+   // list made the instruction order and value numbering depend on the
+   // compiler.
+   Value *ex = callFExp(vec[0]);
+   Value *ey = callFExp(vec[1]);
+   Value *ez = callFExp(vec[2]);
+   Value *ew = callFExp(vec[3]);
+   return vectorFromVals(ex, ey, ez, ew);
+}
+
llvm::Value * Instructions::ex2(llvm::Value *in)
{
llvm::Value *val = callPow(ConstantFP::get(APFloat(2.f)),
@@ -330,31 +455,6 @@ llvm::Value * Instructions::ex2(llvm::Value *in)
return vectorFromVals(val, val, val, val);
}
-llvm::Value * Instructions::callFloor(llvm::Value *val)
-{
- if (!m_llvmFloor) {
- // predeclare the intrinsic
- std::vector<const Type*> floorArgs;
- floorArgs.push_back(Type::FloatTy);
- PAListPtr floorPal;
- FunctionType* floorType = FunctionType::get(
- /*Result=*/Type::FloatTy,
- /*Params=*/floorArgs,
- /*isVarArg=*/false);
- m_llvmFloor = Function::Create(
- /*Type=*/floorType,
- /*Linkage=*/GlobalValue::ExternalLinkage,
- /*Name=*/"floorf", m_mod);
- m_llvmFloor->setCallingConv(CallingConv::C);
- m_llvmFloor->setParamAttrs(floorPal);
- }
- CallInst *call = m_builder.CreateCall(m_llvmFloor, val,
- name("floorf"));
- call->setCallingConv(CallingConv::C);
- call->setTailCall(false);
- return call;
-}
-
llvm::Value * Instructions::floor(llvm::Value *in)
{
std::vector<llvm::Value*> vec = extractVector(in);
@@ -362,42 +462,52 @@ llvm::Value * Instructions::floor(llvm::Value *in)
callFloor(vec[2]), callFloor(vec[3]));
}
-llvm::Value * Instructions::arl(llvm::Value *in)
-{
- return floor(in);
-}
-
llvm::Value * Instructions::frc(llvm::Value *in)
{
llvm::Value *flr = floor(in);
return sub(in, flr);
}
-llvm::Value * Instructions::callFLog(llvm::Value *val)
+void Instructions::ifop(llvm::Value *in)
{
- if (!m_llvmFlog) {
- // predeclare the intrinsic
- std::vector<const Type*> flogArgs;
- flogArgs.push_back(Type::FloatTy);
- PAListPtr flogPal;
- FunctionType* flogType = FunctionType::get(
- /*Result=*/Type::FloatTy,
- /*Params=*/flogArgs,
- /*isVarArg=*/false);
- m_llvmFlog = Function::Create(
- /*Type=*/flogType,
- /*Linkage=*/GlobalValue::ExternalLinkage,
- /*Name=*/"logf", m_mod);
- m_llvmFlog->setCallingConv(CallingConv::C);
- m_llvmFlog->setParamAttrs(flogPal);
- }
- CallInst *call = m_builder.CreateCall(m_llvmFlog, val,
- name("logf"));
- call->setCallingConv(CallingConv::C);
+ BasicBlock *ifthen = BasicBlock::Create(name("ifthen"), m_func,0);
+ BasicBlock *ifend = BasicBlock::Create(name("ifthenend"), m_func,0);
+
+ //BasicBlock *yblock = new BasicBlock(name("yblock"), m_func,0);
+ //BasicBlock *zblock = new BasicBlock(name("zblock"), m_func,0);
+ //BasicBlock *wblock = new BasicBlock(name("wblock"), m_func,0);
+
+ Constant *float0 = Constant::getNullValue(Type::FloatTy);
+
+ Value *x = m_builder.CreateExtractElement(in, m_storage->constantInt(0),
+ name("extractx"));
+ Value *xcmp = m_builder.CreateFCmpUNE(x, float0, name("xcmp"));
+ m_builder.CreateCondBr(xcmp, ifthen, ifend);
+ //m_builder.SetInsertPoint(yblock);
+
+ m_builder.SetInsertPoint(ifthen);
+ m_ifStack.push(ifend);
+}
+
+// Emits a call to the module's "kil" function with `in` as its only
+// argument and returns the call; the function must already be present
+// in the module.
+llvm::Value * Instructions::kil(llvm::Value *in)
+{
+   llvm::Function *kilFn = m_mod->getFunction("kil");
+   assert(kilFn);
+
+   CallInst *call = m_builder.CreateCall(kilFn, in, name("kilpres"));
 call->setTailCall(false);
 return call;
 }
+// Linear interpolation: in1 * in2 + (1 - in1) * in3, built from the
+// mul/sub/add helpers.
+llvm::Value * Instructions::lerp(llvm::Value *in1, llvm::Value *in2,
+                                 llvm::Value *in3)
+{
+   llvm::Value *weighted2 = mul(in1, in2);
+   llvm::Value *ones = constVector(1.f, 1.f, 1.f, 1.f);
+   llvm::Value *complement = sub(ones, in1);
+   llvm::Value *weighted3 = mul(complement, in3);
+   return add(weighted2, weighted3);
+}
+
llvm::Value * Instructions::lg2(llvm::Value *in)
{
std::vector<llvm::Value*> vec = extractVector(in);
@@ -407,142 +517,176 @@ llvm::Value * Instructions::lg2(llvm::Value *in)
callFLog(vec[2]), callFLog(vec[3])), const_vec);
}
-llvm::Value * Instructions::min(llvm::Value *in1, llvm::Value *in2)
+// Emits a call to the module's "lit" function with `in` as its only
+// argument. The function is looked up on first use and cached in
+// m_llvmLit.
+llvm::Value * Instructions::lit(llvm::Value *in)
+{
+   if (!m_llvmLit) {
+      m_llvmLit = m_mod->getFunction("lit");
+   }
+   // Fail loudly if the function is missing from the module, matching
+   // the other by-name lookups in this file, instead of handing a null
+   // callee to the builder.
+   assert(m_llvmLit);
+   CallInst *call = m_builder.CreateCall(m_llvmLit, in, name("litres"));
+   call->setCallingConv(CallingConv::C);
+   call->setTailCall(false);
+   return call;
+}
+
+// Applies callFLog() to each of the four components of `in` and packs the
+// results into a new vector.
+llvm::Value * Instructions::log(llvm::Value *in)
+{
+   std::vector<llvm::Value*> vec = extractVector(in);
+   // Emit the calls as separate statements: the evaluation order of
+   // function arguments is unspecified, so nesting them in one argument
+   // list made the instruction order and value numbering depend on the
+   // compiler.
+   Value *lx = callFLog(vec[0]);
+   Value *ly = callFLog(vec[1]);
+   Value *lz = callFLog(vec[2]);
+   Value *lw = callFLog(vec[3]);
+   return vectorFromVals(lx, ly, lz, lw);
+}
+
+// Multiply-add: in1 * in2 + in3, built from the mul() and add() helpers.
+llvm::Value * Instructions::madd(llvm::Value *in1, llvm::Value *in2,
+                                 llvm::Value *in3)
+{
+   Value *product = mul(in1, in2);
+   Value *sum = add(product, in3);
+   return sum;
+}
+
+llvm::Value * Instructions::max(llvm::Value *in1, llvm::Value *in2)
{
std::vector<llvm::Value*> vec1 = extractVector(in1);
std::vector<llvm::Value*> vec2 = extractVector(in2);
- Value *xcmp = m_builder.CreateFCmpOLT(vec1[0], vec2[0], name("xcmp"));
+ Value *xcmp = m_builder.CreateFCmpOGT(vec1[0], vec2[0],
+ name("xcmp"));
Value *selx = m_builder.CreateSelect(xcmp, vec1[0], vec2[0],
name("selx"));
- Value *ycmp = m_builder.CreateFCmpOLT(vec1[1], vec2[1], name("ycmp"));
+ Value *ycmp = m_builder.CreateFCmpOGT(vec1[1], vec2[1],
+ name("ycmp"));
Value *sely = m_builder.CreateSelect(ycmp, vec1[1], vec2[1],
name("sely"));
- Value *zcmp = m_builder.CreateFCmpOLT(vec1[2], vec2[2], name("zcmp"));
+ Value *zcmp = m_builder.CreateFCmpOGT(vec1[2], vec2[2],
+ name("zcmp"));
Value *selz = m_builder.CreateSelect(zcmp, vec1[2], vec2[2],
name("selz"));
- Value *wcmp = m_builder.CreateFCmpOLT(vec1[3], vec2[3], name("wcmp"));
+ Value *wcmp = m_builder.CreateFCmpOGT(vec1[3], vec2[3],
+ name("wcmp"));
Value *selw = m_builder.CreateSelect(wcmp, vec1[3], vec2[3],
name("selw"));
return vectorFromVals(selx, sely, selz, selw);
}
-llvm::Value * Instructions::max(llvm::Value *in1, llvm::Value *in2)
+llvm::Value * Instructions::min(llvm::Value *in1, llvm::Value *in2)
{
std::vector<llvm::Value*> vec1 = extractVector(in1);
std::vector<llvm::Value*> vec2 = extractVector(in2);
- Value *xcmp = m_builder.CreateFCmpOGT(vec1[0], vec2[0],
- name("xcmp"));
+ Value *xcmp = m_builder.CreateFCmpOLT(vec1[0], vec2[0], name("xcmp"));
Value *selx = m_builder.CreateSelect(xcmp, vec1[0], vec2[0],
name("selx"));
- Value *ycmp = m_builder.CreateFCmpOGT(vec1[1], vec2[1],
- name("ycmp"));
+ Value *ycmp = m_builder.CreateFCmpOLT(vec1[1], vec2[1], name("ycmp"));
Value *sely = m_builder.CreateSelect(ycmp, vec1[1], vec2[1],
name("sely"));
- Value *zcmp = m_builder.CreateFCmpOGT(vec1[2], vec2[2],
- name("zcmp"));
+ Value *zcmp = m_builder.CreateFCmpOLT(vec1[2], vec2[2], name("zcmp"));
Value *selz = m_builder.CreateSelect(zcmp, vec1[2], vec2[2],
name("selz"));
- Value *wcmp = m_builder.CreateFCmpOGT(vec1[3], vec2[3],
- name("wcmp"));
+ Value *wcmp = m_builder.CreateFCmpOLT(vec1[3], vec2[3], name("wcmp"));
Value *selw = m_builder.CreateSelect(wcmp, vec1[3], vec2[3],
name("selw"));
return vectorFromVals(selx, sely, selz, selw);
}
-void Instructions::printVector(llvm::Value *val)
+llvm::Value * Instructions::mul(llvm::Value *in1, llvm::Value *in2)
{
- static const char *frmt = "Vector is [%f, %f, %f, %f]\x0A";
+ return m_builder.CreateMul(in1, in2, name("mul"));
+}
- if (!m_fmtPtr) {
- Constant *format = ConstantArray::get(frmt, true);
- ArrayType *arrayTy = ArrayType::get(IntegerType::get(8), strlen(frmt) + 1);
- GlobalVariable* globalFormat = new GlobalVariable(
- /*Type=*/arrayTy,
- /*isConstant=*/true,
- /*Linkage=*/GlobalValue::InternalLinkage,
- /*Initializer=*/0, // has initializer, specified below
- /*Name=*/name(".str"),
- m_mod);
- globalFormat->setInitializer(format);
+// Emits the negation of `in` and returns it.
+llvm::Value * Instructions::neg(llvm::Value *in)
+{
+   return m_builder.CreateNeg(in, name("neg"));
+}
- Constant* const_int0 = Constant::getNullValue(IntegerType::get(32));
- std::vector<Constant*> const_ptr_21_indices;
- const_ptr_21_indices.push_back(const_int0);
- const_ptr_21_indices.push_back(const_int0);
- m_fmtPtr = ConstantExpr::getGetElementPtr(globalFormat,
- &const_ptr_21_indices[0], const_ptr_21_indices.size());
- }
+// Normalizes `in` by the length of its xyz part: in * 1/sqrt(x*x+y*y+z*z).
+// NOTE(review): the w lane is scaled by the same factor - confirm whether
+// callers expect a different w.
+llvm::Value * Instructions::nrm(llvm::Value *in)
+{
+   // dp3() replicates x*x + y*y + z*z into every lane, so rsq(), which
+   // reads lane 0, yields the reciprocal length in all lanes. Calling
+   // rsq(in) directly only used in.x and did not normalize anything.
+   llvm::Value *invLength = rsq(dp3(in, in));
+   return mul(invLength, in);
+}
- Function *func_printf = m_mod->getFunction("printf");
- if (!func_printf)
- func_printf = declarePrintf();
- assert(func_printf);
- std::vector<llvm::Value*> vec = extractVector(val);
- Value *dx = m_builder.CreateFPExt(vec[0], Type::DoubleTy, name("dx"));
- Value *dy = m_builder.CreateFPExt(vec[1], Type::DoubleTy, name("dy"));
- Value *dz = m_builder.CreateFPExt(vec[2], Type::DoubleTy, name("dz"));
- Value *dw = m_builder.CreateFPExt(vec[3], Type::DoubleTy, name("dw"));
- std::vector<Value*> params;
- params.push_back(m_fmtPtr);
- params.push_back(dx);
- params.push_back(dy);
- params.push_back(dz);
- params.push_back(dw);
- CallInst *call = m_builder.CreateCall(func_printf, params.begin(), params.end(),
- name("printf"));
- call->setCallingConv(CallingConv::C);
- call->setTailCall(true);
+// Raises lane 0 of in1 to the power of lane 0 of in2 via callPow() and
+// replicates the scalar result into all four lanes.
+llvm::Value * Instructions::pow(llvm::Value *in1, llvm::Value *in2)
+{
+   Value *base = m_builder.CreateExtractElement(in1,
+                                                m_storage->constantInt(0),
+                                                name("x1"));
+   Value *exponent = m_builder.CreateExtractElement(in2,
+                                                    m_storage->constantInt(0),
+                                                    name("x2"));
+   llvm::Value *power = callPow(base, exponent);
+   return vectorFromVals(power, power, power, power);
 }
-llvm::Function * Instructions::declarePrintf()
+llvm::Value * Instructions::rcp(llvm::Value *in1)
{
- std::vector<const Type*> args;
- PAListPtr params;
- FunctionType* funcTy = FunctionType::get(
- /*Result=*/IntegerType::get(32),
- /*Params=*/args,
- /*isVarArg=*/true);
- Function* func_printf = Function::Create(
- /*Type=*/funcTy,
- /*Linkage=*/GlobalValue::ExternalLinkage,
- /*Name=*/"printf", m_mod);
- func_printf->setCallingConv(CallingConv::C);
- func_printf->setParamAttrs(params);
- return func_printf;
+ Value *x1 = m_builder.CreateExtractElement(in1,
+ m_storage->constantInt(0),
+ name("x1"));
+ Value *res = m_builder.CreateFDiv(ConstantFP::get(APFloat(1.f)),
+ x1, name("rcp"));
+ return vectorFromVals(res, res, res, res);
+}
+
+// Reciprocal square root of lane 0: computes 1 / sqrt(|in1.x|) and
+// replicates the scalar result into all four lanes.
+llvm::Value * Instructions::rsq(llvm::Value *in1)
+{
+   Value *lane0 = m_builder.CreateExtractElement(in1,
+                                                 m_storage->constantInt(0),
+                                                 name("extractx"));
+   // Take the absolute value first so the square root never sees a
+   // negative input.
+   Value *magnitude = callFAbs(lane0);
+   Value *root = callFSqrt(magnitude);
+
+   Value *inverse = m_builder.CreateFDiv(ConstantFP::get(APFloat(1.f)),
+                                         root,
+                                         name("rsqrt"));
+   return vectorFromVals(inverse, inverse, inverse, inverse);
 }
+llvm::Value * Instructions::scs(llvm::Value *in)
+{
+ llvm::Function *func = m_mod->getFunction("scs");
+ assert(func);
-llvm::Value * Instructions::sgt(llvm::Value *in1, llvm::Value *in2)
+ CallInst *call = m_builder.CreateCall(func, in, name("scsres"));
+ call->setTailCall(false);
+ return call;
+}
+
+llvm::Value * Instructions::seq(llvm::Value *in1, llvm::Value *in2)
{
Constant *const1f = ConstantFP::get(APFloat(1.000000e+00f));
Constant *const0f = Constant::getNullValue(Type::FloatTy);
std::vector<llvm::Value*> vec1 = extractVector(in1);
std::vector<llvm::Value*> vec2 = extractVector(in2);
- Value *xcmp = m_builder.CreateFCmpOGT(vec1[0], vec2[0], name("xcmp"));
+
+ Value *xcmp = m_builder.CreateFCmpOEQ(vec1[0], vec2[0], name("xcmp"));
Value *x = m_builder.CreateSelect(xcmp, const1f, const0f, name("xsel"));
- Value *ycmp = m_builder.CreateFCmpOGT(vec1[1], vec2[1], name("ycmp"));
+ Value *ycmp = m_builder.CreateFCmpOEQ(vec1[1], vec2[1], name("ycmp"));
Value *y = m_builder.CreateSelect(ycmp, const1f, const0f, name("ysel"));
- Value *zcmp = m_builder.CreateFCmpOGT(vec1[2], vec2[2], name("zcmp"));
+ Value *zcmp = m_builder.CreateFCmpOEQ(vec1[2], vec2[2], name("zcmp"));
Value *z = m_builder.CreateSelect(zcmp, const1f, const0f, name("zsel"));
- Value *wcmp = m_builder.CreateFCmpOGT(vec1[3], vec2[3], name("wcmp"));
+ Value *wcmp = m_builder.CreateFCmpOEQ(vec1[3], vec2[3], name("wcmp"));
Value *w = m_builder.CreateSelect(wcmp, const1f, const0f, name("wsel"));
return vectorFromVals(x, y, z, w);
}
+
+// Ignores both operands and returns a vector with 0.0 in every lane.
+llvm::Value * Instructions::sfl(llvm::Value *in1, llvm::Value *in2)
+{
+   Constant *zero = Constant::getNullValue(Type::FloatTy);
+
+   return vectorFromVals(zero, zero, zero, zero);
+}
+
llvm::Value * Instructions::sge(llvm::Value *in1, llvm::Value *in2)
{
Constant *const1f = ConstantFP::get(APFloat(1.000000e+00f));
@@ -566,157 +710,118 @@ llvm::Value * Instructions::sge(llvm::Value *in1, llvm::Value *in2)
return vectorFromVals(x, y, z, w);
}
-
-llvm::Value * Instructions::slt(llvm::Value *in1, llvm::Value *in2)
+llvm::Value * Instructions::sgt(llvm::Value *in1, llvm::Value *in2)
{
Constant *const1f = ConstantFP::get(APFloat(1.000000e+00f));
Constant *const0f = Constant::getNullValue(Type::FloatTy);
std::vector<llvm::Value*> vec1 = extractVector(in1);
std::vector<llvm::Value*> vec2 = extractVector(in2);
-
- Value *xcmp = m_builder.CreateFCmpOLT(vec1[0], vec2[0], name("xcmp"));
+ Value *xcmp = m_builder.CreateFCmpOGT(vec1[0], vec2[0], name("xcmp"));
Value *x = m_builder.CreateSelect(xcmp, const1f, const0f, name("xsel"));
- Value *ycmp = m_builder.CreateFCmpOLT(vec1[1], vec2[1], name("ycmp"));
+ Value *ycmp = m_builder.CreateFCmpOGT(vec1[1], vec2[1], name("ycmp"));
Value *y = m_builder.CreateSelect(ycmp, const1f, const0f, name("ysel"));
- Value *zcmp = m_builder.CreateFCmpOLT(vec1[2], vec2[2], name("zcmp"));
+ Value *zcmp = m_builder.CreateFCmpOGT(vec1[2], vec2[2], name("zcmp"));
Value *z = m_builder.CreateSelect(zcmp, const1f, const0f, name("zsel"));
- Value *wcmp = m_builder.CreateFCmpOLT(vec1[3], vec2[3], name("wcmp"));
+ Value *wcmp = m_builder.CreateFCmpOGT(vec1[3], vec2[3], name("wcmp"));
Value *w = m_builder.CreateSelect(wcmp, const1f, const0f, name("wsel"));
return vectorFromVals(x, y, z, w);
}
-llvm::Value * Instructions::cross(llvm::Value *in1, llvm::Value *in2)
+llvm::Value * Instructions::sin(llvm::Value *in)
{
- Value *x1 = m_builder.CreateExtractElement(in1,
- m_storage->constantInt(0),
- name("x1"));
- Value *y1 = m_builder.CreateExtractElement(in1,
- m_storage->constantInt(1),
- name("y1"));
- Value *z1 = m_builder.CreateExtractElement(in1,
- m_storage->constantInt(2),
- name("z1"));
+ llvm::Function *func = m_mod->getFunction("vsin");
+ assert(func);
- Value *x2 = m_builder.CreateExtractElement(in2,
- m_storage->constantInt(0),
- name("x2"));
- Value *y2 = m_builder.CreateExtractElement(in2,
- m_storage->constantInt(1),
- name("y2"));
- Value *z2 = m_builder.CreateExtractElement(in2,
- m_storage->constantInt(2),
- name("z2"));
- Value *y1z2 = mul(y1, z2);
- Value *z1y2 = mul(z1, y2);
+ CallInst *call = m_builder.CreateCall(func, in, name("sinres"));
+ call->setTailCall(false);
+ return call;
+}
- Value *z1x2 = mul(z1, x2);
- Value *x1z2 = mul(x1, z2);
+llvm::Value * Instructions::sle(llvm::Value *in1, llvm::Value *in2)
+{
+ Constant *const1f = ConstantFP::get(APFloat(1.000000e+00f));
+ Constant *const0f = Constant::getNullValue(Type::FloatTy);
- Value *x1y2 = mul(x1, y2);
- Value *y1x2 = mul(y1, x2);
+ std::vector<llvm::Value*> vec1 = extractVector(in1);
+ std::vector<llvm::Value*> vec2 = extractVector(in2);
- return vectorFromVals(sub(y1z2, z1y2), sub(z1x2, x1z2), sub(x1y2, y1x2));
-}
+ Value *xcmp = m_builder.CreateFCmpOLE(vec1[0], vec2[0], name("xcmp"));
+ Value *x = m_builder.CreateSelect(xcmp, const1f, const0f, name("xsel"));
+ Value *ycmp = m_builder.CreateFCmpOLE(vec1[1], vec2[1], name("ycmp"));
+ Value *y = m_builder.CreateSelect(ycmp, const1f, const0f, name("ysel"));
-llvm::Value * Instructions::abs(llvm::Value *in)
-{
- std::vector<llvm::Value*> vec = extractVector(in);
- Value *xabs = callFAbs(vec[0]);
- Value *yabs = callFAbs(vec[1]);
- Value *zabs = callFAbs(vec[2]);
- Value *wabs = callFAbs(vec[3]);
- return vectorFromVals(xabs, yabs, zabs, wabs);
+ Value *zcmp = m_builder.CreateFCmpOLE(vec1[2], vec2[2], name("zcmp"));
+ Value *z = m_builder.CreateSelect(zcmp, const1f, const0f, name("zsel"));
+
+ Value *wcmp = m_builder.CreateFCmpOLE(vec1[3], vec2[3], name("wcmp"));
+ Value *w = m_builder.CreateSelect(wcmp, const1f, const0f, name("wsel"));
+
+ return vectorFromVals(x, y, z, w);
}
-void Instructions::ifop(llvm::Value *in)
+llvm::Value * Instructions::slt(llvm::Value *in1, llvm::Value *in2)
{
- BasicBlock *ifthen = BasicBlock::Create(name("ifthen"), m_func,0);
- BasicBlock *ifend = BasicBlock::Create(name("ifthenend"), m_func,0);
+ Constant *const1f = ConstantFP::get(APFloat(1.000000e+00f));
+ Constant *const0f = Constant::getNullValue(Type::FloatTy);
- //BasicBlock *yblock = new BasicBlock(name("yblock"), m_func,0);
- //BasicBlock *zblock = new BasicBlock(name("zblock"), m_func,0);
- //BasicBlock *wblock = new BasicBlock(name("wblock"), m_func,0);
+ std::vector<llvm::Value*> vec1 = extractVector(in1);
+ std::vector<llvm::Value*> vec2 = extractVector(in2);
- Constant *float0 = Constant::getNullValue(Type::FloatTy);
+ Value *xcmp = m_builder.CreateFCmpOLT(vec1[0], vec2[0], name("xcmp"));
+ Value *x = m_builder.CreateSelect(xcmp, const1f, const0f, name("xsel"));
- Value *x = m_builder.CreateExtractElement(in, m_storage->constantInt(0),
- name("extractx"));
- Value *xcmp = m_builder.CreateFCmpUNE(x, float0, name("xcmp"));
- m_builder.CreateCondBr(xcmp, ifthen, ifend);
- //m_builder.SetInsertPoint(yblock);
+ Value *ycmp = m_builder.CreateFCmpOLT(vec1[1], vec2[1], name("ycmp"));
+ Value *y = m_builder.CreateSelect(ycmp, const1f, const0f, name("ysel"));
- m_builder.SetInsertPoint(ifthen);
- m_ifStack.push(ifend);
-}
+ Value *zcmp = m_builder.CreateFCmpOLT(vec1[2], vec2[2], name("zcmp"));
+ Value *z = m_builder.CreateSelect(zcmp, const1f, const0f, name("zsel"));
-llvm::BasicBlock * Instructions::currentBlock() const
-{
- return m_builder.GetInsertBlock();
-}
+ Value *wcmp = m_builder.CreateFCmpOLT(vec1[3], vec2[3], name("wcmp"));
+ Value *w = m_builder.CreateSelect(wcmp, const1f, const0f, name("wsel"));
-void Instructions::elseop()
-{
- assert(!m_ifStack.empty());
- BasicBlock *ifend = BasicBlock::Create(name("ifend"), m_func,0);
- m_builder.CreateBr(ifend);
- m_builder.SetInsertPoint(m_ifStack.top());
- currentBlock()->setName(name("ifelse"));
- m_ifStack.pop();
- m_ifStack.push(ifend);
+ return vectorFromVals(x, y, z, w);
}
-void Instructions::endif()
+llvm::Value * Instructions::sne(llvm::Value *in1, llvm::Value *in2)
{
- assert(!m_ifStack.empty());
- m_builder.CreateBr(m_ifStack.top());
- m_builder.SetInsertPoint(m_ifStack.top());
- m_ifStack.pop();
-}
+ Constant *const1f = ConstantFP::get(APFloat(1.000000e+00f));
+ Constant *const0f = Constant::getNullValue(Type::FloatTy);
-llvm::Value * Instructions::lerp(llvm::Value *in1, llvm::Value *in2,
- llvm::Value *in3)
-{
- llvm::Value *m = mul(in1, in2);
- llvm::Value *vec1 = constVector(1.f, 1.f, 1.f, 1.f);
- llvm::Value *s = sub(vec1, in1);
- return add(m, mul(s, in3));
-}
+ std::vector<llvm::Value*> vec1 = extractVector(in1);
+ std::vector<llvm::Value*> vec2 = extractVector(in2);
-void Instructions::beginLoop()
-{
- BasicBlock *begin = BasicBlock::Create(name("loop"), m_func,0);
- BasicBlock *end = BasicBlock::Create(name("endloop"), m_func,0);
+ Value *xcmp = m_builder.CreateFCmpONE(vec1[0], vec2[0], name("xcmp"));
+ Value *x = m_builder.CreateSelect(xcmp, const1f, const0f, name("xsel"));
- m_builder.CreateBr(begin);
- Loop loop;
- loop.begin = begin;
- loop.end = end;
- m_builder.SetInsertPoint(begin);
- m_loopStack.push(loop);
+ Value *ycmp = m_builder.CreateFCmpONE(vec1[1], vec2[1], name("ycmp"));
+ Value *y = m_builder.CreateSelect(ycmp, const1f, const0f, name("ysel"));
+
+ Value *zcmp = m_builder.CreateFCmpONE(vec1[2], vec2[2], name("zcmp"));
+ Value *z = m_builder.CreateSelect(zcmp, const1f, const0f, name("zsel"));
+
+ Value *wcmp = m_builder.CreateFCmpONE(vec1[3], vec2[3], name("wcmp"));
+ Value *w = m_builder.CreateSelect(wcmp, const1f, const0f, name("wsel"));
+
+ return vectorFromVals(x, y, z, w);
}
-void Instructions::endLoop()
+llvm::Value * Instructions::str(llvm::Value *in1, llvm::Value *in2)
{
- assert(!m_loopStack.empty());
- Loop loop = m_loopStack.top();
- m_builder.CreateBr(loop.begin);
- loop.end->moveAfter(currentBlock());
- m_builder.SetInsertPoint(loop.end);
- m_loopStack.pop();
+ Constant *const1f = ConstantFP::get(APFloat(1.000000e+00f));
+
+ return vectorFromVals(const1f, const1f, const1f, const1f);
}
-void Instructions::brk()
+llvm::Value * Instructions::sub(llvm::Value *in1, llvm::Value *in2)
{
- assert(!m_loopStack.empty());
- BasicBlock *unr = BasicBlock::Create(name("unreachable"), m_func,0);
- m_builder.CreateBr(m_loopStack.top().end);
- m_builder.SetInsertPoint(unr);
+ Value *res = m_builder.CreateSub(in1, in2, name("sub"));
+ return res;
}
llvm::Value * Instructions::trunc(llvm::Value *in)
@@ -741,18 +846,298 @@ llvm::Value * Instructions::trunc(llvm::Value *in)
return vectorFromVals(fx, fy, fz, fw);
}
-void Instructions::end()
+llvm::Value * Instructions::x2d(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3)
{
- m_builder.CreateRetVoid();
+ std::vector<llvm::Value*> vec1 = extractVector(in1);
+ std::vector<llvm::Value*> vec2 = extractVector(in2);
+ std::vector<llvm::Value*> vec3 = extractVector(in3);
+
+ Value *x2x3 = m_builder.CreateMul( vec2[0], vec3[0], name("x2x3"));
+ Value *y2y3 = m_builder.CreateMul( vec2[1], vec3[1], name("y2y3"));
+ Value *x1px2x3 = m_builder.CreateAdd (vec1[0], x2x3, name("x1 + x2x3"));
+ Value *x1px2x3py2y3 = m_builder.CreateAdd (x1px2x3, y2y3, name("x1 + x2x3 + y2y3"));
+
+ Value *x2z3 = m_builder.CreateMul( vec2[0], vec3[2], name("x2z3"));
+ Value *y2w3 = m_builder.CreateMul( vec2[1], vec3[3], name("y2w3"));
+ Value *y1px2z3 = m_builder.CreateAdd (vec1[1], x2z3, name("y1 + x2z3"));
+ Value *y1px2z3py2w3 = m_builder.CreateAdd (y1px2z3, y2w3, name("y1 + x2z3 + y2w3"));
+
+ return vectorFromVals(x1px2x3py2y3, y1px2z3py2w3, x1px2x3py2y3, y1px2z3py2w3);
}
-void Instructions::cal(int label, llvm::Value *input)
+void Instructions::printVector(llvm::Value *val)
{
+ static const char *frmt = "Vector is [%f, %f, %f, %f]\x0A";
+
+ if (!m_fmtPtr) {
+ Constant *format = ConstantArray::get(frmt, true);
+ ArrayType *arrayTy = ArrayType::get(IntegerType::get(8), strlen(frmt) + 1);
+ GlobalVariable* globalFormat = new GlobalVariable(
+ /*Type=*/arrayTy,
+ /*isConstant=*/true,
+ /*Linkage=*/GlobalValue::InternalLinkage,
+ /*Initializer=*/0, // has initializer, specified below
+ /*Name=*/name(".str"),
+ m_mod);
+ globalFormat->setInitializer(format);
+
+ Constant* const_int0 = Constant::getNullValue(IntegerType::get(32));
+ std::vector<Constant*> const_ptr_21_indices;
+ const_ptr_21_indices.push_back(const_int0);
+ const_ptr_21_indices.push_back(const_int0);
+ m_fmtPtr = ConstantExpr::getGetElementPtr(globalFormat,
+ &const_ptr_21_indices[0], const_ptr_21_indices.size());
+ }
+
+ Function *func_printf = m_mod->getFunction("printf");
+ if (!func_printf)
+ func_printf = declarePrintf();
+ assert(func_printf);
+ std::vector<llvm::Value*> vec = extractVector(val);
+ Value *dx = m_builder.CreateFPExt(vec[0], Type::DoubleTy, name("dx"));
+ Value *dy = m_builder.CreateFPExt(vec[1], Type::DoubleTy, name("dy"));
+ Value *dz = m_builder.CreateFPExt(vec[2], Type::DoubleTy, name("dz"));
+ Value *dw = m_builder.CreateFPExt(vec[3], Type::DoubleTy, name("dw"));
std::vector<Value*> params;
- params.push_back(input);
- llvm::Function *func = findFunction(label);
+ params.push_back(m_fmtPtr);
+ params.push_back(dx);
+ params.push_back(dy);
+ params.push_back(dz);
+ params.push_back(dw);
+ CallInst *call = m_builder.CreateCall(func_printf, params.begin(), params.end(),
+ name("printf"));
+ call->setCallingConv(CallingConv::C);
+ call->setTailCall(true);
+}
- m_builder.CreateCall(func, params.begin(), params.end());
+// Builds a fresh value name "<prefix><counter>" by bumping m_idx and
+// formatting into the m_name buffer (at most 32 bytes are written). The
+// returned pointer refers to that shared buffer, so the text is
+// overwritten by the next call.
+const char * Instructions::name(const char *prefix)
+{
+   m_idx += 1;
+   snprintf(m_name, 32, "%s%d", prefix, m_idx);
+   return m_name;
+}
+
+// Emits a call to the external function "ceilf" on the scalar `val`. The
+// declaration (float -> float, C calling convention) is added to the
+// module on first use and cached in m_llvmCeil.
+llvm::Value * Instructions::callCeil(llvm::Value *val)
+{
+   if (!m_llvmCeil) {
+      // predeclare: float ceilf(float), with an empty attribute list
+      std::vector<const Type*> paramTypes(1, Type::FloatTy);
+      FunctionType *signature = FunctionType::get(Type::FloatTy,
+                                                  paramTypes,
+                                                  /*isVarArg=*/false);
+      m_llvmCeil = Function::Create(signature,
+                                    GlobalValue::ExternalLinkage,
+                                    "ceilf", m_mod);
+      m_llvmCeil->setCallingConv(CallingConv::C);
+      m_llvmCeil->setAttributes(AttrListPtr());
+   }
+   CallInst *result = m_builder.CreateCall(m_llvmCeil, val, name("ceilf"));
+   result->setCallingConv(CallingConv::C);
+   result->setTailCall(false);
+   return result;
+}
+
+// Emits a call computing the absolute value of the float scalar `val`.
+// The external declaration (float -> float, C calling convention) is
+// added to the module on first use and cached in m_llvmFAbs.
+llvm::Value *Instructions::callFAbs(llvm::Value *val)
+{
+   if (!m_llvmFAbs) {
+      // predeclare the function
+      std::vector<const Type*> fabsArgs;
+      fabsArgs.push_back(Type::FloatTy);
+      AttrListPtr fabsPal;
+      FunctionType* fabsType = FunctionType::get(
+         /*Result=*/Type::FloatTy,
+         /*Params=*/fabsArgs,
+         /*isVarArg=*/false);
+      // The declaration is float(float), so it has to bind to fabsf.
+      // The C library's fabs takes and returns double; declaring "fabs"
+      // with a float signature mismatched the real function.
+      m_llvmFAbs = Function::Create(
+         /*Type=*/fabsType,
+         /*Linkage=*/GlobalValue::ExternalLinkage,
+         /*Name=*/"fabsf", m_mod);
+      m_llvmFAbs->setCallingConv(CallingConv::C);
+      m_llvmFAbs->setAttributes(fabsPal);
+   }
+   CallInst *call = m_builder.CreateCall(m_llvmFAbs, val,
+                                         name("fabs"));
+   call->setCallingConv(CallingConv::C);
+   call->setTailCall(false);
+   return call;
+}
+
+// Emits a call to the external function "expf" on the scalar `val`. The
+// declaration (float -> float, C calling convention) is added to the
+// module on first use and cached in m_llvmFexp.
+llvm::Value * Instructions::callFExp(llvm::Value *val)
+{
+   if (!m_llvmFexp) {
+      // predeclare: float expf(float), with an empty attribute list
+      std::vector<const Type*> paramTypes(1, Type::FloatTy);
+      FunctionType *signature = FunctionType::get(Type::FloatTy,
+                                                  paramTypes,
+                                                  /*isVarArg=*/false);
+      m_llvmFexp = Function::Create(signature,
+                                    GlobalValue::ExternalLinkage,
+                                    "expf", m_mod);
+      m_llvmFexp->setCallingConv(CallingConv::C);
+      m_llvmFexp->setAttributes(AttrListPtr());
+   }
+   CallInst *result = m_builder.CreateCall(m_llvmFexp, val, name("expf"));
+   result->setCallingConv(CallingConv::C);
+   result->setTailCall(false);
+   return result;
+}
+
+// Emits a call to the external function "logf" on the scalar `val`. The
+// declaration (float -> float, C calling convention) is added to the
+// module on first use and cached in m_llvmFlog.
+llvm::Value * Instructions::callFLog(llvm::Value *val)
+{
+   if (!m_llvmFlog) {
+      // predeclare: float logf(float), with an empty attribute list
+      std::vector<const Type*> paramTypes(1, Type::FloatTy);
+      FunctionType *signature = FunctionType::get(Type::FloatTy,
+                                                  paramTypes,
+                                                  /*isVarArg=*/false);
+      m_llvmFlog = Function::Create(signature,
+                                    GlobalValue::ExternalLinkage,
+                                    "logf", m_mod);
+      m_llvmFlog->setCallingConv(CallingConv::C);
+      m_llvmFlog->setAttributes(AttrListPtr());
+   }
+   CallInst *result = m_builder.CreateCall(m_llvmFlog, val, name("logf"));
+   result->setCallingConv(CallingConv::C);
+   result->setTailCall(false);
+   return result;
+}
+
+// Emits a call to the external function "floorf" on the scalar `val`.
+// The declaration (float -> float, C calling convention) is added to the
+// module on first use and cached in m_llvmFloor.
+llvm::Value * Instructions::callFloor(llvm::Value *val)
+{
+   if (!m_llvmFloor) {
+      // predeclare: float floorf(float), with an empty attribute list
+      std::vector<const Type*> paramTypes(1, Type::FloatTy);
+      FunctionType *signature = FunctionType::get(Type::FloatTy,
+                                                  paramTypes,
+                                                  /*isVarArg=*/false);
+      m_llvmFloor = Function::Create(signature,
+                                     GlobalValue::ExternalLinkage,
+                                     "floorf", m_mod);
+      m_llvmFloor->setCallingConv(CallingConv::C);
+      m_llvmFloor->setAttributes(AttrListPtr());
+   }
+   CallInst *result = m_builder.CreateCall(m_llvmFloor, val, name("floorf"));
+   result->setCallingConv(CallingConv::C);
+   result->setTailCall(false);
+   return result;
+}
+
+// Emits a call to "llvm.sqrt.f32" on the scalar `val`. The declaration
+// (float -> float, C calling convention) is added to the module on first
+// use and cached in m_llvmFSqrt.
+llvm::Value *Instructions::callFSqrt(llvm::Value *val)
+{
+   if (!m_llvmFSqrt) {
+      // predeclare the intrinsic with an empty attribute list
+      std::vector<const Type*> paramTypes(1, Type::FloatTy);
+      FunctionType *signature = FunctionType::get(Type::FloatTy,
+                                                  paramTypes,
+                                                  /*isVarArg=*/false);
+      m_llvmFSqrt = Function::Create(signature,
+                                     GlobalValue::ExternalLinkage,
+                                     "llvm.sqrt.f32", m_mod);
+      m_llvmFSqrt->setCallingConv(CallingConv::C);
+      m_llvmFSqrt->setAttributes(AttrListPtr());
+   }
+   CallInst *result = m_builder.CreateCall(m_llvmFSqrt, val, name("sqrt"));
+   result->setCallingConv(CallingConv::C);
+   result->setTailCall(false);
+   return result;
+}
+
+// Emits a call to "llvm.pow.f32" with the scalars (val1, val2). The
+// declaration (float, float -> float, C calling convention) is added to
+// the module on first use and cached in m_llvmPow.
+llvm::Value * Instructions::callPow(llvm::Value *val1, llvm::Value *val2)
+{
+   if (!m_llvmPow) {
+      // predeclare the intrinsic with an empty attribute list
+      std::vector<const Type*> paramTypes(2, Type::FloatTy);
+      FunctionType *signature = FunctionType::get(Type::FloatTy,
+                                                  paramTypes,
+                                                  /*isVarArg=*/false);
+      m_llvmPow = Function::Create(signature,
+                                   GlobalValue::ExternalLinkage,
+                                   "llvm.pow.f32", m_mod);
+      m_llvmPow->setCallingConv(CallingConv::C);
+      m_llvmPow->setAttributes(AttrListPtr());
+   }
+   std::vector<Value*> operands(2);
+   operands[0] = val1;
+   operands[1] = val2;
+   CallInst *result = m_builder.CreateCall(m_llvmPow,
+                                           operands.begin(), operands.end(),
+                                           name("pow"));
+   result->setCallingConv(CallingConv::C);
+   result->setTailCall(false);
+   return result;
+}
+
+// Builds a vector of type m_floatVecType from scalars: starts from the
+// all-zero constant and inserts x, y and z into lanes 0..2. Lane 3 is
+// written only when `w` is non-null; otherwise it keeps the zero value.
+llvm::Value * Instructions::vectorFromVals(llvm::Value *x, llvm::Value *y,
+                                           llvm::Value *z, llvm::Value *w)
+{
+   Value *vec = Constant::getNullValue(m_floatVecType);
+   vec = m_builder.CreateInsertElement(vec, x, m_storage->constantInt(0),
+                                       name("vecx"));
+   vec = m_builder.CreateInsertElement(vec, y, m_storage->constantInt(1),
+                                       name("vecxy"));
+   vec = m_builder.CreateInsertElement(vec, z, m_storage->constantInt(2),
+                                       name("vecxyz"));
+   if (!w)
+      return vec;
+   return m_builder.CreateInsertElement(vec, w, m_storage->constantInt(3),
+                                        name("vecxyzw"));
+}
+
+// Returns a constant vector of type m_floatVecType holding (x, y, z, w).
+llvm::Value * Instructions::constVector(float x, float y, float z, float w)
+{
+   std::vector<Constant*> lanes;
+   lanes.reserve(4);
+   lanes.push_back(ConstantFP::get(APFloat(x)));
+   lanes.push_back(ConstantFP::get(APFloat(y)));
+   lanes.push_back(ConstantFP::get(APFloat(z)));
+   lanes.push_back(ConstantFP::get(APFloat(w)));
+   return ConstantVector::get(m_floatVecType, lanes);
+}
+
+// Adds an external, variadic declaration of "printf" returning a 32-bit
+// integer to the module and returns it. No fixed parameters are declared.
+llvm::Function * Instructions::declarePrintf()
+{
+   const std::vector<const Type*> noFixedParams;
+   FunctionType *signature = FunctionType::get(IntegerType::get(32),
+                                               noFixedParams,
+                                               /*isVarArg=*/true);
+   Function *decl = Function::Create(signature,
+                                     GlobalValue::ExternalLinkage,
+                                     "printf", m_mod);
+   decl->setCallingConv(CallingConv::C);
+   decl->setAttributes(AttrListPtr());
+   return decl;
 }
llvm::Function * Instructions::declareFunc(int label)
@@ -763,7 +1148,7 @@ llvm::Function * Instructions::declareFunc(int label)
args.push_back(vecPtr);
args.push_back(vecPtr);
args.push_back(vecPtr);
- PAListPtr params;
+ AttrListPtr params;
FunctionType *funcType = FunctionType::get(
/*Result=*/Type::VoidTy,
/*Params=*/args,
@@ -774,31 +1159,10 @@ llvm::Function * Instructions::declareFunc(int label)
/*Linkage=*/GlobalValue::ExternalLinkage,
/*Name=*/name.c_str(), m_mod);
func->setCallingConv(CallingConv::C);
- func->setParamAttrs(params);
+ func->setAttributes(params);
return func;
}
-void Instructions::bgnSub(unsigned label)
-{
- llvm::Function *func = findFunction(label);
-
- Function::arg_iterator args = func->arg_begin();
- Value *ptr_INPUT = args++;
- ptr_INPUT->setName("INPUT");
- m_storage->pushArguments(ptr_INPUT);
-
- llvm::BasicBlock *entry = BasicBlock::Create("entry", func, 0);
-
- m_func = func;
- m_builder.SetInsertPoint(entry);
-}
-
-void Instructions::endSub()
-{
- m_func = 0;
- m_builder.SetInsertPoint(0);
-}
-
llvm::Function * Instructions::findFunction(int label)
{
llvm::Function *func = m_functions[label];
@@ -809,17 +1173,6 @@ llvm::Function * Instructions::findFunction(int label)
return func;
}
-llvm::Value * Instructions::constVector(float x, float y, float z, float w)
-{
- std::vector<Constant*> vec(4);
- vec[0] = ConstantFP::get(APFloat(x));
- vec[1] = ConstantFP::get(APFloat(y));
- vec[2] = ConstantFP::get(APFloat(z));
- vec[3] = ConstantFP::get(APFloat(w));
- return ConstantVector::get(m_floatVecType, vec);
-}
-
-
std::vector<llvm::Value*> Instructions::extractVector(llvm::Value *vec)
{
std::vector<llvm::Value*> elems(4);
@@ -834,69 +1187,7 @@ std::vector<llvm::Value*> Instructions::extractVector(llvm::Value *vec)
return elems;
}
-llvm::Value * Instructions::cmp(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3)
-{
- llvm::Function *func = m_mod->getFunction("cmp");
- assert(func);
-
- std::vector<Value*> params;
- params.push_back(in1);
- params.push_back(in2);
- params.push_back(in3);
- CallInst *call = m_builder.CreateCall(func, params.begin(), params.end(), name("cmpres"));
- call->setTailCall(false);
- return call;
-}
-
-llvm::Value * Instructions::cos(llvm::Value *in)
-{
-#if 0
- llvm::Function *func = m_mod->getFunction("vcos");
- assert(func);
- CallInst *call = m_builder.CreateCall(func, in, name("cosres"));
- call->setTailCall(false);
- return call;
-#else
- std::vector<llvm::Value*> elems = extractVector(in);
- Function *func = m_mod->getFunction("cosf");
- assert(func);
- CallInst *cos = m_builder.CreateCall(func, elems[0], name("cosres"));
- cos->setCallingConv(CallingConv::C);
- cos->setTailCall(true);
- return vectorFromVals(cos, cos, cos, cos);
-#endif
-}
-
-llvm::Value * Instructions::scs(llvm::Value *in)
-{
- llvm::Function *func = m_mod->getFunction("scs");
- assert(func);
-
- CallInst *call = m_builder.CreateCall(func, in, name("scsres"));
- call->setTailCall(false);
- return call;
-}
-
-llvm::Value * Instructions::kil(llvm::Value *in)
-{
- llvm::Function *func = m_mod->getFunction("kil");
- assert(func);
-
- CallInst *call = m_builder.CreateCall(func, in, name("kilpres"));
- call->setTailCall(false);
- return call;
-}
-
-llvm::Value * Instructions::sin(llvm::Value *in)
-{
- llvm::Function *func = m_mod->getFunction("vsin");
- assert(func);
-
- CallInst *call = m_builder.CreateCall(func, in, name("sinres"));
- call->setTailCall(false);
- return call;
-}
#endif //MESA_LLVM
diff --git a/src/gallium/auxiliary/gallivm/instructions.h b/src/gallium/auxiliary/gallivm/instructions.h
index d286ce80c7..e18571251e 100644
--- a/src/gallium/auxiliary/gallivm/instructions.h
+++ b/src/gallium/auxiliary/gallivm/instructions.h
@@ -57,15 +57,24 @@ public:
llvm::BasicBlock *currentBlock() const;
llvm::Value *abs(llvm::Value *in1);
- llvm::Value *arl(llvm::Value *in1);
llvm::Value *add(llvm::Value *in1, llvm::Value *in2);
+ llvm::Value *arl(llvm::Value *in1);
void beginLoop();
void bgnSub(unsigned);
void brk();
void cal(int label, llvm::Value *input);
+ llvm::Value *ceil(llvm::Value *in);
+ llvm::Value *clamp(llvm::Value *in);
llvm::Value *cmp(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3);
+ llvm::Value *cnd(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3);
+ llvm::Value *cnd0(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3);
llvm::Value *cos(llvm::Value *in);
llvm::Value *cross(llvm::Value *in1, llvm::Value *in2);
+ llvm::Value *ddx(llvm::Value *in);
+ llvm::Value *ddy(llvm::Value *in);
+ llvm::Value *div(llvm::Value *in1, llvm::Value *in2);
+ llvm::Value *dot2add(llvm::Value *in, llvm::Value *in2, llvm::Value *in3);
+ llvm::Value *dp2(llvm::Value *in1, llvm::Value *in2);
llvm::Value *dp3(llvm::Value *in1, llvm::Value *in2);
llvm::Value *dp4(llvm::Value *in1, llvm::Value *in2);
llvm::Value *dph(llvm::Value *in1, llvm::Value *in2);
@@ -75,6 +84,7 @@ public:
void endLoop();
void end();
void endSub();
+ llvm::Value *exp(llvm::Value *in);
llvm::Value *ex2(llvm::Value *in);
llvm::Value *floor(llvm::Value *in);
llvm::Value *frc(llvm::Value *in);
@@ -82,32 +92,43 @@ public:
llvm::Value *kil(llvm::Value *in);
llvm::Value *lerp(llvm::Value *in1, llvm::Value *in2,
llvm::Value *in3);
- llvm::Value *lit(llvm::Value *in);
llvm::Value *lg2(llvm::Value *in);
+ llvm::Value *lit(llvm::Value *in);
+ llvm::Value *log(llvm::Value *in);
llvm::Value *madd(llvm::Value *in1, llvm::Value *in2,
llvm::Value *in3);
- llvm::Value *min(llvm::Value *in1, llvm::Value *in2);
llvm::Value *max(llvm::Value *in1, llvm::Value *in2);
+ llvm::Value *min(llvm::Value *in1, llvm::Value *in2);
llvm::Value *mul(llvm::Value *in1, llvm::Value *in2);
+ llvm::Value *neg(llvm::Value *in);
+ llvm::Value *nrm(llvm::Value *in);
llvm::Value *pow(llvm::Value *in1, llvm::Value *in2);
llvm::Value *rcp(llvm::Value *in);
llvm::Value *rsq(llvm::Value *in);
llvm::Value *scs(llvm::Value *in);
+ llvm::Value *seq(llvm::Value *in1, llvm::Value *in2);
+ llvm::Value *sfl(llvm::Value *in1, llvm::Value *in2);
llvm::Value *sge(llvm::Value *in1, llvm::Value *in2);
llvm::Value *sgt(llvm::Value *in1, llvm::Value *in2);
llvm::Value *sin(llvm::Value *in);
+ llvm::Value *sle(llvm::Value *in1, llvm::Value *in2);
llvm::Value *slt(llvm::Value *in1, llvm::Value *in2);
+ llvm::Value *sne(llvm::Value *in1, llvm::Value *in2);
+ llvm::Value *str(llvm::Value *in1, llvm::Value *in2);
llvm::Value *sub(llvm::Value *in1, llvm::Value *in2);
llvm::Value *trunc(llvm::Value *in);
+ llvm::Value *x2d(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3);
void printVector(llvm::Value *val);
private:
const char *name(const char *prefix);
+ llvm::Value *callCeil(llvm::Value *val);
llvm::Value *callFAbs(llvm::Value *val);
+ llvm::Value *callFExp(llvm::Value *val);
+ llvm::Value *callFLog(llvm::Value *val);
llvm::Value *callFloor(llvm::Value *val);
llvm::Value *callFSqrt(llvm::Value *val);
- llvm::Value *callFLog(llvm::Value *val);
llvm::Value *callPow(llvm::Value *val1, llvm::Value *val2);
llvm::Value *vectorFromVals(llvm::Value *x, llvm::Value *y,
@@ -125,16 +146,18 @@ private:
llvm::Module *m_mod;
llvm::Function *m_func;
char m_name[32];
- llvm::IRBuilder m_builder;
+ llvm::IRBuilder<> m_builder;
int m_idx;
llvm::VectorType *m_floatVecType;
+ llvm::Function *m_llvmCeil;
llvm::Function *m_llvmFSqrt;
llvm::Function *m_llvmFAbs;
llvm::Function *m_llvmPow;
llvm::Function *m_llvmFloor;
llvm::Function *m_llvmFlog;
+ llvm::Function *m_llvmFexp;
llvm::Function *m_llvmLit;
llvm::Constant *m_fmtPtr;
diff --git a/src/gallium/auxiliary/gallivm/instructionssoa.cpp b/src/gallium/auxiliary/gallivm/instructionssoa.cpp
index efddc04e81..d5600fd22d 100644
--- a/src/gallium/auxiliary/gallivm/instructionssoa.cpp
+++ b/src/gallium/auxiliary/gallivm/instructionssoa.cpp
@@ -90,68 +90,11 @@ llvm::Value * InstructionsSoa::vectorFromVals(llvm::Value *x, llvm::Value *y,
return res;
}
-std::vector<llvm::Value*> InstructionsSoa::arl(const std::vector<llvm::Value*> in)
-{
- std::vector<llvm::Value*> res(4);
-
- //Extract x's
- llvm::Value *x1 = m_builder.CreateExtractElement(in[0],
- m_storage->constantInt(0),
- name("extractX"));
- //cast it to an unsigned int
- x1 = m_builder.CreateFPToUI(x1, IntegerType::get(32), name("x1IntCast"));
-
- res[0] = x1;//vectorFromVals(x1, x2, x3, x4);
- //only x is valid. the others shouldn't be necessary
- /*
- res[1] = Constant::getNullValue(m_floatVecType);
- res[2] = Constant::getNullValue(m_floatVecType);
- res[3] = Constant::getNullValue(m_floatVecType);
- */
-
- return res;
-}
-
-
-std::vector<llvm::Value*> InstructionsSoa::add(const std::vector<llvm::Value*> in1,
- const std::vector<llvm::Value*> in2)
-{
- std::vector<llvm::Value*> res(4);
-
- res[0] = m_builder.CreateAdd(in1[0], in2[0], name("addx"));
- res[1] = m_builder.CreateAdd(in1[1], in2[1], name("addy"));
- res[2] = m_builder.CreateAdd(in1[2], in2[2], name("addz"));
- res[3] = m_builder.CreateAdd(in1[3], in2[3], name("addw"));
-
- return res;
-}
-
-std::vector<llvm::Value*> InstructionsSoa::mul(const std::vector<llvm::Value*> in1,
- const std::vector<llvm::Value*> in2)
-{
- std::vector<llvm::Value*> res(4);
-
- res[0] = m_builder.CreateMul(in1[0], in2[0], name("mulx"));
- res[1] = m_builder.CreateMul(in1[1], in2[1], name("muly"));
- res[2] = m_builder.CreateMul(in1[2], in2[2], name("mulz"));
- res[3] = m_builder.CreateMul(in1[3], in2[3], name("mulw"));
-
- return res;
-}
-
void InstructionsSoa::end()
{
m_builder.CreateRetVoid();
}
-std::vector<llvm::Value*> InstructionsSoa::madd(const std::vector<llvm::Value*> in1,
- const std::vector<llvm::Value*> in2,
- const std::vector<llvm::Value*> in3)
-{
- std::vector<llvm::Value*> res = mul(in1, in2);
- return add(res, in3);
-}
-
std::vector<llvm::Value*> InstructionsSoa::extractVector(llvm::Value *vector)
{
std::vector<llvm::Value*> res(4);
@@ -171,6 +114,11 @@ std::vector<llvm::Value*> InstructionsSoa::extractVector(llvm::Value *vector)
return res;
}
+/** Return a pointer to this object's own IR builder (m_builder). */
+llvm::IRBuilder<>* InstructionsSoa::getIRBuilder()
+{
+   llvm::IRBuilder<> *builder = &m_builder;
+   return builder;
+}
+
void InstructionsSoa::createFunctionMap()
{
m_functionsMap[TGSI_OPCODE_ABS] = "abs";
@@ -181,6 +129,7 @@ void InstructionsSoa::createFunctionMap()
m_functionsMap[TGSI_OPCODE_POWER] = "pow";
m_functionsMap[TGSI_OPCODE_LIT] = "lit";
m_functionsMap[TGSI_OPCODE_RSQ] = "rsq";
+ m_functionsMap[TGSI_OPCODE_SLT] = "slt";
}
void InstructionsSoa::createDependencies()
@@ -273,6 +222,41 @@ std::vector<llvm::Value*> InstructionsSoa::abs(const std::vector<llvm::Value*> i
return callBuiltin(func, in1);
}
+/**
+ * Emit one add per channel: result[i] = in1[i] + in2[i] for the four
+ * channels x, y, z, w.
+ */
+std::vector<llvm::Value*> InstructionsSoa::add(const std::vector<llvm::Value*> in1,
+                                               const std::vector<llvm::Value*> in2)
+{
+   static const char *const labels[4] = { "addx", "addy", "addz", "addw" };
+   std::vector<llvm::Value*> sum(4);
+
+   for (int ch = 0; ch < 4; ++ch)
+      sum[ch] = m_builder.CreateAdd(in1[ch], in2[ch], name(labels[ch]));
+
+   return sum;
+}
+
+/**
+ * Extract element 0 of in[0] and convert it from floating point to a
+ * 32-bit unsigned integer.  Only slot 0 of the returned vector is filled;
+ * slots 1..3 are left as null pointers.
+ */
+std::vector<llvm::Value*> InstructionsSoa::arl(const std::vector<llvm::Value*> in)
+{
+   // first element of the x channel
+   llvm::Value *lane0 = m_builder.CreateExtractElement(in[0],
+                                                       m_storage->constantInt(0),
+                                                       name("extractX"));
+   // float -> unsigned 32-bit integer
+   llvm::Value *asInt = m_builder.CreateFPToUI(lane0, IntegerType::get(32),
+                                               name("x1IntCast"));
+
+   // only x is valid, the remaining slots are intentionally not set
+   std::vector<llvm::Value*> res(4);
+   res[0] = asInt;
+   return res;
+}
+
std::vector<llvm::Value*> InstructionsSoa::dp3(const std::vector<llvm::Value*> in1,
const std::vector<llvm::Value*> in2)
{
@@ -280,6 +264,98 @@ std::vector<llvm::Value*> InstructionsSoa::dp3(const std::vector<llvm::Value*> i
return callBuiltin(func, in1, in2);
}
+/** Call the builtin returned by function(TGSI_OPCODE_LIT) on in. */
+std::vector<llvm::Value*> InstructionsSoa::lit(const std::vector<llvm::Value*> in)
+{
+   return callBuiltin(function(TGSI_OPCODE_LIT), in);
+}
+
+/** Multiply-add built from the existing helpers: mul(in1, in2) then add in3. */
+std::vector<llvm::Value*> InstructionsSoa::madd(const std::vector<llvm::Value*> in1,
+                                                const std::vector<llvm::Value*> in2,
+                                                const std::vector<llvm::Value*> in3)
+{
+   return add(mul(in1, in2), in3);
+}
+
+/** Call the builtin returned by function(TGSI_OPCODE_MAX) on in1 and in2. */
+std::vector<llvm::Value*> InstructionsSoa::max(const std::vector<llvm::Value*> in1,
+                                               const std::vector<llvm::Value*> in2)
+{
+   return callBuiltin(function(TGSI_OPCODE_MAX), in1, in2);
+}
+
+/** Call the builtin returned by function(TGSI_OPCODE_MIN) on in1 and in2. */
+std::vector<llvm::Value*> InstructionsSoa::min(const std::vector<llvm::Value*> in1,
+                                               const std::vector<llvm::Value*> in2)
+{
+   return callBuiltin(function(TGSI_OPCODE_MIN), in1, in2);
+}
+
+/**
+ * Emit one multiply per channel: result[i] = in1[i] * in2[i] for the four
+ * channels x, y, z, w.
+ */
+std::vector<llvm::Value*> InstructionsSoa::mul(const std::vector<llvm::Value*> in1,
+                                               const std::vector<llvm::Value*> in2)
+{
+   static const char *const labels[4] = { "mulx", "muly", "mulz", "mulw" };
+   std::vector<llvm::Value*> product(4);
+
+   for (int ch = 0; ch < 4; ++ch)
+      product[ch] = m_builder.CreateMul(in1[ch], in2[ch], name(labels[ch]));
+
+   return product;
+}
+
+/** Call the builtin returned by function(TGSI_OPCODE_POWER) on in1 and in2. */
+std::vector<llvm::Value*> InstructionsSoa::pow(const std::vector<llvm::Value*> in1,
+                                               const std::vector<llvm::Value*> in2)
+{
+   return callBuiltin(function(TGSI_OPCODE_POWER), in1, in2);
+}
+
+/** Call the builtin returned by function(TGSI_OPCODE_RSQ) on in. */
+std::vector<llvm::Value*> InstructionsSoa::rsq(const std::vector<llvm::Value*> in)
+{
+   return callBuiltin(function(TGSI_OPCODE_RSQ), in);
+}
+
+/** Call the builtin returned by function(TGSI_OPCODE_SLT) on in1 and in2. */
+std::vector<llvm::Value*> InstructionsSoa::slt(const std::vector<llvm::Value*> in1,
+                                               const std::vector<llvm::Value*> in2)
+{
+   return callBuiltin(function(TGSI_OPCODE_SLT), in1, in2);
+}
+
+/**
+ * Emit one subtract per channel: result[i] = in1[i] - in2[i] for the four
+ * channels x, y, z, w.
+ */
+std::vector<llvm::Value*> InstructionsSoa::sub(const std::vector<llvm::Value*> in1,
+                                               const std::vector<llvm::Value*> in2)
+{
+   static const char *const labels[4] = { "subx", "suby", "subz", "subw" };
+   std::vector<llvm::Value*> diff(4);
+
+   for (int ch = 0; ch < 4; ++ch)
+      diff[ch] = m_builder.CreateSub(in1[ch], in2[ch], name(labels[ch]));
+
+   return diff;
+}
+
+/**
+ * Debugging aid: walk every basic block and instruction of func, printing
+ * each instruction followed by the pointer value and index of each of its
+ * operands to std::cout.
+ */
+void checkFunction(Function *func)
+{
+   const Function::const_iterator blockEnd = func->end();
+   for (Function::const_iterator block = func->begin();
+        block != blockEnd; ++block) {
+      const BasicBlock::const_iterator instEnd = block->end();
+      for (BasicBlock::const_iterator inst = block->begin();
+           inst != instEnd; ++inst) {
+         std::cout<< "Instr = "<<*inst;
+         const unsigned numOps = inst->getNumOperands();
+         for (unsigned op = 0; op < numOps; ++op) {
+            const Value *operand = inst->getOperand(op);
+            std::cout<< "\top = "<<operand<<"("<<op<<")"<<std::endl;
+         }
+      }
+   }
+}
+
llvm::Value * InstructionsSoa::allocaTemp()
{
VectorType *vector = VectorType::get(Type::FloatTy, 4);
@@ -399,46 +475,6 @@ std::vector<Value*> InstructionsSoa::callBuiltin(llvm::Function *func, const std
return allocaToResult(allocaPtr);
}
-std::vector<llvm::Value*> InstructionsSoa::pow(const std::vector<llvm::Value*> in1,
- const std::vector<llvm::Value*> in2)
-{
- llvm::Function *func = function(TGSI_OPCODE_POWER);
- return callBuiltin(func, in1, in2);
-}
-
-std::vector<llvm::Value*> InstructionsSoa::min(const std::vector<llvm::Value*> in1,
- const std::vector<llvm::Value*> in2)
-{
- llvm::Function *func = function(TGSI_OPCODE_MIN);
- return callBuiltin(func, in1, in2);
-}
-
-
-std::vector<llvm::Value*> InstructionsSoa::max(const std::vector<llvm::Value*> in1,
- const std::vector<llvm::Value*> in2)
-{
- llvm::Function *func = function(TGSI_OPCODE_MAX);
- return callBuiltin(func, in1, in2);
-}
-
-void checkFunction(Function *func)
-{
- for (Function::const_iterator BI = func->begin(), BE = func->end();
- BI != BE; ++BI) {
- const BasicBlock &BB = *BI;
- for (BasicBlock::const_iterator II = BB.begin(), IE = BB.end();
- II != IE; ++II) {
- const Instruction &I = *II;
- std::cout<< "Instr = "<<I;
- for (unsigned op = 0, E = I.getNumOperands(); op != E; ++op) {
- const Value *Op = I.getOperand(op);
- std::cout<< "\top = "<<Op<<"("<<op<<")"<<std::endl;
- //I->setOperand(op, V);
- }
- }
- }
-}
-
void InstructionsSoa::injectFunction(llvm::Function *originalFunc, int op)
{
assert(originalFunc);
@@ -458,8 +494,8 @@ void InstructionsSoa::injectFunction(llvm::Function *originalFunc, int op)
func = Function::Create(originalFunc->getFunctionType(), GlobalValue::ExternalLinkage,
originalFunc->getName(), currentModule());
func->setCallingConv(CallingConv::C);
- const PAListPtr pal;
- func->setParamAttrs(pal);
+ const AttrListPtr pal;
+ func->setAttributes(pal);
currentModule()->dump();
} else {
DenseMap<const Value*, Value *> val;
@@ -483,28 +519,4 @@ void InstructionsSoa::injectFunction(llvm::Function *originalFunc, int op)
}
}
-std::vector<llvm::Value*> InstructionsSoa::sub(const std::vector<llvm::Value*> in1,
- const std::vector<llvm::Value*> in2)
-{
- std::vector<llvm::Value*> res(4);
-
- res[0] = m_builder.CreateSub(in1[0], in2[0], name("subx"));
- res[1] = m_builder.CreateSub(in1[1], in2[1], name("suby"));
- res[2] = m_builder.CreateSub(in1[2], in2[2], name("subz"));
- res[3] = m_builder.CreateSub(in1[3], in2[3], name("subw"));
-
- return res;
-}
-
-std::vector<llvm::Value*> InstructionsSoa::lit(const std::vector<llvm::Value*> in)
-{
- llvm::Function *func = function(TGSI_OPCODE_LIT);
- return callBuiltin(func, in);
-}
-
-std::vector<llvm::Value*> InstructionsSoa::rsq(const std::vector<llvm::Value*> in)
-{
- llvm::Function *func = function(TGSI_OPCODE_RSQ);
- return callBuiltin(func, in);
-}
diff --git a/src/gallium/auxiliary/gallivm/instructionssoa.h b/src/gallium/auxiliary/gallivm/instructionssoa.h
index 3e20b652dd..d6831e0a6b 100644
--- a/src/gallium/auxiliary/gallivm/instructionssoa.h
+++ b/src/gallium/auxiliary/gallivm/instructionssoa.h
@@ -69,11 +69,14 @@ public:
std::vector<llvm::Value*> pow(const std::vector<llvm::Value*> in1,
const std::vector<llvm::Value*> in2);
std::vector<llvm::Value*> rsq(const std::vector<llvm::Value*> in1);
+ std::vector<llvm::Value*> slt(const std::vector<llvm::Value*> in1,
+ const std::vector<llvm::Value*> in2);
std::vector<llvm::Value*> sub(const std::vector<llvm::Value*> in1,
const std::vector<llvm::Value*> in2);
void end();
std::vector<llvm::Value*> extractVector(llvm::Value *vector);
+ llvm::IRBuilder<>* getIRBuilder();
private:
const char * name(const char *prefix) const;
llvm::Value *vectorFromVals(llvm::Value *x, llvm::Value *y,
@@ -96,7 +99,7 @@ private:
const std::vector<llvm::Value*> in3);
void injectFunction(llvm::Function *originalFunc, int op = TGSI_OPCODE_LAST);
private:
- llvm::IRBuilder m_builder;
+ llvm::IRBuilder<> m_builder;
StorageSoa *m_storage;
std::map<int, std::string> m_functionsMap;
diff --git a/src/gallium/auxiliary/gallivm/soabuiltins.c b/src/gallium/auxiliary/gallivm/soabuiltins.c
index 78f84510e2..cb85e1734e 100644
--- a/src/gallium/auxiliary/gallivm/soabuiltins.c
+++ b/src/gallium/auxiliary/gallivm/soabuiltins.c
@@ -36,6 +36,8 @@ typedef __attribute__(( ext_vector_type(4) )) float float4;
extern float fabsf(float val);
+/* helpers */
+
float4 absvec(float4 vec)
{
float4 res;
@@ -47,6 +49,58 @@ float4 absvec(float4 vec)
return res;
}
+/* Component-wise maximum: each lane is a's value if it is greater than b's,
+ * otherwise b's value. */
+float4 maxvec(float4 a, float4 b)
+{
+   float4 hi;
+   hi.x = (a.x > b.x) ? a.x : b.x;
+   hi.y = (a.y > b.y) ? a.y : b.y;
+   hi.z = (a.z > b.z) ? a.z : b.z;
+   hi.w = (a.w > b.w) ? a.w : b.w;
+   return hi;
+}
+
+/* Component-wise minimum: each lane is a's value if it is less than b's,
+ * otherwise b's value. */
+float4 minvec(float4 a, float4 b)
+{
+   float4 lo;
+   lo.x = (a.x < b.x) ? a.x : b.x;
+   lo.y = (a.y < b.y) ? a.y : b.y;
+   lo.z = (a.z < b.z) ? a.z : b.z;
+   lo.w = (a.w < b.w) ? a.w : b.w;
+   return lo;
+}
+
+extern float powf(float num, float p);
+extern float sqrtf(float x);
+
+/* Component-wise power: lane i of the result is powf(vec[i], q[i]). */
+float4 powvec(float4 vec, float4 q)
+{
+   const float rx = powf(vec.x, q.x);
+   const float ry = powf(vec.y, q.y);
+   const float rz = powf(vec.z, q.z);
+   const float rw = powf(vec.w, q.w);
+   return (float4){rx, ry, rz, rw};
+}
+
+/* Component-wise square root: lane i of the result is sqrtf(vec[i]). */
+float4 sqrtvec(float4 vec)
+{
+   const float rx = sqrtf(vec.x);
+   const float ry = sqrtf(vec.y);
+   const float rz = sqrtf(vec.z);
+   const float rw = sqrtf(vec.w);
+   return (float4){rx, ry, rz, rw};
+}
+
+/* Component-wise "set on less than": a lane is 1.0 where v1 < v2 and 0.0
+ * otherwise. */
+float4 sltvec(float4 v1, float4 v2)
+{
+   return (float4){(v1.x < v2.x) ? 1.0f : 0.0f,
+                   (v1.y < v2.y) ? 1.0f : 0.0f,
+                   (v1.z < v2.z) ? 1.0f : 0.0f,
+                   (v1.w < v2.w) ? 1.0f : 0.0f};
+}
+
+
+/* instructions */
+
void abs(float4 *res,
float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w)
{
@@ -69,7 +123,6 @@ void dp3(float4 *res,
res[3] = dot;
}
-
void dp4(float4 *res,
float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w,
float4 tmp1x, float4 tmp1y, float4 tmp1z, float4 tmp1w)
@@ -83,35 +136,25 @@ void dp4(float4 *res,
res[3] = dot;
}
-extern float powf(float num, float p);
-extern float sqrtf(float x);
-
-float4 powvec(float4 vec, float4 q)
-{
- float4 p;
- p.x = powf(vec.x, q.x);
- p.y = powf(vec.y, q.y);
- p.z = powf(vec.z, q.z);
- p.w = powf(vec.w, q.w);
- return p;
-}
-
-void pow(float4 *res,
- float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w,
- float4 tmp1x, float4 tmp1y, float4 tmp1z, float4 tmp1w)
+void lit(float4 *res,
+ float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w)
{
- res[0] = powvec(tmp0x, tmp1x);
- res[1] = res[0];
- res[2] = res[0];
- res[3] = res[0];
-}
+ const float4 zerovec = (float4) {0.0, 0.0, 0.0, 0.0};
+ const float4 min128 = (float4) {-128.f, -128.f, -128.f, -128.f};
+ const float4 plus128 = (float4) {128.f, 128.f, 128.f, 128.f};
-float4 minvec(float4 a, float4 b)
-{
- return (float4){(a.x < b.x) ? a.x : b.x,
- (a.y < b.y) ? a.y : b.y,
- (a.z < b.z) ? a.z : b.z,
- (a.w < b.w) ? a.w : b.w};
+ res[0] = (float4){1.0, 1.0, 1.0, 1.0};
+ if (tmp0x.x > 0) {
+ float4 tmpy = maxvec(tmp0y, zerovec);
+ float4 tmpw = minvec(tmp0w, plus128);
+ tmpw = maxvec(tmpw, min128);
+ res[1] = tmp0x;
+ res[2] = powvec(tmpy, tmpw);
+ } else {
+ res[1] = zerovec;
+ res[2] = zerovec;
+ }
+ res[3] = (float4){1.0, 1.0, 1.0, 1.0};
}
void min(float4 *res,
@@ -125,14 +168,6 @@ void min(float4 *res,
}
-float4 maxvec(float4 a, float4 b)
-{
- return (float4){(a.x > b.x) ? a.x : b.x,
- (a.y > b.y) ? a.y : b.y,
- (a.z > b.z) ? a.z : b.z,
- (a.w > b.w) ? a.w : b.w};
-}
-
void max(float4 *res,
float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w,
float4 tmp1x, float4 tmp1y, float4 tmp1z, float4 tmp1w)
@@ -143,37 +178,14 @@ void max(float4 *res,
res[3] = maxvec(tmp0w, tmp1w);
}
-
-void lit(float4 *res,
- float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w)
-{
- const float4 zerovec = (float4) {0.0, 0.0, 0.0, 0.0};
- const float4 min128 = (float4) {-128.f, -128.f, -128.f, -128.f};
- const float4 plus128 = (float4) {128.f, 128.f, 128.f, 128.f};
-
- res[0] = (float4){1.0, 1.0, 1.0, 1.0};
- if (tmp0x.x > 0) {
- float4 tmpy = maxvec(tmp0y, zerovec);
- float4 tmpw = minvec(tmp0w, plus128);
- tmpw = maxvec(tmpw, min128);
- res[1] = tmp0x;
- res[2] = powvec(tmpy, tmpw);
- } else {
- res[1] = zerovec;
- res[2] = zerovec;
- }
- res[3] = (float4){1.0, 1.0, 1.0, 1.0};
-}
-
-
-float4 sqrtvec(float4 vec)
+void pow(float4 *res,
+ float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w,
+ float4 tmp1x, float4 tmp1y, float4 tmp1z, float4 tmp1w)
{
- float4 p;
- p.x = sqrtf(vec.x);
- p.y = sqrtf(vec.y);
- p.z = sqrtf(vec.z);
- p.w = sqrtf(vec.w);
- return p;
+ res[0] = powvec(tmp0x, tmp1x);
+ res[1] = res[0];
+ res[2] = res[0];
+ res[3] = res[0];
}
void rsq(float4 *res,
@@ -185,3 +197,14 @@ void rsq(float4 *res,
res[2] = onevec/sqrtvec(absvec(tmp0z));
res[3] = onevec/sqrtvec(absvec(tmp0w));
}
+
+/* SLT instruction: res[0..3] receive sltvec() of the x, y, z and w channel
+ * vectors of the two operands, respectively. */
+void slt(float4 *res,
+         float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w,
+         float4 tmp1x, float4 tmp1y, float4 tmp1z, float4 tmp1w)
+{
+   const float4 lhs[4] = {tmp0x, tmp0y, tmp0z, tmp0w};
+   const float4 rhs[4] = {tmp1x, tmp1y, tmp1z, tmp1w};
+   int ch;
+
+   for (ch = 0; ch < 4; ++ch)
+      res[ch] = sltvec(lhs[ch], rhs[ch]);
+}
+
diff --git a/src/gallium/auxiliary/gallivm/storagesoa.cpp b/src/gallium/auxiliary/gallivm/storagesoa.cpp
index 78d754371f..4fc075cf6d 100644
--- a/src/gallium/auxiliary/gallivm/storagesoa.cpp
+++ b/src/gallium/auxiliary/gallivm/storagesoa.cpp
@@ -93,7 +93,7 @@ void StorageSoa::declareImmediates()
std::vector<float> vals(4);
std::vector<Constant*> channelArray;
- vals[0] = vec[0]; vals[1] = vec[0]; vals[2] = vec[0]; vals[3] = vec[0];
+ vals[0] = vec[0]; vals[1] = vec[1]; vals[2] = vec[2]; vals[3] = vec[3];
llvm::Constant *xChannel = createConstGlobalVector(vals);
vals[0] = vec[1]; vals[1] = vec[1]; vals[2] = vec[1]; vals[3] = vec[1];
@@ -144,22 +144,43 @@ std::vector<llvm::Value*> StorageSoa::inputElement(llvm::Value *idx)
return res;
}
-std::vector<llvm::Value*> StorageSoa::constElement(llvm::Value *idx)
+/**
+ * Broadcast one component of a vector to all four lanes.
+ *
+ * Extracts element cc of vector and inserts that scalar into lanes 0..3 of
+ * a fresh <4 x float> value, using the supplied builder.
+ *
+ * @param m_builder  builder used to emit the extract/insert instructions
+ * @param vector     source vector value
+ * @param cc         index of the component to replicate
+ * @return the vector with every lane set to the extracted component
+ */
+llvm::Value* StorageSoa::unpackConstElement(llvm::IRBuilder<>* m_builder, llvm::Value* vector, int cc)
+{
+   // A single scalar is all that is needed; no temporary container required.
+   llvm::Value *scalar = m_builder->CreateExtractElement(vector,
+                                                         constantInt(cc),
+                                                         name("x"));
+
+   VectorType *vectorType = VectorType::get(Type::FloatTy, 4);
+   Constant *constVector = Constant::getNullValue(vectorType);
+   Value *res = m_builder->CreateInsertElement(constVector, scalar,
+                                               constantInt(0),
+                                               name("vecx"));
+   res = m_builder->CreateInsertElement(res, scalar, constantInt(1),
+                                        name("vecxx"));
+   res = m_builder->CreateInsertElement(res, scalar, constantInt(2),
+                                        name("vecxxx"));
+   res = m_builder->CreateInsertElement(res, scalar, constantInt(3),
+                                        name("vecxxxx"));
+   return res;
+}
+
+std::vector<llvm::Value*> StorageSoa::constElement(llvm::IRBuilder<>* m_builder, llvm::Value *idx)
+{
+ llvm::Value* res;
+ std::vector<llvm::Value*> res2(4);
llvm::Value *xChannel, *yChannel, *zChannel, *wChannel;
xChannel = elementPointer(m_consts, idx, 0);
- yChannel = elementPointer(m_consts, idx, 1);
- zChannel = elementPointer(m_consts, idx, 2);
- wChannel = elementPointer(m_consts, idx, 3);
- res[0] = alignedArrayLoad(xChannel);
- res[1] = alignedArrayLoad(yChannel);
- res[2] = alignedArrayLoad(zChannel);
- res[3] = alignedArrayLoad(wChannel);
+ res = alignedArrayLoad(xChannel);
- return res;
+ res2[0]=unpackConstElement(m_builder, res,0);
+ res2[1]=unpackConstElement(m_builder, res,1);
+ res2[2]=unpackConstElement(m_builder, res,2);
+ res2[3]=unpackConstElement(m_builder, res,3);
+
+ return res2;
}
std::vector<llvm::Value*> StorageSoa::outputElement(llvm::Value *idx)
@@ -260,6 +281,12 @@ llvm::Module * StorageSoa::currentModule() const
return m_block->getParent()->getParent();
}
+/** Wrap val in an APFloat and return the corresponding ConstantFP. */
+llvm::Constant * StorageSoa::createConstGlobalFloat(const float val)
+{
+   return ConstantFP::get(APFloat(val));
+}
+
llvm::Constant * StorageSoa::createConstGlobalVector(const std::vector<float> &vec)
{
VectorType *vectorType = VectorType::get(Type::FloatTy, 4);
@@ -278,7 +305,7 @@ llvm::Constant * StorageSoa::createConstGlobalVector(const std::vector<float> &v
}
std::vector<llvm::Value*> StorageSoa::load(enum tgsi_file_type type, int idx, int swizzle,
- llvm::Value *indIdx)
+ llvm::IRBuilder<>* m_builder,llvm::Value *indIdx)
{
std::vector<llvm::Value*> val(4);
@@ -302,7 +329,7 @@ std::vector<llvm::Value*> StorageSoa::load(enum tgsi_file_type type, int idx, in
val = tempElement(realIndex);
break;
case TGSI_FILE_CONSTANT:
- val = constElement(realIndex);
+ val = constElement(m_builder, realIndex);
break;
case TGSI_FILE_IMMEDIATE:
val = immediateElement(realIndex);
diff --git a/src/gallium/auxiliary/gallivm/storagesoa.h b/src/gallium/auxiliary/gallivm/storagesoa.h
index ae2fc7c6ae..f21ca6ec43 100644
--- a/src/gallium/auxiliary/gallivm/storagesoa.h
+++ b/src/gallium/auxiliary/gallivm/storagesoa.h
@@ -29,6 +29,7 @@
#define STORAGESOA_H
#include <pipe/p_shader_tokens.h>
+#include <llvm/Support/IRBuilder.h>
#include <vector>
#include <list>
@@ -56,7 +57,7 @@ public:
std::vector<llvm::Value*> load(enum tgsi_file_type type, int idx, int swizzle,
- llvm::Value *indIdx =0);
+ llvm::IRBuilder<>* m_builder, llvm::Value *indIdx =0);
void store(enum tgsi_file_type type, int idx, const std::vector<llvm::Value*> &val,
int mask);
@@ -76,10 +77,12 @@ private:
const char *name(const char *prefix) const;
llvm::Value *alignedArrayLoad(llvm::Value *val);
llvm::Module *currentModule() const;
+ llvm::Constant *createConstGlobalFloat(const float val);
llvm::Constant *createConstGlobalVector(const std::vector<float> &vec);
std::vector<llvm::Value*> inputElement(llvm::Value *indIdx);
- std::vector<llvm::Value*> constElement(llvm::Value *indIdx);
+ llvm::Value* unpackConstElement(llvm::IRBuilder<>* m_builder, llvm::Value *indIdx, int cc);
+ std::vector<llvm::Value*> constElement(llvm::IRBuilder<>* m_builder, llvm::Value *indIdx);
std::vector<llvm::Value*> outputElement(llvm::Value *indIdx);
std::vector<llvm::Value*> tempElement(llvm::Value *indIdx);
std::vector<llvm::Value*> immediateElement(llvm::Value *indIdx);
diff --git a/src/gallium/auxiliary/gallivm/tgsitollvm.cpp b/src/gallium/auxiliary/gallivm/tgsitollvm.cpp
index cc1516a45e..1191a6cae9 100644
--- a/src/gallium/auxiliary/gallivm/tgsitollvm.cpp
+++ b/src/gallium/auxiliary/gallivm/tgsitollvm.cpp
@@ -52,7 +52,7 @@ static inline FunctionType *vertexShaderFunctionType()
// pass are castable to the following:
// [4 x <4 x float>] inputs,
// [4 x <4 x float>] output,
- // [4 x [4 x float]] consts,
+ // [1 x [4 x float]] consts,
// [4 x <4 x float>] temps
std::vector<const Type*> funcArgs;
@@ -61,7 +61,7 @@ static inline FunctionType *vertexShaderFunctionType()
PointerType *vectorArrayPtr = PointerType::get(vectorArray, 0);
ArrayType *floatArray = ArrayType::get(Type::FloatTy, 4);
- ArrayType *constsArray = ArrayType::get(floatArray, 4);
+ ArrayType *constsArray = ArrayType::get(floatArray, 1);
PointerType *constsArrayPtr = PointerType::get(constsArray, 0);
funcArgs.push_back(vectorArrayPtr);//inputs
@@ -246,6 +246,7 @@ translate_instruction(llvm::Module *module,
val = storage->constElement(src->SrcRegister.Index, indIdx);
} else if (src->SrcRegister.File == TGSI_FILE_INPUT) {
val = storage->inputElement(src->SrcRegister.Index, indIdx);
+ // FIXME we should not be generating elements for temporaries, this creates useless memory writes
} else if (src->SrcRegister.File == TGSI_FILE_TEMPORARY) {
val = storage->tempElement(src->SrcRegister.Index);
} else if (src->SrcRegister.File == TGSI_FILE_OUTPUT) {
@@ -286,9 +287,13 @@ translate_instruction(llvm::Module *module,
out = instr->rsq(inputs[0]);
}
break;
- case TGSI_OPCODE_EXP:
+ case TGSI_OPCODE_EXP: {
+ out = instr->exp(inputs[0]);
+ }
break;
- case TGSI_OPCODE_LOG:
+ case TGSI_OPCODE_LOG: {
+ out = instr->log(inputs[0]);
+ }
break;
case TGSI_OPCODE_MUL: {
out = instr->mul(inputs[0], inputs[1]);
@@ -338,21 +343,31 @@ translate_instruction(llvm::Module *module,
out = instr->lerp(inputs[0], inputs[1], inputs[2]);
}
break;
- case TGSI_OPCODE_CND:
+ case TGSI_OPCODE_CND: {
+ out = instr->cnd(inputs[0], inputs[1], inputs[2]);
+ }
break;
- case TGSI_OPCODE_CND0:
+ case TGSI_OPCODE_CND0: {
+ out = instr->cnd0(inputs[0], inputs[1], inputs[2]);
+ }
break;
- case TGSI_OPCODE_DOT2ADD:
+ case TGSI_OPCODE_DOT2ADD: {
+ out = instr->dot2add(inputs[0], inputs[1], inputs[2]);
+ }
break;
case TGSI_OPCODE_INDEX:
break;
- case TGSI_OPCODE_NEGATE:
+ case TGSI_OPCODE_NEGATE: {
+ out = instr->neg(inputs[0]);
+ }
break;
case TGSI_OPCODE_FRAC: {
out = instr->frc(inputs[0]);
}
break;
- case TGSI_OPCODE_CLAMP:
+ case TGSI_OPCODE_CLAMP: {
+ out = instr->clamp(inputs[0]);
+ }
break;
case TGSI_OPCODE_FLOOR: {
out = instr->floor(inputs[0]);
@@ -392,9 +407,13 @@ translate_instruction(llvm::Module *module,
out = instr->cos(inputs[0]);
}
break;
- case TGSI_OPCODE_DDX:
+ case TGSI_OPCODE_DDX: {
+ out = instr->ddx(inputs[0]);
+ }
break;
- case TGSI_OPCODE_DDY:
+ case TGSI_OPCODE_DDY: {
+ out = instr->ddy(inputs[0]);
+ }
break;
case TGSI_OPCODE_KILP:
break;
@@ -408,9 +427,13 @@ translate_instruction(llvm::Module *module,
break;
case TGSI_OPCODE_RFL:
break;
- case TGSI_OPCODE_SEQ:
+ case TGSI_OPCODE_SEQ: {
+ out = instr->seq(inputs[0], inputs[1]);
+ }
break;
- case TGSI_OPCODE_SFL:
+ case TGSI_OPCODE_SFL: {
+ out = instr->sfl(inputs[0], inputs[1]);
+ }
break;
case TGSI_OPCODE_SGT: {
out = instr->sgt(inputs[0], inputs[1]);
@@ -420,11 +443,17 @@ translate_instruction(llvm::Module *module,
out = instr->sin(inputs[0]);
}
break;
- case TGSI_OPCODE_SLE:
+ case TGSI_OPCODE_SLE: {
+ out = instr->sle(inputs[0], inputs[1]);
+ }
break;
- case TGSI_OPCODE_SNE:
+ case TGSI_OPCODE_SNE: {
+ out = instr->sne(inputs[0], inputs[1]);
+ }
break;
- case TGSI_OPCODE_STR:
+ case TGSI_OPCODE_STR: {
+ out = instr->str(inputs[0], inputs[1]);
+ }
break;
case TGSI_OPCODE_TEX:
break;
@@ -438,7 +467,9 @@ translate_instruction(llvm::Module *module,
break;
case TGSI_OPCODE_UP4UB:
break;
- case TGSI_OPCODE_X2D:
+ case TGSI_OPCODE_X2D: {
+ out = instr->x2d(inputs[0], inputs[1], inputs[2]);
+ }
break;
case TGSI_OPCODE_ARA:
break;
@@ -468,11 +499,18 @@ translate_instruction(llvm::Module *module,
break;
case TGSI_OPCODE_TXB:
break;
- case TGSI_OPCODE_NRM:
+ case TGSI_OPCODE_NRM4:
+ case TGSI_OPCODE_NRM: {
+ out = instr->nrm(inputs[0]);
+ }
break;
- case TGSI_OPCODE_DIV:
+ case TGSI_OPCODE_DIV: {
+ out = instr->div(inputs[0], inputs[1]);
+ }
break;
- case TGSI_OPCODE_DP2:
+ case TGSI_OPCODE_DP2: {
+ out = instr->dp2(inputs[0], inputs[1]);
+ }
break;
case TGSI_OPCODE_TXL:
break;
@@ -590,8 +628,6 @@ translate_instruction(llvm::Module *module,
break;
case TGSI_OPCODE_M3X2:
break;
- case TGSI_OPCODE_NRM4:
- break;
case TGSI_OPCODE_CALLNZ:
break;
case TGSI_OPCODE_IFC:
@@ -641,6 +677,7 @@ translate_instruction(llvm::Module *module,
if (dst->DstRegister.File == TGSI_FILE_OUTPUT) {
storage->setOutputElement(dst->DstRegister.Index, out, dst->DstRegister.WriteMask);
+ // FIXME we should not be generating elements for temporaries, this creates useless memory writes
} else if (dst->DstRegister.File == TGSI_FILE_TEMPORARY) {
storage->setTempElement(dst->DstRegister.Index, out, dst->DstRegister.WriteMask);
} else if (dst->DstRegister.File == TGSI_FILE_ADDRESS) {
@@ -672,9 +709,8 @@ translate_instructionir(llvm::Module *module,
if (src->SrcRegister.Indirect) {
indIdx = storage->addrElement(src->SrcRegisterInd.Index);
}
-
val = storage->load((enum tgsi_file_type)src->SrcRegister.File,
- src->SrcRegister.Index, swizzle, indIdx);
+ src->SrcRegister.Index, swizzle, instr->getIRBuilder(), indIdx);
inputs[i] = val;
}
@@ -732,6 +768,7 @@ translate_instructionir(llvm::Module *module,
}
break;
case TGSI_OPCODE_SLT: {
+ out = instr->slt(inputs[0], inputs[1]);
}
break;
case TGSI_OPCODE_SGE: {
@@ -989,7 +1026,6 @@ translate_instructionir(llvm::Module *module,
/* store results */
for (int i = 0; i < inst->Instruction.NumDstRegs; ++i) {
struct tgsi_full_dst_register *dst = &inst->FullDstRegisters[i];
-
storage->store((enum tgsi_file_type)dst->DstRegister.File,
dst->DstRegister.Index, out, dst->DstRegister.WriteMask);
}
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index 491141f190..dea1aed032 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -164,6 +164,27 @@ rem_prefix(const char *longname)
}
+/**
+ * Return a printable name for an SPE register, for use in instruction dumps.
+ *
+ * SPE_REG_SP and SPE_REG_RA map to the literals "$sp" and "$lr"; any other
+ * register is formatted as "$<n>" into one of four static buffers.
+ *
+ * \param reg  register number
+ * \return pointer to a literal or to static storage; the result is only
+ *         valid until four further calls have been made, and the function
+ *         is not reentrant.
+ */
+static const char *
+reg_name(int reg)
+{
+   switch (reg) {
+   case SPE_REG_SP:
+      return "$sp";
+   case SPE_REG_RA:
+      return "$lr";
+   default:
+      {
+         /* cycle through four buffers to handle multiple calls per printf
+          * (the widest dump line names four registers)
+          */
+         static char buf[4][10];
+         static int b = 0;
+         b = (b + 1) % 4;
+         /* bounded write: an out-of-range value is truncated instead of
+          * overrunning the 10-byte buffer
+          */
+         snprintf(buf[b], sizeof(buf[b]), "$%d", reg);
+         return buf[b];
+      }
+   }
+}
+
+
static void emit_RR(struct spe_function *p, unsigned op, unsigned rT,
unsigned rA, unsigned rB, const char *name)
{
@@ -176,7 +197,8 @@ static void emit_RR(struct spe_function *p, unsigned op, unsigned rT,
assert(p->num_inst <= p->max_inst);
if (p->print) {
indent(p);
- printf("%s\t$%d, $%d, $%d\n", rem_prefix(name), rT, rA, rB);
+ printf("%s\t%s, %s, %s\n",
+ rem_prefix(name), reg_name(rT), reg_name(rA), reg_name(rB));
}
}
@@ -194,7 +216,8 @@ static void emit_RRR(struct spe_function *p, unsigned op, unsigned rT,
assert(p->num_inst <= p->max_inst);
if (p->print) {
indent(p);
- printf("%s\t$%d, $%d, $%d, $%d\n", rem_prefix(name), rT, rA, rB, rC);
+ printf("%s\t%s, %s, %s, %s\n", rem_prefix(name), reg_name(rT),
+ reg_name(rA), reg_name(rB), reg_name(rC));
}
}
@@ -211,7 +234,8 @@ static void emit_RI7(struct spe_function *p, unsigned op, unsigned rT,
assert(p->num_inst <= p->max_inst);
if (p->print) {
indent(p);
- printf("%s\t$%d, $%d, 0x%x\n", rem_prefix(name), rT, rA, imm);
+ printf("%s\t%s, %s, 0x%x\n",
+ rem_prefix(name), reg_name(rT), reg_name(rA), imm);
}
}
@@ -229,7 +253,8 @@ static void emit_RI8(struct spe_function *p, unsigned op, unsigned rT,
assert(p->num_inst <= p->max_inst);
if (p->print) {
indent(p);
- printf("%s\t$%d, $%d, 0x%x\n", rem_prefix(name), rT, rA, imm);
+ printf("%s\t%s, %s, 0x%x\n",
+ rem_prefix(name), reg_name(rT), reg_name(rA), imm);
}
}
@@ -247,15 +272,22 @@ static void emit_RI10(struct spe_function *p, unsigned op, unsigned rT,
assert(p->num_inst <= p->max_inst);
if (p->print) {
indent(p);
- if (strcmp(name, "spe_lqd") == 0 ||
- strcmp(name, "spe_stqd") == 0)
- printf("%s\t$%d, 0x%x($%d)\n", rem_prefix(name), rT, imm, rA);
- else
- printf("%s\t$%d, $%d, 0x%x\n", rem_prefix(name), rT, rA, imm);
+ printf("%s\t%s, %s, 0x%x\n",
+ rem_prefix(name), reg_name(rT), reg_name(rA), imm);
}
}
+/**
+ * As emit_RI10(), but do range checking on the signed immediate value.
+ *
+ * Asserts that \p imm lies in [-512, 511] (the range of a signed 10-bit
+ * value) and then forwards all arguments unchanged to emit_RI10().
+ */
+static void emit_RI10s(struct spe_function *p, unsigned op, unsigned rT,
+ unsigned rA, int imm, const char *name)
+{
+ assert(imm <= 511);
+ assert(imm >= -512);
+ emit_RI10(p, op, rT, rA, imm, name);
+}
+
+
static void emit_RI16(struct spe_function *p, unsigned op, unsigned rT,
int imm, const char *name)
{
@@ -267,7 +299,7 @@ static void emit_RI16(struct spe_function *p, unsigned op, unsigned rT,
assert(p->num_inst <= p->max_inst);
if (p->print) {
indent(p);
- printf("%s\t$%d, 0x%x\n", rem_prefix(name), rT, imm);
+ printf("%s\t%s, 0x%x\n", rem_prefix(name), reg_name(rT), imm);
}
}
@@ -283,7 +315,7 @@ static void emit_RI18(struct spe_function *p, unsigned op, unsigned rT,
assert(p->num_inst <= p->max_inst);
if (p->print) {
indent(p);
- printf("%s\t$%d, 0x%x\n", rem_prefix(name), rT, imm);
+ printf("%s\t%s, 0x%x\n", rem_prefix(name), reg_name(rT), imm);
}
}
@@ -332,6 +364,12 @@ void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \
emit_RI10(p, _op, rT, rA, imm, __FUNCTION__); \
}
+#define EMIT_RI10s(_name, _op) \
+void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \
+{ \
+ emit_RI10s(p, _op, rT, rA, imm, __FUNCTION__); \
+}
+
#define EMIT_RI16(_name, _op) \
void _name (struct spe_function *p, unsigned rT, int imm) \
{ \
@@ -353,20 +391,28 @@ void _name (struct spe_function *p, int imm) \
#include "rtasm_ppc_spe.h"
+
/**
* Initialize an spe_function.
* \param code_size size of instruction buffer to allocate, in bytes.
*/
void spe_init_func(struct spe_function *p, unsigned code_size)
{
+ unsigned int i;
+
p->store = align_malloc(code_size, 16);
p->num_inst = 0;
p->max_inst = code_size / SPE_INST_SIZE;
+ p->set_count = 0;
+ memset(p->regs, 0, SPE_NUM_REGS * sizeof(p->regs[0]));
+
/* Conservatively treat R0 - R2 and R80 - R127 as non-volatile.
*/
- p->regs[0] = ~7;
- p->regs[1] = (1U << (80 - 64)) - 1;
+ p->regs[0] = p->regs[1] = p->regs[2] = 1;
+ for (i = 80; i <= 127; i++) {
+ p->regs[i] = 1;
+ }
p->print = false;
p->indent = 0;
@@ -398,12 +444,8 @@ int spe_allocate_available_register(struct spe_function *p)
{
unsigned i;
for (i = 0; i < SPE_NUM_REGS; i++) {
- const uint64_t mask = (1ULL << (i % 64));
- const unsigned idx = i / 64;
-
- assert(idx < 2);
- if ((p->regs[idx] & mask) != 0) {
- p->regs[idx] &= ~mask;
+ if (p->regs[i] == 0) {
+ p->regs[i] = 1;
return i;
}
}
@@ -417,31 +459,84 @@ int spe_allocate_available_register(struct spe_function *p)
*/
int spe_allocate_register(struct spe_function *p, int reg)
{
- const unsigned idx = reg / 64;
- const unsigned bit = reg % 64;
-
assert(reg < SPE_NUM_REGS);
- assert((p->regs[idx] & (1ULL << bit)) != 0);
-
- p->regs[idx] &= ~(1ULL << bit);
+ assert(p->regs[reg] == 0);
+ p->regs[reg] = 1;
return reg;
}
/**
- * Mark the given SPE register as "unallocated".
+ * Mark the given SPE register as "unallocated". Note that this should
+ * only be used on registers allocated in the current register set; an
+ * assertion will fail if an attempt is made to deallocate a register
+ * allocated in an earlier register set.
*/
void spe_release_register(struct spe_function *p, int reg)
{
- const unsigned idx = reg / 64;
- const unsigned bit = reg % 64;
+ assert(reg < SPE_NUM_REGS);
+ assert(p->regs[reg] == 1);
- assert(idx < 2);
+ p->regs[reg] = 0;
+}
- assert(reg < SPE_NUM_REGS);
- assert((p->regs[idx] & (1ULL << bit)) == 0);
+/**
+ * Start a new set of registers. This can be called if
+ * it will be difficult later to determine exactly what
+ * registers were actually allocated during a code generation
+ * sequence, and you really just want to deallocate all of them.
+ */
+void spe_allocate_register_set(struct spe_function *p)
+{
+ unsigned int i;
- p->regs[idx] |= (1ULL << bit);
+ /* Keep track of the set count. If it ever wraps around to 0,
+ * we're in trouble.
+ */
+ p->set_count++;
+ assert(p->set_count > 0);
+
+ /* Increment the allocation count of all registers currently
+ * allocated. Then any registers that are allocated in this set
+ * will be the only ones with a count of 1; they'll all be released
+ * when the register set is released.
+ */
+ for (i = 0; i < SPE_NUM_REGS; i++) {
+ if (p->regs[i] > 0)
+ p->regs[i]++;
+ }
+}
+
+/**
+ * Close the current register set: every register with a non-zero
+ * allocation level has that level lowered by one, so registers allocated
+ * in the set being closed (level 1) become available again (level 0).
+ */
+void spe_release_register_set(struct spe_function *p)
+{
+   unsigned int reg = SPE_NUM_REGS;
+
+   /* Releasing more sets than were opened would be a caller bug. */
+   assert(p->set_count > 0);
+   p->set_count -= 1;
+
+   /* Each register is handled independently, so walk order is irrelevant. */
+   while (reg-- > 0) {
+      if (p->regs[reg] != 0) {
+         p->regs[reg] -= 1;
+      }
+   }
+}
+
+
+/**
+ * Collect the indices of the registers in the range [2, 79] whose
+ * allocation flag in p->regs[] is non-zero.
+ *
+ * \param p     function whose register flags are examined
+ * \param used  output array receiving the register indices; the caller
+ *              must provide room for up to 78 entries
+ * \return number of entries written to \p used
+ *
+ * NOTE(review): spe_init_func() marks R0 - R2 as permanently reserved, so
+ * index 2 appears to be reported on every call -- confirm whether the loop
+ * was meant to start at 3.
+ */
+unsigned
+spe_get_registers_used(const struct spe_function *p, ubyte used[])
+{
+ unsigned i, num = 0;
+ /* only count registers in the range available to callers */
+ for (i = 2; i < 80; i++) {
+ if (p->regs[i]) {
+ used[num++] = i;
+ }
+ }
+ return num;
}
@@ -459,7 +554,7 @@ spe_indent(struct spe_function *p, int spaces)
}
-extern void
+void
spe_comment(struct spe_function *p, int rel_indent, const char *s)
{
if (p->print) {
@@ -472,6 +567,56 @@ spe_comment(struct spe_function *p, int rel_indent, const char *s)
/**
+ * Load quad word.
+ * NOTE: offset is in bytes and the least significant 4 bits must be zero!
+ */
+void spe_lqd(struct spe_function *p, unsigned rT, unsigned rA, int offset)
+{
+   const boolean print_enabled = p->print;
+   /* the instruction field holds the offset in units of 16 bytes */
+   const int quad_offset = offset >> 4;
+
+   /* offset must be a multiple of 16 */
+   assert(offset % 16 == 0);
+   /* shifted offset must fit in the 10-bit signed immediate field */
+   assert(quad_offset <= 511);
+   assert(quad_offset >= -512);
+
+   /* Emit with printing suppressed so that the byte offset, rather than
+    * the encoded field value, can be shown in the dump below.
+    */
+   p->print = FALSE;
+   emit_RI10(p, 0x034, rT, rA, quad_offset, "spe_lqd");
+   p->print = print_enabled;
+
+   if (!print_enabled)
+      return;
+
+   indent(p);
+   printf("lqd\t%s, %d(%s)\n", reg_name(rT), offset, reg_name(rA));
+}
+
+
+/**
+ * Store quad word.
+ * NOTE: offset is in bytes and the least significant 4 bits must be zero!
+ */
+void spe_stqd(struct spe_function *p, unsigned rT, unsigned rA, int offset)
+{
+   const boolean print_enabled = p->print;
+   /* the instruction field holds the offset in units of 16 bytes */
+   const int quad_offset = offset >> 4;
+
+   /* offset must be a multiple of 16 */
+   assert(offset % 16 == 0);
+   /* shifted offset must fit in the 10-bit signed immediate field */
+   assert(quad_offset <= 511);
+   assert(quad_offset >= -512);
+
+   /* Emit with printing suppressed so that the byte offset, rather than
+    * the encoded field value, can be shown in the dump below.
+    */
+   p->print = FALSE;
+   emit_RI10(p, 0x024, rT, rA, quad_offset, "spe_stqd");
+   p->print = print_enabled;
+
+   if (!print_enabled)
+      return;
+
+   indent(p);
+   printf("stqd\t%s, %d(%s)\n", reg_name(rT), offset, reg_name(rA));
+}
+
+
+/**
* For branch instructions:
* \param d if 1, disable interupts if branch is taken
* \param e if 1, enable interupts if branch is taken
@@ -603,22 +748,187 @@ void spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui)
{
/* If the whole value is in the lower 18 bits, use ila, which
* doesn't sign-extend. Otherwise, if the two halfwords of
- * the constant are identical, use ilh. Otherwise, we have
- * to use ilhu followed by iohl.
+ * the constant are identical, use ilh. Otherwise, if every byte of
+ * the desired value is 0x00 or 0xff, we can use Form Select Mask for
+ * Bytes Immediate (fsmbi) to load the value in a single instruction.
+ * Otherwise, in the general case, we have to use ilhu followed by iohl.
*/
- if ((ui & 0xfffc0000) == ui) {
+ if ((ui & 0x0003ffff) == ui) {
spe_ila(p, rT, ui);
}
else if ((ui >> 16) == (ui & 0xffff)) {
spe_ilh(p, rT, ui & 0xffff);
}
+ else if (
+ ((ui & 0x000000ff) == 0 || (ui & 0x000000ff) == 0x000000ff) &&
+ ((ui & 0x0000ff00) == 0 || (ui & 0x0000ff00) == 0x0000ff00) &&
+ ((ui & 0x00ff0000) == 0 || (ui & 0x00ff0000) == 0x00ff0000) &&
+ ((ui & 0xff000000) == 0 || (ui & 0xff000000) == 0xff000000)
+ ) {
+ unsigned int mask = 0;
+ /* fsmbi duplicates each bit in the given mask eight times,
+ * using a 16-bit value to initialize a 16-byte quadword.
+ * Each 4-bit nybble of the mask corresponds to a full word
+ * of the result; look at the value and figure out the mask
+ * (replicated for each word in the quadword), and then
+ * form the "select mask" to get the value.
+ */
+ if ((ui & 0x000000ff) == 0x000000ff) mask |= 0x1111;
+ if ((ui & 0x0000ff00) == 0x0000ff00) mask |= 0x2222;
+ if ((ui & 0x00ff0000) == 0x00ff0000) mask |= 0x4444;
+ if ((ui & 0xff000000) == 0xff000000) mask |= 0x8888;
+ spe_fsmbi(p, rT, mask);
+ }
else {
+ /* The general case: this usually uses two instructions, but
+ * may use only one if the low-order 16 bits of each word are 0.
+ */
spe_ilhu(p, rT, ui >> 16);
if (ui & 0xffff)
spe_iohl(p, rT, ui & 0xffff);
}
}
+/**
+ * Emit code that ANDs register rA with the 32-bit constant ui, writing
+ * the result to rT, using a single instruction when the constant allows.
+ *
+ * This function is constructed identically to spe_xor_uint() below.
+ * Changes to one should be made in the other.
+ *
+ * \param p   function being generated
+ * \param rT  destination register
+ * \param rA  source register
+ * \param ui  constant operand
+ */
+void
+spe_and_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
+{
+   /* If we can, emit a single instruction, either And Byte Immediate
+    * (which uses the same constant across each byte), And Halfword Immediate
+    * (which sign-extends a 10-bit immediate to 16 bits and uses that
+    * across each halfword), or And Word Immediate (which sign-extends
+    * a 10-bit immediate to 32 bits).
+    *
+    * Otherwise, we'll need to use a temporary register.
+    */
+   unsigned int tmp;
+   unsigned int tmp_reg;
+   int imm;
+
+   /* The low 10 bits of ui as a signed value in [-512, 511].  The word
+    * and halfword emitters range-check their immediate as a signed
+    * 10-bit quantity, so a constant whose upper bits are all 1s must be
+    * passed as a negative number, not as 512..1023.
+    */
+   imm = (int) (ui & 0x000003ff);
+   if (imm & 0x200)
+      imm -= 0x400;
+
+   /* If the upper 23 bits are all 0s or all 1s, sign extension
+    * will work and we can use And Word Immediate
+    */
+   tmp = ui & 0xfffffe00;
+   if (tmp == 0xfffffe00 || tmp == 0) {
+      spe_andi(p, rT, rA, imm);
+      return;
+   }
+
+   /* If the ui field is symmetric along halfword boundaries and
+    * the upper 7 bits of each halfword are all 0s or 1s, we
+    * can use And Halfword Immediate
+    */
+   tmp = ui & 0xfe00fe00;
+   if ((tmp == 0xfe00fe00 || tmp == 0) && ((ui >> 16) == (ui & 0x0000ffff))) {
+      spe_andhi(p, rT, rA, imm);
+      return;
+   }
+
+   /* If the ui field is symmetric in each byte, then we can use
+    * the And Byte Immediate instruction.
+    */
+   tmp = ui & 0x000000ff;
+   if ((ui >> 24) == tmp && ((ui >> 16) & 0xff) == tmp && ((ui >> 8) & 0xff) == tmp) {
+      spe_andbi(p, rT, rA, tmp);
+      return;
+   }
+
+   /* Otherwise, we'll have to use a temporary register. */
+   tmp_reg = spe_allocate_available_register(p);
+   spe_load_uint(p, tmp_reg, ui);
+   spe_and(p, rT, rA, tmp_reg);
+   spe_release_register(p, tmp_reg);
+}
+
+
+/**
+ * Emit code that XORs register rA with the 32-bit constant ui, writing
+ * the result to rT, using a single instruction when the constant allows.
+ *
+ * This function is constructed identically to spe_and_uint() above.
+ * Changes to one should be made in the other.
+ *
+ * \param p   function being generated
+ * \param rT  destination register
+ * \param rA  source register
+ * \param ui  constant operand
+ */
+void
+spe_xor_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
+{
+   /* If we can, emit a single instruction, either Exclusive Or Byte
+    * Immediate (which uses the same constant across each byte), Exclusive
+    * Or Halfword Immediate (which sign-extends a 10-bit immediate to
+    * 16 bits and uses that across each halfword), or Exclusive Or Word
+    * Immediate (which sign-extends a 10-bit immediate to 32 bits).
+    *
+    * Otherwise, we'll need to use a temporary register.
+    */
+   unsigned int tmp;
+   unsigned int tmp_reg;
+   int imm;
+
+   /* The low 10 bits of ui as a signed value in [-512, 511].  The word
+    * and halfword emitters range-check their immediate as a signed
+    * 10-bit quantity, so a constant whose upper bits are all 1s must be
+    * passed as a negative number, not as 512..1023.
+    */
+   imm = (int) (ui & 0x000003ff);
+   if (imm & 0x200)
+      imm -= 0x400;
+
+   /* If the upper 23 bits are all 0s or all 1s, sign extension
+    * will work and we can use Exclusive Or Word Immediate
+    */
+   tmp = ui & 0xfffffe00;
+   if (tmp == 0xfffffe00 || tmp == 0) {
+      spe_xori(p, rT, rA, imm);
+      return;
+   }
+
+   /* If the ui field is symmetric along halfword boundaries and
+    * the upper 7 bits of each halfword are all 0s or 1s, we
+    * can use Exclusive Or Halfword Immediate
+    */
+   tmp = ui & 0xfe00fe00;
+   if ((tmp == 0xfe00fe00 || tmp == 0) && ((ui >> 16) == (ui & 0x0000ffff))) {
+      spe_xorhi(p, rT, rA, imm);
+      return;
+   }
+
+   /* If the ui field is symmetric in each byte, then we can use
+    * the Exclusive Or Byte Immediate instruction.
+    */
+   tmp = ui & 0x000000ff;
+   if ((ui >> 24) == tmp && ((ui >> 16) & 0xff) == tmp && ((ui >> 8) & 0xff) == tmp) {
+      spe_xorbi(p, rT, rA, tmp);
+      return;
+   }
+
+   /* Otherwise, we'll have to use a temporary register. */
+   tmp_reg = spe_allocate_available_register(p);
+   spe_load_uint(p, tmp_reg, ui);
+   spe_xor(p, rT, rA, tmp_reg);
+   spe_release_register(p, tmp_reg);
+}
+
+/**
+ * Emit a compare-equal of register rA against the constant ui, with the
+ * result written to rT.  Constants that fit in 9 bits use the immediate
+ * form; anything larger is first loaded into a scratch register.
+ */
+void
+spe_compare_equal_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
+{
+   unsigned int scratch;
+
+   /* 0..511 fits inside a Compare Equal Word Immediate instruction. */
+   if (ui <= 0x000001ff) {
+      spe_ceqi(p, rT, rA, ui);
+      return;
+   }
+
+   /* General case: materialize the constant, then compare registers. */
+   scratch = spe_allocate_available_register(p);
+   spe_load_uint(p, scratch, ui);
+   spe_ceq(p, rT, rA, scratch);
+   spe_release_register(p, scratch);
+}
+
+/**
+ * Emit a Compare Logical (unsigned) Greater Than of register rA against
+ * the constant ui, with the result written to rT.
+ *
+ * \param p   function being generated
+ * \param rT  destination register
+ * \param rA  source register
+ * \param ui  unsigned constant to compare against
+ */
+void
+spe_compare_greater_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
+{
+   /* Compare Logical Greater Than Word Immediate sign-extends its 10-bit
+    * immediate to 32 bits before the unsigned comparison, so only
+    * constants with bit 9 clear (9 bits or less) are represented
+    * exactly.  A value in 0x200..0x3ff would be compared as 0xfffffeXX.
+    */
+   if ((ui & 0x000001ff) == ui) {
+      spe_clgti(p, rT, rA, ui);
+   }
+   /* Otherwise, we're going to have to load a word first. */
+   else {
+      unsigned int tmp_reg = spe_allocate_available_register(p);
+      spe_load_uint(p, tmp_reg, ui);
+      spe_clgt(p, rT, rA, tmp_reg);
+      spe_release_register(p, tmp_reg);
+   }
+}
void
spe_splat(struct spe_function *p, unsigned rT, unsigned rA)
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
index 61c7edeb60..d6a3c02f20 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -53,17 +53,26 @@ struct spe_function
uint num_inst;
uint max_inst;
- /**
- * Mask of used / unused registers
- *
- * Each set bit corresponds to an available register. Each cleared bit
- * corresponds to an allocated register.
+ /**
+ * The "set count" is the number of nested register sets that are
+ * currently open. In the unlikely case that this counter overflows,
+ * register allocation would become confused, which is critical
+ * enough that we check for it.
+ */
+ unsigned char set_count;
+
+ /**
+ * Flags for used and unused registers. Each byte corresponds to a
+ * register; a 0 in that byte means that the register is available.
+ * A value of 1 means that the register was allocated in the current
+ * register set. Any other value N means that the register was allocated
+ * N register sets ago.
*
* \sa
* spe_allocate_register, spe_allocate_available_register,
- * spe_release_register
+ * spe_allocate_register_set, spe_release_register_set, spe_release_register,
*/
- uint64_t regs[SPE_NUM_REGS / 64];
+ unsigned char regs[SPE_NUM_REGS];
boolean print; /**< print/dump instructions as they're emitted? */
int indent; /**< number of spaces to indent */
@@ -77,6 +86,11 @@ extern unsigned spe_code_size(const struct spe_function *p);
extern int spe_allocate_available_register(struct spe_function *p);
extern int spe_allocate_register(struct spe_function *p, int reg);
extern void spe_release_register(struct spe_function *p, int reg);
+extern void spe_allocate_register_set(struct spe_function *p);
+extern void spe_release_register_set(struct spe_function *p);
+
+extern unsigned
+spe_get_registers_used(const struct spe_function *p, ubyte used[]);
extern void spe_print_code(struct spe_function *p, boolean enable);
extern void spe_indent(struct spe_function *p, int spaces);
@@ -105,6 +119,9 @@ extern void spe_comment(struct spe_function *p, int rel_indent, const char *s);
#define EMIT_RI10(_name, _op) \
extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
int imm)
+#define EMIT_RI10s(_name, _op) \
+ extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
+ int imm)
#define EMIT_RI16(_name, _op) \
extern void _name (struct spe_function *p, unsigned rT, int imm)
#define EMIT_RI18(_name, _op) \
@@ -117,11 +134,9 @@ extern void spe_comment(struct spe_function *p, int rel_indent, const char *s);
/* Memory load / store instructions
*/
-EMIT_RI10(spe_lqd, 0x034);
EMIT_RR (spe_lqx, 0x1c4);
EMIT_RI16(spe_lqa, 0x061);
EMIT_RI16(spe_lqr, 0x067);
-EMIT_RI10(spe_stqd, 0x024);
EMIT_RR (spe_stqx, 0x144);
EMIT_RI16(spe_stqa, 0x041);
EMIT_RI16(spe_stqr, 0x047);
@@ -151,7 +166,7 @@ EMIT_RI16(spe_fsmbi, 0x065);
EMIT_RR (spe_ah, 0x0c8);
EMIT_RI10(spe_ahi, 0x01d);
EMIT_RR (spe_a, 0x0c0);
-EMIT_RI10(spe_ai, 0x01c);
+EMIT_RI10s(spe_ai, 0x01c);
EMIT_RR (spe_sfh, 0x048);
EMIT_RI10(spe_sfhi, 0x00d);
EMIT_RR (spe_sf, 0x040);
@@ -189,19 +204,19 @@ EMIT_R (spe_xshw, 0x2ae);
EMIT_R (spe_xswd, 0x2a6);
EMIT_RR (spe_and, 0x0c1);
EMIT_RR (spe_andc, 0x2c1);
-EMIT_RI10(spe_andbi, 0x016);
-EMIT_RI10(spe_andhi, 0x015);
-EMIT_RI10(spe_andi, 0x014);
+EMIT_RI10s(spe_andbi, 0x016);
+EMIT_RI10s(spe_andhi, 0x015);
+EMIT_RI10s(spe_andi, 0x014);
EMIT_RR (spe_or, 0x041);
EMIT_RR (spe_orc, 0x2c9);
-EMIT_RI10(spe_orbi, 0x006);
-EMIT_RI10(spe_orhi, 0x005);
-EMIT_RI10(spe_ori, 0x004);
+EMIT_RI10s(spe_orbi, 0x006);
+EMIT_RI10s(spe_orhi, 0x005);
+EMIT_RI10s(spe_ori, 0x004);
EMIT_R (spe_orx, 0x1f0);
EMIT_RR (spe_xor, 0x241);
-EMIT_RI10(spe_xorbi, 0x026);
-EMIT_RI10(spe_xorhi, 0x025);
-EMIT_RI10(spe_xori, 0x024);
+EMIT_RI10s(spe_xorbi, 0x026);
+EMIT_RI10s(spe_xorhi, 0x025);
+EMIT_RI10s(spe_xori, 0x024);
EMIT_RR (spe_nand, 0x0c9);
EMIT_RR (spe_nor, 0x049);
EMIT_RR (spe_eqv, 0x249);
@@ -279,6 +294,12 @@ EMIT_RI16(spe_brz, 0x040);
EMIT_RI16(spe_brhnz, 0x046);
EMIT_RI16(spe_brhz, 0x044);
+extern void
+spe_lqd(struct spe_function *p, unsigned rT, unsigned rA, int offset);
+
+extern void
+spe_stqd(struct spe_function *p, unsigned rT, unsigned rA, int offset);
+
extern void spe_bi(struct spe_function *p, unsigned rA, int d, int e);
extern void spe_iret(struct spe_function *p, unsigned rA, int d, int e);
extern void spe_bisled(struct spe_function *p, unsigned rT, unsigned rA,
@@ -307,6 +328,22 @@ spe_load_int(struct spe_function *p, unsigned rT, int i);
extern void
spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui);
+/** And immediate value into rT. */
+extern void
+spe_and_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui);
+
+/** Xor immediate value into rT. */
+extern void
+spe_xor_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui);
+
+/** Compare equal with immediate value. */
+extern void
+spe_compare_equal_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui);
+
+/** Compare greater with immediate value. */
+extern void
+spe_compare_greater_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui);
+
/** Replicate word 0 of rA across rT. */
extern void
spe_splat(struct spe_function *p, unsigned rT, unsigned rA);
@@ -388,6 +425,7 @@ EMIT_R (spe_wrch, 0x10d);
#undef EMIT_RI7
#undef EMIT_RI8
#undef EMIT_RI10
+#undef EMIT_RI10s
#undef EMIT_RI16
#undef EMIT_RI18
#undef EMIT_I16
diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
index 3bba9dcc07..99ee74cf14 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
@@ -371,7 +371,11 @@ void x86_jcc( struct x86_function *p,
DUMP_I(cc);
if (offset < 0) {
- assert(p->csr - p->store > -offset);
+ /*assert(p->csr - p->store > -offset);*/
+ if (p->csr - p->store <= -offset) {
+ /* probably out of memory (using the error_overflow buffer) */
+ return;
+ }
}
if (offset <= 127 && offset >= -128) {
@@ -675,6 +679,44 @@ void x86_and( struct x86_function *p,
* SSE instructions
*/
+/**
+ * Emit a prefetch instruction for the memory operand \p ptr: opcode bytes
+ * 0x0f 0x18 followed by a ModRM built with selector 0.
+ * NOTE(review): 0F 18 /0 is the PREFETCHNTA encoding, assuming the second
+ * argument of emit_modrm_noreg() fills the ModRM reg field -- confirm.
+ * \p ptr must not be a plain register operand (asserted).
+ */
+void sse_prefetchnta( struct x86_function *p, struct x86_reg ptr)
+{
+ DUMP_R( ptr );
+ assert(ptr.mod != mod_REG);
+ emit_2ub(p, 0x0f, 0x18);
+ emit_modrm_noreg(p, 0, ptr);
+}
+
+/**
+ * Emit a prefetch instruction for the memory operand \p ptr: opcode bytes
+ * 0x0f 0x18 followed by a ModRM built with selector 1.
+ * NOTE(review): 0F 18 /1 is the PREFETCHT0 encoding, assuming the second
+ * argument of emit_modrm_noreg() fills the ModRM reg field -- confirm.
+ * \p ptr must not be a plain register operand (asserted).
+ */
+void sse_prefetch0( struct x86_function *p, struct x86_reg ptr)
+{
+ DUMP_R( ptr );
+ assert(ptr.mod != mod_REG);
+ emit_2ub(p, 0x0f, 0x18);
+ emit_modrm_noreg(p, 1, ptr);
+}
+
+/**
+ * Emit a prefetch instruction for the memory operand \p ptr: opcode bytes
+ * 0x0f 0x18 followed by a ModRM built with selector 2.
+ * NOTE(review): 0F 18 /2 is the PREFETCHT1 encoding, assuming the second
+ * argument of emit_modrm_noreg() fills the ModRM reg field -- confirm.
+ * \p ptr must not be a plain register operand (asserted).
+ */
+void sse_prefetch1( struct x86_function *p, struct x86_reg ptr)
+{
+ DUMP_R( ptr );
+ assert(ptr.mod != mod_REG);
+ emit_2ub(p, 0x0f, 0x18);
+ emit_modrm_noreg(p, 2, ptr);
+}
+
+/**
+ * Emit opcode bytes 0x0f 0x2b (the MOVNTPS encoding) with a ModRM built
+ * from \p src and \p dst.
+ * \param dst  destination; must be a memory operand (asserted)
+ * \param src  source; must be a register operand (asserted)
+ */
+void sse_movntps( struct x86_function *p,
+ struct x86_reg dst,
+ struct x86_reg src)
+{
+ DUMP_RR( dst, src );
+
+ assert(dst.mod != mod_REG);
+ assert(src.mod == mod_REG);
+ emit_2ub(p, 0x0f, 0x2b);
+ emit_modrm(p, src, dst);
+}
+
+
+
void sse_movss( struct x86_function *p,
struct x86_reg dst,
diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
index 510aa1b0de..1b5eaaca85 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
@@ -185,6 +185,13 @@ void sse2_pshufd( struct x86_function *p, struct x86_reg dest, struct x86_reg ar
void sse2_rcpps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
void sse2_rcpss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+
+void sse_prefetchnta( struct x86_function *p, struct x86_reg ptr);
+void sse_prefetch0( struct x86_function *p, struct x86_reg ptr);
+void sse_prefetch1( struct x86_function *p, struct x86_reg ptr);
+
+void sse_movntps( struct x86_function *p, struct x86_reg dst, struct x86_reg src);
+
void sse_addps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
void sse_addss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
void sse_cvtps2pi( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
index 4fdad3a5c7..f79170b9d6 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_sse2.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
@@ -25,6 +25,10 @@
*
**************************************************************************/
+#include "pipe/p_config.h"
+
+#if defined(PIPE_ARCH_X86) && defined(PIPE_ARCH_SSE)
+
#include "pipe/p_debug.h"
#include "pipe/p_shader_tokens.h"
#include "util/u_math.h"
@@ -36,8 +40,6 @@
#include "rtasm/rtasm_x86sse.h"
-#ifdef PIPE_ARCH_X86
-
/* for 1/sqrt()
*
* This costs about 100fps (close to 10%) in gears:
diff --git a/src/gallium/auxiliary/util/Makefile b/src/gallium/auxiliary/util/Makefile
index d3951e4e7d..b3d1045a8f 100644
--- a/src/gallium/auxiliary/util/Makefile
+++ b/src/gallium/auxiliary/util/Makefile
@@ -10,6 +10,7 @@ C_SOURCES = \
u_gen_mipmap.c \
u_handle_table.c \
u_hash_table.c \
+ u_keymap.c \
u_math.c \
u_mm.c \
u_rect.c \
diff --git a/src/gallium/auxiliary/util/SConscript b/src/gallium/auxiliary/util/SConscript
index e65c17b1cc..8a04955a16 100644
--- a/src/gallium/auxiliary/util/SConscript
+++ b/src/gallium/auxiliary/util/SConscript
@@ -11,13 +11,14 @@ util = env.ConvenienceLibrary(
'u_gen_mipmap.c',
'u_handle_table.c',
'u_hash_table.c',
+ 'u_keymap.c',
'u_math.c',
'u_mm.c',
'u_rect.c',
'u_simple_shaders.c',
'u_snprintf.c',
- 'u_stream_stdc.c',
- 'u_stream_wd.c',
+ 'u_stream_stdc.c',
+ 'u_stream_wd.c',
'u_tile.c',
'u_time.c',
])
diff --git a/src/gallium/auxiliary/util/u_keymap.c b/src/gallium/auxiliary/util/u_keymap.c
new file mode 100644
index 0000000000..01b17ddb1b
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_keymap.c
@@ -0,0 +1,309 @@
+/**************************************************************************
+ *
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * Key lookup/associative container.
+ *
+ * Like Jose's u_hash_table, based on CSO cache code for now.
+ *
+ * Author: Brian Paul
+ */
+
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_debug.h"
+#include "pipe/p_error.h"
+
+#include "cso_cache/cso_hash.h"
+
+#include "util/u_memory.h"
+#include "util/u_keymap.h"
+
+
+/** Key -> value map built on top of a cso_hash table. */
+struct keymap
+{
+ struct cso_hash *cso; /* underlying hash table holding keymap_item pointers */
+ unsigned key_size; /* size of every key, in bytes */
+ unsigned max_entries; /* XXX not obeyed yet */
+ unsigned num_entries; /* number of entries currently stored */
+ keymap_delete_func delete_func; /* called when an entry is removed or replaced */
+};
+
+
+/**
+ * One stored entry.  util_keymap_insert() stores a private copy of the
+ * caller's key in \c key and the caller's data pointer, uncopied, in
+ * \c value.
+ */
+struct keymap_item
+{
+ void *key, *value;
+};
+
+
+/**
+ * This is the default key-delete function used when the client doesn't
+ * provide one.  It simply frees the data pointer; the map, key and user
+ * arguments are ignored.
+ */
+static void
+default_delete_func(const struct keymap *map,
+ const void *key, void *data, void *user)
+{
+ FREE((void*) data);
+}
+
+
+/** Return the data stored at a hash iterator, cast to a keymap_item. */
+static INLINE struct keymap_item *
+hash_table_item(struct cso_hash_iter iter)
+{
+ return (struct keymap_item *) cso_hash_iter_data(iter);
+}
+
+
+/**
+ * Compute a hash value for a block of bytes.
+ *
+ * The block is read as an array of unsigned ints; word number n (counting
+ * from 1) is multiplied by n and XORed into the result.  Only keySize / 4
+ * whole words are read, so any trailing bytes do not affect the hash.
+ */
+static unsigned
+hash(const void *key, unsigned keySize)
+{
+   const unsigned *words = (const unsigned *) key;
+   const unsigned num_words = keySize / 4; /* bytes -> uints */
+   unsigned result = 0;
+   unsigned n;
+
+   for (n = 1; n <= num_words; n++) {
+      result ^= n * words[n - 1];
+   }
+
+   return result;
+}
+
+
+/**
+ * Allocate and initialize an empty keymap.
+ * \param keySize  size of the keys in bytes
+ * \param maxEntries  max number of entries to allow (~0 = infinity)
+ * \param deleteFunc  optional callback to call when entries
+ *                    are deleted/replaced; when NULL, a default that
+ *                    frees the data pointer is installed
+ * \return the new map, or NULL if allocation failed
+ */
+struct keymap *
+util_new_keymap(unsigned keySize, unsigned maxEntries,
+                keymap_delete_func deleteFunc)
+{
+   struct keymap *map = MALLOC_STRUCT(keymap);
+
+   if (map) {
+      map->cso = cso_hash_create();
+      if (map->cso) {
+         map->key_size = keySize;
+         map->max_entries = maxEntries;
+         map->num_entries = 0;
+         if (deleteFunc)
+            map->delete_func = deleteFunc;
+         else
+            map->delete_func = default_delete_func;
+      }
+      else {
+         /* no hash table: give the map struct back and report failure */
+         FREE(map);
+         map = NULL;
+      }
+   }
+
+   return map;
+}
+
+
+/**
+ * Delete/free a keymap and all entries. The deleteFunc that was given at
+ * create time will be called for each entry.
+ * \param map the map to destroy; it is dereferenced unconditionally, so it
+ *            must not be NULL
+ * \param user user-provided pointer passed through to the delete callback
+ */
+void
+util_delete_keymap(struct keymap *map, void *user)
+{
+ util_keymap_remove_all(map, user);
+ cso_hash_delete(map->cso);
+ FREE(map);
+}
+
+
+/**
+ * Locate the entry for the given key.  Starting at the first node found
+ * for key_hash, step through the table until an item whose stored key
+ * bytes equal the given key is found.
+ * \return iterator at the matching entry, or a null iterator if the
+ *         walk ends without a match
+ */
+static INLINE struct cso_hash_iter
+hash_table_find_iter(const struct keymap *map, const void *key,
+                     unsigned key_hash)
+{
+   struct cso_hash_iter pos;
+
+   for (pos = cso_hash_find(map->cso, key_hash);
+        !cso_hash_iter_is_null(pos);
+        pos = cso_hash_iter_next(pos)) {
+      const struct keymap_item *candidate =
+         (struct keymap_item *) cso_hash_iter_data(pos);
+      if (memcmp(candidate->key, key, map->key_size) == 0)
+         break;
+   }
+
+   return pos;
+}
+
+
+/**
+ * Like hash_table_find_iter(), but return the stored item itself,
+ * or NULL when the key is not present.
+ */
+static INLINE struct keymap_item *
+hash_table_find_item(const struct keymap *map, const void *key,
+                     unsigned key_hash)
+{
+   const struct cso_hash_iter found = hash_table_find_iter(map, key, key_hash);
+
+   return cso_hash_iter_is_null(found) ? NULL : hash_table_item(found);
+}
+
+
+/**
+ * Insert a new key + data pointer into the table.
+ * Note: we create a copy of the key, but not the data!
+ * If the key is already present in the table, replace the existing
+ * entry's data (calling the delete callback on the previous entry).
+ * Note: the max_entries limit given at creation time is not enforced
+ * here; no old entry is evicted to make room.
+ * \param map   the map to insert into (must not be NULL)
+ * \param key   pointer to key_size bytes of key data
+ * \param data  data pointer to associate with the key
+ * \param user  passed to the delete callback as the last param
+ * \return TRUE on success, FALSE if the item, the key copy or the hash
+ *         table node could not be created (the map is left unchanged)
+ */
+boolean
+util_keymap_insert(struct keymap *map, const void *key,
+                   const void *data, void *user)
+{
+   unsigned key_hash;
+   struct keymap_item *item;
+   struct cso_hash_iter iter;
+
+   assert(map);
+
+   key_hash = hash(key, map->key_size);
+
+   item = hash_table_find_item(map, key, key_hash);
+   if (item) {
+      /* call delete callback for old entry/item */
+      map->delete_func(map, item->key, item->value, user);
+      item->value = (void *) data;
+      return TRUE;
+   }
+
+   item = MALLOC_STRUCT(keymap_item);
+   if (!item)
+      return FALSE;
+
+   item->key = mem_dup(key, map->key_size);
+   if (!item->key) {
+      /* key copy failed; a NULL key would crash later key comparisons */
+      FREE(item);
+      return FALSE;
+   }
+   item->value = (void *) data;
+
+   iter = cso_hash_insert(map->cso, key_hash, item);
+   if (cso_hash_iter_is_null(iter)) {
+      /* release the key copy too, not just the item */
+      FREE(item->key);
+      FREE(item);
+      return FALSE;
+   }
+
+   map->num_entries++;
+
+   return TRUE;
+}
+
+
+/**
+ * Find the data pointer stored for a key.
+ * \return the associated data pointer, or NULL if the key is not in the map
+ */
+const void *
+util_keymap_lookup(const struct keymap *map, const void *key)
+{
+   const struct keymap_item *entry;
+
+   assert(map);
+
+   entry = hash_table_find_item(map, key, hash(key, map->key_size));
+
+   return entry ? entry->value : NULL;
+}
+
+
+/**
+ * Delete the entry for the given key, if there is one.
+ * When the key is found, the delete callback is invoked for the entry,
+ * then the stored key copy and the item are freed and the entry is
+ * erased from the hash table.  Nothing happens if the key is absent.
+ * \param user  passed to the delete callback as the last param.
+ */
+void
+util_keymap_remove(struct keymap *map, const void *key, void *user)
+{
+   struct cso_hash_iter pos;
+
+   assert(map);
+
+   pos = hash_table_find_iter(map, key, hash(key, map->key_size));
+
+   if (!cso_hash_iter_is_null(pos)) {
+      struct keymap_item *entry = hash_table_item(pos);
+
+      assert(entry);
+      map->delete_func(map, entry->key, entry->value, user);
+      FREE(entry->key);
+      FREE(entry);
+
+      map->num_entries--;
+
+      cso_hash_erase(map->cso, pos);
+   }
+}
+
+
+/**
+ * Remove all entries from the map, calling the delete callback for each.
+ * The stored key copies and items are freed and the entry count is reset
+ * to zero; the map itself stays usable.
+ * \param map   the map to empty (must not be NULL)
+ * \param user  passed to the delete callback as the last param.
+ */
+void
+util_keymap_remove_all(struct keymap *map, void *user)
+{
+   struct cso_hash_iter iter;
+   struct keymap_item *item;
+
+   assert(map);
+
+   /* Repeatedly take the first remaining node until the table is empty. */
+   iter = cso_hash_first_node(map->cso);
+   while (!cso_hash_iter_is_null(iter)) {
+      item = (struct keymap_item *)
+         cso_hash_take(map->cso, cso_hash_iter_key(iter));
+      map->delete_func(map, item->key, item->value, user);
+      FREE(item->key);
+      FREE(item);
+      iter = cso_hash_first_node(map->cso);
+   }
+
+   /* keep the bookkeeping in step with the now-empty table */
+   map->num_entries = 0;
+}
+
+
+/**
+ * Debug helper: print the map's address, its current entry count and its
+ * configured maximum via debug_printf().
+ */
+extern void
+util_keymap_info(const struct keymap *map)
+{
+ debug_printf("Keymap %p: %u of max %u entries\n",
+ (void *) map, map->num_entries, map->max_entries);
+}
diff --git a/src/gallium/auxiliary/util/u_keymap.h b/src/gallium/auxiliary/util/u_keymap.h
new file mode 100644
index 0000000000..8d60a76fc3
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_keymap.h
@@ -0,0 +1,68 @@
+/**************************************************************************
+ *
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef U_KEYMAP_H
+#define U_KEYMAP_H
+
+#include "pipe/p_compiler.h"
+
+
+/** opaque keymap type */
+struct keymap;
+
+
+/**
+ * Delete/callback function type.
+ * Invoked as func(map, key, data, user) for an entry that is being removed
+ * from the map; user is the pointer the caller passed to the removal
+ * function.
+ */
+typedef void (*keymap_delete_func)(const struct keymap *map,
+ const void *key, void *data,
+ void *user);
+
+
+/**
+ * Create a keymap.
+ * \param keySize  size of one key. NOTE(review): presumably in bytes (the
+ *                 cell driver passes sizeof() of its key struct) - confirm.
+ * \param maxEntries  entry limit, as reported by util_keymap_info().
+ *                    TODO: confirm how insertion enforces it.
+ * \param deleteFunc  callback run on entries as they are removed.
+ *                    NOTE(review): the cell driver passes NULL - confirm
+ *                    that NULL is accepted here.
+ */
+extern struct keymap *
+util_new_keymap(unsigned keySize, unsigned maxEntries,
+ keymap_delete_func deleteFunc);
+
+/**
+ * Destroy a keymap.
+ * NOTE(review): presumably removes every entry (forwarding user to the
+ * delete callback) and frees the map - confirm against u_keymap.c.
+ */
+extern void
+util_delete_keymap(struct keymap *map, void *user);
+
+/**
+ * Add a key/data pair to the map.
+ * NOTE(review): the meaning of the boolean result and the role of user are
+ * not documented here - confirm against u_keymap.c.
+ */
+extern boolean
+util_keymap_insert(struct keymap *map, const void *key,
+ const void *data, void *user);
+
+/**
+ * Look up a key and return the associated data pointer, or NULL when the
+ * key is not in the map.
+ */
+extern const void *
+util_keymap_lookup(const struct keymap *map, const void *key);
+
+/**
+ * Remove the entry for key, if present; the delete callback is called for
+ * it with user as its last argument.
+ */
+extern void
+util_keymap_remove(struct keymap *map, const void *key, void *user);
+
+/**
+ * Remove every entry, calling the delete callback for each one with user
+ * as its last argument.
+ */
+extern void
+util_keymap_remove_all(struct keymap *map, void *user);
+
+/** Print the map's address, entry count and maximum entry count. */
+extern void
+util_keymap_info(const struct keymap *map);
+
+
+#endif /* U_KEYMAP_H */
diff --git a/src/gallium/auxiliary/util/u_math.c b/src/gallium/auxiliary/util/u_math.c
index 0729114d6a..5b3cab4642 100644
--- a/src/gallium/auxiliary/util/u_math.c
+++ b/src/gallium/auxiliary/util/u_math.c
@@ -30,7 +30,7 @@
#include "util/u_math.h"
-
+/** 2^x, for x in [-1.0, 1.0[ */
float pow2_table[POW2_TABLE_SIZE];
@@ -38,9 +38,21 @@ static void
init_pow2_table(void)
{
int i;
- for (i = 0; i < POW2_TABLE_SIZE; i++) {
- pow2_table[i] = (float) pow(2.0, i / POW2_TABLE_SCALE);
- }
+ for (i = 0; i < POW2_TABLE_SIZE; i++)
+ pow2_table[i] = (float) pow(2.0, (i - POW2_TABLE_OFFSET) / POW2_TABLE_SCALE);
+}
+
+
+/** log2(x), for x in [1.0, 2.0[ */
+float log2_table[LOG2_TABLE_SIZE];
+
+
+static void
+init_log2_table(void)
+{
+   const double step = 1.0 / LOG2_TABLE_SIZE;
+   unsigned idx = 0;
+
+   /* Entry idx holds log2 of the idx-th evenly spaced sample of [1.0, 2.0[ */
+   while (idx < LOG2_TABLE_SIZE) {
+      log2_table[idx] = (float) log2(1.0 + idx * step);
+      idx++;
+   }
}
@@ -53,6 +65,7 @@ util_init_math(void)
static boolean initialized = FALSE;
if (!initialized) {
init_pow2_table();
+ init_log2_table();
initialized = TRUE;
}
}
diff --git a/src/gallium/auxiliary/util/u_math.h b/src/gallium/auxiliary/util/u_math.h
index 084655e6dd..be7303e550 100644
--- a/src/gallium/auxiliary/util/u_math.h
+++ b/src/gallium/auxiliary/util/u_math.h
@@ -174,8 +174,10 @@ static INLINE float logf( float f )
-#define POW2_TABLE_SIZE 256
-#define POW2_TABLE_SCALE ((float) (POW2_TABLE_SIZE-1))
+#define POW2_TABLE_SIZE_LOG2 9
+#define POW2_TABLE_SIZE (1 << POW2_TABLE_SIZE_LOG2)
+#define POW2_TABLE_OFFSET (POW2_TABLE_SIZE/2)
+#define POW2_TABLE_SCALE ((float)(POW2_TABLE_SIZE/2))
extern float pow2_table[POW2_TABLE_SIZE];
@@ -186,98 +188,78 @@ util_init_math(void);
union fi {
float f;
- int i;
- unsigned ui;
+ int32_t i;
+ uint32_t ui;
};
/**
- * Fast approximation to exp(x).
- * Compute with base 2 exponents: exp(x) = exp2(log2(e) * x)
- * Note: log2(e) is a constant, k = 1.44269
- * So, exp(x) = exp2(k * x);
+ * Fast version of 2^x
* Identity: exp2(a + b) = exp2(a) * exp2(b)
- * Let ipart = int(k*x)
- * Let fpart = k*x - ipart;
- * So, exp2(k*x) = exp2(ipart) * exp2(fpart)
+ * Let ipart = int(x)
+ * Let fpart = x - ipart;
+ * So, exp2(x) = exp2(ipart) * exp2(fpart)
* Compute exp2(ipart) with i << ipart
* Compute exp2(fpart) with lookup table.
*/
static INLINE float
-util_fast_exp(float x)
+util_fast_exp2(float x)
{
- if (x >= 0.0f) {
- float k = 1.44269f; /* = log2(e) */
- float kx = k * x;
- int ipart = (int) kx;
- float fpart = kx - (float) ipart;
- float y = (float) (1 << ipart)
- * pow2_table[(int) (fpart * POW2_TABLE_SCALE)];
- return y;
- }
- else {
- /* exp(-x) = 1.0 / exp(x) */
- float k = -1.44269f;
- float kx = k * x;
- int ipart = (int) kx;
- float fpart = kx - (float) ipart;
- float y = (float) (1 << ipart)
- * pow2_table[(int) (fpart * POW2_TABLE_SCALE)];
- return 1.0f / y;
- }
+   int32_t ipart;
+   float fpart, mpart;
+   union fi epart;
+
+   /* 2^128 is not representable as a finite float.  For x >= 128 the
+    * biased exponent (ipart + 127) computed below would reach 255, i.e.
+    * the infinity encoding, and at 129 the shift would overflow into the
+    * sign bit.  Saturate to the largest finite float instead.
+    */
+   if(x >= 128.0f)
+      return 3.402823466e+38f;
+
+   /* Inputs below this bound return zero. */
+   if(x < -126.99999f)
+      return 0.0f;
+
+   /* The cast truncates toward zero, so fpart lies in ]-1.0, 1.0[, which
+    * is the range pow2_table covers. */
+   ipart = (int32_t) x;
+   fpart = x - (float) ipart;
+
+   /* same as
+    *   epart.f = (float) (1 << ipart)
+    * but faster and without integer overflow for ipart > 31 */
+   epart.i = (ipart + 127 ) << 23;
+
+   /* table lookup of 2^fpart; the offset maps negative fpart to the lower
+    * half of the table */
+   mpart = pow2_table[POW2_TABLE_OFFSET + (int)(fpart * POW2_TABLE_SCALE)];
+
+   return epart.f * mpart;
}
/**
- * Fast version of 2^x
- * XXX the above function could be implemented in terms of this one.
+ * Fast approximation to exp(x).
*/
static INLINE float
-util_fast_exp2(float x)
+util_fast_exp(float x)
{
- if (x >= 0.0f) {
- int ipart = (int) x;
- float fpart = x - (float) ipart;
- float y = (float) (1 << ipart)
- * pow2_table[(int) (fpart * POW2_TABLE_SCALE)];
- return y;
- }
- else {
- /* exp(-x) = 1.0 / exp(x) */
- int ipart = (int) -x;
- float fpart = -x - (float) ipart;
- float y = (float) (1 << ipart)
- * pow2_table[(int) (fpart * POW2_TABLE_SCALE)];
- return 1.0f / y;
- }
+   /* exp(x) == exp2(log2(e) * x), with log2(e) ~= 1.44269 */
+   const float log2_e = 1.44269f;
+
+   return util_fast_exp2(log2_e * x);
}
-/**
- * Based on code from http://www.flipcode.com/archives/Fast_log_Function.shtml
- */
+#define LOG2_TABLE_SIZE_LOG2 8
+#define LOG2_TABLE_SIZE (1 << LOG2_TABLE_SIZE_LOG2)
+extern float log2_table[LOG2_TABLE_SIZE];
+
+
static INLINE float
-util_fast_log2(float val)
+util_fast_log2(float x)
{
union fi num;
- int log_2;
- num.f = val;
- log_2 = ((num.i >> 23) & 255) - 128;
- num.i &= ~(255 << 23);
- num.i += 127 << 23;
- num.f = ((-1.0f/3) * num.f + 2) * num.f - 2.0f/3;
- return num.f + log_2;
+   int32_t exponent, index;
+
+   num.f = x;
+
+   /* exponent field with the bias of 127 removed; the sign bit is masked
+    * off and therefore ignored */
+   exponent = ((num.i & 0x7f800000) >> 23) - 127;
+
+   /* the top LOG2_TABLE_SIZE_LOG2 mantissa bits select the table entry */
+   index = (num.i & 0x007fffff) >> (23 - LOG2_TABLE_SIZE_LOG2);
+
+   return (float) exponent + log2_table[index];
}
static INLINE float
util_fast_pow(float x, float y)
{
- /* XXX these tests may need adjustment */
- if (y >= 3.0f && (-0.02f <= x && x <= 0.02f))
- return 0.0f;
- if (y >= 50.0f && (-0.9f <= x && x <= 0.9f))
- return 0.0f;
+ /* x^y computed as 2^(y * log2(x)) using the two fast approximations.
+  * util_fast_log2() masks off the sign bit, so the sign of x is ignored.
+  */
return util_fast_exp2(util_fast_log2(x) * y);
}
diff --git a/src/gallium/auxiliary/util/u_sse.h b/src/gallium/auxiliary/util/u_sse.h
index 0c8356cd05..e2a8491e62 100644
--- a/src/gallium/auxiliary/util/u_sse.h
+++ b/src/gallium/auxiliary/util/u_sse.h
@@ -37,6 +37,10 @@
#ifndef U_SSE_H_
#define U_SSE_H_
+#include "pipe/p_config.h"
+
+#if defined(PIPE_ARCH_SSE)
+
#include <xmmintrin.h>
#include <emmintrin.h>
@@ -66,7 +70,8 @@ _mm_castps_si128(__m128 a)
return u.m128i;
}
-#endif
+#endif /* defined(_MSC_VER) && _MSC_VER < 1500 */
+#endif /* PIPE_ARCH_SSE */
#endif /* U_SSE_H_ */
diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index 99329fd8e2..b0169b8e32 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -67,6 +67,7 @@
#define CELL_MAX_SPUS 6
#define CELL_MAX_SAMPLERS 4
+#define CELL_MAX_TEXTURE_LEVELS 12 /* 2k x 2k */
#define TILE_SIZE 32
@@ -94,6 +95,7 @@
#define CELL_CMD_STATE_BIND_VS 18
#define CELL_CMD_STATE_FRAGMENT_PROGRAM 19
#define CELL_CMD_STATE_ATTRIB_FETCH 20
+#define CELL_CMD_STATE_FS_CONSTANTS 21
#define CELL_CMD_VS_EXECUTE 22
#define CELL_CMD_FLUSH_BUFFER_RANGE 23
@@ -127,7 +129,7 @@ struct cell_command_fragment_ops
/** Max instructions for fragment programs */
-#define SPU_MAX_FRAGMENT_PROGRAM_INSTS 128
+#define SPU_MAX_FRAGMENT_PROGRAM_INSTS 512
/**
* Command to send a fragment program to SPUs.
@@ -227,6 +229,7 @@ struct cell_command_render
float xmin, ymin, xmax, ymax; /* XXX another dummy field */
uint min_index;
boolean inline_verts;
+ uint front_winding; /* the rasterizer needs to be able to determine facing to apply front/back-facing stencil */
};
@@ -248,9 +251,12 @@ struct cell_command_sampler
struct cell_command_texture
{
uint64_t opcode; /**< CELL_CMD_STATE_TEXTURE */
+ uint target; /**< PIPE_TEXTURE_x */
uint unit;
- void *start; /**< Address in main memory */
- ushort width, height;
+ void *start[CELL_MAX_TEXTURE_LEVELS]; /**< Address in main memory */
+ ushort width[CELL_MAX_TEXTURE_LEVELS];
+ ushort height[CELL_MAX_TEXTURE_LEVELS];
+ ushort depth[CELL_MAX_TEXTURE_LEVELS];
};
diff --git a/src/gallium/drivers/cell/ppu/cell_context.c b/src/gallium/drivers/cell/ppu/cell_context.c
index 62e213ea35..b66aa9c9d9 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.c
+++ b/src/gallium/drivers/cell/ppu/cell_context.c
@@ -62,6 +62,8 @@ cell_destroy_context( struct pipe_context *pipe )
{
struct cell_context *cell = cell_context(pipe);
+ util_delete_keymap(cell->fragment_ops_cache, NULL);
+
cell_spu_exit(cell);
align_free(cell);
@@ -126,11 +128,14 @@ cell_create_context(struct pipe_screen *screen,
cell_init_state_functions(cell);
cell_init_shader_functions(cell);
cell_init_surface_functions(cell);
- cell_init_texture_functions(cell);
cell_init_vertex_functions(cell);
cell->draw = cell_draw_create(cell);
+ /* Create cache of fragment ops generated code */
+ cell->fragment_ops_cache =
+ util_new_keymap(sizeof(struct cell_fragment_ops_key), ~0, NULL);
+
cell_init_vbuf(cell);
draw_set_rasterize_stage(cell->draw, cell->vbuf);
@@ -156,5 +161,8 @@ cell_create_context(struct pipe_screen *screen,
cell_init_batch_buffers(cell);
+ /* make sure SPU initializations are done before proceeding */
+ cell_flush_int(cell, CELL_FLUSH_WAIT);
+
return &cell->pipe;
}
diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h
index 3dc15c9233..80a9b3d7e1 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.h
+++ b/src/gallium/drivers/cell/ppu/cell_context.h
@@ -38,6 +38,7 @@
#include "cell/common.h"
#include "rtasm/rtasm_ppc_spe.h"
#include "tgsi/tgsi_scan.h"
+#include "util/u_keymap.h"
struct cell_vbuf_render;
@@ -67,6 +68,19 @@ struct cell_fragment_shader_state
/**
+ * Key for mapping per-fragment state to cached SPU machine code.
+ * keymap(cell_fragment_ops_key) => cell_command_fragment_ops
+ */
+struct cell_fragment_ops_key
+{
+ /* NOTE(review): the cache keymap is created with
+  * sizeof(struct cell_fragment_ops_key) as its key size, so keys are
+  * presumably hashed/compared as raw bytes; if so, any padding must be
+  * zeroed before a key is used - confirm against u_keymap.c.
+  */
+ /** blend state part of the key */
+ struct pipe_blend_state blend;
+ /** depth/stencil/alpha state part of the key */
+ struct pipe_depth_stencil_alpha_state dsa;
+ /** presumably the color buffer format - TODO confirm */
+ enum pipe_format color_format;
+ /** presumably the depth/stencil buffer format - TODO confirm */
+ enum pipe_format zs_format;
+};
+
+
+/**
* Per-context state, subclass of pipe_context.
*/
struct cell_context
@@ -107,6 +121,9 @@ struct cell_context
uint dirty;
+ /** Cache of code generated for per-fragment ops */
+ struct keymap *fragment_ops_cache;
+
/** The primitive drawing context */
struct draw_context *draw;
struct draw_stage *render_stage;
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index 8d2d4f2a0f..3dfd5f673d 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -77,13 +77,15 @@ struct codegen
/** Per-instruction temps / intermediate temps */
int num_itemps;
- int itemps[10];
+ int itemps[12];
/** Current IF/ELSE/ENDIF nesting level */
int if_nesting;
/** Index of execution mask register */
int exec_mask_reg;
+ int frame_size; /**< Stack frame size, in words */
+
struct spe_function *f;
boolean error;
};
@@ -165,6 +167,37 @@ get_exec_mask_reg(struct codegen *gen)
}
+/**
+ * Return TRUE when the given channel of a source operand uses a plain
+ * X/Y/Z/W swizzle with its sign left unmodified AND the operand lives in
+ * the TEMPORARY or IMMEDIATE register file; FALSE otherwise.
+ */
+static boolean
+is_register_src(struct codegen *gen, int channel,
+                const struct tgsi_full_src_register *src)
+{
+   const int swz = tgsi_util_get_full_src_register_extswizzle(src, channel);
+   const int sign_mode = tgsi_util_get_full_src_register_sign_mode(src, channel);
+
+   if (swz <= TGSI_SWIZZLE_W && sign_mode == TGSI_UTIL_SIGN_KEEP) {
+      switch (src->SrcRegister.File) {
+      case TGSI_FILE_TEMPORARY:
+      case TGSI_FILE_IMMEDIATE:
+         return TRUE;
+      default:
+         break;
+      }
+   }
+   return FALSE;
+}
+
+
+/**
+ * Return TRUE when the destination operand is in the OUTPUT register
+ * file, FALSE for every other file.
+ */
+static boolean
+is_memory_dst(struct codegen *gen, int channel,
+              const struct tgsi_full_dst_register *dst)
+{
+   return (dst->DstRegister.File == TGSI_FILE_OUTPUT) ? TRUE : FALSE;
+}
+
+
/**
* Return the index of the SPU temporary containing the named TGSI
* source register. If the TGSI register is a TGSI_FILE_TEMPORARY we
@@ -185,41 +218,48 @@ get_src_reg(struct codegen *gen,
assert(swizzle >= TGSI_SWIZZLE_X);
assert(swizzle <= TGSI_EXTSWIZZLE_ONE);
- switch (src->SrcRegister.File) {
- case TGSI_FILE_TEMPORARY:
- reg = gen->temp_regs[src->SrcRegister.Index][swizzle];
- break;
- case TGSI_FILE_INPUT:
- {
- if(swizzle == TGSI_EXTSWIZZLE_ONE)
- {
- /* Load const one float and early out */
- reg = get_const_one_reg(gen);
- }
- else if(swizzle == TGSI_EXTSWIZZLE_ZERO)
+ if (swizzle == TGSI_EXTSWIZZLE_ONE) {
+ /* Load const one float and early out */
+ reg = get_const_one_reg(gen);
+ }
+ else if (swizzle == TGSI_EXTSWIZZLE_ZERO) {
+ /* Load const zero float and early out */
+ reg = get_itemp(gen);
+ spe_xor(gen->f, reg, reg, reg);
+ }
+ else {
+ assert(swizzle < 4);
+
+ switch (src->SrcRegister.File) {
+ case TGSI_FILE_TEMPORARY:
+ reg = gen->temp_regs[src->SrcRegister.Index][swizzle];
+ break;
+ case TGSI_FILE_INPUT:
{
- /* Load const zero float and early out */
+ /* offset is measured in quadwords, not bytes */
+ int offset = src->SrcRegister.Index * 4 + swizzle;
reg = get_itemp(gen);
- spe_xor(gen->f, reg, reg, reg);
+ reg_is_itemp = TRUE;
+ /* Load: reg = memory[(machine_reg) + offset] */
+ spe_lqd(gen->f, reg, gen->inputs_reg, offset * 16);
}
- else
+ break;
+ case TGSI_FILE_IMMEDIATE:
+ reg = gen->imm_regs[src->SrcRegister.Index][swizzle];
+ break;
+ case TGSI_FILE_CONSTANT:
{
/* offset is measured in quadwords, not bytes */
int offset = src->SrcRegister.Index * 4 + swizzle;
reg = get_itemp(gen);
reg_is_itemp = TRUE;
/* Load: reg = memory[(machine_reg) + offset] */
- spe_lqd(gen->f, reg, gen->inputs_reg, offset);
+ spe_lqd(gen->f, reg, gen->constants_reg, offset * 16);
}
+ break;
+ default:
+ assert(0);
}
- break;
- case TGSI_FILE_IMMEDIATE:
- reg = gen->imm_regs[src->SrcRegister.Index][swizzle];
- break;
- case TGSI_FILE_CONSTANT:
- /* xxx fall-through for now / fix */
- default:
- assert(0);
}
/*
@@ -243,7 +283,7 @@ get_src_reg(struct codegen *gen,
}
/* mask with bit 31 set, the rest cleared */
- spe_load_int(gen->f, bit31mask_reg, (1 << 31));
+ spe_load_uint(gen->f, bit31mask_reg, (1 << 31));
if (sign_op == TGSI_UTIL_SIGN_CLEAR) {
spe_andc(gen->f, result_reg, reg, bit31mask_reg);
@@ -318,6 +358,7 @@ store_dest_reg(struct codegen *gen,
}
else {
/* we're not inside a condition or loop: do nothing special */
+
}
break;
case TGSI_FILE_OUTPUT:
@@ -330,17 +371,17 @@ store_dest_reg(struct codegen *gen,
/* First read the current value from memory:
* Load: curval = memory[(machine_reg) + offset]
*/
- spe_lqd(gen->f, curval_reg, gen->outputs_reg, offset);
+ spe_lqd(gen->f, curval_reg, gen->outputs_reg, offset * 16);
/* Mix curval with newvalue according to exec mask:
* d[i] = mask_reg[i] ? value_reg : d_reg
*/
spe_selb(gen->f, curval_reg, curval_reg, value_reg, exec_reg);
/* Store: memory[(machine_reg) + offset] = curval */
- spe_stqd(gen->f, curval_reg, gen->outputs_reg, offset);
+ spe_stqd(gen->f, curval_reg, gen->outputs_reg, offset * 16);
}
else {
/* Store: memory[(machine_reg) + offset] = reg */
- spe_stqd(gen->f, value_reg, gen->outputs_reg, offset);
+ spe_stqd(gen->f, value_reg, gen->outputs_reg, offset * 16);
}
}
break;
@@ -350,18 +391,95 @@ store_dest_reg(struct codegen *gen,
}
+
+/**
+ * Emit the function prologue: save the return address ($lr) at 16($sp),
+ * then lower $sp by gen->frame_size and store the previous $sp value at
+ * offset 0 of the new frame.
+ * NOTE(review): struct codegen documents frame_size as "in words", but the
+ * value is applied to $sp unscaled here - confirm the intended unit.
+ */
+static void
+emit_prologue(struct codegen *gen)
+{
+ gen->frame_size = 1024; /* XXX temporary, should be dynamic */
+
+ spe_comment(gen->f, -4, "Function prologue:");
+
+ /* save $lr on stack # stqd $lr,16($sp) */
+ spe_stqd(gen->f, SPE_REG_RA, SPE_REG_SP, 16);
+
+ /* With frame_size fixed at 1024 above, this branch is always taken. */
+ if (gen->frame_size >= 512) {
+ /* offset is too large for ai instruction */
+ int offset_reg = spe_allocate_available_register(gen->f);
+ int sp_reg = spe_allocate_available_register(gen->f);
+ /* offset = -framesize */
+ spe_load_int(gen->f, offset_reg, -gen->frame_size);
+ /* sp = $sp */
+ spe_move(gen->f, sp_reg, SPE_REG_SP);
+ /* $sp = $sp + offset_reg */
+ spe_a(gen->f, SPE_REG_SP, SPE_REG_SP, offset_reg);
+ /* save $sp in stack frame */
+ spe_stqd(gen->f, sp_reg, SPE_REG_SP, 0);
+ /* clean up */
+ spe_release_register(gen->f, offset_reg);
+ spe_release_register(gen->f, sp_reg);
+ }
+ else {
+ /* save stack pointer # stqd $sp,-frameSize($sp) */
+ spe_stqd(gen->f, SPE_REG_SP, SPE_REG_SP, -gen->frame_size);
+
+ /* adjust stack pointer # ai $sp,$sp,-frameSize */
+ spe_ai(gen->f, SPE_REG_SP, SPE_REG_SP, -gen->frame_size);
+ }
+}
+
+
+/**
+ * Emit the function epilogue: raise $sp by gen->frame_size (undoing the
+ * prologue's adjustment), reload the return address ($lr) from 16($sp)
+ * and branch to it.
+ * Relies on gen->frame_size still holding the value set by emit_prologue().
+ */
+static void
+emit_epilogue(struct codegen *gen)
+{
+ spe_comment(gen->f, -4, "Function epilogue:");
+
+ /* Same size test as in emit_prologue(), so both pick matching paths. */
+ if (gen->frame_size >= 512) {
+ /* offset is too large for ai instruction */
+ int offset_reg = spe_allocate_available_register(gen->f);
+ /* offset = framesize */
+ spe_load_int(gen->f, offset_reg, gen->frame_size);
+ /* $sp = $sp + offset */
+ spe_a(gen->f, SPE_REG_SP, SPE_REG_SP, offset_reg);
+ /* clean up */
+ spe_release_register(gen->f, offset_reg);
+ }
+ else {
+ /* restore stack pointer # ai $sp,$sp,frameSize */
+ spe_ai(gen->f, SPE_REG_SP, SPE_REG_SP, gen->frame_size);
+ }
+
+ /* restore $lr # lqd $lr,16($sp) */
+ spe_lqd(gen->f, SPE_REG_RA, SPE_REG_SP, 16);
+
+ /* return from function call */
+ spe_bi(gen->f, SPE_REG_RA, 0, 0);
+}
+
+
static boolean
emit_MOV(struct codegen *gen, const struct tgsi_full_instruction *inst)
{
- int ch;
+ int ch, src_reg[4], dst_reg[4];
+
spe_comment(gen->f, -4, "MOV:");
for (ch = 0; ch < 4; ch++) {
if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
- int src_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
- int dst_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
- /* XXX we don't always need to actually emit a mov instruction here */
- spe_move(gen->f, dst_reg, src_reg);
- store_dest_reg(gen, dst_reg, ch, &inst->FullDstRegisters[0]);
+ src_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ dst_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ }
+ }
+
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ if (is_register_src(gen, ch, &inst->FullSrcRegisters[0]) &&
+ is_memory_dst(gen, ch, &inst->FullDstRegisters[0])) {
+ /* special-case: register to memory store */
+ store_dest_reg(gen, src_reg[ch], ch, &inst->FullDstRegisters[0]);
+ }
+ else {
+ spe_move(gen->f, dst_reg[ch], src_reg[ch]);
+ store_dest_reg(gen, dst_reg[ch], ch, &inst->FullDstRegisters[0]);
+ }
free_itemps(gen);
}
}
@@ -376,22 +494,25 @@ emit_MOV(struct codegen *gen, const struct tgsi_full_instruction *inst)
static boolean
emit_ADD(struct codegen *gen, const struct tgsi_full_instruction *inst)
{
- int ch;
+ int ch, s1_reg[4], s2_reg[4], d_reg[4];
+
spe_comment(gen->f, -4, "ADD:");
- /* Loop over Red/Green/Blue/Alpha channels */
+ /* Loop over Red/Green/Blue/Alpha channels, fetch src operands */
for (ch = 0; ch < 4; ch++) {
/* If the dest R, G, B or A writemask is enabled... */
if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
- /* get indexes of the two src, one dest SPE registers */
- int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
- int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
- int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
-
+ s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+ d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ }
+ }
+ /* Loop over Red/Green/Blue/Alpha channels, do the add, store results */
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
/* Emit actual SPE instruction: d = s1 + s2 */
- spe_fa(gen->f, d_reg, s1_reg, s2_reg);
-
+ spe_fa(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]);
/* Store the result (a no-op for TGSI_FILE_TEMPORARY dests) */
- store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+ store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
/* Free any intermediate temps we allocated */
free_itemps(gen);
}
@@ -405,23 +526,20 @@ emit_ADD(struct codegen *gen, const struct tgsi_full_instruction *inst)
static boolean
emit_SUB(struct codegen *gen, const struct tgsi_full_instruction *inst)
{
- int ch;
+ int ch, s1_reg[4], s2_reg[4], d_reg[4];
spe_comment(gen->f, -4, "SUB:");
- /* Loop over Red/Green/Blue/Alpha channels */
for (ch = 0; ch < 4; ch++) {
- /* If the dest R, G, B or A writemask is enabled... */
if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
- /* get indexes of the two src, one dest SPE registers */
- int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
- int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
- int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
-
- /* Emit actual SPE instruction: d = s1 - s2 */
- spe_fs(gen->f, d_reg, s1_reg, s2_reg);
-
- /* Store the result (a no-op for TGSI_FILE_TEMPORARY dests) */
- store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
- /* Free any intermediate temps we allocated */
+ s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+ d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ }
+ }
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ /* d = s1 - s2 */
+ spe_fs(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]);
+ store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
free_itemps(gen);
}
}
@@ -434,17 +552,21 @@ emit_SUB(struct codegen *gen, const struct tgsi_full_instruction *inst)
static boolean
emit_MAD(struct codegen *gen, const struct tgsi_full_instruction *inst)
{
- int ch;
+ int ch, s1_reg[4], s2_reg[4], s3_reg[4], d_reg[4];
spe_comment(gen->f, -4, "MAD:");
for (ch = 0; ch < 4; ch++) {
if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
- int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
- int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
- int s3_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]);
- int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+ s3_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]);
+ d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ }
+ }
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
/* d = s1 * s2 + s3 */
- spe_fma(gen->f, d_reg, s1_reg, s2_reg, s3_reg);
- store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+ spe_fma(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch], s3_reg[ch]);
+ store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
free_itemps(gen);
}
}
@@ -458,21 +580,37 @@ emit_MAD(struct codegen *gen, const struct tgsi_full_instruction *inst)
static boolean
emit_LERP(struct codegen *gen, const struct tgsi_full_instruction *inst)
{
- int ch;
+ /* Per-channel register indexes; entries for channels outside the write
+  * mask are left unset and are never read (every loop below repeats the
+  * same write-mask test).
+  */
+ int ch, s1_reg[4], s2_reg[4], s3_reg[4], d_reg[4], tmp_reg[4];
spe_comment(gen->f, -4, "LERP:");
+ /* setup/get src/dst/temp regs */
+ /* NOTE(review): one itemp per enabled channel is held until the single
+  * free_itemps() at the end, on top of any itemps get_src_reg() hands out
+  * (it calls get_itemp() for INPUT/CONSTANT loads); verify the worst case
+  * fits in gen->itemps[12].
+  */
for (ch = 0; ch < 4; ch++) {
if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
- int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
- int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
- int s3_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]);
- int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
- /* d = s3 + s1(s2 - s3) */
- spe_fs(gen->f, d_reg, s2_reg, s3_reg);
- spe_fma(gen->f, d_reg, d_reg, s1_reg, s3_reg);
- store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
- free_itemps(gen);
+ s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+ s3_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]);
+ d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ tmp_reg[ch] = get_itemp(gen);
+ }
+ }
+
+ /* d = s3 + s1(s2 - s3) */
+ /* do all subtracts, then all fma, then all stores to better pipeline */
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ /* tmp = s2 - s3 */
+ spe_fs(gen->f, tmp_reg[ch], s2_reg[ch], s3_reg[ch]);
+ }
+ }
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ /* d = tmp * s1 + s3 */
+ spe_fma(gen->f, d_reg[ch], tmp_reg[ch], s1_reg[ch], s3_reg[ch]);
}
}
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
+ }
+ }
+ free_itemps(gen);
return true;
}
@@ -482,16 +620,20 @@ emit_LERP(struct codegen *gen, const struct tgsi_full_instruction *inst)
static boolean
emit_MUL(struct codegen *gen, const struct tgsi_full_instruction *inst)
{
- int ch;
+ int ch, s1_reg[4], s2_reg[4], d_reg[4];
spe_comment(gen->f, -4, "MUL:");
for (ch = 0; ch < 4; ch++) {
if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
- int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
- int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
- int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+ d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ }
+ }
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
/* d = s1 * s2 */
- spe_fm(gen->f, d_reg, s1_reg, s2_reg);
- store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+ spe_fm(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]);
+ store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
free_itemps(gen);
}
}
@@ -557,7 +699,7 @@ emit_ABS(struct codegen *gen, const struct tgsi_full_instruction *inst)
const int bit31mask_reg = get_itemp(gen);
/* mask with bit 31 set, the rest cleared */
- spe_load_int(gen->f, bit31mask_reg, (1 << 31));
+ spe_load_uint(gen->f, bit31mask_reg, (1 << 31));
/* d = sign bit cleared in s1 */
spe_andc(gen->f, d_reg, s1_reg, bit31mask_reg);
@@ -576,27 +718,36 @@ static boolean
emit_DP3(struct codegen *gen, const struct tgsi_full_instruction *inst)
{
int ch;
+ int s1x_reg, s1y_reg, s1z_reg;
+ int s2x_reg, s2y_reg, s2z_reg;
+ int t0_reg = get_itemp(gen), t1_reg = get_itemp(gen);
+
spe_comment(gen->f, -4, "DP3:");
- int s1_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]);
- int s2_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]);
- int tmp_reg = get_itemp(gen);
- /* t = x0 * x1 */
- spe_fm(gen->f, tmp_reg, s1_reg, s2_reg);
+ s1x_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]);
+ s2x_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]);
+ s1y_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]);
+ s2y_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]);
+ s1z_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]);
+ s2z_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]);
- s1_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]);
- s2_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]);
- /* t = y0 * y1 + t */
- spe_fma(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg);
+ /* t0 = x0 * x1 */
+ spe_fm(gen->f, t0_reg, s1x_reg, s2x_reg);
- s1_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]);
- s2_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]);
- /* t = z0 * z1 + t */
- spe_fma(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg);
+ /* t1 = y0 * y1 */
+ spe_fm(gen->f, t1_reg, s1y_reg, s2y_reg);
+
+ /* t0 = z0 * z1 + t0 */
+ spe_fma(gen->f, t0_reg, s1z_reg, s2z_reg, t0_reg);
+
+ /* t0 = t0 + t1 */
+ spe_fa(gen->f, t0_reg, t0_reg, t1_reg);
for (ch = 0; ch < 4; ch++) {
if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
- store_dest_reg(gen, tmp_reg, ch, &inst->FullDstRegisters[0]);
+ int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ spe_move(gen->f, d_reg, t0_reg);
+ store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
}
}
@@ -611,32 +762,41 @@ static boolean
emit_DP4(struct codegen *gen, const struct tgsi_full_instruction *inst)
{
int ch;
+ int s0x_reg, s0y_reg, s0z_reg, s0w_reg;
+ int s1x_reg, s1y_reg, s1z_reg, s1w_reg;
+ int t0_reg = get_itemp(gen), t1_reg = get_itemp(gen);
+
spe_comment(gen->f, -4, "DP4:");
- int s1_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]);
- int s2_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]);
- int tmp_reg = get_itemp(gen);
- /* t = x0 * x1 */
- spe_fm(gen->f, tmp_reg, s1_reg, s2_reg);
+ s0x_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]);
+ s1x_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]);
+ s0y_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]);
+ s1y_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]);
+ s0z_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]);
+ s1z_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]);
+ s0w_reg = get_src_reg(gen, CHAN_W, &inst->FullSrcRegisters[0]);
+ s1w_reg = get_src_reg(gen, CHAN_W, &inst->FullSrcRegisters[1]);
- s1_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]);
- s2_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]);
- /* t = y0 * y1 + t */
- spe_fma(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg);
+ /* t0 = x0 * x1 */
+ spe_fm(gen->f, t0_reg, s0x_reg, s1x_reg);
- s1_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]);
- s2_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]);
- /* t = z0 * z1 + t */
- spe_fma(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg);
+ /* t1 = y0 * y1 */
+ spe_fm(gen->f, t1_reg, s0y_reg, s1y_reg);
- s1_reg = get_src_reg(gen, CHAN_W, &inst->FullSrcRegisters[0]);
- s2_reg = get_src_reg(gen, CHAN_W, &inst->FullSrcRegisters[1]);
- /* t = w0 * w1 + t */
- spe_fma(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg);
+ /* t0 = z0 * z1 + t0 */
+ spe_fma(gen->f, t0_reg, s0z_reg, s1z_reg, t0_reg);
+
+ /* t1 = w0 * w1 + t1 */
+ spe_fma(gen->f, t1_reg, s0w_reg, s1w_reg, t1_reg);
+
+ /* t0 = t0 + t1 */
+ spe_fa(gen->f, t0_reg, t0_reg, t1_reg);
for (ch = 0; ch < 4; ch++) {
if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
- store_dest_reg(gen, tmp_reg, ch, &inst->FullDstRegisters[0]);
+ int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ spe_move(gen->f, d_reg, t0_reg);
+ store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
}
}
@@ -650,6 +810,7 @@ emit_DP4(struct codegen *gen, const struct tgsi_full_instruction *inst)
static boolean
emit_DPH(struct codegen *gen, const struct tgsi_full_instruction *inst)
{
+ /* XXX rewrite this function to look more like DP3/DP4 */
int ch;
spe_comment(gen->f, -4, "DPH:");
@@ -676,6 +837,8 @@ emit_DPH(struct codegen *gen, const struct tgsi_full_instruction *inst)
for (ch = 0; ch < 4; ch++) {
if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ spe_move(gen->f, d_reg, tmp_reg);
store_dest_reg(gen, tmp_reg, ch, &inst->FullDstRegisters[0]);
}
}
@@ -1016,15 +1179,15 @@ emit_FLR(struct codegen *gen, const struct tgsi_full_instruction *inst)
int tmp_reg = get_itemp(gen);
/* If negative, subtract 1.0 */
- spe_fcgt(gen->f, d_reg, zero_reg, s1_reg);
- spe_selb(gen->f, tmp_reg, zero_reg, get_const_one_reg(gen), d_reg);
- spe_fs(gen->f, d_reg, s1_reg, tmp_reg);
+ spe_fcgt(gen->f, tmp_reg, zero_reg, s1_reg);
+ spe_selb(gen->f, tmp_reg, zero_reg, get_const_one_reg(gen), tmp_reg);
+ spe_fs(gen->f, tmp_reg, s1_reg, tmp_reg);
/* Convert float to int */
- spe_cflts(gen->f, d_reg, d_reg, 0);
+ spe_cflts(gen->f, tmp_reg, tmp_reg, 0);
/* Convert int to float */
- spe_csflt(gen->f, d_reg, d_reg, 0);
+ spe_csflt(gen->f, d_reg, tmp_reg, 0);
store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
free_itemps(gen);
@@ -1035,15 +1198,14 @@ emit_FLR(struct codegen *gen, const struct tgsi_full_instruction *inst)
}
/**
- * Emit frac.
- * Input - FLR(Input)
+ * Compute frac = Input - FLR(Input)
*/
static boolean
emit_FRC(struct codegen *gen, const struct tgsi_full_instruction *inst)
{
int ch;
- spe_comment(gen->f, -4, "FLR:");
+ spe_comment(gen->f, -4, "FRC:");
int zero_reg = get_itemp(gen);
spe_xor(gen->f, zero_reg, zero_reg, zero_reg);
@@ -1055,18 +1217,18 @@ emit_FRC(struct codegen *gen, const struct tgsi_full_instruction *inst)
int tmp_reg = get_itemp(gen);
/* If negative, subtract 1.0 */
- spe_fcgt(gen->f, d_reg, zero_reg, s1_reg);
- spe_selb(gen->f, tmp_reg, zero_reg, get_const_one_reg(gen), d_reg);
- spe_fs(gen->f, d_reg, s1_reg, tmp_reg);
+ spe_fcgt(gen->f, tmp_reg, zero_reg, s1_reg);
+ spe_selb(gen->f, tmp_reg, zero_reg, get_const_one_reg(gen), tmp_reg);
+ spe_fs(gen->f, tmp_reg, s1_reg, tmp_reg);
/* Convert float to int */
- spe_cflts(gen->f, d_reg, d_reg, 0);
+ spe_cflts(gen->f, tmp_reg, tmp_reg, 0);
/* Convert int to float */
- spe_csflt(gen->f, d_reg, d_reg, 0);
+ spe_csflt(gen->f, tmp_reg, tmp_reg, 0);
/* d = s1 - FLR(s1) */
- spe_fs(gen->f, d_reg, s1_reg, d_reg);
+ spe_fs(gen->f, d_reg, s1_reg, tmp_reg);
store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
free_itemps(gen);
@@ -1091,6 +1253,21 @@ print_functions(struct cell_context *cell)
#endif
+/**
+ * Find the named SPU function in cell->spu_functions.
+ * Every table entry is examined, so if a name appears more than once the
+ * last matching entry is the one used.  Asserts that a non-zero address
+ * was found.
+ * \return the function's address divided by four
+ */
+static uint
+lookup_function(struct cell_context *cell, const char *funcname)
+{
+   const struct cell_spu_function_info *table = &cell->spu_functions;
+   uint byte_addr = 0;
+   uint n;
+
+   for (n = 0; n < table->num; n++) {
+      if (!strcmp(table->names[n], funcname))
+         byte_addr = table->addrs[n];
+   }
+
+   assert(byte_addr && "spu function not found");
+
+   /* discard 2 least significant bits */
+   return byte_addr / 4;
+}
+
+
/**
* Emit code to call a SPU function.
* Used to implement instructions like SIN/COS/POW/TEX/etc.
@@ -1100,77 +1277,56 @@ emit_function_call(struct codegen *gen,
const struct tgsi_full_instruction *inst,
char *funcname, uint num_args)
{
- const struct cell_spu_function_info *funcs = &gen->cell->spu_functions;
+ const uint addr = lookup_function(gen->cell, funcname);
char comment[100];
- uint addr;
int ch;
- /* XXX temporary value */
- const int frameSize = 64; /* stack frame (activation record) size */
-
assert(num_args <= 3);
- /* lookup function address */
- {
- uint i;
- addr = 0;
- for (i = 0; i < funcs->num; i++) {
- if (strcmp(funcs->names[i], funcname) == 0) {
- addr = funcs->addrs[i];
- }
- }
- assert(addr && "spu function not found");
- }
-
- addr /= 4; /* discard 2 least significant bits */
-
snprintf(comment, sizeof(comment), "CALL %s:", funcname);
spe_comment(gen->f, -4, comment);
for (ch = 0; ch < 4; ch++) {
if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
- int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
- int s_regs[3];
- uint a;
+ int s_regs[3], d_reg;
+ ubyte usedRegs[SPE_NUM_REGS];
+ uint a, i, numUsed;
+
for (a = 0; a < num_args; a++) {
s_regs[a] = get_src_reg(gen, ch, &inst->FullSrcRegisters[a]);
}
+ d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
- /* Basically:
- * save registers on stack
- * move parameters to registers 3, 4, 5...
- * call function
- * save return value (reg 3)
- * restore registers from stack
- */
-
- /* XXX hack: load first function param */
- spe_move(gen->f, 3, s_regs[0]);
+ numUsed = spe_get_registers_used(gen->f, usedRegs);
+ assert(numUsed < gen->frame_size / 16 - 2);
- /* save $lr on stack # stqd $lr,16($sp) */
- spe_stqd(gen->f, SPE_REG_RA, SPE_REG_SP, 16);
- /* save stack pointer # stqd $sp,-frameSize($sp) */
- spe_stqd(gen->f, SPE_REG_SP, SPE_REG_SP, -frameSize);
-
- /* XXX save registers to stack here */
+ /* save registers to stack */
+ for (i = 0; i < numUsed; i++) {
+ uint reg = usedRegs[i];
+ int offset = 2 + i;
+ spe_stqd(gen->f, reg, SPE_REG_SP, 16 * offset);
+ }
- /* adjust stack pointer # ai $sp,$sp,-frameSize */
- spe_ai(gen->f, SPE_REG_SP, SPE_REG_SP, -frameSize);
+ /* setup function arguments */
+ for (a = 0; a < num_args; a++) {
+ spe_move(gen->f, 3 + a, s_regs[a]);
+ }
/* branch to function, save return addr */
spe_brasl(gen->f, SPE_REG_RA, addr);
- /* restore stack pointer # ai $sp,$sp,frameSize */
- spe_ai(gen->f, SPE_REG_SP, SPE_REG_SP, frameSize);
-
- /* XXX restore registers from stack here */
-
- /* restore $lr # lqd $lr,16($sp) */
- spe_lqd(gen->f, SPE_REG_RA, SPE_REG_SP, 16);
-
- /* XXX hack: save function's return value */
+ /* save function's return value */
spe_move(gen->f, d_reg, 3);
+ /* restore registers from stack */
+ for (i = 0; i < numUsed; i++) {
+ uint reg = usedRegs[i];
+ if (reg != d_reg) {
+ int offset = 2 + i;
+ spe_lqd(gen->f, reg, SPE_REG_SP, 16 * offset);
+ }
+ }
+
store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
free_itemps(gen);
}
@@ -1180,31 +1336,114 @@ emit_function_call(struct codegen *gen,
}
+/**
+ * Emit code for a texture lookup by calling the "spu_txp" SPU function.
+ *
+ * The four channels of source register 0 are passed in registers 3..6 and
+ * the sampler unit index (taken from source register 1) in register 7.
+ * After the call, registers 3..6 are copied into the four destination
+ * registers.  Registers reported by spe_get_registers_used() are saved to
+ * the stack before the call and reloaded afterwards, except for the
+ * destination registers, which now hold the results.
+ *
+ * \return TRUE always
+ */
+static boolean
+emit_TXP(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   const uint addr = lookup_function(gen->cell, "spu_txp");
+   const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index;
+   int ch;
+   int coord_regs[4], d_regs[4];
+
+   assert(inst->FullSrcRegisters[1].SrcRegister.File == TGSI_FILE_SAMPLER);
+
+   spe_comment(gen->f, -4, "CALL txp:");
+
+   /* get src/dst reg info */
+   for (ch = 0; ch < 4; ch++) {
+      coord_regs[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+      d_regs[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+   }
+
+   {
+      ubyte usedRegs[SPE_NUM_REGS];
+      uint i, numUsed;
+
+      numUsed = spe_get_registers_used(gen->f, usedRegs);
+      /* the first two quadword slots of the frame are left untouched */
+      assert(numUsed < gen->frame_size / 16 - 2);
+
+      /* save registers to stack */
+      for (i = 0; i < numUsed; i++) {
+         uint reg = usedRegs[i];
+         int offset = 2 + i;
+         spe_stqd(gen->f, reg, SPE_REG_SP, 16 * offset);
+      }
+
+      /* setup function arguments
+       * NOTE(review): the moves are emitted in order; this presumes that no
+       * coord_regs[] entry is itself one of registers 3..6 - confirm.
+       */
+      for (i = 0; i < 4; i++) {
+         spe_move(gen->f, 3 + i, coord_regs[i]);
+      }
+      spe_load_uint(gen->f, 7, unit); /* sampler unit */
+
+      /* branch to function, save return addr */
+      spe_brasl(gen->f, SPE_REG_RA, addr);
+
+      /* save function's return values (four pixel's colors)
+       * NOTE(review): all four d_regs are written, including channels that
+       * are not enabled in the write mask - confirm this is harmless.
+       */
+      for (i = 0; i < 4; i++) {
+         spe_move(gen->f, d_regs[i], 3 + i);
+      }
+
+      /* restore registers from stack, skipping the ones that now hold
+       * the function's results
+       */
+      for (i = 0; i < numUsed; i++) {
+         uint reg = usedRegs[i];
+         if (reg != d_regs[0] &&
+             reg != d_regs[1] &&
+             reg != d_regs[2] &&
+             reg != d_regs[3]) {
+            int offset = 2 + i;
+            spe_lqd(gen->f, reg, SPE_REG_SP, 16 * offset);
+         }
+      }
+   }
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         store_dest_reg(gen, d_regs[ch], ch, &inst->FullDstRegisters[0]);
+      }
+   }
+
+   /* Release the temporaries once, after every channel has been stored
+    * (and also when the write mask is empty), as emit_MAX/emit_MIN do.
+    */
+   free_itemps(gen);
+
+   return TRUE;
+}
+
+
/**
* Emit max. See emit_SGT for comments.
*/
static boolean
emit_MAX(struct codegen *gen, const struct tgsi_full_instruction *inst)
{
- int ch;
+ int ch, s0_reg[4], s1_reg[4], d_reg[4], tmp_reg[4];
spe_comment(gen->f, -4, "MAX:");
for (ch = 0; ch < 4; ch++) {
if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
- int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
- int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
- int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ s0_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+ d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ tmp_reg[ch] = get_itemp(gen);
+ }
+ }
- /* d = (s1 > s2) ? s1 : s2 */
- spe_fcgt(gen->f, d_reg, s1_reg, s2_reg);
- spe_selb(gen->f, d_reg, s2_reg, s1_reg, d_reg);
+ /* d = (s0 > s1) ? s0 : s1 */
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ spe_fcgt(gen->f, tmp_reg[ch], s0_reg[ch], s1_reg[ch]);
+ }
+ }
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ spe_selb(gen->f, d_reg[ch], s1_reg[ch], s0_reg[ch], tmp_reg[ch]);
+ }
+ }
- store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
- free_itemps(gen);
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
}
}
+ free_itemps(gen);
return true;
}
@@ -1214,25 +1453,38 @@ emit_MAX(struct codegen *gen, const struct tgsi_full_instruction *inst)
static boolean
emit_MIN(struct codegen *gen, const struct tgsi_full_instruction *inst)
{
- int ch;
+ int ch, s0_reg[4], s1_reg[4], d_reg[4], tmp_reg[4];
spe_comment(gen->f, -4, "MIN:");
for (ch = 0; ch < 4; ch++) {
if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
- int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
- int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
- int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ s0_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+ d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ tmp_reg[ch] = get_itemp(gen);
+ }
+ }
- /* d = (s2 > s1) ? s1 : s2 */
- spe_fcgt(gen->f, d_reg, s2_reg, s1_reg);
- spe_selb(gen->f, d_reg, s2_reg, s1_reg, d_reg);
+ /* d = (s1 > s0) ? s0 : s1 */
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ spe_fcgt(gen->f, tmp_reg[ch], s1_reg[ch], s0_reg[ch]);
+ }
+ }
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ spe_selb(gen->f, d_reg[ch], s1_reg[ch], s0_reg[ch], tmp_reg[ch]);
+ }
+ }
- store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
- free_itemps(gen);
+ for (ch = 0; ch < 4; ch++) {
+ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+ store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
}
}
+ free_itemps(gen);
return true;
}
@@ -1339,8 +1591,7 @@ static boolean
emit_END(struct codegen *gen)
{
spe_comment(gen->f, -4, "END:");
- /* return from function call */
- spe_bi(gen->f, SPE_REG_RA, 0, 0);
+ emit_epilogue(gen);
return true;
}
@@ -1413,6 +1664,18 @@ emit_instruction(struct codegen *gen,
return emit_function_call(gen, inst, "spu_sin", 1);
case TGSI_OPCODE_POW:
return emit_function_call(gen, inst, "spu_pow", 2);
+ case TGSI_OPCODE_EXPBASE2:
+ return emit_function_call(gen, inst, "spu_exp2", 1);
+ case TGSI_OPCODE_LOGBASE2:
+ return emit_function_call(gen, inst, "spu_log2", 1);
+ case TGSI_OPCODE_TEX:
+ /* fall-through for now */
+ case TGSI_OPCODE_TXD:
+ /* fall-through for now */
+ case TGSI_OPCODE_TXB:
+ /* fall-through for now */
+ case TGSI_OPCODE_TXP:
+ return emit_TXP(gen, inst);
case TGSI_OPCODE_IF:
return emit_IF(gen, inst);
@@ -1456,16 +1719,23 @@ emit_immediate(struct codegen *gen, const struct tgsi_full_immediate *immed)
for (ch = 0; ch < 4; ch++) {
float val = immed->u.ImmediateFloat32[ch].Float;
- int reg = spe_allocate_available_register(gen->f);
- if (reg < 0)
- return false;
+ if (ch > 0 && val == immed->u.ImmediateFloat32[ch - 1].Float) {
+ /* re-use previous register */
+ gen->imm_regs[gen->num_imm][ch] = gen->imm_regs[gen->num_imm][ch - 1];
+ }
+ else {
+ int reg = spe_allocate_available_register(gen->f);
+
+ if (reg < 0)
+ return false;
- /* update immediate map */
- gen->imm_regs[gen->num_imm][ch] = reg;
+ /* update immediate map */
+ gen->imm_regs[gen->num_imm][ch] = reg;
- /* emit initializer instruction */
- spe_load_float(gen->f, reg, val);
+ /* emit initializer instruction */
+ spe_load_float(gen->f, reg, val);
+ }
}
gen->num_imm++;
@@ -1488,12 +1758,6 @@ emit_declaration(struct cell_context *cell,
switch (decl->Declaration.File) {
case TGSI_FILE_TEMPORARY:
- if (cell->debug_flags & CELL_DEBUG_ASM) {
- printf("Declare temp reg %d .. %d\n",
- decl->DeclarationRange.First,
- decl->DeclarationRange.Last);
- }
-
for (i = decl->DeclarationRange.First;
i <= decl->DeclarationRange.Last;
i++) {
@@ -1508,12 +1772,12 @@ emit_declaration(struct cell_context *cell,
* to SPU memory. someday...
*/
- if (cell->debug_flags & CELL_DEBUG_ASM) {
- printf(" SPE regs: %d %d %d %d\n",
- gen->temp_regs[i][0],
- gen->temp_regs[i][1],
- gen->temp_regs[i][2],
- gen->temp_regs[i][3]);
+ {
+ char buf[100];
+ sprintf(buf, "TGSI temp[%d] maps to SPU regs [$%d $%d $%d $%d]", i,
+ gen->temp_regs[i][0], gen->temp_regs[i][1],
+ gen->temp_regs[i][2], gen->temp_regs[i][3]);
+ spe_comment(gen->f, -4, buf);
}
}
break;
@@ -1525,6 +1789,7 @@ emit_declaration(struct cell_context *cell,
}
+
/**
* Translate TGSI shader code to SPE instructions. This is done when
* the state tracker gives us a new shader (via pipe->create_fs_state()).
@@ -1564,12 +1829,14 @@ cell_gen_fragment_program(struct cell_context *cell,
tgsi_parse_init(&parse, tokens);
+ emit_prologue(&gen);
+
while (!tgsi_parse_end_of_tokens(&parse) && !gen.error) {
tgsi_parse_token(&parse);
switch (parse.FullToken.Token.Type) {
case TGSI_TOKEN_TYPE_IMMEDIATE:
- if (!emit_immediate(&gen, &parse.FullToken.FullImmediate))
+ if (!emit_immediate(&gen, &parse.FullToken.FullImmediate))
gen.error = true;
break;
@@ -1588,7 +1855,6 @@ cell_gen_fragment_program(struct cell_context *cell,
}
}
-
if (gen.error) {
/* terminate the SPE code */
return emit_END(&gen);
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
index 653afc235d..4e1e53ecdc 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -54,10 +54,12 @@
* \param ifragZ_reg register containing integer fragment Z values (in)
* \param ifbZ_reg register containing integer frame buffer Z values (in/out)
* \param zmask_reg register containing result of Z test/comparison (out)
+ *
+ * Returns true if the Z-buffer needs to be updated.
*/
-static void
-gen_depth_test(const struct pipe_depth_stencil_alpha_state *dsa,
- struct spe_function *f,
+static boolean
+gen_depth_test(struct spe_function *f,
+ const struct pipe_depth_stencil_alpha_state *dsa,
int mask_reg, int ifragZ_reg, int ifbZ_reg, int zmask_reg)
{
/* NOTE: we use clgt below, not cgt, because we want to compare _unsigned_
@@ -132,7 +134,10 @@ gen_depth_test(const struct pipe_depth_stencil_alpha_state *dsa,
* framebufferZ = (ztest_passed ? fragmentZ : framebufferZ;
*/
spe_selb(f, ifbZ_reg, ifbZ_reg, ifragZ_reg, mask_reg);
+ return true;
}
+
+ return false;
}
@@ -238,22 +243,35 @@ gen_alpha_test(const struct pipe_depth_stencil_alpha_state *dsa,
* it and have to allocate and load it again unnecessarily.
*/
static inline void
-setup_const_register(struct spe_function *f, boolean *is_already_set, unsigned int *r, float value)
+setup_optional_register(struct spe_function *f, boolean *is_already_set, unsigned int *r)
{
if (*is_already_set) return;
*r = spe_allocate_available_register(f);
- spe_load_float(f, *r, value);
*is_already_set = true;
}
static inline void
-release_const_register(struct spe_function *f, boolean *is_already_set, unsigned int r)
+release_optional_register(struct spe_function *f, boolean *is_already_set, unsigned int r)
{
if (!*is_already_set) return;
spe_release_register(f, r);
*is_already_set = false;
}
+/* Allocate a register for a constant and emit the load of 'value' into it,
+ * unless *is_already_set says that this was already done.  The allocation
+ * (and the update of the flag) is delegated to setup_optional_register().
+ */
+static inline void
+setup_const_register(struct spe_function *f, boolean *is_already_set, unsigned int *r, float value)
+{
+   if (!*is_already_set) {
+      setup_optional_register(f, is_already_set, r);
+      spe_load_float(f, *r, value);
+   }
+}
+
+/* Counterpart of setup_const_register(): forwards unchanged to
+ * release_optional_register(), which releases the register only if the
+ * flag shows it was set up and then clears the flag.
+ */
+static inline void
+release_const_register(struct spe_function *f, boolean *is_already_set, unsigned int r)
+{
+ release_optional_register(f, is_already_set, r);
+}
+
/**
* Generate SPE code to implement the given blend mode for a quad of pixels.
* \param f SPE function to append instruction onto.
@@ -1117,6 +1135,666 @@ gen_colormask(struct spe_function *f,
spe_release_register(f, colormask_reg);
}
+/* This function is annoyingly similar to gen_depth_test(), above, except
+ * that instead of comparing two varying values (i.e. fragment and buffer),
+ * we're comparing a varying value with a static value. As such, we have
+ * access to the Compare Immediate instructions where we don't in
+ * gen_depth_test(), which is what makes us very different.
+ *
+ * The return value in the stencil_pass_reg is a bitmask of valid
+ * fragments that also passed the stencil test. The bitmask of valid
+ * fragments that failed would be found in (mask_reg & ~stencil_pass_reg).
+ */
+static void
+gen_stencil_test(struct spe_function *f, const struct pipe_stencil_state *state,
+                 unsigned int mask_reg, unsigned int fbS_reg,
+                 unsigned int stencil_pass_reg)
+{
+   /* Generate code that puts the set of passing fragments into the stencil_pass_reg
+    * register, taking into account whether each fragment was active to begin with.
+    *
+    * NOTE(review): every case below evaluates (s FUNC reference), where s is
+    * the stored stencil value.  OpenGL defines the stencil test as
+    * (reference FUNC s); confirm which operand order the pipe state expects.
+    */
+   switch (state->func) {
+   case PIPE_FUNC_EQUAL:
+      /* stencil_pass = mask & (s == reference) */
+      spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+      spe_and(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+      break;
+
+   case PIPE_FUNC_NOTEQUAL:
+      /* stencil_pass = mask & ~(s == reference) */
+      spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+      spe_andc(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+      break;
+
+   case PIPE_FUNC_GREATER:
+      /* stencil_pass = mask & (s > reference) */
+      spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+      spe_and(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+      break;
+
+   case PIPE_FUNC_LESS: {
+      /* stencil_pass = mask & (reference > s) */
+      /* There's no convenient Compare Less Than Immediate instruction, so
+       * we'll have to do this one the harder way, by loading a register and
+       * comparing directly.  Compare Logical Greater Than Word (clgt)
+       * treats its operands as unsigned - no sign extension.
+       */
+      unsigned int tmp_reg = spe_allocate_available_register(f);
+      spe_load_uint(f, tmp_reg, state->ref_value);
+      spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg);
+      spe_and(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+      spe_release_register(f, tmp_reg);
+      break;
+   }
+
+   case PIPE_FUNC_LEQUAL:
+      /* stencil_pass = mask & (s <= reference) = mask & ~(s > reference) */
+      spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+      spe_andc(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+      break;
+
+   case PIPE_FUNC_GEQUAL: {
+      /* stencil_pass = mask & (s >= reference) = mask & ~(reference > s) */
+      /* As above, we have to do this by loading a register */
+      unsigned int tmp_reg = spe_allocate_available_register(f);
+      spe_load_uint(f, tmp_reg, state->ref_value);
+      spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg);
+      spe_andc(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+      spe_release_register(f, tmp_reg);
+      break;
+   }
+
+   case PIPE_FUNC_NEVER:
+      /* stencil_pass = mask & 0 = 0 */
+      spe_load_uint(f, stencil_pass_reg, 0);
+      break;
+
+   case PIPE_FUNC_ALWAYS:
+      /* stencil_pass = mask & 1 = mask */
+      spe_move(f, stencil_pass_reg, mask_reg);
+      break;
+
+   default:
+      /* Unknown comparison function: no code is generated, so
+       * stencil_pass_reg would be left unwritten.  Flag it loudly.
+       */
+      ASSERT(0);
+      break;
+   }
+
+   /* The fragments that passed the stencil test are now in stencil_pass_reg.
+    * The fragments that failed would be (mask_reg & ~stencil_pass_reg).
+    */
+}
+
+/* This function generates code that calculates a set of new stencil values
+ * given the earlier values and the operation to apply. It does not
+ * apply any tests. It is intended to be called up to 3 times
+ * (for the stencil fail operation, for the stencil pass-z fail operation,
+ * and for the stencil pass-z pass operation) to collect up to three
+ * possible sets of values, and for the caller to combine them based
+ * on the result of the tests.
+ *
+ * stencil_max_value should be (2^n - 1) where n is the number of bits
+ * in the stencil buffer - in other words, it should be usable as a mask.
+ */
+static void
+gen_stencil_values(struct spe_function *f, unsigned int stencil_op,
+                   unsigned int stencil_ref_value, unsigned int stencil_max_value,
+                   unsigned int fbS_reg, unsigned int newS_reg)
+{
+   /* The code below assumes that newS_reg and fbS_reg are not the same
+    * register; if they can be, the calculations below will have to use
+    * an additional temporary register.  For now, mark the assumption
+    * with an assertion that will fail if they are the same.
+    */
+   ASSERT(fbS_reg != newS_reg);
+
+   /* The code also assumes that the stencil_max_value is of the form
+    * 2^n-1 and can therefore be used as a mask for the valid bits in
+    * addition to a maximum.  Make sure this is the case as well.
+    * The clever math below exploits the fact that incrementing a
+    * binary number serves to flip all the bits of a number starting at
+    * the LSB and continuing to (and including) the first zero bit
+    * found.  That means that a number and its increment will always
+    * have at least one bit in common (the high order bit, if nothing
+    * else) *unless* the number is zero, *or* the number is of a form
+    * consisting of some number of 1s in the low-order bits followed
+    * by nothing but 0s in the high-order bits.  The latter case
+    * implies it's of the form 2^n-1.
+    */
+   ASSERT(stencil_max_value > 0 && ((stencil_max_value + 1) & stencil_max_value) == 0);
+
+   switch (stencil_op) {
+   case PIPE_STENCIL_OP_KEEP:
+      /* newS = S */
+      spe_move(f, newS_reg, fbS_reg);
+      break;
+
+   case PIPE_STENCIL_OP_ZERO:
+      /* newS = 0 */
+      spe_zero(f, newS_reg);
+      break;
+
+   case PIPE_STENCIL_OP_REPLACE:
+      /* newS = stencil reference value */
+      spe_load_uint(f, newS_reg, stencil_ref_value);
+      break;
+
+   case PIPE_STENCIL_OP_INCR: {
+      /* newS = (s == max ? max : s + 1) */
+      unsigned int equals_reg = spe_allocate_available_register(f);
+
+      spe_compare_equal_uint(f, equals_reg, fbS_reg, stencil_max_value);
+      /* Add Word Immediate computes rT = rA + 10-bit signed immediate */
+      spe_ai(f, newS_reg, fbS_reg, 1);
+      /* selb takes its result from the second source operand wherever the
+       * mask is set, so the saturated (unmodified) value goes there:
+       * keep s where s == max, take s + 1 everywhere else.
+       */
+      spe_selb(f, newS_reg, newS_reg, fbS_reg, equals_reg);
+
+      spe_release_register(f, equals_reg);
+      break;
+   }
+   case PIPE_STENCIL_OP_DECR: {
+      /* newS = (s == 0 ? 0 : s - 1) */
+      unsigned int equals_reg = spe_allocate_available_register(f);
+
+      spe_compare_equal_uint(f, equals_reg, fbS_reg, 0);
+      /* Add Word Immediate with a (-1) value works */
+      spe_ai(f, newS_reg, fbS_reg, -1);
+      /* As for INCR: keep s (which is 0) where s == 0, take s - 1
+       * everywhere else.
+       */
+      spe_selb(f, newS_reg, newS_reg, fbS_reg, equals_reg);
+
+      spe_release_register(f, equals_reg);
+      break;
+   }
+   case PIPE_STENCIL_OP_INCR_WRAP:
+      /* newS = (s == max ? 0 : s + 1), but since max is 2^n-1, we can
+       * do a normal add and mask off the correct bits
+       */
+      spe_ai(f, newS_reg, fbS_reg, 1);
+      spe_and_uint(f, newS_reg, newS_reg, stencil_max_value);
+      break;
+
+   case PIPE_STENCIL_OP_DECR_WRAP:
+      /* newS = (s == 0 ? max : s - 1), but we'll pull the same mask trick as above */
+      spe_ai(f, newS_reg, fbS_reg, -1);
+      spe_and_uint(f, newS_reg, newS_reg, stencil_max_value);
+      break;
+
+   case PIPE_STENCIL_OP_INVERT:
+      /* newS = ~s.  We take advantage of the mask/max value to invert only
+       * the valid bits for the field so we don't have to do an extra "and".
+       */
+      spe_xor_uint(f, newS_reg, fbS_reg, stencil_max_value);
+      break;
+
+   default:
+      ASSERT(0);
+      break;
+   }
+}
+
+
+/* This function generates code to get all the necessary possible
+ * stencil values. For each of the output registers (fail_reg,
+ * zfail_reg, and zpass_reg), it either allocates a new register
+ * and calculates a new set of values based on the stencil operation,
+ * or it reuses a register allocation and calculation done for an
+ * earlier (matching) operation, or it reuses the fbS_reg register
+ * (if the stencil operation is KEEP, which doesn't change the
+ * stencil buffer).
+ *
+ * Since this function allocates a variable number of registers,
+ * to avoid incurring complex logic to free them, they should
+ * be allocated after a spe_allocate_register_set() call
+ * and released by the corresponding spe_release_register_set() call.
+ */
+static void
+gen_get_stencil_values(struct spe_function *f, const struct pipe_depth_stencil_alpha_state *dsa,
+                       unsigned int fbS_reg,
+                       unsigned int *fail_reg, unsigned int *zfail_reg,
+                       unsigned int *zpass_reg, unsigned int *back_fail_reg,
+                       unsigned int *back_zfail_reg, unsigned int *back_zpass_reg)
+{
+   unsigned zfail_op, back_zfail_op;
+
+   /* Stenciling had better be enabled here */
+   ASSERT(dsa->stencil[0].enabled);
+
+   /* If the depth test is not enabled, it is treated as though it always
+    * passes.  In particular, that means that the "zfail_op" (and the backfacing
+    * counterpart, if active) are not considered - a failing stencil test will
+    * trigger the "fail_op", and a passing stencil test will trigger the
+    * "zpass_op".
+    *
+    * By overriding the operations in this case to be PIPE_STENCIL_OP_KEEP,
+    * we keep them from being calculated.
+    */
+   if (dsa->depth.enabled) {
+      zfail_op = dsa->stencil[0].zfail_op;
+      back_zfail_op = dsa->stencil[1].zfail_op;
+   }
+   else {
+      zfail_op = PIPE_STENCIL_OP_KEEP;
+      back_zfail_op = PIPE_STENCIL_OP_KEEP;
+   }
+
+   /* One-sided or front-facing stencil */
+   if (dsa->stencil[0].fail_op == PIPE_STENCIL_OP_KEEP) {
+      *fail_reg = fbS_reg;
+   }
+   else {
+      *fail_reg = spe_allocate_available_register(f);
+      gen_stencil_values(f, dsa->stencil[0].fail_op, dsa->stencil[0].ref_value,
+                         0xff, fbS_reg, *fail_reg);
+   }
+
+   if (zfail_op == PIPE_STENCIL_OP_KEEP) {
+      *zfail_reg = fbS_reg;
+   }
+   else if (zfail_op == dsa->stencil[0].fail_op) {
+      *zfail_reg = *fail_reg;
+   }
+   else {
+      /* Only reached when depth is enabled, so zfail_op is the state's own op */
+      *zfail_reg = spe_allocate_available_register(f);
+      gen_stencil_values(f, dsa->stencil[0].zfail_op, dsa->stencil[0].ref_value,
+                         0xff, fbS_reg, *zfail_reg);
+   }
+
+   if (dsa->stencil[0].zpass_op == PIPE_STENCIL_OP_KEEP) {
+      *zpass_reg = fbS_reg;
+   }
+   else if (dsa->stencil[0].zpass_op == dsa->stencil[0].fail_op) {
+      *zpass_reg = *fail_reg;
+   }
+   else if (dsa->stencil[0].zpass_op == zfail_op) {
+      *zpass_reg = *zfail_reg;
+   }
+   else {
+      *zpass_reg = spe_allocate_available_register(f);
+      gen_stencil_values(f, dsa->stencil[0].zpass_op, dsa->stencil[0].ref_value,
+                         0xff, fbS_reg, *zpass_reg);
+   }
+
+   /* If two-sided stencil is enabled, we have more work to do. */
+   if (!dsa->stencil[1].enabled) {
+      /* This just flags that the registers need not be deallocated later */
+      *back_fail_reg = fbS_reg;
+      *back_zfail_reg = fbS_reg;
+      *back_zpass_reg = fbS_reg;
+   }
+   else {
+      /* Same calculations as above, but for the back stencil.
+       *
+       * NOTE(review): a front-face register is reused whenever the
+       * operations match, but only the operations are compared.  For
+       * PIPE_STENCIL_OP_REPLACE the computed value depends on ref_value,
+       * so reuse looks wrong if stencil[0] and stencil[1] have different
+       * reference values - confirm.
+       */
+      if (dsa->stencil[1].fail_op == PIPE_STENCIL_OP_KEEP) {
+         *back_fail_reg = fbS_reg;
+      }
+      else if (dsa->stencil[1].fail_op == dsa->stencil[0].fail_op) {
+         *back_fail_reg = *fail_reg;
+      }
+      else if (dsa->stencil[1].fail_op == zfail_op) {
+         *back_fail_reg = *zfail_reg;
+      }
+      else if (dsa->stencil[1].fail_op == dsa->stencil[0].zpass_op) {
+         *back_fail_reg = *zpass_reg;
+      }
+      else {
+         *back_fail_reg = spe_allocate_available_register(f);
+         gen_stencil_values(f, dsa->stencil[1].fail_op, dsa->stencil[1].ref_value,
+                            0xff, fbS_reg, *back_fail_reg);
+      }
+
+      if (back_zfail_op == PIPE_STENCIL_OP_KEEP) {
+         *back_zfail_reg = fbS_reg;
+      }
+      else if (back_zfail_op == dsa->stencil[0].fail_op) {
+         *back_zfail_reg = *fail_reg;
+      }
+      else if (back_zfail_op == zfail_op) {
+         *back_zfail_reg = *zfail_reg;
+      }
+      else if (back_zfail_op == dsa->stencil[0].zpass_op) {
+         *back_zfail_reg = *zpass_reg;
+      }
+      else if (back_zfail_op == dsa->stencil[1].fail_op) {
+         *back_zfail_reg = *back_fail_reg;
+      }
+      else {
+         *back_zfail_reg = spe_allocate_available_register(f);
+         gen_stencil_values(f, dsa->stencil[1].zfail_op, dsa->stencil[1].ref_value,
+                            0xff, fbS_reg, *back_zfail_reg);
+      }
+
+      if (dsa->stencil[1].zpass_op == PIPE_STENCIL_OP_KEEP) {
+         *back_zpass_reg = fbS_reg;
+      }
+      else if (dsa->stencil[1].zpass_op == dsa->stencil[0].fail_op) {
+         *back_zpass_reg = *fail_reg;
+      }
+      else if (dsa->stencil[1].zpass_op == zfail_op) {
+         *back_zpass_reg = *zfail_reg;
+      }
+      else if (dsa->stencil[1].zpass_op == dsa->stencil[0].zpass_op) {
+         *back_zpass_reg = *zpass_reg;
+      }
+      else if (dsa->stencil[1].zpass_op == dsa->stencil[1].fail_op) {
+         *back_zpass_reg = *back_fail_reg;
+      }
+      else if (dsa->stencil[1].zpass_op == back_zfail_op) {
+         *back_zpass_reg = *back_zfail_reg;
+      }
+      else {
+         /* The new register must go to back_zpass_reg: it is the one
+          * handed to gen_stencil_values() just below, and back_zfail_reg
+          * already holds its final value from the previous step.
+          */
+         *back_zpass_reg = spe_allocate_available_register(f);
+         gen_stencil_values(f, dsa->stencil[1].zpass_op, dsa->stencil[1].ref_value,
+                            0xff, fbS_reg, *back_zpass_reg);
+      }
+   } /* End of calculations for back-facing stencil */
+}
+
+/* Note that fbZ_reg may *not* be set on entry, if in fact
+ * the depth test is not enabled. This function must not use
+ * the register if depth is not enabled.
+ */
+static boolean
+gen_stencil_depth_test(struct spe_function *f,
+ const struct pipe_depth_stencil_alpha_state *dsa,
+ const int const facing_reg,
+ const int mask_reg, const int fragZ_reg,
+ const int fbZ_reg, const int fbS_reg)
+{
+ /* True if we've generated code that could require writeback to the
+ * depth and/or stencil buffers
+ */
+ boolean modified_buffers = false;
+
+ boolean need_to_calculate_stencil_values;
+ boolean need_to_writemask_stencil_values;
+
+ /* Registers. We may or may not actually allocate these, depending
+ * on whether the state values indicate that we need them.
+ */
+ unsigned int stencil_pass_reg, stencil_fail_reg;
+ unsigned int stencil_fail_values, stencil_pass_depth_fail_values, stencil_pass_depth_pass_values;
+ unsigned int stencil_writemask_reg;
+ unsigned int zmask_reg;
+ unsigned int newS_reg;
+
+ /* Stenciling is quite complex: up to six different configurable stencil
+ * operations/calculations can be required (three each for front-facing
+ * and back-facing fragments). Many of those operations will likely
+ * be identical, so there's good reason to try to avoid calculating
+ * the same values more than once (which unfortunately makes the code less
+ * straightforward).
+ *
+ * To make register management easier, we start a new
+ * register set; we can release all the registers in the set at
+ * once, and avoid having to keep track of exactly which registers
+ * we allocate. We can still allocate and free registers as
+ * desired (if we know we no longer need a register), but we don't
+ * have to spend the complexity to track the more difficult variant
+ * register usage scenarios.
+ */
+ spe_comment(f, 0, "Allocating stencil register set");
+ spe_allocate_register_set(f);
+
+ /* Calculate the writemask. If the writemask is trivial (either
+ * all 0s, meaning that we don't need to calculate any stencil values
+ * because they're not going to change the stencil anyway, or all 1s,
+ * meaning that we have to calculate the stencil values but do not
+ * need to mask them), we can avoid generating code. Don't forget
+ * that we need to consider backfacing stencil, if enabled.
+ */
+ if (dsa->stencil[0].write_mask == 0x0 && (!dsa->stencil[1].enabled || dsa->stencil[1].write_mask == 0x00)) {
+ /* Trivial: don't need to calculate stencil values, and don't need to
+ * write them back to the framebuffer.
+ */
+ need_to_calculate_stencil_values = false;
+ need_to_writemask_stencil_values = false;
+ }
+ else if (dsa->stencil[0].write_mask == 0xff && (!dsa->stencil[1].enabled || dsa->stencil[1].write_mask == 0xff)) {
+ /* Still trivial, but a little less so. We need to write the stencil
+ * values, but we don't need to mask them.
+ */
+ need_to_calculate_stencil_values = true;
+ need_to_writemask_stencil_values = false;
+ }
+ else {
+ /* The general case: calculate, mask, and write */
+ need_to_calculate_stencil_values = true;
+ need_to_writemask_stencil_values = true;
+
+ /* While we're here, generate code that calculates what the
+ * writemask should be. If backface stenciling is enabled,
+ * and the backface writemask is not the same as the frontface
+ * writemask, we'll have to generate code that merges the
+ * two masks into a single effective mask based on fragment facing.
+ */
+ spe_comment(f, 0, "Computing stencil writemask");
+ stencil_writemask_reg = spe_allocate_available_register(f);
+ spe_load_uint(f, stencil_writemask_reg, dsa->stencil[0].write_mask);
+ if (dsa->stencil[1].enabled && dsa->stencil[0].write_mask != dsa->stencil[1].write_mask) {
+ unsigned int back_write_mask_reg = spe_allocate_available_register(f);
+ spe_comment(f, 0, "Resolving two-sided stencil writemask");
+ spe_load_uint(f, back_write_mask_reg, dsa->stencil[1].write_mask);
+ spe_selb(f, stencil_writemask_reg, stencil_writemask_reg, back_write_mask_reg, facing_reg);
+ spe_release_register(f, back_write_mask_reg);
+ }
+ }
+
+ /* At least one-sided stenciling must be on. Generate code that
+ * runs the stencil test on the basic/front-facing stencil, leaving
+ * the mask of passing stencil bits in stencil_pass_reg. This mask will
+ * be used both to mask the set of active pixels, and also to
+ * determine how the stencil buffer changes.
+ *
+ * This test will *not* change the value in mask_reg (because we don't
+ * yet know whether to apply the two-sided stencil or one-sided stencil).
+ */
+ spe_comment(f, 0, "Running basic stencil test");
+ stencil_pass_reg = spe_allocate_available_register(f);
+ gen_stencil_test(f, &dsa->stencil[0], mask_reg, fbS_reg, stencil_pass_reg);
+
+ /* If two-sided stenciling is on, generate code to run the stencil
+ * test on the backfacing stencil as well, and combine the two results
+ * into the one correct result based on facing.
+ */
+ if (dsa->stencil[1].enabled) {
+ unsigned int temp_reg = spe_allocate_available_register(f);
+ spe_comment(f, 0, "Running backface stencil test");
+ gen_stencil_test(f, &dsa->stencil[1], mask_reg, fbS_reg, temp_reg);
+ spe_selb(f, stencil_pass_reg, stencil_pass_reg, temp_reg, facing_reg);
+ spe_release_register(f, temp_reg);
+ }
+
+ /* Generate code that, given the mask of valid fragments and the
+ * mask of valid fragments that passed the stencil test, computes
+ * the mask of valid fragments that failed the stencil test. We
+ * have to do this before we run a depth test (because the
+ * depth test should not be performed on fragments that failed the
+ * stencil test, and because the depth test will update the
+ * mask of valid fragments based on the results of the depth test).
+ */
+ spe_comment(f, 0, "Computing stencil fail mask and updating fragment mask");
+ stencil_fail_reg = spe_allocate_available_register(f);
+ spe_andc(f, stencil_fail_reg, mask_reg, stencil_pass_reg);
+ /* Now remove the stenciled-out pixels from the valid fragment mask,
+ * so we can later use the valid fragment mask in the depth test.
+ */
+ spe_and(f, mask_reg, mask_reg, stencil_pass_reg);
+
+ /* We may not need to calculate stencil values, if the writemask is off */
+ if (need_to_calculate_stencil_values) {
+ unsigned int back_stencil_fail_values, back_stencil_pass_depth_fail_values, back_stencil_pass_depth_pass_values;
+ unsigned int front_stencil_fail_values, front_stencil_pass_depth_fail_values, front_stencil_pass_depth_pass_values;
+
+ /* Generate code that calculates exactly which stencil values we need,
+ * without calculating the same value twice (say, if two different
+ * stencil ops have the same value). This code will work for one-sided
+ * and two-sided stenciling (so that we take into account that operations
+ * may match between front and back stencils), and will also take into
+ * account whether the depth test is enabled (if the depth test is off,
+ * we don't need any of the zfail results, because the depth test always
+ * is considered to pass if it is disabled). Any register value that
+ * does not need to be calculated will come back with the same value
+ * that's in fbS_reg.
+ *
+ * This function will allocate a variable number of registers that
+ * will be released as part of the register set.
+ */
+ spe_comment(f, 0, "Computing stencil values");
+ gen_get_stencil_values(f, dsa, fbS_reg,
+ &front_stencil_fail_values, &front_stencil_pass_depth_fail_values,
+ &front_stencil_pass_depth_pass_values, &back_stencil_fail_values,
+ &back_stencil_pass_depth_fail_values, &back_stencil_pass_depth_pass_values);
+
+ /* Tricky, tricky, tricky - the things we do to create optimal
+ * code...
+ *
+ * The various stencil values registers may overlap with each other
+ * and with fbS_reg arbitrarily (as any particular operation is
+ * only calculated once and stored in one register, no matter
+ * how many times it is used). So we can't change the values
+ * within those registers directly - if we change a value in a
+ * register that's being referenced by two different calculations,
+ * we've just unwittingly changed the second value as well...
+ *
+ * Avoid this by allocating new registers to hold the results
+ * (there may be 2, if the depth test is off, or 3, if it is on).
+ * These will be released as part of the register set.
+ */
+ if (!dsa->stencil[1].enabled) {
+ /* The easy case: if two-sided stenciling is *not* enabled, we
+ * just use the front-sided values.
+ */
+ stencil_fail_values = front_stencil_fail_values;
+ stencil_pass_depth_fail_values = front_stencil_pass_depth_fail_values;
+ stencil_pass_depth_pass_values = front_stencil_pass_depth_pass_values;
+ }
+ else { /* two-sided stencil enabled */
+ spe_comment(f, 0, "Resolving backface stencil values");
+ /* Allocate new registers for the needed merged values */
+ stencil_fail_values = spe_allocate_available_register(f);
+ spe_selb(f, stencil_fail_values, front_stencil_fail_values, back_stencil_fail_values, facing_reg);
+ if (dsa->depth.enabled) {
+ stencil_pass_depth_fail_values = spe_allocate_available_register(f);
+ spe_selb(f, stencil_pass_depth_fail_values, front_stencil_pass_depth_fail_values, back_stencil_pass_depth_fail_values, facing_reg);
+ }
+ else {
+ stencil_pass_depth_fail_values = fbS_reg;
+ }
+ stencil_pass_depth_pass_values = spe_allocate_available_register(f);
+ spe_selb(f, stencil_pass_depth_pass_values, front_stencil_pass_depth_pass_values, back_stencil_pass_depth_pass_values, facing_reg);
+ }
+ }
+
+ /* We now have all the stencil values we need. We also need
+ * the results of the depth test to figure out which
+ * stencil values will become the new stencil values. (Even if
+ * we aren't actually calculating stencil values, we need to apply
+ * the depth test if it's enabled.)
+ *
+ * The code generated by gen_depth_test() returns the results of the
+ * test in the given register, but also alters the mask_reg based
+ * on the results of the test.
+ */
+ if (dsa->depth.enabled) {
+ spe_comment(f, 0, "Running stencil depth test");
+ zmask_reg = spe_allocate_available_register(f);
+ modified_buffers |= gen_depth_test(f, dsa, mask_reg, fragZ_reg, fbZ_reg, zmask_reg);
+ }
+
+ if (need_to_calculate_stencil_values) {
+
+ /* If we need to writemask the stencil values before going into
+ * the stencil buffer, we'll have to use a new register to
+ * hold the new values. If not, we can just keep using the
+ * current register.
+ */
+ if (need_to_writemask_stencil_values) {
+ newS_reg = spe_allocate_available_register(f);
+ spe_comment(f, 0, "Saving current stencil values for writemasking");
+ spe_move(f, newS_reg, fbS_reg);
+ }
+ else {
+ newS_reg = fbS_reg;
+ }
+
+ /* Merge in the selected stencil fail values */
+ if (stencil_fail_values != fbS_reg) {
+ spe_comment(f, 0, "Loading stencil fail values");
+ spe_selb(f, newS_reg, newS_reg, stencil_fail_values, stencil_fail_reg);
+ modified_buffers = true;
+ }
+
+ /* Same for the stencil pass/depth fail values. If this calculation
+ * is not needed (say, if depth test is off), then the
+ * stencil_pass_depth_fail_values register will be equal to fbS_reg
+ * and we'll skip the calculation.
+ */
+ if (stencil_pass_depth_fail_values != fbS_reg) {
+ /* We don't actually have a stencil pass/depth fail mask yet.
+ * Calculate it here from the stencil passing mask and the
+ * depth passing mask. Note that zmask_reg *must* have been
+ * set above if we're here.
+ */
+ unsigned int stencil_pass_depth_fail_mask = spe_allocate_available_register(f);
+ spe_comment(f, 0, "Loading stencil pass/depth fail values");
+ spe_andc(f, stencil_pass_depth_fail_mask, stencil_pass_reg, zmask_reg);
+
+ spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_fail_values, stencil_pass_depth_fail_mask);
+
+ spe_release_register(f, stencil_pass_depth_fail_mask);
+ modified_buffers = true;
+ }
+
+ /* Same for the stencil pass/depth pass mask. Note that we
+ * *can* get here with zmask_reg being unset (if the depth
+ * test is off but the stencil test is on). In this case,
+ * we assume the depth test passes, and don't need to mask
+ * the stencil pass mask with the Z mask.
+ */
+ if (stencil_pass_depth_pass_values != fbS_reg) {
+ if (dsa->depth.enabled) {
+ unsigned int stencil_pass_depth_pass_mask = spe_allocate_available_register(f);
+ /* We'll need a separate register */
+ spe_comment(f, 0, "Loading stencil pass/depth pass values");
+ spe_and(f, stencil_pass_depth_pass_mask, stencil_pass_reg, zmask_reg);
+ spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_pass_values, stencil_pass_depth_pass_mask);
+ spe_release_register(f, stencil_pass_depth_pass_mask);
+ }
+ else {
+ /* We can use the same stencil-pass register */
+ spe_comment(f, 0, "Loading stencil pass values");
+ spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_pass_values, stencil_pass_reg);
+ }
+ modified_buffers = true;
+ }
+
+ /* Almost done. If we need to writemask, do it now, leaving the
+ * results in the fbS_reg register passed in. If we don't need
+ * to writemask, then the results are *already* in the fbS_reg,
+ * so there's nothing more to do.
+ */
+
+ if (need_to_writemask_stencil_values && modified_buffers) {
+ /* The Select Bytes command makes a fine writemask. Where
+ * the mask is 0, the first (original) values are retained,
+ * effectively masking out changes. Where the mask is 1, the
+ * second (new) values are retained, incorporating changes.
+ */
+ spe_comment(f, 0, "Writemasking new stencil values");
+ spe_selb(f, fbS_reg, fbS_reg, newS_reg, stencil_writemask_reg);
+ }
+
+ } /* done calculating stencil values */
+
+ /* The stencil and/or depth values have been applied, and the
+ * mask_reg, fbS_reg, and fbZ_reg values have been updated.
+ * We're all done, except that we've allocated a fair number
+ * of registers that we didn't bother tracking. Release all
+ * those registers as part of the register set, and go home.
+ */
+ spe_comment(f, 0, "Releasing stencil register set");
+ spe_release_register_set(f);
+
+ /* Return true if we could have modified the stencil and/or
+ * depth buffers.
+ */
+ return modified_buffers;
+}
+
+
/**
* Generate SPE code to implement the fragment operations (alpha test,
* depth test, stencil test, blending, colormask, and final
@@ -1156,6 +1834,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
const int fragB_reg = 10; /* vector float */
const int fragA_reg = 11; /* vector float */
const int mask_reg = 12; /* vector uint */
+ const int facing_reg = 13; /* uint */
/* offset of quad from start of tile
* XXX assuming 4-byte pixels for color AND Z/stencil!!!!
@@ -1183,6 +1862,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
spe_allocate_register(f, fragB_reg);
spe_allocate_register(f, fragA_reg);
spe_allocate_register(f, mask_reg);
+ spe_allocate_register(f, facing_reg);
quad_offset_reg = spe_allocate_available_register(f);
fbRGBA_reg = spe_allocate_available_register(f);
@@ -1195,6 +1875,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
ASSERT(TILE_SIZE == 32);
+ spe_comment(f, 0, "Compute quad offset within tile");
spe_rotmi(f, y2_reg, y_reg, -1); /* y2 = y / 2 */
spe_rotmi(f, x2_reg, x_reg, -1); /* x2 = x / 2 */
spe_shli(f, y2_reg, y2_reg, 4); /* y2 *= 16 */
@@ -1205,130 +1886,188 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
spe_release_register(f, y2_reg);
}
-
if (dsa->alpha.enabled) {
gen_alpha_test(dsa, f, mask_reg, fragA_reg);
}
+ /* If we need the stencil buffers (because one- or two-sided stencil is
+ * enabled) or the depth buffer (because the depth test is enabled),
+ * go grab them. Note that if either one- or two-sided stencil is
+ * enabled, dsa->stencil[0].enabled will be true.
+ */
if (dsa->depth.enabled || dsa->stencil[0].enabled) {
const enum pipe_format zs_format = cell->framebuffer.zsbuf->format;
boolean write_depth_stencil;
- int fbZ_reg = spe_allocate_available_register(f); /* Z values */
- int fbS_reg = spe_allocate_available_register(f); /* Stencil values */
+ /* We may or may not need to allocate a register for Z or stencil values */
+ boolean fbS_reg_set = false, fbZ_reg_set = false;
+ unsigned int fbS_reg, fbZ_reg = 0;
+
+ spe_comment(f, 0, "Fetching Z/stencil quad from tile");
/* fetch quad of depth/stencil values from tile at (x,y) */
/* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */
+ /* XXX Not sure this is allowed if we've only got a 16-bit Z buffer... */
spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
- if (dsa->depth.enabled) {
- /* Extract Z bits from fbZS_reg into fbZ_reg */
- if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
- zs_format == PIPE_FORMAT_X8Z24_UNORM) {
- int mask_reg = spe_allocate_available_register(f);
- spe_fsmbi(f, mask_reg, 0x7777); /* mask[0,1,2,3] = 0x00ffffff */
- spe_and(f, fbZ_reg, fbZS_reg, mask_reg); /* fbZ = fbZS & mask */
- spe_release_register(f, mask_reg);
- /* OK, fbZ_reg has four 24-bit Z values now */
- }
- else if (zs_format == PIPE_FORMAT_Z24S8_UNORM ||
- zs_format == PIPE_FORMAT_Z24X8_UNORM) {
- spe_rotmi(f, fbZ_reg, fbZS_reg, -8); /* fbZ = fbZS >> 8 */
- /* OK, fbZ_reg has four 24-bit Z values now */
- }
- else if (zs_format == PIPE_FORMAT_Z32_UNORM) {
- spe_move(f, fbZ_reg, fbZS_reg);
- /* OK, fbZ_reg has four 32-bit Z values now */
- }
- else if (zs_format == PIPE_FORMAT_Z16_UNORM) {
- spe_move(f, fbZ_reg, fbZS_reg);
- /* OK, fbZ_reg has four 16-bit Z values now */
- }
- else {
- ASSERT(0); /* invalid format */
- }
-
- /* Convert fragZ values from float[4] to 16, 24 or 32-bit uint[4] */
- if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
- zs_format == PIPE_FORMAT_X8Z24_UNORM ||
- zs_format == PIPE_FORMAT_Z24S8_UNORM ||
- zs_format == PIPE_FORMAT_Z24X8_UNORM) {
- /* scale/convert fragZ from float in [0,1] to uint in [0, ~0] */
- spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
- /* fragZ = fragZ >> 8 */
- spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
- }
- else if (zs_format == PIPE_FORMAT_Z32_UNORM) {
- /* scale/convert fragZ from float in [0,1] to uint in [0, ~0] */
- spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
- }
- else if (zs_format == PIPE_FORMAT_Z16_UNORM) {
- /* scale/convert fragZ from float in [0,1] to uint in [0, ~0] */
- spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
- /* fragZ = fragZ >> 16 */
- spe_rotmi(f, fragZ_reg, fragZ_reg, -16);
- }
- }
- else {
- /* no Z test, but set Z to zero so we don't OR-in garbage below */
- spe_load_uint(f, fbZ_reg, 0); /* XXX set to zero for now */
+ /* From the Z/stencil buffer format, pull out the bits we need for
+ * Z and/or stencil. We'll also convert the incoming fragment Z
+ * value in fragZ_reg from a floating point value in [0.0..1.0] to
+ * an unsigned integer value with the appropriate resolution.
+ */
+ switch(zs_format) {
+
+ case PIPE_FORMAT_S8Z24_UNORM: /* fall through */
+ case PIPE_FORMAT_X8Z24_UNORM:
+ if (dsa->depth.enabled) {
+ /* We need the Z part at least */
+ setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
+ /* four 24-bit Z values in the low-order bits */
+ spe_and_uint(f, fbZ_reg, fbZS_reg, 0x00ffffff);
+
+ /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
+ * to a 24-bit unsigned integer
+ */
+ spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+ spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
+ }
+ if (dsa->stencil[0].enabled) {
+ setup_optional_register(f, &fbS_reg_set, &fbS_reg);
+ /* four 8-bit stencil values in the high-order bits */
+ spe_rotmi(f, fbS_reg, fbZS_reg, -24);
+ }
+ break;
+
+ case PIPE_FORMAT_Z24S8_UNORM: /* fall through */
+ case PIPE_FORMAT_Z24X8_UNORM:
+ if (dsa->depth.enabled) {
+ setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
+ /* Shift right by 8 to bring the upper 24-bit Z values down into
+ * the low-order bits. The destination must be fbZ_reg (set up
+ * just above); fbS_reg may not even be allocated at this point.
+ */
+ spe_rotmi(f, fbZ_reg, fbZS_reg, -8);
+
+ /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
+ * to a 24-bit unsigned integer
+ */
+ spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+ spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
+ }
+ if (dsa->stencil[0].enabled) {
+ setup_optional_register(f, &fbS_reg_set, &fbS_reg);
+ /* 8-bit stencil in the low-order bits - mask them out */
+ spe_and_uint(f, fbS_reg, fbZS_reg, 0x000000ff);
+ }
+ break;
+
+ case PIPE_FORMAT_Z32_UNORM:
+ if (dsa->depth.enabled) {
+ setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
+ /* Copy over 4 32-bit values */
+ spe_move(f, fbZ_reg, fbZS_reg);
+
+ /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
+ * to a 32-bit unsigned integer
+ */
+ spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+ }
+ /* No stencil, so can't do anything there */
+ break;
+
+ case PIPE_FORMAT_Z16_UNORM:
+ if (dsa->depth.enabled) {
+ /* XXX Not sure this is correct, but it was here before, so we're
+ * going with it for now
+ */
+ setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
+ /* Copy over 4 32-bit values */
+ spe_move(f, fbZ_reg, fbZS_reg);
+
+ /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
+ * to a 16-bit unsigned integer
+ */
+ spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+ spe_rotmi(f, fragZ_reg, fragZ_reg, -16);
+ }
+ /* No stencil */
+ break;
+
+ default:
+ ASSERT(0); /* invalid format */
}
-
+ /* If stencil is enabled, use the stencil-specific code
+ * generator to generate both the stencil and depth (if needed)
+ * tests. Otherwise, if only depth is enabled, generate
+ * a quick depth test. The test generators themselves will
+ * report back whether the depth/stencil buffer has to be
+ * written back.
+ */
if (dsa->stencil[0].enabled) {
- /* Extract Stencil bit sfrom fbZS_reg into fbS_reg */
- if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
- zs_format == PIPE_FORMAT_X8Z24_UNORM) {
- /* XXX extract with a shift */
- ASSERT(0);
- }
- else if (zs_format == PIPE_FORMAT_Z24S8_UNORM ||
- zs_format == PIPE_FORMAT_Z24X8_UNORM) {
- /* XXX extract with a mask */
- ASSERT(0);
- }
- }
- else {
- /* no stencil test, but set to zero so we don't OR-in garbage below */
- spe_load_uint(f, fbS_reg, 0); /* XXX set to zero for now */
- }
+ /* This will perform the stencil and depth tests, and update
+ * the mask_reg, fbZ_reg, and fbS_reg as required by the
+ * tests.
+ */
+ ASSERT(fbS_reg_set);
+ spe_comment(f, 0, "Perform stencil test");
- if (dsa->stencil[0].enabled) {
- /* XXX this may involve depth testing too */
- // gen_stencil_test(dsa, f, ... );
- ASSERT(0);
+ /* Note that fbZ_reg may not be set on entry, if stenciling
+ * is enabled but there's no Z-buffer. The
+ * gen_stencil_depth_test() function must ignore the
+ * fbZ_reg register if depth is not enabled.
+ */
+ write_depth_stencil = gen_stencil_depth_test(f, dsa, facing_reg, mask_reg, fragZ_reg, fbZ_reg, fbS_reg);
}
else if (dsa->depth.enabled) {
int zmask_reg = spe_allocate_available_register(f);
- gen_depth_test(dsa, f, mask_reg, fragZ_reg, fbZ_reg, zmask_reg);
+ ASSERT(fbZ_reg_set);
+ spe_comment(f, 0, "Perform depth test");
+ write_depth_stencil = gen_depth_test(f, dsa, mask_reg, fragZ_reg, fbZ_reg, zmask_reg);
spe_release_register(f, zmask_reg);
}
-
- /* do we need to write Z and/or Stencil back into framebuffer? */
- write_depth_stencil = (dsa->depth.writemask |
- dsa->stencil[0].write_mask |
- dsa->stencil[1].write_mask);
+ else {
+ write_depth_stencil = false;
+ }
if (write_depth_stencil) {
/* Merge latest Z and Stencil values into fbZS_reg.
* fbZ_reg has four Z vals in bits [23..0] or bits [15..0].
* fbS_reg has four 8-bit Z values in bits [7..0].
*/
+ spe_comment(f, 0, "Store quad's depth/stencil values in tile");
if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
zs_format == PIPE_FORMAT_X8Z24_UNORM) {
- spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */
- spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
+ if (fbS_reg_set && fbZ_reg_set) {
+ spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */
+ spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
+ }
+ else if (fbS_reg_set) {
+ /* NOTE(review): this leaves the low 24 (Z) bits of fbZS_reg zero;
+ * confirm that discarding the stored Z is intended here.
+ */
+ spe_shli(f, fbZS_reg, fbS_reg, 24); /* fbZS = fbS << 24 */
+ }
+ else {
+ spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
+ }
}
else if (zs_format == PIPE_FORMAT_Z24S8_UNORM ||
zs_format == PIPE_FORMAT_Z24X8_UNORM) {
- spe_shli(f, fbZ_reg, fbZ_reg, 8); /* fbZ = fbZ << 8 */
- spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
+ if (fbS_reg_set && fbZ_reg_set) {
+ spe_shli(f, fbZ_reg, fbZ_reg, 8); /* fbZ = fbZ << 8 */
+ spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
+ }
+ else if (fbS_reg_set) {
+ spe_move(f, fbZS_reg, fbS_reg); /* fbZS = fbS */
+ }
+ else {
+ /* Z only: the shifted value must land in fbZS_reg, since that is
+ * the register stored back to the tile below.
+ */
+ spe_shli(f, fbZS_reg, fbZ_reg, 8); /* fbZS = fbZ << 8 */
+ }
}
else if (zs_format == PIPE_FORMAT_Z32_UNORM) {
- spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
+ if (fbZ_reg_set) {
+ spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
+ }
}
else if (zs_format == PIPE_FORMAT_Z16_UNORM) {
- spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
+ if (fbZ_reg_set) {
+ spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
+ }
}
else if (zs_format == PIPE_FORMAT_S8_UNORM) {
ASSERT(0); /* XXX to do */
@@ -1341,21 +2080,21 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
spe_stqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
}
- spe_release_register(f, fbZ_reg);
- spe_release_register(f, fbS_reg);
+ release_optional_register(f, &fbZ_reg_set, fbZ_reg);
+ release_optional_register(f, &fbS_reg_set, fbS_reg);
}
-
/* Get framebuffer quad/colors. We'll need these for blending,
* color masking, and to obey the quad/pixel mask.
* Load: fbRGBA_reg = memory[color_tile + quad_offset]
* Note: if mask={~0,~0,~0,~0} and we're not blending or colormasking
* we could skip this load.
*/
+ spe_comment(f, 0, "Fetch quad colors from tile");
spe_lqx(f, fbRGBA_reg, color_tile_reg, quad_offset_reg);
-
if (blend->blend_enable) {
+ spe_comment(f, 0, "Perform blending");
gen_blend(blend, blend_color, f, color_format,
fragR_reg, fragG_reg, fragB_reg, fragA_reg, fbRGBA_reg);
}
@@ -1369,19 +2108,21 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
int rgba_reg = spe_allocate_available_register(f);
/* Pack four float colors as four 32-bit int colors */
+ spe_comment(f, 0, "Convert float quad colors to packed int framebuffer colors");
gen_pack_colors(f, color_format,
fragR_reg, fragG_reg, fragB_reg, fragA_reg,
rgba_reg);
if (blend->logicop_enable) {
+ spe_comment(f, 0, "Compute logic op");
gen_logicop(blend, f, rgba_reg, fbRGBA_reg);
}
if (blend->colormask != PIPE_MASK_RGBA) {
+ spe_comment(f, 0, "Compute color mask");
gen_colormask(f, blend->colormask, color_format, rgba_reg, fbRGBA_reg);
}
-
/* Mix fragment colors with framebuffer colors using the quad/pixel mask:
* if (mask[i])
* rgba[i] = rgba[i];
@@ -1393,6 +2134,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
/* Store updated quad in tile:
* memory[color_tile + quad_offset] = rgba_reg;
*/
+ spe_comment(f, 0, "Store quad colors into color tile");
spe_stqx(f, rgba_reg, color_tile_reg, quad_offset_reg);
spe_release_register(f, rgba_reg);
diff --git a/src/gallium/drivers/cell/ppu/cell_pipe_state.c b/src/gallium/drivers/cell/ppu/cell_pipe_state.c
index 8c55b8e093..2e3086c4fa 100644
--- a/src/gallium/drivers/cell/ppu/cell_pipe_state.c
+++ b/src/gallium/drivers/cell/ppu/cell_pipe_state.c
@@ -258,8 +258,6 @@ cell_set_sampler_textures(struct pipe_context *pipe,
}
cell->num_textures = num;
- cell_update_texture_mapping(cell);
-
cell->dirty |= CELL_NEW_TEXTURE;
}
diff --git a/src/gallium/drivers/cell/ppu/cell_render.c b/src/gallium/drivers/cell/ppu/cell_render.c
index dd25ae880e..79cb8df82f 100644
--- a/src/gallium/drivers/cell/ppu/cell_render.c
+++ b/src/gallium/drivers/cell/ppu/cell_render.c
@@ -152,6 +152,7 @@ cell_flush_prim_buffer(struct cell_context *cell)
struct cell_command_render *render = &cell_global.command[i].render;
render->prim_type = PIPE_PRIM_TRIANGLES;
render->num_verts = cell->prim_buffer.num_verts;
+ render->front_winding = cell->rasterizer->front_winding;
render->vertex_size = cell->vertex_info->size * 4;
render->xmin = cell->prim_buffer.xmin;
render->ymin = cell->prim_buffer.ymin;
diff --git a/src/gallium/drivers/cell/ppu/cell_screen.c b/src/gallium/drivers/cell/ppu/cell_screen.c
index 47ba6fa290..d223557950 100644
--- a/src/gallium/drivers/cell/ppu/cell_screen.c
+++ b/src/gallium/drivers/cell/ppu/cell_screen.c
@@ -76,11 +76,11 @@ cell_get_param(struct pipe_screen *screen, int param)
case PIPE_CAP_TEXTURE_SHADOW_MAP:
return 10;
case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
- return 12; /* max 2Kx2K */
+ return CELL_MAX_TEXTURE_LEVELS;
case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
return 8; /* max 128x128x128 */
case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
- return 12; /* max 2Kx2K */
+ return CELL_MAX_TEXTURE_LEVELS;
default:
return 10;
}
diff --git a/src/gallium/drivers/cell/ppu/cell_state.h b/src/gallium/drivers/cell/ppu/cell_state.h
index a7771a55a3..b193170f9c 100644
--- a/src/gallium/drivers/cell/ppu/cell_state.h
+++ b/src/gallium/drivers/cell/ppu/cell_state.h
@@ -44,8 +44,9 @@
#define CELL_NEW_TEXTURE 0x800
#define CELL_NEW_VERTEX 0x1000
#define CELL_NEW_VS 0x2000
-#define CELL_NEW_CONSTANTS 0x4000
-#define CELL_NEW_VERTEX_INFO 0x8000
+#define CELL_NEW_VS_CONSTANTS 0x4000
+#define CELL_NEW_FS_CONSTANTS 0x8000
+#define CELL_NEW_VERTEX_INFO 0x10000
extern void
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index f35893537b..bb694aa107 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -25,6 +25,7 @@
*
**************************************************************************/
+#include "pipe/p_inlines.h"
#include "util/u_memory.h"
#include "cell_context.h"
#include "cell_gen_fragment.h"
@@ -36,6 +37,79 @@
#include "draw/draw_private.h"
+/**
+ * Find/create a cell_command_fragment_ops object corresponding to the
+ * current blend/stencil/z/colormask/etc. state.
+ *
+ * The lookup key is built from the context's current blend state, its
+ * depth/stencil state, and the formats of the first color buffer and the
+ * Z/stencil buffer (PIPE_FORMAT_NONE when a buffer is absent).  On a cache
+ * miss, SPE code is generated for the current state and the new command is
+ * inserted into cell->fragment_ops_cache.
+ *
+ * \param cell  the context whose current state is looked up
+ * \return the cached or newly created command, or NULL if a new command
+ *         object could not be allocated
+ */
+static struct cell_command_fragment_ops *
+lookup_fragment_ops(struct cell_context *cell)
+{
+   struct cell_fragment_ops_key key;
+   struct cell_command_fragment_ops *ops;
+
+   /*
+    * Build key.  The key is zeroed first, presumably so that bytes not
+    * covered by the assignments below cannot affect the lookup.
+    */
+   memset(&key, 0, sizeof(key));
+   key.blend = *cell->blend;
+   key.dsa = *cell->depth_stencil;
+
+   if (cell->framebuffer.cbufs[0])
+      key.color_format = cell->framebuffer.cbufs[0]->format;
+   else
+      key.color_format = PIPE_FORMAT_NONE;
+
+   if (cell->framebuffer.zsbuf)
+      key.zs_format = cell->framebuffer.zsbuf->format;
+   else
+      key.zs_format = PIPE_FORMAT_NONE;
+
+   /*
+    * Look up key in cache.
+    */
+   ops = (struct cell_command_fragment_ops *)
+      util_keymap_lookup(cell->fragment_ops_cache, &key);
+
+   /*
+    * If not found, create/save new fragment ops command.
+    */
+   if (!ops) {
+      struct spe_function spe_code;
+
+      if (0)
+         debug_printf("**** Create New Fragment Ops\n");
+
+      /* Alloc the new fragment ops command first, so that an allocation
+       * failure is reported to the caller before any codegen work is done
+       * (and before there is a codegen buffer that would need releasing).
+       */
+      ops = CALLOC_STRUCT(cell_command_fragment_ops);
+      if (!ops)
+         return NULL;
+
+      /* Prepare the buffer that will hold the generated code. */
+      spe_init_func(&spe_code, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
+
+      /* generate new code */
+      cell_gen_fragment_function(cell, &spe_code);
+
+      /* populate the new cell_command_fragment_ops object */
+      ops->opcode = CELL_CMD_STATE_FRAGMENT_OPS;
+      memcpy(ops->code, spe_code.store, spe_code_size(&spe_code));
+      ops->dsa = *cell->depth_stencil;
+      ops->blend = *cell->blend;
+
+      /* insert cell_command_fragment_ops object into keymap/cache */
+      util_keymap_insert(cell->fragment_ops_cache, &key, ops, NULL);
+
+      /* release rtasm buffer */
+      spe_release_func(&spe_code);
+   }
+   else {
+      if (0)
+         debug_printf("**** Re-use Fragment Ops\n");
+   }
+
+   return ops;
+}
+
+
+
static void
emit_state_cmd(struct cell_context *cell, uint cmd,
const void *state, uint state_size)
@@ -89,31 +163,31 @@ cell_emit_state(struct cell_context *cell)
}
}
+ if (cell->dirty & (CELL_NEW_FS_CONSTANTS)) {
+ const uint shader = PIPE_SHADER_FRAGMENT;
+ const uint num_const = cell->constants[shader].size / sizeof(float);
+ uint i, j;
+ float *buf = cell_batch_alloc(cell, 16 + num_const * sizeof(float));
+ uint64_t *ibuf = (uint64_t *) buf;
+ const float *constants = pipe_buffer_map(cell->pipe.screen,
+ cell->constants[shader].buffer,
+ PIPE_BUFFER_USAGE_CPU_READ);
+ ibuf[0] = CELL_CMD_STATE_FS_CONSTANTS;
+ ibuf[1] = num_const;
+ j = 4;
+ for (i = 0; i < num_const; i++) {
+ buf[j++] = constants[i];
+ }
+ pipe_buffer_unmap(cell->pipe.screen, cell->constants[shader].buffer);
+ }
+
if (cell->dirty & (CELL_NEW_FRAMEBUFFER |
CELL_NEW_DEPTH_STENCIL |
CELL_NEW_BLEND)) {
- /* XXX we don't want to always do codegen here. We should have
- * a hash/lookup table to cache previous results...
- */
- struct cell_command_fragment_ops *fops
- = cell_batch_alloc(cell, sizeof(*fops));
- struct spe_function spe_code;
-
- /* Prepare the buffer that will hold the generated code. */
- spe_init_func(&spe_code, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
-
- /* generate new code */
- cell_gen_fragment_function(cell, &spe_code);
-
- /* put the new code into the batch buffer */
- fops->opcode = CELL_CMD_STATE_FRAGMENT_OPS;
- memcpy(&fops->code, spe_code.store,
- SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
- fops->dsa = *cell->depth_stencil;
- fops->blend = *cell->blend;
-
- /* free codegen buffer */
- spe_release_func(&spe_code);
+ struct cell_command_fragment_ops *fops, *fops_cmd;
+ fops_cmd = cell_batch_alloc(cell, sizeof(*fops_cmd));
+ fops = lookup_fragment_ops(cell);
+ memcpy(fops_cmd, fops, sizeof(*fops));
}
if (cell->dirty & CELL_NEW_SAMPLER) {
@@ -137,14 +211,24 @@ cell_emit_state(struct cell_context *cell)
texture->opcode = CELL_CMD_STATE_TEXTURE;
texture->unit = i;
if (cell->texture[i]) {
- texture->start = cell->texture[i]->tiled_data;
- texture->width = cell->texture[i]->base.width[0];
- texture->height = cell->texture[i]->base.height[0];
+ uint level;
+ for (level = 0; level < CELL_MAX_TEXTURE_LEVELS; level++) {
+ texture->start[level] = cell->texture[i]->tiled_data[level];
+ texture->width[level] = cell->texture[i]->base.width[level];
+ texture->height[level] = cell->texture[i]->base.height[level];
+ texture->depth[level] = cell->texture[i]->base.depth[level];
+ }
+ texture->target = cell->texture[i]->base.target;
}
else {
- texture->start = NULL;
- texture->width = 1;
- texture->height = 1;
+ uint level;
+ for (level = 0; level < CELL_MAX_TEXTURE_LEVELS; level++) {
+ texture->start[level] = NULL;
+ texture->width[level] = 0;
+ texture->height[level] = 0;
+ texture->depth[level] = 0;
+ }
+ texture->target = 0;
}
}
}
diff --git a/src/gallium/drivers/cell/ppu/cell_state_shader.c b/src/gallium/drivers/cell/ppu/cell_state_shader.c
index 3a0d066da2..54a17eaf2b 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_shader.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_shader.c
@@ -197,7 +197,10 @@ cell_set_constant_buffer(struct pipe_context *pipe,
buf->buffer);
cell->constants[shader].size = buf->size;
- cell->dirty |= CELL_NEW_CONSTANTS;
+ if (shader == PIPE_SHADER_VERTEX)
+ cell->dirty |= CELL_NEW_VS_CONSTANTS;
+ else if (shader == PIPE_SHADER_FRAGMENT)
+ cell->dirty |= CELL_NEW_FS_CONSTANTS;
}
diff --git a/src/gallium/drivers/cell/ppu/cell_texture.c b/src/gallium/drivers/cell/ppu/cell_texture.c
index b6590dfb86..230e192573 100644
--- a/src/gallium/drivers/cell/ppu/cell_texture.c
+++ b/src/gallium/drivers/cell/ppu/cell_texture.c
@@ -52,20 +52,22 @@ static unsigned minify( unsigned d )
static void
-cell_texture_layout(struct cell_texture * spt)
+cell_texture_layout(struct cell_texture *ct)
{
- struct pipe_texture *pt = &spt->base;
+ struct pipe_texture *pt = &ct->base;
unsigned level;
unsigned width = pt->width[0];
unsigned height = pt->height[0];
unsigned depth = pt->depth[0];
- spt->buffer_size = 0;
+ ct->buffer_size = 0;
for ( level = 0 ; level <= pt->last_level ; level++ ) {
unsigned size;
unsigned w_tile, h_tile;
+ assert(level < CELL_MAX_TEXTURE_LEVELS);
+
/* width, height, rounded up to tile size */
w_tile = align(width, TILE_SIZE);
h_tile = align(height, TILE_SIZE);
@@ -76,9 +78,9 @@ cell_texture_layout(struct cell_texture * spt)
pt->nblocksx[level] = pf_get_nblocksx(&pt->block, w_tile);
pt->nblocksy[level] = pf_get_nblocksy(&pt->block, h_tile);
- spt->stride[level] = pt->nblocksx[level] * pt->block.size;
+ ct->stride[level] = pt->nblocksx[level] * pt->block.size;
- spt->level_offset[level] = spt->buffer_size;
+ ct->level_offset[level] = ct->buffer_size;
size = pt->nblocksx[level] * pt->nblocksy[level] * pt->block.size;
if (pt->target == PIPE_TEXTURE_CUBE)
@@ -86,7 +88,7 @@ cell_texture_layout(struct cell_texture * spt)
else
size *= depth;
- spt->buffer_size += size;
+ ct->buffer_size += size;
width = minify(width);
height = minify(height);
@@ -100,26 +102,25 @@ cell_texture_create(struct pipe_screen *screen,
const struct pipe_texture *templat)
{
struct pipe_winsys *ws = screen->winsys;
- struct cell_texture *spt = CALLOC_STRUCT(cell_texture);
- if (!spt)
+ struct cell_texture *ct = CALLOC_STRUCT(cell_texture);
+ if (!ct)
return NULL;
- spt->base = *templat;
- spt->base.refcount = 1;
- spt->base.screen = screen;
+ ct->base = *templat;
+ ct->base.refcount = 1;
+ ct->base.screen = screen;
- cell_texture_layout(spt);
+ cell_texture_layout(ct);
- spt->buffer = ws->buffer_create(ws, 32,
- PIPE_BUFFER_USAGE_PIXEL,
- spt->buffer_size);
+ ct->buffer = ws->buffer_create(ws, 32, PIPE_BUFFER_USAGE_PIXEL,
+ ct->buffer_size);
- if (!spt->buffer) {
- FREE(spt);
+ if (!ct->buffer) {
+ FREE(ct);
return NULL;
}
- return &spt->base;
+ return &ct->base;
}
@@ -135,29 +136,116 @@ cell_texture_release(struct pipe_screen *screen,
__FUNCTION__, (void *) *pt, (*pt)->refcount - 1);
*/
if (--(*pt)->refcount <= 0) {
- struct cell_texture *spt = cell_texture(*pt);
+ struct cell_texture *ct = cell_texture(*pt);
+ uint i;
/*
- DBG("%s deleting %p\n", __FUNCTION__, (void *) spt);
+ DBG("%s deleting %p\n", __FUNCTION__, (void *) ct);
*/
- pipe_buffer_reference(screen, &spt->buffer, NULL);
+ pipe_buffer_reference(screen, &ct->buffer, NULL);
+
+ for (i = 0; i < CELL_MAX_TEXTURE_LEVELS; i++) {
+ if (ct->tiled_data[i]) {
+ align_free(ct->tiled_data[i]);
+ }
+ }
- FREE(spt);
+ FREE(ct);
}
*pt = NULL;
}
-#if 0
+
+/**
+ * Convert image from linear layout to tiled layout. 4-byte pixels.
+ */
+static void
+twiddle_image_uint(uint w, uint h, uint tile_size, uint *dst,
+ uint src_stride, const uint *src)
+{
+ const uint tile_size2 = tile_size * tile_size;
+ const uint h_t = (h + tile_size - 1) / tile_size;
+ const uint w_t = (w + tile_size - 1) / tile_size;
+
+ uint it, jt; /* tile counters */
+ uint i, j; /* intra-tile counters */
+
+ src_stride /= 4; /* convert from bytes to pixels */
+
+ /* loop over dest tiles */
+ for (it = 0; it < h_t; it++) {
+ for (jt = 0; jt < w_t; jt++) {
+ /* start of dest tile: */
+ uint *tdst = dst + (it * w_t + jt) * tile_size2;
+
+ /* compute size of this tile (may be smaller than tile_size) */
+ /* XXX note: a compiler bug was found here. That's why the code
+ * looks as it does.
+ */
+ uint tile_width = w - jt * tile_size;
+ tile_width = MIN2(tile_width, tile_size);
+ uint tile_height = h - it * tile_size;
+ tile_height = MIN2(tile_height, tile_size);
+
+ /* loop over texels in the tile */
+ for (i = 0; i < tile_height; i++) {
+ for (j = 0; j < tile_width; j++) {
+ const uint srci = it * tile_size + i;
+ const uint srcj = jt * tile_size + j;
+ ASSERT(srci < h);
+ ASSERT(srcj < w);
+ tdst[i * tile_size + j] = src[srci * src_stride + srcj];
+ }
+ }
+ }
+ }
+}
+
+
+/**
+ * Convert linear texture image data to tiled format for SPU usage.
+ */
static void
-cell_texture_update(struct pipe_context *pipe, struct pipe_texture *texture,
- uint face, uint levelsMask)
+cell_twiddle_texture(struct pipe_screen *screen,
+ struct pipe_surface *surface)
{
- /* XXX TO DO: re-tile the texture data ... */
+ struct cell_texture *ct = cell_texture(surface->texture);
+ const uint level = surface->level;
+ const uint texWidth = ct->base.width[level];
+ const uint texHeight = ct->base.height[level];
+ const uint bufWidth = align(texWidth, TILE_SIZE);
+ const uint bufHeight = align(texHeight, TILE_SIZE);
+ const void *map = pipe_buffer_map(screen, surface->buffer,
+ PIPE_BUFFER_USAGE_CPU_READ);
+ const uint *src = (const uint *) ((const ubyte *) map + surface->offset);
+
+ switch (ct->base.format) {
+ case PIPE_FORMAT_A8R8G8B8_UNORM:
+ {
+ int numFaces = ct->base.target == PIPE_TEXTURE_CUBE ? 6 : 1;
+ int offset = bufWidth * bufHeight * 4 * surface->face;
+ uint *dst;
+
+ if (!ct->tiled_data[level]) {
+ ct->tiled_data[level] =
+ align_malloc(bufWidth * bufHeight * 4 * numFaces, 16);
+ }
+
+ dst = (uint *) ((ubyte *) ct->tiled_data[level] + offset);
+ twiddle_image_uint(texWidth, texHeight, TILE_SIZE, dst,
+ surface->stride, src);
+ }
+ break;
+ default:
+ printf("Cell: twiddle unsupported texture format\n");
+ ;
+ }
+
+ pipe_buffer_unmap(screen, surface->buffer);
}
-#endif
static struct pipe_surface *
@@ -167,22 +255,22 @@ cell_get_tex_surface(struct pipe_screen *screen,
unsigned usage)
{
struct pipe_winsys *ws = screen->winsys;
- struct cell_texture *spt = cell_texture(pt);
+ struct cell_texture *ct = cell_texture(pt);
struct pipe_surface *ps;
ps = ws->surface_alloc(ws);
if (ps) {
assert(ps->refcount);
assert(ps->winsys);
- winsys_buffer_reference(ws, &ps->buffer, spt->buffer);
+ winsys_buffer_reference(ws, &ps->buffer, ct->buffer);
ps->format = pt->format;
ps->block = pt->block;
ps->width = pt->width[level];
ps->height = pt->height[level];
ps->nblocksx = pt->nblocksx[level];
ps->nblocksy = pt->nblocksy[level];
- ps->stride = spt->stride[level];
- ps->offset = spt->level_offset[level];
+ ps->stride = ct->stride[level];
+ ps->offset = ct->level_offset[level];
ps->usage = usage;
/* XXX may need to override usage flags (see sp_texture.c) */
@@ -206,118 +294,12 @@ cell_get_tex_surface(struct pipe_screen *screen,
}
-
-/**
- * Copy tile data from linear layout to tiled layout.
- * XXX this should be rolled into the future surface-creation code.
- * XXX also need "untile" code...
- */
-static void
-tile_copy_data(uint w, uint h, uint tile_size, uint *dst, const uint *src)
-{
- const uint tile_size2 = tile_size * tile_size;
- const uint h_t = h / tile_size, w_t = w / tile_size;
-
- uint it, jt; /* tile counters */
- uint i, j; /* intra-tile counters */
-
- /* loop over dest tiles */
- for (it = 0; it < h_t; it++) {
- for (jt = 0; jt < w_t; jt++) {
- /* start of dest tile: */
- uint *tdst = dst + (it * w_t + jt) * tile_size2;
- /* loop over texels in the tile */
- for (i = 0; i < tile_size; i++) {
- for (j = 0; j < tile_size; j++) {
- const uint srci = it * tile_size + i;
- const uint srcj = jt * tile_size + j;
- *tdst++ = src[srci * w + srcj];
- }
- }
- }
- }
-}
-
-
-
-/**
- * Convert linear texture image data to tiled format for SPU usage.
- * XXX recast this in terms of pipe_surfaces (aka texture views).
- */
-static void
-cell_tile_texture(struct cell_context *cell,
- struct cell_texture *texture)
-{
- struct pipe_screen *screen = cell->pipe.screen;
- uint face = 0, level = 0, zslice = 0;
- struct pipe_surface *surf;
- const uint w = texture->base.width[0], h = texture->base.height[0];
- const uint *src;
-
- /* temporary restrictions: */
- assert(w >= TILE_SIZE);
- assert(h >= TILE_SIZE);
- assert(w % TILE_SIZE == 0);
- assert(h % TILE_SIZE == 0);
-
- surf = screen->get_tex_surface(screen, &texture->base, face, level, zslice,
- PIPE_BUFFER_USAGE_CPU_WRITE);
- ASSERT(surf);
-
- src = (const uint *) pipe_surface_map(surf, PIPE_BUFFER_USAGE_CPU_WRITE);
-
- if (texture->tiled_data) {
- align_free(texture->tiled_data);
- }
- texture->tiled_data = align_malloc(w * h * 4, 16);
-
- tile_copy_data(w, h, TILE_SIZE, texture->tiled_data, src);
-
- pipe_surface_unmap(surf);
-
- pipe_surface_reference(&surf, NULL);
-}
-
-
-void
-cell_update_texture_mapping(struct cell_context *cell)
-{
-#if 0
- uint face = 0, level = 0, zslice = 0;
-#endif
- uint i;
-
- for (i = 0; i < CELL_MAX_SAMPLERS; i++) {
- if (cell->texture[i])
- cell_tile_texture(cell, cell->texture[i]);
- }
-
-#if 0
- if (cell->tex_surf && cell->tex_map) {
- pipe_surface_unmap(cell->tex_surf);
- cell->tex_map = NULL;
- }
-
- /* XXX free old surface */
-
- cell->tex_surf = cell_get_tex_surface(&cell->pipe,
- &cell->texture[0]->base,
- face, level, zslice);
-
- cell->tex_map = pipe_surface_map(cell->tex_surf);
-#endif
-}
-
-
static void
cell_tex_surface_release(struct pipe_screen *screen,
struct pipe_surface **s)
{
- /* Effectively do the texture_update work here - if texture images
- * needed post-processing to put them into hardware layout, this is
- * where it would happen. For softpipe, nothing to do.
- */
- assert ((*s)->texture);
+ /* XXX if done rendering to teximage, re-tile */
+
pipe_texture_reference(&(*s)->texture, NULL);
screen->winsys->surface_release(screen->winsys, s);
@@ -325,9 +307,9 @@ cell_tex_surface_release(struct pipe_screen *screen,
static void *
-cell_surface_map( struct pipe_screen *screen,
- struct pipe_surface *surface,
- unsigned flags )
+cell_surface_map(struct pipe_screen *screen,
+ struct pipe_surface *surface,
+ unsigned flags)
{
ubyte *map;
@@ -339,22 +321,8 @@ cell_surface_map( struct pipe_screen *screen,
map = pipe_buffer_map( screen, surface->buffer, flags );
if (map == NULL)
return NULL;
-
- /* May want to different things here depending on read/write nature
- * of the map:
- */
- if (surface->texture &&
- (flags & PIPE_BUFFER_USAGE_CPU_WRITE))
- {
- /* Do something to notify sharing contexts of a texture change.
- * In softpipe, that would mean flushing the texture cache.
- */
-#if 0
- cell_screen(screen)->timestamp++;
-#endif
- }
-
- return map + surface->offset;
+ else
+ return (void *) (map + surface->offset);
}
@@ -362,17 +330,21 @@ static void
cell_surface_unmap(struct pipe_screen *screen,
struct pipe_surface *surface)
{
- pipe_buffer_unmap( screen, surface->buffer );
-}
+ struct cell_texture *ct = cell_texture(surface->texture);
+ assert(ct);
-void
-cell_init_texture_functions(struct cell_context *cell)
-{
- /*cell->pipe.texture_update = cell_texture_update;*/
+ if ((ct->base.tex_usage & PIPE_TEXTURE_USAGE_SAMPLER) &&
+ (surface->usage & PIPE_BUFFER_USAGE_CPU_WRITE)) {
+ /* convert from linear to tiled layout */
+ cell_twiddle_texture(screen, surface);
+ }
+
+ pipe_buffer_unmap( screen, surface->buffer );
}
+
void
cell_init_screen_texture_funcs(struct pipe_screen *screen)
{
diff --git a/src/gallium/drivers/cell/ppu/cell_texture.h b/src/gallium/drivers/cell/ppu/cell_texture.h
index 6d37e95ebc..a0757091b0 100644
--- a/src/gallium/drivers/cell/ppu/cell_texture.h
+++ b/src/gallium/drivers/cell/ppu/cell_texture.h
@@ -40,15 +40,15 @@ struct cell_texture
{
struct pipe_texture base;
- unsigned long level_offset[PIPE_MAX_TEXTURE_LEVELS];
- unsigned long stride[PIPE_MAX_TEXTURE_LEVELS];
+ unsigned long level_offset[CELL_MAX_TEXTURE_LEVELS];
+ unsigned long stride[CELL_MAX_TEXTURE_LEVELS];
/* The data is held here:
*/
struct pipe_buffer *buffer;
unsigned long buffer_size;
- void *tiled_data; /* XXX this may be temporary */ /*ALIGN16*/
+ void *tiled_data[CELL_MAX_TEXTURE_LEVELS]; /* XXX this may be temporary */ /*ALIGN16*/
};
@@ -62,14 +62,6 @@ cell_texture(struct pipe_texture *pt)
extern void
-cell_update_texture_mapping(struct cell_context *cell);
-
-
-extern void
-cell_init_texture_functions(struct cell_context *cell);
-
-
-extern void
cell_init_screen_texture_funcs(struct pipe_screen *screen);
diff --git a/src/gallium/drivers/cell/ppu/cell_vbuf.c b/src/gallium/drivers/cell/ppu/cell_vbuf.c
index aa63435b93..578ddf62dc 100644
--- a/src/gallium/drivers/cell/ppu/cell_vbuf.c
+++ b/src/gallium/drivers/cell/ppu/cell_vbuf.c
@@ -214,6 +214,7 @@ cell_vbuf_draw(struct vbuf_render *vbr,
render->opcode = CELL_CMD_RENDER;
render->prim_type = cvbr->prim;
+ render->front_winding = cell->rasterizer->front_winding;
render->num_indexes = nr_indices;
render->min_index = min_index;
diff --git a/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
index 566df7f59e..18969005b0 100644
--- a/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
+++ b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
@@ -73,8 +73,8 @@ emit_matrix_transpose(struct spe_function *p,
int col3;
- spe_lqd(p, shuf_hi, shuf_ptr, 3);
- spe_lqd(p, shuf_lo, shuf_ptr, 4);
+ spe_lqd(p, shuf_hi, shuf_ptr, 3*16);
+ spe_lqd(p, shuf_lo, shuf_ptr, 4*16);
spe_shufb(p, t1, row0, row2, shuf_hi);
spe_shufb(p, t2, row0, row2, shuf_lo);
@@ -122,13 +122,13 @@ emit_matrix_transpose(struct spe_function *p,
*/
switch (count) {
case 4:
- spe_stqd(p, col3, dest_ptr, 3);
+ spe_stqd(p, col3, dest_ptr, 3 * 16);
case 3:
- spe_stqd(p, col2, dest_ptr, 2);
+ spe_stqd(p, col2, dest_ptr, 2 * 16);
case 2:
- spe_stqd(p, col1, dest_ptr, 1);
+ spe_stqd(p, col1, dest_ptr, 1 * 16);
case 1:
- spe_stqd(p, col0, dest_ptr, 0);
+ spe_stqd(p, col0, dest_ptr, 0 * 16);
}
@@ -166,17 +166,17 @@ emit_fetch(struct spe_function *p,
float scale_signed = 0.0;
float scale_unsigned = 0.0;
- spe_lqd(p, v0, in_ptr, 0 + offset[0]);
- spe_lqd(p, v1, in_ptr, 1 + offset[0]);
- spe_lqd(p, v2, in_ptr, 2 + offset[0]);
- spe_lqd(p, v3, in_ptr, 3 + offset[0]);
+ spe_lqd(p, v0, in_ptr, (0 + offset[0]) * 16);
+ spe_lqd(p, v1, in_ptr, (1 + offset[0]) * 16);
+ spe_lqd(p, v2, in_ptr, (2 + offset[0]) * 16);
+ spe_lqd(p, v3, in_ptr, (3 + offset[0]) * 16);
offset[0] += 4;
switch (bytes) {
case 1:
scale_signed = 1.0f / 127.0f;
scale_unsigned = 1.0f / 255.0f;
- spe_lqd(p, tmp, shuf_ptr, 1);
+ spe_lqd(p, tmp, shuf_ptr, 1 * 16);
spe_shufb(p, v0, v0, v0, tmp);
spe_shufb(p, v1, v1, v1, tmp);
spe_shufb(p, v2, v2, v2, tmp);
@@ -185,7 +185,7 @@ emit_fetch(struct spe_function *p,
case 2:
scale_signed = 1.0f / 32767.0f;
scale_unsigned = 1.0f / 65535.0f;
- spe_lqd(p, tmp, shuf_ptr, 2);
+ spe_lqd(p, tmp, shuf_ptr, 2 * 16);
spe_shufb(p, v0, v0, v0, tmp);
spe_shufb(p, v1, v1, v1, tmp);
spe_shufb(p, v2, v2, v2, tmp);
@@ -241,11 +241,11 @@ emit_fetch(struct spe_function *p,
switch (count) {
case 1:
- spe_stqd(p, float_zero, out_ptr, 1);
+ spe_stqd(p, float_zero, out_ptr, 1 * 16);
case 2:
- spe_stqd(p, float_zero, out_ptr, 2);
+ spe_stqd(p, float_zero, out_ptr, 2 * 16);
case 3:
- spe_stqd(p, float_one, out_ptr, 3);
+ spe_stqd(p, float_one, out_ptr, 3 * 16);
}
if (float_zero != -1) {
diff --git a/src/gallium/drivers/cell/spu/spu_colorpack.h b/src/gallium/drivers/cell/spu/spu_colorpack.h
index fd8dc6ded3..d7ce005524 100644
--- a/src/gallium/drivers/cell/spu/spu_colorpack.h
+++ b/src/gallium/drivers/cell/spu/spu_colorpack.h
@@ -31,6 +31,7 @@
#define SPU_COLORPACK_H
+#include <transpose_matrix4x4.h>
#include <spu_intrinsics.h>
@@ -84,10 +85,10 @@ spu_unpack_B8G8R8A8(uint color)
vector unsigned int color_u4 = spu_splats(color);
color_u4 = spu_shuffle(color_u4, color_u4,
((vector unsigned char) {
- 10, 10, 10, 10,
- 5, 5, 5, 5,
+ 2, 2, 2, 2,
+ 1, 1, 1, 1,
0, 0, 0, 0,
- 15, 15, 15, 15}) );
+ 3, 3, 3, 3}) );
return spu_convtf(color_u4, 32);
}
@@ -98,13 +99,47 @@ spu_unpack_A8R8G8B8(uint color)
vector unsigned int color_u4 = spu_splats(color);
color_u4 = spu_shuffle(color_u4, color_u4,
((vector unsigned char) {
- 5, 5, 5, 5,
- 10, 10, 10, 10,
- 15, 15, 15, 15,
+ 1, 1, 1, 1,
+ 2, 2, 2, 2,
+ 3, 3, 3, 3,
0, 0, 0, 0}) );
-
return spu_convtf(color_u4, 32);
}
+/**
+ * \param color_in - array of 32-bit packed ARGB colors
+ * \param color_out - returns float colors in RRRR, GGGG, BBBB, AAAA order
+ */
+static INLINE void
+spu_unpack_A8R8G8B8_transpose4(const vector unsigned int color_in[4],
+ vector float color_out[4])
+{
+ vector unsigned int c0;
+
+ c0 = spu_shuffle(color_in[0], color_in[0],
+ ((vector unsigned char) {
+ 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 0, 0, 0, 0}) );
+ color_out[0] = spu_convtf(c0, 32);
+
+ c0 = spu_shuffle(color_in[1], color_in[1],
+ ((vector unsigned char) {
+ 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 0, 0, 0, 0}) );
+ color_out[1] = spu_convtf(c0, 32);
+
+ c0 = spu_shuffle(color_in[2], color_in[2],
+ ((vector unsigned char) {
+ 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 0, 0, 0, 0}) );
+ color_out[2] = spu_convtf(c0, 32);
+
+ c0 = spu_shuffle(color_in[3], color_in[3],
+ ((vector unsigned char) {
+ 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 0, 0, 0, 0}) );
+ color_out[3] = spu_convtf(c0, 32);
+
+ _transpose_matrix4x4(color_out, color_out);
+}
+
+
+
#endif /* SPU_COLORPACK_H */
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index ec9da5d887..c28677ebf8 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -59,6 +59,14 @@ static unsigned char attribute_fetch_code_buffer[136 * PIPE_MAX_ATTRIBS]
+static INLINE int
+align(int value, int alignment)
+{
+ return (value + alignment - 1) & ~(alignment - 1);
+}
+
+
+
/**
* Tell the PPU that this SPU has finished copying a buffer to
* local store and that it may be reused by the PPU.
@@ -231,6 +239,25 @@ cmd_state_fragment_program(const struct cell_command_fragment_program *fp)
}
+static uint
+cmd_state_fs_constants(const uint64_t *buffer, uint pos)
+{
+ const uint num_const = buffer[pos + 1];
+ const float *constants = (const float *) &buffer[pos + 2];
+ uint i;
+
+ DEBUG_PRINTF("CMD_STATE_FS_CONSTANTS (%u)\n", num_const);
+
+ /* Expand each float to float[4] for SOA execution */
+ for (i = 0; i < num_const; i++) {
+ spu.constants[i] = spu_splats(constants[i]);
+ }
+
+ /* return new buffer pos (in 8-byte words) */
+ return pos + 2 + num_const / 2;
+}
+
+
static void
cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
{
@@ -276,16 +303,96 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
}
+/**
+ * Tex texture mask_s/t and scale_s/t fields depend on the texture size and
+ * sampler wrap modes.
+ */
+static void
+update_tex_masks(struct spu_texture *texture,
+ const struct pipe_sampler_state *sampler,
+ uint unit)
+{
+ uint i;
+
+ for (i = 0; i < CELL_MAX_TEXTURE_LEVELS; i++) {
+ int width = texture->level[i].width;
+ int height = texture->level[i].height;
+
+ if (sampler->wrap_s == PIPE_TEX_WRAP_REPEAT)
+ texture->level[i].mask_s = spu_splats(width - 1);
+ else
+ texture->level[i].mask_s = spu_splats(~0);
+
+ if (sampler->wrap_t == PIPE_TEX_WRAP_REPEAT)
+ texture->level[i].mask_t = spu_splats(height - 1);
+ else
+ texture->level[i].mask_t = spu_splats(~0);
+
+ if (sampler->normalized_coords) {
+ texture->level[i].scale_s = spu_splats((float) width);
+ texture->level[i].scale_t = spu_splats((float) height);
+ }
+ else {
+ texture->level[i].scale_s = spu_splats(1.0f);
+ texture->level[i].scale_t = spu_splats(1.0f);
+ }
+ }
+
+ /* XXX temporary hack */
+ if (texture->target == PIPE_TEXTURE_CUBE) {
+ spu.sample_texture4[unit] = sample_texture4_cube;
+ }
+}
+
+
static void
cmd_state_sampler(const struct cell_command_sampler *sampler)
{
- DEBUG_PRINTF("SAMPLER [%u]\n", sampler->unit);
+ uint unit = sampler->unit;
+
+ DEBUG_PRINTF("SAMPLER [%u]\n", unit);
+
+ spu.sampler[unit] = sampler->state;
+
+ switch (spu.sampler[unit].min_img_filter) {
+ case PIPE_TEX_FILTER_LINEAR:
+ spu.min_sample_texture4[unit] = sample_texture4_bilinear;
+ break;
+ case PIPE_TEX_FILTER_ANISO:
+ /* fall-through, for now */
+ case PIPE_TEX_FILTER_NEAREST:
+ spu.min_sample_texture4[unit] = sample_texture4_nearest;
+ break;
+ default:
+ ASSERT(0);
+ }
+
+ switch (spu.sampler[sampler->unit].mag_img_filter) {
+ case PIPE_TEX_FILTER_LINEAR:
+ spu.mag_sample_texture4[unit] = sample_texture4_bilinear;
+ break;
+ case PIPE_TEX_FILTER_ANISO:
+ /* fall-through, for now */
+ case PIPE_TEX_FILTER_NEAREST:
+ spu.mag_sample_texture4[unit] = sample_texture4_nearest;
+ break;
+ default:
+ ASSERT(0);
+ }
+
+ switch (spu.sampler[sampler->unit].min_mip_filter) {
+ case PIPE_TEX_MIPFILTER_NEAREST:
+ case PIPE_TEX_MIPFILTER_LINEAR:
+ spu.sample_texture4[unit] = sample_texture4_lod;
+ break;
+ case PIPE_TEX_MIPFILTER_NONE:
+ spu.sample_texture4[unit] = spu.mag_sample_texture4[unit];
+ break;
+ default:
+ ASSERT(0);
+ }
- spu.sampler[sampler->unit] = sampler->state;
- if (spu.sampler[sampler->unit].min_img_filter == PIPE_TEX_FILTER_LINEAR)
- spu.sample_texture[sampler->unit] = sample_texture_bilinear;
- else
- spu.sample_texture[sampler->unit] = sample_texture_nearest;
+ update_tex_masks(&spu.texture[unit], &spu.sampler[unit], unit);
}
@@ -293,24 +400,44 @@ static void
cmd_state_texture(const struct cell_command_texture *texture)
{
const uint unit = texture->unit;
- const uint width = texture->width;
- const uint height = texture->height;
+ uint i;
+
+ //if (spu.init.id==0) Debug=1;
+
+ DEBUG_PRINTF("TEXTURE [%u]\n", texture->unit);
+
+ spu.texture[unit].max_level = 0;
+ spu.texture[unit].target = texture->target;
- DEBUG_PRINTF("TEXTURE [%u] at %p size %u x %u\n",
- texture->unit, texture->start,
- texture->width, texture->height);
+ for (i = 0; i < CELL_MAX_TEXTURE_LEVELS; i++) {
+ uint width = texture->width[i];
+ uint height = texture->height[i];
+ uint depth = texture->depth[i];
- spu.texture[unit].start = texture->start;
- spu.texture[unit].width = width;
- spu.texture[unit].height = height;
+ DEBUG_PRINTF(" LEVEL %u: at %p size[0] %u x %u\n", i,
+ texture->start[i], texture->width[i], texture->height[i]);
- spu.texture[unit].tiles_per_row = width / TILE_SIZE;
+ spu.texture[unit].level[i].start = texture->start[i];
+ spu.texture[unit].level[i].width = width;
+ spu.texture[unit].level[i].height = height;
+ spu.texture[unit].level[i].depth = depth;
- spu.texture[unit].tex_size = (vector float) { width, height, 0.0, 0.0};
- spu.texture[unit].tex_size_mask = (vector unsigned int)
- { width - 1, height - 1, 0, 0 };
- spu.texture[unit].tex_size_x_mask = spu_splats(width - 1);
- spu.texture[unit].tex_size_y_mask = spu_splats(height - 1);
+ spu.texture[unit].level[i].tiles_per_row =
+ (width + TILE_SIZE - 1) / TILE_SIZE;
+
+ spu.texture[unit].level[i].bytes_per_image =
+ 4 * align(width, TILE_SIZE) * align(height, TILE_SIZE) * depth;
+
+ spu.texture[unit].level[i].max_s = spu_splats((int) width - 1);
+ spu.texture[unit].level[i].max_t = spu_splats((int) height - 1);
+
+ if (texture->start[i])
+ spu.texture[unit].max_level = i;
+ }
+
+ update_tex_masks(&spu.texture[unit], &spu.sampler[unit], unit);
+
+ //Debug=0;
}
@@ -456,6 +583,9 @@ cmd_batch(uint opcode)
pos += sizeof(*fp) / 8;
}
break;
+ case CELL_CMD_STATE_FS_CONSTANTS:
+ pos = cmd_state_fs_constants(buffer, pos);
+ break;
case CELL_CMD_STATE_SAMPLER:
{
struct cell_command_sampler *sampler
diff --git a/src/gallium/drivers/cell/spu/spu_funcs.c b/src/gallium/drivers/cell/spu/spu_funcs.c
index b57ad3f3b8..5c3ee305d4 100644
--- a/src/gallium/drivers/cell/spu/spu_funcs.c
+++ b/src/gallium/drivers/cell/spu/spu_funcs.c
@@ -35,53 +35,96 @@
#include <string.h>
#include <libmisc.h>
-#include <cos8_v.h>
-#include <sin8_v.h>
+#include <math.h>
+#include <cos14_v.h>
+#include <sin14_v.h>
+#include <transpose_matrix4x4.h>
#include "cell/common.h"
#include "spu_main.h"
#include "spu_funcs.h"
-#define M_PI 3.1415926
+/** For "return"-ing four vectors */
+struct vec_4x4
+{
+ vector float v[4];
+};
static vector float
spu_cos(vector float x)
{
-#if 0
- static const float scale = 1.0 / (2.0 * M_PI);
- x = x * spu_splats(scale); /* normalize */
- return _cos8_v(x);
-#else
- /* just pass-through to avoid trashing caller's stack */
- return x;
-#endif
+ return _cos14_v(x);
}
static vector float
spu_sin(vector float x)
{
-#if 0
- static const float scale = 1.0 / (2.0 * M_PI);
- x = x * spu_splats(scale); /* normalize */
- return _sin8_v(x); /* 8-bit accuracy enough?? */
-#else
- /* just pass-through to avoid trashing caller's stack */
- return x;
-#endif
+ return _sin14_v(x);
+}
+
+static vector float
+spu_pow(vector float x, vector float y)
+{
+ float z0 = powf(spu_extract(x,0), spu_extract(y,0));
+ float z1 = powf(spu_extract(x,1), spu_extract(y,1));
+ float z2 = powf(spu_extract(x,2), spu_extract(y,2));
+ float z3 = powf(spu_extract(x,3), spu_extract(y,3));
+ return (vector float) {z0, z1, z2, z3};
}
+static vector float
+spu_exp2(vector float x)
+{
+ float z0 = powf(2.0f, spu_extract(x,0));
+ float z1 = powf(2.0f, spu_extract(x,1));
+ float z2 = powf(2.0f, spu_extract(x,2));
+ float z3 = powf(2.0f, spu_extract(x,3));
+ return (vector float) {z0, z1, z2, z3};
+}
+static vector float
+spu_log2(vector float x)
+{
+ /*
+ * log_base_2(x) = log(x) / log(2)
+ * 1.442695 = 1/log(2).
+ */
+ static const vector float k = {1.442695F, 1.442695F, 1.442695F, 1.442695F};
+ float z0 = logf(spu_extract(x,0));
+ float z1 = logf(spu_extract(x,1));
+ float z2 = logf(spu_extract(x,2));
+ float z3 = logf(spu_extract(x,3));
+ vector float v = (vector float) {z0, z1, z2, z3};
+ return spu_mul(v, k);
+}
+
+
+static struct vec_4x4
+spu_txp(vector float s, vector float t, vector float r, vector float q,
+ unsigned unit)
+{
+ struct vec_4x4 colors;
+ spu.sample_texture4[unit](s, t, r, q, unit, 0, 0, colors.v);
+ return colors;
+}
+
+
+/**
+ * Add named function to list of "exported" functions that will be
+ * made available to the PPU-hosted code generator.
+ */
static void
-add_func(struct cell_spu_function_info *spu_functions,
- const char *name, void *addr)
+export_func(struct cell_spu_function_info *spu_functions,
+ const char *name, void *addr)
{
uint n = spu_functions->num;
ASSERT(strlen(name) < 16);
strcpy(spu_functions->names[n], name);
spu_functions->addrs[n] = (uint) addr;
spu_functions->num++;
+ ASSERT(spu_functions->num <= 16);
}
@@ -99,8 +142,12 @@ return_function_info(void)
ASSERT(sizeof(funcs) == 256); /* must be multiple of 16 bytes */
funcs.num = 0;
- add_func(&funcs, "spu_cos", &spu_cos);
- add_func(&funcs, "spu_sin", &spu_sin);
+ export_func(&funcs, "spu_cos", &spu_cos);
+ export_func(&funcs, "spu_sin", &spu_sin);
+ export_func(&funcs, "spu_pow", &spu_pow);
+ export_func(&funcs, "spu_exp2", &spu_exp2);
+ export_func(&funcs, "spu_log2", &spu_log2);
+ export_func(&funcs, "spu_txp", &spu_txp);
/* Send the function info back to the PPU / main memory */
mfc_put((void *) &funcs, /* src in local store */
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 29a305232e..eff43b870c 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -41,6 +41,9 @@
#define MAX_HEIGHT 1024
+#define CELL_MAX_CONSTANTS 32 /**< number of float[4] constants */
+
+
/**
* A tile is basically a TILE_SIZE x TILE_SIZE block of 4-byte pixels.
* The data may be addressed through several different types.
@@ -61,8 +64,13 @@ typedef union {
/** Function for sampling textures */
-typedef vector float (*spu_sample_texture_func)(uint unit,
- vector float texcoord);
+typedef void (*spu_sample_texture4_func)(vector float s,
+ vector float t,
+ vector float r,
+ vector float q,
+ uint unit, uint level, uint face,
+ vector float colors[4]);
+
/** Function for performing per-fragment ops */
typedef void (*spu_fragment_ops_func)(uint x, uint y,
@@ -73,7 +81,8 @@ typedef void (*spu_fragment_ops_func)(uint x, uint y,
vector float fragGreen,
vector float fragBlue,
vector float fragAlpha,
- vector unsigned int mask);
+ vector unsigned int mask,
+ uint facing);
/** Function for running fragment program */
typedef void (*spu_fragment_program_func)(vector float *inputs,
@@ -98,15 +107,27 @@ struct spu_framebuffer
} ALIGN16_ATTRIB;
-struct spu_texture
+/** per-texture level info */
+struct spu_texture_level
{
void *start;
- ushort width, height;
+ ushort width, height, depth;
ushort tiles_per_row;
- vector float tex_size;
- vector unsigned int tex_size_mask; /**< == int(size - 1) */
- vector unsigned int tex_size_x_mask; /**< == int(size - 1) */
- vector unsigned int tex_size_y_mask; /**< == int(size - 1) */
+ uint bytes_per_image;
+ /** texcoord scale factors */
+ vector float scale_s, scale_t, scale_r;
+ /** texcoord masks (if REPEAT then size-1, else ~0) */
+ vector signed int mask_s, mask_t, mask_r;
+ /** texcoord clamp limits */
+ vector signed int max_s, max_t, max_r;
+} ALIGN16_ATTRIB;
+
+
+struct spu_texture
+{
+ struct spu_texture_level level[CELL_MAX_TEXTURE_LEVELS];
+ uint max_level;
+ uint target; /**< PIPE_TEXTURE_x */
} ALIGN16_ATTRIB;
@@ -154,11 +175,12 @@ struct spu_global
spu_fragment_program_func fragment_program;
/** Current texture sampler function */
- spu_sample_texture_func sample_texture[CELL_MAX_SAMPLERS];
+ spu_sample_texture4_func sample_texture4[CELL_MAX_SAMPLERS];
+ spu_sample_texture4_func min_sample_texture4[CELL_MAX_SAMPLERS];
+ spu_sample_texture4_func mag_sample_texture4[CELL_MAX_SAMPLERS];
- /** Fragment program constants (XXX preliminary/used) */
-#define MAX_CONSTANTS 32
- vector float constants[MAX_CONSTANTS];
+ /** Fragment program constants */
+ vector float constants[4 * CELL_MAX_CONSTANTS];
} ALIGN16_ATTRIB;
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
index f107764fb2..d252fa6dc1 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
@@ -57,7 +57,8 @@ spu_fallback_fragment_ops(uint x, uint y,
vector float fragG,
vector float fragB,
vector float fragA,
- vector unsigned int mask)
+ vector unsigned int mask,
+ uint facing)
{
vector float frag_aos[4];
unsigned int fbc0, fbc1, fbc2, fbc3 ; /* framebuffer/tile colors */
@@ -433,23 +434,23 @@ spu_fallback_fragment_ops(uint x, uint y,
/* Form bitmask depending on color buffer format and colormask bits */
switch (spu.fb.color_format) {
case PIPE_FORMAT_A8R8G8B8_UNORM:
- if (spu.blend.colormask & (1<<0))
+ if (spu.blend.colormask & PIPE_MASK_R)
cmask |= 0x00ff0000; /* red */
- if (spu.blend.colormask & (1<<1))
+ if (spu.blend.colormask & PIPE_MASK_G)
cmask |= 0x0000ff00; /* green */
- if (spu.blend.colormask & (1<<2))
+ if (spu.blend.colormask & PIPE_MASK_B)
cmask |= 0x000000ff; /* blue */
- if (spu.blend.colormask & (1<<3))
+ if (spu.blend.colormask & PIPE_MASK_A)
cmask |= 0xff000000; /* alpha */
break;
case PIPE_FORMAT_B8G8R8A8_UNORM:
- if (spu.blend.colormask & (1<<0))
+ if (spu.blend.colormask & PIPE_MASK_R)
cmask |= 0x0000ff00; /* red */
- if (spu.blend.colormask & (1<<1))
+ if (spu.blend.colormask & PIPE_MASK_G)
cmask |= 0x00ff0000; /* green */
- if (spu.blend.colormask & (1<<2))
+ if (spu.blend.colormask & PIPE_MASK_B)
cmask |= 0xff000000; /* blue */
- if (spu.blend.colormask & (1<<3))
+ if (spu.blend.colormask & PIPE_MASK_A)
cmask |= 0x000000ff; /* alpha */
break;
default:
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.h b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
index f817abf046..a61689c83a 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
@@ -38,7 +38,8 @@ spu_fallback_fragment_ops(uint x, uint y,
vector float fragGreen,
vector float fragBlue,
vector float fragAlpha,
- vector unsigned int mask);
+ vector unsigned int mask,
+ uint facing);
#endif /* SPU_PER_FRAGMENT_OP */
diff --git a/src/gallium/drivers/cell/spu/spu_render.c b/src/gallium/drivers/cell/spu/spu_render.c
index 305dc98881..82dbeb26b7 100644
--- a/src/gallium/drivers/cell/spu/spu_render.c
+++ b/src/gallium/drivers/cell/spu/spu_render.c
@@ -279,7 +279,7 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
v1 = (const float *) (vertices + indexes[j+1] * vertex_size);
v2 = (const float *) (vertices + indexes[j+2] * vertex_size);
- drawn += tri_draw(v0, v1, v2, tx, ty);
+ drawn += tri_draw(v0, v1, v2, tx, ty, render->front_winding);
}
//printf("SPU %u: drew %u of %u\n", spu.init.id, drawn, render->num_indexes/3);
@@ -297,5 +297,3 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
printf("SPU %u: RENDER done\n",
spu.init.id);
}
-
-
diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index 117b8a36f8..42eb06a362 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -26,6 +26,8 @@
**************************************************************************/
+#include <math.h>
+
#include "pipe/p_compiler.h"
#include "spu_main.h"
#include "spu_texture.h"
@@ -40,37 +42,19 @@
void
invalidate_tex_cache(void)
{
- uint unit = 0;
- uint bytes = 4 * spu.texture[unit].width
- * spu.texture[unit].height;
-
- spu_dcache_mark_dirty((unsigned) spu.texture[unit].start, bytes);
-}
+ uint lvl;
+ for (lvl = 0; lvl < CELL_MAX_TEXTURE_LEVELS; lvl++) {
+ uint unit = 0;
+ uint bytes = 4 * spu.texture[unit].level[lvl].width
+ * spu.texture[unit].level[lvl].height;
+ if (spu.texture[unit].target == PIPE_TEXTURE_CUBE)
+ bytes *= 6;
+ else if (spu.texture[unit].target == PIPE_TEXTURE_3D)
+ bytes *= spu.texture[unit].level[lvl].depth;
-/**
- * XXX look into getting texels for all four pixels in a quad at once.
- */
-static uint
-get_texel(uint unit, vec_uint4 coordinate)
-{
- /*
- * XXX we could do the "/ TILE_SIZE" and "% TILE_SIZE" operations as
- * SIMD since X and Y are already in a SIMD register.
- */
- const unsigned texture_ea = (uintptr_t) spu.texture[unit].start;
- ushort x = spu_extract(coordinate, 0);
- ushort y = spu_extract(coordinate, 1);
- unsigned tile_offset = sizeof(tile_t)
- * ((y / TILE_SIZE * spu.texture[unit].tiles_per_row) + (x / TILE_SIZE));
- ushort texel_offset = (ushort) 4
- * (ushort) (((ushort) (y % TILE_SIZE) * (ushort) TILE_SIZE) + (x % TILE_SIZE));
- vec_uint4 tmp;
-
- spu_dcache_fetch_unaligned((qword *) & tmp,
- texture_ea + tile_offset + texel_offset,
- 4);
- return spu_extract(tmp, 0);
+ spu_dcache_mark_dirty((unsigned) spu.texture[unit].level[lvl].start, bytes);
+ }
}
@@ -88,15 +72,17 @@ get_texel(uint unit, vec_uint4 coordinate)
* a time.
*/
static void
-get_four_texels(uint unit, vec_uint4 x, vec_uint4 y, vec_uint4 *texels)
+get_four_texels(uint unit, uint level, uint face, vec_int4 x, vec_int4 y,
+ vec_uint4 *texels)
{
- const unsigned texture_ea = (uintptr_t) spu.texture[unit].start;
- vec_uint4 tile_x = spu_rlmask(x, -5);
- vec_uint4 tile_y = spu_rlmask(y, -5);
- const qword offset_x = si_andi((qword) x, 0x1f);
- const qword offset_y = si_andi((qword) y, 0x1f);
+ const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
+ unsigned texture_ea = (uintptr_t) tlevel->start;
+ const vec_int4 tile_x = spu_rlmask(x, -5); /* tile_x = x / 32 */
+ const vec_int4 tile_y = spu_rlmask(y, -5); /* tile_y = y / 32 */
+ const qword offset_x = si_andi((qword) x, 0x1f); /* offset_x = x & 0x1f */
+ const qword offset_y = si_andi((qword) y, 0x1f); /* offset_y = y & 0x1f */
- const qword tiles_per_row = (qword) spu_splats(spu.texture[unit].tiles_per_row);
+ const qword tiles_per_row = (qword) spu_splats(tlevel->tiles_per_row);
const qword tile_size = (qword) spu_splats((unsigned) sizeof(tile_t));
qword tile_offset = si_mpya((qword) tile_y, tiles_per_row, (qword) tile_x);
@@ -107,6 +93,8 @@ get_four_texels(uint unit, vec_uint4 x, vec_uint4 y, vec_uint4 *texels)
vec_uint4 offset = (vec_uint4) si_a(tile_offset, texel_offset);
+ texture_ea = texture_ea + face * tlevel->bytes_per_image;
+
spu_dcache_fetch_unaligned((qword *) & texels[0],
texture_ea + spu_extract(offset, 0), 4);
spu_dcache_fetch_unaligned((qword *) & texels[1],
@@ -118,83 +106,496 @@ get_four_texels(uint unit, vec_uint4 x, vec_uint4 y, vec_uint4 *texels)
}
+/** clamp vec to [0, max] */
+static INLINE vector signed int
+spu_clamp(vector signed int vec, vector signed int max)
+{
+ static const vector signed int zero = {0,0,0,0};
+ vector unsigned int c;
+ c = spu_cmpgt(vec, zero); /* c = vec > zero ? ~0 : 0 */
+ vec = spu_sel(zero, vec, c);
+ c = spu_cmpgt(vec, max); /* c = vec > max ? ~0 : 0 */
+ vec = spu_sel(vec, max, c);
+ return vec;
+}
+
+
+
/**
- * Get texture sample at texcoord.
+ * Do nearest texture sampling for four pixels.
+ * \param colors returned colors in SOA format (rrrr, gggg, bbbb, aaaa).
*/
-vector float
-sample_texture_nearest(uint unit, vector float texcoord)
+void
+sample_texture4_nearest(vector float s, vector float t,
+ vector float r, vector float q,
+ uint unit, uint level, uint face,
+ vector float colors[4])
{
- vector float tc = spu_mul(texcoord, spu.texture[unit].tex_size);
- vector unsigned int itc = spu_convtu(tc, 0); /* convert to int */
- itc = spu_and(itc, spu.texture[unit].tex_size_mask); /* mask (GL_REPEAT) */
- uint texel = get_texel(unit, itc);
- return spu_unpack_A8R8G8B8(texel);
+ const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
+ vector float ss = spu_mul(s, tlevel->scale_s);
+ vector float tt = spu_mul(t, tlevel->scale_t);
+ vector signed int is = spu_convts(ss, 0);
+ vector signed int it = spu_convts(tt, 0);
+ vec_uint4 texels[4];
+
+ /* PIPE_TEX_WRAP_REPEAT */
+ is = spu_and(is, tlevel->mask_s);
+ it = spu_and(it, tlevel->mask_t);
+
+ /* PIPE_TEX_WRAP_CLAMP */
+ is = spu_clamp(is, tlevel->max_s);
+ it = spu_clamp(it, tlevel->max_t);
+
+ get_four_texels(unit, level, face, is, it, texels);
+
+ /* convert four packed ARGB pixels to float RRRR,GGGG,BBBB,AAAA */
+ spu_unpack_A8R8G8B8_transpose4(texels, colors);
}
-vector float
-sample_texture_bilinear(uint unit, vector float texcoord)
+/**
+ * Do bilinear texture sampling for four pixels.
+ * \param colors returned colors in SOA format (rrrr, gggg, bbbb, aaaa).
+ */
+void
+sample_texture4_bilinear(vector float s, vector float t,
+ vector float r, vector float q,
+ uint unit, uint level, uint face,
+ vector float colors[4])
{
- static const vec_uint4 offset_x = {0, 0, 1, 1};
- static const vec_uint4 offset_y = {0, 1, 0, 1};
+ const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
+ static const vector float half = {-0.5f, -0.5f, -0.5f, -0.5f};
- vector float tc = spu_mul(texcoord, spu.texture[unit].tex_size);
- tc = spu_add(tc, spu_splats(-0.5f)); /* half texel bias */
+ vector float ss = spu_madd(s, tlevel->scale_s, half);
+ vector float tt = spu_madd(t, tlevel->scale_t, half);
- /* integer texcoords S,T: */
- vec_uint4 itc = spu_convtu(tc, 0); /* convert to int */
+ vector signed int is0 = spu_convts(ss, 0);
+ vector signed int it0 = spu_convts(tt, 0);
- vec_uint4 texels[4];
-
- /* setup texcoords for quad:
- * +-----+-----+
- * |x0,y0|x1,y1|
- * +-----+-----+
- * |x2,y2|x3,y3|
- * +-----+-----+
- */
- vec_uint4 x = spu_splats(spu_extract(itc, 0));
- vec_uint4 y = spu_splats(spu_extract(itc, 1));
- x = spu_add(x, offset_x);
- y = spu_add(y, offset_y);
+ /* is + 1, it + 1 */
+ vector signed int is1 = spu_add(is0, 1);
+ vector signed int it1 = spu_add(it0, 1);
+
+ /* PIPE_TEX_WRAP_REPEAT */
+ is0 = spu_and(is0, tlevel->mask_s);
+ it0 = spu_and(it0, tlevel->mask_t);
+ is1 = spu_and(is1, tlevel->mask_s);
+ it1 = spu_and(it1, tlevel->mask_t);
- /* GL_REPEAT wrap mode: */
- x = spu_and(x, spu.texture[unit].tex_size_x_mask);
- y = spu_and(y, spu.texture[unit].tex_size_y_mask);
+ /* PIPE_TEX_WRAP_CLAMP */
+ is0 = spu_clamp(is0, tlevel->max_s);
+ it0 = spu_clamp(it0, tlevel->max_t);
+ is1 = spu_clamp(is1, tlevel->max_s);
+ it1 = spu_clamp(it1, tlevel->max_t);
- get_four_texels(unit, x, y, texels);
+ /* get packed int texels */
+ vector unsigned int texels[16];
+ get_four_texels(unit, level, face, is0, it0, texels + 0); /* upper-left */
+ get_four_texels(unit, level, face, is1, it0, texels + 4); /* upper-right */
+ get_four_texels(unit, level, face, is0, it1, texels + 8); /* lower-left */
+ get_four_texels(unit, level, face, is1, it1, texels + 12); /* lower-right */
- /* integer A8R8G8B8 to float texel conversion */
- vector float texel00 = spu_unpack_A8R8G8B8(spu_extract(texels[0], 0));
- vector float texel01 = spu_unpack_A8R8G8B8(spu_extract(texels[1], 0));
- vector float texel10 = spu_unpack_A8R8G8B8(spu_extract(texels[2], 0));
- vector float texel11 = spu_unpack_A8R8G8B8(spu_extract(texels[3], 0));
+ /* XXX possibly rework following code to compute the weighted sample
+ * colors with integer arithmetic for fewer int->float conversions.
+ */
+ /* convert packed int texels to float colors */
+ vector float ftexels[16];
+ spu_unpack_A8R8G8B8_transpose4(texels + 0, ftexels + 0);
+ spu_unpack_A8R8G8B8_transpose4(texels + 4, ftexels + 4);
+ spu_unpack_A8R8G8B8_transpose4(texels + 8, ftexels + 8);
+ spu_unpack_A8R8G8B8_transpose4(texels + 12, ftexels + 12);
/* Compute weighting factors in [0,1]
* Multiply texcoord by 1024, AND with 1023, convert back to float.
*/
- vector float tc1024 = spu_mul(tc, spu_splats(1024.0f));
- vector signed int itc1024 = spu_convts(tc1024, 0);
- itc1024 = spu_and(itc1024, spu_splats((1 << 10) - 1));
- vector float weight = spu_convtf(itc1024, 10);
-
- /* smeared frac and 1-frac */
- vector float sfrac = spu_splats(spu_extract(weight, 0));
- vector float tfrac = spu_splats(spu_extract(weight, 1));
- vector float sfrac1 = spu_sub(spu_splats(1.0f), sfrac);
- vector float tfrac1 = spu_sub(spu_splats(1.0f), tfrac);
-
- /* multiply the samples (colors) by the S/T weights */
- texel00 = spu_mul(spu_mul(texel00, sfrac1), tfrac1);
- texel10 = spu_mul(spu_mul(texel10, sfrac ), tfrac1);
- texel01 = spu_mul(spu_mul(texel01, sfrac1), tfrac );
- texel11 = spu_mul(spu_mul(texel11, sfrac ), tfrac );
-
- /* compute sum of weighted samples */
- vector float texel_sum = spu_add(texel00, texel01);
- texel_sum = spu_add(texel_sum, texel10);
- texel_sum = spu_add(texel_sum, texel11);
-
- return texel_sum;
+ vector float ss1024 = spu_mul(ss, spu_splats(1024.0f));
+ vector signed int iss1024 = spu_convts(ss1024, 0);
+ iss1024 = spu_and(iss1024, 1023);
+ vector float sWeights0 = spu_convtf(iss1024, 10);
+
+ vector float tt1024 = spu_mul(tt, spu_splats(1024.0f));
+ vector signed int itt1024 = spu_convts(tt1024, 0);
+ itt1024 = spu_and(itt1024, 1023);
+ vector float tWeights0 = spu_convtf(itt1024, 10);
+
+ /* 1 - sWeight and 1 - tWeight */
+ vector float sWeights1 = spu_sub(spu_splats(1.0f), sWeights0);
+ vector float tWeights1 = spu_sub(spu_splats(1.0f), tWeights0);
+
+ /* reds, for four pixels */
+ ftexels[ 0] = spu_mul(ftexels[ 0], spu_mul(sWeights1, tWeights1)); /*ul*/
+ ftexels[ 4] = spu_mul(ftexels[ 4], spu_mul(sWeights0, tWeights1)); /*ur*/
+ ftexels[ 8] = spu_mul(ftexels[ 8], spu_mul(sWeights1, tWeights0)); /*ll*/
+ ftexels[12] = spu_mul(ftexels[12], spu_mul(sWeights0, tWeights0)); /*lr*/
+ colors[0] = spu_add(spu_add(ftexels[0], ftexels[4]),
+ spu_add(ftexels[8], ftexels[12]));
+
+ /* greens, for four pixels */
+ ftexels[ 1] = spu_mul(ftexels[ 1], spu_mul(sWeights1, tWeights1)); /*ul*/
+ ftexels[ 5] = spu_mul(ftexels[ 5], spu_mul(sWeights0, tWeights1)); /*ur*/
+ ftexels[ 9] = spu_mul(ftexels[ 9], spu_mul(sWeights1, tWeights0)); /*ll*/
+ ftexels[13] = spu_mul(ftexels[13], spu_mul(sWeights0, tWeights0)); /*lr*/
+ colors[1] = spu_add(spu_add(ftexels[1], ftexels[5]),
+ spu_add(ftexels[9], ftexels[13]));
+
+ /* blues, for four pixels */
+ ftexels[ 2] = spu_mul(ftexels[ 2], spu_mul(sWeights1, tWeights1)); /*ul*/
+ ftexels[ 6] = spu_mul(ftexels[ 6], spu_mul(sWeights0, tWeights1)); /*ur*/
+ ftexels[10] = spu_mul(ftexels[10], spu_mul(sWeights1, tWeights0)); /*ll*/
+ ftexels[14] = spu_mul(ftexels[14], spu_mul(sWeights0, tWeights0)); /*lr*/
+ colors[2] = spu_add(spu_add(ftexels[2], ftexels[6]),
+ spu_add(ftexels[10], ftexels[14]));
+
+ /* alphas, for four pixels */
+ ftexels[ 3] = spu_mul(ftexels[ 3], spu_mul(sWeights1, tWeights1)); /*ul*/
+ ftexels[ 7] = spu_mul(ftexels[ 7], spu_mul(sWeights0, tWeights1)); /*ur*/
+ ftexels[11] = spu_mul(ftexels[11], spu_mul(sWeights1, tWeights0)); /*ll*/
+ ftexels[15] = spu_mul(ftexels[15], spu_mul(sWeights0, tWeights0)); /*lr*/
+ colors[3] = spu_add(spu_add(ftexels[3], ftexels[7]),
+ spu_add(ftexels[11], ftexels[15]));
+}
+
+
+
+/**
+ * Adapted from /opt/cell/sdk/usr/spu/include/transpose_matrix4x4.h
+ */
+static INLINE void
+transpose(vector unsigned int *mOut0,
+ vector unsigned int *mOut1,
+ vector unsigned int *mOut2,
+ vector unsigned int *mOut3,
+ vector unsigned int *mIn)
+{
+ vector unsigned int abcd, efgh, ijkl, mnop; /* input vectors */
+ vector unsigned int aeim, bfjn, cgko, dhlp; /* output vectors */
+ vector unsigned int aibj, ckdl, emfn, gohp; /* intermediate vectors */
+
+ vector unsigned char shufflehi = ((vector unsigned char) {
+ 0x00, 0x01, 0x02, 0x03,
+ 0x10, 0x11, 0x12, 0x13,
+ 0x04, 0x05, 0x06, 0x07,
+ 0x14, 0x15, 0x16, 0x17});
+ vector unsigned char shufflelo = ((vector unsigned char) {
+ 0x08, 0x09, 0x0A, 0x0B,
+ 0x18, 0x19, 0x1A, 0x1B,
+ 0x0C, 0x0D, 0x0E, 0x0F,
+ 0x1C, 0x1D, 0x1E, 0x1F});
+ abcd = *(mIn+0);
+ efgh = *(mIn+1);
+ ijkl = *(mIn+2);
+ mnop = *(mIn+3);
+
+ aibj = spu_shuffle(abcd, ijkl, shufflehi);
+ ckdl = spu_shuffle(abcd, ijkl, shufflelo);
+ emfn = spu_shuffle(efgh, mnop, shufflehi);
+ gohp = spu_shuffle(efgh, mnop, shufflelo);
+
+ aeim = spu_shuffle(aibj, emfn, shufflehi);
+ bfjn = spu_shuffle(aibj, emfn, shufflelo);
+ cgko = spu_shuffle(ckdl, gohp, shufflehi);
+ dhlp = spu_shuffle(ckdl, gohp, shufflelo);
+
+ *mOut0 = aeim;
+ *mOut1 = bfjn;
+ *mOut2 = cgko;
+ *mOut3 = dhlp;
+}
+
+
+/**
+ * Bilinear filtering, using int instead of float arithmetic
+ */
+void
+sample_texture4_bilinear_2(vector float s, vector float t,
+ vector float r, vector float q,
+ uint unit, uint level, uint face,
+ vector float colors[4])
+{
+ const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
+ static const vector float half = {-0.5f, -0.5f, -0.5f, -0.5f};
+
+ /* Scale texcoords by size of texture, and add half pixel bias */
+ vector float ss = spu_madd(s, tlevel->scale_s, half);
+ vector float tt = spu_madd(t, tlevel->scale_t, half);
+
+ /* convert float coords to fixed-pt coords with 8 fraction bits */
+ vector signed int is = spu_convts(ss, 8);
+ vector signed int it = spu_convts(tt, 8);
+
+ /* compute integer texel weights in [0, 255] */
+ vector signed int sWeights0 = spu_and(is, 255);
+ vector signed int tWeights0 = spu_and(it, 255);
+ vector signed int sWeights1 = spu_sub(255, sWeights0);
+ vector signed int tWeights1 = spu_sub(255, tWeights0);
+
+ /* texel coords: is0 = is / 256, it0 = it / 256 */
+ vector signed int is0 = spu_rlmask(is, -8);
+ vector signed int it0 = spu_rlmask(it, -8);
+
+ /* texel coords: is1 = is0 + 1, it1 = it0 + 1 */
+ vector signed int is1 = spu_add(is0, 1);
+ vector signed int it1 = spu_add(it0, 1);
+
+ /* PIPE_TEX_WRAP_REPEAT */
+ is0 = spu_and(is0, tlevel->mask_s);
+ it0 = spu_and(it0, tlevel->mask_t);
+ is1 = spu_and(is1, tlevel->mask_s);
+ it1 = spu_and(it1, tlevel->mask_t);
+
+ /* PIPE_TEX_WRAP_CLAMP */
+ is0 = spu_clamp(is0, tlevel->max_s);
+ it0 = spu_clamp(it0, tlevel->max_t);
+ is1 = spu_clamp(is1, tlevel->max_s);
+ it1 = spu_clamp(it1, tlevel->max_t);
+
+ /* get packed int texels */
+ vector unsigned int texels[16];
+ get_four_texels(unit, level, face, is0, it0, texels + 0); /* upper-left */
+ get_four_texels(unit, level, face, is1, it0, texels + 4); /* upper-right */
+ get_four_texels(unit, level, face, is0, it1, texels + 8); /* lower-left */
+ get_four_texels(unit, level, face, is1, it1, texels + 12); /* lower-right */
+
+ /* twiddle packed 32-bit BGRA pixels into RGBA as four unsigned ints */
+ {
+ static const unsigned char ZERO = 0x80;
+ int i;
+ for (i = 0; i < 16; i++) {
+ texels[i] = spu_shuffle(texels[i], texels[i],
+ ((vector unsigned char) {
+ ZERO, ZERO, ZERO, 1,
+ ZERO, ZERO, ZERO, 2,
+ ZERO, ZERO, ZERO, 3,
+ ZERO, ZERO, ZERO, 0}));
+ }
+ }
+
+ /* convert RGBA,RGBA,RGBA,RGBA to RRRR,GGGG,BBBB,AAAA */
+ vector unsigned int texel0, texel1, texel2, texel3, texel4, texel5, texel6, texel7,
+ texel8, texel9, texel10, texel11, texel12, texel13, texel14, texel15;
+ transpose(&texel0, &texel1, &texel2, &texel3, texels + 0);
+ transpose(&texel4, &texel5, &texel6, &texel7, texels + 4);
+ transpose(&texel8, &texel9, &texel10, &texel11, texels + 8);
+ transpose(&texel12, &texel13, &texel14, &texel15, texels + 12);
+
+ /* compute weighted colors */
+ vector unsigned int c0, c1, c2, c3, cSum;
+
+ /* red */
+ c0 = (vector unsigned int) si_mpyu((qword) texel0, si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/
+ c1 = (vector unsigned int) si_mpyu((qword) texel4, si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/
+ c2 = (vector unsigned int) si_mpyu((qword) texel8, si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/
+ c3 = (vector unsigned int) si_mpyu((qword) texel12, si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/
+ cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
+ colors[0] = spu_convtf(cSum, 24);
+
+ /* green */
+ c0 = (vector unsigned int) si_mpyu((qword) texel1, si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/
+ c1 = (vector unsigned int) si_mpyu((qword) texel5, si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/
+ c2 = (vector unsigned int) si_mpyu((qword) texel9, si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/
+ c3 = (vector unsigned int) si_mpyu((qword) texel13, si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/
+ cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
+ colors[1] = spu_convtf(cSum, 24);
+
+ /* blue */
+ c0 = (vector unsigned int) si_mpyu((qword) texel2, si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/
+ c1 = (vector unsigned int) si_mpyu((qword) texel6, si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/
+ c2 = (vector unsigned int) si_mpyu((qword) texel10, si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/
+ c3 = (vector unsigned int) si_mpyu((qword) texel14, si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/
+ cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
+ colors[2] = spu_convtf(cSum, 24);
+
+ /* alpha */
+ c0 = (vector unsigned int) si_mpyu((qword) texel3, si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/
+ c1 = (vector unsigned int) si_mpyu((qword) texel7, si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/
+ c2 = (vector unsigned int) si_mpyu((qword) texel11, si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/
+ c3 = (vector unsigned int) si_mpyu((qword) texel15, si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/
+ cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
+ colors[3] = spu_convtf(cSum, 24);
+}
+
+
+
+/**
+ * Compute the level-of-detail factor (lambda = log2(rho)) for a quad from its s/t texcoords.
+ */
+static float
+compute_lambda(uint unit, vector float s, vector float t)
+{
+ uint baseLevel = 0;
+ float width = spu.texture[unit].level[baseLevel].width;
+ float height = spu.texture[unit].level[baseLevel].height; /* t scales by height, not width */
+ float dsdx = width * (spu_extract(s, 1) - spu_extract(s, 0)); /* lane 1 - lane 0 */
+ float dsdy = width * (spu_extract(s, 2) - spu_extract(s, 0)); /* lane 2 - lane 0 */
+ float dtdx = height * (spu_extract(t, 1) - spu_extract(t, 0));
+ float dtdy = height * (spu_extract(t, 2) - spu_extract(t, 0));
+ float x = dsdx * dsdx + dtdx * dtdx; /* squared texel step along x */
+ float y = dsdy * dsdy + dtdy * dtdy; /* squared texel step along y */
+ float rho = x > y ? x : y;
+ rho = sqrtf(rho);
+ float lambda = logf(rho) * 1.442695f; /* ln(rho) * (1 / ln 2) = log2(rho) */
+ return lambda;
+}
+
+
+
+/**
+ * Texture sampling with level of detail selection.
+ */
+void
+sample_texture4_lod(vector float s, vector float t,
+ vector float r, vector float q,
+ uint unit, uint level_ignored, uint face,
+ vector float colors[4])
+{
+ /*
+ * Note that we're computing a lambda/lod here that's used for all
+ * four pixels in the quad.
+ */
+ float lambda = compute_lambda(unit, s, t);
+
+ /* apply lod bias */
+ lambda += spu.sampler[unit].lod_bias;
+
+ /* clamp */
+ if (lambda < spu.sampler[unit].min_lod)
+ lambda = spu.sampler[unit].min_lod;
+ else if (lambda > spu.sampler[unit].max_lod)
+ lambda = spu.sampler[unit].max_lod;
+
+ if (lambda <= 0.0f) {
+ /* magnify */
+ spu.mag_sample_texture4[unit](s, t, r, q, unit, 0, 0, colors);
+ }
+ else {
+ /* minify */
+ int level = (int) (lambda + 0.5f);
+ if (level > (int) spu.texture[unit].max_level)
+ level = spu.texture[unit].max_level;
+ spu.min_sample_texture4[unit](s, t, r, q, unit, level, 0, colors);
+ /* XXX to do: mipmap level interpolation */
+ }
+}
+
+
+/** XXX need a SIMD version of this */
+static unsigned
+choose_cube_face(float rx, float ry, float rz, float *newS, float *newT)
+{
+ /*
+ major axis
+ direction target sc tc ma
+ ---------- ------------------------------- --- --- ---
+ +rx TEXTURE_CUBE_MAP_POSITIVE_X_EXT -rz -ry rx
+ -rx TEXTURE_CUBE_MAP_NEGATIVE_X_EXT +rz -ry rx
+ +ry TEXTURE_CUBE_MAP_POSITIVE_Y_EXT +rx +rz ry
+ -ry TEXTURE_CUBE_MAP_NEGATIVE_Y_EXT +rx -rz ry
+ +rz TEXTURE_CUBE_MAP_POSITIVE_Z_EXT +rx -ry rz
+ -rz TEXTURE_CUBE_MAP_NEGATIVE_Z_EXT -rx -ry rz
+ */
+ const float arx = fabsf(rx);
+ const float ary = fabsf(ry);
+ const float arz = fabsf(rz);
+ unsigned face;
+ float sc, tc, ma;
+
+ if (arx > ary && arx > arz) {
+ if (rx >= 0.0F) {
+ face = PIPE_TEX_FACE_POS_X;
+ sc = -rz;
+ tc = -ry;
+ ma = arx;
+ }
+ else {
+ face = PIPE_TEX_FACE_NEG_X;
+ sc = rz;
+ tc = -ry;
+ ma = arx;
+ }
+ }
+ else if (ary > arx && ary > arz) {
+ if (ry >= 0.0F) {
+ face = PIPE_TEX_FACE_POS_Y;
+ sc = rx;
+ tc = rz;
+ ma = ary;
+ }
+ else {
+ face = PIPE_TEX_FACE_NEG_Y;
+ sc = rx;
+ tc = -rz;
+ ma = ary;
+ }
+ }
+ else {
+ if (rz > 0.0F) {
+ face = PIPE_TEX_FACE_POS_Z;
+ sc = rx;
+ tc = -ry;
+ ma = arz;
+ }
+ else {
+ face = PIPE_TEX_FACE_NEG_Z;
+ sc = -rx;
+ tc = -ry;
+ ma = arz;
+ }
+ }
+
+ *newS = (sc / ma + 1.0F) * 0.5F;
+ *newT = (tc / ma + 1.0F) * 0.5F;
+
+ return face;
+}
+
+
+
+void
+sample_texture4_cube(vector float s, vector float t,
+ vector float r, vector float q,
+ uint unit, uint level, uint face_ignored,
+ vector float colors[4])
+{
+ static const vector float zero = {0.0f, 0.0f, 0.0f, 0.0f};
+ uint p, faces[4];
+ float newS[4], newT[4];
+
+ /* Compute cube face referenced by the four sets of texcoords.
+ * XXX we should SIMD-ize this.
+ */
+ for (p = 0; p < 4; p++) {
+ float rx = spu_extract(s, p);
+ float ry = spu_extract(t, p);
+ float rz = spu_extract(r, p);
+ faces[p] = choose_cube_face(rx, ry, rz, &newS[p], &newT[p]);
+ }
+
+ if (faces[0] == faces[1] &&
+ faces[0] == faces[2] &&
+ faces[0] == faces[3]) {
+ /* GOOD! All four texcoords refer to the same cube face */
+ s = (vector float) {newS[0], newS[1], newS[2], newS[3]};
+ t = (vector float) {newT[0], newT[1], newT[2], newT[3]};
+ sample_texture4_nearest(s, t, zero, zero, unit, level, faces[0], colors);
+ }
+ else {
+ /* BAD! The four texcoords refer to different faces */
+ for (p = 0; p < 4; p++) {
+ vector float c[4];
+
+ sample_texture4_nearest(spu_splats(newS[p]), spu_splats(newT[p]),
+ zero, zero, unit, level, faces[p], c);
+
+ float red = spu_extract(c[0], p);
+ float green = spu_extract(c[1], p);
+ float blue = spu_extract(c[2], p);
+ float alpha = spu_extract(c[3], p);
+
+ colors[0] = spu_insert(red, colors[0], p);
+ colors[1] = spu_insert(green, colors[1], p);
+ colors[2] = spu_insert(blue, colors[2], p);
+ colors[3] = spu_insert(alpha, colors[3], p);
+ }
+ }
}
diff --git a/src/gallium/drivers/cell/spu/spu_texture.h b/src/gallium/drivers/cell/spu/spu_texture.h
index f7c9738be8..387484c3ad 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.h
+++ b/src/gallium/drivers/cell/spu/spu_texture.h
@@ -36,12 +36,38 @@ extern void
invalidate_tex_cache(void);
-extern vector float
-sample_texture_nearest(uint unit, vector float texcoord);
+extern void
+sample_texture4_nearest(vector float s, vector float t,
+ vector float r, vector float q,
+ uint unit, uint level, uint face,
+ vector float colors[4]);
+
+
+extern void
+sample_texture4_bilinear(vector float s, vector float t,
+ vector float r, vector float q,
+ uint unit, uint level, uint face,
+ vector float colors[4]);
+
+extern void
+sample_texture4_bilinear_2(vector float s, vector float t,
+ vector float r, vector float q,
+ uint unit, uint level, uint face,
+ vector float colors[4]);
+
+extern void
+sample_texture4_lod(vector float s, vector float t,
+ vector float r, vector float q,
+ uint unit, uint level, uint face,
+ vector float colors[4]);
-extern vector float
-sample_texture_bilinear(uint unit, vector float texcoord);
+
+extern void
+sample_texture4_cube(vector float s, vector float t,
+ vector float r, vector float q,
+ uint unit, uint level_ignored, uint face_ignored,
+ vector float colors[4]);
#endif /* SPU_TEXTURE_H */
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 0a8fb56a62..03f094373d 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -116,21 +116,15 @@ struct setup_stage {
struct edge etop;
struct edge emaj;
- float oneoverarea;
+ float oneOverArea;
- uint tx, ty;
+ uint facing;
+
+ uint tx, ty; /**< position of current tile (x, y) */
int cliprect_minx, cliprect_maxx, cliprect_miny, cliprect_maxy;
-#if 0
- struct tgsi_interp_coef coef[PIPE_MAX_SHADER_INPUTS];
-#else
struct interp_coef coef[PIPE_MAX_SHADER_INPUTS];
-#endif
-
-#if 0
- struct quad_header quad;
-#endif
struct {
int left[2]; /**< [0] = row0, [1] = row1 */
@@ -142,101 +136,61 @@ struct setup_stage {
};
-
static struct setup_stage setup;
-
-
-#if 0
-/**
- * Basically a cast wrapper.
- */
-static INLINE struct setup_stage *setup_stage( struct draw_stage *stage )
-{
- return (struct setup_stage *)stage;
-}
-#endif
-
-#if 0
-/**
- * Clip setup.quad against the scissor/surface bounds.
- */
-static INLINE void
-quad_clip(struct setup_stage *setup)
-{
- const struct pipe_scissor_state *cliprect = &setup.softpipe->cliprect;
- const int minx = (int) cliprect->minx;
- const int maxx = (int) cliprect->maxx;
- const int miny = (int) cliprect->miny;
- const int maxy = (int) cliprect->maxy;
-
- if (setup.quad.x0 >= maxx ||
- setup.quad.y0 >= maxy ||
- setup.quad.x0 + 1 < minx ||
- setup.quad.y0 + 1 < miny) {
- /* totally clipped */
- setup.quad.mask = 0x0;
- return;
- }
- if (setup.quad.x0 < minx)
- setup.quad.mask &= (MASK_BOTTOM_RIGHT | MASK_TOP_RIGHT);
- if (setup.quad.y0 < miny)
- setup.quad.mask &= (MASK_BOTTOM_LEFT | MASK_BOTTOM_RIGHT);
- if (setup.quad.x0 == maxx - 1)
- setup.quad.mask &= (MASK_BOTTOM_LEFT | MASK_TOP_LEFT);
- if (setup.quad.y0 == maxy - 1)
- setup.quad.mask &= (MASK_TOP_LEFT | MASK_TOP_RIGHT);
-}
-#endif
-
-#if 0
-/**
- * Emit a quad (pass to next stage) with clipping.
- */
-static INLINE void
-clip_emit_quad(struct setup_stage *setup)
-{
- quad_clip(setup);
- if (setup.quad.mask) {
- struct softpipe_context *sp = setup.softpipe;
- sp->quad.first->run(sp->quad.first, &setup.quad);
- }
-}
-#endif
-
/**
* Evaluate attribute coefficients (plane equations) to compute
* attribute values for the four fragments in a quad.
* Eg: four colors will be computed (in AoS format).
*/
static INLINE void
-eval_coeff(uint slot, float x, float y, vector float result[4])
+eval_coeff(uint slot, float x, float y, vector float w, vector float result[4])
{
- switch (spu.vertex_info.interp_mode[slot]) {
+ switch (spu.vertex_info.attrib[slot].interp_mode) {
case INTERP_CONSTANT:
result[QUAD_TOP_LEFT] =
result[QUAD_TOP_RIGHT] =
result[QUAD_BOTTOM_LEFT] =
result[QUAD_BOTTOM_RIGHT] = setup.coef[slot].a0.v;
break;
-
case INTERP_LINEAR:
- /* fall-through, for now */
- default:
{
- register vector float dadx = setup.coef[slot].dadx.v;
- register vector float dady = setup.coef[slot].dady.v;
- register vector float topLeft
- = spu_add(setup.coef[slot].a0.v,
- spu_add(spu_mul(spu_splats(x), dadx),
- spu_mul(spu_splats(y), dady)));
+ vector float dadx = setup.coef[slot].dadx.v;
+ vector float dady = setup.coef[slot].dady.v;
+ vector float topLeft =
+ spu_add(setup.coef[slot].a0.v,
+ spu_add(spu_mul(spu_splats(x), dadx),
+ spu_mul(spu_splats(y), dady)));
result[QUAD_TOP_LEFT] = topLeft;
result[QUAD_TOP_RIGHT] = spu_add(topLeft, dadx);
result[QUAD_BOTTOM_LEFT] = spu_add(topLeft, dady);
result[QUAD_BOTTOM_RIGHT] = spu_add(spu_add(topLeft, dadx), dady);
}
+ break;
+ case INTERP_PERSPECTIVE:
+ {
+ vector float dadx = setup.coef[slot].dadx.v;
+ vector float dady = setup.coef[slot].dady.v;
+ vector float topLeft =
+ spu_add(setup.coef[slot].a0.v,
+ spu_add(spu_mul(spu_splats(x), dadx),
+ spu_mul(spu_splats(y), dady)));
+ /* w has one lane per pixel, but each result[] entry is one pixel's 4 components (AoS), */
+ vector float wInv = spu_re(w); /* 1.0 / w */
+ /* so splat that pixel's own 1/w across all four components before multiplying. */
+ result[QUAD_TOP_LEFT] = spu_mul(topLeft, spu_splats(spu_extract(wInv, QUAD_TOP_LEFT)));
+ result[QUAD_TOP_RIGHT] = spu_mul(spu_add(topLeft, dadx), spu_splats(spu_extract(wInv, QUAD_TOP_RIGHT)));
+ result[QUAD_BOTTOM_LEFT] = spu_mul(spu_add(topLeft, dady), spu_splats(spu_extract(wInv, QUAD_BOTTOM_LEFT)));
+ result[QUAD_BOTTOM_RIGHT] = spu_mul(spu_add(spu_add(topLeft, dadx), dady), spu_splats(spu_extract(wInv, QUAD_BOTTOM_RIGHT)));
+ }
+ break;
+ case INTERP_POS:
+ case INTERP_NONE:
+ break;
+ default:
+ ASSERT(0);
}
}
@@ -246,14 +200,14 @@ eval_coeff(uint slot, float x, float y, vector float result[4])
* XXX this will all be re-written someday.
*/
static INLINE void
-eval_coeff_soa(uint slot, float x, float y, vector float result[4])
+eval_coeff_soa(uint slot, float x, float y, vector float w, vector float result[4])
{
- eval_coeff(slot, x, y, result);
+ eval_coeff(slot, x, y, w, result);
_transpose_matrix4x4(result, result);
}
-
+/** Evaluate coefficients to get Z for four pixels in a quad */
static INLINE vector float
eval_z(float x, float y)
{
@@ -267,6 +221,20 @@ eval_z(float x, float y)
}
+/** Evaluate coefficients to get W for four pixels in a quad */
+static INLINE vector float
+eval_w(float x, float y)
+{
+ const uint slot = 0;
+ const float dwdx = setup.coef[slot].dadx.f[3];
+ const float dwdy = setup.coef[slot].dady.f[3];
+ const float topLeft = setup.coef[slot].a0.f[3] + x * dwdx + y * dwdy;
+ const vector float topLeftv = spu_splats(topLeft);
+ const vector float derivs = (vector float) { 0.0, dwdx, dwdy, dwdx + dwdy };
+ return spu_add(topLeftv, derivs);
+}
+
+
/**
* Emit a quad (pass to next stage). No clipping is done.
* Note: about 1/5 to 1/7 of the time, mask is zero and this function
@@ -274,7 +242,7 @@ eval_z(float x, float y)
* overall.
*/
static INLINE void
-emit_quad( int x, int y, mask_t mask )
+emit_quad( int x, int y, mask_t mask)
{
/* If any bits in mask are set... */
if (spu_extract(spu_orx(mask), 0)) {
@@ -284,84 +252,21 @@ emit_quad( int x, int y, mask_t mask )
spu.cur_ctile_status = TILE_STATUS_DIRTY;
spu.cur_ztile_status = TILE_STATUS_DIRTY;
- if (spu.texture[0].start) {
- /*
- * Temporary texture mapping path
- * This will go away when fragment programs support TEX inst.
- */
- const uint unit = 0;
- vector float colors[4];
- vector float texcoords[4];
- eval_coeff(2, (float) x, (float) y, texcoords);
-
- if (spu_extract(mask, 0))
- colors[0] = spu.sample_texture[unit](unit, texcoords[0]);
- if (spu_extract(mask, 1))
- colors[1] = spu.sample_texture[unit](unit, texcoords[1]);
- if (spu_extract(mask, 2))
- colors[2] = spu.sample_texture[unit](unit, texcoords[2]);
- if (spu_extract(mask, 3))
- colors[3] = spu.sample_texture[unit](unit, texcoords[3]);
-
-
- if (spu.texture[1].start) {
- /* multi-texture mapping */
- const uint unit = 1;
- vector float colors1[4];
-
- eval_coeff(2, (float) x, (float) y, texcoords);
-
- if (spu_extract(mask, 0))
- colors1[0] = spu.sample_texture[unit](unit, texcoords[0]);
- if (spu_extract(mask, 1))
- colors1[1] = spu.sample_texture[unit](unit, texcoords[1]);
- if (spu_extract(mask, 2))
- colors1[2] = spu.sample_texture[unit](unit, texcoords[2]);
- if (spu_extract(mask, 3))
- colors1[3] = spu.sample_texture[unit](unit, texcoords[3]);
-
- /* hack: modulate first texture by second */
- colors[0] = spu_mul(colors[0], colors1[0]);
- colors[1] = spu_mul(colors[1], colors1[1]);
- colors[2] = spu_mul(colors[2], colors1[2]);
- colors[3] = spu_mul(colors[3], colors1[3]);
- }
-
- {
- /* Convert fragment data from AoS to SoA format.
- * I.e. (RGBA,RGBA,RGBA,RGBA) -> (RRRR,GGGG,BBBB,AAAA)
- * This is temporary!
- */
- vector float soa_frag[4];
- _transpose_matrix4x4(soa_frag, colors);
-
- vector float fragZ = eval_z((float) x, (float) y);
-
- /* Do all per-fragment/quad operations here, including:
- * alpha test, z test, stencil test, blend and framebuffer writing.
- */
- spu.fragment_ops(ix, iy, &spu.ctile, &spu.ztile,
- fragZ,
- soa_frag[0], soa_frag[1],
- soa_frag[2], soa_frag[3],
- mask);
- }
-
- }
- else {
+ {
/*
* Run fragment shader, execute per-fragment ops, update fb/tile.
*/
vector float inputs[4*4], outputs[2*4];
vector float fragZ = eval_z((float) x, (float) y);
+ vector float fragW = eval_w((float) x, (float) y);
/* setup inputs */
#if 0
- eval_coeff_soa(1, (float) x, (float) y, inputs);
+ eval_coeff_soa(1, (float) x, (float) y, fragW, inputs);
#else
uint i;
for (i = 0; i < spu.vertex_info.num_attribs; i++) {
- eval_coeff_soa(i+1, (float) x, (float) y, inputs + i * 4);
+ eval_coeff_soa(i+1, (float) x, (float) y, fragW, inputs + i * 4);
}
#endif
ASSERT(spu.fragment_program);
@@ -379,7 +284,8 @@ emit_quad( int x, int y, mask_t mask )
outputs[0*4+1],
outputs[0*4+2],
outputs[0*4+3],
- mask);
+ mask,
+ setup.facing);
}
}
}
@@ -389,7 +295,8 @@ emit_quad( int x, int y, mask_t mask )
* Given an X or Y coordinate, return the block/quad coordinate that it
* belongs to.
*/
-static INLINE int block( int x )
+static INLINE int
+block(int x) /* returns x with bit 0 cleared, i.e. rounded down to even */
{
return x & ~1;
}
@@ -400,7 +307,8 @@ static INLINE int block( int x )
* the triangle's bounds.
* The mask is a uint4 vector and each element will be 0 or 0xffffffff.
*/
-static INLINE mask_t calculate_mask( int x )
+static INLINE mask_t
+calculate_mask(int x)
{
/* This is a little tricky.
* Use & instead of && to avoid branches.
@@ -418,7 +326,8 @@ static INLINE mask_t calculate_mask( int x )
/**
* Render a horizontal span of quads
*/
-static void flush_spans( void )
+static void
+flush_spans(void)
{
int minleft, maxright;
int x;
@@ -446,7 +355,6 @@ static void flush_spans( void )
return;
}
-
/* OK, we're very likely to need the tile data now.
* clear or finish waiting if needed.
*/
@@ -482,9 +390,7 @@ static void flush_spans( void )
* calculate_mask() could be simplified a bit...
*/
for (x = block(minleft); x <= block(maxright); x += 2) {
-#if 1
- emit_quad( x, setup.span.y, calculate_mask( x ) );
-#endif
+ emit_quad( x, setup.span.y, calculate_mask( x ));
}
setup.span.y = 0;
@@ -493,8 +399,10 @@ static void flush_spans( void )
setup.span.right[1] = 0;
}
+
#if DEBUG_VERTS
-static void print_vertex(const struct vertex_header *v)
+static void
+print_vertex(const struct vertex_header *v)
{
int i;
fprintf(stderr, "Vertex: (%p)\n", v);
@@ -506,11 +414,11 @@ static void print_vertex(const struct vertex_header *v)
#endif
-static boolean setup_sort_vertices(const struct vertex_header *v0,
- const struct vertex_header *v1,
- const struct vertex_header *v2)
+static boolean
+setup_sort_vertices(const struct vertex_header *v0,
+ const struct vertex_header *v1,
+ const struct vertex_header *v2)
{
-
#if DEBUG_VERTS
fprintf(stderr, "Triangle:\n");
print_vertex(v0);
@@ -599,13 +507,13 @@ static boolean setup_sort_vertices(const struct vertex_header *v0,
* use the prim->det value because its sign is correct.
*/
{
- const float area = (setup.emaj.dx * setup.ebot.dy -
- setup.ebot.dx * setup.emaj.dy);
+ const float area = (setup.emaj.dx * setup.ebot.dy -
+ setup.ebot.dx * setup.emaj.dy);
- setup.oneoverarea = 1.0f / area;
+ setup.oneOverArea = 1.0f / area;
/*
_mesa_printf("%s one-over-area %f area %f det %f\n",
- __FUNCTION__, setup.oneoverarea, area, prim->det );
+ __FUNCTION__, setup.oneOverArea, area, prim->det );
*/
}
@@ -628,7 +536,7 @@ static boolean setup_sort_vertices(const struct vertex_header *v0,
* \param slot which attribute slot
*/
static INLINE void
-const_coeff(uint slot)
+const_coeff4(uint slot)
{
setup.coef[slot].dadx.v = (vector float) {0.0, 0.0, 0.0, 0.0};
setup.coef[slot].dady.v = (vector float) {0.0, 0.0, 0.0, 0.0};
@@ -637,58 +545,6 @@ const_coeff(uint slot)
/**
- * Compute a0, dadx and dady for a linearly interpolated coefficient,
- * for a triangle.
- */
-static INLINE void
-tri_linear_coeff(uint slot, uint firstComp, uint lastComp)
-{
- uint i;
- const float *vmin_d = (float *) &setup.vmin->data[slot];
- const float *vmid_d = (float *) &setup.vmid->data[slot];
- const float *vmax_d = (float *) &setup.vmax->data[slot];
- const float x = spu_extract(setup.vmin->data[0], 0) - 0.5f;
- const float y = spu_extract(setup.vmin->data[0], 1) - 0.5f;
-
- for (i = firstComp; i < lastComp; i++) {
- float botda = vmid_d[i] - vmin_d[i];
- float majda = vmax_d[i] - vmin_d[i];
- float a = setup.ebot.dy * majda - botda * setup.emaj.dy;
- float b = setup.emaj.dx * botda - majda * setup.ebot.dx;
-
- ASSERT(slot < PIPE_MAX_SHADER_INPUTS);
-
- setup.coef[slot].dadx.f[i] = a * setup.oneoverarea;
- setup.coef[slot].dady.f[i] = b * setup.oneoverarea;
-
- /* calculate a0 as the value which would be sampled for the
- * fragment at (0,0), taking into account that we want to sample at
- * pixel centers, in other words (0.5, 0.5).
- *
- * this is neat but unfortunately not a good way to do things for
- * triangles with very large values of dadx or dady as it will
- * result in the subtraction and re-addition from a0 of a very
- * large number, which means we'll end up loosing a lot of the
- * fractional bits and precision from a0. the way to fix this is
- * to define a0 as the sample at a pixel center somewhere near vmin
- * instead - i'll switch to this later.
- */
- setup.coef[slot].a0.f[i] = (vmin_d[i] -
- (setup.coef[slot].dadx.f[i] * x +
- setup.coef[slot].dady.f[i] * y));
- }
-
- /*
- _mesa_printf("attr[%d].%c: %f dx:%f dy:%f\n",
- slot, "xyzw"[i],
- setup.coef[slot].a0[i],
- setup.coef[slot].dadx.f[i],
- setup.coef[slot].dady.f[i]);
- */
-}
-
-
-/**
* As above, but interp setup all four vector components.
*/
static INLINE void
@@ -708,8 +564,8 @@ tri_linear_coeff4(uint slot)
vector float b = spu_sub(spu_mul(spu_splats(setup.emaj.dx), botda),
spu_mul(majda, spu_splats(setup.ebot.dx)));
- setup.coef[slot].dadx.v = spu_mul(a, spu_splats(setup.oneoverarea));
- setup.coef[slot].dady.v = spu_mul(b, spu_splats(setup.oneoverarea));
+ setup.coef[slot].dadx.v = spu_mul(a, spu_splats(setup.oneOverArea));
+ setup.coef[slot].dady.v = spu_mul(b, spu_splats(setup.oneOverArea));
vector float tempx = spu_mul(setup.coef[slot].dadx.v, xxxx);
vector float tempy = spu_mul(setup.coef[slot].dady.v, yyyy);
@@ -718,8 +574,6 @@ tri_linear_coeff4(uint slot)
}
-
-#if 0
/**
* Compute a0, dadx and dady for a perspective-corrected interpolant,
* for a triangle.
@@ -728,82 +582,76 @@ tri_linear_coeff4(uint slot)
* Later, when we compute the value at a particular fragment position we'll
* divide the interpolated value by the interpolated W at that fragment.
*/
-static void tri_persp_coeff( unsigned slot,
- unsigned i )
+static void
+tri_persp_coeff4(uint slot)
{
- /* premultiply by 1/w:
- */
- float mina = setup.vmin->data[slot][i] * setup.vmin->data[0][3];
- float mida = setup.vmid->data[slot][i] * setup.vmid->data[0][3];
- float maxa = setup.vmax->data[slot][i] * setup.vmax->data[0][3];
-
- float botda = mida - mina;
- float majda = maxa - mina;
- float a = setup.ebot.dy * majda - botda * setup.emaj.dy;
- float b = setup.emaj.dx * botda - majda * setup.ebot.dx;
-
- /*
- printf("tri persp %d,%d: %f %f %f\n", slot, i,
- setup.vmin->data[slot][i],
- setup.vmid->data[slot][i],
- setup.vmax->data[slot][i]
- );
- */
+ const vector float xxxx = spu_splats(spu_extract(setup.vmin->data[0], 0) - 0.5f);
+ const vector float yyyy = spu_splats(spu_extract(setup.vmin->data[0], 1) - 0.5f);
+
+ const vector float vmin_w = spu_splats(spu_extract(setup.vmin->data[0], 3));
+ const vector float vmid_w = spu_splats(spu_extract(setup.vmid->data[0], 3));
+ const vector float vmax_w = spu_splats(spu_extract(setup.vmax->data[0], 3));
+
+ vector float vmin_d = setup.vmin->data[slot]; /* attribute 'slot' as stored */
+ vector float vmid_d = setup.vmid->data[slot]; /* in each of the three */
+ vector float vmax_d = setup.vmax->data[slot]; /* sorted vertices */
- assert(slot < PIPE_MAX_SHADER_INPUTS);
- assert(i <= 3);
+ vmin_d = spu_mul(vmin_d, vmin_w); /* premultiply by element 3 of the */
+ vmid_d = spu_mul(vmid_d, vmid_w); /* vertex's data[0] (presumably 1/w */
+ vmax_d = spu_mul(vmax_d, vmax_w); /* -- TODO confirm against vertex setup) */
- setup.coef[slot].dadx.f[i] = a * setup.oneoverarea;
- setup.coef[slot].dady.f[i] = b * setup.oneoverarea;
- setup.coef[slot].a0.f[i] = (mina -
- (setup.coef[slot].dadx.f[i] * (setup.vmin->data[0][0] - 0.5f) +
- setup.coef[slot].dady.f[i] * (setup.vmin->data[0][1] - 0.5f)));
+ vector float botda = vmid_d - vmin_d;
+ vector float majda = vmax_d - vmin_d;
+
+ vector float a = spu_sub(spu_mul(spu_splats(setup.ebot.dy), majda),
+ spu_mul(botda, spu_splats(setup.emaj.dy)));
+ vector float b = spu_sub(spu_mul(spu_splats(setup.emaj.dx), botda),
+ spu_mul(majda, spu_splats(setup.ebot.dx)));
+
+ setup.coef[slot].dadx.v = spu_mul(a, spu_splats(setup.oneOverArea));
+ setup.coef[slot].dady.v = spu_mul(b, spu_splats(setup.oneOverArea));
+
+ vector float tempx = spu_mul(setup.coef[slot].dadx.v, xxxx);
+ vector float tempy = spu_mul(setup.coef[slot].dady.v, yyyy);
+
+ setup.coef[slot].a0.v = spu_sub(vmin_d, spu_add(tempx, tempy));
}
-#endif
+
/**
* Compute the setup.coef[] array dadx, dady, a0 values.
* Must be called after setup.vmin,vmid,vmax,vprovoke are initialized.
*/
-static void setup_tri_coefficients(void)
+static void
+setup_tri_coefficients(void)
{
-#if 1
uint i;
for (i = 0; i < spu.vertex_info.num_attribs; i++) {
- switch (spu.vertex_info.interp_mode[i]) {
+ switch (spu.vertex_info.attrib[i].interp_mode) { /* mode chosen per attribute */
case INTERP_NONE:
break;
- case INTERP_POS:
- /*tri_linear_coeff(i, 2, 3);*/
- /* XXX interp W if PERSPECTIVE... */
- tri_linear_coeff4(i);
- break;
case INTERP_CONSTANT:
- const_coeff(i);
+ const_coeff4(i); /* writes zero dadx/dady for this slot */
break;
+ case INTERP_POS:
+ /* fall-through: position shares the linear setup used for
+  * INTERP_LINEAR below
+  */
case INTERP_LINEAR:
tri_linear_coeff4(i);
break;
case INTERP_PERSPECTIVE:
- tri_linear_coeff4(i); /* temporary */
+ tri_persp_coeff4(i); /* perspective-corrected setup, all four components */
break;
default:
ASSERT(0);
}
}
-#else
- ASSERT(spu.vertex_info.interp_mode[0] == INTERP_POS);
- ASSERT(spu.vertex_info.interp_mode[1] == INTERP_LINEAR ||
- spu.vertex_info.interp_mode[1] == INTERP_CONSTANT);
- tri_linear_coeff(0, 2, 3); /* slot 0, z */
- tri_linear_coeff(1, 0, 4); /* slot 1, color */
-#endif
}
-static void setup_tri_edges(void)
+static void
+setup_tri_edges(void)
{
float vmin_x = spu_extract(setup.vmin->data[0], 0) + 0.5f;
float vmid_x = spu_extract(setup.vmid->data[0], 0) + 0.5f;
@@ -833,9 +681,8 @@ static void setup_tri_edges(void)
* Render the upper or lower half of a triangle.
* Scissoring/cliprect is applied here too.
*/
-static void subtriangle( struct edge *eleft,
- struct edge *eright,
- unsigned lines )
+static void
+subtriangle(struct edge *eleft, struct edge *eright, unsigned lines)
{
const int minx = setup.cliprect_minx;
const int maxx = setup.cliprect_maxx;
@@ -903,12 +750,27 @@ static void subtriangle( struct edge *eleft,
}
+static float
+determinant(const float *v0, const float *v1, const float *v2)
+{
+ /* Computes the 2D determinant of the triangle (v0, v1, v2).  Only
+  * elements [0] and [1] (x and y) of each vertex array are read.
+  *
+  * edge vectors e = v0 - v2, f = v1 - v2
+  */
+ const float ex = v0[0] - v2[0];
+ const float ey = v0[1] - v2[1];
+ const float fx = v1[0] - v2[0];
+ const float fy = v1[1] - v2[1];
+
+ /* det = cross(e,f).z
+  *
+  * The sign flips when any two of the vertices are exchanged, so it
+  * encodes the winding order of v0, v1, v2; the result is zero when
+  * the three points are collinear.
+  */
+ return ex * fy - ey * fx;
+}
+
+
/**
* Draw triangle into tile at (tx, ty) (tile coords)
* The tile data should have already been fetched.
*/
boolean
-tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty)
+tri_draw(const float *v0, const float *v1, const float *v2,
+ uint tx, uint ty, uint front_winding)
{
setup.tx = tx;
setup.ty = ty;
@@ -919,6 +781,12 @@ tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty)
setup.cliprect_maxx = (tx + 1) * TILE_SIZE;
setup.cliprect_maxy = (ty + 1) * TILE_SIZE;
+ /* Before we sort vertices, determine the facing of the triangle,
+ * which will be needed for front/back-face stencil application
+ */
+ float det = determinant(v0, v1, v2);
+ setup.facing = (det > 0.0) ^ (front_winding == PIPE_WINDING_CW);
+
if (!setup_sort_vertices((struct vertex_header *) v0,
(struct vertex_header *) v1,
(struct vertex_header *) v2)) {
@@ -932,19 +800,14 @@ tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty)
setup.span.y_flags = 0;
setup.span.right[0] = 0;
setup.span.right[1] = 0;
- /* setup.span.z_mode = tri_z_mode( setup.ctx ); */
- /* init_constant_attribs( setup ); */
-
- if (setup.oneoverarea < 0.0) {
- /* emaj on left:
- */
+ if (setup.oneOverArea < 0.0) {
+ /* emaj on left */
subtriangle( &setup.emaj, &setup.ebot, setup.ebot.lines );
subtriangle( &setup.emaj, &setup.etop, setup.etop.lines );
}
else {
- /* emaj on right:
- */
+ /* emaj on right */
subtriangle( &setup.ebot, &setup.emaj, setup.ebot.lines );
subtriangle( &setup.etop, &setup.emaj, setup.etop.lines );
}
diff --git a/src/gallium/drivers/cell/spu/spu_tri.h b/src/gallium/drivers/cell/spu/spu_tri.h
index aa694dd7c9..abc3d35160 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.h
+++ b/src/gallium/drivers/cell/spu/spu_tri.h
@@ -31,7 +31,7 @@
extern boolean
-tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty);
+tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty, uint front_winding);
#endif /* SPU_TRI_H */
diff --git a/src/gallium/drivers/i915simple/i915_prim_emit.c b/src/gallium/drivers/i915simple/i915_prim_emit.c
index d194c2fb15..8f1f58b2dd 100644
--- a/src/gallium/drivers/i915simple/i915_prim_emit.c
+++ b/src/gallium/drivers/i915simple/i915_prim_emit.c
@@ -77,9 +77,9 @@ emit_hw_vertex( struct i915_context *i915,
assert(!i915->dirty);
for (i = 0; i < vinfo->num_attribs; i++) {
- const uint j = vinfo->src_index[i];
+ const uint j = vinfo->attrib[i].src_index;
const float *attrib = vertex->data[j];
- switch (vinfo->emit[i]) {
+ switch (vinfo->attrib[i].emit) {
case EMIT_1F:
OUT_BATCH( fui(attrib[0]) );
count++;
diff --git a/src/gallium/drivers/i915simple/i915_state_derived.c b/src/gallium/drivers/i915simple/i915_state_derived.c
index 488615067c..178d4e8781 100644
--- a/src/gallium/drivers/i915simple/i915_state_derived.c
+++ b/src/gallium/drivers/i915simple/i915_state_derived.c
@@ -88,12 +88,12 @@ static void calculate_vertex_layout( struct i915_context *i915 )
if (needW) {
draw_emit_vertex_attr(&vinfo, EMIT_4F, INTERP_LINEAR, src);
vinfo.hwfmt[0] |= S4_VFMT_XYZW;
- vinfo.emit[0] = EMIT_4F;
+ vinfo.attrib[0].emit = EMIT_4F;
}
else {
draw_emit_vertex_attr(&vinfo, EMIT_3F, INTERP_LINEAR, src);
vinfo.hwfmt[0] |= S4_VFMT_XYZ;
- vinfo.emit[0] = EMIT_3F;
+ vinfo.attrib[0].emit = EMIT_3F;
}
/* hardware point size */
diff --git a/src/gallium/drivers/softpipe/sp_fs_sse.c b/src/gallium/drivers/softpipe/sp_fs_sse.c
index 496ed43df2..0111469405 100644
--- a/src/gallium/drivers/softpipe/sp_fs_sse.c
+++ b/src/gallium/drivers/softpipe/sp_fs_sse.c
@@ -40,7 +40,7 @@
#include "tgsi/tgsi_sse2.h"
-#ifdef PIPE_ARCH_X86
+#if defined(PIPE_ARCH_X86) && defined(PIPE_ARCH_SSE)
#include "rtasm/rtasm_x86sse.h"
diff --git a/src/gallium/drivers/softpipe/sp_setup.c b/src/gallium/drivers/softpipe/sp_setup.c
index bc8263c33e..13d8017393 100644
--- a/src/gallium/drivers/softpipe/sp_setup.c
+++ b/src/gallium/drivers/softpipe/sp_setup.c
@@ -773,10 +773,10 @@ static void setup_tri_coefficients( struct setup_context *setup )
/* setup interpolation for all the remaining attributes:
*/
for (fragSlot = 0; fragSlot < spfs->info.num_inputs; fragSlot++) {
- const uint vertSlot = vinfo->src_index[fragSlot];
+ const uint vertSlot = vinfo->attrib[fragSlot].src_index;
uint j;
- switch (vinfo->interp_mode[fragSlot]) {
+ switch (vinfo->attrib[fragSlot].interp_mode) {
case INTERP_CONSTANT:
for (j = 0; j < NUM_CHANNELS; j++)
const_coeff(setup, &setup->coef[fragSlot], vertSlot, j);
@@ -1084,10 +1084,10 @@ setup_line_coefficients(struct setup_context *setup,
/* setup interpolation for all the remaining attributes:
*/
for (fragSlot = 0; fragSlot < spfs->info.num_inputs; fragSlot++) {
- const uint vertSlot = vinfo->src_index[fragSlot];
+ const uint vertSlot = vinfo->attrib[fragSlot].src_index;
uint j;
- switch (vinfo->interp_mode[fragSlot]) {
+ switch (vinfo->attrib[fragSlot].interp_mode) {
case INTERP_CONSTANT:
for (j = 0; j < NUM_CHANNELS; j++)
const_coeff(setup, &setup->coef[fragSlot], vertSlot, j);
@@ -1331,10 +1331,10 @@ setup_point( struct setup_context *setup,
const_coeff(setup, &setup->posCoef, 0, 3);
for (fragSlot = 0; fragSlot < spfs->info.num_inputs; fragSlot++) {
- const uint vertSlot = vinfo->src_index[fragSlot];
+ const uint vertSlot = vinfo->attrib[fragSlot].src_index;
uint j;
- switch (vinfo->interp_mode[fragSlot]) {
+ switch (vinfo->attrib[fragSlot].interp_mode) {
case INTERP_CONSTANT:
/* fall-through */
case INTERP_LINEAR:
diff --git a/src/gallium/include/pipe/p_config.h b/src/gallium/include/pipe/p_config.h
index af3746c026..ef05547819 100644
--- a/src/gallium/include/pipe/p_config.h
+++ b/src/gallium/include/pipe/p_config.h
@@ -85,6 +85,14 @@
#define PIPE_ARCH_X86_64
#endif
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
+#if defined(PIPE_CC_GCC) && !defined(__SSE2__)
+/* GCC build without __SSE2__ defined: PIPE_ARCH_SSE is left undefined in
+ * this case.  The diagnostic for it is kept commented out:
+ *
+ * #warning SSE2 support requires -msse -msse2 compiler options
+ */
+#else
+#define PIPE_ARCH_SSE
+#endif /* PIPE_CC_GCC && !__SSE2__ */
+#endif /* PIPE_ARCH_X86 || PIPE_ARCH_X86_64 */
+
#if 0 /* FIXME */
#define PIPE_ARCH_PPC
#endif