diff options
Diffstat (limited to 'src/gallium/auxiliary')
39 files changed, 2498 insertions, 1096 deletions
diff --git a/src/gallium/auxiliary/draw/draw_context.c b/src/gallium/auxiliary/draw/draw_context.c index 78249054f2..b439bc4059 100644 --- a/src/gallium/auxiliary/draw/draw_context.c +++ b/src/gallium/auxiliary/draw/draw_context.c @@ -274,6 +274,14 @@ draw_enable_point_sprites(struct draw_context *draw, boolean enable) } +void +draw_set_force_passthrough( struct draw_context *draw, boolean enable ) +{ + draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE ); + draw->force_passthrough = enable; +} + + /** * Ask the draw module for the location/slot of the given vertex attribute in * a post-transformed vertex. diff --git a/src/gallium/auxiliary/draw/draw_context.h b/src/gallium/auxiliary/draw/draw_context.h index 0ab3681b64..3eeb453531 100644 --- a/src/gallium/auxiliary/draw/draw_context.h +++ b/src/gallium/auxiliary/draw/draw_context.h @@ -160,6 +160,9 @@ void draw_set_render( struct draw_context *draw, void draw_set_driver_clipping( struct draw_context *draw, boolean bypass_clipping ); +void draw_set_force_passthrough( struct draw_context *draw, + boolean enable ); + /******************************************************************************* * Draw pipeline */ diff --git a/src/gallium/auxiliary/draw/draw_pipe_vbuf.c b/src/gallium/auxiliary/draw/draw_pipe_vbuf.c index c0cf4269db..9825e116c3 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_vbuf.c +++ b/src/gallium/auxiliary/draw/draw_pipe_vbuf.c @@ -231,9 +231,9 @@ vbuf_set_prim( struct vbuf_stage *vbuf, uint prim ) unsigned emit_sz = 0; unsigned src_buffer = 0; unsigned output_format; - unsigned src_offset = (vbuf->vinfo->src_index[i] * 4 * sizeof(float) ); + unsigned src_offset = (vbuf->vinfo->attrib[i].src_index * 4 * sizeof(float) ); - switch (vbuf->vinfo->emit[i]) { + switch (vbuf->vinfo->attrib[i].emit) { case EMIT_4F: output_format = PIPE_FORMAT_R32G32B32A32_FLOAT; emit_sz = 4 * sizeof(float); diff --git a/src/gallium/auxiliary/draw/draw_private.h b/src/gallium/auxiliary/draw/draw_private.h index 626a2e3e30..5d531146c5 100644 --- a/src/gallium/auxiliary/draw/draw_private.h +++ b/src/gallium/auxiliary/draw/draw_private.h @@ -163,12 +163,15 @@ struct draw_context struct { boolean bypass_clipping; + boolean bypass_vs; } driver; boolean flushing; /**< debugging/sanity */ boolean suspend_flushing; /**< internally set */ boolean bypass_clipping; /**< set if either api or driver bypass_clipping true */ + boolean force_passthrough; /**< never clip or shade */ + /* pipe state that we need: */ const struct pipe_rasterizer_state *rasterizer; struct pipe_viewport_state viewport; @@ -193,7 +196,7 @@ struct draw_context const float (*aligned_constants)[4]; - float (*aligned_constant_storage)[4]; + const float (*aligned_constant_storage)[4]; unsigned const_storage_size; diff --git a/src/gallium/auxiliary/draw/draw_pt.c b/src/gallium/auxiliary/draw/draw_pt.c index 669c11c993..87ec6ae20c 100644 --- a/src/gallium/auxiliary/draw/draw_pt.c +++ b/src/gallium/auxiliary/draw/draw_pt.c @@ -69,26 +69,26 @@ draw_pt_arrays(struct draw_context *draw, return TRUE; } - - if (!draw->render) { - opt |= PT_PIPELINE; - } - - if (draw_need_pipeline(draw, - draw->rasterizer, - prim)) { - opt |= PT_PIPELINE; - } - - if (!draw->bypass_clipping && !draw->pt.test_fse) { - opt |= PT_CLIPTEST; + if (!draw->force_passthrough) { + if (!draw->render) { + opt |= PT_PIPELINE; + } + + if (draw_need_pipeline(draw, + draw->rasterizer, + prim)) { + opt |= PT_PIPELINE; + } + + if (!draw->bypass_clipping && !draw->pt.test_fse) { + opt |= PT_CLIPTEST; + } + + if (!draw->rasterizer->bypass_vs) { + opt |= PT_SHADE; + } } - - if (!draw->rasterizer->bypass_vs) { - opt |= PT_SHADE; - } - - + if (opt == 0) middle = draw->pt.middle.fetch_emit; else if (opt == PT_SHADE && !draw->pt.no_fse) diff --git a/src/gallium/auxiliary/draw/draw_pt_emit.c b/src/gallium/auxiliary/draw/draw_pt_emit.c index d4eca80588..d520b05869 100644 --- a/src/gallium/auxiliary/draw/draw_pt_emit.c +++ b/src/gallium/auxiliary/draw/draw_pt_emit.c @@ -84,11 +84,11 @@ void draw_pt_emit_prepare( struct pt_emit *emit, unsigned emit_sz = 0; unsigned src_buffer = 0; unsigned output_format; - unsigned src_offset = (vinfo->src_index[i] * 4 * sizeof(float) ); + unsigned src_offset = (vinfo->attrib[i].src_index * 4 * sizeof(float) ); - switch (vinfo->emit[i]) { + switch (vinfo->attrib[i].emit) { case EMIT_4F: output_format = PIPE_FORMAT_R32G32B32A32_FLOAT; emit_sz = 4 * sizeof(float); diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c b/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c index 5a4db6cfe5..3966ad48ba 100644 --- a/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c +++ b/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c @@ -121,7 +121,7 @@ static void fetch_emit_prepare( struct draw_pt_middle_end *middle, memset(&key, 0, sizeof(key)); for (i = 0; i < vinfo->num_attribs; i++) { - const struct pipe_vertex_element *src = &draw->pt.vertex_element[vinfo->src_index[i]]; + const struct pipe_vertex_element *src = &draw->pt.vertex_element[vinfo->attrib[i].src_index]; unsigned emit_sz = 0; unsigned input_format = src->src_format; @@ -129,7 +129,7 @@ static void fetch_emit_prepare( struct draw_pt_middle_end *middle, unsigned input_offset = src->src_offset; unsigned output_format; - switch (vinfo->emit[i]) { + switch (vinfo->attrib[i].emit) { case EMIT_4F: output_format = PIPE_FORMAT_R32G32B32A32_FLOAT; emit_sz = 4 * sizeof(float); diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c index 73fc70c1bc..f7e6a1a8ee 100644 --- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c +++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c @@ -79,6 +79,7 @@ static void fse_prepare( struct draw_pt_middle_end *middle, unsigned num_vs_inputs = draw->vs.vertex_shader->info.num_inputs; const struct vertex_info *vinfo; unsigned i; + unsigned nr_vbs = 0; if (!draw->render->set_primitive( draw->render, @@ -102,7 +103,7 @@ static void fse_prepare( struct draw_pt_middle_end *middle, fse->key.viewport = !draw->identity_viewport; fse->key.clip = !draw->bypass_clipping; - fse->key.pad = 0; + fse->key.const_vbuffers = 0; memset(fse->key.element, 0, fse->key.nr_elements * sizeof(fse->key.element[0])); @@ -116,16 +117,23 @@ static void fse_prepare( struct draw_pt_middle_end *middle, */ fse->key.element[i].in.buffer = src->vertex_buffer_index; fse->key.element[i].in.offset = src->src_offset; + nr_vbs = MAX2(nr_vbs, src->vertex_buffer_index + 1); } + for (i = 0; i < 5 && i < nr_vbs; i++) { + if (draw->pt.vertex_buffer[i].pitch == 0) + fse->key.const_vbuffers |= (1<<i); + } + if (0) debug_printf("%s: lookup const_vbuffers: %x\n", __FUNCTION__, fse->key.const_vbuffers); + { unsigned dst_offset = 0; for (i = 0; i < vinfo->num_attribs; i++) { unsigned emit_sz = 0; - switch (vinfo->emit[i]) { + switch (vinfo->attrib[i].emit) { case EMIT_4F: emit_sz = 4 * sizeof(float); break; @@ -153,8 +161,8 @@ static void fse_prepare( struct draw_pt_middle_end *middle, * numbers, not to positions in the hw vertex description -- * that's handled by the output_offset field. */ - fse->key.element[i].out.format = vinfo->emit[i]; - fse->key.element[i].out.vs_output = vinfo->src_index[i]; + fse->key.element[i].out.format = vinfo->attrib[i].emit; + fse->key.element[i].out.vs_output = vinfo->attrib[i].src_index; fse->key.element[i].out.offset = dst_offset; dst_offset += emit_sz; @@ -162,13 +170,7 @@ static void fse_prepare( struct draw_pt_middle_end *middle, } } - - /* Would normally look up a vertex shader and peruse its list of - * varients somehow. We omitted that step and put all the - * hardcoded "shaders" into an array. We're just making the - * assumption that this happens to be a matching shader... ie - * you're running isosurf, aren't you? - */ + fse->active = draw_vs_lookup_varient( draw->vs.vertex_shader, &fse->key ); @@ -177,18 +179,17 @@ static void fse_prepare( struct draw_pt_middle_end *middle, return ; } + if (0) debug_printf("%s: found const_vbuffers: %x\n", __FUNCTION__, + fse->active->key.const_vbuffers); + /* Now set buffer pointers: */ - for (i = 0; i < num_vs_inputs; i++) { - unsigned buf = draw->pt.vertex_element[i].vertex_buffer_index; - - fse->active->set_input( fse->active, - i, - - ((const ubyte *) draw->pt.user.vbuffer[buf] + - draw->pt.vertex_buffer[buf].buffer_offset), - - draw->pt.vertex_buffer[buf].pitch ); + for (i = 0; i < draw->pt.nr_vertex_buffers; i++) { + fse->active->set_buffer( fse->active, + i, + ((const ubyte *) draw->pt.user.vbuffer[i] + + draw->pt.vertex_buffer[i].buffer_offset), + draw->pt.vertex_buffer[i].pitch ); } *max_vertices = (draw->render->max_vertex_buffer_bytes / diff --git a/src/gallium/auxiliary/draw/draw_vertex.c b/src/gallium/auxiliary/draw/draw_vertex.c index 1446f785c5..3214213e44 100644 --- a/src/gallium/auxiliary/draw/draw_vertex.c +++ b/src/gallium/auxiliary/draw/draw_vertex.c @@ -49,7 +49,7 @@ draw_compute_vertex_size(struct vertex_info *vinfo) vinfo->size = 0; for (i = 0; i < vinfo->num_attribs; i++) { - switch (vinfo->emit[i]) { + switch (vinfo->attrib[i].emit) { case EMIT_OMIT: break; case EMIT_4UB: @@ -81,8 +81,8 @@ draw_dump_emitted_vertex(const struct vertex_info *vinfo, const uint8_t *data) unsigned i, j; for (i = 0; i < vinfo->num_attribs; i++) { - j = vinfo->src_index[i]; - switch (vinfo->emit[i]) { + j = vinfo->attrib[i].src_index; + switch (vinfo->attrib[i].emit) { case EMIT_OMIT: debug_printf("EMIT_OMIT:"); break; diff --git a/src/gallium/auxiliary/draw/draw_vertex.h b/src/gallium/auxiliary/draw/draw_vertex.h index 16c65c4317..a943607d7e 100644 --- a/src/gallium/auxiliary/draw/draw_vertex.h +++ b/src/gallium/auxiliary/draw/draw_vertex.h @@ -75,12 +75,41 @@ struct vertex_info { uint num_attribs; uint hwfmt[4]; /**< hardware format info for this format */ - enum interp_mode interp_mode[PIPE_MAX_SHADER_INPUTS]; - enum attrib_emit emit[PIPE_MAX_SHADER_INPUTS]; /**< EMIT_x */ - uint src_index[PIPE_MAX_SHADER_INPUTS]; /**< map to post-xform attribs */ uint size; /**< total vertex size in dwords */ + + /* Keep this small and at the end of the struct to allow quick + * memcmp() comparisons. + */ + struct { + ubyte interp_mode:4; /**< INTERP_x */ + ubyte emit:4; /**< EMIT_x */ + ubyte src_index; /**< map to post-xform attribs */ + } attrib[PIPE_MAX_SHADER_INPUTS]; }; +static INLINE int +draw_vinfo_size( const struct vertex_info *a ) +{ + return ((const char *)&a->attrib[a->num_attribs] - + (const char *)a); +} + +static INLINE int +draw_vinfo_compare( const struct vertex_info *a, + const struct vertex_info *b ) +{ + unsigned sizea = draw_vinfo_size( a ); + return memcmp( a, b, sizea ); +} + +static INLINE void +draw_vinfo_copy( struct vertex_info *dst, + const struct vertex_info *src ) +{ + unsigned size = draw_vinfo_size( src ); + memcpy( dst, src, size ); +} + /** @@ -91,14 +120,15 @@ struct vertex_info */ static INLINE uint draw_emit_vertex_attr(struct vertex_info *vinfo, - enum attrib_emit emit, enum interp_mode interp, + enum attrib_emit emit, + enum interp_mode interp, /* only used by softpipe??? */ uint src_index) { const uint n = vinfo->num_attribs; assert(n < PIPE_MAX_SHADER_INPUTS); - vinfo->emit[n] = emit; - vinfo->interp_mode[n] = interp; - vinfo->src_index[n] = src_index; + vinfo->attrib[n].emit = emit; + vinfo->attrib[n].interp_mode = interp; + vinfo->attrib[n].src_index = src_index; vinfo->num_attribs++; return n; } diff --git a/src/gallium/auxiliary/draw/draw_vs.h b/src/gallium/auxiliary/draw/draw_vs.h index 45992d1986..68c24abad3 100644 --- a/src/gallium/auxiliary/draw/draw_vs.h +++ b/src/gallium/auxiliary/draw/draw_vs.h @@ -64,7 +64,7 @@ struct draw_vs_varient_key { unsigned nr_outputs:8; unsigned viewport:1; unsigned clip:1; - unsigned pad:5; + unsigned const_vbuffers:5; struct draw_varient_element element[PIPE_MAX_ATTRIBS]; }; @@ -76,7 +76,7 @@ struct draw_vs_varient { struct draw_vertex_shader *vs; - void (*set_input)( struct draw_vs_varient *, + void (*set_buffer)( struct draw_vs_varient *, unsigned i, const void *ptr, unsigned stride ); diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c index a556477a76..87232865e2 100644 --- a/src/gallium/auxiliary/draw/draw_vs_aos.c +++ b/src/gallium/auxiliary/draw/draw_vs_aos.c @@ -92,9 +92,9 @@ struct x86_reg aos_get_x86( struct aos_compilation *cp, assert(which_reg == 1); offset = Offset(struct aos_machine, constants); break; - case X86_ATTRIBS: + case X86_BUFFERS: assert(which_reg == 0); - offset = Offset(struct aos_machine, attrib); + offset = Offset(struct aos_machine, buffer); break; default: assert(0); @@ -196,6 +196,18 @@ static void spill( struct aos_compilation *cp, unsigned idx ) } +void aos_spill_all( struct aos_compilation *cp ) +{ + unsigned i; + + for (i = 0; i < 8; i++) { + if (cp->xmm[i].dirty) + spill(cp, i); + aos_release_xmm_reg(cp, i); + } +} + + static struct x86_reg get_xmm_writable( struct aos_compilation *cp, struct x86_reg reg ) { @@ -1939,6 +1951,11 @@ static boolean build_vertex_program( struct draw_vs_varient_aos_sse *varient, save_fpu_state( &cp ); set_fpu_round_nearest( &cp ); + aos_init_inputs( &cp, linear ); + + cp.x86_reg[0] = 0; + cp.x86_reg[1] = 0; + /* Note address for loop jump */ label = x86_get_label(cp.func); @@ -2018,13 +2035,7 @@ static boolean build_vertex_program( struct draw_vs_varient_aos_sse *varient, /* Incr index */ - if (linear) { - x86_inc(cp.func, cp.idx_EBX); - } - else { - x86_lea(cp.func, cp.idx_EBX, x86_make_disp(cp.idx_EBX, 4)); - } - + aos_incr_inputs( &cp, linear ); } /* decr count, loop if not zero */ @@ -2065,15 +2076,13 @@ static void vaos_set_buffer( struct draw_vs_varient *varient, unsigned stride ) { struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient; - unsigned i; - for (i = 0; i < vaos->base.key.nr_inputs; i++) { - if (vaos->base.key.element[i].in.buffer == buf) { - vaos->attrib[i].input_ptr = ((char *)ptr + - vaos->base.key.element[i].in.offset); - vaos->attrib[i].input_stride = stride; - } + if (buf < vaos->nr_vb) { + vaos->buffer[buf].base_ptr = (char *)ptr; + vaos->buffer[buf].stride = stride; } + + if (0) debug_printf("%s %d/%d: %p %d\n", __FUNCTION__, buf, vaos->nr_vb, ptr, stride); } @@ -2086,10 +2095,12 @@ static void PIPE_CDECL vaos_run_elts( struct draw_vs_varient *varient, struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient; struct aos_machine *machine = vaos->draw->vs.aos_machine; + if (0) debug_printf("%s %d\n", __FUNCTION__, count); + machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size; machine->constants = vaos->draw->vs.aligned_constants; machine->immediates = vaos->base.vs->immediates; - machine->attrib = vaos->attrib; + machine->buffer = vaos->buffer; vaos->gen_run_elts( machine, elts, @@ -2105,10 +2116,13 @@ static void PIPE_CDECL vaos_run_linear( struct draw_vs_varient *varient, struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient; struct aos_machine *machine = vaos->draw->vs.aos_machine; + if (0) debug_printf("%s %d %d const: %x\n", __FUNCTION__, start, count, + vaos->base.key.const_vbuffers); + machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size; machine->constants = vaos->draw->vs.aligned_constants; machine->immediates = vaos->base.vs->immediates; - machine->attrib = vaos->attrib; + machine->buffer = vaos->buffer; vaos->gen_run_linear( machine, start, @@ -2127,7 +2141,7 @@ static void vaos_destroy( struct draw_vs_varient *varient ) { struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient; - FREE( vaos->attrib ); + FREE( vaos->buffer ); x86_release_func( &vaos->func[0] ); x86_release_func( &vaos->func[1] ); @@ -2140,6 +2154,7 @@ static void vaos_destroy( struct draw_vs_varient *varient ) static struct draw_vs_varient *varient_aos_sse( struct draw_vertex_shader *vs, const struct draw_vs_varient_key *key ) { + unsigned i; struct draw_vs_varient_aos_sse *vaos = CALLOC_STRUCT(draw_vs_varient_aos_sse); if (!vaos) @@ -2147,17 +2162,22 @@ static struct draw_vs_varient *varient_aos_sse( struct draw_vertex_shader *vs, vaos->base.key = *key; vaos->base.vs = vs; - vaos->base.set_input = vaos_set_buffer; + vaos->base.set_buffer = vaos_set_buffer; vaos->base.destroy = vaos_destroy; vaos->base.run_linear = vaos_run_linear; vaos->base.run_elts = vaos_run_elts; vaos->draw = vs->draw; - vaos->attrib = MALLOC( key->nr_inputs * sizeof(vaos->attrib[0]) ); - if (!vaos->attrib) + for (i = 0; i < key->nr_inputs; i++) + vaos->nr_vb = MAX2( vaos->nr_vb, key->element[i].in.buffer + 1 ); + + vaos->buffer = MALLOC( vaos->nr_vb * sizeof(vaos->buffer[0]) ); + if (!vaos->buffer) goto fail; + debug_printf("nr_vb: %d const: %x\n", vaos->nr_vb, vaos->base.key.const_vbuffers); + #if 0 tgsi_dump(vs->state.tokens, 0); #endif @@ -2179,8 +2199,8 @@ static struct draw_vs_varient *varient_aos_sse( struct draw_vertex_shader *vs, return &vaos->base; fail: - if (vaos && vaos->attrib) - FREE(vaos->attrib); + if (vaos && vaos->buffer) + FREE(vaos->buffer); if (vaos) x86_release_func( &vaos->func[0] ); diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.h b/src/gallium/auxiliary/draw/draw_vs_aos.h index 7fe6f79db0..264387517b 100644 --- a/src/gallium/auxiliary/draw/draw_vs_aos.h +++ b/src/gallium/auxiliary/draw/draw_vs_aos.h @@ -87,9 +87,10 @@ struct lit_info { #define MAX_SHINE_TAB 4 #define MAX_LIT_INFO 16 -struct aos_attrib { - const void *input_ptr; - unsigned input_stride; +struct aos_buffer { + const void *base_ptr; + unsigned stride; + void *ptr; /* updated per vertex */ }; @@ -123,7 +124,7 @@ struct aos_machine { const float (*immediates)[4]; /* points to shader data */ const float (*constants)[4]; /* points to draw data */ - const struct aos_attrib *attrib; /* points to ? */ + const struct aos_buffer *buffer; /* points to ? */ }; @@ -175,12 +176,15 @@ void aos_adopt_xmm_reg( struct aos_compilation *cp, unsigned idx, unsigned dirty ); +void aos_spill_all( struct aos_compilation *cp ); + struct x86_reg aos_get_shader_reg( struct aos_compilation *cp, unsigned file, unsigned idx ); -boolean aos_fetch_inputs( struct aos_compilation *cp, - boolean linear ); +boolean aos_init_inputs( struct aos_compilation *cp, boolean linear ); +boolean aos_fetch_inputs( struct aos_compilation *cp, boolean linear ); +boolean aos_incr_inputs( struct aos_compilation *cp, boolean linear ); boolean aos_emit_outputs( struct aos_compilation *cp ); @@ -210,7 +214,7 @@ do { \ #define X86_NULL 0 #define X86_IMMEDIATES 1 #define X86_CONSTANTS 2 -#define X86_ATTRIBS 3 +#define X86_BUFFERS 3 struct x86_reg aos_get_x86( struct aos_compilation *cp, unsigned which_reg, @@ -232,7 +236,8 @@ struct draw_vs_varient_aos_sse { struct draw_vs_varient base; struct draw_context *draw; - struct aos_attrib *attrib; + struct aos_buffer *buffer; + unsigned nr_vb; vaos_run_linear_func gen_run_linear; vaos_run_elts_func gen_run_elts; diff --git a/src/gallium/auxiliary/draw/draw_vs_aos_io.c b/src/gallium/auxiliary/draw/draw_vs_aos_io.c index 26297c74f8..dd79bc799a 100644 --- a/src/gallium/auxiliary/draw/draw_vs_aos_io.c +++ b/src/gallium/auxiliary/draw/draw_vs_aos_io.c @@ -54,6 +54,7 @@ static void emit_load_R32G32B32( struct aos_compilation *cp, struct x86_reg data, struct x86_reg src_ptr ) { +#if 1 sse_movss(cp->func, data, x86_make_disp(src_ptr, 8)); /* data = z ? ? ? */ sse_shufps(cp->func, data, aos_get_internal_xmm( cp, IMM_IDENTITY ), SHUF(X,Y,Z,W) ); @@ -62,6 +63,16 @@ static void emit_load_R32G32B32( struct aos_compilation *cp, /* data = ? 0 z 1 */ sse_movlps(cp->func, data, src_ptr); /* data = x y z 1 */ +#else + sse_movups(cp->func, data, src_ptr); + /* data = x y z ? */ + sse2_pshufd(cp->func, data, data, SHUF(W,X,Y,Z) ); + /* data = ? x y z */ + sse_movss(cp->func, data, aos_get_internal_xmm( cp, IMM_ONES ) ); + /* data = 1 x y z */ + sse2_pshufd(cp->func, data, data, SHUF(Y,Z,W,X) ); + /* data = x y z 1 */ +#endif } static void emit_load_R32G32( struct aos_compilation *cp, @@ -95,28 +106,6 @@ static void emit_load_R8G8B8A8_UNORM( struct aos_compilation *cp, -static void get_src_ptr( struct aos_compilation *cp, - struct x86_reg src, - struct x86_reg elt, - unsigned a ) -{ - struct x86_reg attrib = x86_make_disp(aos_get_x86( cp, 0, X86_ATTRIBS ), - a * sizeof(struct aos_attrib)); - - struct x86_reg input_ptr = x86_make_disp(attrib, - Offset(struct aos_attrib, input_ptr)); - - struct x86_reg input_stride = x86_make_disp(attrib, - Offset(struct aos_attrib, input_stride)); - - /* Calculate pointer to current attrib: - */ - x86_mov(cp->func, src, input_stride); - x86_imul(cp->func, src, elt); - x86_add(cp->func, src, input_ptr); -} - - /* Extended swizzles? Maybe later. */ static void emit_swizzle( struct aos_compilation *cp, @@ -128,22 +117,60 @@ static void emit_swizzle( struct aos_compilation *cp, } + +static boolean get_buffer_ptr( struct aos_compilation *cp, + boolean linear, + unsigned buf_idx, + struct x86_reg elt, + struct x86_reg ptr) +{ + struct x86_reg buf = x86_make_disp(aos_get_x86( cp, 0, X86_BUFFERS ), + buf_idx * sizeof(struct aos_buffer)); + + struct x86_reg buf_stride = x86_make_disp(buf, + Offset(struct aos_buffer, stride)); + if (linear) { + struct x86_reg buf_ptr = x86_make_disp(buf, + Offset(struct aos_buffer, ptr)); + + + /* Calculate pointer to current attrib: + */ + x86_mov(cp->func, ptr, buf_ptr); + x86_mov(cp->func, elt, buf_stride); + x86_add(cp->func, elt, ptr); + if (buf_idx == 0) sse_prefetchnta(cp->func, x86_make_disp(elt, 192)); + x86_mov(cp->func, buf_ptr, elt); + } + else { + struct x86_reg buf_base_ptr = x86_make_disp(buf, + Offset(struct aos_buffer, base_ptr)); + + + /* Calculate pointer to current attrib: + */ + x86_mov(cp->func, ptr, buf_stride); + x86_imul(cp->func, ptr, elt); + x86_add(cp->func, ptr, buf_base_ptr); + } + + cp->insn_counter++; + + return TRUE; +} + + static boolean load_input( struct aos_compilation *cp, unsigned idx, - boolean linear ) + struct x86_reg bufptr ) { unsigned format = cp->vaos->base.key.element[idx].in.format; - struct x86_reg src = cp->tmp_EAX; + unsigned offset = cp->vaos->base.key.element[idx].in.offset; struct x86_reg dataXMM = aos_get_xmm_reg(cp); /* Figure out source pointer address: */ - get_src_ptr(cp, - src, - linear ? cp->idx_EBX : x86_deref(cp->idx_EBX), - idx); - - src = x86_deref(src); + struct x86_reg src = x86_make_disp(bufptr, offset); aos_adopt_xmm_reg( cp, dataXMM, @@ -179,20 +206,128 @@ static boolean load_input( struct aos_compilation *cp, return TRUE; } - -boolean aos_fetch_inputs( struct aos_compilation *cp, boolean linear ) +static boolean load_inputs( struct aos_compilation *cp, + unsigned buffer, + struct x86_reg ptr ) { unsigned i; - + for (i = 0; i < cp->vaos->base.key.nr_inputs; i++) { - if (!load_input( cp, i, linear )) - return FALSE; - cp->insn_counter++; + if (cp->vaos->base.key.element[i].in.buffer == buffer) { + + if (!load_input( cp, i, ptr )) + return FALSE; + + cp->insn_counter++; + } + } + + return TRUE; +} + +boolean aos_init_inputs( struct aos_compilation *cp, boolean linear ) +{ + unsigned i; + for (i = 0; i < cp->vaos->nr_vb; i++) { + struct x86_reg buf = x86_make_disp(aos_get_x86( cp, 0, X86_BUFFERS ), + i * sizeof(struct aos_buffer)); + + struct x86_reg buf_base_ptr = x86_make_disp(buf, + Offset(struct aos_buffer, base_ptr)); + + if (cp->vaos->base.key.const_vbuffers & (1<<i)) { + struct x86_reg ptr = cp->tmp_EAX; + + x86_mov(cp->func, ptr, buf_base_ptr); + + /* Load all inputs for this constant vertex buffer + */ + load_inputs( cp, i, x86_deref(ptr) ); + + /* Then just force them out to aos_machine.input[] + */ + aos_spill_all( cp ); + + } + else if (linear) { + + struct x86_reg elt = cp->idx_EBX; + struct x86_reg ptr = cp->tmp_EAX; + + struct x86_reg buf_stride = x86_make_disp(buf, + Offset(struct aos_buffer, stride)); + + struct x86_reg buf_ptr = x86_make_disp(buf, + Offset(struct aos_buffer, ptr)); + + + /* Calculate pointer to current attrib: + */ + x86_mov(cp->func, ptr, buf_stride); + x86_imul(cp->func, ptr, elt); + x86_add(cp->func, ptr, buf_base_ptr); + + + /* In the linear case, keep the buffer pointer instead of the + * index number. + */ + if (cp->vaos->nr_vb == 1) + x86_mov( cp->func, elt, ptr ); + else + x86_mov( cp->func, buf_ptr, ptr ); + + cp->insn_counter++; + } + } + + return TRUE; +} + +boolean aos_fetch_inputs( struct aos_compilation *cp, boolean linear ) +{ + unsigned j; + + for (j = 0; j < cp->vaos->nr_vb; j++) { + if (cp->vaos->base.key.const_vbuffers & (1<<j)) { + /* just retreive pre-transformed input */ + } + else if (linear && cp->vaos->nr_vb == 1) { + load_inputs( cp, 0, cp->idx_EBX ); + } + else { + struct x86_reg elt = linear ? cp->idx_EBX : x86_deref(cp->idx_EBX); + struct x86_reg ptr = cp->tmp_EAX; + + if (!get_buffer_ptr( cp, linear, j, elt, ptr )) + return FALSE; + + if (!load_inputs( cp, j, ptr )) + return FALSE; + } } return TRUE; } +boolean aos_incr_inputs( struct aos_compilation *cp, boolean linear ) +{ + if (linear && cp->vaos->nr_vb == 1) { + struct x86_reg stride = x86_make_disp(aos_get_x86( cp, 0, X86_BUFFERS ), + (0 * sizeof(struct aos_buffer) + + Offset(struct aos_buffer, stride))); + + x86_add(cp->func, cp->idx_EBX, stride); + sse_prefetchnta(cp->func, x86_make_disp(cp->idx_EBX, 192)); + } + else if (linear) { + /* Nothing to do */ + } + else { + x86_lea(cp->func, cp->idx_EBX, x86_make_disp(cp->idx_EBX, 4)); + } + + return TRUE; +} @@ -203,7 +338,7 @@ static void emit_store_R32G32B32A32( struct aos_compilation *cp, struct x86_reg dst_ptr, struct x86_reg dataXMM ) { - sse_movups(cp->func, dst_ptr, dataXMM); + sse_movaps(cp->func, dst_ptr, dataXMM); } static void emit_store_R32G32B32( struct aos_compilation *cp, @@ -306,7 +441,7 @@ boolean aos_emit_outputs( struct aos_compilation *cp ) if (data.file != file_XMM) { struct x86_reg tmp = aos_get_xmm_reg( cp ); - sse_movups(cp->func, tmp, data); + sse_movaps(cp->func, tmp, data); data = tmp; } diff --git a/src/gallium/auxiliary/draw/draw_vs_llvm.c b/src/gallium/auxiliary/draw/draw_vs_llvm.c index 2ce30b9a02..727977bc3a 100644 --- a/src/gallium/auxiliary/draw/draw_vs_llvm.c +++ b/src/gallium/auxiliary/draw/draw_vs_llvm.c @@ -32,6 +32,7 @@ * Brian Paul */ +#include "util/u_memory.h" #include "pipe/p_shader_tokens.h" #include "draw_private.h" #include "draw_context.h" diff --git a/src/gallium/auxiliary/draw/draw_vs_sse.c b/src/gallium/auxiliary/draw/draw_vs_sse.c index 0efabd9de8..b11ae31662 100644 --- a/src/gallium/auxiliary/draw/draw_vs_sse.c +++ b/src/gallium/auxiliary/draw/draw_vs_sse.c @@ -37,7 +37,7 @@ #include "draw_vs.h" -#if defined(PIPE_ARCH_X86) +#if defined(PIPE_ARCH_X86) && defined(PIPE_ARCH_SSE) #include "pipe/p_shader_tokens.h" diff --git a/src/gallium/auxiliary/draw/draw_vs_varient.c b/src/gallium/auxiliary/draw/draw_vs_varient.c index 4daf05dae7..7ee567d478 100644 --- a/src/gallium/auxiliary/draw/draw_vs_varient.c +++ b/src/gallium/auxiliary/draw/draw_vs_varient.c @@ -64,10 +64,10 @@ struct draw_vs_varient_generic { -static void vsvg_set_input( struct draw_vs_varient *varient, - unsigned buffer, - const void *ptr, - unsigned stride ) +static void vsvg_set_buffer( struct draw_vs_varient *varient, + unsigned buffer, + const void *ptr, + unsigned stride ) { struct draw_vs_varient_generic *vsvg = (struct draw_vs_varient_generic *)varient; @@ -265,7 +265,7 @@ struct draw_vs_varient *draw_vs_varient_generic( struct draw_vertex_shader *vs, vsvg->base.key = *key; vsvg->base.vs = vs; - vsvg->base.set_input = vsvg_set_input; + vsvg->base.set_buffer = vsvg_set_buffer; vsvg->base.run_elts = vsvg_run_elts; vsvg->base.run_linear = vsvg_run_linear; vsvg->base.destroy = vsvg_destroy; diff --git a/src/gallium/auxiliary/gallivm/gallivm_builtins.cpp b/src/gallium/auxiliary/gallivm/gallivm_builtins.cpp index 0fc5c4ec5c..fcc5c05794 100644 --- a/src/gallium/auxiliary/gallivm/gallivm_builtins.cpp +++ b/src/gallium/auxiliary/gallivm/gallivm_builtins.cpp @@ -1,140 +1,140 @@ static const unsigned char llvm_builtins_data[] = { -0x42,0x43,0xc0,0xde,0x21,0x0c,0x00,0x00,0x29,0x02,0x00,0x00,0x01,0x10,0x00,0x00, +0x42,0x43,0xc0,0xde,0x21,0x0c,0x00,0x00,0x27,0x02,0x00,0x00,0x01,0x10,0x00,0x00, 0x10,0x00,0x00,0x00,0x07,0x81,0x23,0x91,0x41,0xc8,0x04,0x49,0x06,0x10,0x32,0x39, 0x92,0x01,0x84,0x0c,0x25,0x05,0x08,0x19,0x1e,0x04,0x8b,0x62,0x80,0x14,0x45,0x02, 0x42,0x92,0x0b,0x42,0xa4,0x10,0x32,0x14,0x38,0x08,0x18,0x49,0x0a,0x32,0x44,0x24, 0x48,0x0a,0x90,0x21,0x23,0x44,0x72,0x80,0x8c,0x14,0x21,0x86,0x0a,0x8a,0x0a,0x64, -0x0c,0x1f,0x00,0x00,0x49,0x18,0x00,0x00,0x02,0x00,0x00,0x00,0x0b,0x04,0x00,0x0c, -0x00,0x00,0x00,0x00,0x51,0x20,0x00,0x00,0x12,0x00,0x00,0x00,0x32,0x22,0x48,0x09, -0x20,0x65,0x82,0x84,0x00,0x26,0x45,0x48,0x05,0x09,0x26,0x45,0xc6,0x05,0x42,0x52, -0x26,0x08,0xae,0x19,0x80,0x61,0x04,0x02,0x98,0x23,0x00,0x83,0x29,0x80,0x21,0x00, -0xb2,0x73,0x04,0x01,0x51,0x8a,0xf4,0x08,0x92,0xa4,0x39,0x47,0x80,0x50,0x2b,0x03, -0x00,0xa0,0x08,0x21,0x5c,0x46,0x2b,0x44,0x08,0x21,0xd4,0x40,0x14,0x01,0x80,0x11, -0x80,0x22,0x88,0x00,0x13,0xa2,0x74,0xb0,0x03,0x3c,0xb0,0x83,0x36,0x80,0x87,0x71, -0x68,0x03,0x76,0x48,0x07,0x77,0xa8,0x07,0x7c,0x68,0x83,0x73,0x70,0x87,0x7a,0xd8, -0x70,0x0f,0xe5,0xd0,0x06,0xf0,0xa0,0x07,0x73,0x20,0x07,0x7a,0x30,0x07,0x72,0xa0, -0x07,0x73,0x20,0x07,0x6d,0x90,0x0e,0x71,0xa0,0x07,0x78,0xa0,0x07,0x78,0xd0,0x06, -0xe9,0x80,0x07,0x7a,0x80,0x07,0x7a,0x80,0x07,0x6d,0x90,0x0e,0x71,0x60,0x07,0x7a, -0x10,0x07,0x76,0xa0,0x07,0x71,0x60,0x07,0x6d,0x90,0x0e,0x73,0x20,0x07,0x7a,0x30, -0x07,0x72,0xa0,0x07,0x73,0x20,0x07,0x6d,0x90,0x0e,0x76,0x40,0x07,0x7a,0x30,0x07, -0x72,0xa0,0x07,0x76,0x40,0x07,0x6d,0x60,0x0e,0x73,0x20,0x07,0x7a,0x30,0x07,0x72, -0xa0,0x07,0x73,0x20,0x07,0x6d,0x60,0x0e,0x76,0x40,0x07,0x7a,0x30,0x07,0x72,0xa0, -0x07,0x76,0x40,0x07,0x6d,0x60,0x0f,0x76,0x40,0x07,0x7a,0x60,0x07,0x74,0xa0,0x07, -0x76,0x40,0x07,0x6d,0x60,0x0f,0x71,0x20,0x07,0x78,0xa0,0x07,0x71,0x20,0x07,0x78, -0xa0,0x07,0x71,0x20,0x07,0x78,0xd0,0x06,0xe1,0x00,0x07,0x7a,0x00,0x07,0x7a,0x60, -0x07,0x74,0xd0,0x06,0xe6,0x80,0x07,0x70,0xa0,0x07,0x71,0x20,0x07,0x78,0xa0,0x07, -0x71,0x20,0x07,0x78,0xa0,0xf3,0x40,0x88,0x04,0x32,0x32,0x02,0x04,0x20,0x76,0x46, -0xfc,0x6c,0x48,0x92,0x00,0x40,0x00,0x00,0x00,0x00,0x0c,0x49,0x12,0x20,0x00,0x00, -0x00,0x00,0x80,0x21,0x89,0x02,0x00,0x01,0x00,0x00,0x00,0x30,0x24,0x59,0x00,0x20, -0x08,0x00,0x00,0x00,0x86,0x24,0x0a,0x00,0x04,0x00,0x00,0x00,0xc0,0x90,0x84,0x01, -0x02,0x00,0x00,0x00,0x00,0x18,0x92,0x1c,0x40,0x00,0x00,0x00,0x00,0x00,0x43,0x12, -0x05,0x00,0x02,0x00,0x00,0x00,0x60,0x48,0x72,0x00,0x01,0x00,0x00,0x00,0x00,0x0c, -0x49,0x14,0x00,0x08,0x00,0x00,0x00,0x80,0x21,0x49,0x01,0x00,0x41,0x00,0x00,0x00, -0x90,0x05,0x02,0x00,0x10,0x00,0x00,0x00,0x32,0x1e,0x98,0x10,0x19,0x11,0x4c,0x90, +0x0c,0x1f,0x00,0x00,0x49,0x18,0x00,0x00,0x03,0x00,0x00,0x00,0x0b,0x84,0xff,0xff, +0xff,0xff,0x1f,0xc0,0x00,0x00,0x00,0x00,0x51,0x20,0x00,0x00,0x12,0x00,0x00,0x00, +0x32,0x22,0x48,0x09,0x20,0x65,0x82,0x84,0x00,0x26,0x45,0x48,0x05,0x09,0x26,0x45, +0xc6,0x05,0x42,0x52,0x26,0x08,0xae,0x19,0x80,0x61,0x04,0x02,0x98,0x23,0x00,0x83, +0x29,0x80,0x21,0x00,0xb2,0x73,0x04,0x01,0x51,0x8a,0xf4,0x08,0x92,0xa4,0x39,0x47, +0x80,0x50,0x2b,0x03,0x00,0xa0,0x08,0x21,0x5c,0x46,0x2b,0x44,0x08,0x21,0xd4,0x40, +0x14,0x01,0x80,0x11,0x80,0x22,0x88,0x00,0x13,0x30,0x7c,0xc0,0x03,0x3b,0xf8,0x05, +0x3b,0xa0,0x83,0x36,0xa8,0x07,0x77,0x58,0x07,0x77,0x78,0x87,0x7b,0x70,0x87,0x36, +0x60,0x87,0x74,0x70,0x87,0x7a,0xc0,0x87,0x36,0x38,0x07,0x77,0xa8,0x87,0x0d,0xf7, +0x50,0x0e,0x6d,0x00,0x0f,0x7a,0x60,0x07,0x74,0xa0,0x07,0x76,0x40,0x07,0x7a,0x60, +0x07,0x74,0xd0,0x06,0xe9,0x10,0x07,0x7a,0x80,0x07,0x7a,0x80,0x07,0x6d,0x90,0x0e, +0x78,0xa0,0x07,0x78,0xa0,0x07,0x78,0xd0,0x06,0xe9,0x10,0x07,0x76,0xa0,0x07,0x71, +0x60,0x07,0x7a,0x10,0x07,0x76,0xd0,0x06,0xe9,0x30,0x07,0x72,0xa0,0x07,0x73,0x20, +0x07,0x7a,0x30,0x07,0x72,0xd0,0x06,0xe9,0x60,0x07,0x74,0xa0,0x07,0x76,0x40,0x07, +0x7a,0x60,0x07,0x74,0xd0,0x06,0xe6,0x30,0x07,0x72,0xa0,0x07,0x73,0x20,0x07,0x7a, +0x30,0x07,0x72,0xd0,0x06,0xe6,0x60,0x07,0x74,0xa0,0x07,0x76,0x40,0x07,0x7a,0x60, +0x07,0x74,0xd0,0x06,0xf6,0x60,0x07,0x74,0xa0,0x07,0x76,0x40,0x07,0x7a,0x60,0x07, +0x74,0xd0,0x06,0xf6,0x10,0x07,0x72,0x80,0x07,0x7a,0x10,0x07,0x72,0x80,0x07,0x7a, +0x10,0x07,0x72,0x80,0x07,0x6d,0x10,0x0e,0x70,0xa0,0x07,0x70,0xa0,0x07,0x76,0x40, +0x07,0x6d,0x60,0x0e,0x78,0x00,0x07,0x7a,0x10,0x07,0x72,0x80,0x07,0x7a,0x10,0x07, +0x72,0x80,0x07,0x3a,0x0f,0x84,0x48,0x20,0x23,0x24,0x40,0x00,0x62,0x67,0x88,0x9f, +0x19,0x92,0x24,0x00,0x10,0x04,0x00,0x00,0x00,0x43,0x92,0x04,0x08,0x00,0x00,0x00, +0x00,0x60,0x48,0xa2,0x00,0x40,0x10,0x00,0x00,0x00,0x0c,0x49,0x16,0x00,0x08,0x02, +0x00,0x00,0x80,0x21,0x89,0x02,0x00,0x41,0x00,0x00,0x00,0x30,0x24,0x61,0x80,0x00, +0x00,0x00,0x00,0x00,0x86,0x24,0x07,0x10,0x00,0x00,0x00,0x00,0xc0,0x90,0x44,0x01, +0x80,0x20,0x00,0x00,0x00,0x18,0x92,0x1c,0x40,0x00,0x00,0x00,0x00,0x00,0x43,0x12, +0x05,0x00,0x82,0x00,0x00,0x00,0x60,0x48,0x52,0x00,0x40,0x10,0x00,0x00,0x00,0x64, +0x81,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x32,0x1e,0x98,0x10,0x19,0x11,0x4c,0x90, 0x8c,0x09,0x26,0x47,0xc6,0x04,0x43,0x8a,0x8a,0x59,0x8b,0x43,0x50,0xd2,0x09,0x02, 0x81,0xd2,0x73,0x50,0xc9,0x0c,0x2a,0x99,0x41,0x25,0x33,0xa8,0x64,0x56,0x28,0x66, 0x2d,0x0e,0x41,0xcf,0x2a,0x15,0x04,0x4a,0xcf,0x41,0x25,0x33,0xa8,0x64,0x06,0x95, 0xcc,0xa0,0x92,0x59,0x01,0x00,0x00,0x00,0x53,0x82,0x26,0x0c,0x04,0x00,0x00,0x00, 0x22,0x00,0x00,0x00,0x13,0x04,0x41,0x2c,0x10,0x00,0x00,0x00,0x05,0x00,0x00,0x00, 0x04,0xc6,0x08,0x40,0x10,0x04,0xe1,0x70,0x18,0x23,0x00,0x41,0x10,0x84,0xc3,0x60, -0x04,0x00,0x00,0x00,0x93,0x0c,0xce,0x43,0x4c,0x31,0x3c,0x8e,0x34,0xc9,0x30,0x41, -0xc2,0x14,0x03,0x34,0x51,0x93,0x0c,0x4d,0x44,0x4c,0x31,0x44,0x8d,0x35,0x56,0x01, -0x04,0xc3,0x55,0x21,0x16,0x0e,0x04,0x00,0x0f,0x00,0x00,0x00,0x46,0x41,0x08,0xcc, -0x73,0x9b,0x05,0x21,0x30,0xcf,0x6e,0x18,0x84,0x00,0x2c,0x8b,0x35,0x04,0x80,0x39, -0x04,0x81,0x5d,0x20,0x80,0x0f,0x0c,0x43,0xe4,0xd3,0x36,0x81,0x04,0x3e,0x30,0x0c, -0x91,0x4f,0x5b,0x05,0x12,0xf8,0xc0,0x30,0x44,0x7e,0x7d,0x00,0x05,0xd1,0x4c,0x11, -0x66,0x12,0x83,0xc0,0x3c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00, -0x2a,0x00,0x00,0x00,0x13,0x04,0x43,0x2c,0x10,0x00,0x00,0x00,0x08,0x00,0x00,0x00, +0x04,0x00,0x00,0x00,0xc3,0x0d,0xce,0x43,0x4c,0x37,0x3c,0x8e,0x34,0xdc,0x30,0x41, +0xc2,0x74,0x03,0x34,0x51,0xc3,0x0d,0x4d,0x44,0x4c,0x37,0x44,0x8d,0x35,0x56,0x01, +0x04,0xc3,0x55,0x21,0x16,0x0e,0x04,0x00,0x0f,0x00,0x00,0x00,0xd6,0x10,0x00,0xe6, +0x10,0x04,0x76,0x81,0x00,0x3e,0x30,0x0c,0x91,0x4f,0x1b,0x05,0x21,0x30,0x8f,0x6d, +0x13,0x48,0xe0,0x03,0xc3,0x10,0xf9,0xb4,0x55,0x20,0x81,0x0f,0x0c,0x43,0xe4,0xd7, +0x66,0x41,0x08,0xcc,0xa3,0x1f,0x40,0x41,0x34,0x53,0x84,0x99,0xc4,0x20,0x30,0x8f, +0x61,0x10,0x02,0xb0,0x2c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00, +0x27,0x00,0x00,0x00,0x13,0x04,0x43,0x2c,0x10,0x00,0x00,0x00,0x08,0x00,0x00,0x00, 0x24,0x8a,0xa0,0x0c,0x46,0x00,0x4a,0x80,0xc2,0x1c,0x84,0x55,0x55,0xd6,0x1c,0x84, 0x45,0x51,0x16,0x81,0x19,0x80,0x11,0x80,0x31,0x02,0x10,0x04,0x41,0xfc,0x03,0x00, -0x63,0x08,0x0d,0x34,0xc9,0x70,0x55,0xc2,0x2c,0x43,0x20,0x60,0x73,0x0c,0xd3,0x15, +0x63,0x08,0x0d,0x34,0xdc,0x70,0x55,0xc2,0x2c,0x43,0x20,0x60,0x73,0x0c,0xd3,0x15, 0x8d,0x21,0x34,0xd1,0x18,0x42,0xf3,0x8c,0x55,0x00,0x81,0xa0,0x6d,0x73,0x0c,0x19, -0xe7,0x60,0x87,0x52,0x38,0x10,0x00,0x00,0x13,0x00,0x00,0x00,0x17,0x60,0x20,0xc5, -0x74,0x10,0x8d,0x65,0x14,0x13,0xf3,0xd4,0xb4,0x6d,0x14,0x13,0xf3,0xd4,0xb8,0x69, -0x14,0x13,0xf3,0xd4,0xb6,0x75,0x14,0x13,0xf3,0xd4,0xba,0x35,0x0c,0x13,0xf3,0x9c, -0x80,0xe4,0x36,0x48,0x81,0x10,0xc3,0x4a,0x4c,0x54,0xd4,0x6c,0x8b,0x23,0x28,0x76, -0x41,0x4c,0xcc,0xa3,0x1b,0x07,0x21,0x00,0xcb,0x72,0x00,0x05,0xd1,0x4c,0x11,0x66, -0x18,0x83,0xc0,0x3c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00, -0x81,0x00,0x00,0x00,0x13,0x04,0x4d,0x2c,0x10,0x00,0x00,0x00,0x04,0x00,0x00,0x00, -0x24,0xca,0x60,0x04,0xa0,0x04,0x8a,0x80,0xc2,0x0c,0x00,0x91,0x11,0x00,0x00,0x00, -0x63,0x08,0x4d,0x64,0x16,0xc1,0x49,0x86,0xab,0x22,0x66,0x19,0x02,0x01,0x1b,0x43, -0x70,0xa2,0x59,0x82,0x61,0x0c,0xe1,0x89,0x66,0x09,0x86,0x81,0x0a,0x20,0x0b,0x34, -0x61,0x8e,0x81,0xda,0xa2,0x31,0x84,0x46,0xb2,0x8e,0xe0,0x24,0x83,0x57,0x11,0xb3, -0x0c,0x44,0xf1,0x8d,0x21,0x38,0xd2,0x2c,0x81,0x31,0x86,0xf0,0x48,0xb3,0x04,0xc6, -0x40,0x05,0x00,0x06,0x44,0x18,0x14,0x73,0x0c,0x9c,0x18,0x48,0x63,0x08,0xcd,0x64, -0x64,0x40,0x70,0x92,0xa1,0x0c,0x2a,0x62,0x96,0xe1,0x40,0xcc,0x60,0x0c,0xc1,0x99, -0x66,0x09,0x92,0x31,0x84,0x67,0x9a,0x25,0x48,0x06,0x2a,0x80,0x33,0x38,0xd0,0x00, -0x99,0x63,0x18,0x83,0x34,0x98,0xc6,0x10,0x1a,0xc8,0xd6,0x80,0xe0,0x24,0x03,0x1b, -0x54,0xc4,0x2c,0x83,0xb2,0xb4,0xc1,0x18,0x82,0x03,0xcd,0x12,0x30,0x63,0x08,0x0f, -0x34,0x4b,0xc0,0x0c,0x54,0x00,0x6e,0xa0,0xbc,0xc1,0x32,0xc7,0xa0,0x06,0x70,0x00, -0x61,0x1c,0x84,0x03,0x01,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x76,0x52,0x4c,0xcc, -0x73,0xd3,0x24,0x05,0x64,0xec,0xcd,0x8d,0xcc,0xe5,0x87,0x46,0xc6,0x50,0x8a,0x89, -0x79,0xee,0xdb,0x54,0x8a,0x89,0x79,0xee,0xdd,0x1a,0x88,0x89,0x79,0x68,0x73,0x20, -0x26,0xe6,0xa9,0xed,0x81,0x98,0x98,0xc7,0x36,0x0b,0x62,0x62,0x9e,0xdb,0x32,0x88, -0x89,0x79,0x72,0xd3,0x20,0x26,0xe6,0xd9,0x8d,0x83,0x98,0x98,0xa7,0xb7,0x95,0x62, -0x62,0x9e,0xbb,0x27,0x2d,0x20,0x63,0x6f,0x6e,0x64,0x2e,0x3a,0x34,0x35,0x56,0x62, -0x08,0x4e,0x53,0xd9,0xba,0xb5,0x14,0x02,0xf3,0xe0,0xf5,0x25,0x2c,0x82,0xd3,0x0c, -0xbe,0xe0,0x34,0xd3,0x8d,0x9b,0x88,0x21,0x38,0xcd,0x60,0xd7,0x24,0x01,0x63,0xec, -0xcd,0x8d,0xcc,0x45,0x87,0x44,0x80,0x8c,0xbd,0xb9,0x91,0xb9,0xfc,0xc4,0xd0,0x90, -0x02,0x8c,0xb1,0x37,0x37,0x32,0x97,0x1f,0x73,0x29,0x26,0xe6,0xc1,0x71,0x7b,0x29, -0x26,0xe6,0xc1,0x77,0xfb,0x28,0x04,0xe6,0xa9,0x6f,0x52,0x01,0x32,0xf6,0xe6,0x46, -0xe6,0xa2,0x13,0x73,0x63,0x18,0x83,0xc0,0x3c,0xb6,0x41,0x08,0x4e,0x33,0x58,0x47, -0x31,0x31,0x4f,0x5d,0x1f,0xc3,0x22,0x38,0xcd,0xe0,0x0b,0x4e,0x33,0xe1,0xbc,0xa5, -0x18,0x82,0xd3,0x0c,0x77,0x6e,0x20,0xc5,0xc4,0x3c,0xb5,0x4e,0x3a,0x40,0xc6,0xde, -0xdc,0xc8,0x5c,0x7e,0x64,0x70,0x2c,0xa4,0x98,0x98,0xa7,0xee,0xed,0x82,0x10,0x9c, -0xa6,0xba,0x81,0x44,0x70,0x9a,0xc1,0x17,0x9c,0x66,0x32,0x93,0x42,0x60,0x1e,0x7b, -0xb7,0x98,0x62,0x62,0x9e,0xbc,0x36,0x16,0x43,0x70,0x9a,0x0a,0xa7,0x6d,0xa4,0x98, -0x98,0xc7,0xbe,0x8d,0xa4,0x98,0x98,0xc7,0xce,0x0d,0xc6,0x10,0x9c,0x66,0xc0,0x7b, -0x12,0x02,0x32,0xf6,0xe6,0x46,0xe6,0xa2,0x33,0x13,0x73,0x06,0x8b,0xe0,0x34,0x83, -0x2f,0x38,0xcd,0x64,0xd3,0xe6,0x61,0x08,0x4e,0x53,0xd5,0xf6,0x01,0x14,0x44,0x33, -0x45,0x18,0x00,0x00,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,0x4a,0x00,0x00,0x00, -0x13,0x04,0x41,0x2c,0x10,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x24,0xca,0x60,0x04, -0xa0,0x04,0x8a,0x80,0xc2,0x0c,0x00,0xb9,0x61,0x0c,0x04,0x10,0x1e,0xe1,0x19,0xc6, -0x40,0x02,0xe1,0x11,0x1e,0x00,0x00,0x00,0x63,0x08,0xcd,0x63,0x15,0xc1,0x31,0x84, -0x06,0xb2,0x8b,0xe0,0x18,0x42,0x13,0x59,0x46,0x70,0x0c,0xa1,0x71,0x6c,0x23,0x38, -0x16,0x02,0x04,0xc7,0x64,0x61,0x1a,0x37,0x16,0x01,0x04,0x48,0x35,0xc7,0x20,0x79, -0xcf,0x58,0x04,0x10,0x20,0xd5,0x1c,0xc3,0x07,0x06,0xd0,0x58,0x04,0x10,0x20,0xd5, -0x1c,0x43,0x18,0x88,0x41,0x34,0x16,0x01,0x04,0x48,0x35,0xc7,0x30,0x06,0x64,0xe0, -0x98,0x47,0xd0,0xc0,0x80,0xa0,0x89,0x01,0x41,0x23,0x03,0x82,0x63,0x21,0x40,0x70, -0x50,0x66,0x70,0x06,0x68,0x90,0x06,0x58,0x06,0xe1,0x40,0x00,0x25,0x00,0x00,0x00, -0x56,0x52,0x4c,0xcc,0x73,0xd3,0x56,0x41,0x4c,0xcc,0x53,0xdb,0x05,0x31,0x31,0xcf, -0x6d,0x19,0xc4,0xc4,0x3c,0xba,0x6d,0x10,0x13,0xf3,0xf4,0xd6,0x41,0x08,0xc0,0xb2, -0x18,0x46,0x21,0x38,0x4d,0x85,0x9b,0x46,0x21,0x38,0x4d,0xb5,0x9b,0x8a,0x21,0x00, -0xcb,0x82,0xdf,0x66,0x62,0x08,0x4e,0x53,0xdd,0xb7,0x9d,0x18,0x82,0xd3,0x54,0xb7, -0x6e,0x28,0x86,0xe0,0x34,0xd5,0xdd,0xdb,0x47,0x31,0x31,0x4f,0x9d,0x9b,0x87,0x21, -0x00,0xcb,0x52,0xdf,0x06,0x62,0x08,0xc0,0xb2,0xd4,0xbc,0x59,0x10,0x82,0xd3,0x54, -0x96,0x62,0x08,0x4e,0x53,0xe1,0xb6,0x85,0x14,0x13,0xf3,0xd8,0xb4,0x8d,0x14,0x13, -0xf3,0xd8,0xb9,0x89,0x18,0x02,0xb0,0x2c,0xf6,0x6d,0x24,0x86,0x00,0x2c,0x8b,0xcd, -0x1b,0x87,0x21,0x38,0x4d,0x55,0xd3,0xd6,0x30,0x54,0xc0,0x72,0x00,0x05,0xd1,0x4c, -0x11,0x06,0x00,0x00,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,0x19,0x00,0x00,0x00, -0x13,0x04,0x41,0x2c,0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x24,0x4a,0x60,0x04, -0x80,0xc2,0x0c,0x00,0x00,0x00,0x00,0x00,0x63,0x08,0xcd,0x33,0x16,0x01,0x04,0x48, -0x34,0xc7,0x00,0x49,0xcf,0x58,0x04,0x10,0x28,0xd1,0x1c,0xc3,0x44,0x39,0x58,0x85, -0x03,0x01,0x00,0x00,0x0a,0x00,0x00,0x00,0x16,0x41,0x4c,0xcc,0x63,0xdb,0x04,0x31, -0x31,0x4f,0x6e,0x0d,0x43,0x05,0x2c,0x07,0x50,0x10,0xcd,0x14,0x61,0x56,0x41,0x4c, -0xcc,0xd3,0x1b,0x45,0x21,0x00,0xcb,0xb2,0x9b,0x04,0x21,0x00,0xcb,0x02,0x00,0x00, -0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,0x1b,0x00,0x00,0x00,0x13,0x04,0x41,0x2c, -0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x24,0xca,0x60,0x04,0xa0,0x04,0x8a,0x80, -0xc2,0x0c,0x00,0x00,0x63,0x08,0xcd,0x33,0x16,0x01,0x04,0xca,0x34,0xc7,0x20,0x51, -0xcf,0x1c,0x43,0x45,0x41,0x73,0x0c,0x16,0x15,0xcd,0x31,0x5c,0x94,0x83,0x58,0x38, -0x10,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x86,0x51,0x4c,0xcc,0x53,0xe7,0x76,0x51, -0x4c,0xcc,0x53,0xdb,0x36,0x41,0x4c,0xcc,0x63,0x5b,0x05,0x31,0x31,0x8f,0x6e,0x0d, -0x43,0x05,0x2c,0x66,0x41,0x4c,0xcc,0xd3,0x1f,0x40,0x41,0x34,0x53,0x84,0x19,0x05, -0x21,0x00,0xcb,0x02,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,0x2f,0x00,0x00,0x00, -0x13,0x04,0x45,0x2c,0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x24,0xca,0xa0,0x04, -0x46,0x00,0x8a,0x80,0xc0,0x08,0x00,0x00,0x63,0x08,0x0d,0x34,0xc9,0x30,0x49,0xc4, -0x2c,0x03,0x11,0x50,0x63,0x08,0xcd,0x33,0xc9,0x50,0x49,0xc4,0x2c,0x03,0x21,0x58, -0x63,0x08,0x4d,0x34,0xc9,0x70,0x49,0xc4,0x2c,0x03,0x31,0x60,0x63,0x08,0x8d,0x33, -0xc9,0x90,0x49,0x84,0x69,0x22,0x70,0xc3,0x27,0x1c,0x08,0x00,0x1a,0x00,0x00,0x00, -0x96,0x51,0x4c,0xcc,0x53,0xdf,0x66,0x41,0x08,0xcc,0x83,0xdb,0x04,0x31,0x31,0x4f, -0x6d,0x15,0xc4,0xc4,0x3c,0xb7,0x61,0x10,0x02,0xf3,0xf0,0x47,0x20,0xb9,0x0d,0x52, -0x20,0xc4,0xb0,0x12,0x13,0x15,0x35,0xdb,0xe2,0x08,0x8a,0x5d,0x10,0x13,0xf3,0xec, -0x37,0x90,0x2c,0x4e,0xf4,0x47,0x87,0x54,0xd7,0x17,0x70,0x2c,0x4e,0xf4,0x47,0x87, -0x74,0x02,0xc8,0xe2,0x44,0x7f,0x74,0x48,0xb9,0x69,0x14,0x02,0xf3,0xd4,0xb8,0x6d, -0x18,0x11,0x31,0x55,0xc0,0x62,0x0d,0x43,0x05,0x2c,0x07,0x50,0x10,0xcd,0x14,0x61, -0x46,0x31,0x08,0xcc,0x03,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x71,0x20,0x00,0x00, -0x12,0x00,0x00,0x00,0x66,0x40,0x54,0x82,0x23,0x59,0xc2,0x20,0x09,0x92,0x1d,0x18, -0x4f,0x84,0x34,0x53,0x61,0x03,0xc4,0xe3,0x58,0x85,0x05,0x14,0xbe,0x34,0x45,0xb5, -0x21,0x10,0x82,0x23,0x15,0x46,0x30,0x2c,0xc8,0x64,0x02,0x06,0xf0,0x3c,0x91,0x73, -0x19,0x00,0xe1,0x4b,0x53,0x64,0x0a,0x84,0x84,0x34,0x85,0x31,0x10,0x0a,0xb2,0x3c, -0x56,0x30,0x08,0xcc,0x63,0x0b,0x44,0x25,0x21,0x0d,0x00,0x00,0x00,0x00,0x00,0x00}; +0xe7,0x60,0x87,0x52,0x38,0x10,0x00,0x00,0x10,0x00,0x00,0x00,0x27,0x50,0x20,0x05, +0xd1,0x0c,0x17,0x60,0x20,0xc5,0x74,0x10,0x8d,0x65,0x14,0x13,0xf3,0xd4,0xb4,0x6d, +0x14,0x13,0xf3,0xd4,0xb8,0x69,0x14,0x13,0xf3,0xd4,0xb6,0x75,0x14,0x13,0xf3,0xd4, +0xba,0x35,0x0c,0x13,0xf3,0xd8,0x05,0x31,0x31,0x8f,0x6e,0x1c,0x84,0x00,0x2c,0xcb, +0x01,0x14,0x44,0x33,0x45,0x98,0x61,0x0c,0x02,0xf3,0x00,0x00,0x00,0x00,0x00,0x00, +0x61,0x20,0x00,0x00,0x81,0x00,0x00,0x00,0x13,0x04,0x4d,0x2c,0x10,0x00,0x00,0x00, +0x04,0x00,0x00,0x00,0x24,0xca,0x60,0x04,0xa0,0x04,0x8a,0x80,0xc2,0x0c,0x00,0x91, +0x11,0x00,0x00,0x00,0x63,0x08,0x4d,0x64,0x16,0xc1,0xe1,0x86,0xab,0x22,0x66,0x19, +0x02,0x01,0x1b,0x43,0x70,0xa2,0x59,0x82,0x61,0x0c,0xe1,0x89,0x66,0x09,0x86,0x81, +0x0a,0x20,0x0b,0x34,0x61,0x8e,0x81,0xda,0xa2,0x31,0x84,0x46,0xb2,0x8e,0xe0,0x70, +0x83,0x57,0x11,0xb3,0x0c,0x44,0xf1,0x8d,0x21,0x38,0xd2,0x2c,0x81,0x31,0x86,0xf0, +0x48,0xb3,0x04,0xc6,0x40,0x05,0x00,0x06,0x44,0x18,0x14,0x73,0x0c,0x9c,0x18,0x48, +0x63,0x08,0xcd,0x64,0x64,0x40,0x70,0xb8,0xa1,0x0c,0x2a,0x62,0x96,0xe1,0x40,0xcc, +0x60,0x0c,0xc1,0x99,0x66,0x09,0x92,0x31,0x84,0x67,0x9a,0x25,0x48,0x06,0x2a,0x80, +0x33,0x38,0xd0,0x00,0x99,0x63,0x18,0x83,0x34,0x98,0xc6,0x10,0x1a,0xc8,0xd6,0x80, +0xe0,0x70,0x03,0x1b,0x54,0xc4,0x2c,0x83,0xb2,0xb4,0xc1,0x18,0x82,0x03,0xcd,0x12, +0x30,0x63,0x08,0x0f,0x34,0x4b,0xc0,0x0c,0x54,0x00,0x6e,0xa0,0xbc,0xc1,0x32,0xc7, +0xa0,0x06,0x70,0x00,0x61,0x1c,0x84,0x03,0x01,0x00,0x00,0x00,0x4e,0x00,0x00,0x00, +0x76,0x52,0x4c,0xcc,0x73,0xd3,0x24,0x05,0x64,0xec,0xcd,0x8d,0xcc,0xe5,0x87,0x46, +0xc6,0x50,0x8a,0x89,0x79,0xee,0xdb,0x54,0x8a,0x89,0x79,0xee,0xdd,0x1a,0x88,0x89, +0x79,0x68,0x73,0x20,0x26,0xe6,0xa9,0xed,0x81,0x98,0x98,0xc7,0x36,0x0b,0x62,0x62, +0x9e,0xdb,0x32,0x88,0x89,0x79,0x72,0xd3,0x20,0x26,0xe6,0xd9,0x8d,0x83,0x98,0x98, +0xa7,0xb7,0x95,0x62,0x62,0x9e,0xbb,0x27,0x2d,0x20,0x63,0x6f,0x6e,0x64,0x2e,0x3a, +0x34,0x35,0x56,0x62,0x08,0x4e,0x53,0xd9,0xba,0xb5,0x14,0x02,0xf3,0xe0,0xf5,0x25, +0x2c,0x82,0xd3,0x0c,0xbe,0xe0,0x34,0xd3,0x8d,0x9b,0x88,0x21,0x38,0xcd,0x60,0xd7, +0x24,0x01,0x63,0xec,0xcd,0x8d,0xcc,0x45,0x87,0x44,0x80,0x8c,0xbd,0xb9,0x91,0xb9, +0xfc,0xc4,0xd0,0x90,0x02,0x8c,0xb1,0x37,0x37,0x32,0x97,0x1f,0x73,0x29,0x26,0xe6, +0xc1,0x71,0x7b,0x29,0x26,0xe6,0xc1,0x77,0xfb,0x28,0x04,0xe6,0xa9,0x6f,0x52,0x01, +0x32,0xf6,0xe6,0x46,0xe6,0xa2,0x13,0x73,0x63,0x18,0x83,0xc0,0x3c,0xb6,0x41,0x08, +0x4e,0x33,0x58,0x47,0x31,0x31,0x4f,0x5d,0x1f,0xc3,0x22,0x38,0xcd,0xe0,0x0b,0x4e, +0x33,0xe1,0xbc,0xa5,0x18,0x82,0xd3,0x0c,0x77,0x6e,0x20,0xc5,0xc4,0x3c,0xb5,0x4e, +0x3a,0x40,0xc6,0xde,0xdc,0xc8,0x5c,0x7e,0x64,0x70,0x2c,0xa4,0x98,0x98,0xa7,0xee, +0x6f,0x20,0x11,0x9c,0x66,0xf0,0x05,0xa7,0x99,0xec,0x82,0x10,0x9c,0xa6,0x32,0x93, +0x42,0x60,0x1e,0x7b,0xb7,0x98,0x62,0x62,0x9e,0xbc,0x36,0x16,0x43,0x70,0x9a,0x0a, +0xa7,0x6d,0xa4,0x98,0x98,0xc7,0xbe,0x8d,0xa4,0x98,0x98,0xc7,0xce,0x0d,0xc6,0x10, +0x9c,0x66,0xc0,0x7b,0x12,0x02,0x32,0xf6,0xe6,0x46,0xe6,0xa2,0x33,0x13,0x73,0x06, +0x8b,0xe0,0x34,0x83,0x2f,0x38,0xcd,0x64,0xd3,0x07,0x50,0x10,0xcd,0x14,0x61,0xe6, +0x61,0x08,0x4e,0x53,0xd5,0x36,0x00,0x00,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00, +0x4a,0x00,0x00,0x00,0x13,0x04,0x41,0x2c,0x10,0x00,0x00,0x00,0x07,0x00,0x00,0x00, +0x24,0xca,0x60,0x04,0xa0,0x04,0x8a,0x80,0xc2,0x0c,0x00,0xb9,0x61,0x0c,0x04,0x10, +0x1e,0xe1,0x19,0xc6,0x40,0x02,0xe1,0x11,0x1e,0x00,0x00,0x00,0x63,0x08,0xcd,0x63, +0x15,0xc1,0x31,0x84,0x06,0xb2,0x8b,0xe0,0x18,0x42,0x13,0x59,0x46,0x70,0x0c,0xa1, +0x71,0x6c,0x23,0x38,0x16,0x02,0x04,0xc7,0x64,0x61,0x1a,0x37,0x16,0x01,0x04,0x48, +0x35,0xc7,0x20,0x79,0xcf,0x58,0x04,0x10,0x20,0xd5,0x1c,0xc3,0x07,0x06,0xd0,0x58, +0x04,0x10,0x20,0xd5,0x1c,0x43,0x18,0x88,0x41,0x34,0x16,0x01,0x04,0x48,0x35,0xc7, +0x30,0x06,0x64,0xe0,0x98,0x47,0xd0,0xc0,0x80,0xa0,0x89,0x01,0x41,0x23,0x03,0x82, +0x63,0x21,0x40,0x70,0x50,0x66,0x70,0x06,0x68,0x90,0x06,0x58,0x06,0xe1,0x40,0x00, +0x25,0x00,0x00,0x00,0x56,0x52,0x4c,0xcc,0x73,0xd3,0x56,0x41,0x4c,0xcc,0x53,0xdb, +0x05,0x31,0x31,0xcf,0x6d,0x19,0xc4,0xc4,0x3c,0xba,0x6d,0x10,0x13,0xf3,0xf4,0xd6, +0x41,0x08,0xc0,0xb2,0x18,0x46,0x21,0x38,0x4d,0x85,0x9b,0x46,0x21,0x38,0x4d,0xb5, +0x9b,0x8a,0x21,0x00,0xcb,0x82,0xdf,0x66,0x62,0x08,0x4e,0x53,0xdd,0xb7,0x9d,0x18, +0x82,0xd3,0x54,0xb7,0x6e,0x28,0x86,0xe0,0x34,0xd5,0xdd,0xdb,0x47,0x31,0x31,0x4f, +0x9d,0x9b,0x87,0x21,0x00,0xcb,0x52,0xdf,0x06,0x62,0x08,0xc0,0xb2,0xd4,0xbc,0x59, +0x10,0x82,0xd3,0x54,0x96,0x62,0x08,0x4e,0x53,0xe1,0xb6,0x85,0x14,0x13,0xf3,0xd8, +0xb4,0x8d,0x14,0x13,0xf3,0xd8,0xb9,0x89,0x18,0x02,0xb0,0x2c,0xf6,0x6d,0x24,0x86, +0x00,0x2c,0x8b,0xcd,0x1b,0x87,0x21,0x38,0x4d,0x55,0xd3,0xd6,0x30,0x54,0xc0,0x72, +0x00,0x05,0xd1,0x4c,0x11,0x06,0x00,0x00,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00, +0x19,0x00,0x00,0x00,0x13,0x04,0x41,0x2c,0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00, +0x24,0x4a,0x60,0x04,0x80,0xc2,0x0c,0x00,0x00,0x00,0x00,0x00,0x63,0x08,0xcd,0x33, +0x16,0x01,0x04,0x48,0x34,0xc7,0x00,0x49,0xcf,0x58,0x04,0x10,0x28,0xd1,0x1c,0xc3, +0x44,0x39,0x58,0x85,0x03,0x01,0x00,0x00,0x0a,0x00,0x00,0x00,0x26,0x41,0x08,0xc0, +0xb2,0x18,0x45,0x21,0x00,0xcb,0xb2,0x5b,0x04,0x31,0x31,0x8f,0x6d,0x13,0xc4,0xc4, +0x3c,0xb9,0x35,0x0c,0x15,0xb0,0x58,0x05,0x31,0x31,0x4f,0x7f,0x00,0x05,0xd1,0x4c, +0x11,0x06,0x00,0x00,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,0x1b,0x00,0x00,0x00, +0x13,0x04,0x41,0x2c,0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x24,0xca,0x60,0x04, +0xa0,0x04,0x8a,0x80,0xc2,0x0c,0x00,0x00,0x63,0x08,0xcd,0x33,0x16,0x01,0x04,0xca, +0x34,0xc7,0x20,0x51,0xcf,0x1c,0x43,0x45,0x41,0x73,0x0c,0x16,0x15,0xcd,0x31,0x5c, +0x94,0x83,0x58,0x38,0x10,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x76,0x51,0x4c,0xcc, +0x53,0xdb,0x86,0x51,0x4c,0xcc,0x53,0xe7,0x36,0x41,0x4c,0xcc,0x63,0x5b,0x05,0x31, +0x31,0x8f,0x6e,0x16,0xc4,0xc4,0x3c,0xbd,0x51,0x10,0x02,0xb0,0x2c,0xd6,0x30,0x54, +0xc0,0x72,0x00,0x05,0xd1,0x4c,0x11,0x06,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00, +0x2c,0x00,0x00,0x00,0x13,0x04,0x45,0x2c,0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00, +0x24,0xca,0xa0,0x04,0x46,0x00,0x8a,0x80,0xc0,0x08,0x00,0x00,0x63,0x08,0x0d,0x34, +0xdc,0x30,0x49,0xc4,0x2c,0x03,0x11,0x50,0x63,0x08,0xcd,0x33,0xdc,0x50,0x49,0xc4, +0x2c,0x03,0x21,0x58,0x63,0x08,0x4d,0x34,0xdc,0x70,0x49,0xc4,0x2c,0x03,0x31,0x60, +0x63,0x08,0x8d,0x33,0xdc,0x90,0x49,0x84,0x69,0x22,0x70,0xc3,0x27,0x1c,0x08,0x00, +0x17,0x00,0x00,0x00,0x96,0x51,0x4c,0xcc,0x53,0xdf,0x66,0x41,0x08,0xcc,0x83,0xdb, +0x04,0x31,0x31,0x4f,0x6d,0x15,0xc4,0xc4,0x3c,0xb7,0x61,0x10,0x02,0xf3,0xf0,0x76, +0x41,0x4c,0xcc,0xb3,0x1f,0x81,0x11,0x11,0x13,0x15,0x35,0x37,0x90,0x2c,0x4e,0xf4, +0x47,0x87,0x54,0xd7,0x17,0x70,0x2c,0x4e,0xf4,0x47,0x87,0x74,0x02,0xc8,0xe2,0x44, +0x7f,0x74,0x48,0xb9,0x69,0x14,0x02,0xf3,0xd4,0xb8,0x6d,0x18,0x11,0x31,0x55,0xc0, +0x62,0x0d,0x43,0x05,0x2c,0x07,0x50,0x10,0xcd,0x14,0x61,0x46,0x31,0x08,0xcc,0x03, +0x00,0x00,0x00,0x00,0x71,0x20,0x00,0x00,0x12,0x00,0x00,0x00,0x66,0x40,0x54,0x82, +0x23,0x19,0xc3,0xa0,0x20,0x8b,0x1d,0x18,0x4f,0x84,0x34,0x53,0x61,0x03,0xc4,0xe3, +0x58,0x85,0x05,0x14,0xbe,0x34,0x45,0xb5,0x21,0x10,0x82,0x23,0x15,0x46,0x30,0x2c, +0xc8,0x64,0x02,0x06,0xf0,0x3c,0x91,0x73,0x19,0x00,0xe1,0x4b,0x53,0x64,0x0a,0x84, +0x84,0x34,0x85,0x25,0x0c,0x92,0x20,0x59,0xc1,0x20,0x30,0x8f,0x2d,0x10,0x95,0x84, +0x34,0x00,0x00,0x00,0x00,0x00,0x00,0x00}; diff --git a/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp b/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp index e64bfb1c6c..3a2f2878a3 100644 --- a/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp +++ b/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp @@ -46,6 +46,7 @@ #include "tgsi/tgsi_dump.h" #include "util/u_memory.h" +#include "util/u_math.h" #include <llvm/Module.h> #include <llvm/CallingConv.h> @@ -157,8 +158,8 @@ void gallivm_cpu_jit_compile(struct gallivm_cpu_engine *cpu, struct gallivm_prog llvm::ExistingModuleProvider *mp = new llvm::ExistingModuleProvider(mod); llvm::ExecutionEngine *ee = cpu->engine; assert(ee); - /*FIXME : remove */ - ee->DisableLazyCompilation(); + /*FIXME : why was this disabled ? we need it for pow/sqrt/... */ + ee->DisableLazyCompilation(false); ee->addModuleProvider(mp); llvm::Function *func = func_for_shader(prog); @@ -201,7 +202,6 @@ int gallivm_cpu_vs_exec(struct gallivm_prog *prog, unsigned int i, j; unsigned slot; vertex_shader_runner runner = reinterpret_cast<vertex_shader_runner>(prog->function); - assert(runner); for (i = 0; i < count; i += MAX_TGSI_VERTICES) { diff --git a/src/gallium/auxiliary/gallivm/instructions.cpp b/src/gallium/auxiliary/gallivm/instructions.cpp index a82dc30306..599975d5ad 100644 --- a/src/gallium/auxiliary/gallivm/instructions.cpp +++ b/src/gallium/auxiliary/gallivm/instructions.cpp @@ -83,6 +83,7 @@ Instructions::Instructions(llvm::Module *mod, llvm::Function *func, llvm::BasicB m_llvmPow = 0; m_llvmFloor = 0; m_llvmFlog = 0; + m_llvmFexp = 0; m_llvmLit = 0; m_fmtPtr = 0; @@ -92,194 +93,271 @@ Instructions::Instructions(llvm::Module *mod, llvm::Function *func, llvm::BasicB m_mod = ParseBitcodeFile(buffer); } +llvm::BasicBlock * Instructions::currentBlock() const +{ + return m_builder.GetInsertBlock(); +} + +llvm::Value * Instructions::abs(llvm::Value *in) +{ + std::vector<llvm::Value*> vec = extractVector(in); + Value *xabs = callFAbs(vec[0]); + Value *yabs = callFAbs(vec[1]); + Value *zabs = callFAbs(vec[2]); + Value *wabs = callFAbs(vec[3]); + return vectorFromVals(xabs, yabs, zabs, wabs); +} + llvm::Value * Instructions::add(llvm::Value *in1, llvm::Value *in2) { return m_builder.CreateAdd(in1, in2, name("add")); } -llvm::Value * Instructions::madd(llvm::Value *in1, llvm::Value *in2, - llvm::Value *in3) +llvm::Value * Instructions::arl(llvm::Value *in) { - Value *mulRes = mul(in1, in2); - return add(mulRes, in3); + return floor(in); } - -llvm::Value * Instructions::mul(llvm::Value *in1, llvm::Value *in2) + +void Instructions::beginLoop() { - return m_builder.CreateMul(in1, in2, name("mul")); + BasicBlock *begin = BasicBlock::Create(name("loop"), m_func,0); + BasicBlock *end = BasicBlock::Create(name("endloop"), m_func,0); + + m_builder.CreateBr(begin); + Loop loop; + loop.begin = begin; + loop.end = end; + m_builder.SetInsertPoint(begin); + m_loopStack.push(loop); } -const char * Instructions::name(const char *prefix) +void Instructions::bgnSub(unsigned label) { - ++m_idx; - snprintf(m_name, 32, "%s%d", prefix, m_idx); - return m_name; + llvm::Function *func = findFunction(label); + + Function::arg_iterator args = func->arg_begin(); + Value *ptr_INPUT = args++; + ptr_INPUT->setName("INPUT"); + m_storage->pushArguments(ptr_INPUT); + + llvm::BasicBlock *entry = BasicBlock::Create("entry", func, 0); + + m_func = func; + m_builder.SetInsertPoint(entry); } -llvm::Value * Instructions::dp3(llvm::Value *in1, llvm::Value *in2) +void Instructions::brk() { - Value *mulRes = mul(in1, in2); - Value *x = m_builder.CreateExtractElement(mulRes, - m_storage->constantInt(0), - name("extractx")); - Value *y = m_builder.CreateExtractElement(mulRes, - m_storage->constantInt(1), - name("extracty")); - Value *z = m_builder.CreateExtractElement(mulRes, - m_storage->constantInt(2), - name("extractz")); - Value *xy = m_builder.CreateAdd(x, y,name("xy")); - Value *dot3 = m_builder.CreateAdd(xy, z, name("dot3")); - return vectorFromVals(dot3, dot3, dot3, dot3); + assert(!m_loopStack.empty()); + BasicBlock *unr = BasicBlock::Create(name("unreachable"), m_func,0); + m_builder.CreateBr(m_loopStack.top().end); + m_builder.SetInsertPoint(unr); } -llvm::Value *Instructions::callFSqrt(llvm::Value *val) +void Instructions::cal(int label, llvm::Value *input) { - if (!m_llvmFSqrt) { - // predeclare the intrinsic - std::vector<const Type*> fsqrtArgs; - fsqrtArgs.push_back(Type::FloatTy); - PAListPtr fsqrtPal; - FunctionType* fsqrtType = FunctionType::get( - /*Result=*/Type::FloatTy, - /*Params=*/fsqrtArgs, - /*isVarArg=*/false); - m_llvmFSqrt = Function::Create( - /*Type=*/fsqrtType, - /*Linkage=*/GlobalValue::ExternalLinkage, - /*Name=*/"llvm.sqrt.f32", m_mod); - m_llvmFSqrt->setCallingConv(CallingConv::C); - m_llvmFSqrt->setParamAttrs(fsqrtPal); - } - CallInst *call = m_builder.CreateCall(m_llvmFSqrt, val, - name("sqrt")); - call->setCallingConv(CallingConv::C); - call->setTailCall(false); - return call; + std::vector<Value*> params; + params.push_back(input); + llvm::Function *func = findFunction(label); + + m_builder.CreateCall(func, params.begin(), params.end()); } -llvm::Value * Instructions::rsq(llvm::Value *in1) +llvm::Value * Instructions::ceil(llvm::Value *in) { - Value *x = m_builder.CreateExtractElement(in1, - m_storage->constantInt(0), - name("extractx")); - Value *abs = callFAbs(x); - Value *sqrt = callFSqrt(abs); - - Value *rsqrt = m_builder.CreateFDiv(ConstantFP::get(APFloat(1.f)), - sqrt, - name("rsqrt")); - return vectorFromVals(rsqrt, rsqrt, rsqrt, rsqrt); + std::vector<llvm::Value*> vec = extractVector(in); + return vectorFromVals(callCeil(vec[0]), callCeil(vec[1]), + callCeil(vec[2]), callCeil(vec[3])); } -llvm::Value * Instructions::vectorFromVals(llvm::Value *x, llvm::Value *y, - llvm::Value *z, llvm::Value *w) +llvm::Value * Instructions::clamp(llvm::Value *in1) { - Constant *const_vec = Constant::getNullValue(m_floatVecType); - Value *res = m_builder.CreateInsertElement(const_vec, x, - m_storage->constantInt(0), - name("vecx")); - res = m_builder.CreateInsertElement(res, y, m_storage->constantInt(1), - name("vecxy")); - res = m_builder.CreateInsertElement(res, z, m_storage->constantInt(2), - name("vecxyz")); - if (w) - res = m_builder.CreateInsertElement(res, w, m_storage->constantInt(3), - name("vecxyzw")); - return res; + llvm::Value *zero = constVector(0.0f, 0.0f, 0.0f, 0.0f); + llvm::Value *one = constVector(1.0f, 1.0f, 1.0f, 1.0f); + return min( max(zero, in1), one); } -llvm::Value *Instructions::callFAbs(llvm::Value *val) +llvm::Value * Instructions::cmp(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3) { - if (!m_llvmFAbs) { - // predeclare the intrinsic - std::vector<const Type*> fabsArgs; - fabsArgs.push_back(Type::FloatTy); - PAListPtr fabsPal; - FunctionType* fabsType = FunctionType::get( - /*Result=*/Type::FloatTy, - /*Params=*/fabsArgs, - /*isVarArg=*/false); - m_llvmFAbs = Function::Create( - /*Type=*/fabsType, - /*Linkage=*/GlobalValue::ExternalLinkage, - /*Name=*/"fabs", m_mod); - m_llvmFAbs->setCallingConv(CallingConv::C); - m_llvmFAbs->setParamAttrs(fabsPal); - } - CallInst *call = m_builder.CreateCall(m_llvmFAbs, val, - name("fabs")); - call->setCallingConv(CallingConv::C); + llvm::Function *func = m_mod->getFunction("cmp"); + assert(func); + + std::vector<Value*> params; + params.push_back(in1); + params.push_back(in2); + params.push_back(in3); + CallInst *call = m_builder.CreateCall(func, params.begin(), params.end(), name("cmpres")); call->setTailCall(false); return call; } -llvm::Value * Instructions::lit(llvm::Value *in) +llvm::Value * Instructions::cnd(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3) { - if (!m_llvmLit) { - m_llvmLit = m_mod->getFunction("lit"); - } - CallInst *call = m_builder.CreateCall(m_llvmLit, in, name("litres")); - call->setCallingConv(CallingConv::C); - call->setTailCall(false); - return call; + std::vector<llvm::Value*> vec1 = extractVector(in1); + std::vector<llvm::Value*> vec2 = extractVector(in2); + std::vector<llvm::Value*> vec3 = extractVector(in3); + Constant *half = ConstantFP::get(APFloat(0.5f)); + + Value *xcmp = m_builder.CreateFCmpOGT(vec1[0], half, name("xcmp")); + Value *selx = m_builder.CreateSelect(xcmp, vec2[0], vec3[0], + name("selx")); + + Value *ycmp = m_builder.CreateFCmpOGT(vec1[1], half, name("ycmp")); + Value *sely = m_builder.CreateSelect(ycmp, vec2[1], vec3[1], + name("sely")); + + Value *zcmp = m_builder.CreateFCmpOGT(vec1[2], half, name("zcmp")); + Value *selz = m_builder.CreateSelect(zcmp, vec2[2], vec3[2], + name("selz")); + + Value *wcmp = m_builder.CreateFCmpOGT(vec1[3], half, name("wcmp")); + Value *selw = m_builder.CreateSelect(wcmp, vec2[3], vec3[3], + name("selw")); + + return vectorFromVals(selx, sely, selz, selw); } -llvm::Value * Instructions::sub(llvm::Value *in1, llvm::Value *in2) +llvm::Value * Instructions::cnd0(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3) { - Value *res = m_builder.CreateSub(in1, in2, name("sub")); - return res; + std::vector<llvm::Value*> vec1 = extractVector(in1); + std::vector<llvm::Value*> vec2 = extractVector(in2); + std::vector<llvm::Value*> vec3 = extractVector(in3); + Constant *zero = Constant::getNullValue(Type::FloatTy); + + Value *xcmp = m_builder.CreateFCmpOGE(vec1[0], zero, name("xcmp")); + Value *selx = m_builder.CreateSelect(xcmp, vec2[0], vec3[0], + name("selx")); + + Value *ycmp = m_builder.CreateFCmpOGE(vec1[1], zero, name("ycmp")); + Value *sely = m_builder.CreateSelect(ycmp, vec2[1], vec3[1], + name("sely")); + + Value *zcmp = m_builder.CreateFCmpOGE(vec1[2], zero, name("zcmp")); + Value *selz = m_builder.CreateSelect(zcmp, vec2[2], vec3[2], + name("selz")); + + Value *wcmp = m_builder.CreateFCmpOGE(vec1[3], zero, name("wcmp")); + Value *selw = m_builder.CreateSelect(wcmp, vec2[3], vec3[3], + name("selw")); + + return vectorFromVals(selx, sely, selz, selw); } -llvm::Value * Instructions::callPow(llvm::Value *val1, llvm::Value *val2) +llvm::Value * Instructions::cos(llvm::Value *in) { - if (!m_llvmPow) { - // predeclare the intrinsic - std::vector<const Type*> powArgs; - powArgs.push_back(Type::FloatTy); - powArgs.push_back(Type::FloatTy); - PAListPtr powPal; - FunctionType* powType = FunctionType::get( - /*Result=*/Type::FloatTy, - /*Params=*/powArgs, - /*isVarArg=*/false); - m_llvmPow = Function::Create( - /*Type=*/powType, - /*Linkage=*/GlobalValue::ExternalLinkage, - /*Name=*/"llvm.pow.f32", m_mod); - m_llvmPow->setCallingConv(CallingConv::C); - m_llvmPow->setParamAttrs(powPal); - } - std::vector<Value*> params; - params.push_back(val1); - params.push_back(val2); - CallInst *call = m_builder.CreateCall(m_llvmPow, params.begin(), params.end(), - name("pow")); - call->setCallingConv(CallingConv::C); +#if 0 + llvm::Function *func = m_mod->getFunction("vcos"); + assert(func); + + CallInst *call = m_builder.CreateCall(func, in, name("cosres")); call->setTailCall(false); return call; +#else + std::vector<llvm::Value*> elems = extractVector(in); + Function *func = m_mod->getFunction("cosf"); + assert(func); + CallInst *cos = m_builder.CreateCall(func, elems[0], name("cosres")); + cos->setCallingConv(CallingConv::C); + cos->setTailCall(true); + return vectorFromVals(cos, cos, cos, cos); +#endif } -llvm::Value * Instructions::pow(llvm::Value *in1, llvm::Value *in2) +llvm::Value * Instructions::cross(llvm::Value *in1, llvm::Value *in2) { Value *x1 = m_builder.CreateExtractElement(in1, m_storage->constantInt(0), name("x1")); + Value *y1 = m_builder.CreateExtractElement(in1, + m_storage->constantInt(1), + name("y1")); + Value *z1 = m_builder.CreateExtractElement(in1, + m_storage->constantInt(2), + name("z1")); + Value *x2 = m_builder.CreateExtractElement(in2, m_storage->constantInt(0), name("x2")); - llvm::Value *val = callPow(x1, x2); - return vectorFromVals(val, val, val, val); + Value *y2 = m_builder.CreateExtractElement(in2, + m_storage->constantInt(1), + name("y2")); + Value *z2 = m_builder.CreateExtractElement(in2, + m_storage->constantInt(2), + name("z2")); + Value *y1z2 = mul(y1, z2); + Value *z1y2 = mul(z1, y2); + + Value *z1x2 = mul(z1, x2); + Value *x1z2 = mul(x1, z2); + + Value *x1y2 = mul(x1, y2); + Value *y1x2 = mul(y1, x2); + + return vectorFromVals(sub(y1z2, z1y2), sub(z1x2, x1z2), sub(x1y2, y1x2)); } -llvm::Value * Instructions::rcp(llvm::Value *in1) +llvm::Value * Instructions::ddx(llvm::Value *in) { - Value *x1 = m_builder.CreateExtractElement(in1, - m_storage->constantInt(0), - name("x1")); - Value *res = m_builder.CreateFDiv(ConstantFP::get(APFloat(1.f)), - x1, name("rcp")); - return vectorFromVals(res, res, res, res); + // FIXME + assert(0); +} + +llvm::Value * Instructions::ddy(llvm::Value *in) +{ + // FIXME + assert(0); +} + +llvm::Value * Instructions::div(llvm::Value *in1, llvm::Value *in2) +{ + return m_builder.CreateFDiv(in1, in2, name("div")); +} + +llvm::Value * Instructions::dot2add(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3) +{ + Value *mulRes = mul(in1, in2); + Value *x = m_builder.CreateExtractElement(mulRes, + m_storage->constantInt(0), + name("extractx")); + Value *y = m_builder.CreateExtractElement(mulRes, + m_storage->constantInt(1), + name("extracty")); + Value *z = m_builder.CreateExtractElement(in3, + m_storage->constantInt(2), + name("extractz")); + Value *xy = m_builder.CreateAdd(x, y,name("xy")); + Value *dot2add = m_builder.CreateAdd(xy, z, name("dot2add")); + return vectorFromVals(dot2add, dot2add, dot2add, dot2add); +} + +llvm::Value * Instructions::dp2(llvm::Value *in1, llvm::Value *in2) +{ + Value *mulRes = mul(in1, in2); + Value *x = m_builder.CreateExtractElement(mulRes, + m_storage->constantInt(0), + name("extractx")); + Value *y = m_builder.CreateExtractElement(mulRes, + m_storage->constantInt(1), + name("extracty")); + Value *xy = m_builder.CreateAdd(x, y,name("xy")); + return vectorFromVals(xy, xy, xy, xy); +} + +llvm::Value * Instructions::dp3(llvm::Value *in1, llvm::Value *in2) +{ + Value *mulRes = mul(in1, in2); + Value *x = m_builder.CreateExtractElement(mulRes, + m_storage->constantInt(0), + name("extractx")); + Value *y = m_builder.CreateExtractElement(mulRes, + m_storage->constantInt(1), + name("extracty")); + Value *z = m_builder.CreateExtractElement(mulRes, + m_storage->constantInt(2), + name("extractz")); + Value *xy = m_builder.CreateAdd(x, y,name("xy")); + Value *dot3 = m_builder.CreateAdd(xy, z, name("dot3")); + return vectorFromVals(dot3, dot3, dot3, dot3); } llvm::Value * Instructions::dp4(llvm::Value *in1, llvm::Value *in2) @@ -321,6 +399,53 @@ llvm::Value * Instructions::dst(llvm::Value *in1, llvm::Value *in2) ry, z, w); } +void Instructions::elseop() +{ + assert(!m_ifStack.empty()); + BasicBlock *ifend = BasicBlock::Create(name("ifend"), m_func,0); + m_builder.CreateBr(ifend); + m_builder.SetInsertPoint(m_ifStack.top()); + currentBlock()->setName(name("ifelse")); + m_ifStack.pop(); + m_ifStack.push(ifend); +} + +void Instructions::endif() +{ + assert(!m_ifStack.empty()); + m_builder.CreateBr(m_ifStack.top()); + m_builder.SetInsertPoint(m_ifStack.top()); + m_ifStack.pop(); +} + +void Instructions::endLoop() +{ + assert(!m_loopStack.empty()); + Loop loop = m_loopStack.top(); + m_builder.CreateBr(loop.begin); + loop.end->moveAfter(currentBlock()); + m_builder.SetInsertPoint(loop.end); + m_loopStack.pop(); +} + +void Instructions::end() +{ + m_builder.CreateRetVoid(); +} + +void Instructions::endSub() +{ + m_func = 0; + m_builder.SetInsertPoint(0); +} + +llvm::Value * Instructions::exp(llvm::Value *in) +{ + std::vector<llvm::Value*> vec = extractVector(in); + return vectorFromVals(callFExp(vec[0]), callFExp(vec[1]), + callFExp(vec[2]), callFExp(vec[3])); +} + llvm::Value * Instructions::ex2(llvm::Value *in) { llvm::Value *val = callPow(ConstantFP::get(APFloat(2.f)), @@ -330,31 +455,6 @@ llvm::Value * Instructions::ex2(llvm::Value *in) return vectorFromVals(val, val, val, val); } -llvm::Value * Instructions::callFloor(llvm::Value *val) -{ - if (!m_llvmFloor) { - // predeclare the intrinsic - std::vector<const Type*> floorArgs; - floorArgs.push_back(Type::FloatTy); - PAListPtr floorPal; - FunctionType* floorType = FunctionType::get( - /*Result=*/Type::FloatTy, - /*Params=*/floorArgs, - /*isVarArg=*/false); - m_llvmFloor = Function::Create( - /*Type=*/floorType, - /*Linkage=*/GlobalValue::ExternalLinkage, - /*Name=*/"floorf", m_mod); - m_llvmFloor->setCallingConv(CallingConv::C); - m_llvmFloor->setParamAttrs(floorPal); - } - CallInst *call = m_builder.CreateCall(m_llvmFloor, val, - name("floorf")); - call->setCallingConv(CallingConv::C); - call->setTailCall(false); - return call; -} - llvm::Value * Instructions::floor(llvm::Value *in) { std::vector<llvm::Value*> vec = extractVector(in); @@ -362,42 +462,52 @@ llvm::Value * Instructions::floor(llvm::Value *in) callFloor(vec[2]), callFloor(vec[3])); } -llvm::Value * Instructions::arl(llvm::Value *in) -{ - return floor(in); -} - llvm::Value * Instructions::frc(llvm::Value *in) { llvm::Value *flr = floor(in); return sub(in, flr); } -llvm::Value * Instructions::callFLog(llvm::Value *val) +void Instructions::ifop(llvm::Value *in) { - if (!m_llvmFlog) { - // predeclare the intrinsic - std::vector<const Type*> flogArgs; - flogArgs.push_back(Type::FloatTy); - PAListPtr flogPal; - FunctionType* flogType = FunctionType::get( - /*Result=*/Type::FloatTy, - /*Params=*/flogArgs, - /*isVarArg=*/false); - m_llvmFlog = Function::Create( - /*Type=*/flogType, - /*Linkage=*/GlobalValue::ExternalLinkage, - /*Name=*/"logf", m_mod); - m_llvmFlog->setCallingConv(CallingConv::C); - m_llvmFlog->setParamAttrs(flogPal); - } - CallInst *call = m_builder.CreateCall(m_llvmFlog, val, - name("logf")); - call->setCallingConv(CallingConv::C); + BasicBlock *ifthen = BasicBlock::Create(name("ifthen"), m_func,0); + BasicBlock *ifend = BasicBlock::Create(name("ifthenend"), m_func,0); + + //BasicBlock *yblock = new BasicBlock(name("yblock"), m_func,0); + //BasicBlock *zblock = new BasicBlock(name("zblock"), m_func,0); + //BasicBlock *wblock = new BasicBlock(name("wblock"), m_func,0); + + Constant *float0 = Constant::getNullValue(Type::FloatTy); + + Value *x = m_builder.CreateExtractElement(in, m_storage->constantInt(0), + name("extractx")); + Value *xcmp = m_builder.CreateFCmpUNE(x, float0, name("xcmp")); + m_builder.CreateCondBr(xcmp, ifthen, ifend); + //m_builder.SetInsertPoint(yblock); + + m_builder.SetInsertPoint(ifthen); + m_ifStack.push(ifend); +} + +llvm::Value * Instructions::kil(llvm::Value *in) +{ + llvm::Function *func = m_mod->getFunction("kil"); + assert(func); + + CallInst *call = m_builder.CreateCall(func, in, name("kilpres")); call->setTailCall(false); return call; } +llvm::Value * Instructions::lerp(llvm::Value *in1, llvm::Value *in2, + llvm::Value *in3) +{ + llvm::Value *m = mul(in1, in2); + llvm::Value *vec1 = constVector(1.f, 1.f, 1.f, 1.f); + llvm::Value *s = sub(vec1, in1); + return add(m, mul(s, in3)); +} + llvm::Value * Instructions::lg2(llvm::Value *in) { std::vector<llvm::Value*> vec = extractVector(in); @@ -407,142 +517,176 @@ llvm::Value * Instructions::lg2(llvm::Value *in) callFLog(vec[2]), callFLog(vec[3])), const_vec); } -llvm::Value * Instructions::min(llvm::Value *in1, llvm::Value *in2) +llvm::Value * Instructions::lit(llvm::Value *in) +{ + if (!m_llvmLit) { + m_llvmLit = m_mod->getFunction("lit"); + } + CallInst *call = m_builder.CreateCall(m_llvmLit, in, name("litres")); + call->setCallingConv(CallingConv::C); + call->setTailCall(false); + return call; +} + +llvm::Value * Instructions::log(llvm::Value *in) +{ + std::vector<llvm::Value*> vec = extractVector(in); + return vectorFromVals(callFLog(vec[0]), callFLog(vec[1]), + callFLog(vec[2]), callFLog(vec[3])); +} + +llvm::Value * Instructions::madd(llvm::Value *in1, llvm::Value *in2, + llvm::Value *in3) +{ + Value *mulRes = mul(in1, in2); + return add(mulRes, in3); +} + +llvm::Value * Instructions::max(llvm::Value *in1, llvm::Value *in2) { std::vector<llvm::Value*> vec1 = extractVector(in1); std::vector<llvm::Value*> vec2 = extractVector(in2); - Value *xcmp = m_builder.CreateFCmpOLT(vec1[0], vec2[0], name("xcmp")); + Value *xcmp = m_builder.CreateFCmpOGT(vec1[0], vec2[0], + name("xcmp")); Value *selx = m_builder.CreateSelect(xcmp, vec1[0], vec2[0], name("selx")); - Value *ycmp = m_builder.CreateFCmpOLT(vec1[1], vec2[1], name("ycmp")); + Value *ycmp = m_builder.CreateFCmpOGT(vec1[1], vec2[1], + name("ycmp")); Value *sely = m_builder.CreateSelect(ycmp, vec1[1], vec2[1], name("sely")); - Value *zcmp = m_builder.CreateFCmpOLT(vec1[2], vec2[2], name("zcmp")); + Value *zcmp = m_builder.CreateFCmpOGT(vec1[2], vec2[2], + name("zcmp")); Value *selz = m_builder.CreateSelect(zcmp, vec1[2], vec2[2], name("selz")); - Value *wcmp = m_builder.CreateFCmpOLT(vec1[3], vec2[3], name("wcmp")); + Value *wcmp = m_builder.CreateFCmpOGT(vec1[3], vec2[3], + name("wcmp")); Value *selw = m_builder.CreateSelect(wcmp, vec1[3], vec2[3], name("selw")); return vectorFromVals(selx, sely, selz, selw); } -llvm::Value * Instructions::max(llvm::Value *in1, llvm::Value *in2) +llvm::Value * Instructions::min(llvm::Value *in1, llvm::Value *in2) { std::vector<llvm::Value*> vec1 = extractVector(in1); std::vector<llvm::Value*> vec2 = extractVector(in2); - Value *xcmp = m_builder.CreateFCmpOGT(vec1[0], vec2[0], - name("xcmp")); + Value *xcmp = m_builder.CreateFCmpOLT(vec1[0], vec2[0], name("xcmp")); Value *selx = m_builder.CreateSelect(xcmp, vec1[0], vec2[0], name("selx")); - Value *ycmp = m_builder.CreateFCmpOGT(vec1[1], vec2[1], - name("ycmp")); + Value *ycmp = m_builder.CreateFCmpOLT(vec1[1], vec2[1], name("ycmp")); Value *sely = m_builder.CreateSelect(ycmp, vec1[1], vec2[1], name("sely")); - Value *zcmp = m_builder.CreateFCmpOGT(vec1[2], vec2[2], - name("zcmp")); + Value *zcmp = m_builder.CreateFCmpOLT(vec1[2], vec2[2], name("zcmp")); Value *selz = m_builder.CreateSelect(zcmp, vec1[2], vec2[2], name("selz")); - Value *wcmp = m_builder.CreateFCmpOGT(vec1[3], vec2[3], - name("wcmp")); + Value *wcmp = m_builder.CreateFCmpOLT(vec1[3], vec2[3], name("wcmp")); Value *selw = m_builder.CreateSelect(wcmp, vec1[3], vec2[3], name("selw")); return vectorFromVals(selx, sely, selz, selw); } -void Instructions::printVector(llvm::Value *val) +llvm::Value * Instructions::mul(llvm::Value *in1, llvm::Value *in2) { - static const char *frmt = "Vector is [%f, %f, %f, %f]\x0A"; + return m_builder.CreateMul(in1, in2, name("mul")); +} - if (!m_fmtPtr) { - Constant *format = ConstantArray::get(frmt, true); - ArrayType *arrayTy = ArrayType::get(IntegerType::get(8), strlen(frmt) + 1); - GlobalVariable* globalFormat = new GlobalVariable( - /*Type=*/arrayTy, - /*isConstant=*/true, - /*Linkage=*/GlobalValue::InternalLinkage, - /*Initializer=*/0, // has initializer, specified below - /*Name=*/name(".str"), - m_mod); - globalFormat->setInitializer(format); +llvm::Value * Instructions::neg(llvm::Value *in) +{ + Value *neg = m_builder.CreateNeg(in, name("neg")); + return neg; +} - Constant* const_int0 = Constant::getNullValue(IntegerType::get(32)); - std::vector<Constant*> const_ptr_21_indices; - const_ptr_21_indices.push_back(const_int0); - const_ptr_21_indices.push_back(const_int0); - m_fmtPtr = ConstantExpr::getGetElementPtr(globalFormat, - &const_ptr_21_indices[0], const_ptr_21_indices.size()); - } +llvm::Value * Instructions::nrm(llvm::Value *in) +{ + llvm::Value *v = rsq(in); + return mul(v, in); +} - Function *func_printf = m_mod->getFunction("printf"); - if (!func_printf) - func_printf = declarePrintf(); - assert(func_printf); - std::vector<llvm::Value*> vec = extractVector(val); - Value *dx = m_builder.CreateFPExt(vec[0], Type::DoubleTy, name("dx")); - Value *dy = m_builder.CreateFPExt(vec[1], Type::DoubleTy, name("dy")); - Value *dz = m_builder.CreateFPExt(vec[2], Type::DoubleTy, name("dz")); - Value *dw = m_builder.CreateFPExt(vec[3], Type::DoubleTy, name("dw")); - std::vector<Value*> params; - params.push_back(m_fmtPtr); - params.push_back(dx); - params.push_back(dy); - params.push_back(dz); - params.push_back(dw); - CallInst *call = m_builder.CreateCall(func_printf, params.begin(), params.end(), - name("printf")); - call->setCallingConv(CallingConv::C); - call->setTailCall(true); +llvm::Value * Instructions::pow(llvm::Value *in1, llvm::Value *in2) +{ + Value *x1 = m_builder.CreateExtractElement(in1, + m_storage->constantInt(0), + name("x1")); + Value *x2 = m_builder.CreateExtractElement(in2, + m_storage->constantInt(0), + name("x2")); + llvm::Value *val = callPow(x1, x2); + return vectorFromVals(val, val, val, val); } -llvm::Function * Instructions::declarePrintf() +llvm::Value * Instructions::rcp(llvm::Value *in1) { - std::vector<const Type*> args; - PAListPtr params; - FunctionType* funcTy = FunctionType::get( - /*Result=*/IntegerType::get(32), - /*Params=*/args, - /*isVarArg=*/true); - Function* func_printf = Function::Create( - /*Type=*/funcTy, - /*Linkage=*/GlobalValue::ExternalLinkage, - /*Name=*/"printf", m_mod); - func_printf->setCallingConv(CallingConv::C); - func_printf->setParamAttrs(params); - return func_printf; + Value *x1 = m_builder.CreateExtractElement(in1, + m_storage->constantInt(0), + name("x1")); + Value *res = m_builder.CreateFDiv(ConstantFP::get(APFloat(1.f)), + x1, name("rcp")); + return vectorFromVals(res, res, res, res); +} + +llvm::Value * Instructions::rsq(llvm::Value *in1) +{ + Value *x = m_builder.CreateExtractElement(in1, + m_storage->constantInt(0), + name("extractx")); + Value *abs = callFAbs(x); + Value *sqrt = callFSqrt(abs); + + Value *rsqrt = m_builder.CreateFDiv(ConstantFP::get(APFloat(1.f)), + sqrt, + name("rsqrt")); + return vectorFromVals(rsqrt, rsqrt, rsqrt, rsqrt); } +llvm::Value * Instructions::scs(llvm::Value *in) +{ + llvm::Function *func = m_mod->getFunction("scs"); + assert(func); -llvm::Value * Instructions::sgt(llvm::Value *in1, llvm::Value *in2) + CallInst *call = m_builder.CreateCall(func, in, name("scsres")); + call->setTailCall(false); + return call; +} + +llvm::Value * Instructions::seq(llvm::Value *in1, llvm::Value *in2) { Constant *const1f = ConstantFP::get(APFloat(1.000000e+00f)); Constant *const0f = Constant::getNullValue(Type::FloatTy); std::vector<llvm::Value*> vec1 = extractVector(in1); std::vector<llvm::Value*> vec2 = extractVector(in2); - Value *xcmp = m_builder.CreateFCmpOGT(vec1[0], vec2[0], name("xcmp")); + + Value *xcmp = m_builder.CreateFCmpOEQ(vec1[0], vec2[0], name("xcmp")); Value *x = m_builder.CreateSelect(xcmp, const1f, const0f, name("xsel")); - Value *ycmp = m_builder.CreateFCmpOGT(vec1[1], vec2[1], name("ycmp")); + Value *ycmp = m_builder.CreateFCmpOEQ(vec1[1], vec2[1], name("ycmp")); Value *y = m_builder.CreateSelect(ycmp, const1f, const0f, name("ysel")); - Value *zcmp = m_builder.CreateFCmpOGT(vec1[2], vec2[2], name("zcmp")); + Value *zcmp = m_builder.CreateFCmpOEQ(vec1[2], vec2[2], name("zcmp")); Value *z = m_builder.CreateSelect(zcmp, const1f, const0f, name("zsel")); - Value *wcmp = m_builder.CreateFCmpOGT(vec1[3], vec2[3], name("wcmp")); + Value *wcmp = m_builder.CreateFCmpOEQ(vec1[3], vec2[3], name("wcmp")); Value *w = m_builder.CreateSelect(wcmp, const1f, const0f, name("wsel")); return vectorFromVals(x, y, z, w); } + +llvm::Value * Instructions::sfl(llvm::Value *in1, llvm::Value *in2) +{ + Constant *const0f = Constant::getNullValue(Type::FloatTy); + + return vectorFromVals(const0f, const0f, const0f, const0f); +} + llvm::Value * Instructions::sge(llvm::Value *in1, llvm::Value *in2) { Constant *const1f = ConstantFP::get(APFloat(1.000000e+00f)); @@ -566,157 +710,118 @@ llvm::Value * Instructions::sge(llvm::Value *in1, llvm::Value *in2) return vectorFromVals(x, y, z, w); } - -llvm::Value * Instructions::slt(llvm::Value *in1, llvm::Value *in2) +llvm::Value * Instructions::sgt(llvm::Value *in1, llvm::Value *in2) { Constant *const1f = ConstantFP::get(APFloat(1.000000e+00f)); Constant *const0f = Constant::getNullValue(Type::FloatTy); std::vector<llvm::Value*> vec1 = extractVector(in1); std::vector<llvm::Value*> vec2 = extractVector(in2); - - Value *xcmp = m_builder.CreateFCmpOLT(vec1[0], vec2[0], name("xcmp")); + Value *xcmp = m_builder.CreateFCmpOGT(vec1[0], vec2[0], name("xcmp")); Value *x = m_builder.CreateSelect(xcmp, const1f, const0f, name("xsel")); - Value *ycmp = m_builder.CreateFCmpOLT(vec1[1], vec2[1], name("ycmp")); + Value *ycmp = m_builder.CreateFCmpOGT(vec1[1], vec2[1], name("ycmp")); Value *y = m_builder.CreateSelect(ycmp, const1f, const0f, name("ysel")); - Value *zcmp = m_builder.CreateFCmpOLT(vec1[2], vec2[2], name("zcmp")); + Value *zcmp = m_builder.CreateFCmpOGT(vec1[2], vec2[2], name("zcmp")); Value *z = m_builder.CreateSelect(zcmp, const1f, const0f, name("zsel")); - Value *wcmp = m_builder.CreateFCmpOLT(vec1[3], vec2[3], name("wcmp")); + Value *wcmp = m_builder.CreateFCmpOGT(vec1[3], vec2[3], name("wcmp")); Value *w = m_builder.CreateSelect(wcmp, const1f, const0f, name("wsel")); return vectorFromVals(x, y, z, w); } -llvm::Value * Instructions::cross(llvm::Value *in1, llvm::Value *in2) +llvm::Value * Instructions::sin(llvm::Value *in) { - Value *x1 = m_builder.CreateExtractElement(in1, - m_storage->constantInt(0), - name("x1")); - Value *y1 = m_builder.CreateExtractElement(in1, - m_storage->constantInt(1), - name("y1")); - Value *z1 = m_builder.CreateExtractElement(in1, - m_storage->constantInt(2), - name("z1")); + llvm::Function *func = m_mod->getFunction("vsin"); + assert(func); - Value *x2 = m_builder.CreateExtractElement(in2, - m_storage->constantInt(0), - name("x2")); - Value *y2 = m_builder.CreateExtractElement(in2, - m_storage->constantInt(1), - name("y2")); - Value *z2 = m_builder.CreateExtractElement(in2, - m_storage->constantInt(2), - name("z2")); - Value *y1z2 = mul(y1, z2); - Value *z1y2 = mul(z1, y2); + CallInst *call = m_builder.CreateCall(func, in, name("sinres")); + call->setTailCall(false); + return call; +} - Value *z1x2 = mul(z1, x2); - Value *x1z2 = mul(x1, z2); +llvm::Value * Instructions::sle(llvm::Value *in1, llvm::Value *in2) +{ + Constant *const1f = ConstantFP::get(APFloat(1.000000e+00f)); + Constant *const0f = Constant::getNullValue(Type::FloatTy); - Value *x1y2 = mul(x1, y2); - Value *y1x2 = mul(y1, x2); + std::vector<llvm::Value*> vec1 = extractVector(in1); + std::vector<llvm::Value*> vec2 = extractVector(in2); - return vectorFromVals(sub(y1z2, z1y2), sub(z1x2, x1z2), sub(x1y2, y1x2)); -} + Value *xcmp = m_builder.CreateFCmpOLE(vec1[0], vec2[0], name("xcmp")); + Value *x = m_builder.CreateSelect(xcmp, const1f, const0f, name("xsel")); + Value *ycmp = m_builder.CreateFCmpOLE(vec1[1], vec2[1], name("ycmp")); + Value *y = m_builder.CreateSelect(ycmp, const1f, const0f, name("ysel")); -llvm::Value * Instructions::abs(llvm::Value *in) -{ - std::vector<llvm::Value*> vec = extractVector(in); - Value *xabs = callFAbs(vec[0]); - Value *yabs = callFAbs(vec[1]); - Value *zabs = callFAbs(vec[2]); - Value *wabs = callFAbs(vec[3]); - return vectorFromVals(xabs, yabs, zabs, wabs); + Value *zcmp = m_builder.CreateFCmpOLE(vec1[2], vec2[2], name("zcmp")); + Value *z = m_builder.CreateSelect(zcmp, const1f, const0f, name("zsel")); + + Value *wcmp = m_builder.CreateFCmpOLE(vec1[3], vec2[3], name("wcmp")); + Value *w = m_builder.CreateSelect(wcmp, const1f, const0f, name("wsel")); + + return vectorFromVals(x, y, z, w); } -void Instructions::ifop(llvm::Value *in) +llvm::Value * Instructions::slt(llvm::Value *in1, llvm::Value *in2) { - BasicBlock *ifthen = BasicBlock::Create(name("ifthen"), m_func,0); - BasicBlock *ifend = BasicBlock::Create(name("ifthenend"), m_func,0); + Constant *const1f = ConstantFP::get(APFloat(1.000000e+00f)); + Constant *const0f = Constant::getNullValue(Type::FloatTy); - //BasicBlock *yblock = new BasicBlock(name("yblock"), m_func,0); - //BasicBlock *zblock = new BasicBlock(name("zblock"), m_func,0); - //BasicBlock *wblock = new BasicBlock(name("wblock"), m_func,0); + std::vector<llvm::Value*> vec1 = extractVector(in1); + std::vector<llvm::Value*> vec2 = extractVector(in2); - Constant *float0 = Constant::getNullValue(Type::FloatTy); + Value *xcmp = m_builder.CreateFCmpOLT(vec1[0], vec2[0], name("xcmp")); + Value *x = m_builder.CreateSelect(xcmp, const1f, const0f, name("xsel")); - Value *x = m_builder.CreateExtractElement(in, m_storage->constantInt(0), - name("extractx")); - Value *xcmp = m_builder.CreateFCmpUNE(x, float0, name("xcmp")); - m_builder.CreateCondBr(xcmp, ifthen, ifend); - //m_builder.SetInsertPoint(yblock); + Value *ycmp = m_builder.CreateFCmpOLT(vec1[1], vec2[1], name("ycmp")); + Value *y = m_builder.CreateSelect(ycmp, const1f, const0f, name("ysel")); - m_builder.SetInsertPoint(ifthen); - m_ifStack.push(ifend); -} + Value *zcmp = m_builder.CreateFCmpOLT(vec1[2], vec2[2], name("zcmp")); + Value *z = m_builder.CreateSelect(zcmp, const1f, const0f, name("zsel")); -llvm::BasicBlock * Instructions::currentBlock() const -{ - return m_builder.GetInsertBlock(); -} + Value *wcmp = m_builder.CreateFCmpOLT(vec1[3], vec2[3], name("wcmp")); + Value *w = m_builder.CreateSelect(wcmp, const1f, const0f, name("wsel")); -void Instructions::elseop() -{ - assert(!m_ifStack.empty()); - BasicBlock *ifend = BasicBlock::Create(name("ifend"), m_func,0); - m_builder.CreateBr(ifend); - m_builder.SetInsertPoint(m_ifStack.top()); - currentBlock()->setName(name("ifelse")); - m_ifStack.pop(); - m_ifStack.push(ifend); + return vectorFromVals(x, y, z, w); } -void Instructions::endif() +llvm::Value * Instructions::sne(llvm::Value *in1, llvm::Value *in2) { - assert(!m_ifStack.empty()); - m_builder.CreateBr(m_ifStack.top()); - m_builder.SetInsertPoint(m_ifStack.top()); - m_ifStack.pop(); -} + Constant *const1f = ConstantFP::get(APFloat(1.000000e+00f)); + Constant *const0f = Constant::getNullValue(Type::FloatTy); -llvm::Value * Instructions::lerp(llvm::Value *in1, llvm::Value *in2, - llvm::Value *in3) -{ - llvm::Value *m = mul(in1, in2); - llvm::Value *vec1 = constVector(1.f, 1.f, 1.f, 1.f); - llvm::Value *s = sub(vec1, in1); - return add(m, mul(s, in3)); -} + std::vector<llvm::Value*> vec1 = extractVector(in1); + std::vector<llvm::Value*> vec2 = extractVector(in2); -void Instructions::beginLoop() -{ - BasicBlock *begin = BasicBlock::Create(name("loop"), m_func,0); - BasicBlock *end = BasicBlock::Create(name("endloop"), m_func,0); + Value *xcmp = m_builder.CreateFCmpONE(vec1[0], vec2[0], name("xcmp")); + Value *x = m_builder.CreateSelect(xcmp, const1f, const0f, name("xsel")); - m_builder.CreateBr(begin); - Loop loop; - loop.begin = begin; - loop.end = end; - m_builder.SetInsertPoint(begin); - m_loopStack.push(loop); + Value *ycmp = m_builder.CreateFCmpONE(vec1[1], vec2[1], name("ycmp")); + Value *y = m_builder.CreateSelect(ycmp, const1f, const0f, name("ysel")); + + Value *zcmp = m_builder.CreateFCmpONE(vec1[2], vec2[2], name("zcmp")); + Value *z = m_builder.CreateSelect(zcmp, const1f, const0f, name("zsel")); + + Value *wcmp = m_builder.CreateFCmpONE(vec1[3], vec2[3], name("wcmp")); + Value *w = m_builder.CreateSelect(wcmp, const1f, const0f, name("wsel")); + + return vectorFromVals(x, y, z, w); } -void Instructions::endLoop() +llvm::Value * Instructions::str(llvm::Value *in1, llvm::Value *in2) { - assert(!m_loopStack.empty()); - Loop loop = m_loopStack.top(); - m_builder.CreateBr(loop.begin); - loop.end->moveAfter(currentBlock()); - m_builder.SetInsertPoint(loop.end); - m_loopStack.pop(); + Constant *const1f = ConstantFP::get(APFloat(1.000000e+00f)); + + return vectorFromVals(const1f, const1f, const1f, const1f); } -void Instructions::brk() +llvm::Value * Instructions::sub(llvm::Value *in1, llvm::Value *in2) { - assert(!m_loopStack.empty()); - BasicBlock *unr = BasicBlock::Create(name("unreachable"), m_func,0); - m_builder.CreateBr(m_loopStack.top().end); - m_builder.SetInsertPoint(unr); + Value *res = m_builder.CreateSub(in1, in2, name("sub")); + return res; } llvm::Value * Instructions::trunc(llvm::Value *in) @@ -741,18 +846,298 @@ llvm::Value * Instructions::trunc(llvm::Value *in) return vectorFromVals(fx, fy, fz, fw); } -void Instructions::end() +llvm::Value * Instructions::x2d(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3) { - m_builder.CreateRetVoid(); + std::vector<llvm::Value*> vec1 = extractVector(in1); + std::vector<llvm::Value*> vec2 = extractVector(in2); + std::vector<llvm::Value*> vec3 = extractVector(in3); + + Value *x2x3 = m_builder.CreateMul( vec2[0], vec3[0], name("x2x3")); + Value *y2y3 = m_builder.CreateMul( vec2[1], vec3[1], name("y2y3")); + Value *x1px2x3 = m_builder.CreateAdd (vec1[0], x2x3, name("x1 + x2x3")); + Value *x1px2x3py2y3 = m_builder.CreateAdd (x1px2x3, y2y3, name("x1 + x2x3 + y2y3")); + + Value *x2z3 = m_builder.CreateMul( vec2[0], vec3[2], name("x2z3")); + Value *y2w3 = m_builder.CreateMul( vec2[1], vec3[3], name("y2w3")); + Value *y1px2z3 = m_builder.CreateAdd (vec1[1], x2z3, name("y1 + x2z3")); + Value *y1px2z3py2w3 = m_builder.CreateAdd (y1px2z3, y2w3, name("y1 + x2z3 + y2w3")); + + return vectorFromVals(x1px2x3py2y3, y1px2z3py2w3, x1px2x3py2y3, y1px2z3py2w3); } -void Instructions::cal(int label, llvm::Value *input) +void Instructions::printVector(llvm::Value *val) { + static const char *frmt = "Vector is [%f, %f, %f, %f]\x0A"; + + if (!m_fmtPtr) { + Constant *format = ConstantArray::get(frmt, true); + ArrayType *arrayTy = ArrayType::get(IntegerType::get(8), strlen(frmt) + 1); + GlobalVariable* globalFormat = new GlobalVariable( + /*Type=*/arrayTy, + /*isConstant=*/true, + /*Linkage=*/GlobalValue::InternalLinkage, + /*Initializer=*/0, // has initializer, specified below + /*Name=*/name(".str"), + m_mod); + globalFormat->setInitializer(format); + + Constant* const_int0 = Constant::getNullValue(IntegerType::get(32)); + std::vector<Constant*> const_ptr_21_indices; + const_ptr_21_indices.push_back(const_int0); + const_ptr_21_indices.push_back(const_int0); + m_fmtPtr = ConstantExpr::getGetElementPtr(globalFormat, + &const_ptr_21_indices[0], const_ptr_21_indices.size()); + } + + Function *func_printf = m_mod->getFunction("printf"); + if (!func_printf) + func_printf = declarePrintf(); + assert(func_printf); + std::vector<llvm::Value*> vec = extractVector(val); + Value *dx = m_builder.CreateFPExt(vec[0], Type::DoubleTy, name("dx")); + Value *dy = m_builder.CreateFPExt(vec[1], Type::DoubleTy, name("dy")); + Value *dz = m_builder.CreateFPExt(vec[2], Type::DoubleTy, name("dz")); + Value *dw = m_builder.CreateFPExt(vec[3], Type::DoubleTy, name("dw")); std::vector<Value*> params; - params.push_back(input); - llvm::Function *func = findFunction(label); + params.push_back(m_fmtPtr); + params.push_back(dx); + params.push_back(dy); + params.push_back(dz); + params.push_back(dw); + CallInst *call = m_builder.CreateCall(func_printf, params.begin(), params.end(), + name("printf")); + call->setCallingConv(CallingConv::C); + call->setTailCall(true); +} - m_builder.CreateCall(func, params.begin(), params.end()); +const char * Instructions::name(const char *prefix) +{ + ++m_idx; + snprintf(m_name, 32, "%s%d", prefix, m_idx); + return m_name; +} + +llvm::Value * Instructions::callCeil(llvm::Value *val) +{ + if (!m_llvmCeil) { + // predeclare the intrinsic + std::vector<const Type*> ceilArgs; + ceilArgs.push_back(Type::FloatTy); + AttrListPtr ceilPal; + FunctionType* ceilType = FunctionType::get( + /*Result=*/Type::FloatTy, + /*Params=*/ceilArgs, + /*isVarArg=*/false); + m_llvmCeil = Function::Create( + /*Type=*/ceilType, + /*Linkage=*/GlobalValue::ExternalLinkage, + /*Name=*/"ceilf", m_mod); + m_llvmCeil->setCallingConv(CallingConv::C); + m_llvmCeil->setAttributes(ceilPal); + } + CallInst *call = m_builder.CreateCall(m_llvmCeil, val, + name("ceilf")); + call->setCallingConv(CallingConv::C); + call->setTailCall(false); + return call; +} + +llvm::Value *Instructions::callFAbs(llvm::Value *val) +{ + if (!m_llvmFAbs) { + // predeclare the intrinsic + std::vector<const Type*> fabsArgs; + fabsArgs.push_back(Type::FloatTy); + AttrListPtr fabsPal; + FunctionType* fabsType = FunctionType::get( + /*Result=*/Type::FloatTy, + /*Params=*/fabsArgs, + /*isVarArg=*/false); + m_llvmFAbs = Function::Create( + /*Type=*/fabsType, + /*Linkage=*/GlobalValue::ExternalLinkage, + /*Name=*/"fabs", m_mod); + m_llvmFAbs->setCallingConv(CallingConv::C); + m_llvmFAbs->setAttributes(fabsPal); + } + CallInst *call = m_builder.CreateCall(m_llvmFAbs, val, + name("fabs")); + call->setCallingConv(CallingConv::C); + call->setTailCall(false); + return call; +} + +llvm::Value * Instructions::callFExp(llvm::Value *val) +{ + if (!m_llvmFexp) { + // predeclare the intrinsic + std::vector<const Type*> fexpArgs; + fexpArgs.push_back(Type::FloatTy); + AttrListPtr fexpPal; + FunctionType* fexpType = FunctionType::get( + /*Result=*/Type::FloatTy, + /*Params=*/fexpArgs, + /*isVarArg=*/false); + m_llvmFexp = Function::Create( + /*Type=*/fexpType, + /*Linkage=*/GlobalValue::ExternalLinkage, + /*Name=*/"expf", m_mod); + m_llvmFexp->setCallingConv(CallingConv::C); + m_llvmFexp->setAttributes(fexpPal); + } + CallInst *call = m_builder.CreateCall(m_llvmFexp, val, + name("expf")); + call->setCallingConv(CallingConv::C); + call->setTailCall(false); + return call; +} + +llvm::Value * Instructions::callFLog(llvm::Value *val) +{ + if (!m_llvmFlog) { + // predeclare the intrinsic + std::vector<const Type*> flogArgs; + flogArgs.push_back(Type::FloatTy); + AttrListPtr flogPal; + FunctionType* flogType = FunctionType::get( + /*Result=*/Type::FloatTy, + /*Params=*/flogArgs, + /*isVarArg=*/false); + m_llvmFlog = Function::Create( + /*Type=*/flogType, + /*Linkage=*/GlobalValue::ExternalLinkage, + /*Name=*/"logf", m_mod); + m_llvmFlog->setCallingConv(CallingConv::C); + m_llvmFlog->setAttributes(flogPal); + } + CallInst *call = m_builder.CreateCall(m_llvmFlog, val, + name("logf")); + call->setCallingConv(CallingConv::C); + call->setTailCall(false); + return call; +} + +llvm::Value * Instructions::callFloor(llvm::Value *val) +{ + if (!m_llvmFloor) { + // predeclare the intrinsic + std::vector<const Type*> floorArgs; + floorArgs.push_back(Type::FloatTy); + AttrListPtr floorPal; + FunctionType* floorType = FunctionType::get( + /*Result=*/Type::FloatTy, + /*Params=*/floorArgs, + /*isVarArg=*/false); + m_llvmFloor = Function::Create( + /*Type=*/floorType, + /*Linkage=*/GlobalValue::ExternalLinkage, + /*Name=*/"floorf", m_mod); + m_llvmFloor->setCallingConv(CallingConv::C); + m_llvmFloor->setAttributes(floorPal); + } + CallInst *call = m_builder.CreateCall(m_llvmFloor, val, + name("floorf")); + call->setCallingConv(CallingConv::C); + call->setTailCall(false); + return call; +} + +llvm::Value *Instructions::callFSqrt(llvm::Value *val) +{ + if (!m_llvmFSqrt) { + // predeclare the intrinsic + std::vector<const Type*> fsqrtArgs; + fsqrtArgs.push_back(Type::FloatTy); + AttrListPtr fsqrtPal; + FunctionType* fsqrtType = FunctionType::get( + /*Result=*/Type::FloatTy, + /*Params=*/fsqrtArgs, + /*isVarArg=*/false); + m_llvmFSqrt = Function::Create( + /*Type=*/fsqrtType, + /*Linkage=*/GlobalValue::ExternalLinkage, + /*Name=*/"llvm.sqrt.f32", m_mod); + m_llvmFSqrt->setCallingConv(CallingConv::C); + m_llvmFSqrt->setAttributes(fsqrtPal); + } + CallInst *call = m_builder.CreateCall(m_llvmFSqrt, val, + name("sqrt")); + call->setCallingConv(CallingConv::C); + call->setTailCall(false); + return call; +} + +llvm::Value * Instructions::callPow(llvm::Value *val1, llvm::Value *val2) +{ + if (!m_llvmPow) { + // predeclare the intrinsic + std::vector<const Type*> powArgs; + powArgs.push_back(Type::FloatTy); + powArgs.push_back(Type::FloatTy); + AttrListPtr powPal; + FunctionType* powType = FunctionType::get( + /*Result=*/Type::FloatTy, + /*Params=*/powArgs, + /*isVarArg=*/false); + m_llvmPow = Function::Create( + /*Type=*/powType, + /*Linkage=*/GlobalValue::ExternalLinkage, + /*Name=*/"llvm.pow.f32", m_mod); + m_llvmPow->setCallingConv(CallingConv::C); + m_llvmPow->setAttributes(powPal); + } + std::vector<Value*> params; + params.push_back(val1); + params.push_back(val2); + CallInst *call = m_builder.CreateCall(m_llvmPow, params.begin(), params.end(), + name("pow")); + call->setCallingConv(CallingConv::C); + call->setTailCall(false); + return call; +} + +llvm::Value * Instructions::vectorFromVals(llvm::Value *x, llvm::Value *y, + llvm::Value *z, llvm::Value *w) +{ + Constant *const_vec = Constant::getNullValue(m_floatVecType); + Value *res = m_builder.CreateInsertElement(const_vec, x, + m_storage->constantInt(0), + name("vecx")); + res = m_builder.CreateInsertElement(res, y, m_storage->constantInt(1), + name("vecxy")); + res = m_builder.CreateInsertElement(res, z, m_storage->constantInt(2), + name("vecxyz")); + if (w) + res = m_builder.CreateInsertElement(res, w, m_storage->constantInt(3), + name("vecxyzw")); + return res; +} + +llvm::Value * Instructions::constVector(float x, float y, float z, float w) +{ + std::vector<Constant*> vec(4); + vec[0] = ConstantFP::get(APFloat(x)); + vec[1] = ConstantFP::get(APFloat(y)); + vec[2] = ConstantFP::get(APFloat(z)); + vec[3] = ConstantFP::get(APFloat(w)); + return ConstantVector::get(m_floatVecType, vec); +} + +llvm::Function * Instructions::declarePrintf() +{ + std::vector<const Type*> args; + AttrListPtr params; + FunctionType* funcTy = FunctionType::get( + /*Result=*/IntegerType::get(32), + /*Params=*/args, + /*isVarArg=*/true); + Function* func_printf = Function::Create( + /*Type=*/funcTy, + /*Linkage=*/GlobalValue::ExternalLinkage, + /*Name=*/"printf", m_mod); + func_printf->setCallingConv(CallingConv::C); + func_printf->setAttributes(params); + return func_printf; } llvm::Function * Instructions::declareFunc(int label) @@ -763,7 +1148,7 @@ llvm::Function * Instructions::declareFunc(int label) args.push_back(vecPtr); args.push_back(vecPtr); args.push_back(vecPtr); - PAListPtr params; + AttrListPtr params; FunctionType *funcType = FunctionType::get( /*Result=*/Type::VoidTy, /*Params=*/args, @@ -774,31 +1159,10 @@ llvm::Function * Instructions::declareFunc(int label) /*Linkage=*/GlobalValue::ExternalLinkage, /*Name=*/name.c_str(), m_mod); func->setCallingConv(CallingConv::C); - func->setParamAttrs(params); + func->setAttributes(params); return func; } -void Instructions::bgnSub(unsigned label) -{ - llvm::Function *func = findFunction(label); - - Function::arg_iterator args = func->arg_begin(); - Value *ptr_INPUT = args++; - ptr_INPUT->setName("INPUT"); - m_storage->pushArguments(ptr_INPUT); - - llvm::BasicBlock *entry = BasicBlock::Create("entry", func, 0); - - m_func = func; - m_builder.SetInsertPoint(entry); -} - -void Instructions::endSub() -{ - m_func = 0; - m_builder.SetInsertPoint(0); -} - llvm::Function * Instructions::findFunction(int label) { llvm::Function *func = m_functions[label]; @@ -809,17 +1173,6 @@ llvm::Function * Instructions::findFunction(int label) return func; } -llvm::Value * Instructions::constVector(float x, float y, float z, float w) -{ - std::vector<Constant*> vec(4); - vec[0] = ConstantFP::get(APFloat(x)); - vec[1] = ConstantFP::get(APFloat(y)); - vec[2] = ConstantFP::get(APFloat(z)); - vec[3] = ConstantFP::get(APFloat(w)); - return ConstantVector::get(m_floatVecType, vec); -} - - std::vector<llvm::Value*> Instructions::extractVector(llvm::Value *vec) { std::vector<llvm::Value*> elems(4); @@ -834,69 +1187,7 @@ std::vector<llvm::Value*> Instructions::extractVector(llvm::Value *vec) return elems; } -llvm::Value * Instructions::cmp(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3) -{ - llvm::Function *func = m_mod->getFunction("cmp"); - assert(func); - - std::vector<Value*> params; - params.push_back(in1); - params.push_back(in2); - params.push_back(in3); - CallInst *call = m_builder.CreateCall(func, params.begin(), params.end(), name("cmpres")); - call->setTailCall(false); - return call; -} - -llvm::Value * Instructions::cos(llvm::Value *in) -{ -#if 0 - llvm::Function *func = m_mod->getFunction("vcos"); - assert(func); - CallInst *call = m_builder.CreateCall(func, in, name("cosres")); - call->setTailCall(false); - return call; -#else - std::vector<llvm::Value*> elems = extractVector(in); - Function *func = m_mod->getFunction("cosf"); - assert(func); - CallInst *cos = m_builder.CreateCall(func, elems[0], name("cosres")); - cos->setCallingConv(CallingConv::C); - cos->setTailCall(true); - return vectorFromVals(cos, cos, cos, cos); -#endif -} - -llvm::Value * Instructions::scs(llvm::Value *in) -{ - llvm::Function *func = m_mod->getFunction("scs"); - assert(func); - - CallInst *call = m_builder.CreateCall(func, in, name("scsres")); - call->setTailCall(false); - return call; -} - -llvm::Value * Instructions::kil(llvm::Value *in) -{ - llvm::Function *func = m_mod->getFunction("kil"); - assert(func); - - CallInst *call = m_builder.CreateCall(func, in, name("kilpres")); - call->setTailCall(false); - return call; -} - -llvm::Value * Instructions::sin(llvm::Value *in) -{ - llvm::Function *func = m_mod->getFunction("vsin"); - assert(func); - - CallInst *call = m_builder.CreateCall(func, in, name("sinres")); - call->setTailCall(false); - return call; -} #endif //MESA_LLVM diff --git a/src/gallium/auxiliary/gallivm/instructions.h b/src/gallium/auxiliary/gallivm/instructions.h index d286ce80c7..e18571251e 100644 --- a/src/gallium/auxiliary/gallivm/instructions.h +++ b/src/gallium/auxiliary/gallivm/instructions.h @@ -57,15 +57,24 @@ public: llvm::BasicBlock *currentBlock() const; llvm::Value *abs(llvm::Value *in1); - llvm::Value *arl(llvm::Value *in1); llvm::Value *add(llvm::Value *in1, llvm::Value *in2); + llvm::Value *arl(llvm::Value *in1); void beginLoop(); void bgnSub(unsigned); void brk(); void cal(int label, llvm::Value *input); + llvm::Value *ceil(llvm::Value *in); + llvm::Value *clamp(llvm::Value *in); llvm::Value *cmp(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3); + llvm::Value *cnd(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3); + llvm::Value *cnd0(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3); llvm::Value *cos(llvm::Value *in); llvm::Value *cross(llvm::Value *in1, llvm::Value *in2); + llvm::Value *ddx(llvm::Value *in); + llvm::Value *ddy(llvm::Value *in); + llvm::Value *div(llvm::Value *in1, llvm::Value *in2); + llvm::Value *dot2add(llvm::Value *in, llvm::Value *in2, llvm::Value *in3); + llvm::Value *dp2(llvm::Value *in1, llvm::Value *in2); llvm::Value *dp3(llvm::Value *in1, llvm::Value *in2); llvm::Value *dp4(llvm::Value *in1, llvm::Value *in2); llvm::Value *dph(llvm::Value *in1, llvm::Value *in2); @@ -75,6 +84,7 @@ public: void endLoop(); void end(); void endSub(); + llvm::Value *exp(llvm::Value *in); llvm::Value *ex2(llvm::Value *in); llvm::Value *floor(llvm::Value *in); llvm::Value *frc(llvm::Value *in); @@ -82,32 +92,43 @@ public: llvm::Value *kil(llvm::Value *in); llvm::Value *lerp(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3); - llvm::Value *lit(llvm::Value *in); llvm::Value *lg2(llvm::Value *in); + llvm::Value *lit(llvm::Value *in); + llvm::Value *log(llvm::Value *in); llvm::Value *madd(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3); - llvm::Value *min(llvm::Value *in1, llvm::Value *in2); llvm::Value *max(llvm::Value *in1, llvm::Value *in2); + llvm::Value *min(llvm::Value *in1, llvm::Value *in2); llvm::Value *mul(llvm::Value *in1, llvm::Value *in2); + llvm::Value *neg(llvm::Value *in); + llvm::Value *nrm(llvm::Value *in); llvm::Value *pow(llvm::Value *in1, llvm::Value *in2); llvm::Value *rcp(llvm::Value *in); llvm::Value *rsq(llvm::Value *in); llvm::Value *scs(llvm::Value *in); + llvm::Value *seq(llvm::Value *in1, llvm::Value *in2); + llvm::Value *sfl(llvm::Value *in1, llvm::Value *in2); llvm::Value *sge(llvm::Value *in1, llvm::Value *in2); llvm::Value *sgt(llvm::Value *in1, llvm::Value *in2); llvm::Value *sin(llvm::Value *in); + llvm::Value *sle(llvm::Value *in1, llvm::Value *in2); llvm::Value *slt(llvm::Value *in1, llvm::Value *in2); + llvm::Value *sne(llvm::Value *in1, llvm::Value *in2); + llvm::Value *str(llvm::Value *in1, llvm::Value *in2); llvm::Value *sub(llvm::Value *in1, llvm::Value *in2); llvm::Value *trunc(llvm::Value *in); + llvm::Value *x2d(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3); void printVector(llvm::Value *val); private: const char *name(const char *prefix); + llvm::Value *callCeil(llvm::Value *val); llvm::Value *callFAbs(llvm::Value *val); + llvm::Value *callFExp(llvm::Value *val); + llvm::Value *callFLog(llvm::Value *val); llvm::Value *callFloor(llvm::Value *val); llvm::Value *callFSqrt(llvm::Value *val); - llvm::Value *callFLog(llvm::Value *val); llvm::Value *callPow(llvm::Value *val1, llvm::Value *val2); llvm::Value *vectorFromVals(llvm::Value *x, llvm::Value *y, @@ -125,16 +146,18 @@ private: llvm::Module *m_mod; llvm::Function *m_func; char m_name[32]; - llvm::IRBuilder m_builder; + llvm::IRBuilder<> m_builder; int m_idx; llvm::VectorType *m_floatVecType; + llvm::Function *m_llvmCeil; llvm::Function *m_llvmFSqrt; llvm::Function *m_llvmFAbs; llvm::Function *m_llvmPow; llvm::Function *m_llvmFloor; llvm::Function *m_llvmFlog; + llvm::Function *m_llvmFexp; llvm::Function *m_llvmLit; llvm::Constant *m_fmtPtr; diff --git a/src/gallium/auxiliary/gallivm/instructionssoa.cpp b/src/gallium/auxiliary/gallivm/instructionssoa.cpp index efddc04e81..d5600fd22d 100644 --- a/src/gallium/auxiliary/gallivm/instructionssoa.cpp +++ b/src/gallium/auxiliary/gallivm/instructionssoa.cpp @@ -90,68 +90,11 @@ llvm::Value * InstructionsSoa::vectorFromVals(llvm::Value *x, llvm::Value *y, return res; } -std::vector<llvm::Value*> InstructionsSoa::arl(const std::vector<llvm::Value*> in) -{ - std::vector<llvm::Value*> res(4); - - //Extract x's - llvm::Value *x1 = m_builder.CreateExtractElement(in[0], - m_storage->constantInt(0), - name("extractX")); - //cast it to an unsigned int - x1 = m_builder.CreateFPToUI(x1, IntegerType::get(32), name("x1IntCast")); - - res[0] = x1;//vectorFromVals(x1, x2, x3, x4); - //only x is valid. the others shouldn't be necessary - /* - res[1] = Constant::getNullValue(m_floatVecType); - res[2] = Constant::getNullValue(m_floatVecType); - res[3] = Constant::getNullValue(m_floatVecType); - */ - - return res; -} - - -std::vector<llvm::Value*> InstructionsSoa::add(const std::vector<llvm::Value*> in1, - const std::vector<llvm::Value*> in2) -{ - std::vector<llvm::Value*> res(4); - - res[0] = m_builder.CreateAdd(in1[0], in2[0], name("addx")); - res[1] = m_builder.CreateAdd(in1[1], in2[1], name("addy")); - res[2] = m_builder.CreateAdd(in1[2], in2[2], name("addz")); - res[3] = m_builder.CreateAdd(in1[3], in2[3], name("addw")); - - return res; -} - -std::vector<llvm::Value*> InstructionsSoa::mul(const std::vector<llvm::Value*> in1, - const std::vector<llvm::Value*> in2) -{ - std::vector<llvm::Value*> res(4); - - res[0] = m_builder.CreateMul(in1[0], in2[0], name("mulx")); - res[1] = m_builder.CreateMul(in1[1], in2[1], name("muly")); - res[2] = m_builder.CreateMul(in1[2], in2[2], name("mulz")); - res[3] = m_builder.CreateMul(in1[3], in2[3], name("mulw")); - - return res; -} - void InstructionsSoa::end() { m_builder.CreateRetVoid(); } -std::vector<llvm::Value*> InstructionsSoa::madd(const std::vector<llvm::Value*> in1, - const std::vector<llvm::Value*> in2, - const std::vector<llvm::Value*> in3) -{ - std::vector<llvm::Value*> res = mul(in1, in2); - return add(res, in3); -} - std::vector<llvm::Value*> InstructionsSoa::extractVector(llvm::Value *vector) { std::vector<llvm::Value*> res(4); @@ -171,6 +114,11 @@ std::vector<llvm::Value*> InstructionsSoa::extractVector(llvm::Value *vector) return res; } +llvm::IRBuilder<>* InstructionsSoa::getIRBuilder() +{ + return &m_builder; +} + void InstructionsSoa::createFunctionMap() { m_functionsMap[TGSI_OPCODE_ABS] = "abs"; @@ -181,6 +129,7 @@ void InstructionsSoa::createFunctionMap() m_functionsMap[TGSI_OPCODE_POWER] = "pow"; m_functionsMap[TGSI_OPCODE_LIT] = "lit"; m_functionsMap[TGSI_OPCODE_RSQ] = "rsq"; + m_functionsMap[TGSI_OPCODE_SLT] = "slt"; } void InstructionsSoa::createDependencies() @@ -273,6 +222,41 @@ std::vector<llvm::Value*> InstructionsSoa::abs(const std::vector<llvm::Value*> i return callBuiltin(func, in1); } +std::vector<llvm::Value*> InstructionsSoa::add(const std::vector<llvm::Value*> in1, + const std::vector<llvm::Value*> in2) +{ + std::vector<llvm::Value*> res(4); + + res[0] = m_builder.CreateAdd(in1[0], in2[0], name("addx")); + res[1] = m_builder.CreateAdd(in1[1], in2[1], name("addy")); + res[2] = m_builder.CreateAdd(in1[2], in2[2], name("addz")); + res[3] = m_builder.CreateAdd(in1[3], in2[3], name("addw")); + + return res; +} + +std::vector<llvm::Value*> InstructionsSoa::arl(const std::vector<llvm::Value*> in) +{ + std::vector<llvm::Value*> res(4); + + //Extract x's + llvm::Value *x1 = m_builder.CreateExtractElement(in[0], + m_storage->constantInt(0), + name("extractX")); + //cast it to an unsigned int + x1 = m_builder.CreateFPToUI(x1, IntegerType::get(32), name("x1IntCast")); + + res[0] = x1;//vectorFromVals(x1, x2, x3, x4); + //only x is valid. the others shouldn't be necessary + /* + res[1] = Constant::getNullValue(m_floatVecType); + res[2] = Constant::getNullValue(m_floatVecType); + res[3] = Constant::getNullValue(m_floatVecType); + */ + + return res; +} + std::vector<llvm::Value*> InstructionsSoa::dp3(const std::vector<llvm::Value*> in1, const std::vector<llvm::Value*> in2) { @@ -280,6 +264,98 @@ std::vector<llvm::Value*> InstructionsSoa::dp3(const std::vector<llvm::Value*> i return callBuiltin(func, in1, in2); } +std::vector<llvm::Value*> InstructionsSoa::lit(const std::vector<llvm::Value*> in) +{ + llvm::Function *func = function(TGSI_OPCODE_LIT); + return callBuiltin(func, in); +} + +std::vector<llvm::Value*> InstructionsSoa::madd(const std::vector<llvm::Value*> in1, + const std::vector<llvm::Value*> in2, + const std::vector<llvm::Value*> in3) +{ + std::vector<llvm::Value*> res = mul(in1, in2); + return add(res, in3); +} + +std::vector<llvm::Value*> InstructionsSoa::max(const std::vector<llvm::Value*> in1, + const std::vector<llvm::Value*> in2) +{ + llvm::Function *func = function(TGSI_OPCODE_MAX); + return callBuiltin(func, in1, in2); +} + +std::vector<llvm::Value*> InstructionsSoa::min(const std::vector<llvm::Value*> in1, + const std::vector<llvm::Value*> in2) +{ + llvm::Function *func = function(TGSI_OPCODE_MIN); + return callBuiltin(func, in1, in2); +} + +std::vector<llvm::Value*> InstructionsSoa::mul(const std::vector<llvm::Value*> in1, + const std::vector<llvm::Value*> in2) +{ + std::vector<llvm::Value*> res(4); + + res[0] = m_builder.CreateMul(in1[0], in2[0], name("mulx")); + res[1] = m_builder.CreateMul(in1[1], in2[1], name("muly")); + res[2] = m_builder.CreateMul(in1[2], in2[2], name("mulz")); + res[3] = m_builder.CreateMul(in1[3], in2[3], name("mulw")); + + return res; +} + +std::vector<llvm::Value*> InstructionsSoa::pow(const std::vector<llvm::Value*> in1, + const std::vector<llvm::Value*> in2) +{ + llvm::Function *func = function(TGSI_OPCODE_POWER); + return callBuiltin(func, in1, in2); +} + +std::vector<llvm::Value*> InstructionsSoa::rsq(const std::vector<llvm::Value*> in) +{ + llvm::Function *func = function(TGSI_OPCODE_RSQ); + return callBuiltin(func, in); +} + +std::vector<llvm::Value*> InstructionsSoa::slt(const std::vector<llvm::Value*> in1, + const std::vector<llvm::Value*> in2) +{ + llvm::Function *func = function(TGSI_OPCODE_SLT); + return callBuiltin(func, in1, in2); +} + +std::vector<llvm::Value*> InstructionsSoa::sub(const std::vector<llvm::Value*> in1, + const std::vector<llvm::Value*> in2) +{ + std::vector<llvm::Value*> res(4); + + res[0] = m_builder.CreateSub(in1[0], in2[0], name("subx")); + res[1] = m_builder.CreateSub(in1[1], in2[1], name("suby")); + res[2] = m_builder.CreateSub(in1[2], in2[2], name("subz")); + res[3] = m_builder.CreateSub(in1[3], in2[3], name("subw")); + + return res; +} + +void checkFunction(Function *func) +{ + for (Function::const_iterator BI = func->begin(), BE = func->end(); + BI != BE; ++BI) { + const BasicBlock &BB = *BI; + for (BasicBlock::const_iterator II = BB.begin(), IE = BB.end(); + II != IE; ++II) { + const Instruction &I = *II; + std::cout<< "Instr = "<<I; + for (unsigned op = 0, E = I.getNumOperands(); op != E; ++op) { + const Value *Op = I.getOperand(op); + std::cout<< "\top = "<<Op<<"("<<op<<")"<<std::endl; + //I->setOperand(op, V); + } + } + } +} + llvm::Value * InstructionsSoa::allocaTemp() { VectorType *vector = VectorType::get(Type::FloatTy, 4); @@ -399,46 +475,6 @@ std::vector<Value*> InstructionsSoa::callBuiltin(llvm::Function *func, const std return allocaToResult(allocaPtr); } -std::vector<llvm::Value*> InstructionsSoa::pow(const std::vector<llvm::Value*> in1, - const std::vector<llvm::Value*> in2) -{ - llvm::Function *func = function(TGSI_OPCODE_POWER); - return callBuiltin(func, in1, in2); -} - -std::vector<llvm::Value*> InstructionsSoa::min(const std::vector<llvm::Value*> in1, - const std::vector<llvm::Value*> in2) -{ - llvm::Function *func = function(TGSI_OPCODE_MIN); - return callBuiltin(func, in1, in2); -} - - -std::vector<llvm::Value*> InstructionsSoa::max(const std::vector<llvm::Value*> in1, - const std::vector<llvm::Value*> in2) -{ - llvm::Function *func = function(TGSI_OPCODE_MAX); - return callBuiltin(func, in1, in2); -} - -void checkFunction(Function *func) -{ - for (Function::const_iterator BI = func->begin(), BE = func->end(); - BI != BE; ++BI) { - const BasicBlock &BB = *BI; - for (BasicBlock::const_iterator II = BB.begin(), IE = BB.end(); - II != IE; ++II) { - const Instruction &I = *II; - std::cout<< "Instr = "<<I; - for (unsigned op = 0, E = I.getNumOperands(); op != E; ++op) { - const Value *Op = I.getOperand(op); - std::cout<< "\top = "<<Op<<"("<<op<<")"<<std::endl; - //I->setOperand(op, V); - } - } - } -} - void InstructionsSoa::injectFunction(llvm::Function *originalFunc, int op) { assert(originalFunc); @@ -458,8 +494,8 @@ void InstructionsSoa::injectFunction(llvm::Function *originalFunc, int op) func = Function::Create(originalFunc->getFunctionType(), GlobalValue::ExternalLinkage, originalFunc->getName(), currentModule()); func->setCallingConv(CallingConv::C); - const PAListPtr pal; - func->setParamAttrs(pal); + const AttrListPtr pal; + func->setAttributes(pal); currentModule()->dump(); } else { DenseMap<const Value*, Value *> val; @@ -483,28 +519,4 @@ void InstructionsSoa::injectFunction(llvm::Function *originalFunc, int op) } } -std::vector<llvm::Value*> InstructionsSoa::sub(const std::vector<llvm::Value*> in1, - const std::vector<llvm::Value*> in2) -{ - std::vector<llvm::Value*> res(4); - - res[0] = m_builder.CreateSub(in1[0], in2[0], name("subx")); - res[1] = m_builder.CreateSub(in1[1], in2[1], name("suby")); - res[2] = m_builder.CreateSub(in1[2], in2[2], name("subz")); - res[3] = m_builder.CreateSub(in1[3], in2[3], name("subw")); - - return res; -} - -std::vector<llvm::Value*> InstructionsSoa::lit(const std::vector<llvm::Value*> in) -{ - llvm::Function *func = function(TGSI_OPCODE_LIT); - return callBuiltin(func, in); -} - -std::vector<llvm::Value*> InstructionsSoa::rsq(const std::vector<llvm::Value*> in) -{ - llvm::Function *func = function(TGSI_OPCODE_RSQ); - return callBuiltin(func, in); -} diff --git a/src/gallium/auxiliary/gallivm/instructionssoa.h b/src/gallium/auxiliary/gallivm/instructionssoa.h index 3e20b652dd..d6831e0a6b 100644 --- a/src/gallium/auxiliary/gallivm/instructionssoa.h +++ b/src/gallium/auxiliary/gallivm/instructionssoa.h @@ -69,11 +69,14 @@ public: std::vector<llvm::Value*> pow(const std::vector<llvm::Value*> in1, const std::vector<llvm::Value*> in2); std::vector<llvm::Value*> rsq(const std::vector<llvm::Value*> in1); + std::vector<llvm::Value*> slt(const std::vector<llvm::Value*> in1, + const std::vector<llvm::Value*> in2); std::vector<llvm::Value*> sub(const std::vector<llvm::Value*> in1, const std::vector<llvm::Value*> in2); void end(); std::vector<llvm::Value*> extractVector(llvm::Value *vector); + llvm::IRBuilder<>* getIRBuilder(); private: const char * name(const char *prefix) const; llvm::Value *vectorFromVals(llvm::Value *x, llvm::Value *y, @@ -96,7 +99,7 @@ private: const std::vector<llvm::Value*> in3); void injectFunction(llvm::Function *originalFunc, int op = TGSI_OPCODE_LAST); private: - llvm::IRBuilder m_builder; + llvm::IRBuilder<> m_builder; StorageSoa *m_storage; std::map<int, std::string> m_functionsMap; diff --git a/src/gallium/auxiliary/gallivm/soabuiltins.c b/src/gallium/auxiliary/gallivm/soabuiltins.c index 78f84510e2..cb85e1734e 100644 --- a/src/gallium/auxiliary/gallivm/soabuiltins.c +++ b/src/gallium/auxiliary/gallivm/soabuiltins.c @@ -36,6 +36,8 @@ typedef __attribute__(( ext_vector_type(4) )) float float4; extern float fabsf(float val); +/* helpers */ + float4 absvec(float4 vec) { float4 res; @@ -47,6 +49,58 @@ float4 absvec(float4 vec) return res; } +float4 maxvec(float4 a, float4 b) +{ + return (float4){(a.x > b.x) ? a.x : b.x, + (a.y > b.y) ? a.y : b.y, + (a.z > b.z) ? a.z : b.z, + (a.w > b.w) ? a.w : b.w}; +} + +float4 minvec(float4 a, float4 b) +{ + return (float4){(a.x < b.x) ? a.x : b.x, + (a.y < b.y) ? a.y : b.y, + (a.z < b.z) ? a.z : b.z, + (a.w < b.w) ? a.w : b.w}; +} + +extern float powf(float num, float p); +extern float sqrtf(float x); + +float4 powvec(float4 vec, float4 q) +{ + float4 p; + p.x = powf(vec.x, q.x); + p.y = powf(vec.y, q.y); + p.z = powf(vec.z, q.z); + p.w = powf(vec.w, q.w); + return p; +} + +float4 sqrtvec(float4 vec) +{ + float4 p; + p.x = sqrtf(vec.x); + p.y = sqrtf(vec.y); + p.z = sqrtf(vec.z); + p.w = sqrtf(vec.w); + return p; +} + +float4 sltvec(float4 v1, float4 v2) +{ + float4 p; + p.x = (v1.x < v2.x) ? 1.0 : 0.0; + p.y = (v1.y < v2.y) ? 1.0 : 0.0; + p.z = (v1.z < v2.z) ? 1.0 : 0.0; + p.w = (v1.w < v2.w) ? 1.0 : 0.0; + return p; +} + + +/* instructions */ + void abs(float4 *res, float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w) { @@ -69,7 +123,6 @@ void dp3(float4 *res, res[3] = dot; } - void dp4(float4 *res, float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w, float4 tmp1x, float4 tmp1y, float4 tmp1z, float4 tmp1w) @@ -83,35 +136,25 @@ void dp4(float4 *res, res[3] = dot; } -extern float powf(float num, float p); -extern float sqrtf(float x); - -float4 powvec(float4 vec, float4 q) -{ - float4 p; - p.x = powf(vec.x, q.x); - p.y = powf(vec.y, q.y); - p.z = powf(vec.z, q.z); - p.w = powf(vec.w, q.w); - return p; -} - -void pow(float4 *res, - float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w, - float4 tmp1x, float4 tmp1y, float4 tmp1z, float4 tmp1w) +void lit(float4 *res, + float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w) { - res[0] = powvec(tmp0x, tmp1x); - res[1] = res[0]; - res[2] = res[0]; - res[3] = res[0]; -} + const float4 zerovec = (float4) {0.0, 0.0, 0.0, 0.0}; + const float4 min128 = (float4) {-128.f, -128.f, -128.f, -128.f}; + const float4 plus128 = (float4) {128.f, 128.f, 128.f, 128.f}; -float4 minvec(float4 a, float4 b) -{ - return (float4){(a.x < b.x) ? a.x : b.x, - (a.y < b.y) ? a.y : b.y, - (a.z < b.z) ? a.z : b.z, - (a.w < b.w) ? a.w : b.w}; + res[0] = (float4){1.0, 1.0, 1.0, 1.0}; + if (tmp0x.x > 0) { + float4 tmpy = maxvec(tmp0y, zerovec); + float4 tmpw = minvec(tmp0w, plus128); + tmpw = maxvec(tmpw, min128); + res[1] = tmp0x; + res[2] = powvec(tmpy, tmpw); + } else { + res[1] = zerovec; + res[2] = zerovec; + } + res[3] = (float4){1.0, 1.0, 1.0, 1.0}; } void min(float4 *res, @@ -125,14 +168,6 @@ void min(float4 *res, } -float4 maxvec(float4 a, float4 b) -{ - return (float4){(a.x > b.x) ? a.x : b.x, - (a.y > b.y) ? a.y : b.y, - (a.z > b.z) ? a.z : b.z, - (a.w > b.w) ? a.w : b.w}; -} - void max(float4 *res, float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w, float4 tmp1x, float4 tmp1y, float4 tmp1z, float4 tmp1w) @@ -143,37 +178,14 @@ void max(float4 *res, res[3] = maxvec(tmp0w, tmp1w); } - -void lit(float4 *res, - float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w) -{ - const float4 zerovec = (float4) {0.0, 0.0, 0.0, 0.0}; - const float4 min128 = (float4) {-128.f, -128.f, -128.f, -128.f}; - const float4 plus128 = (float4) {128.f, 128.f, 128.f, 128.f}; - - res[0] = (float4){1.0, 1.0, 1.0, 1.0}; - if (tmp0x.x > 0) { - float4 tmpy = maxvec(tmp0y, zerovec); - float4 tmpw = minvec(tmp0w, plus128); - tmpw = maxvec(tmpw, min128); - res[1] = tmp0x; - res[2] = powvec(tmpy, tmpw); - } else { - res[1] = zerovec; - res[2] = zerovec; - } - res[3] = (float4){1.0, 1.0, 1.0, 1.0}; -} - - -float4 sqrtvec(float4 vec) +void pow(float4 *res, + float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w, + float4 tmp1x, float4 tmp1y, float4 tmp1z, float4 tmp1w) { - float4 p; - p.x = sqrtf(vec.x); - p.y = sqrtf(vec.y); - p.z = sqrtf(vec.z); - p.w = sqrtf(vec.w); - return p; + res[0] = powvec(tmp0x, tmp1x); + res[1] = res[0]; + res[2] = res[0]; + res[3] = res[0]; } void rsq(float4 *res, @@ -185,3 +197,14 @@ void rsq(float4 *res, res[2] = onevec/sqrtvec(absvec(tmp0z)); res[3] = onevec/sqrtvec(absvec(tmp0w)); } + +void slt(float4 *res, + float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w, + float4 tmp1x, float4 tmp1y, float4 tmp1z, float4 tmp1w) +{ + res[0] = sltvec(tmp0x, tmp1x); + res[1] = sltvec(tmp0y, tmp1y); + res[2] = sltvec(tmp0z, tmp1z); + res[3] = sltvec(tmp0w, tmp1w); +} + diff --git a/src/gallium/auxiliary/gallivm/storagesoa.cpp b/src/gallium/auxiliary/gallivm/storagesoa.cpp index 78d754371f..4fc075cf6d 100644 --- a/src/gallium/auxiliary/gallivm/storagesoa.cpp +++ b/src/gallium/auxiliary/gallivm/storagesoa.cpp @@ -93,7 +93,7 @@ void StorageSoa::declareImmediates() std::vector<float> vals(4); std::vector<Constant*> channelArray; - vals[0] = vec[0]; vals[1] = vec[0]; vals[2] = vec[0]; vals[3] = vec[0]; + vals[0] = vec[0]; vals[1] = vec[1]; vals[2] = vec[2]; vals[3] = vec[3]; llvm::Constant *xChannel = createConstGlobalVector(vals); vals[0] = vec[1]; vals[1] = vec[1]; vals[2] = vec[1]; vals[3] = vec[1]; @@ -144,22 +144,43 @@ std::vector<llvm::Value*> StorageSoa::inputElement(llvm::Value *idx) return res; } -std::vector<llvm::Value*> StorageSoa::constElement(llvm::Value *idx) +llvm::Value* StorageSoa::unpackConstElement(llvm::IRBuilder<>* m_builder, llvm::Value* vector, int cc) { - std::vector<llvm::Value*> res(4); + std::vector<llvm::Value*> x(4); + x[0] = m_builder->CreateExtractElement(vector, + constantInt(cc), + name("x")); + + VectorType *vectorType = VectorType::get(Type::FloatTy, 4); + Constant *constVector = Constant::getNullValue(vectorType); + Value *res = m_builder->CreateInsertElement(constVector, x[0], + constantInt(0), + name("vecx")); + res = m_builder->CreateInsertElement(res, x[0], constantInt(1), + name("vecxx")); + res = m_builder->CreateInsertElement(res, x[0], constantInt(2), + name("vecxxx")); + res = m_builder->CreateInsertElement(res, x[0], constantInt(3), + name("vecxxxx")); + return res; +} + +std::vector<llvm::Value*> StorageSoa::constElement(llvm::IRBuilder<>* m_builder, llvm::Value *idx) +{ + llvm::Value* res; + std::vector<llvm::Value*> res2(4); llvm::Value *xChannel, *yChannel, *zChannel, *wChannel; xChannel = elementPointer(m_consts, idx, 0); - yChannel = elementPointer(m_consts, idx, 1); - zChannel = elementPointer(m_consts, idx, 2); - wChannel = elementPointer(m_consts, idx, 3); - res[0] = alignedArrayLoad(xChannel); - res[1] = alignedArrayLoad(yChannel); - res[2] = alignedArrayLoad(zChannel); - res[3] = alignedArrayLoad(wChannel); + res = alignedArrayLoad(xChannel); - return res; + res2[0]=unpackConstElement(m_builder, res,0); + res2[1]=unpackConstElement(m_builder, res,1); + res2[2]=unpackConstElement(m_builder, res,2); + res2[3]=unpackConstElement(m_builder, res,3); + + return res2; } std::vector<llvm::Value*> StorageSoa::outputElement(llvm::Value *idx) @@ -260,6 +281,12 @@ llvm::Module * StorageSoa::currentModule() const return m_block->getParent()->getParent(); } +llvm::Constant * StorageSoa::createConstGlobalFloat(const float val) +{ + Constant*c = ConstantFP::get(APFloat(val)); + return c; +} + llvm::Constant * StorageSoa::createConstGlobalVector(const std::vector<float> &vec) { VectorType *vectorType = VectorType::get(Type::FloatTy, 4); @@ -278,7 +305,7 @@ llvm::Constant * StorageSoa::createConstGlobalVector(const std::vector<float> &v } std::vector<llvm::Value*> StorageSoa::load(enum tgsi_file_type type, int idx, int swizzle, - llvm::Value *indIdx) + llvm::IRBuilder<>* m_builder,llvm::Value *indIdx) { std::vector<llvm::Value*> val(4); @@ -302,7 +329,7 @@ std::vector<llvm::Value*> StorageSoa::load(enum tgsi_file_type type, int idx, in val = tempElement(realIndex); break; case TGSI_FILE_CONSTANT: - val = constElement(realIndex); + val = constElement(m_builder, realIndex); break; case TGSI_FILE_IMMEDIATE: val = immediateElement(realIndex); diff --git a/src/gallium/auxiliary/gallivm/storagesoa.h b/src/gallium/auxiliary/gallivm/storagesoa.h index ae2fc7c6ae..f21ca6ec43 100644 --- a/src/gallium/auxiliary/gallivm/storagesoa.h +++ b/src/gallium/auxiliary/gallivm/storagesoa.h @@ -29,6 +29,7 @@ #define STORAGESOA_H #include <pipe/p_shader_tokens.h> +#include <llvm/Support/IRBuilder.h> #include <vector> #include <list> @@ -56,7 +57,7 @@ public: std::vector<llvm::Value*> load(enum tgsi_file_type type, int idx, int swizzle, - llvm::Value *indIdx =0); + llvm::IRBuilder<>* m_builder, llvm::Value *indIdx =0); void store(enum tgsi_file_type type, int idx, const std::vector<llvm::Value*> &val, int mask); @@ -76,10 +77,12 @@ private: const char *name(const char *prefix) const; llvm::Value *alignedArrayLoad(llvm::Value *val); llvm::Module *currentModule() const; + llvm::Constant *createConstGlobalFloat(const float val); llvm::Constant *createConstGlobalVector(const std::vector<float> &vec); std::vector<llvm::Value*> inputElement(llvm::Value *indIdx); - std::vector<llvm::Value*> constElement(llvm::Value *indIdx); + llvm::Value* unpackConstElement(llvm::IRBuilder<>* m_builder, llvm::Value *indIdx, int cc); + std::vector<llvm::Value*> constElement(llvm::IRBuilder<>* m_builder, llvm::Value *indIdx); std::vector<llvm::Value*> outputElement(llvm::Value *indIdx); std::vector<llvm::Value*> tempElement(llvm::Value *indIdx); std::vector<llvm::Value*> immediateElement(llvm::Value *indIdx); diff --git a/src/gallium/auxiliary/gallivm/tgsitollvm.cpp b/src/gallium/auxiliary/gallivm/tgsitollvm.cpp index cc1516a45e..1191a6cae9 100644 --- a/src/gallium/auxiliary/gallivm/tgsitollvm.cpp +++ b/src/gallium/auxiliary/gallivm/tgsitollvm.cpp @@ -52,7 +52,7 @@ static inline FunctionType *vertexShaderFunctionType() // pass are castable to the following: // [4 x <4 x float>] inputs, // [4 x <4 x float>] output, - // [4 x [4 x float]] consts, + // [4 x [1 x float]] consts, // [4 x <4 x float>] temps std::vector<const Type*> funcArgs; @@ -61,7 +61,7 @@ static inline FunctionType *vertexShaderFunctionType() PointerType *vectorArrayPtr = PointerType::get(vectorArray, 0); ArrayType *floatArray = ArrayType::get(Type::FloatTy, 4); - ArrayType *constsArray = ArrayType::get(floatArray, 4); + ArrayType *constsArray = ArrayType::get(floatArray, 1); PointerType *constsArrayPtr = PointerType::get(constsArray, 0); funcArgs.push_back(vectorArrayPtr);//inputs @@ -246,6 +246,7 @@ translate_instruction(llvm::Module *module, val = storage->constElement(src->SrcRegister.Index, indIdx); } else if (src->SrcRegister.File == TGSI_FILE_INPUT) { val = storage->inputElement(src->SrcRegister.Index, indIdx); + // FIXME we should not be generating elements for temporaries, this creates useless memory writes } else if (src->SrcRegister.File == TGSI_FILE_TEMPORARY) { val = storage->tempElement(src->SrcRegister.Index); } else if (src->SrcRegister.File == TGSI_FILE_OUTPUT) { @@ -286,9 +287,13 @@ translate_instruction(llvm::Module *module, out = instr->rsq(inputs[0]); } break; - case TGSI_OPCODE_EXP: + case TGSI_OPCODE_EXP: { + out = instr->exp(inputs[0]); + } break; - case TGSI_OPCODE_LOG: + case TGSI_OPCODE_LOG: { + out = instr->log(inputs[0]); + } break; case TGSI_OPCODE_MUL: { out = instr->mul(inputs[0], inputs[1]); @@ -338,21 +343,31 @@ translate_instruction(llvm::Module *module, out = instr->lerp(inputs[0], inputs[1], inputs[2]); } break; - case TGSI_OPCODE_CND: + case TGSI_OPCODE_CND: { + out = instr->cnd(inputs[0], inputs[1], inputs[2]); + } break; - case TGSI_OPCODE_CND0: + case TGSI_OPCODE_CND0: { + out = instr->cnd0(inputs[0], inputs[1], inputs[2]); + } break; - case TGSI_OPCODE_DOT2ADD: + case TGSI_OPCODE_DOT2ADD: { + out = instr->dot2add(inputs[0], inputs[1], inputs[2]); + } break; case TGSI_OPCODE_INDEX: break; - case TGSI_OPCODE_NEGATE: + case TGSI_OPCODE_NEGATE: { + out = instr->neg(inputs[0]); + } break; case TGSI_OPCODE_FRAC: { out = instr->frc(inputs[0]); } break; - case TGSI_OPCODE_CLAMP: + case TGSI_OPCODE_CLAMP: { + out = instr->clamp(inputs[0]); + } break; case TGSI_OPCODE_FLOOR: { out = instr->floor(inputs[0]); @@ -392,9 +407,13 @@ translate_instruction(llvm::Module *module, out = instr->cos(inputs[0]); } break; - case TGSI_OPCODE_DDX: + case TGSI_OPCODE_DDX: { + out = instr->ddx(inputs[0]); + } break; - case TGSI_OPCODE_DDY: + case TGSI_OPCODE_DDY: { + out = instr->ddy(inputs[0]); + } break; case TGSI_OPCODE_KILP: break; @@ -408,9 +427,13 @@ translate_instruction(llvm::Module *module, break; case TGSI_OPCODE_RFL: break; - case TGSI_OPCODE_SEQ: + case TGSI_OPCODE_SEQ: { + out = instr->seq(inputs[0], inputs[1]); + } break; - case TGSI_OPCODE_SFL: + case TGSI_OPCODE_SFL: { + out = instr->sfl(inputs[0], inputs[1]); + } break; case TGSI_OPCODE_SGT: { out = instr->sgt(inputs[0], inputs[1]); @@ -420,11 +443,17 @@ translate_instruction(llvm::Module *module, out = instr->sin(inputs[0]); } break; - case TGSI_OPCODE_SLE: + case TGSI_OPCODE_SLE: { + out = instr->sle(inputs[0], inputs[1]); + } break; - case TGSI_OPCODE_SNE: + case TGSI_OPCODE_SNE: { + out = instr->sne(inputs[0], inputs[1]); + } break; - case TGSI_OPCODE_STR: + case TGSI_OPCODE_STR: { + out = instr->str(inputs[0], inputs[1]); + } break; case TGSI_OPCODE_TEX: break; @@ -438,7 +467,9 @@ translate_instruction(llvm::Module *module, break; case TGSI_OPCODE_UP4UB: break; - case TGSI_OPCODE_X2D: + case TGSI_OPCODE_X2D: { + out = instr->x2d(inputs[0], inputs[1], inputs[2]); + } break; case TGSI_OPCODE_ARA: break; @@ -468,11 +499,18 @@ translate_instruction(llvm::Module *module, break; case TGSI_OPCODE_TXB: break; - case TGSI_OPCODE_NRM: + case TGSI_OPCODE_NRM4: + case TGSI_OPCODE_NRM: { + out = instr->nrm(inputs[0]); + } break; - case TGSI_OPCODE_DIV: + case TGSI_OPCODE_DIV: { + out = instr->div(inputs[0], inputs[1]); + } break; - case TGSI_OPCODE_DP2: + case TGSI_OPCODE_DP2: { + out = instr->dp2(inputs[0], inputs[1]); + } break; case TGSI_OPCODE_TXL: break; @@ -590,8 +628,6 @@ translate_instruction(llvm::Module *module, break; case TGSI_OPCODE_M3X2: break; - case TGSI_OPCODE_NRM4: - break; case TGSI_OPCODE_CALLNZ: break; case TGSI_OPCODE_IFC: @@ -641,6 +677,7 @@ translate_instruction(llvm::Module *module, if (dst->DstRegister.File == TGSI_FILE_OUTPUT) { storage->setOutputElement(dst->DstRegister.Index, out, dst->DstRegister.WriteMask); + // FIXME we should not be generating elements for temporaries, this creates useless memory writes } else if (dst->DstRegister.File == TGSI_FILE_TEMPORARY) { storage->setTempElement(dst->DstRegister.Index, out, dst->DstRegister.WriteMask); } else if (dst->DstRegister.File == TGSI_FILE_ADDRESS) { @@ -672,9 +709,8 @@ translate_instructionir(llvm::Module *module, if (src->SrcRegister.Indirect) { indIdx = storage->addrElement(src->SrcRegisterInd.Index); } - val = storage->load((enum tgsi_file_type)src->SrcRegister.File, - src->SrcRegister.Index, swizzle, indIdx); + src->SrcRegister.Index, swizzle, instr->getIRBuilder(), indIdx); inputs[i] = val; } @@ -732,6 +768,7 @@ translate_instructionir(llvm::Module *module, } break; case TGSI_OPCODE_SLT: { + out = instr->slt(inputs[0], inputs[1]); } break; case TGSI_OPCODE_SGE: { @@ -989,7 +1026,6 @@ translate_instructionir(llvm::Module *module, /* store results */ for (int i = 0; i < inst->Instruction.NumDstRegs; ++i) { struct tgsi_full_dst_register *dst = &inst->FullDstRegisters[i]; - storage->store((enum tgsi_file_type)dst->DstRegister.File, dst->DstRegister.Index, out, dst->DstRegister.WriteMask); } diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c index 491141f190..dea1aed032 100644 --- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c +++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c @@ -164,6 +164,27 @@ rem_prefix(const char *longname) } +static const char * +reg_name(int reg) +{ + switch (reg) { + case SPE_REG_SP: + return "$sp"; + case SPE_REG_RA: + return "$lr"; + default: + { + /* cycle through four buffers to handle multiple calls per printf */ + static char buf[4][10]; + static int b = 0; + b = (b + 1) % 4; + sprintf(buf[b], "$%d", reg); + return buf[b]; + } + } +} + + static void emit_RR(struct spe_function *p, unsigned op, unsigned rT, unsigned rA, unsigned rB, const char *name) { @@ -176,7 +197,8 @@ static void emit_RR(struct spe_function *p, unsigned op, unsigned rT, assert(p->num_inst <= p->max_inst); if (p->print) { indent(p); - printf("%s\t$%d, $%d, $%d\n", rem_prefix(name), rT, rA, rB); + printf("%s\t%s, %s, %s\n", + rem_prefix(name), reg_name(rT), reg_name(rA), reg_name(rB)); } } @@ -194,7 +216,8 @@ static void emit_RRR(struct spe_function *p, unsigned op, unsigned rT, assert(p->num_inst <= p->max_inst); if (p->print) { indent(p); - printf("%s\t$%d, $%d, $%d, $%d\n", rem_prefix(name), rT, rA, rB, rC); + printf("%s\t%s, %s, %s, %s\n", rem_prefix(name), reg_name(rT), + reg_name(rA), reg_name(rB), reg_name(rC)); } } @@ -211,7 +234,8 @@ static void emit_RI7(struct spe_function *p, unsigned op, unsigned rT, assert(p->num_inst <= p->max_inst); if (p->print) { indent(p); - printf("%s\t$%d, $%d, 0x%x\n", rem_prefix(name), rT, rA, imm); + printf("%s\t%s, %s, 0x%x\n", + rem_prefix(name), reg_name(rT), reg_name(rA), imm); } } @@ -229,7 +253,8 @@ static void emit_RI8(struct spe_function *p, unsigned op, unsigned rT, assert(p->num_inst <= p->max_inst); if (p->print) { indent(p); - printf("%s\t$%d, $%d, 0x%x\n", rem_prefix(name), rT, rA, imm); + printf("%s\t%s, %s, 0x%x\n", + rem_prefix(name), reg_name(rT), reg_name(rA), imm); } } @@ -247,15 +272,22 @@ static void emit_RI10(struct spe_function *p, unsigned op, unsigned rT, assert(p->num_inst <= p->max_inst); if (p->print) { indent(p); - if (strcmp(name, "spe_lqd") == 0 || - strcmp(name, "spe_stqd") == 0) - printf("%s\t$%d, 0x%x($%d)\n", rem_prefix(name), rT, imm, rA); - else - printf("%s\t$%d, $%d, 0x%x\n", rem_prefix(name), rT, rA, imm); + printf("%s\t%s, %s, 0x%x\n", + rem_prefix(name), reg_name(rT), reg_name(rA), imm); } } +/** As above, but do range checking on signed immediate value */ +static void emit_RI10s(struct spe_function *p, unsigned op, unsigned rT, + unsigned rA, int imm, const char *name) +{ + assert(imm <= 511); + assert(imm >= -512); + emit_RI10(p, op, rT, rA, imm, name); +} + + static void emit_RI16(struct spe_function *p, unsigned op, unsigned rT, int imm, const char *name) { @@ -267,7 +299,7 @@ static void emit_RI16(struct spe_function *p, unsigned op, unsigned rT, assert(p->num_inst <= p->max_inst); if (p->print) { indent(p); - printf("%s\t$%d, 0x%x\n", rem_prefix(name), rT, imm); + printf("%s\t%s, 0x%x\n", rem_prefix(name), reg_name(rT), imm); } } @@ -283,7 +315,7 @@ static void emit_RI18(struct spe_function *p, unsigned op, unsigned rT, assert(p->num_inst <= p->max_inst); if (p->print) { indent(p); - printf("%s\t$%d, 0x%x\n", rem_prefix(name), rT, imm); + printf("%s\t%s, 0x%x\n", rem_prefix(name), reg_name(rT), imm); } } @@ -332,6 +364,12 @@ void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \ emit_RI10(p, _op, rT, rA, imm, __FUNCTION__); \ } +#define EMIT_RI10s(_name, _op) \ +void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \ +{ \ + emit_RI10s(p, _op, rT, rA, imm, __FUNCTION__); \ +} + #define EMIT_RI16(_name, _op) \ void _name (struct spe_function *p, unsigned rT, int imm) \ { \ @@ -353,20 +391,28 @@ void _name (struct spe_function *p, int imm) \ #include "rtasm_ppc_spe.h" + /** * Initialize an spe_function. * \param code_size size of instruction buffer to allocate, in bytes. */ void spe_init_func(struct spe_function *p, unsigned code_size) { + unsigned int i; + p->store = align_malloc(code_size, 16); p->num_inst = 0; p->max_inst = code_size / SPE_INST_SIZE; + p->set_count = 0; + memset(p->regs, 0, SPE_NUM_REGS * sizeof(p->regs[0])); + /* Conservatively treat R0 - R2 and R80 - R127 as non-volatile. */ - p->regs[0] = ~7; - p->regs[1] = (1U << (80 - 64)) - 1; + p->regs[0] = p->regs[1] = p->regs[2] = 1; + for (i = 80; i <= 127; i++) { + p->regs[i] = 1; + } p->print = false; p->indent = 0; @@ -398,12 +444,8 @@ int spe_allocate_available_register(struct spe_function *p) { unsigned i; for (i = 0; i < SPE_NUM_REGS; i++) { - const uint64_t mask = (1ULL << (i % 64)); - const unsigned idx = i / 64; - - assert(idx < 2); - if ((p->regs[idx] & mask) != 0) { - p->regs[idx] &= ~mask; + if (p->regs[i] == 0) { + p->regs[i] = 1; return i; } } @@ -417,31 +459,84 @@ int spe_allocate_available_register(struct spe_function *p) */ int spe_allocate_register(struct spe_function *p, int reg) { - const unsigned idx = reg / 64; - const unsigned bit = reg % 64; - assert(reg < SPE_NUM_REGS); - assert((p->regs[idx] & (1ULL << bit)) != 0); - - p->regs[idx] &= ~(1ULL << bit); + assert(p->regs[reg] == 0); + p->regs[reg] = 1; return reg; } /** - * Mark the given SPE register as "unallocated". + * Mark the given SPE register as "unallocated". Note that this should + * only be used on registers allocated in the current register set; an + * assertion will fail if an attempt is made to deallocate a register + * allocated in an earlier register set. */ void spe_release_register(struct spe_function *p, int reg) { - const unsigned idx = reg / 64; - const unsigned bit = reg % 64; + assert(reg < SPE_NUM_REGS); + assert(p->regs[reg] == 1); - assert(idx < 2); + p->regs[reg] = 0; +} - assert(reg < SPE_NUM_REGS); - assert((p->regs[idx] & (1ULL << bit)) == 0); +/** + * Start a new set of registers. This can be called if + * it will be difficult later to determine exactly what + * registers were actually allocated during a code generation + * sequence, and you really just want to deallocate all of them. + */ +void spe_allocate_register_set(struct spe_function *p) +{ + unsigned int i; - p->regs[idx] |= (1ULL << bit); + /* Keep track of the set count. If it ever wraps around to 0, + * we're in trouble. + */ + p->set_count++; + assert(p->set_count > 0); + + /* Increment the allocation count of all registers currently + * allocated. Then any registers that are allocated in this set + * will be the only ones with a count of 1; they'll all be released + * when the register set is released. + */ + for (i = 0; i < SPE_NUM_REGS; i++) { + if (p->regs[i] > 0) + p->regs[i]++; + } +} + +void spe_release_register_set(struct spe_function *p) +{ + unsigned int i; + + /* If the set count drops below zero, we're in trouble. */ + assert(p->set_count > 0); + p->set_count--; + + /* Drop the allocation level of all registers. Any allocated + * during this register set will drop to 0 and then become + * available. + */ + for (i = 0; i < SPE_NUM_REGS; i++) { + if (p->regs[i] > 0) + p->regs[i]--; + } +} + + +unsigned +spe_get_registers_used(const struct spe_function *p, ubyte used[]) +{ + unsigned i, num = 0; + /* only count registers in the range available to callers */ + for (i = 2; i < 80; i++) { + if (p->regs[i]) { + used[num++] = i; + } + } + return num; } @@ -459,7 +554,7 @@ spe_indent(struct spe_function *p, int spaces) } -extern void +void spe_comment(struct spe_function *p, int rel_indent, const char *s) { if (p->print) { @@ -472,6 +567,56 @@ spe_comment(struct spe_function *p, int rel_indent, const char *s) /** + * Load quad word. + * NOTE: offset is in bytes and the least significant 4 bits must be zero! + */ +void spe_lqd(struct spe_function *p, unsigned rT, unsigned rA, int offset) +{ + const boolean pSave = p->print; + + /* offset must be a multiple of 16 */ + assert(offset % 16 == 0); + /* offset must fit in 10-bit signed int field, after shifting */ + assert((offset >> 4) <= 511); + assert((offset >> 4) >= -512); + + p->print = FALSE; + emit_RI10(p, 0x034, rT, rA, offset >> 4, "spe_lqd"); + p->print = pSave; + + if (p->print) { + indent(p); + printf("lqd\t%s, %d(%s)\n", reg_name(rT), offset, reg_name(rA)); + } +} + + +/** + * Store quad word. + * NOTE: offset is in bytes and the least significant 4 bits must be zero! + */ +void spe_stqd(struct spe_function *p, unsigned rT, unsigned rA, int offset) +{ + const boolean pSave = p->print; + + /* offset must be a multiple of 16 */ + assert(offset % 16 == 0); + /* offset must fit in 10-bit signed int field, after shifting */ + assert((offset >> 4) <= 511); + assert((offset >> 4) >= -512); + + p->print = FALSE; + emit_RI10(p, 0x024, rT, rA, offset >> 4, "spe_stqd"); + p->print = pSave; + + if (p->print) { + indent(p); + printf("stqd\t%s, %d(%s)\n", reg_name(rT), offset, reg_name(rA)); + } +} + + +/** * For branch instructions: * \param d if 1, disable interupts if branch is taken * \param e if 1, enable interupts if branch is taken @@ -603,22 +748,187 @@ void spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui) { /* If the whole value is in the lower 18 bits, use ila, which * doesn't sign-extend. Otherwise, if the two halfwords of - * the constant are identical, use ilh. Otherwise, we have - * to use ilhu followed by iohl. + * the constant are identical, use ilh. Otherwise, if every byte of + * the desired value is 0x00 or 0xff, we can use Form Select Mask for + * Bytes Immediate (fsmbi) to load the value in a single instruction. + * Otherwise, in the general case, we have to use ilhu followed by iohl. */ - if ((ui & 0xfffc0000) == ui) { + if ((ui & 0x0003ffff) == ui) { spe_ila(p, rT, ui); } else if ((ui >> 16) == (ui & 0xffff)) { spe_ilh(p, rT, ui & 0xffff); } + else if ( + ((ui & 0x000000ff) == 0 || (ui & 0x000000ff) == 0x000000ff) && + ((ui & 0x0000ff00) == 0 || (ui & 0x0000ff00) == 0x0000ff00) && + ((ui & 0x00ff0000) == 0 || (ui & 0x00ff0000) == 0x00ff0000) && + ((ui & 0xff000000) == 0 || (ui & 0xff000000) == 0xff000000) + ) { + unsigned int mask = 0; + /* fsmbi duplicates each bit in the given mask eight times, + * using a 16-bit value to initialize a 16-byte quadword. + * Each 4-bit nybble of the mask corresponds to a full word + * of the result; look at the value and figure out the mask + * (replicated for each word in the quadword), and then + * form the "select mask" to get the value. + */ + if ((ui & 0x000000ff) == 0x000000ff) mask |= 0x1111; + if ((ui & 0x0000ff00) == 0x0000ff00) mask |= 0x2222; + if ((ui & 0x00ff0000) == 0x00ff0000) mask |= 0x4444; + if ((ui & 0xff000000) == 0xff000000) mask |= 0x8888; + spe_fsmbi(p, rT, mask); + } else { + /* The general case: this usually uses two instructions, but + * may use only one if the low-order 16 bits of each word are 0. + */ spe_ilhu(p, rT, ui >> 16); if (ui & 0xffff) spe_iohl(p, rT, ui & 0xffff); } } +/** + * This function is constructed identically to spe_xor_uint() below. + * Changes to one should be made in the other. + */ +void +spe_and_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui) +{ + /* If we can, emit a single instruction, either And Byte Immediate + * (which uses the same constant across each byte), And Halfword Immediate + * (which sign-extends a 10-bit immediate to 16 bits and uses that + * across each halfword), or And Word Immediate (which sign-extends + * a 10-bit immediate to 32 bits). + * + * Otherwise, we'll need to use a temporary register. + */ + unsigned int tmp; + + /* If the upper 23 bits are all 0s or all 1s, sign extension + * will work and we can use And Word Immediate + */ + tmp = ui & 0xfffffe00; + if (tmp == 0xfffffe00 || tmp == 0) { + spe_andi(p, rT, rA, ui & 0x000003ff); + return; + } + + /* If the ui field is symmetric along halfword boundaries and + * the upper 7 bits of each halfword are all 0s or 1s, we + * can use And Halfword Immediate + */ + tmp = ui & 0xfe00fe00; + if ((tmp == 0xfe00fe00 || tmp == 0) && ((ui >> 16) == (ui & 0x0000ffff))) { + spe_andhi(p, rT, rA, ui & 0x000003ff); + return; + } + + /* If the ui field is symmetric in each byte, then we can use + * the And Byte Immediate instruction. + */ + tmp = ui & 0x000000ff; + if ((ui >> 24) == tmp && ((ui >> 16) & 0xff) == tmp && ((ui >> 8) & 0xff) == tmp) { + spe_andbi(p, rT, rA, tmp); + return; + } + + /* Otherwise, we'll have to use a temporary register. */ + unsigned int tmp_reg = spe_allocate_available_register(p); + spe_load_uint(p, tmp_reg, ui); + spe_and(p, rT, rA, tmp_reg); + spe_release_register(p, tmp_reg); +} + + +/** + * This function is constructed identically to spe_and_uint() above. + * Changes to one should be made in the other. + */ +void +spe_xor_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui) +{ + /* If we can, emit a single instruction, either Exclusive Or Byte + * Immediate (which uses the same constant across each byte), Exclusive + * Or Halfword Immediate (which sign-extends a 10-bit immediate to + * 16 bits and uses that across each halfword), or Exclusive Or Word + * Immediate (which sign-extends a 10-bit immediate to 32 bits). + * + * Otherwise, we'll need to use a temporary register. + */ + unsigned int tmp; + + /* If the upper 23 bits are all 0s or all 1s, sign extension + * will work and we can use Exclusive Or Word Immediate + */ + tmp = ui & 0xfffffe00; + if (tmp == 0xfffffe00 || tmp == 0) { + spe_xori(p, rT, rA, ui & 0x000003ff); + return; + } + + /* If the ui field is symmetric along halfword boundaries and + * the upper 7 bits of each halfword are all 0s or 1s, we + * can use Exclusive Or Halfword Immediate + */ + tmp = ui & 0xfe00fe00; + if ((tmp == 0xfe00fe00 || tmp == 0) && ((ui >> 16) == (ui & 0x0000ffff))) { + spe_xorhi(p, rT, rA, ui & 0x000003ff); + return; + } + + /* If the ui field is symmetric in each byte, then we can use + * the Exclusive Or Byte Immediate instruction. + */ + tmp = ui & 0x000000ff; + if ((ui >> 24) == tmp && ((ui >> 16) & 0xff) == tmp && ((ui >> 8) & 0xff) == tmp) { + spe_xorbi(p, rT, rA, tmp); + return; + } + + /* Otherwise, we'll have to use a temporary register. */ + unsigned int tmp_reg = spe_allocate_available_register(p); + spe_load_uint(p, tmp_reg, ui); + spe_xor(p, rT, rA, tmp_reg); + spe_release_register(p, tmp_reg); +} + +void +spe_compare_equal_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui) +{ + /* If the comparison value is 9 bits or less, it fits inside a + * Compare Equal Word Immediate instruction. + */ + if ((ui & 0x000001ff) == ui) { + spe_ceqi(p, rT, rA, ui); + } + /* Otherwise, we're going to have to load a word first. */ + else { + unsigned int tmp_reg = spe_allocate_available_register(p); + spe_load_uint(p, tmp_reg, ui); + spe_ceq(p, rT, rA, tmp_reg); + spe_release_register(p, tmp_reg); + } +} + +void +spe_compare_greater_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui) +{ + /* If the comparison value is 10 bits or less, it fits inside a + * Compare Logical Greater Than Word Immediate instruction. + */ + if ((ui & 0x000003ff) == ui) { + spe_clgti(p, rT, rA, ui); + } + /* Otherwise, we're going to have to load a word first. */ + else { + unsigned int tmp_reg = spe_allocate_available_register(p); + spe_load_uint(p, tmp_reg, ui); + spe_clgt(p, rT, rA, tmp_reg); + spe_release_register(p, tmp_reg); + } +} void spe_splat(struct spe_function *p, unsigned rT, unsigned rA) diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h index 61c7edeb60..d6a3c02f20 100644 --- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h +++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h @@ -53,17 +53,26 @@ struct spe_function uint num_inst; uint max_inst; - /** - * Mask of used / unused registers - * - * Each set bit corresponds to an available register. Each cleared bit - * corresponds to an allocated register. + /** + * The "set count" reflects the number of nested register sets + * are allowed. In the unlikely case that we exceed the set count, + * register allocation will start to be confused, which is critical + * enough that we check for it. + */ + unsigned char set_count; + + /** + * Flags for used and unused registers. Each byte corresponds to a + * register; a 0 in that byte means that the register is available. + * A value of 1 means that the register was allocated in the current + * register set. Any other value N means that the register was allocated + * N register sets ago. * * \sa * spe_allocate_register, spe_allocate_available_register, - * spe_release_register + * spe_allocate_register_set, spe_release_register_set, spe_release_register, */ - uint64_t regs[SPE_NUM_REGS / 64]; + unsigned char regs[SPE_NUM_REGS]; boolean print; /**< print/dump instructions as they're emitted? */ int indent; /**< number of spaces to indent */ @@ -77,6 +86,11 @@ extern unsigned spe_code_size(const struct spe_function *p); extern int spe_allocate_available_register(struct spe_function *p); extern int spe_allocate_register(struct spe_function *p, int reg); extern void spe_release_register(struct spe_function *p, int reg); +extern void spe_allocate_register_set(struct spe_function *p); +extern void spe_release_register_set(struct spe_function *p); + +extern unsigned +spe_get_registers_used(const struct spe_function *p, ubyte used[]); extern void spe_print_code(struct spe_function *p, boolean enable); extern void spe_indent(struct spe_function *p, int spaces); @@ -105,6 +119,9 @@ extern void spe_comment(struct spe_function *p, int rel_indent, const char *s); #define EMIT_RI10(_name, _op) \ extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \ int imm) +#define EMIT_RI10s(_name, _op) \ + extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \ + int imm) #define EMIT_RI16(_name, _op) \ extern void _name (struct spe_function *p, unsigned rT, int imm) #define EMIT_RI18(_name, _op) \ @@ -117,11 +134,9 @@ extern void spe_comment(struct spe_function *p, int rel_indent, const char *s); /* Memory load / store instructions */ -EMIT_RI10(spe_lqd, 0x034); EMIT_RR (spe_lqx, 0x1c4); EMIT_RI16(spe_lqa, 0x061); EMIT_RI16(spe_lqr, 0x067); -EMIT_RI10(spe_stqd, 0x024); EMIT_RR (spe_stqx, 0x144); EMIT_RI16(spe_stqa, 0x041); EMIT_RI16(spe_stqr, 0x047); @@ -151,7 +166,7 @@ EMIT_RI16(spe_fsmbi, 0x065); EMIT_RR (spe_ah, 0x0c8); EMIT_RI10(spe_ahi, 0x01d); EMIT_RR (spe_a, 0x0c0); -EMIT_RI10(spe_ai, 0x01c); +EMIT_RI10s(spe_ai, 0x01c); EMIT_RR (spe_sfh, 0x048); EMIT_RI10(spe_sfhi, 0x00d); EMIT_RR (spe_sf, 0x040); @@ -189,19 +204,19 @@ EMIT_R (spe_xshw, 0x2ae); EMIT_R (spe_xswd, 0x2a6); EMIT_RR (spe_and, 0x0c1); EMIT_RR (spe_andc, 0x2c1); -EMIT_RI10(spe_andbi, 0x016); -EMIT_RI10(spe_andhi, 0x015); -EMIT_RI10(spe_andi, 0x014); +EMIT_RI10s(spe_andbi, 0x016); +EMIT_RI10s(spe_andhi, 0x015); +EMIT_RI10s(spe_andi, 0x014); EMIT_RR (spe_or, 0x041); EMIT_RR (spe_orc, 0x2c9); -EMIT_RI10(spe_orbi, 0x006); -EMIT_RI10(spe_orhi, 0x005); -EMIT_RI10(spe_ori, 0x004); +EMIT_RI10s(spe_orbi, 0x006); +EMIT_RI10s(spe_orhi, 0x005); +EMIT_RI10s(spe_ori, 0x004); EMIT_R (spe_orx, 0x1f0); EMIT_RR (spe_xor, 0x241); -EMIT_RI10(spe_xorbi, 0x026); -EMIT_RI10(spe_xorhi, 0x025); -EMIT_RI10(spe_xori, 0x024); +EMIT_RI10s(spe_xorbi, 0x026); +EMIT_RI10s(spe_xorhi, 0x025); +EMIT_RI10s(spe_xori, 0x024); EMIT_RR (spe_nand, 0x0c9); EMIT_RR (spe_nor, 0x049); EMIT_RR (spe_eqv, 0x249); @@ -279,6 +294,12 @@ EMIT_RI16(spe_brz, 0x040); EMIT_RI16(spe_brhnz, 0x046); EMIT_RI16(spe_brhz, 0x044); +extern void +spe_lqd(struct spe_function *p, unsigned rT, unsigned rA, int offset); + +extern void +spe_stqd(struct spe_function *p, unsigned rT, unsigned rA, int offset); + extern void spe_bi(struct spe_function *p, unsigned rA, int d, int e); extern void spe_iret(struct spe_function *p, unsigned rA, int d, int e); extern void spe_bisled(struct spe_function *p, unsigned rT, unsigned rA, @@ -307,6 +328,22 @@ spe_load_int(struct spe_function *p, unsigned rT, int i); extern void spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui); +/** And immediate value into rT. */ +extern void +spe_and_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui); + +/** Xor immediate value into rT. */ +extern void +spe_xor_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui); + +/** Compare equal with immediate value. */ +extern void +spe_compare_equal_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui); + +/** Compare greater with immediate value. */ +extern void +spe_compare_greater_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui); + /** Replicate word 0 of rA across rT. */ extern void spe_splat(struct spe_function *p, unsigned rT, unsigned rA); @@ -388,6 +425,7 @@ EMIT_R (spe_wrch, 0x10d); #undef EMIT_RI7 #undef EMIT_RI8 #undef EMIT_RI10 +#undef EMIT_RI10s #undef EMIT_RI16 #undef EMIT_RI18 #undef EMIT_I16 diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c index 3bba9dcc07..99ee74cf14 100644 --- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c +++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c @@ -371,7 +371,11 @@ void x86_jcc( struct x86_function *p, DUMP_I(cc); if (offset < 0) { - assert(p->csr - p->store > -offset); + /*assert(p->csr - p->store > -offset);*/ + if (p->csr - p->store <= -offset) { + /* probably out of memory (using the error_overflow buffer) */ + return; + } } if (offset <= 127 && offset >= -128) { @@ -675,6 +679,44 @@ void x86_and( struct x86_function *p, * SSE instructions */ +void sse_prefetchnta( struct x86_function *p, struct x86_reg ptr) +{ + DUMP_R( ptr ); + assert(ptr.mod != mod_REG); + emit_2ub(p, 0x0f, 0x18); + emit_modrm_noreg(p, 0, ptr); +} + +void sse_prefetch0( struct x86_function *p, struct x86_reg ptr) +{ + DUMP_R( ptr ); + assert(ptr.mod != mod_REG); + emit_2ub(p, 0x0f, 0x18); + emit_modrm_noreg(p, 1, ptr); +} + +void sse_prefetch1( struct x86_function *p, struct x86_reg ptr) +{ + DUMP_R( ptr ); + assert(ptr.mod != mod_REG); + emit_2ub(p, 0x0f, 0x18); + emit_modrm_noreg(p, 2, ptr); +} + +void sse_movntps( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src) +{ + DUMP_RR( dst, src ); + + assert(dst.mod != mod_REG); + assert(src.mod == mod_REG); + emit_2ub(p, 0x0f, 0x2b); + emit_modrm(p, src, dst); +} + + + void sse_movss( struct x86_function *p, struct x86_reg dst, diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h index 510aa1b0de..1b5eaaca85 100644 --- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h +++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h @@ -185,6 +185,13 @@ void sse2_pshufd( struct x86_function *p, struct x86_reg dest, struct x86_reg ar void sse2_rcpps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void sse2_rcpss( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); + +void sse_prefetchnta( struct x86_function *p, struct x86_reg ptr); +void sse_prefetch0( struct x86_function *p, struct x86_reg ptr); +void sse_prefetch1( struct x86_function *p, struct x86_reg ptr); + +void sse_movntps( struct x86_function *p, struct x86_reg dst, struct x86_reg src); + void sse_addps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void sse_addss( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void sse_cvtps2pi( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/tgsi_sse2.c index 4fdad3a5c7..f79170b9d6 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_sse2.c +++ b/src/gallium/auxiliary/tgsi/tgsi_sse2.c @@ -25,6 +25,10 @@ * **************************************************************************/ +#include "pipe/p_config.h" + +#if defined(PIPE_ARCH_X86) && defined(PIPE_ARCH_SSE) + #include "pipe/p_debug.h" #include "pipe/p_shader_tokens.h" #include "util/u_math.h" @@ -36,8 +40,6 @@ #include "rtasm/rtasm_x86sse.h" -#ifdef PIPE_ARCH_X86 - /* for 1/sqrt() * * This costs about 100fps (close to 10%) in gears: diff --git a/src/gallium/auxiliary/util/Makefile b/src/gallium/auxiliary/util/Makefile index d3951e4e7d..b3d1045a8f 100644 --- a/src/gallium/auxiliary/util/Makefile +++ b/src/gallium/auxiliary/util/Makefile @@ -10,6 +10,7 @@ C_SOURCES = \ u_gen_mipmap.c \ u_handle_table.c \ u_hash_table.c \ + u_keymap.c \ u_math.c \ u_mm.c \ u_rect.c \ diff --git a/src/gallium/auxiliary/util/SConscript b/src/gallium/auxiliary/util/SConscript index e65c17b1cc..8a04955a16 100644 --- a/src/gallium/auxiliary/util/SConscript +++ b/src/gallium/auxiliary/util/SConscript @@ -11,13 +11,14 @@ util = env.ConvenienceLibrary( 'u_gen_mipmap.c', 'u_handle_table.c', 'u_hash_table.c', + 'u_keymap.c', 'u_math.c', 'u_mm.c', 'u_rect.c', 'u_simple_shaders.c', 'u_snprintf.c', - 'u_stream_stdc.c', - 'u_stream_wd.c', + 'u_stream_stdc.c', + 'u_stream_wd.c', 'u_tile.c', 'u_time.c', ]) diff --git a/src/gallium/auxiliary/util/u_keymap.c b/src/gallium/auxiliary/util/u_keymap.c new file mode 100644 index 0000000000..01b17ddb1b --- /dev/null +++ b/src/gallium/auxiliary/util/u_keymap.c @@ -0,0 +1,309 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * Key lookup/associative container. + * + * Like Jose's u_hash_table, based on CSO cache code for now. + * + * Author: Brian Paul + */ + + +#include "pipe/p_compiler.h" +#include "pipe/p_debug.h" +#include "pipe/p_error.h" + +#include "cso_cache/cso_hash.h" + +#include "util/u_memory.h" +#include "util/u_keymap.h" + + +struct keymap +{ + struct cso_hash *cso; + unsigned key_size; + unsigned max_entries; /* XXX not obeyed net */ + unsigned num_entries; + keymap_delete_func delete_func; +}; + + +struct keymap_item +{ + void *key, *value; +}; + + +/** + * This the default key-delete function used when the client doesn't + * provide one. + */ +static void +default_delete_func(const struct keymap *map, + const void *key, void *data, void *user) +{ + FREE((void*) data); +} + + +static INLINE struct keymap_item * +hash_table_item(struct cso_hash_iter iter) +{ + return (struct keymap_item *) cso_hash_iter_data(iter); +} + + +/** + * Return 4-byte hash key for a block of bytes. + */ +static unsigned +hash(const void *key, unsigned keySize) +{ + unsigned i, hash; + + keySize /= 4; /* convert from bytes to uints */ + + hash = 0; + for (i = 0; i < keySize; i++) { + hash ^= (i + 1) * ((const unsigned *) key)[i]; + } + + /*hash = hash ^ (hash >> 11) ^ (hash >> 22);*/ + + return hash; +} + + +/** + * Create a new map. + * \param keySize size of the keys in bytes + * \param maxEntries max number of entries to allow (~0 = infinity) + * \param deleteFunc optional callback to call when entries + * are deleted/replaced + */ +struct keymap * +util_new_keymap(unsigned keySize, unsigned maxEntries, + keymap_delete_func deleteFunc) +{ + struct keymap *map = MALLOC_STRUCT(keymap); + if (!map) + return NULL; + + map->cso = cso_hash_create(); + if (!map->cso) { + FREE(map); + return NULL; + } + + map->max_entries = maxEntries; + map->num_entries = 0; + map->key_size = keySize; + map->delete_func = deleteFunc ? deleteFunc : default_delete_func; + + return map; +} + + +/** + * Delete/free a keymap and all entries. The deleteFunc that was given at + * create time will be called for each entry. + * \param user user-provided pointer passed through to the delete callback + */ +void +util_delete_keymap(struct keymap *map, void *user) +{ + util_keymap_remove_all(map, user); + cso_hash_delete(map->cso); + FREE(map); +} + + +static INLINE struct cso_hash_iter +hash_table_find_iter(const struct keymap *map, const void *key, + unsigned key_hash) +{ + struct cso_hash_iter iter; + struct keymap_item *item; + + iter = cso_hash_find(map->cso, key_hash); + while (!cso_hash_iter_is_null(iter)) { + item = (struct keymap_item *) cso_hash_iter_data(iter); + if (!memcmp(item->key, key, map->key_size)) + break; + iter = cso_hash_iter_next(iter); + } + + return iter; +} + + +static INLINE struct keymap_item * +hash_table_find_item(const struct keymap *map, const void *key, + unsigned key_hash) +{ + struct cso_hash_iter iter = hash_table_find_iter(map, key, key_hash); + if (cso_hash_iter_is_null(iter)) { + return NULL; + } + else { + return hash_table_item(iter); + } +} + + +/** + * Insert a new key + data pointer into the table. + * Note: we create a copy of the key, but not the data! + * If the key is already present in the table, replace the existing + * entry (calling the delete callback on the previous entry). + * If the maximum capacity of the map is reached an old entry + * will be deleted (the delete callback will be called). + */ +boolean +util_keymap_insert(struct keymap *map, const void *key, + const void *data, void *user) +{ + unsigned key_hash; + struct keymap_item *item; + struct cso_hash_iter iter; + + assert(map); + + key_hash = hash(key, map->key_size); + + item = hash_table_find_item(map, key, key_hash); + if (item) { + /* call delete callback for old entry/item */ + map->delete_func(map, item->key, item->value, user); + item->value = (void *) data; + return TRUE; + } + + item = MALLOC_STRUCT(keymap_item); + if (!item) + return FALSE; + + item->key = mem_dup(key, map->key_size); + item->value = (void *) data; + + iter = cso_hash_insert(map->cso, key_hash, item); + if (cso_hash_iter_is_null(iter)) { + FREE(item); + return FALSE; + } + + map->num_entries++; + + return TRUE; +} + + +/** + * Look up a key in the map and return the associated data pointer. + */ +const void * +util_keymap_lookup(const struct keymap *map, const void *key) +{ + unsigned key_hash; + struct keymap_item *item; + + assert(map); + + key_hash = hash(key, map->key_size); + + item = hash_table_find_item(map, key, key_hash); + if (!item) + return NULL; + + return item->value; +} + + +/** + * Remove an entry from the map. + * The delete callback will be called if the given key/entry is found. + * \param user passed to the delete callback as the last param. + */ +void +util_keymap_remove(struct keymap *map, const void *key, void *user) +{ + unsigned key_hash; + struct cso_hash_iter iter; + struct keymap_item *item; + + assert(map); + + key_hash = hash(key, map->key_size); + + iter = hash_table_find_iter(map, key, key_hash); + if (cso_hash_iter_is_null(iter)) + return; + + item = hash_table_item(iter); + assert(item); + map->delete_func(map, item->key, item->value, user); + FREE(item->key); + FREE(item); + + map->num_entries--; + + cso_hash_erase(map->cso, iter); +} + + +/** + * Remove all entries from the map, calling the delete callback for each. + * \param user passed to the delete callback as the last param. + */ +void +util_keymap_remove_all(struct keymap *map, void *user) +{ + struct cso_hash_iter iter; + struct keymap_item *item; + + assert(map); + + iter = cso_hash_first_node(map->cso); + while (!cso_hash_iter_is_null(iter)) { + item = (struct keymap_item *) + cso_hash_take(map->cso, cso_hash_iter_key(iter)); + map->delete_func(map, item->key, item->value, user); + FREE(item->key); + FREE(item); + iter = cso_hash_first_node(map->cso); + } +} + + +extern void +util_keymap_info(const struct keymap *map) +{ + debug_printf("Keymap %p: %u of max %u entries\n", + (void *) map, map->num_entries, map->max_entries); +} diff --git a/src/gallium/auxiliary/util/u_keymap.h b/src/gallium/auxiliary/util/u_keymap.h new file mode 100644 index 0000000000..8d60a76fc3 --- /dev/null +++ b/src/gallium/auxiliary/util/u_keymap.h @@ -0,0 +1,68 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef U_KEYMAP_H +#define U_KEYMAP_H + +#include "pipe/p_compiler.h" + + +/** opaque keymap type */ +struct keymap; + + +/** Delete/callback function type */ +typedef void (*keymap_delete_func)(const struct keymap *map, + const void *key, void *data, + void *user); + + +extern struct keymap * +util_new_keymap(unsigned keySize, unsigned maxEntries, + keymap_delete_func deleteFunc); + +extern void +util_delete_keymap(struct keymap *map, void *user); + +extern boolean +util_keymap_insert(struct keymap *map, const void *key, + const void *data, void *user); + +extern const void * +util_keymap_lookup(const struct keymap *map, const void *key); + +extern void +util_keymap_remove(struct keymap *map, const void *key, void *user); + +extern void +util_keymap_remove_all(struct keymap *map, void *user); + +extern void +util_keymap_info(const struct keymap *map); + + +#endif /* U_KEYMAP_H */ diff --git a/src/gallium/auxiliary/util/u_math.c b/src/gallium/auxiliary/util/u_math.c index 0729114d6a..5b3cab4642 100644 --- a/src/gallium/auxiliary/util/u_math.c +++ b/src/gallium/auxiliary/util/u_math.c @@ -30,7 +30,7 @@ #include "util/u_math.h" - +/** 2^x, for x in [-1.0, 1.0[ */ float pow2_table[POW2_TABLE_SIZE]; @@ -38,9 +38,21 @@ static void init_pow2_table(void) { int i; - for (i = 0; i < POW2_TABLE_SIZE; i++) { - pow2_table[i] = (float) pow(2.0, i / POW2_TABLE_SCALE); - } + for (i = 0; i < POW2_TABLE_SIZE; i++) + pow2_table[i] = (float) pow(2.0, (i - POW2_TABLE_OFFSET) / POW2_TABLE_SCALE); +} + + +/** log2(x), for x in [1.0, 2.0[ */ +float log2_table[LOG2_TABLE_SIZE]; + + +static void +init_log2_table(void) +{ + unsigned i; + for (i = 0; i < LOG2_TABLE_SIZE; i++) + log2_table[i] = (float) log2(1.0 + i * (1.0 / LOG2_TABLE_SIZE)); } @@ -53,6 +65,7 @@ util_init_math(void) static boolean initialized = FALSE; if (!initialized) { init_pow2_table(); + init_log2_table(); initialized = TRUE; } } diff --git a/src/gallium/auxiliary/util/u_math.h b/src/gallium/auxiliary/util/u_math.h index 084655e6dd..be7303e550 100644 --- a/src/gallium/auxiliary/util/u_math.h +++ b/src/gallium/auxiliary/util/u_math.h @@ -174,8 +174,10 @@ static INLINE float logf( float f ) -#define POW2_TABLE_SIZE 256 -#define POW2_TABLE_SCALE ((float) (POW2_TABLE_SIZE-1)) +#define POW2_TABLE_SIZE_LOG2 9 +#define POW2_TABLE_SIZE (1 << POW2_TABLE_SIZE_LOG2) +#define POW2_TABLE_OFFSET (POW2_TABLE_SIZE/2) +#define POW2_TABLE_SCALE ((float)(POW2_TABLE_SIZE/2)) extern float pow2_table[POW2_TABLE_SIZE]; @@ -186,98 +188,78 @@ util_init_math(void); union fi { float f; - int i; - unsigned ui; + int32_t i; + uint32_t ui; }; /** - * Fast approximation to exp(x). - * Compute with base 2 exponents: exp(x) = exp2(log2(e) * x) - * Note: log2(e) is a constant, k = 1.44269 - * So, exp(x) = exp2(k * x); + * Fast version of 2^x * Identity: exp2(a + b) = exp2(a) * exp2(b) - * Let ipart = int(k*x) - * Let fpart = k*x - ipart; - * So, exp2(k*x) = exp2(ipart) * exp2(fpart) + * Let ipart = int(x) + * Let fpart = x - ipart; + * So, exp2(x) = exp2(ipart) * exp2(fpart) * Compute exp2(ipart) with i << ipart * Compute exp2(fpart) with lookup table. */ static INLINE float -util_fast_exp(float x) +util_fast_exp2(float x) { - if (x >= 0.0f) { - float k = 1.44269f; /* = log2(e) */ - float kx = k * x; - int ipart = (int) kx; - float fpart = kx - (float) ipart; - float y = (float) (1 << ipart) - * pow2_table[(int) (fpart * POW2_TABLE_SCALE)]; - return y; - } - else { - /* exp(-x) = 1.0 / exp(x) */ - float k = -1.44269f; - float kx = k * x; - int ipart = (int) kx; - float fpart = kx - (float) ipart; - float y = (float) (1 << ipart) - * pow2_table[(int) (fpart * POW2_TABLE_SCALE)]; - return 1.0f / y; - } + int32_t ipart; + float fpart, mpart; + union fi epart; + + if(x > 129.00000f) + return 3.402823466e+38f; + + if(x < -126.99999f) + return 0.0f; + + ipart = (int32_t) x; + fpart = x - (float) ipart; + + /* same as + * epart.f = (float) (1 << ipart) + * but faster and without integer overflow for ipart > 31 */ + epart.i = (ipart + 127 ) << 23; + + mpart = pow2_table[POW2_TABLE_OFFSET + (int)(fpart * POW2_TABLE_SCALE)]; + + return epart.f * mpart; } /** - * Fast version of 2^x - * XXX the above function could be implemented in terms of this one. + * Fast approximation to exp(x). */ static INLINE float -util_fast_exp2(float x) +util_fast_exp(float x) { - if (x >= 0.0f) { - int ipart = (int) x; - float fpart = x - (float) ipart; - float y = (float) (1 << ipart) - * pow2_table[(int) (fpart * POW2_TABLE_SCALE)]; - return y; - } - else { - /* exp(-x) = 1.0 / exp(x) */ - int ipart = (int) -x; - float fpart = -x - (float) ipart; - float y = (float) (1 << ipart) - * pow2_table[(int) (fpart * POW2_TABLE_SCALE)]; - return 1.0f / y; - } + const float k = 1.44269f; /* = log2(e) */ + return util_fast_exp2(k * x); } -/** - * Based on code from http://www.flipcode.com/archives/Fast_log_Function.shtml - */ +#define LOG2_TABLE_SIZE_LOG2 8 +#define LOG2_TABLE_SIZE (1 << LOG2_TABLE_SIZE_LOG2) +extern float log2_table[LOG2_TABLE_SIZE]; + + static INLINE float -util_fast_log2(float val) +util_fast_log2(float x) { union fi num; - int log_2; - num.f = val; - log_2 = ((num.i >> 23) & 255) - 128; - num.i &= ~(255 << 23); - num.i += 127 << 23; - num.f = ((-1.0f/3) * num.f + 2) * num.f - 2.0f/3; - return num.f + log_2; + float epart, mpart; + num.f = x; + epart = (float)(((num.i & 0x7f800000) >> 23) - 127); + mpart = log2_table[(num.i & 0x007fffff) >> (23 - LOG2_TABLE_SIZE_LOG2)]; + return epart + mpart; } static INLINE float util_fast_pow(float x, float y) { - /* XXX these tests may need adjustment */ - if (y >= 3.0f && (-0.02f <= x && x <= 0.02f)) - return 0.0f; - if (y >= 50.0f && (-0.9f <= x && x <= 0.9f)) - return 0.0f; return util_fast_exp2(util_fast_log2(x) * y); } diff --git a/src/gallium/auxiliary/util/u_sse.h b/src/gallium/auxiliary/util/u_sse.h index 0c8356cd05..e2a8491e62 100644 --- a/src/gallium/auxiliary/util/u_sse.h +++ b/src/gallium/auxiliary/util/u_sse.h @@ -37,6 +37,10 @@ #ifndef U_SSE_H_ #define U_SSE_H_ +#include "pipe/p_config.h" + +#if defined(PIPE_ARCH_SSE) + #include <xmmintrin.h> #include <emmintrin.h> @@ -66,7 +70,8 @@ _mm_castps_si128(__m128 a) return u.m128i; } -#endif +#endif /* defined(_MSC_VER) && _MSC_VER < 1500 */ +#endif /* PIPE_ARCH_X86 || PIPE_ARCH_X86_64 */ #endif /* U_SSE_H_ */ |