diff options
Diffstat (limited to 'src/gallium/auxiliary')
72 files changed, 5819 insertions, 1259 deletions
diff --git a/src/gallium/auxiliary/cso_cache/cso_context.c b/src/gallium/auxiliary/cso_cache/cso_context.c index b1ccfc0374..68508f24de 100644 --- a/src/gallium/auxiliary/cso_cache/cso_context.c +++ b/src/gallium/auxiliary/cso_cache/cso_context.c @@ -80,6 +80,10 @@ struct cso_context { }; +static void +free_framebuffer_state(struct pipe_framebuffer_state *fb); + + static boolean delete_blend_state(struct cso_context *ctx, void *state) { struct cso_blend *cso = (struct cso_blend *)state; @@ -252,6 +256,9 @@ void cso_release_all( struct cso_context *ctx ) pipe_texture_reference(&ctx->textures_saved[i], NULL); } + free_framebuffer_state(&ctx->fb); + free_framebuffer_state(&ctx->fb_saved); + if (ctx->cache) { cso_cache_delete( ctx->cache ); ctx->cache = NULL; @@ -784,6 +791,18 @@ copy_framebuffer_state(struct pipe_framebuffer_state *dst, } +static void +free_framebuffer_state(struct pipe_framebuffer_state *fb) +{ + uint i; + + for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) { + pipe_surface_reference(&fb->cbufs[i], NULL); + } + pipe_surface_reference(&fb->zsbuf, NULL); +} + + enum pipe_error cso_set_framebuffer(struct cso_context *ctx, const struct pipe_framebuffer_state *fb) { @@ -804,6 +823,7 @@ void cso_restore_framebuffer(struct cso_context *ctx) if (memcmp(&ctx->fb, &ctx->fb_saved, sizeof(ctx->fb))) { copy_framebuffer_state(&ctx->fb, &ctx->fb_saved); ctx->pipe->set_framebuffer_state(ctx->pipe, &ctx->fb); + free_framebuffer_state(&ctx->fb_saved); } } diff --git a/src/gallium/auxiliary/cso_cache/cso_hash.c b/src/gallium/auxiliary/cso_cache/cso_hash.c index 7f0044c5a7..4e7664f9bf 100644 --- a/src/gallium/auxiliary/cso_cache/cso_hash.c +++ b/src/gallium/auxiliary/cso_cache/cso_hash.c @@ -431,3 +431,9 @@ struct cso_hash_iter cso_hash_erase(struct cso_hash *hash, struct cso_hash_iter --hash->data.d->size; return ret; } + +boolean cso_hash_contains(struct cso_hash *hash, unsigned key) +{ + struct cso_node **node = cso_hash_find_node(hash, key); + return (*node != hash->data.e); +} diff --git a/src/gallium/auxiliary/cso_cache/cso_hash.h b/src/gallium/auxiliary/cso_cache/cso_hash.h index 85f3e276c6..5891c325fa 100644 --- a/src/gallium/auxiliary/cso_cache/cso_hash.h +++ b/src/gallium/auxiliary/cso_cache/cso_hash.h @@ -44,6 +44,7 @@ #ifndef CSO_HASH_H #define CSO_HASH_H +#include "pipe/p_compiler.h" #ifdef __cplusplus extern "C" { @@ -95,6 +96,11 @@ struct cso_hash_iter cso_hash_first_node(struct cso_hash *hash); */ struct cso_hash_iter cso_hash_find(struct cso_hash *hash, unsigned key); +/** + * Returns true if a value with the given key exists in the hash + */ +boolean cso_hash_contains(struct cso_hash *hash, unsigned key); + int cso_hash_iter_is_null(struct cso_hash_iter iter); unsigned cso_hash_iter_key(struct cso_hash_iter iter); diff --git a/src/gallium/auxiliary/draw/Makefile b/src/gallium/auxiliary/draw/Makefile index f2e36a89e9..bdbf5a08ed 100644 --- a/src/gallium/auxiliary/draw/Makefile +++ b/src/gallium/auxiliary/draw/Makefile @@ -40,6 +40,7 @@ C_SOURCES = \ draw_vs_aos_machine.c \ draw_vs_exec.c \ draw_vs_llvm.c \ + draw_vs_ppc.c \ draw_vs_sse.c diff --git a/src/gallium/auxiliary/draw/SConscript b/src/gallium/auxiliary/draw/SConscript index 544a04918b..5f05aa324a 100644 --- a/src/gallium/auxiliary/draw/SConscript +++ b/src/gallium/auxiliary/draw/SConscript @@ -38,6 +38,7 @@ draw = env.ConvenienceLibrary( 'draw_vs_aos_machine.c', 'draw_vs_exec.c', 'draw_vs_llvm.c', + 'draw_vs_ppc.c', 'draw_vs_sse.c', 'draw_vs_varient.c' ]) diff --git a/src/gallium/auxiliary/draw/draw_context.c b/src/gallium/auxiliary/draw/draw_context.c index 78249054f2..b439bc4059 100644 --- a/src/gallium/auxiliary/draw/draw_context.c +++ b/src/gallium/auxiliary/draw/draw_context.c @@ -274,6 +274,14 @@ draw_enable_point_sprites(struct draw_context *draw, boolean enable) } +void +draw_set_force_passthrough( struct draw_context *draw, boolean enable ) +{ + draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE ); + draw->force_passthrough = enable; +} + + /** * Ask the draw module for the location/slot of the given vertex attribute in * a post-transformed vertex. diff --git a/src/gallium/auxiliary/draw/draw_context.h b/src/gallium/auxiliary/draw/draw_context.h index 0ab3681b64..3eeb453531 100644 --- a/src/gallium/auxiliary/draw/draw_context.h +++ b/src/gallium/auxiliary/draw/draw_context.h @@ -160,6 +160,9 @@ void draw_set_render( struct draw_context *draw, void draw_set_driver_clipping( struct draw_context *draw, boolean bypass_clipping ); +void draw_set_force_passthrough( struct draw_context *draw, + boolean enable ); + /******************************************************************************* * Draw pipeline */ diff --git a/src/gallium/auxiliary/draw/draw_pipe_vbuf.c b/src/gallium/auxiliary/draw/draw_pipe_vbuf.c index c0cf4269db..9825e116c3 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_vbuf.c +++ b/src/gallium/auxiliary/draw/draw_pipe_vbuf.c @@ -231,9 +231,9 @@ vbuf_set_prim( struct vbuf_stage *vbuf, uint prim ) unsigned emit_sz = 0; unsigned src_buffer = 0; unsigned output_format; - unsigned src_offset = (vbuf->vinfo->src_index[i] * 4 * sizeof(float) ); + unsigned src_offset = (vbuf->vinfo->attrib[i].src_index * 4 * sizeof(float) ); - switch (vbuf->vinfo->emit[i]) { + switch (vbuf->vinfo->attrib[i].emit) { case EMIT_4F: output_format = PIPE_FORMAT_R32G32B32A32_FLOAT; emit_sz = 4 * sizeof(float); diff --git a/src/gallium/auxiliary/draw/draw_private.h b/src/gallium/auxiliary/draw/draw_private.h index 626a2e3e30..5d531146c5 100644 --- a/src/gallium/auxiliary/draw/draw_private.h +++ b/src/gallium/auxiliary/draw/draw_private.h @@ -163,12 +163,15 @@ struct draw_context struct { boolean bypass_clipping; + boolean bypass_vs; } driver; boolean flushing; /**< debugging/sanity */ boolean suspend_flushing; /**< internally set */ boolean bypass_clipping; /**< set if either api or driver bypass_clipping true */ + boolean force_passthrough; /**< never clip or shade */ + /* pipe state that we need: */ const struct pipe_rasterizer_state *rasterizer; struct pipe_viewport_state viewport; @@ -193,7 +196,7 @@ struct draw_context const float (*aligned_constants)[4]; - float (*aligned_constant_storage)[4]; + const float (*aligned_constant_storage)[4]; unsigned const_storage_size; diff --git a/src/gallium/auxiliary/draw/draw_pt.c b/src/gallium/auxiliary/draw/draw_pt.c index 669c11c993..87ec6ae20c 100644 --- a/src/gallium/auxiliary/draw/draw_pt.c +++ b/src/gallium/auxiliary/draw/draw_pt.c @@ -69,26 +69,26 @@ draw_pt_arrays(struct draw_context *draw, return TRUE; } - - if (!draw->render) { - opt |= PT_PIPELINE; - } - - if (draw_need_pipeline(draw, - draw->rasterizer, - prim)) { - opt |= PT_PIPELINE; - } - - if (!draw->bypass_clipping && !draw->pt.test_fse) { - opt |= PT_CLIPTEST; + if (!draw->force_passthrough) { + if (!draw->render) { + opt |= PT_PIPELINE; + } + + if (draw_need_pipeline(draw, + draw->rasterizer, + prim)) { + opt |= PT_PIPELINE; + } + + if (!draw->bypass_clipping && !draw->pt.test_fse) { + opt |= PT_CLIPTEST; + } + + if (!draw->rasterizer->bypass_vs) { + opt |= PT_SHADE; + } } - - if (!draw->rasterizer->bypass_vs) { - opt |= PT_SHADE; - } - - + if (opt == 0) middle = draw->pt.middle.fetch_emit; else if (opt == PT_SHADE && !draw->pt.no_fse) diff --git a/src/gallium/auxiliary/draw/draw_pt_emit.c b/src/gallium/auxiliary/draw/draw_pt_emit.c index d4eca80588..d520b05869 100644 --- a/src/gallium/auxiliary/draw/draw_pt_emit.c +++ b/src/gallium/auxiliary/draw/draw_pt_emit.c @@ -84,11 +84,11 @@ void draw_pt_emit_prepare( struct pt_emit *emit, unsigned emit_sz = 0; unsigned src_buffer = 0; unsigned output_format; - unsigned src_offset = (vinfo->src_index[i] * 4 * sizeof(float) ); + unsigned src_offset = (vinfo->attrib[i].src_index * 4 * sizeof(float) ); - switch (vinfo->emit[i]) { + switch (vinfo->attrib[i].emit) { case EMIT_4F: output_format = PIPE_FORMAT_R32G32B32A32_FLOAT; emit_sz = 4 * sizeof(float); diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c b/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c index 5a4db6cfe5..3966ad48ba 100644 --- a/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c +++ b/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c @@ -121,7 +121,7 @@ static void fetch_emit_prepare( struct draw_pt_middle_end *middle, memset(&key, 0, sizeof(key)); for (i = 0; i < vinfo->num_attribs; i++) { - const struct pipe_vertex_element *src = &draw->pt.vertex_element[vinfo->src_index[i]]; + const struct pipe_vertex_element *src = &draw->pt.vertex_element[vinfo->attrib[i].src_index]; unsigned emit_sz = 0; unsigned input_format = src->src_format; @@ -129,7 +129,7 @@ static void fetch_emit_prepare( struct draw_pt_middle_end *middle, unsigned input_offset = src->src_offset; unsigned output_format; - switch (vinfo->emit[i]) { + switch (vinfo->attrib[i].emit) { case EMIT_4F: output_format = PIPE_FORMAT_R32G32B32A32_FLOAT; emit_sz = 4 * sizeof(float); diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c index 73fc70c1bc..f7e6a1a8ee 100644 --- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c +++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c @@ -79,6 +79,7 @@ static void fse_prepare( struct draw_pt_middle_end *middle, unsigned num_vs_inputs = draw->vs.vertex_shader->info.num_inputs; const struct vertex_info *vinfo; unsigned i; + unsigned nr_vbs = 0; if (!draw->render->set_primitive( draw->render, @@ -102,7 +103,7 @@ static void fse_prepare( struct draw_pt_middle_end *middle, fse->key.viewport = !draw->identity_viewport; fse->key.clip = !draw->bypass_clipping; - fse->key.pad = 0; + fse->key.const_vbuffers = 0; memset(fse->key.element, 0, fse->key.nr_elements * sizeof(fse->key.element[0])); @@ -116,16 +117,23 @@ static void fse_prepare( struct draw_pt_middle_end *middle, */ fse->key.element[i].in.buffer = src->vertex_buffer_index; fse->key.element[i].in.offset = src->src_offset; + nr_vbs = MAX2(nr_vbs, src->vertex_buffer_index + 1); } + for (i = 0; i < 5 && i < nr_vbs; i++) { + if (draw->pt.vertex_buffer[i].pitch == 0) + fse->key.const_vbuffers |= (1<<i); + } + if (0) debug_printf("%s: lookup const_vbuffers: %x\n", __FUNCTION__, fse->key.const_vbuffers); + { unsigned dst_offset = 0; for (i = 0; i < vinfo->num_attribs; i++) { unsigned emit_sz = 0; - switch (vinfo->emit[i]) { + switch (vinfo->attrib[i].emit) { case EMIT_4F: emit_sz = 4 * sizeof(float); break; @@ -153,8 +161,8 @@ static void fse_prepare( struct draw_pt_middle_end *middle, * numbers, not to positions in the hw vertex description -- * that's handled by the output_offset field. */ - fse->key.element[i].out.format = vinfo->emit[i]; - fse->key.element[i].out.vs_output = vinfo->src_index[i]; + fse->key.element[i].out.format = vinfo->attrib[i].emit; + fse->key.element[i].out.vs_output = vinfo->attrib[i].src_index; fse->key.element[i].out.offset = dst_offset; dst_offset += emit_sz; @@ -162,13 +170,7 @@ static void fse_prepare( struct draw_pt_middle_end *middle, } } - - /* Would normally look up a vertex shader and peruse its list of - * varients somehow. We omitted that step and put all the - * hardcoded "shaders" into an array. We're just making the - * assumption that this happens to be a matching shader... ie - * you're running isosurf, aren't you? - */ + fse->active = draw_vs_lookup_varient( draw->vs.vertex_shader, &fse->key ); @@ -177,18 +179,17 @@ static void fse_prepare( struct draw_pt_middle_end *middle, return ; } + if (0) debug_printf("%s: found const_vbuffers: %x\n", __FUNCTION__, + fse->active->key.const_vbuffers); + /* Now set buffer pointers: */ - for (i = 0; i < num_vs_inputs; i++) { - unsigned buf = draw->pt.vertex_element[i].vertex_buffer_index; - - fse->active->set_input( fse->active, - i, - - ((const ubyte *) draw->pt.user.vbuffer[buf] + - draw->pt.vertex_buffer[buf].buffer_offset), - - draw->pt.vertex_buffer[buf].pitch ); + for (i = 0; i < draw->pt.nr_vertex_buffers; i++) { + fse->active->set_buffer( fse->active, + i, + ((const ubyte *) draw->pt.user.vbuffer[i] + + draw->pt.vertex_buffer[i].buffer_offset), + draw->pt.vertex_buffer[i].pitch ); } *max_vertices = (draw->render->max_vertex_buffer_bytes / diff --git a/src/gallium/auxiliary/draw/draw_vertex.c b/src/gallium/auxiliary/draw/draw_vertex.c index 1446f785c5..3214213e44 100644 --- a/src/gallium/auxiliary/draw/draw_vertex.c +++ b/src/gallium/auxiliary/draw/draw_vertex.c @@ -49,7 +49,7 @@ draw_compute_vertex_size(struct vertex_info *vinfo) vinfo->size = 0; for (i = 0; i < vinfo->num_attribs; i++) { - switch (vinfo->emit[i]) { + switch (vinfo->attrib[i].emit) { case EMIT_OMIT: break; case EMIT_4UB: @@ -81,8 +81,8 @@ draw_dump_emitted_vertex(const struct vertex_info *vinfo, const uint8_t *data) unsigned i, j; for (i = 0; i < vinfo->num_attribs; i++) { - j = vinfo->src_index[i]; - switch (vinfo->emit[i]) { + j = vinfo->attrib[i].src_index; + switch (vinfo->attrib[i].emit) { case EMIT_OMIT: debug_printf("EMIT_OMIT:"); break; diff --git a/src/gallium/auxiliary/draw/draw_vertex.h b/src/gallium/auxiliary/draw/draw_vertex.h index 16c65c4317..a943607d7e 100644 --- a/src/gallium/auxiliary/draw/draw_vertex.h +++ b/src/gallium/auxiliary/draw/draw_vertex.h @@ -75,12 +75,41 @@ struct vertex_info { uint num_attribs; uint hwfmt[4]; /**< hardware format info for this format */ - enum interp_mode interp_mode[PIPE_MAX_SHADER_INPUTS]; - enum attrib_emit emit[PIPE_MAX_SHADER_INPUTS]; /**< EMIT_x */ - uint src_index[PIPE_MAX_SHADER_INPUTS]; /**< map to post-xform attribs */ uint size; /**< total vertex size in dwords */ + + /* Keep this small and at the end of the struct to allow quick + * memcmp() comparisons. + */ + struct { + ubyte interp_mode:4; /**< INTERP_x */ + ubyte emit:4; /**< EMIT_x */ + ubyte src_index; /**< map to post-xform attribs */ + } attrib[PIPE_MAX_SHADER_INPUTS]; }; +static INLINE int +draw_vinfo_size( const struct vertex_info *a ) +{ + return ((const char *)&a->attrib[a->num_attribs] - + (const char *)a); +} + +static INLINE int +draw_vinfo_compare( const struct vertex_info *a, + const struct vertex_info *b ) +{ + unsigned sizea = draw_vinfo_size( a ); + return memcmp( a, b, sizea ); +} + +static INLINE void +draw_vinfo_copy( struct vertex_info *dst, + const struct vertex_info *src ) +{ + unsigned size = draw_vinfo_size( src ); + memcpy( dst, src, size ); +} + /** @@ -91,14 +120,15 @@ struct vertex_info */ static INLINE uint draw_emit_vertex_attr(struct vertex_info *vinfo, - enum attrib_emit emit, enum interp_mode interp, + enum attrib_emit emit, + enum interp_mode interp, /* only used by softpipe??? */ uint src_index) { const uint n = vinfo->num_attribs; assert(n < PIPE_MAX_SHADER_INPUTS); - vinfo->emit[n] = emit; - vinfo->interp_mode[n] = interp; - vinfo->src_index[n] = src_index; + vinfo->attrib[n].emit = emit; + vinfo->attrib[n].interp_mode = interp; + vinfo->attrib[n].src_index = src_index; vinfo->num_attribs++; return n; } diff --git a/src/gallium/auxiliary/draw/draw_vs.c b/src/gallium/auxiliary/draw/draw_vs.c index 34adbd49b0..7f305304ff 100644 --- a/src/gallium/auxiliary/draw/draw_vs.c +++ b/src/gallium/auxiliary/draw/draw_vs.c @@ -85,7 +85,10 @@ draw_create_vertex_shader(struct draw_context *draw, if (!vs) { vs = draw_create_vs_sse( draw, shader ); if (!vs) { - vs = draw_create_vs_exec( draw, shader ); + vs = draw_create_vs_ppc( draw, shader ); + if (!vs) { + vs = draw_create_vs_exec( draw, shader ); + } } } diff --git a/src/gallium/auxiliary/draw/draw_vs.h b/src/gallium/auxiliary/draw/draw_vs.h index 45992d1986..89ae158751 100644 --- a/src/gallium/auxiliary/draw/draw_vs.h +++ b/src/gallium/auxiliary/draw/draw_vs.h @@ -64,7 +64,7 @@ struct draw_vs_varient_key { unsigned nr_outputs:8; unsigned viewport:1; unsigned clip:1; - unsigned pad:5; + unsigned const_vbuffers:5; struct draw_varient_element element[PIPE_MAX_ATTRIBS]; }; @@ -76,7 +76,7 @@ struct draw_vs_varient { struct draw_vertex_shader *vs; - void (*set_input)( struct draw_vs_varient *, + void (*set_buffer)( struct draw_vs_varient *, unsigned i, const void *ptr, unsigned stride ); @@ -158,6 +158,10 @@ draw_create_vs_sse(struct draw_context *draw, const struct pipe_shader_state *templ); struct draw_vertex_shader * +draw_create_vs_ppc(struct draw_context *draw, + const struct pipe_shader_state *templ); + +struct draw_vertex_shader * draw_create_vs_llvm(struct draw_context *draw, const struct pipe_shader_state *templ); diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c index a556477a76..87232865e2 100644 --- a/src/gallium/auxiliary/draw/draw_vs_aos.c +++ b/src/gallium/auxiliary/draw/draw_vs_aos.c @@ -92,9 +92,9 @@ struct x86_reg aos_get_x86( struct aos_compilation *cp, assert(which_reg == 1); offset = Offset(struct aos_machine, constants); break; - case X86_ATTRIBS: + case X86_BUFFERS: assert(which_reg == 0); - offset = Offset(struct aos_machine, attrib); + offset = Offset(struct aos_machine, buffer); break; default: assert(0); @@ -196,6 +196,18 @@ static void spill( struct aos_compilation *cp, unsigned idx ) } +void aos_spill_all( struct aos_compilation *cp ) +{ + unsigned i; + + for (i = 0; i < 8; i++) { + if (cp->xmm[i].dirty) + spill(cp, i); + aos_release_xmm_reg(cp, i); + } +} + + static struct x86_reg get_xmm_writable( struct aos_compilation *cp, struct x86_reg reg ) { @@ -1939,6 +1951,11 @@ static boolean build_vertex_program( struct draw_vs_varient_aos_sse *varient, save_fpu_state( &cp ); set_fpu_round_nearest( &cp ); + aos_init_inputs( &cp, linear ); + + cp.x86_reg[0] = 0; + cp.x86_reg[1] = 0; + /* Note address for loop jump */ label = x86_get_label(cp.func); @@ -2018,13 +2035,7 @@ static boolean build_vertex_program( struct draw_vs_varient_aos_sse *varient, /* Incr index */ - if (linear) { - x86_inc(cp.func, cp.idx_EBX); - } - else { - x86_lea(cp.func, cp.idx_EBX, x86_make_disp(cp.idx_EBX, 4)); - } - + aos_incr_inputs( &cp, linear ); } /* decr count, loop if not zero */ @@ -2065,15 +2076,13 @@ static void vaos_set_buffer( struct draw_vs_varient *varient, unsigned stride ) { struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient; - unsigned i; - for (i = 0; i < vaos->base.key.nr_inputs; i++) { - if (vaos->base.key.element[i].in.buffer == buf) { - vaos->attrib[i].input_ptr = ((char *)ptr + - vaos->base.key.element[i].in.offset); - vaos->attrib[i].input_stride = stride; - } + if (buf < vaos->nr_vb) { + vaos->buffer[buf].base_ptr = (char *)ptr; + vaos->buffer[buf].stride = stride; } + + if (0) debug_printf("%s %d/%d: %p %d\n", __FUNCTION__, buf, vaos->nr_vb, ptr, stride); } @@ -2086,10 +2095,12 @@ static void PIPE_CDECL vaos_run_elts( struct draw_vs_varient *varient, struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient; struct aos_machine *machine = vaos->draw->vs.aos_machine; + if (0) debug_printf("%s %d\n", __FUNCTION__, count); + machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size; machine->constants = vaos->draw->vs.aligned_constants; machine->immediates = vaos->base.vs->immediates; - machine->attrib = vaos->attrib; + machine->buffer = vaos->buffer; vaos->gen_run_elts( machine, elts, @@ -2105,10 +2116,13 @@ static void PIPE_CDECL vaos_run_linear( struct draw_vs_varient *varient, struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient; struct aos_machine *machine = vaos->draw->vs.aos_machine; + if (0) debug_printf("%s %d %d const: %x\n", __FUNCTION__, start, count, + vaos->base.key.const_vbuffers); + machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size; machine->constants = vaos->draw->vs.aligned_constants; machine->immediates = vaos->base.vs->immediates; - machine->attrib = vaos->attrib; + machine->buffer = vaos->buffer; vaos->gen_run_linear( machine, start, @@ -2127,7 +2141,7 @@ static void vaos_destroy( struct draw_vs_varient *varient ) { struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient; - FREE( vaos->attrib ); + FREE( vaos->buffer ); x86_release_func( &vaos->func[0] ); x86_release_func( &vaos->func[1] ); @@ -2140,6 +2154,7 @@ static void vaos_destroy( struct draw_vs_varient *varient ) static struct draw_vs_varient *varient_aos_sse( struct draw_vertex_shader *vs, const struct draw_vs_varient_key *key ) { + unsigned i; struct draw_vs_varient_aos_sse *vaos = CALLOC_STRUCT(draw_vs_varient_aos_sse); if (!vaos) @@ -2147,17 +2162,22 @@ static struct draw_vs_varient *varient_aos_sse( struct draw_vertex_shader *vs, vaos->base.key = *key; vaos->base.vs = vs; - vaos->base.set_input = vaos_set_buffer; + vaos->base.set_buffer = vaos_set_buffer; vaos->base.destroy = vaos_destroy; vaos->base.run_linear = vaos_run_linear; vaos->base.run_elts = vaos_run_elts; vaos->draw = vs->draw; - vaos->attrib = MALLOC( key->nr_inputs * sizeof(vaos->attrib[0]) ); - if (!vaos->attrib) + for (i = 0; i < key->nr_inputs; i++) + vaos->nr_vb = MAX2( vaos->nr_vb, key->element[i].in.buffer + 1 ); + + vaos->buffer = MALLOC( vaos->nr_vb * sizeof(vaos->buffer[0]) ); + if (!vaos->buffer) goto fail; + debug_printf("nr_vb: %d const: %x\n", vaos->nr_vb, vaos->base.key.const_vbuffers); + #if 0 tgsi_dump(vs->state.tokens, 0); #endif @@ -2179,8 +2199,8 @@ static struct draw_vs_varient *varient_aos_sse( struct draw_vertex_shader *vs, return &vaos->base; fail: - if (vaos && vaos->attrib) - FREE(vaos->attrib); + if (vaos && vaos->buffer) + FREE(vaos->buffer); if (vaos) x86_release_func( &vaos->func[0] ); diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.h b/src/gallium/auxiliary/draw/draw_vs_aos.h index 7fe6f79db0..264387517b 100644 --- a/src/gallium/auxiliary/draw/draw_vs_aos.h +++ b/src/gallium/auxiliary/draw/draw_vs_aos.h @@ -87,9 +87,10 @@ struct lit_info { #define MAX_SHINE_TAB 4 #define MAX_LIT_INFO 16 -struct aos_attrib { - const void *input_ptr; - unsigned input_stride; +struct aos_buffer { + const void *base_ptr; + unsigned stride; + void *ptr; /* updated per vertex */ }; @@ -123,7 +124,7 @@ struct aos_machine { const float (*immediates)[4]; /* points to shader data */ const float (*constants)[4]; /* points to draw data */ - const struct aos_attrib *attrib; /* points to ? */ + const struct aos_buffer *buffer; /* points to ? */ }; @@ -175,12 +176,15 @@ void aos_adopt_xmm_reg( struct aos_compilation *cp, unsigned idx, unsigned dirty ); +void aos_spill_all( struct aos_compilation *cp ); + struct x86_reg aos_get_shader_reg( struct aos_compilation *cp, unsigned file, unsigned idx ); -boolean aos_fetch_inputs( struct aos_compilation *cp, - boolean linear ); +boolean aos_init_inputs( struct aos_compilation *cp, boolean linear ); +boolean aos_fetch_inputs( struct aos_compilation *cp, boolean linear ); +boolean aos_incr_inputs( struct aos_compilation *cp, boolean linear ); boolean aos_emit_outputs( struct aos_compilation *cp ); @@ -210,7 +214,7 @@ do { \ #define X86_NULL 0 #define X86_IMMEDIATES 1 #define X86_CONSTANTS 2 -#define X86_ATTRIBS 3 +#define X86_BUFFERS 3 struct x86_reg aos_get_x86( struct aos_compilation *cp, unsigned which_reg, @@ -232,7 +236,8 @@ struct draw_vs_varient_aos_sse { struct draw_vs_varient base; struct draw_context *draw; - struct aos_attrib *attrib; + struct aos_buffer *buffer; + unsigned nr_vb; vaos_run_linear_func gen_run_linear; vaos_run_elts_func gen_run_elts; diff --git a/src/gallium/auxiliary/draw/draw_vs_aos_io.c b/src/gallium/auxiliary/draw/draw_vs_aos_io.c index 26297c74f8..39f75b50b7 100644 --- a/src/gallium/auxiliary/draw/draw_vs_aos_io.c +++ b/src/gallium/auxiliary/draw/draw_vs_aos_io.c @@ -54,6 +54,7 @@ static void emit_load_R32G32B32( struct aos_compilation *cp, struct x86_reg data, struct x86_reg src_ptr ) { +#if 1 sse_movss(cp->func, data, x86_make_disp(src_ptr, 8)); /* data = z ? ? ? */ sse_shufps(cp->func, data, aos_get_internal_xmm( cp, IMM_IDENTITY ), SHUF(X,Y,Z,W) ); @@ -62,6 +63,16 @@ static void emit_load_R32G32B32( struct aos_compilation *cp, /* data = ? 0 z 1 */ sse_movlps(cp->func, data, src_ptr); /* data = x y z 1 */ +#else + sse_movups(cp->func, data, src_ptr); + /* data = x y z ? */ + sse2_pshufd(cp->func, data, data, SHUF(W,X,Y,Z) ); + /* data = ? x y z */ + sse_movss(cp->func, data, aos_get_internal_xmm( cp, IMM_ONES ) ); + /* data = 1 x y z */ + sse2_pshufd(cp->func, data, data, SHUF(Y,Z,W,X) ); + /* data = x y z 1 */ +#endif } static void emit_load_R32G32( struct aos_compilation *cp, @@ -95,28 +106,6 @@ static void emit_load_R8G8B8A8_UNORM( struct aos_compilation *cp, -static void get_src_ptr( struct aos_compilation *cp, - struct x86_reg src, - struct x86_reg elt, - unsigned a ) -{ - struct x86_reg attrib = x86_make_disp(aos_get_x86( cp, 0, X86_ATTRIBS ), - a * sizeof(struct aos_attrib)); - - struct x86_reg input_ptr = x86_make_disp(attrib, - Offset(struct aos_attrib, input_ptr)); - - struct x86_reg input_stride = x86_make_disp(attrib, - Offset(struct aos_attrib, input_stride)); - - /* Calculate pointer to current attrib: - */ - x86_mov(cp->func, src, input_stride); - x86_imul(cp->func, src, elt); - x86_add(cp->func, src, input_ptr); -} - - /* Extended swizzles? Maybe later. */ static void emit_swizzle( struct aos_compilation *cp, @@ -128,22 +117,60 @@ static void emit_swizzle( struct aos_compilation *cp, } + +static boolean get_buffer_ptr( struct aos_compilation *cp, + boolean linear, + unsigned buf_idx, + struct x86_reg elt, + struct x86_reg ptr) +{ + struct x86_reg buf = x86_make_disp(aos_get_x86( cp, 0, X86_BUFFERS ), + buf_idx * sizeof(struct aos_buffer)); + + struct x86_reg buf_stride = x86_make_disp(buf, + Offset(struct aos_buffer, stride)); + if (linear) { + struct x86_reg buf_ptr = x86_make_disp(buf, + Offset(struct aos_buffer, ptr)); + + + /* Calculate pointer to current attrib: + */ + x86_mov(cp->func, ptr, buf_ptr); + x86_mov(cp->func, elt, buf_stride); + x86_add(cp->func, elt, ptr); + if (buf_idx == 0) sse_prefetchnta(cp->func, x86_make_disp(elt, 192)); + x86_mov(cp->func, buf_ptr, elt); + } + else { + struct x86_reg buf_base_ptr = x86_make_disp(buf, + Offset(struct aos_buffer, base_ptr)); + + + /* Calculate pointer to current attrib: + */ + x86_mov(cp->func, ptr, buf_stride); + x86_imul(cp->func, ptr, elt); + x86_add(cp->func, ptr, buf_base_ptr); + } + + cp->insn_counter++; + + return TRUE; +} + + static boolean load_input( struct aos_compilation *cp, unsigned idx, - boolean linear ) + struct x86_reg bufptr ) { unsigned format = cp->vaos->base.key.element[idx].in.format; - struct x86_reg src = cp->tmp_EAX; + unsigned offset = cp->vaos->base.key.element[idx].in.offset; struct x86_reg dataXMM = aos_get_xmm_reg(cp); /* Figure out source pointer address: */ - get_src_ptr(cp, - src, - linear ? cp->idx_EBX : x86_deref(cp->idx_EBX), - idx); - - src = x86_deref(src); + struct x86_reg src = x86_make_disp(bufptr, offset); aos_adopt_xmm_reg( cp, dataXMM, @@ -179,20 +206,128 @@ static boolean load_input( struct aos_compilation *cp, return TRUE; } - -boolean aos_fetch_inputs( struct aos_compilation *cp, boolean linear ) +static boolean load_inputs( struct aos_compilation *cp, + unsigned buffer, + struct x86_reg ptr ) { unsigned i; - + for (i = 0; i < cp->vaos->base.key.nr_inputs; i++) { - if (!load_input( cp, i, linear )) - return FALSE; - cp->insn_counter++; + if (cp->vaos->base.key.element[i].in.buffer == buffer) { + + if (!load_input( cp, i, ptr )) + return FALSE; + + cp->insn_counter++; + } + } + + return TRUE; +} + +boolean aos_init_inputs( struct aos_compilation *cp, boolean linear ) +{ + unsigned i; + for (i = 0; i < cp->vaos->nr_vb; i++) { + struct x86_reg buf = x86_make_disp(aos_get_x86( cp, 0, X86_BUFFERS ), + i * sizeof(struct aos_buffer)); + + struct x86_reg buf_base_ptr = x86_make_disp(buf, + Offset(struct aos_buffer, base_ptr)); + + if (cp->vaos->base.key.const_vbuffers & (1<<i)) { + struct x86_reg ptr = cp->tmp_EAX; + + x86_mov(cp->func, ptr, buf_base_ptr); + + /* Load all inputs for this constant vertex buffer + */ + load_inputs( cp, i, x86_deref(ptr) ); + + /* Then just force them out to aos_machine.input[] + */ + aos_spill_all( cp ); + + } + else if (linear) { + + struct x86_reg elt = cp->idx_EBX; + struct x86_reg ptr = cp->tmp_EAX; + + struct x86_reg buf_stride = x86_make_disp(buf, + Offset(struct aos_buffer, stride)); + + struct x86_reg buf_ptr = x86_make_disp(buf, + Offset(struct aos_buffer, ptr)); + + + /* Calculate pointer to current attrib: + */ + x86_mov(cp->func, ptr, buf_stride); + x86_imul(cp->func, ptr, elt); + x86_add(cp->func, ptr, buf_base_ptr); + + + /* In the linear case, keep the buffer pointer instead of the + * index number. + */ + if (cp->vaos->nr_vb == 1) + x86_mov( cp->func, elt, ptr ); + else + x86_mov( cp->func, buf_ptr, ptr ); + + cp->insn_counter++; + } + } + + return TRUE; +} + +boolean aos_fetch_inputs( struct aos_compilation *cp, boolean linear ) +{ + unsigned j; + + for (j = 0; j < cp->vaos->nr_vb; j++) { + if (cp->vaos->base.key.const_vbuffers & (1<<j)) { + /* just retreive pre-transformed input */ + } + else if (linear && cp->vaos->nr_vb == 1) { + load_inputs( cp, 0, cp->idx_EBX ); + } + else { + struct x86_reg elt = linear ? cp->idx_EBX : x86_deref(cp->idx_EBX); + struct x86_reg ptr = cp->tmp_EAX; + + if (!get_buffer_ptr( cp, linear, j, elt, ptr )) + return FALSE; + + if (!load_inputs( cp, j, ptr )) + return FALSE; + } } return TRUE; } +boolean aos_incr_inputs( struct aos_compilation *cp, boolean linear ) +{ + if (linear && cp->vaos->nr_vb == 1) { + struct x86_reg stride = x86_make_disp(aos_get_x86( cp, 0, X86_BUFFERS ), + (0 * sizeof(struct aos_buffer) + + Offset(struct aos_buffer, stride))); + + x86_add(cp->func, cp->idx_EBX, stride); + sse_prefetchnta(cp->func, x86_make_disp(cp->idx_EBX, 192)); + } + else if (linear) { + /* Nothing to do */ + } + else { + x86_lea(cp->func, cp->idx_EBX, x86_make_disp(cp->idx_EBX, 4)); + } + + return TRUE; +} @@ -306,7 +441,7 @@ boolean aos_emit_outputs( struct aos_compilation *cp ) if (data.file != file_XMM) { struct x86_reg tmp = aos_get_xmm_reg( cp ); - sse_movups(cp->func, tmp, data); + sse_movaps(cp->func, tmp, data); data = tmp; } diff --git a/src/gallium/auxiliary/draw/draw_vs_exec.c b/src/gallium/auxiliary/draw/draw_vs_exec.c index 44563803f9..13d4fcfdbf 100644 --- a/src/gallium/auxiliary/draw/draw_vs_exec.c +++ b/src/gallium/auxiliary/draw/draw_vs_exec.c @@ -62,12 +62,15 @@ vs_exec_prepare( struct draw_vertex_shader *shader, { struct exec_vertex_shader *evs = exec_vertex_shader(shader); - /* specify the vertex program to interpret/execute */ - tgsi_exec_machine_bind_shader(evs->machine, - shader->state.tokens, - PIPE_MAX_SAMPLERS, - NULL /*samplers*/ ); - + /* Specify the vertex program to interpret/execute. + * Avoid rebinding when possible. + */ + if (evs->machine->Tokens != shader->state.tokens) { + tgsi_exec_machine_bind_shader(evs->machine, + shader->state.tokens, + PIPE_MAX_SAMPLERS, + NULL /*samplers*/ ); + } } diff --git a/src/gallium/auxiliary/draw/draw_vs_llvm.c b/src/gallium/auxiliary/draw/draw_vs_llvm.c index 2ce30b9a02..727977bc3a 100644 --- a/src/gallium/auxiliary/draw/draw_vs_llvm.c +++ b/src/gallium/auxiliary/draw/draw_vs_llvm.c @@ -32,6 +32,7 @@ * Brian Paul */ +#include "util/u_memory.h" #include "pipe/p_shader_tokens.h" #include "draw_private.h" #include "draw_context.h" diff --git a/src/gallium/auxiliary/draw/draw_vs_ppc.c b/src/gallium/auxiliary/draw/draw_vs_ppc.c new file mode 100644 index 0000000000..8eff6d4fda --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_vs_ppc.c @@ -0,0 +1,274 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + * Brian Paul + */ + +#include "util/u_math.h" +#include "util/u_memory.h" +#include "pipe/p_config.h" + +#include "draw_vs.h" + +#if defined(PIPE_ARCH_PPC) + +#include "pipe/p_shader_tokens.h" + +#include "draw_private.h" +#include "draw_context.h" + +#include "rtasm/rtasm_cpu.h" +#include "rtasm/rtasm_ppc.h" +#include "tgsi/tgsi_ppc.h" +#include "tgsi/tgsi_parse.h" + + + +typedef void (PIPE_CDECL *codegen_function) (float (*inputs)[4][4], + float (*outputs)[4][4], + float (*temps)[4][4], + float (*immeds)[4][4], + float (*consts)[4], + const float *builtins); + +#if 0 + const struct tgsi_exec_vector *input, + struct tgsi_exec_vector *output, + float (*constant)[4], /* 3 */ + struct tgsi_exec_vector *temporary, /* 4 */ + float (*immediates)[4], /* 5 */ + const float (*aos_input)[4], /* 6 */ + uint num_inputs, /* 7 */ + uint input_stride, /* 8 */ + float (*aos_output)[4], /* 9 */ + uint num_outputs, /* 10 */ + uint output_stride ); /* 11 */ +#endif + +struct draw_ppc_vertex_shader { + struct draw_vertex_shader base; + struct ppc_function ppc_program; + + codegen_function func; + + struct tgsi_exec_machine *machine; +}; + + +static void +vs_ppc_prepare( struct draw_vertex_shader *base, + struct draw_context *draw ) +{ +} + + + +/* Simplified vertex shader interface for the pt paths. Given the + * complexity of code-generating all the above operations together, + * it's time to try doing all the other stuff separately. + */ +static void +vs_ppc_run_linear( struct draw_vertex_shader *base, + const float (*input)[4], + float (*output)[4], + const float (*constants)[4], + unsigned count, + unsigned input_stride, + unsigned output_stride ) +{ + struct draw_ppc_vertex_shader *shader = (struct draw_ppc_vertex_shader *)base; + struct tgsi_exec_machine *machine = shader->machine; + unsigned int i; + +#define MAX_VERTICES 4 + + /* loop over verts */ + for (i = 0; i < count; i += MAX_VERTICES) { + const uint max_vertices = MIN2(MAX_VERTICES, count - i); + float inputs_soa[PIPE_MAX_SHADER_INPUTS][4][4] ALIGN16_ATTRIB; + float outputs_soa[PIPE_MAX_SHADER_OUTPUTS][4][4] ALIGN16_ATTRIB; + float temps_soa[TGSI_EXEC_NUM_TEMPS][4][4] ALIGN16_ATTRIB; + uint attr; + + /* convert (up to) four input verts to SoA format */ + for (attr = 0; attr < base->info.num_inputs; attr++) { + const float *vIn = (const float *) input; + uint vert; + for (vert = 0; vert < max_vertices; vert++) { +#if 0 + if (attr==0) + printf("Input v%d a%d: %f %f %f %f\n", + vert, attr, vIn[0], vIn[1], vIn[2], vIn[3]); +#endif + inputs_soa[attr][0][vert] = vIn[attr * 4 + 0]; + inputs_soa[attr][1][vert] = vIn[attr * 4 + 1]; + inputs_soa[attr][2][vert] = vIn[attr * 4 + 2]; + inputs_soa[attr][3][vert] = vIn[attr * 4 + 3]; + vIn += input_stride / 4; + } + } + + /* run compiled shader + */ +#if 0 + shader->func(machine->Inputs, + machine->Outputs, + (float (*)[4])constants, + machine->Temps, + (float (*)[4])shader->base.immediates, + input, + base->info.num_inputs, + input_stride, + output, + base->info.num_outputs, + output_stride ); +#else + shader->func(inputs_soa, outputs_soa, temps_soa, + (float (*)[4][4]) shader->base.immediates, + (float (*)[4]) constants, + ppc_builtin_constants); + + /*output[0][0] = input[0][0] * 0.5;*/ +#endif + + /* convert (up to) four output verts from SoA back to AoS format */ + for (attr = 0; attr < base->info.num_outputs; attr++) { + float *vOut = (float *) output; + uint vert; + for (vert = 0; vert < max_vertices; vert++) { + vOut[attr * 4 + 0] = outputs_soa[attr][0][vert]; + vOut[attr * 4 + 1] = outputs_soa[attr][1][vert]; + vOut[attr * 4 + 2] = outputs_soa[attr][2][vert]; + vOut[attr * 4 + 3] = outputs_soa[attr][3][vert]; +#if 0 + if (attr==0) + printf("Output v%d a%d: %f %f %f %f\n", + vert, attr, vOut[0], vOut[1], vOut[2], vOut[3]); +#endif + vOut += output_stride / 4; + } + } + + /* advance to next group of four input/output verts */ + input = (const float (*)[4])((const char *)input + input_stride * max_vertices); + output = (float (*)[4])((char *)output + output_stride * max_vertices); + } +} + + + + +static void +vs_ppc_delete( struct draw_vertex_shader *base ) +{ + struct draw_ppc_vertex_shader *shader = (struct draw_ppc_vertex_shader *)base; + + ppc_release_func( &shader->ppc_program ); + + align_free( (void *) shader->base.immediates ); + + FREE( (void*) shader->base.state.tokens ); + FREE( shader ); +} + + +struct draw_vertex_shader * +draw_create_vs_ppc(struct draw_context *draw, + const struct pipe_shader_state *templ) +{ + struct draw_ppc_vertex_shader *vs; + + vs = CALLOC_STRUCT( draw_ppc_vertex_shader ); + if (vs == NULL) + return NULL; + + /* we make a private copy of the tokens */ + vs->base.state.tokens = tgsi_dup_tokens(templ->tokens); + if (!vs->base.state.tokens) + goto fail; + + tgsi_scan_shader(templ->tokens, &vs->base.info); + + vs->base.draw = draw; +#if 0 + if (1) + vs->base.create_varient = draw_vs_varient_aos_ppc; + else +#endif + vs->base.create_varient = draw_vs_varient_generic; + vs->base.prepare = vs_ppc_prepare; + vs->base.run_linear = vs_ppc_run_linear; + vs->base.delete = vs_ppc_delete; + + vs->base.immediates = align_malloc(TGSI_EXEC_NUM_IMMEDIATES * 4 * 4 * + sizeof(float), 16); + + vs->machine = &draw->vs.machine; + + ppc_init_func( &vs->ppc_program, 2000 ); /* XXX fix limit */ + + if (!tgsi_emit_ppc( (struct tgsi_token *) vs->base.state.tokens, + &vs->ppc_program, + (float (*)[4])vs->base.immediates, + TRUE )) + goto fail; + + vs->func = (codegen_function) ppc_get_func( &vs->ppc_program ); + if (!vs->func) { + goto fail; + } + + return &vs->base; + +fail: + /* + debug_error("tgsi_emit_ppc() failed, falling back to interpreter\n"); + */ + + ppc_release_func( &vs->ppc_program ); + + FREE(vs); + return NULL; +} + + + +#else /* PIPE_ARCH_PPC */ + + +struct draw_vertex_shader * +draw_create_vs_ppc( struct draw_context *draw, + const struct pipe_shader_state *templ ) +{ + return (void *) 0; +} + + +#endif /* PIPE_ARCH_PPC */ diff --git a/src/gallium/auxiliary/draw/draw_vs_sse.c b/src/gallium/auxiliary/draw/draw_vs_sse.c index 0efabd9de8..b11ae31662 100644 --- a/src/gallium/auxiliary/draw/draw_vs_sse.c +++ b/src/gallium/auxiliary/draw/draw_vs_sse.c @@ -37,7 +37,7 @@ #include "draw_vs.h" -#if defined(PIPE_ARCH_X86) +#if defined(PIPE_ARCH_X86) && defined(PIPE_ARCH_SSE) #include "pipe/p_shader_tokens.h" diff --git a/src/gallium/auxiliary/draw/draw_vs_varient.c b/src/gallium/auxiliary/draw/draw_vs_varient.c index 4daf05dae7..7ee567d478 100644 --- a/src/gallium/auxiliary/draw/draw_vs_varient.c +++ b/src/gallium/auxiliary/draw/draw_vs_varient.c @@ -64,10 +64,10 @@ struct draw_vs_varient_generic { -static void vsvg_set_input( struct draw_vs_varient *varient, - unsigned buffer, - const void *ptr, - unsigned stride ) +static void vsvg_set_buffer( struct draw_vs_varient *varient, + unsigned buffer, + const void *ptr, + unsigned stride ) { struct draw_vs_varient_generic *vsvg = (struct draw_vs_varient_generic *)varient; @@ -265,7 +265,7 @@ struct draw_vs_varient *draw_vs_varient_generic( struct draw_vertex_shader *vs, vsvg->base.key = *key; vsvg->base.vs = vs; - vsvg->base.set_input = vsvg_set_input; + vsvg->base.set_buffer = vsvg_set_buffer; vsvg->base.run_elts = vsvg_run_elts; vsvg->base.run_linear = vsvg_run_linear; vsvg->base.destroy = vsvg_destroy; diff --git a/src/gallium/auxiliary/gallivm/gallivm_builtins.cpp b/src/gallium/auxiliary/gallivm/gallivm_builtins.cpp index 0fc5c4ec5c..fcc5c05794 100644 --- a/src/gallium/auxiliary/gallivm/gallivm_builtins.cpp +++ b/src/gallium/auxiliary/gallivm/gallivm_builtins.cpp @@ -1,140 +1,140 @@ static const unsigned char llvm_builtins_data[] = { -0x42,0x43,0xc0,0xde,0x21,0x0c,0x00,0x00,0x29,0x02,0x00,0x00,0x01,0x10,0x00,0x00, +0x42,0x43,0xc0,0xde,0x21,0x0c,0x00,0x00,0x27,0x02,0x00,0x00,0x01,0x10,0x00,0x00, 0x10,0x00,0x00,0x00,0x07,0x81,0x23,0x91,0x41,0xc8,0x04,0x49,0x06,0x10,0x32,0x39, 0x92,0x01,0x84,0x0c,0x25,0x05,0x08,0x19,0x1e,0x04,0x8b,0x62,0x80,0x14,0x45,0x02, 0x42,0x92,0x0b,0x42,0xa4,0x10,0x32,0x14,0x38,0x08,0x18,0x49,0x0a,0x32,0x44,0x24, 0x48,0x0a,0x90,0x21,0x23,0x44,0x72,0x80,0x8c,0x14,0x21,0x86,0x0a,0x8a,0x0a,0x64, -0x0c,0x1f,0x00,0x00,0x49,0x18,0x00,0x00,0x02,0x00,0x00,0x00,0x0b,0x04,0x00,0x0c, -0x00,0x00,0x00,0x00,0x51,0x20,0x00,0x00,0x12,0x00,0x00,0x00,0x32,0x22,0x48,0x09, -0x20,0x65,0x82,0x84,0x00,0x26,0x45,0x48,0x05,0x09,0x26,0x45,0xc6,0x05,0x42,0x52, -0x26,0x08,0xae,0x19,0x80,0x61,0x04,0x02,0x98,0x23,0x00,0x83,0x29,0x80,0x21,0x00, -0xb2,0x73,0x04,0x01,0x51,0x8a,0xf4,0x08,0x92,0xa4,0x39,0x47,0x80,0x50,0x2b,0x03, -0x00,0xa0,0x08,0x21,0x5c,0x46,0x2b,0x44,0x08,0x21,0xd4,0x40,0x14,0x01,0x80,0x11, -0x80,0x22,0x88,0x00,0x13,0xa2,0x74,0xb0,0x03,0x3c,0xb0,0x83,0x36,0x80,0x87,0x71, -0x68,0x03,0x76,0x48,0x07,0x77,0xa8,0x07,0x7c,0x68,0x83,0x73,0x70,0x87,0x7a,0xd8, -0x70,0x0f,0xe5,0xd0,0x06,0xf0,0xa0,0x07,0x73,0x20,0x07,0x7a,0x30,0x07,0x72,0xa0, -0x07,0x73,0x20,0x07,0x6d,0x90,0x0e,0x71,0xa0,0x07,0x78,0xa0,0x07,0x78,0xd0,0x06, -0xe9,0x80,0x07,0x7a,0x80,0x07,0x7a,0x80,0x07,0x6d,0x90,0x0e,0x71,0x60,0x07,0x7a, -0x10,0x07,0x76,0xa0,0x07,0x71,0x60,0x07,0x6d,0x90,0x0e,0x73,0x20,0x07,0x7a,0x30, -0x07,0x72,0xa0,0x07,0x73,0x20,0x07,0x6d,0x90,0x0e,0x76,0x40,0x07,0x7a,0x30,0x07, -0x72,0xa0,0x07,0x76,0x40,0x07,0x6d,0x60,0x0e,0x73,0x20,0x07,0x7a,0x30,0x07,0x72, -0xa0,0x07,0x73,0x20,0x07,0x6d,0x60,0x0e,0x76,0x40,0x07,0x7a,0x30,0x07,0x72,0xa0, -0x07,0x76,0x40,0x07,0x6d,0x60,0x0f,0x76,0x40,0x07,0x7a,0x60,0x07,0x74,0xa0,0x07, -0x76,0x40,0x07,0x6d,0x60,0x0f,0x71,0x20,0x07,0x78,0xa0,0x07,0x71,0x20,0x07,0x78, -0xa0,0x07,0x71,0x20,0x07,0x78,0xd0,0x06,0xe1,0x00,0x07,0x7a,0x00,0x07,0x7a,0x60, -0x07,0x74,0xd0,0x06,0xe6,0x80,0x07,0x70,0xa0,0x07,0x71,0x20,0x07,0x78,0xa0,0x07, -0x71,0x20,0x07,0x78,0xa0,0xf3,0x40,0x88,0x04,0x32,0x32,0x02,0x04,0x20,0x76,0x46, -0xfc,0x6c,0x48,0x92,0x00,0x40,0x00,0x00,0x00,0x00,0x0c,0x49,0x12,0x20,0x00,0x00, -0x00,0x00,0x80,0x21,0x89,0x02,0x00,0x01,0x00,0x00,0x00,0x30,0x24,0x59,0x00,0x20, -0x08,0x00,0x00,0x00,0x86,0x24,0x0a,0x00,0x04,0x00,0x00,0x00,0xc0,0x90,0x84,0x01, -0x02,0x00,0x00,0x00,0x00,0x18,0x92,0x1c,0x40,0x00,0x00,0x00,0x00,0x00,0x43,0x12, -0x05,0x00,0x02,0x00,0x00,0x00,0x60,0x48,0x72,0x00,0x01,0x00,0x00,0x00,0x00,0x0c, -0x49,0x14,0x00,0x08,0x00,0x00,0x00,0x80,0x21,0x49,0x01,0x00,0x41,0x00,0x00,0x00, -0x90,0x05,0x02,0x00,0x10,0x00,0x00,0x00,0x32,0x1e,0x98,0x10,0x19,0x11,0x4c,0x90, +0x0c,0x1f,0x00,0x00,0x49,0x18,0x00,0x00,0x03,0x00,0x00,0x00,0x0b,0x84,0xff,0xff, +0xff,0xff,0x1f,0xc0,0x00,0x00,0x00,0x00,0x51,0x20,0x00,0x00,0x12,0x00,0x00,0x00, +0x32,0x22,0x48,0x09,0x20,0x65,0x82,0x84,0x00,0x26,0x45,0x48,0x05,0x09,0x26,0x45, +0xc6,0x05,0x42,0x52,0x26,0x08,0xae,0x19,0x80,0x61,0x04,0x02,0x98,0x23,0x00,0x83, +0x29,0x80,0x21,0x00,0xb2,0x73,0x04,0x01,0x51,0x8a,0xf4,0x08,0x92,0xa4,0x39,0x47, +0x80,0x50,0x2b,0x03,0x00,0xa0,0x08,0x21,0x5c,0x46,0x2b,0x44,0x08,0x21,0xd4,0x40, +0x14,0x01,0x80,0x11,0x80,0x22,0x88,0x00,0x13,0x30,0x7c,0xc0,0x03,0x3b,0xf8,0x05, +0x3b,0xa0,0x83,0x36,0xa8,0x07,0x77,0x58,0x07,0x77,0x78,0x87,0x7b,0x70,0x87,0x36, +0x60,0x87,0x74,0x70,0x87,0x7a,0xc0,0x87,0x36,0x38,0x07,0x77,0xa8,0x87,0x0d,0xf7, +0x50,0x0e,0x6d,0x00,0x0f,0x7a,0x60,0x07,0x74,0xa0,0x07,0x76,0x40,0x07,0x7a,0x60, +0x07,0x74,0xd0,0x06,0xe9,0x10,0x07,0x7a,0x80,0x07,0x7a,0x80,0x07,0x6d,0x90,0x0e, +0x78,0xa0,0x07,0x78,0xa0,0x07,0x78,0xd0,0x06,0xe9,0x10,0x07,0x76,0xa0,0x07,0x71, +0x60,0x07,0x7a,0x10,0x07,0x76,0xd0,0x06,0xe9,0x30,0x07,0x72,0xa0,0x07,0x73,0x20, +0x07,0x7a,0x30,0x07,0x72,0xd0,0x06,0xe9,0x60,0x07,0x74,0xa0,0x07,0x76,0x40,0x07, +0x7a,0x60,0x07,0x74,0xd0,0x06,0xe6,0x30,0x07,0x72,0xa0,0x07,0x73,0x20,0x07,0x7a, +0x30,0x07,0x72,0xd0,0x06,0xe6,0x60,0x07,0x74,0xa0,0x07,0x76,0x40,0x07,0x7a,0x60, +0x07,0x74,0xd0,0x06,0xf6,0x60,0x07,0x74,0xa0,0x07,0x76,0x40,0x07,0x7a,0x60,0x07, +0x74,0xd0,0x06,0xf6,0x10,0x07,0x72,0x80,0x07,0x7a,0x10,0x07,0x72,0x80,0x07,0x7a, +0x10,0x07,0x72,0x80,0x07,0x6d,0x10,0x0e,0x70,0xa0,0x07,0x70,0xa0,0x07,0x76,0x40, +0x07,0x6d,0x60,0x0e,0x78,0x00,0x07,0x7a,0x10,0x07,0x72,0x80,0x07,0x7a,0x10,0x07, +0x72,0x80,0x07,0x3a,0x0f,0x84,0x48,0x20,0x23,0x24,0x40,0x00,0x62,0x67,0x88,0x9f, +0x19,0x92,0x24,0x00,0x10,0x04,0x00,0x00,0x00,0x43,0x92,0x04,0x08,0x00,0x00,0x00, +0x00,0x60,0x48,0xa2,0x00,0x40,0x10,0x00,0x00,0x00,0x0c,0x49,0x16,0x00,0x08,0x02, +0x00,0x00,0x80,0x21,0x89,0x02,0x00,0x41,0x00,0x00,0x00,0x30,0x24,0x61,0x80,0x00, +0x00,0x00,0x00,0x00,0x86,0x24,0x07,0x10,0x00,0x00,0x00,0x00,0xc0,0x90,0x44,0x01, +0x80,0x20,0x00,0x00,0x00,0x18,0x92,0x1c,0x40,0x00,0x00,0x00,0x00,0x00,0x43,0x12, +0x05,0x00,0x82,0x00,0x00,0x00,0x60,0x48,0x52,0x00,0x40,0x10,0x00,0x00,0x00,0x64, +0x81,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x32,0x1e,0x98,0x10,0x19,0x11,0x4c,0x90, 0x8c,0x09,0x26,0x47,0xc6,0x04,0x43,0x8a,0x8a,0x59,0x8b,0x43,0x50,0xd2,0x09,0x02, 0x81,0xd2,0x73,0x50,0xc9,0x0c,0x2a,0x99,0x41,0x25,0x33,0xa8,0x64,0x56,0x28,0x66, 0x2d,0x0e,0x41,0xcf,0x2a,0x15,0x04,0x4a,0xcf,0x41,0x25,0x33,0xa8,0x64,0x06,0x95, 0xcc,0xa0,0x92,0x59,0x01,0x00,0x00,0x00,0x53,0x82,0x26,0x0c,0x04,0x00,0x00,0x00, 0x22,0x00,0x00,0x00,0x13,0x04,0x41,0x2c,0x10,0x00,0x00,0x00,0x05,0x00,0x00,0x00, 0x04,0xc6,0x08,0x40,0x10,0x04,0xe1,0x70,0x18,0x23,0x00,0x41,0x10,0x84,0xc3,0x60, -0x04,0x00,0x00,0x00,0x93,0x0c,0xce,0x43,0x4c,0x31,0x3c,0x8e,0x34,0xc9,0x30,0x41, -0xc2,0x14,0x03,0x34,0x51,0x93,0x0c,0x4d,0x44,0x4c,0x31,0x44,0x8d,0x35,0x56,0x01, -0x04,0xc3,0x55,0x21,0x16,0x0e,0x04,0x00,0x0f,0x00,0x00,0x00,0x46,0x41,0x08,0xcc, -0x73,0x9b,0x05,0x21,0x30,0xcf,0x6e,0x18,0x84,0x00,0x2c,0x8b,0x35,0x04,0x80,0x39, -0x04,0x81,0x5d,0x20,0x80,0x0f,0x0c,0x43,0xe4,0xd3,0x36,0x81,0x04,0x3e,0x30,0x0c, -0x91,0x4f,0x5b,0x05,0x12,0xf8,0xc0,0x30,0x44,0x7e,0x7d,0x00,0x05,0xd1,0x4c,0x11, -0x66,0x12,0x83,0xc0,0x3c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00, -0x2a,0x00,0x00,0x00,0x13,0x04,0x43,0x2c,0x10,0x00,0x00,0x00,0x08,0x00,0x00,0x00, +0x04,0x00,0x00,0x00,0xc3,0x0d,0xce,0x43,0x4c,0x37,0x3c,0x8e,0x34,0xdc,0x30,0x41, +0xc2,0x74,0x03,0x34,0x51,0xc3,0x0d,0x4d,0x44,0x4c,0x37,0x44,0x8d,0x35,0x56,0x01, +0x04,0xc3,0x55,0x21,0x16,0x0e,0x04,0x00,0x0f,0x00,0x00,0x00,0xd6,0x10,0x00,0xe6, +0x10,0x04,0x76,0x81,0x00,0x3e,0x30,0x0c,0x91,0x4f,0x1b,0x05,0x21,0x30,0x8f,0x6d, +0x13,0x48,0xe0,0x03,0xc3,0x10,0xf9,0xb4,0x55,0x20,0x81,0x0f,0x0c,0x43,0xe4,0xd7, +0x66,0x41,0x08,0xcc,0xa3,0x1f,0x40,0x41,0x34,0x53,0x84,0x99,0xc4,0x20,0x30,0x8f, +0x61,0x10,0x02,0xb0,0x2c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00, +0x27,0x00,0x00,0x00,0x13,0x04,0x43,0x2c,0x10,0x00,0x00,0x00,0x08,0x00,0x00,0x00, 0x24,0x8a,0xa0,0x0c,0x46,0x00,0x4a,0x80,0xc2,0x1c,0x84,0x55,0x55,0xd6,0x1c,0x84, 0x45,0x51,0x16,0x81,0x19,0x80,0x11,0x80,0x31,0x02,0x10,0x04,0x41,0xfc,0x03,0x00, -0x63,0x08,0x0d,0x34,0xc9,0x70,0x55,0xc2,0x2c,0x43,0x20,0x60,0x73,0x0c,0xd3,0x15, +0x63,0x08,0x0d,0x34,0xdc,0x70,0x55,0xc2,0x2c,0x43,0x20,0x60,0x73,0x0c,0xd3,0x15, 0x8d,0x21,0x34,0xd1,0x18,0x42,0xf3,0x8c,0x55,0x00,0x81,0xa0,0x6d,0x73,0x0c,0x19, -0xe7,0x60,0x87,0x52,0x38,0x10,0x00,0x00,0x13,0x00,0x00,0x00,0x17,0x60,0x20,0xc5, -0x74,0x10,0x8d,0x65,0x14,0x13,0xf3,0xd4,0xb4,0x6d,0x14,0x13,0xf3,0xd4,0xb8,0x69, -0x14,0x13,0xf3,0xd4,0xb6,0x75,0x14,0x13,0xf3,0xd4,0xba,0x35,0x0c,0x13,0xf3,0x9c, -0x80,0xe4,0x36,0x48,0x81,0x10,0xc3,0x4a,0x4c,0x54,0xd4,0x6c,0x8b,0x23,0x28,0x76, -0x41,0x4c,0xcc,0xa3,0x1b,0x07,0x21,0x00,0xcb,0x72,0x00,0x05,0xd1,0x4c,0x11,0x66, -0x18,0x83,0xc0,0x3c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00, -0x81,0x00,0x00,0x00,0x13,0x04,0x4d,0x2c,0x10,0x00,0x00,0x00,0x04,0x00,0x00,0x00, -0x24,0xca,0x60,0x04,0xa0,0x04,0x8a,0x80,0xc2,0x0c,0x00,0x91,0x11,0x00,0x00,0x00, -0x63,0x08,0x4d,0x64,0x16,0xc1,0x49,0x86,0xab,0x22,0x66,0x19,0x02,0x01,0x1b,0x43, -0x70,0xa2,0x59,0x82,0x61,0x0c,0xe1,0x89,0x66,0x09,0x86,0x81,0x0a,0x20,0x0b,0x34, -0x61,0x8e,0x81,0xda,0xa2,0x31,0x84,0x46,0xb2,0x8e,0xe0,0x24,0x83,0x57,0x11,0xb3, -0x0c,0x44,0xf1,0x8d,0x21,0x38,0xd2,0x2c,0x81,0x31,0x86,0xf0,0x48,0xb3,0x04,0xc6, -0x40,0x05,0x00,0x06,0x44,0x18,0x14,0x73,0x0c,0x9c,0x18,0x48,0x63,0x08,0xcd,0x64, -0x64,0x40,0x70,0x92,0xa1,0x0c,0x2a,0x62,0x96,0xe1,0x40,0xcc,0x60,0x0c,0xc1,0x99, -0x66,0x09,0x92,0x31,0x84,0x67,0x9a,0x25,0x48,0x06,0x2a,0x80,0x33,0x38,0xd0,0x00, -0x99,0x63,0x18,0x83,0x34,0x98,0xc6,0x10,0x1a,0xc8,0xd6,0x80,0xe0,0x24,0x03,0x1b, -0x54,0xc4,0x2c,0x83,0xb2,0xb4,0xc1,0x18,0x82,0x03,0xcd,0x12,0x30,0x63,0x08,0x0f, -0x34,0x4b,0xc0,0x0c,0x54,0x00,0x6e,0xa0,0xbc,0xc1,0x32,0xc7,0xa0,0x06,0x70,0x00, -0x61,0x1c,0x84,0x03,0x01,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x76,0x52,0x4c,0xcc, -0x73,0xd3,0x24,0x05,0x64,0xec,0xcd,0x8d,0xcc,0xe5,0x87,0x46,0xc6,0x50,0x8a,0x89, -0x79,0xee,0xdb,0x54,0x8a,0x89,0x79,0xee,0xdd,0x1a,0x88,0x89,0x79,0x68,0x73,0x20, -0x26,0xe6,0xa9,0xed,0x81,0x98,0x98,0xc7,0x36,0x0b,0x62,0x62,0x9e,0xdb,0x32,0x88, -0x89,0x79,0x72,0xd3,0x20,0x26,0xe6,0xd9,0x8d,0x83,0x98,0x98,0xa7,0xb7,0x95,0x62, -0x62,0x9e,0xbb,0x27,0x2d,0x20,0x63,0x6f,0x6e,0x64,0x2e,0x3a,0x34,0x35,0x56,0x62, -0x08,0x4e,0x53,0xd9,0xba,0xb5,0x14,0x02,0xf3,0xe0,0xf5,0x25,0x2c,0x82,0xd3,0x0c, -0xbe,0xe0,0x34,0xd3,0x8d,0x9b,0x88,0x21,0x38,0xcd,0x60,0xd7,0x24,0x01,0x63,0xec, -0xcd,0x8d,0xcc,0x45,0x87,0x44,0x80,0x8c,0xbd,0xb9,0x91,0xb9,0xfc,0xc4,0xd0,0x90, -0x02,0x8c,0xb1,0x37,0x37,0x32,0x97,0x1f,0x73,0x29,0x26,0xe6,0xc1,0x71,0x7b,0x29, -0x26,0xe6,0xc1,0x77,0xfb,0x28,0x04,0xe6,0xa9,0x6f,0x52,0x01,0x32,0xf6,0xe6,0x46, -0xe6,0xa2,0x13,0x73,0x63,0x18,0x83,0xc0,0x3c,0xb6,0x41,0x08,0x4e,0x33,0x58,0x47, -0x31,0x31,0x4f,0x5d,0x1f,0xc3,0x22,0x38,0xcd,0xe0,0x0b,0x4e,0x33,0xe1,0xbc,0xa5, -0x18,0x82,0xd3,0x0c,0x77,0x6e,0x20,0xc5,0xc4,0x3c,0xb5,0x4e,0x3a,0x40,0xc6,0xde, -0xdc,0xc8,0x5c,0x7e,0x64,0x70,0x2c,0xa4,0x98,0x98,0xa7,0xee,0xed,0x82,0x10,0x9c, -0xa6,0xba,0x81,0x44,0x70,0x9a,0xc1,0x17,0x9c,0x66,0x32,0x93,0x42,0x60,0x1e,0x7b, -0xb7,0x98,0x62,0x62,0x9e,0xbc,0x36,0x16,0x43,0x70,0x9a,0x0a,0xa7,0x6d,0xa4,0x98, -0x98,0xc7,0xbe,0x8d,0xa4,0x98,0x98,0xc7,0xce,0x0d,0xc6,0x10,0x9c,0x66,0xc0,0x7b, -0x12,0x02,0x32,0xf6,0xe6,0x46,0xe6,0xa2,0x33,0x13,0x73,0x06,0x8b,0xe0,0x34,0x83, -0x2f,0x38,0xcd,0x64,0xd3,0xe6,0x61,0x08,0x4e,0x53,0xd5,0xf6,0x01,0x14,0x44,0x33, -0x45,0x18,0x00,0x00,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,0x4a,0x00,0x00,0x00, -0x13,0x04,0x41,0x2c,0x10,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x24,0xca,0x60,0x04, -0xa0,0x04,0x8a,0x80,0xc2,0x0c,0x00,0xb9,0x61,0x0c,0x04,0x10,0x1e,0xe1,0x19,0xc6, -0x40,0x02,0xe1,0x11,0x1e,0x00,0x00,0x00,0x63,0x08,0xcd,0x63,0x15,0xc1,0x31,0x84, -0x06,0xb2,0x8b,0xe0,0x18,0x42,0x13,0x59,0x46,0x70,0x0c,0xa1,0x71,0x6c,0x23,0x38, -0x16,0x02,0x04,0xc7,0x64,0x61,0x1a,0x37,0x16,0x01,0x04,0x48,0x35,0xc7,0x20,0x79, -0xcf,0x58,0x04,0x10,0x20,0xd5,0x1c,0xc3,0x07,0x06,0xd0,0x58,0x04,0x10,0x20,0xd5, -0x1c,0x43,0x18,0x88,0x41,0x34,0x16,0x01,0x04,0x48,0x35,0xc7,0x30,0x06,0x64,0xe0, -0x98,0x47,0xd0,0xc0,0x80,0xa0,0x89,0x01,0x41,0x23,0x03,0x82,0x63,0x21,0x40,0x70, -0x50,0x66,0x70,0x06,0x68,0x90,0x06,0x58,0x06,0xe1,0x40,0x00,0x25,0x00,0x00,0x00, -0x56,0x52,0x4c,0xcc,0x73,0xd3,0x56,0x41,0x4c,0xcc,0x53,0xdb,0x05,0x31,0x31,0xcf, -0x6d,0x19,0xc4,0xc4,0x3c,0xba,0x6d,0x10,0x13,0xf3,0xf4,0xd6,0x41,0x08,0xc0,0xb2, -0x18,0x46,0x21,0x38,0x4d,0x85,0x9b,0x46,0x21,0x38,0x4d,0xb5,0x9b,0x8a,0x21,0x00, -0xcb,0x82,0xdf,0x66,0x62,0x08,0x4e,0x53,0xdd,0xb7,0x9d,0x18,0x82,0xd3,0x54,0xb7, -0x6e,0x28,0x86,0xe0,0x34,0xd5,0xdd,0xdb,0x47,0x31,0x31,0x4f,0x9d,0x9b,0x87,0x21, -0x00,0xcb,0x52,0xdf,0x06,0x62,0x08,0xc0,0xb2,0xd4,0xbc,0x59,0x10,0x82,0xd3,0x54, -0x96,0x62,0x08,0x4e,0x53,0xe1,0xb6,0x85,0x14,0x13,0xf3,0xd8,0xb4,0x8d,0x14,0x13, -0xf3,0xd8,0xb9,0x89,0x18,0x02,0xb0,0x2c,0xf6,0x6d,0x24,0x86,0x00,0x2c,0x8b,0xcd, -0x1b,0x87,0x21,0x38,0x4d,0x55,0xd3,0xd6,0x30,0x54,0xc0,0x72,0x00,0x05,0xd1,0x4c, -0x11,0x06,0x00,0x00,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,0x19,0x00,0x00,0x00, -0x13,0x04,0x41,0x2c,0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x24,0x4a,0x60,0x04, -0x80,0xc2,0x0c,0x00,0x00,0x00,0x00,0x00,0x63,0x08,0xcd,0x33,0x16,0x01,0x04,0x48, -0x34,0xc7,0x00,0x49,0xcf,0x58,0x04,0x10,0x28,0xd1,0x1c,0xc3,0x44,0x39,0x58,0x85, -0x03,0x01,0x00,0x00,0x0a,0x00,0x00,0x00,0x16,0x41,0x4c,0xcc,0x63,0xdb,0x04,0x31, -0x31,0x4f,0x6e,0x0d,0x43,0x05,0x2c,0x07,0x50,0x10,0xcd,0x14,0x61,0x56,0x41,0x4c, -0xcc,0xd3,0x1b,0x45,0x21,0x00,0xcb,0xb2,0x9b,0x04,0x21,0x00,0xcb,0x02,0x00,0x00, -0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,0x1b,0x00,0x00,0x00,0x13,0x04,0x41,0x2c, -0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x24,0xca,0x60,0x04,0xa0,0x04,0x8a,0x80, -0xc2,0x0c,0x00,0x00,0x63,0x08,0xcd,0x33,0x16,0x01,0x04,0xca,0x34,0xc7,0x20,0x51, -0xcf,0x1c,0x43,0x45,0x41,0x73,0x0c,0x16,0x15,0xcd,0x31,0x5c,0x94,0x83,0x58,0x38, -0x10,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x86,0x51,0x4c,0xcc,0x53,0xe7,0x76,0x51, -0x4c,0xcc,0x53,0xdb,0x36,0x41,0x4c,0xcc,0x63,0x5b,0x05,0x31,0x31,0x8f,0x6e,0x0d, -0x43,0x05,0x2c,0x66,0x41,0x4c,0xcc,0xd3,0x1f,0x40,0x41,0x34,0x53,0x84,0x19,0x05, -0x21,0x00,0xcb,0x02,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,0x2f,0x00,0x00,0x00, -0x13,0x04,0x45,0x2c,0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x24,0xca,0xa0,0x04, -0x46,0x00,0x8a,0x80,0xc0,0x08,0x00,0x00,0x63,0x08,0x0d,0x34,0xc9,0x30,0x49,0xc4, -0x2c,0x03,0x11,0x50,0x63,0x08,0xcd,0x33,0xc9,0x50,0x49,0xc4,0x2c,0x03,0x21,0x58, -0x63,0x08,0x4d,0x34,0xc9,0x70,0x49,0xc4,0x2c,0x03,0x31,0x60,0x63,0x08,0x8d,0x33, -0xc9,0x90,0x49,0x84,0x69,0x22,0x70,0xc3,0x27,0x1c,0x08,0x00,0x1a,0x00,0x00,0x00, -0x96,0x51,0x4c,0xcc,0x53,0xdf,0x66,0x41,0x08,0xcc,0x83,0xdb,0x04,0x31,0x31,0x4f, -0x6d,0x15,0xc4,0xc4,0x3c,0xb7,0x61,0x10,0x02,0xf3,0xf0,0x47,0x20,0xb9,0x0d,0x52, -0x20,0xc4,0xb0,0x12,0x13,0x15,0x35,0xdb,0xe2,0x08,0x8a,0x5d,0x10,0x13,0xf3,0xec, -0x37,0x90,0x2c,0x4e,0xf4,0x47,0x87,0x54,0xd7,0x17,0x70,0x2c,0x4e,0xf4,0x47,0x87, -0x74,0x02,0xc8,0xe2,0x44,0x7f,0x74,0x48,0xb9,0x69,0x14,0x02,0xf3,0xd4,0xb8,0x6d, -0x18,0x11,0x31,0x55,0xc0,0x62,0x0d,0x43,0x05,0x2c,0x07,0x50,0x10,0xcd,0x14,0x61, -0x46,0x31,0x08,0xcc,0x03,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x71,0x20,0x00,0x00, -0x12,0x00,0x00,0x00,0x66,0x40,0x54,0x82,0x23,0x59,0xc2,0x20,0x09,0x92,0x1d,0x18, -0x4f,0x84,0x34,0x53,0x61,0x03,0xc4,0xe3,0x58,0x85,0x05,0x14,0xbe,0x34,0x45,0xb5, -0x21,0x10,0x82,0x23,0x15,0x46,0x30,0x2c,0xc8,0x64,0x02,0x06,0xf0,0x3c,0x91,0x73, -0x19,0x00,0xe1,0x4b,0x53,0x64,0x0a,0x84,0x84,0x34,0x85,0x31,0x10,0x0a,0xb2,0x3c, -0x56,0x30,0x08,0xcc,0x63,0x0b,0x44,0x25,0x21,0x0d,0x00,0x00,0x00,0x00,0x00,0x00}; +0xe7,0x60,0x87,0x52,0x38,0x10,0x00,0x00,0x10,0x00,0x00,0x00,0x27,0x50,0x20,0x05, +0xd1,0x0c,0x17,0x60,0x20,0xc5,0x74,0x10,0x8d,0x65,0x14,0x13,0xf3,0xd4,0xb4,0x6d, +0x14,0x13,0xf3,0xd4,0xb8,0x69,0x14,0x13,0xf3,0xd4,0xb6,0x75,0x14,0x13,0xf3,0xd4, +0xba,0x35,0x0c,0x13,0xf3,0xd8,0x05,0x31,0x31,0x8f,0x6e,0x1c,0x84,0x00,0x2c,0xcb, +0x01,0x14,0x44,0x33,0x45,0x98,0x61,0x0c,0x02,0xf3,0x00,0x00,0x00,0x00,0x00,0x00, +0x61,0x20,0x00,0x00,0x81,0x00,0x00,0x00,0x13,0x04,0x4d,0x2c,0x10,0x00,0x00,0x00, +0x04,0x00,0x00,0x00,0x24,0xca,0x60,0x04,0xa0,0x04,0x8a,0x80,0xc2,0x0c,0x00,0x91, +0x11,0x00,0x00,0x00,0x63,0x08,0x4d,0x64,0x16,0xc1,0xe1,0x86,0xab,0x22,0x66,0x19, +0x02,0x01,0x1b,0x43,0x70,0xa2,0x59,0x82,0x61,0x0c,0xe1,0x89,0x66,0x09,0x86,0x81, +0x0a,0x20,0x0b,0x34,0x61,0x8e,0x81,0xda,0xa2,0x31,0x84,0x46,0xb2,0x8e,0xe0,0x70, +0x83,0x57,0x11,0xb3,0x0c,0x44,0xf1,0x8d,0x21,0x38,0xd2,0x2c,0x81,0x31,0x86,0xf0, +0x48,0xb3,0x04,0xc6,0x40,0x05,0x00,0x06,0x44,0x18,0x14,0x73,0x0c,0x9c,0x18,0x48, +0x63,0x08,0xcd,0x64,0x64,0x40,0x70,0xb8,0xa1,0x0c,0x2a,0x62,0x96,0xe1,0x40,0xcc, +0x60,0x0c,0xc1,0x99,0x66,0x09,0x92,0x31,0x84,0x67,0x9a,0x25,0x48,0x06,0x2a,0x80, +0x33,0x38,0xd0,0x00,0x99,0x63,0x18,0x83,0x34,0x98,0xc6,0x10,0x1a,0xc8,0xd6,0x80, +0xe0,0x70,0x03,0x1b,0x54,0xc4,0x2c,0x83,0xb2,0xb4,0xc1,0x18,0x82,0x03,0xcd,0x12, +0x30,0x63,0x08,0x0f,0x34,0x4b,0xc0,0x0c,0x54,0x00,0x6e,0xa0,0xbc,0xc1,0x32,0xc7, +0xa0,0x06,0x70,0x00,0x61,0x1c,0x84,0x03,0x01,0x00,0x00,0x00,0x4e,0x00,0x00,0x00, +0x76,0x52,0x4c,0xcc,0x73,0xd3,0x24,0x05,0x64,0xec,0xcd,0x8d,0xcc,0xe5,0x87,0x46, +0xc6,0x50,0x8a,0x89,0x79,0xee,0xdb,0x54,0x8a,0x89,0x79,0xee,0xdd,0x1a,0x88,0x89, +0x79,0x68,0x73,0x20,0x26,0xe6,0xa9,0xed,0x81,0x98,0x98,0xc7,0x36,0x0b,0x62,0x62, +0x9e,0xdb,0x32,0x88,0x89,0x79,0x72,0xd3,0x20,0x26,0xe6,0xd9,0x8d,0x83,0x98,0x98, +0xa7,0xb7,0x95,0x62,0x62,0x9e,0xbb,0x27,0x2d,0x20,0x63,0x6f,0x6e,0x64,0x2e,0x3a, +0x34,0x35,0x56,0x62,0x08,0x4e,0x53,0xd9,0xba,0xb5,0x14,0x02,0xf3,0xe0,0xf5,0x25, +0x2c,0x82,0xd3,0x0c,0xbe,0xe0,0x34,0xd3,0x8d,0x9b,0x88,0x21,0x38,0xcd,0x60,0xd7, +0x24,0x01,0x63,0xec,0xcd,0x8d,0xcc,0x45,0x87,0x44,0x80,0x8c,0xbd,0xb9,0x91,0xb9, +0xfc,0xc4,0xd0,0x90,0x02,0x8c,0xb1,0x37,0x37,0x32,0x97,0x1f,0x73,0x29,0x26,0xe6, +0xc1,0x71,0x7b,0x29,0x26,0xe6,0xc1,0x77,0xfb,0x28,0x04,0xe6,0xa9,0x6f,0x52,0x01, +0x32,0xf6,0xe6,0x46,0xe6,0xa2,0x13,0x73,0x63,0x18,0x83,0xc0,0x3c,0xb6,0x41,0x08, +0x4e,0x33,0x58,0x47,0x31,0x31,0x4f,0x5d,0x1f,0xc3,0x22,0x38,0xcd,0xe0,0x0b,0x4e, +0x33,0xe1,0xbc,0xa5,0x18,0x82,0xd3,0x0c,0x77,0x6e,0x20,0xc5,0xc4,0x3c,0xb5,0x4e, +0x3a,0x40,0xc6,0xde,0xdc,0xc8,0x5c,0x7e,0x64,0x70,0x2c,0xa4,0x98,0x98,0xa7,0xee, +0x6f,0x20,0x11,0x9c,0x66,0xf0,0x05,0xa7,0x99,0xec,0x82,0x10,0x9c,0xa6,0x32,0x93, +0x42,0x60,0x1e,0x7b,0xb7,0x98,0x62,0x62,0x9e,0xbc,0x36,0x16,0x43,0x70,0x9a,0x0a, +0xa7,0x6d,0xa4,0x98,0x98,0xc7,0xbe,0x8d,0xa4,0x98,0x98,0xc7,0xce,0x0d,0xc6,0x10, +0x9c,0x66,0xc0,0x7b,0x12,0x02,0x32,0xf6,0xe6,0x46,0xe6,0xa2,0x33,0x13,0x73,0x06, +0x8b,0xe0,0x34,0x83,0x2f,0x38,0xcd,0x64,0xd3,0x07,0x50,0x10,0xcd,0x14,0x61,0xe6, +0x61,0x08,0x4e,0x53,0xd5,0x36,0x00,0x00,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00, +0x4a,0x00,0x00,0x00,0x13,0x04,0x41,0x2c,0x10,0x00,0x00,0x00,0x07,0x00,0x00,0x00, +0x24,0xca,0x60,0x04,0xa0,0x04,0x8a,0x80,0xc2,0x0c,0x00,0xb9,0x61,0x0c,0x04,0x10, +0x1e,0xe1,0x19,0xc6,0x40,0x02,0xe1,0x11,0x1e,0x00,0x00,0x00,0x63,0x08,0xcd,0x63, +0x15,0xc1,0x31,0x84,0x06,0xb2,0x8b,0xe0,0x18,0x42,0x13,0x59,0x46,0x70,0x0c,0xa1, +0x71,0x6c,0x23,0x38,0x16,0x02,0x04,0xc7,0x64,0x61,0x1a,0x37,0x16,0x01,0x04,0x48, +0x35,0xc7,0x20,0x79,0xcf,0x58,0x04,0x10,0x20,0xd5,0x1c,0xc3,0x07,0x06,0xd0,0x58, +0x04,0x10,0x20,0xd5,0x1c,0x43,0x18,0x88,0x41,0x34,0x16,0x01,0x04,0x48,0x35,0xc7, +0x30,0x06,0x64,0xe0,0x98,0x47,0xd0,0xc0,0x80,0xa0,0x89,0x01,0x41,0x23,0x03,0x82, +0x63,0x21,0x40,0x70,0x50,0x66,0x70,0x06,0x68,0x90,0x06,0x58,0x06,0xe1,0x40,0x00, +0x25,0x00,0x00,0x00,0x56,0x52,0x4c,0xcc,0x73,0xd3,0x56,0x41,0x4c,0xcc,0x53,0xdb, +0x05,0x31,0x31,0xcf,0x6d,0x19,0xc4,0xc4,0x3c,0xba,0x6d,0x10,0x13,0xf3,0xf4,0xd6, +0x41,0x08,0xc0,0xb2,0x18,0x46,0x21,0x38,0x4d,0x85,0x9b,0x46,0x21,0x38,0x4d,0xb5, +0x9b,0x8a,0x21,0x00,0xcb,0x82,0xdf,0x66,0x62,0x08,0x4e,0x53,0xdd,0xb7,0x9d,0x18, +0x82,0xd3,0x54,0xb7,0x6e,0x28,0x86,0xe0,0x34,0xd5,0xdd,0xdb,0x47,0x31,0x31,0x4f, +0x9d,0x9b,0x87,0x21,0x00,0xcb,0x52,0xdf,0x06,0x62,0x08,0xc0,0xb2,0xd4,0xbc,0x59, +0x10,0x82,0xd3,0x54,0x96,0x62,0x08,0x4e,0x53,0xe1,0xb6,0x85,0x14,0x13,0xf3,0xd8, +0xb4,0x8d,0x14,0x13,0xf3,0xd8,0xb9,0x89,0x18,0x02,0xb0,0x2c,0xf6,0x6d,0x24,0x86, +0x00,0x2c,0x8b,0xcd,0x1b,0x87,0x21,0x38,0x4d,0x55,0xd3,0xd6,0x30,0x54,0xc0,0x72, +0x00,0x05,0xd1,0x4c,0x11,0x06,0x00,0x00,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00, +0x19,0x00,0x00,0x00,0x13,0x04,0x41,0x2c,0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00, +0x24,0x4a,0x60,0x04,0x80,0xc2,0x0c,0x00,0x00,0x00,0x00,0x00,0x63,0x08,0xcd,0x33, +0x16,0x01,0x04,0x48,0x34,0xc7,0x00,0x49,0xcf,0x58,0x04,0x10,0x28,0xd1,0x1c,0xc3, +0x44,0x39,0x58,0x85,0x03,0x01,0x00,0x00,0x0a,0x00,0x00,0x00,0x26,0x41,0x08,0xc0, +0xb2,0x18,0x45,0x21,0x00,0xcb,0xb2,0x5b,0x04,0x31,0x31,0x8f,0x6d,0x13,0xc4,0xc4, +0x3c,0xb9,0x35,0x0c,0x15,0xb0,0x58,0x05,0x31,0x31,0x4f,0x7f,0x00,0x05,0xd1,0x4c, +0x11,0x06,0x00,0x00,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,0x1b,0x00,0x00,0x00, +0x13,0x04,0x41,0x2c,0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x24,0xca,0x60,0x04, +0xa0,0x04,0x8a,0x80,0xc2,0x0c,0x00,0x00,0x63,0x08,0xcd,0x33,0x16,0x01,0x04,0xca, +0x34,0xc7,0x20,0x51,0xcf,0x1c,0x43,0x45,0x41,0x73,0x0c,0x16,0x15,0xcd,0x31,0x5c, +0x94,0x83,0x58,0x38,0x10,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x76,0x51,0x4c,0xcc, +0x53,0xdb,0x86,0x51,0x4c,0xcc,0x53,0xe7,0x36,0x41,0x4c,0xcc,0x63,0x5b,0x05,0x31, +0x31,0x8f,0x6e,0x16,0xc4,0xc4,0x3c,0xbd,0x51,0x10,0x02,0xb0,0x2c,0xd6,0x30,0x54, +0xc0,0x72,0x00,0x05,0xd1,0x4c,0x11,0x06,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00, +0x2c,0x00,0x00,0x00,0x13,0x04,0x45,0x2c,0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00, +0x24,0xca,0xa0,0x04,0x46,0x00,0x8a,0x80,0xc0,0x08,0x00,0x00,0x63,0x08,0x0d,0x34, +0xdc,0x30,0x49,0xc4,0x2c,0x03,0x11,0x50,0x63,0x08,0xcd,0x33,0xdc,0x50,0x49,0xc4, +0x2c,0x03,0x21,0x58,0x63,0x08,0x4d,0x34,0xdc,0x70,0x49,0xc4,0x2c,0x03,0x31,0x60, +0x63,0x08,0x8d,0x33,0xdc,0x90,0x49,0x84,0x69,0x22,0x70,0xc3,0x27,0x1c,0x08,0x00, +0x17,0x00,0x00,0x00,0x96,0x51,0x4c,0xcc,0x53,0xdf,0x66,0x41,0x08,0xcc,0x83,0xdb, +0x04,0x31,0x31,0x4f,0x6d,0x15,0xc4,0xc4,0x3c,0xb7,0x61,0x10,0x02,0xf3,0xf0,0x76, +0x41,0x4c,0xcc,0xb3,0x1f,0x81,0x11,0x11,0x13,0x15,0x35,0x37,0x90,0x2c,0x4e,0xf4, +0x47,0x87,0x54,0xd7,0x17,0x70,0x2c,0x4e,0xf4,0x47,0x87,0x74,0x02,0xc8,0xe2,0x44, +0x7f,0x74,0x48,0xb9,0x69,0x14,0x02,0xf3,0xd4,0xb8,0x6d,0x18,0x11,0x31,0x55,0xc0, +0x62,0x0d,0x43,0x05,0x2c,0x07,0x50,0x10,0xcd,0x14,0x61,0x46,0x31,0x08,0xcc,0x03, +0x00,0x00,0x00,0x00,0x71,0x20,0x00,0x00,0x12,0x00,0x00,0x00,0x66,0x40,0x54,0x82, +0x23,0x19,0xc3,0xa0,0x20,0x8b,0x1d,0x18,0x4f,0x84,0x34,0x53,0x61,0x03,0xc4,0xe3, +0x58,0x85,0x05,0x14,0xbe,0x34,0x45,0xb5,0x21,0x10,0x82,0x23,0x15,0x46,0x30,0x2c, +0xc8,0x64,0x02,0x06,0xf0,0x3c,0x91,0x73,0x19,0x00,0xe1,0x4b,0x53,0x64,0x0a,0x84, +0x84,0x34,0x85,0x25,0x0c,0x92,0x20,0x59,0xc1,0x20,0x30,0x8f,0x2d,0x10,0x95,0x84, +0x34,0x00,0x00,0x00,0x00,0x00,0x00,0x00}; diff --git a/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp b/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp index e64bfb1c6c..3a2f2878a3 100644 --- a/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp +++ b/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp @@ -46,6 +46,7 @@ #include "tgsi/tgsi_dump.h" #include "util/u_memory.h" +#include "util/u_math.h" #include <llvm/Module.h> #include <llvm/CallingConv.h> @@ -157,8 +158,8 @@ void gallivm_cpu_jit_compile(struct gallivm_cpu_engine *cpu, struct gallivm_prog llvm::ExistingModuleProvider *mp = new llvm::ExistingModuleProvider(mod); llvm::ExecutionEngine *ee = cpu->engine; assert(ee); - /*FIXME : remove */ - ee->DisableLazyCompilation(); + /*FIXME : why was this disabled ? we need it for pow/sqrt/... */ + ee->DisableLazyCompilation(false); ee->addModuleProvider(mp); llvm::Function *func = func_for_shader(prog); @@ -201,7 +202,6 @@ int gallivm_cpu_vs_exec(struct gallivm_prog *prog, unsigned int i, j; unsigned slot; vertex_shader_runner runner = reinterpret_cast<vertex_shader_runner>(prog->function); - assert(runner); for (i = 0; i < count; i += MAX_TGSI_VERTICES) { diff --git a/src/gallium/auxiliary/gallivm/instructions.cpp b/src/gallium/auxiliary/gallivm/instructions.cpp index a82dc30306..599975d5ad 100644 --- a/src/gallium/auxiliary/gallivm/instructions.cpp +++ b/src/gallium/auxiliary/gallivm/instructions.cpp @@ -83,6 +83,7 @@ Instructions::Instructions(llvm::Module *mod, llvm::Function *func, llvm::BasicB m_llvmPow = 0; m_llvmFloor = 0; m_llvmFlog = 0; + m_llvmFexp = 0; m_llvmLit = 0; m_fmtPtr = 0; @@ -92,194 +93,271 @@ Instructions::Instructions(llvm::Module *mod, llvm::Function *func, llvm::BasicB m_mod = ParseBitcodeFile(buffer); } +llvm::BasicBlock * Instructions::currentBlock() const +{ + return m_builder.GetInsertBlock(); +} + +llvm::Value * Instructions::abs(llvm::Value *in) +{ + std::vector<llvm::Value*> vec = extractVector(in); + Value *xabs = callFAbs(vec[0]); + Value *yabs = callFAbs(vec[1]); + Value *zabs = callFAbs(vec[2]); + Value *wabs = callFAbs(vec[3]); + return vectorFromVals(xabs, yabs, zabs, wabs); +} + llvm::Value * Instructions::add(llvm::Value *in1, llvm::Value *in2) { return m_builder.CreateAdd(in1, in2, name("add")); } -llvm::Value * Instructions::madd(llvm::Value *in1, llvm::Value *in2, - llvm::Value *in3) +llvm::Value * Instructions::arl(llvm::Value *in) { - Value *mulRes = mul(in1, in2); - return add(mulRes, in3); + return floor(in); } - -llvm::Value * Instructions::mul(llvm::Value *in1, llvm::Value *in2) + +void Instructions::beginLoop() { - return m_builder.CreateMul(in1, in2, name("mul")); + BasicBlock *begin = BasicBlock::Create(name("loop"), m_func,0); + BasicBlock *end = BasicBlock::Create(name("endloop"), m_func,0); + + m_builder.CreateBr(begin); + Loop loop; + loop.begin = begin; + loop.end = end; + m_builder.SetInsertPoint(begin); + m_loopStack.push(loop); } -const char * Instructions::name(const char *prefix) +void Instructions::bgnSub(unsigned label) { - ++m_idx; - snprintf(m_name, 32, "%s%d", prefix, m_idx); - return m_name; + llvm::Function *func = findFunction(label); + + Function::arg_iterator args = func->arg_begin(); + Value *ptr_INPUT = args++; + ptr_INPUT->setName("INPUT"); + m_storage->pushArguments(ptr_INPUT); + + llvm::BasicBlock *entry = BasicBlock::Create("entry", func, 0); + + m_func = func; + m_builder.SetInsertPoint(entry); } -llvm::Value * Instructions::dp3(llvm::Value *in1, llvm::Value *in2) +void Instructions::brk() { - Value *mulRes = mul(in1, in2); - Value *x = m_builder.CreateExtractElement(mulRes, - m_storage->constantInt(0), - name("extractx")); - Value *y = m_builder.CreateExtractElement(mulRes, - m_storage->constantInt(1), - name("extracty")); - Value *z = m_builder.CreateExtractElement(mulRes, - m_storage->constantInt(2), - name("extractz")); - Value *xy = m_builder.CreateAdd(x, y,name("xy")); - Value *dot3 = m_builder.CreateAdd(xy, z, name("dot3")); - return vectorFromVals(dot3, dot3, dot3, dot3); + assert(!m_loopStack.empty()); + BasicBlock *unr = BasicBlock::Create(name("unreachable"), m_func,0); + m_builder.CreateBr(m_loopStack.top().end); + m_builder.SetInsertPoint(unr); } -llvm::Value *Instructions::callFSqrt(llvm::Value *val) +void Instructions::cal(int label, llvm::Value *input) { - if (!m_llvmFSqrt) { - // predeclare the intrinsic - std::vector<const Type*> fsqrtArgs; - fsqrtArgs.push_back(Type::FloatTy); - PAListPtr fsqrtPal; - FunctionType* fsqrtType = FunctionType::get( - /*Result=*/Type::FloatTy, - /*Params=*/fsqrtArgs, - /*isVarArg=*/false); - m_llvmFSqrt = Function::Create( - /*Type=*/fsqrtType, - /*Linkage=*/GlobalValue::ExternalLinkage, - /*Name=*/"llvm.sqrt.f32", m_mod); - m_llvmFSqrt->setCallingConv(CallingConv::C); - m_llvmFSqrt->setParamAttrs(fsqrtPal); - } - CallInst *call = m_builder.CreateCall(m_llvmFSqrt, val, - name("sqrt")); - call->setCallingConv(CallingConv::C); - call->setTailCall(false); - return call; + std::vector<Value*> params; + params.push_back(input); + llvm::Function *func = findFunction(label); + + m_builder.CreateCall(func, params.begin(), params.end()); } -llvm::Value * Instructions::rsq(llvm::Value *in1) +llvm::Value * Instructions::ceil(llvm::Value *in) { - Value *x = m_builder.CreateExtractElement(in1, - m_storage->constantInt(0), - name("extractx")); - Value *abs = callFAbs(x); - Value *sqrt = callFSqrt(abs); - - Value *rsqrt = m_builder.CreateFDiv(ConstantFP::get(APFloat(1.f)), - sqrt, - name("rsqrt")); - return vectorFromVals(rsqrt, rsqrt, rsqrt, rsqrt); + std::vector<llvm::Value*> vec = extractVector(in); + return vectorFromVals(callCeil(vec[0]), callCeil(vec[1]), + callCeil(vec[2]), callCeil(vec[3])); } -llvm::Value * Instructions::vectorFromVals(llvm::Value *x, llvm::Value *y, - llvm::Value *z, llvm::Value *w) +llvm::Value * Instructions::clamp(llvm::Value *in1) { - Constant *const_vec = Constant::getNullValue(m_floatVecType); - Value *res = m_builder.CreateInsertElement(const_vec, x, - m_storage->constantInt(0), - name("vecx")); - res = m_builder.CreateInsertElement(res, y, m_storage->constantInt(1), - name("vecxy")); - res = m_builder.CreateInsertElement(res, z, m_storage->constantInt(2), - name("vecxyz")); - if (w) - res = m_builder.CreateInsertElement(res, w, m_storage->constantInt(3), - name("vecxyzw")); - return res; + llvm::Value *zero = constVector(0.0f, 0.0f, 0.0f, 0.0f); + llvm::Value *one = constVector(1.0f, 1.0f, 1.0f, 1.0f); + return min( max(zero, in1), one); } -llvm::Value *Instructions::callFAbs(llvm::Value *val) +llvm::Value * Instructions::cmp(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3) { - if (!m_llvmFAbs) { - // predeclare the intrinsic - std::vector<const Type*> fabsArgs; - fabsArgs.push_back(Type::FloatTy); - PAListPtr fabsPal; - FunctionType* fabsType = FunctionType::get( - /*Result=*/Type::FloatTy, - /*Params=*/fabsArgs, - /*isVarArg=*/false); - m_llvmFAbs = Function::Create( - /*Type=*/fabsType, - /*Linkage=*/GlobalValue::ExternalLinkage, - /*Name=*/"fabs", m_mod); - m_llvmFAbs->setCallingConv(CallingConv::C); - m_llvmFAbs->setParamAttrs(fabsPal); - } - CallInst *call = m_builder.CreateCall(m_llvmFAbs, val, - name("fabs")); - call->setCallingConv(CallingConv::C); + llvm::Function *func = m_mod->getFunction("cmp"); + assert(func); + + std::vector<Value*> params; + params.push_back(in1); + params.push_back(in2); + params.push_back(in3); + CallInst *call = m_builder.CreateCall(func, params.begin(), params.end(), name("cmpres")); call->setTailCall(false); return call; } -llvm::Value * Instructions::lit(llvm::Value *in) +llvm::Value * Instructions::cnd(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3) { - if (!m_llvmLit) { - m_llvmLit = m_mod->getFunction("lit"); - } - CallInst *call = m_builder.CreateCall(m_llvmLit, in, name("litres")); - call->setCallingConv(CallingConv::C); - call->setTailCall(false); - return call; + std::vector<llvm::Value*> vec1 = extractVector(in1); + std::vector<llvm::Value*> vec2 = extractVector(in2); + std::vector<llvm::Value*> vec3 = extractVector(in3); + Constant *half = ConstantFP::get(APFloat(0.5f)); + + Value *xcmp = m_builder.CreateFCmpOGT(vec1[0], half, name("xcmp")); + Value *selx = m_builder.CreateSelect(xcmp, vec2[0], vec3[0], + name("selx")); + + Value *ycmp = m_builder.CreateFCmpOGT(vec1[1], half, name("ycmp")); + Value *sely = m_builder.CreateSelect(ycmp, vec2[1], vec3[1], + name("sely")); + + Value *zcmp = m_builder.CreateFCmpOGT(vec1[2], half, name("zcmp")); + Value *selz = m_builder.CreateSelect(zcmp, vec2[2], vec3[2], + name("selz")); + + Value *wcmp = m_builder.CreateFCmpOGT(vec1[3], half, name("wcmp")); + Value *selw = m_builder.CreateSelect(wcmp, vec2[3], vec3[3], + name("selw")); + + return vectorFromVals(selx, sely, selz, selw); } -llvm::Value * Instructions::sub(llvm::Value *in1, llvm::Value *in2) +llvm::Value * Instructions::cnd0(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3) { - Value *res = m_builder.CreateSub(in1, in2, name("sub")); - return res; + std::vector<llvm::Value*> vec1 = extractVector(in1); + std::vector<llvm::Value*> vec2 = extractVector(in2); + std::vector<llvm::Value*> vec3 = extractVector(in3); + Constant *zero = Constant::getNullValue(Type::FloatTy); + + Value *xcmp = m_builder.CreateFCmpOGE(vec1[0], zero, name("xcmp")); + Value *selx = m_builder.CreateSelect(xcmp, vec2[0], vec3[0], + name("selx")); + + Value *ycmp = m_builder.CreateFCmpOGE(vec1[1], zero, name("ycmp")); + Value *sely = m_builder.CreateSelect(ycmp, vec2[1], vec3[1], + name("sely")); + + Value *zcmp = m_builder.CreateFCmpOGE(vec1[2], zero, name("zcmp")); + Value *selz = m_builder.CreateSelect(zcmp, vec2[2], vec3[2], + name("selz")); + + Value *wcmp = m_builder.CreateFCmpOGE(vec1[3], zero, name("wcmp")); + Value *selw = m_builder.CreateSelect(wcmp, vec2[3], vec3[3], + name("selw")); + + return vectorFromVals(selx, sely, selz, selw); } -llvm::Value * Instructions::callPow(llvm::Value *val1, llvm::Value *val2) +llvm::Value * Instructions::cos(llvm::Value *in) { - if (!m_llvmPow) { - // predeclare the intrinsic - std::vector<const Type*> powArgs; - powArgs.push_back(Type::FloatTy); - powArgs.push_back(Type::FloatTy); - PAListPtr powPal; - FunctionType* powType = FunctionType::get( - /*Result=*/Type::FloatTy, - /*Params=*/powArgs, - /*isVarArg=*/false); - m_llvmPow = Function::Create( - /*Type=*/powType, - /*Linkage=*/GlobalValue::ExternalLinkage, - /*Name=*/"llvm.pow.f32", m_mod); - m_llvmPow->setCallingConv(CallingConv::C); - m_llvmPow->setParamAttrs(powPal); - } - std::vector<Value*> params; - params.push_back(val1); - params.push_back(val2); - CallInst *call = m_builder.CreateCall(m_llvmPow, params.begin(), params.end(), - name("pow")); - call->setCallingConv(CallingConv::C); +#if 0 + llvm::Function *func = m_mod->getFunction("vcos"); + assert(func); + + CallInst *call = m_builder.CreateCall(func, in, name("cosres")); call->setTailCall(false); return call; +#else + std::vector<llvm::Value*> elems = extractVector(in); + Function *func = m_mod->getFunction("cosf"); + assert(func); + CallInst *cos = m_builder.CreateCall(func, elems[0], name("cosres")); + cos->setCallingConv(CallingConv::C); + cos->setTailCall(true); + return vectorFromVals(cos, cos, cos, cos); +#endif } -llvm::Value * Instructions::pow(llvm::Value *in1, llvm::Value *in2) +llvm::Value * Instructions::cross(llvm::Value *in1, llvm::Value *in2) { Value *x1 = m_builder.CreateExtractElement(in1, m_storage->constantInt(0), name("x1")); + Value *y1 = m_builder.CreateExtractElement(in1, + m_storage->constantInt(1), + name("y1")); + Value *z1 = m_builder.CreateExtractElement(in1, + m_storage->constantInt(2), + name("z1")); + Value *x2 = m_builder.CreateExtractElement(in2, m_storage->constantInt(0), name("x2")); - llvm::Value *val = callPow(x1, x2); - return vectorFromVals(val, val, val, val); + Value *y2 = m_builder.CreateExtractElement(in2, + m_storage->constantInt(1), + name("y2")); + Value *z2 = m_builder.CreateExtractElement(in2, + m_storage->constantInt(2), + name("z2")); + Value *y1z2 = mul(y1, z2); + Value *z1y2 = mul(z1, y2); + + Value *z1x2 = mul(z1, x2); + Value *x1z2 = mul(x1, z2); + + Value *x1y2 = mul(x1, y2); + Value *y1x2 = mul(y1, x2); + + return vectorFromVals(sub(y1z2, z1y2), sub(z1x2, x1z2), sub(x1y2, y1x2)); } -llvm::Value * Instructions::rcp(llvm::Value *in1) +llvm::Value * Instructions::ddx(llvm::Value *in) { - Value *x1 = m_builder.CreateExtractElement(in1, - m_storage->constantInt(0), - name("x1")); - Value *res = m_builder.CreateFDiv(ConstantFP::get(APFloat(1.f)), - x1, name("rcp")); - return vectorFromVals(res, res, res, res); + // FIXME + assert(0); +} + +llvm::Value * Instructions::ddy(llvm::Value *in) +{ + // FIXME + assert(0); +} + +llvm::Value * Instructions::div(llvm::Value *in1, llvm::Value *in2) +{ + return m_builder.CreateFDiv(in1, in2, name("div")); +} + +llvm::Value * Instructions::dot2add(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3) +{ + Value *mulRes = mul(in1, in2); + Value *x = m_builder.CreateExtractElement(mulRes, + m_storage->constantInt(0), + name("extractx")); + Value *y = m_builder.CreateExtractElement(mulRes, + m_storage->constantInt(1), + name("extracty")); + Value *z = m_builder.CreateExtractElement(in3, + m_storage->constantInt(2), + name("extractz")); + Value *xy = m_builder.CreateAdd(x, y,name("xy")); + Value *dot2add = m_builder.CreateAdd(xy, z, name("dot2add")); + return vectorFromVals(dot2add, dot2add, dot2add, dot2add); +} + +llvm::Value * Instructions::dp2(llvm::Value *in1, llvm::Value *in2) +{ + Value *mulRes = mul(in1, in2); + Value *x = m_builder.CreateExtractElement(mulRes, + m_storage->constantInt(0), + name("extractx")); + Value *y = m_builder.CreateExtractElement(mulRes, + m_storage->constantInt(1), + name("extracty")); + Value *xy = m_builder.CreateAdd(x, y,name("xy")); + return vectorFromVals(xy, xy, xy, xy); +} + +llvm::Value * Instructions::dp3(llvm::Value *in1, llvm::Value *in2) +{ + Value *mulRes = mul(in1, in2); + Value *x = m_builder.CreateExtractElement(mulRes, + m_storage->constantInt(0), + name("extractx")); + Value *y = m_builder.CreateExtractElement(mulRes, + m_storage->constantInt(1), + name("extracty")); + Value *z = m_builder.CreateExtractElement(mulRes, + m_storage->constantInt(2), + name("extractz")); + Value *xy = m_builder.CreateAdd(x, y,name("xy")); + Value *dot3 = m_builder.CreateAdd(xy, z, name("dot3")); + return vectorFromVals(dot3, dot3, dot3, dot3); } llvm::Value * Instructions::dp4(llvm::Value *in1, llvm::Value *in2) @@ -321,6 +399,53 @@ llvm::Value * Instructions::dst(llvm::Value *in1, llvm::Value *in2) ry, z, w); } +void Instructions::elseop() +{ + assert(!m_ifStack.empty()); + BasicBlock *ifend = BasicBlock::Create(name("ifend"), m_func,0); + m_builder.CreateBr(ifend); + m_builder.SetInsertPoint(m_ifStack.top()); + currentBlock()->setName(name("ifelse")); + m_ifStack.pop(); + m_ifStack.push(ifend); +} + +void Instructions::endif() +{ + assert(!m_ifStack.empty()); + m_builder.CreateBr(m_ifStack.top()); + m_builder.SetInsertPoint(m_ifStack.top()); + m_ifStack.pop(); +} + +void Instructions::endLoop() +{ + assert(!m_loopStack.empty()); + Loop loop = m_loopStack.top(); + m_builder.CreateBr(loop.begin); + loop.end->moveAfter(currentBlock()); + m_builder.SetInsertPoint(loop.end); + m_loopStack.pop(); +} + +void Instructions::end() +{ + m_builder.CreateRetVoid(); +} + +void Instructions::endSub() +{ + m_func = 0; + m_builder.SetInsertPoint(0); +} + +llvm::Value * Instructions::exp(llvm::Value *in) +{ + std::vector<llvm::Value*> vec = extractVector(in); + return vectorFromVals(callFExp(vec[0]), callFExp(vec[1]), + callFExp(vec[2]), callFExp(vec[3])); +} + llvm::Value * Instructions::ex2(llvm::Value *in) { llvm::Value *val = callPow(ConstantFP::get(APFloat(2.f)), @@ -330,31 +455,6 @@ llvm::Value * Instructions::ex2(llvm::Value *in) return vectorFromVals(val, val, val, val); } -llvm::Value * Instructions::callFloor(llvm::Value *val) -{ - if (!m_llvmFloor) { - // predeclare the intrinsic - std::vector<const Type*> floorArgs; - floorArgs.push_back(Type::FloatTy); - PAListPtr floorPal; - FunctionType* floorType = FunctionType::get( - /*Result=*/Type::FloatTy, - /*Params=*/floorArgs, - /*isVarArg=*/false); - m_llvmFloor = Function::Create( - /*Type=*/floorType, - /*Linkage=*/GlobalValue::ExternalLinkage, - /*Name=*/"floorf", m_mod); - m_llvmFloor->setCallingConv(CallingConv::C); - m_llvmFloor->setParamAttrs(floorPal); - } - CallInst *call = m_builder.CreateCall(m_llvmFloor, val, - name("floorf")); - call->setCallingConv(CallingConv::C); - call->setTailCall(false); - return call; -} - llvm::Value * Instructions::floor(llvm::Value *in) { std::vector<llvm::Value*> vec = extractVector(in); @@ -362,42 +462,52 @@ llvm::Value * Instructions::floor(llvm::Value *in) callFloor(vec[2]), callFloor(vec[3])); } -llvm::Value * Instructions::arl(llvm::Value *in) -{ - return floor(in); -} - llvm::Value * Instructions::frc(llvm::Value *in) { llvm::Value *flr = floor(in); return sub(in, flr); } -llvm::Value * Instructions::callFLog(llvm::Value *val) +void Instructions::ifop(llvm::Value *in) { - if (!m_llvmFlog) { - // predeclare the intrinsic - std::vector<const Type*> flogArgs; - flogArgs.push_back(Type::FloatTy); - PAListPtr flogPal; - FunctionType* flogType = FunctionType::get( - /*Result=*/Type::FloatTy, - /*Params=*/flogArgs, - /*isVarArg=*/false); - m_llvmFlog = Function::Create( - /*Type=*/flogType, - /*Linkage=*/GlobalValue::ExternalLinkage, - /*Name=*/"logf", m_mod); - m_llvmFlog->setCallingConv(CallingConv::C); - m_llvmFlog->setParamAttrs(flogPal); - } - CallInst *call = m_builder.CreateCall(m_llvmFlog, val, - name("logf")); - call->setCallingConv(CallingConv::C); + BasicBlock *ifthen = BasicBlock::Create(name("ifthen"), m_func,0); + BasicBlock *ifend = BasicBlock::Create(name("ifthenend"), m_func,0); + + //BasicBlock *yblock = new BasicBlock(name("yblock"), m_func,0); + //BasicBlock *zblock = new BasicBlock(name("zblock"), m_func,0); + //BasicBlock *wblock = new BasicBlock(name("wblock"), m_func,0); + + Constant *float0 = Constant::getNullValue(Type::FloatTy); + + Value *x = m_builder.CreateExtractElement(in, m_storage->constantInt(0), + name("extractx")); + Value *xcmp = m_builder.CreateFCmpUNE(x, float0, name("xcmp")); + m_builder.CreateCondBr(xcmp, ifthen, ifend); + //m_builder.SetInsertPoint(yblock); + + m_builder.SetInsertPoint(ifthen); + m_ifStack.push(ifend); +} + +llvm::Value * Instructions::kil(llvm::Value *in) +{ + llvm::Function *func = m_mod->getFunction("kil"); + assert(func); + + CallInst *call = m_builder.CreateCall(func, in, name("kilpres")); call->setTailCall(false); return call; } +llvm::Value * Instructions::lerp(llvm::Value *in1, llvm::Value *in2, + llvm::Value *in3) +{ + llvm::Value *m = mul(in1, in2); + llvm::Value *vec1 = constVector(1.f, 1.f, 1.f, 1.f); + llvm::Value *s = sub(vec1, in1); + return add(m, mul(s, in3)); +} + llvm::Value * Instructions::lg2(llvm::Value *in) { std::vector<llvm::Value*> vec = extractVector(in); @@ -407,142 +517,176 @@ llvm::Value * Instructions::lg2(llvm::Value *in) callFLog(vec[2]), callFLog(vec[3])), const_vec); } -llvm::Value * Instructions::min(llvm::Value *in1, llvm::Value *in2) +llvm::Value * Instructions::lit(llvm::Value *in) +{ + if (!m_llvmLit) { + m_llvmLit = m_mod->getFunction("lit"); + } + CallInst *call = m_builder.CreateCall(m_llvmLit, in, name("litres")); + call->setCallingConv(CallingConv::C); + call->setTailCall(false); + return call; +} + +llvm::Value * Instructions::log(llvm::Value *in) +{ + std::vector<llvm::Value*> vec = extractVector(in); + return vectorFromVals(callFLog(vec[0]), callFLog(vec[1]), + callFLog(vec[2]), callFLog(vec[3])); +} + +llvm::Value * Instructions::madd(llvm::Value *in1, llvm::Value *in2, + llvm::Value *in3) +{ + Value *mulRes = mul(in1, in2); + return add(mulRes, in3); +} + +llvm::Value * Instructions::max(llvm::Value *in1, llvm::Value *in2) { std::vector<llvm::Value*> vec1 = extractVector(in1); std::vector<llvm::Value*> vec2 = extractVector(in2); - Value *xcmp = m_builder.CreateFCmpOLT(vec1[0], vec2[0], name("xcmp")); + Value *xcmp = m_builder.CreateFCmpOGT(vec1[0], vec2[0], + name("xcmp")); Value *selx = m_builder.CreateSelect(xcmp, vec1[0], vec2[0], name("selx")); - Value *ycmp = m_builder.CreateFCmpOLT(vec1[1], vec2[1], name("ycmp")); + Value *ycmp = m_builder.CreateFCmpOGT(vec1[1], vec2[1], + name("ycmp")); Value *sely = m_builder.CreateSelect(ycmp, vec1[1], vec2[1], name("sely")); - Value *zcmp = m_builder.CreateFCmpOLT(vec1[2], vec2[2], name("zcmp")); + Value *zcmp = m_builder.CreateFCmpOGT(vec1[2], vec2[2], + name("zcmp")); Value *selz = m_builder.CreateSelect(zcmp, vec1[2], vec2[2], name("selz")); - Value *wcmp = m_builder.CreateFCmpOLT(vec1[3], vec2[3], name("wcmp")); + Value *wcmp = m_builder.CreateFCmpOGT(vec1[3], vec2[3], + name("wcmp")); Value *selw = m_builder.CreateSelect(wcmp, vec1[3], vec2[3], name("selw")); return vectorFromVals(selx, sely, selz, selw); } -llvm::Value * Instructions::max(llvm::Value *in1, llvm::Value *in2) +llvm::Value * Instructions::min(llvm::Value *in1, llvm::Value *in2) { std::vector<llvm::Value*> vec1 = extractVector(in1); std::vector<llvm::Value*> vec2 = extractVector(in2); - Value *xcmp = m_builder.CreateFCmpOGT(vec1[0], vec2[0], - name("xcmp")); + Value *xcmp = m_builder.CreateFCmpOLT(vec1[0], vec2[0], name("xcmp")); Value *selx = m_builder.CreateSelect(xcmp, vec1[0], vec2[0], name("selx")); - Value *ycmp = m_builder.CreateFCmpOGT(vec1[1], vec2[1], - name("ycmp")); + Value *ycmp = m_builder.CreateFCmpOLT(vec1[1], vec2[1], name("ycmp")); Value *sely = m_builder.CreateSelect(ycmp, vec1[1], vec2[1], name("sely")); - Value *zcmp = m_builder.CreateFCmpOGT(vec1[2], vec2[2], - name("zcmp")); + Value *zcmp = m_builder.CreateFCmpOLT(vec1[2], vec2[2], name("zcmp")); Value *selz = m_builder.CreateSelect(zcmp, vec1[2], vec2[2], name("selz")); - Value *wcmp = m_builder.CreateFCmpOGT(vec1[3], vec2[3], - name("wcmp")); + Value *wcmp = m_builder.CreateFCmpOLT(vec1[3], vec2[3], name("wcmp")); Value *selw = m_builder.CreateSelect(wcmp, vec1[3], vec2[3], name("selw")); return vectorFromVals(selx, sely, selz, selw); } -void Instructions::printVector(llvm::Value *val) +llvm::Value * Instructions::mul(llvm::Value *in1, llvm::Value *in2) { - static const char *frmt = "Vector is [%f, %f, %f, %f]\x0A"; + return m_builder.CreateMul(in1, in2, name("mul")); +} - if (!m_fmtPtr) { - Constant *format = ConstantArray::get(frmt, true); - ArrayType *arrayTy = ArrayType::get(IntegerType::get(8), strlen(frmt) + 1); - GlobalVariable* globalFormat = new GlobalVariable( - /*Type=*/arrayTy, - /*isConstant=*/true, - /*Linkage=*/GlobalValue::InternalLinkage, - /*Initializer=*/0, // has initializer, specified below - /*Name=*/name(".str"), - m_mod); - globalFormat->setInitializer(format); +llvm::Value * Instructions::neg(llvm::Value *in) +{ + Value *neg = m_builder.CreateNeg(in, name("neg")); + return neg; +} - Constant* const_int0 = Constant::getNullValue(IntegerType::get(32)); - std::vector<Constant*> const_ptr_21_indices; - const_ptr_21_indices.push_back(const_int0); - const_ptr_21_indices.push_back(const_int0); - m_fmtPtr = ConstantExpr::getGetElementPtr(globalFormat, - &const_ptr_21_indices[0], const_ptr_21_indices.size()); - } +llvm::Value * Instructions::nrm(llvm::Value *in) +{ + llvm::Value *v = rsq(in); + return mul(v, in); +} - Function *func_printf = m_mod->getFunction("printf"); - if (!func_printf) - func_printf = declarePrintf(); - assert(func_printf); - std::vector<llvm::Value*> vec = extractVector(val); - Value *dx = m_builder.CreateFPExt(vec[0], Type::DoubleTy, name("dx")); - Value *dy = m_builder.CreateFPExt(vec[1], Type::DoubleTy, name("dy")); - Value *dz = m_builder.CreateFPExt(vec[2], Type::DoubleTy, name("dz")); - Value *dw = m_builder.CreateFPExt(vec[3], Type::DoubleTy, name("dw")); - std::vector<Value*> params; - params.push_back(m_fmtPtr); - params.push_back(dx); - params.push_back(dy); - params.push_back(dz); - params.push_back(dw); - CallInst *call = m_builder.CreateCall(func_printf, params.begin(), params.end(), - name("printf")); - call->setCallingConv(CallingConv::C); - call->setTailCall(true); +llvm::Value * Instructions::pow(llvm::Value *in1, llvm::Value *in2) +{ + Value *x1 = m_builder.CreateExtractElement(in1, + m_storage->constantInt(0), + name("x1")); + Value *x2 = m_builder.CreateExtractElement(in2, + m_storage->constantInt(0), + name("x2")); + llvm::Value *val = callPow(x1, x2); + return vectorFromVals(val, val, val, val); } -llvm::Function * Instructions::declarePrintf() +llvm::Value * Instructions::rcp(llvm::Value *in1) { - std::vector<const Type*> args; - PAListPtr params; - FunctionType* funcTy = FunctionType::get( - /*Result=*/IntegerType::get(32), - /*Params=*/args, - /*isVarArg=*/true); - Function* func_printf = Function::Create( - /*Type=*/funcTy, - /*Linkage=*/GlobalValue::ExternalLinkage, - /*Name=*/"printf", m_mod); - func_printf->setCallingConv(CallingConv::C); - func_printf->setParamAttrs(params); - return func_printf; + Value *x1 = m_builder.CreateExtractElement(in1, + m_storage->constantInt(0), + name("x1")); + Value *res = m_builder.CreateFDiv(ConstantFP::get(APFloat(1.f)), + x1, name("rcp")); + return vectorFromVals(res, res, res, res); +} + +llvm::Value * Instructions::rsq(llvm::Value *in1) +{ + Value *x = m_builder.CreateExtractElement(in1, + m_storage->constantInt(0), + name("extractx")); + Value *abs = callFAbs(x); + Value *sqrt = callFSqrt(abs); + + Value *rsqrt = m_builder.CreateFDiv(ConstantFP::get(APFloat(1.f)), + sqrt, + name("rsqrt")); + return vectorFromVals(rsqrt, rsqrt, rsqrt, rsqrt); } +llvm::Value * Instructions::scs(llvm::Value *in) +{ + llvm::Function *func = m_mod->getFunction("scs"); + assert(func); -llvm::Value * Instructions::sgt(llvm::Value *in1, llvm::Value *in2) + CallInst *call = m_builder.CreateCall(func, in, name("scsres")); + call->setTailCall(false); + return call; +} + +llvm::Value * Instructions::seq(llvm::Value *in1, llvm::Value *in2) { Constant *const1f = ConstantFP::get(APFloat(1.000000e+00f)); Constant *const0f = Constant::getNullValue(Type::FloatTy); std::vector<llvm::Value*> vec1 = extractVector(in1); std::vector<llvm::Value*> vec2 = extractVector(in2); - Value *xcmp = m_builder.CreateFCmpOGT(vec1[0], vec2[0], name("xcmp")); + + Value *xcmp = m_builder.CreateFCmpOEQ(vec1[0], vec2[0], name("xcmp")); Value *x = m_builder.CreateSelect(xcmp, const1f, const0f, name("xsel")); - Value *ycmp = m_builder.CreateFCmpOGT(vec1[1], vec2[1], name("ycmp")); + Value *ycmp = m_builder.CreateFCmpOEQ(vec1[1], vec2[1], name("ycmp")); Value *y = m_builder.CreateSelect(ycmp, const1f, const0f, name("ysel")); - Value *zcmp = m_builder.CreateFCmpOGT(vec1[2], vec2[2], name("zcmp")); + Value *zcmp = m_builder.CreateFCmpOEQ(vec1[2], vec2[2], name("zcmp")); Value *z = m_builder.CreateSelect(zcmp, const1f, const0f, name("zsel")); - Value *wcmp = m_builder.CreateFCmpOGT(vec1[3], vec2[3], name("wcmp")); + Value *wcmp = m_builder.CreateFCmpOEQ(vec1[3], vec2[3], name("wcmp")); Value *w = m_builder.CreateSelect(wcmp, const1f, const0f, name("wsel")); return vectorFromVals(x, y, z, w); } + +llvm::Value * Instructions::sfl(llvm::Value *in1, llvm::Value *in2) +{ + Constant *const0f = Constant::getNullValue(Type::FloatTy); + + return vectorFromVals(const0f, const0f, const0f, const0f); +} + llvm::Value * Instructions::sge(llvm::Value *in1, llvm::Value *in2) { Constant *const1f = ConstantFP::get(APFloat(1.000000e+00f)); @@ -566,157 +710,118 @@ llvm::Value * Instructions::sge(llvm::Value *in1, llvm::Value *in2) return vectorFromVals(x, y, z, w); } - -llvm::Value * Instructions::slt(llvm::Value *in1, llvm::Value *in2) +llvm::Value * Instructions::sgt(llvm::Value *in1, llvm::Value *in2) { Constant *const1f = ConstantFP::get(APFloat(1.000000e+00f)); Constant *const0f = Constant::getNullValue(Type::FloatTy); std::vector<llvm::Value*> vec1 = extractVector(in1); std::vector<llvm::Value*> vec2 = extractVector(in2); - - Value *xcmp = m_builder.CreateFCmpOLT(vec1[0], vec2[0], name("xcmp")); + Value *xcmp = m_builder.CreateFCmpOGT(vec1[0], vec2[0], name("xcmp")); Value *x = m_builder.CreateSelect(xcmp, const1f, const0f, name("xsel")); - Value *ycmp = m_builder.CreateFCmpOLT(vec1[1], vec2[1], name("ycmp")); + Value *ycmp = m_builder.CreateFCmpOGT(vec1[1], vec2[1], name("ycmp")); Value *y = m_builder.CreateSelect(ycmp, const1f, const0f, name("ysel")); - Value *zcmp = m_builder.CreateFCmpOLT(vec1[2], vec2[2], name("zcmp")); + Value *zcmp = m_builder.CreateFCmpOGT(vec1[2], vec2[2], name("zcmp")); Value *z = m_builder.CreateSelect(zcmp, const1f, const0f, name("zsel")); - Value *wcmp = m_builder.CreateFCmpOLT(vec1[3], vec2[3], name("wcmp")); + Value *wcmp = m_builder.CreateFCmpOGT(vec1[3], vec2[3], name("wcmp")); Value *w = m_builder.CreateSelect(wcmp, const1f, const0f, name("wsel")); return vectorFromVals(x, y, z, w); } -llvm::Value * Instructions::cross(llvm::Value *in1, llvm::Value *in2) +llvm::Value * Instructions::sin(llvm::Value *in) { - Value *x1 = m_builder.CreateExtractElement(in1, - m_storage->constantInt(0), - name("x1")); - Value *y1 = m_builder.CreateExtractElement(in1, - m_storage->constantInt(1), - name("y1")); - Value *z1 = m_builder.CreateExtractElement(in1, - m_storage->constantInt(2), - name("z1")); + llvm::Function *func = m_mod->getFunction("vsin"); + assert(func); - Value *x2 = m_builder.CreateExtractElement(in2, - m_storage->constantInt(0), - name("x2")); - Value *y2 = m_builder.CreateExtractElement(in2, - m_storage->constantInt(1), - name("y2")); - Value *z2 = m_builder.CreateExtractElement(in2, - m_storage->constantInt(2), - name("z2")); - Value *y1z2 = mul(y1, z2); - Value *z1y2 = mul(z1, y2); + CallInst *call = m_builder.CreateCall(func, in, name("sinres")); + call->setTailCall(false); + return call; +} - Value *z1x2 = mul(z1, x2); - Value *x1z2 = mul(x1, z2); +llvm::Value * Instructions::sle(llvm::Value *in1, llvm::Value *in2) +{ + Constant *const1f = ConstantFP::get(APFloat(1.000000e+00f)); + Constant *const0f = Constant::getNullValue(Type::FloatTy); - Value *x1y2 = mul(x1, y2); - Value *y1x2 = mul(y1, x2); + std::vector<llvm::Value*> vec1 = extractVector(in1); + std::vector<llvm::Value*> vec2 = extractVector(in2); - return vectorFromVals(sub(y1z2, z1y2), sub(z1x2, x1z2), sub(x1y2, y1x2)); -} + Value *xcmp = m_builder.CreateFCmpOLE(vec1[0], vec2[0], name("xcmp")); + Value *x = m_builder.CreateSelect(xcmp, const1f, const0f, name("xsel")); + Value *ycmp = m_builder.CreateFCmpOLE(vec1[1], vec2[1], name("ycmp")); + Value *y = m_builder.CreateSelect(ycmp, const1f, const0f, name("ysel")); -llvm::Value * Instructions::abs(llvm::Value *in) -{ - std::vector<llvm::Value*> vec = extractVector(in); - Value *xabs = callFAbs(vec[0]); - Value *yabs = callFAbs(vec[1]); - Value *zabs = callFAbs(vec[2]); - Value *wabs = callFAbs(vec[3]); - return vectorFromVals(xabs, yabs, zabs, wabs); + Value *zcmp = m_builder.CreateFCmpOLE(vec1[2], vec2[2], name("zcmp")); + Value *z = m_builder.CreateSelect(zcmp, const1f, const0f, name("zsel")); + + Value *wcmp = m_builder.CreateFCmpOLE(vec1[3], vec2[3], name("wcmp")); + Value *w = m_builder.CreateSelect(wcmp, const1f, const0f, name("wsel")); + + return vectorFromVals(x, y, z, w); } -void Instructions::ifop(llvm::Value *in) +llvm::Value * Instructions::slt(llvm::Value *in1, llvm::Value *in2) { - BasicBlock *ifthen = BasicBlock::Create(name("ifthen"), m_func,0); - BasicBlock *ifend = BasicBlock::Create(name("ifthenend"), m_func,0); + Constant *const1f = ConstantFP::get(APFloat(1.000000e+00f)); + Constant *const0f = Constant::getNullValue(Type::FloatTy); - //BasicBlock *yblock = new BasicBlock(name("yblock"), m_func,0); - //BasicBlock *zblock = new BasicBlock(name("zblock"), m_func,0); - //BasicBlock *wblock = new BasicBlock(name("wblock"), m_func,0); + std::vector<llvm::Value*> vec1 = extractVector(in1); + std::vector<llvm::Value*> vec2 = extractVector(in2); - Constant *float0 = Constant::getNullValue(Type::FloatTy); + Value *xcmp = m_builder.CreateFCmpOLT(vec1[0], vec2[0], name("xcmp")); + Value *x = m_builder.CreateSelect(xcmp, const1f, const0f, name("xsel")); - Value *x = m_builder.CreateExtractElement(in, m_storage->constantInt(0), - name("extractx")); - Value *xcmp = m_builder.CreateFCmpUNE(x, float0, name("xcmp")); - m_builder.CreateCondBr(xcmp, ifthen, ifend); - //m_builder.SetInsertPoint(yblock); + Value *ycmp = m_builder.CreateFCmpOLT(vec1[1], vec2[1], name("ycmp")); + Value *y = m_builder.CreateSelect(ycmp, const1f, const0f, name("ysel")); - m_builder.SetInsertPoint(ifthen); - m_ifStack.push(ifend); -} + Value *zcmp = m_builder.CreateFCmpOLT(vec1[2], vec2[2], name("zcmp")); + Value *z = m_builder.CreateSelect(zcmp, const1f, const0f, name("zsel")); -llvm::BasicBlock * Instructions::currentBlock() const -{ - return m_builder.GetInsertBlock(); -} + Value *wcmp = m_builder.CreateFCmpOLT(vec1[3], vec2[3], name("wcmp")); + Value *w = m_builder.CreateSelect(wcmp, const1f, const0f, name("wsel")); -void Instructions::elseop() -{ - assert(!m_ifStack.empty()); - BasicBlock *ifend = BasicBlock::Create(name("ifend"), m_func,0); - m_builder.CreateBr(ifend); - m_builder.SetInsertPoint(m_ifStack.top()); - currentBlock()->setName(name("ifelse")); - m_ifStack.pop(); - m_ifStack.push(ifend); + return vectorFromVals(x, y, z, w); } -void Instructions::endif() +llvm::Value * Instructions::sne(llvm::Value *in1, llvm::Value *in2) { - assert(!m_ifStack.empty()); - m_builder.CreateBr(m_ifStack.top()); - m_builder.SetInsertPoint(m_ifStack.top()); - m_ifStack.pop(); -} + Constant *const1f = ConstantFP::get(APFloat(1.000000e+00f)); + Constant *const0f = Constant::getNullValue(Type::FloatTy); -llvm::Value * Instructions::lerp(llvm::Value *in1, llvm::Value *in2, - llvm::Value *in3) -{ - llvm::Value *m = mul(in1, in2); - llvm::Value *vec1 = constVector(1.f, 1.f, 1.f, 1.f); - llvm::Value *s = sub(vec1, in1); - return add(m, mul(s, in3)); -} + std::vector<llvm::Value*> vec1 = extractVector(in1); + std::vector<llvm::Value*> vec2 = extractVector(in2); -void Instructions::beginLoop() -{ - BasicBlock *begin = BasicBlock::Create(name("loop"), m_func,0); - BasicBlock *end = BasicBlock::Create(name("endloop"), m_func,0); + Value *xcmp = m_builder.CreateFCmpONE(vec1[0], vec2[0], name("xcmp")); + Value *x = m_builder.CreateSelect(xcmp, const1f, const0f, name("xsel")); - m_builder.CreateBr(begin); - Loop loop; - loop.begin = begin; - loop.end = end; - m_builder.SetInsertPoint(begin); - m_loopStack.push(loop); + Value *ycmp = m_builder.CreateFCmpONE(vec1[1], vec2[1], name("ycmp")); + Value *y = m_builder.CreateSelect(ycmp, const1f, const0f, name("ysel")); + + Value *zcmp = m_builder.CreateFCmpONE(vec1[2], vec2[2], name("zcmp")); + Value *z = m_builder.CreateSelect(zcmp, const1f, const0f, name("zsel")); + + Value *wcmp = m_builder.CreateFCmpONE(vec1[3], vec2[3], name("wcmp")); + Value *w = m_builder.CreateSelect(wcmp, const1f, const0f, name("wsel")); + + return vectorFromVals(x, y, z, w); } -void Instructions::endLoop() +llvm::Value * Instructions::str(llvm::Value *in1, llvm::Value *in2) { - assert(!m_loopStack.empty()); - Loop loop = m_loopStack.top(); - m_builder.CreateBr(loop.begin); - loop.end->moveAfter(currentBlock()); - m_builder.SetInsertPoint(loop.end); - m_loopStack.pop(); + Constant *const1f = ConstantFP::get(APFloat(1.000000e+00f)); + + return vectorFromVals(const1f, const1f, const1f, const1f); } -void Instructions::brk() +llvm::Value * Instructions::sub(llvm::Value *in1, llvm::Value *in2) { - assert(!m_loopStack.empty()); - BasicBlock *unr = BasicBlock::Create(name("unreachable"), m_func,0); - m_builder.CreateBr(m_loopStack.top().end); - m_builder.SetInsertPoint(unr); + Value *res = m_builder.CreateSub(in1, in2, name("sub")); + return res; } llvm::Value * Instructions::trunc(llvm::Value *in) @@ -741,18 +846,298 @@ llvm::Value * Instructions::trunc(llvm::Value *in) return vectorFromVals(fx, fy, fz, fw); } -void Instructions::end() +llvm::Value * Instructions::x2d(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3) { - m_builder.CreateRetVoid(); + std::vector<llvm::Value*> vec1 = extractVector(in1); + std::vector<llvm::Value*> vec2 = extractVector(in2); + std::vector<llvm::Value*> vec3 = extractVector(in3); + + Value *x2x3 = m_builder.CreateMul( vec2[0], vec3[0], name("x2x3")); + Value *y2y3 = m_builder.CreateMul( vec2[1], vec3[1], name("y2y3")); + Value *x1px2x3 = m_builder.CreateAdd (vec1[0], x2x3, name("x1 + x2x3")); + Value *x1px2x3py2y3 = m_builder.CreateAdd (x1px2x3, y2y3, name("x1 + x2x3 + y2y3")); + + Value *x2z3 = m_builder.CreateMul( vec2[0], vec3[2], name("x2z3")); + Value *y2w3 = m_builder.CreateMul( vec2[1], vec3[3], name("y2w3")); + Value *y1px2z3 = m_builder.CreateAdd (vec1[1], x2z3, name("y1 + x2z3")); + Value *y1px2z3py2w3 = m_builder.CreateAdd (y1px2z3, y2w3, name("y1 + x2z3 + y2w3")); + + return vectorFromVals(x1px2x3py2y3, y1px2z3py2w3, x1px2x3py2y3, y1px2z3py2w3); } -void Instructions::cal(int label, llvm::Value *input) +void Instructions::printVector(llvm::Value *val) { + static const char *frmt = "Vector is [%f, %f, %f, %f]\x0A"; + + if (!m_fmtPtr) { + Constant *format = ConstantArray::get(frmt, true); + ArrayType *arrayTy = ArrayType::get(IntegerType::get(8), strlen(frmt) + 1); + GlobalVariable* globalFormat = new GlobalVariable( + /*Type=*/arrayTy, + /*isConstant=*/true, + /*Linkage=*/GlobalValue::InternalLinkage, + /*Initializer=*/0, // has initializer, specified below + /*Name=*/name(".str"), + m_mod); + globalFormat->setInitializer(format); + + Constant* const_int0 = Constant::getNullValue(IntegerType::get(32)); + std::vector<Constant*> const_ptr_21_indices; + const_ptr_21_indices.push_back(const_int0); + const_ptr_21_indices.push_back(const_int0); + m_fmtPtr = ConstantExpr::getGetElementPtr(globalFormat, + &const_ptr_21_indices[0], const_ptr_21_indices.size()); + } + + Function *func_printf = m_mod->getFunction("printf"); + if (!func_printf) + func_printf = declarePrintf(); + assert(func_printf); + std::vector<llvm::Value*> vec = extractVector(val); + Value *dx = m_builder.CreateFPExt(vec[0], Type::DoubleTy, name("dx")); + Value *dy = m_builder.CreateFPExt(vec[1], Type::DoubleTy, name("dy")); + Value *dz = m_builder.CreateFPExt(vec[2], Type::DoubleTy, name("dz")); + Value *dw = m_builder.CreateFPExt(vec[3], Type::DoubleTy, name("dw")); std::vector<Value*> params; - params.push_back(input); - llvm::Function *func = findFunction(label); + params.push_back(m_fmtPtr); + params.push_back(dx); + params.push_back(dy); + params.push_back(dz); + params.push_back(dw); + CallInst *call = m_builder.CreateCall(func_printf, params.begin(), params.end(), + name("printf")); + call->setCallingConv(CallingConv::C); + call->setTailCall(true); +} - m_builder.CreateCall(func, params.begin(), params.end()); +const char * Instructions::name(const char *prefix) +{ + ++m_idx; + snprintf(m_name, 32, "%s%d", prefix, m_idx); + return m_name; +} + +llvm::Value * Instructions::callCeil(llvm::Value *val) +{ + if (!m_llvmCeil) { + // predeclare the intrinsic + std::vector<const Type*> ceilArgs; + ceilArgs.push_back(Type::FloatTy); + AttrListPtr ceilPal; + FunctionType* ceilType = FunctionType::get( + /*Result=*/Type::FloatTy, + /*Params=*/ceilArgs, + /*isVarArg=*/false); + m_llvmCeil = Function::Create( + /*Type=*/ceilType, + /*Linkage=*/GlobalValue::ExternalLinkage, + /*Name=*/"ceilf", m_mod); + m_llvmCeil->setCallingConv(CallingConv::C); + m_llvmCeil->setAttributes(ceilPal); + } + CallInst *call = m_builder.CreateCall(m_llvmCeil, val, + name("ceilf")); + call->setCallingConv(CallingConv::C); + call->setTailCall(false); + return call; +} + +llvm::Value *Instructions::callFAbs(llvm::Value *val) +{ + if (!m_llvmFAbs) { + // predeclare the intrinsic + std::vector<const Type*> fabsArgs; + fabsArgs.push_back(Type::FloatTy); + AttrListPtr fabsPal; + FunctionType* fabsType = FunctionType::get( + /*Result=*/Type::FloatTy, + /*Params=*/fabsArgs, + /*isVarArg=*/false); + m_llvmFAbs = Function::Create( + /*Type=*/fabsType, + /*Linkage=*/GlobalValue::ExternalLinkage, + /*Name=*/"fabs", m_mod); + m_llvmFAbs->setCallingConv(CallingConv::C); + m_llvmFAbs->setAttributes(fabsPal); + } + CallInst *call = m_builder.CreateCall(m_llvmFAbs, val, + name("fabs")); + call->setCallingConv(CallingConv::C); + call->setTailCall(false); + return call; +} + +llvm::Value * Instructions::callFExp(llvm::Value *val) +{ + if (!m_llvmFexp) { + // predeclare the intrinsic + std::vector<const Type*> fexpArgs; + fexpArgs.push_back(Type::FloatTy); + AttrListPtr fexpPal; + FunctionType* fexpType = FunctionType::get( + /*Result=*/Type::FloatTy, + /*Params=*/fexpArgs, + /*isVarArg=*/false); + m_llvmFexp = Function::Create( + /*Type=*/fexpType, + /*Linkage=*/GlobalValue::ExternalLinkage, + /*Name=*/"expf", m_mod); + m_llvmFexp->setCallingConv(CallingConv::C); + m_llvmFexp->setAttributes(fexpPal); + } + CallInst *call = m_builder.CreateCall(m_llvmFexp, val, + name("expf")); + call->setCallingConv(CallingConv::C); + call->setTailCall(false); + return call; +} + +llvm::Value * Instructions::callFLog(llvm::Value *val) +{ + if (!m_llvmFlog) { + // predeclare the intrinsic + std::vector<const Type*> flogArgs; + flogArgs.push_back(Type::FloatTy); + AttrListPtr flogPal; + FunctionType* flogType = FunctionType::get( + /*Result=*/Type::FloatTy, + /*Params=*/flogArgs, + /*isVarArg=*/false); + m_llvmFlog = Function::Create( + /*Type=*/flogType, + /*Linkage=*/GlobalValue::ExternalLinkage, + /*Name=*/"logf", m_mod); + m_llvmFlog->setCallingConv(CallingConv::C); + m_llvmFlog->setAttributes(flogPal); + } + CallInst *call = m_builder.CreateCall(m_llvmFlog, val, + name("logf")); + call->setCallingConv(CallingConv::C); + call->setTailCall(false); + return call; +} + +llvm::Value * Instructions::callFloor(llvm::Value *val) +{ + if (!m_llvmFloor) { + // predeclare the intrinsic + std::vector<const Type*> floorArgs; + floorArgs.push_back(Type::FloatTy); + AttrListPtr floorPal; + FunctionType* floorType = FunctionType::get( + /*Result=*/Type::FloatTy, + /*Params=*/floorArgs, + /*isVarArg=*/false); + m_llvmFloor = Function::Create( + /*Type=*/floorType, + /*Linkage=*/GlobalValue::ExternalLinkage, + /*Name=*/"floorf", m_mod); + m_llvmFloor->setCallingConv(CallingConv::C); + m_llvmFloor->setAttributes(floorPal); + } + CallInst *call = m_builder.CreateCall(m_llvmFloor, val, + name("floorf")); + call->setCallingConv(CallingConv::C); + call->setTailCall(false); + return call; +} + +llvm::Value *Instructions::callFSqrt(llvm::Value *val) +{ + if (!m_llvmFSqrt) { + // predeclare the intrinsic + std::vector<const Type*> fsqrtArgs; + fsqrtArgs.push_back(Type::FloatTy); + AttrListPtr fsqrtPal; + FunctionType* fsqrtType = FunctionType::get( + /*Result=*/Type::FloatTy, + /*Params=*/fsqrtArgs, + /*isVarArg=*/false); + m_llvmFSqrt = Function::Create( + /*Type=*/fsqrtType, + /*Linkage=*/GlobalValue::ExternalLinkage, + /*Name=*/"llvm.sqrt.f32", m_mod); + m_llvmFSqrt->setCallingConv(CallingConv::C); + m_llvmFSqrt->setAttributes(fsqrtPal); + } + CallInst *call = m_builder.CreateCall(m_llvmFSqrt, val, + name("sqrt")); + call->setCallingConv(CallingConv::C); + call->setTailCall(false); + return call; +} + +llvm::Value * Instructions::callPow(llvm::Value *val1, llvm::Value *val2) +{ + if (!m_llvmPow) { + // predeclare the intrinsic + std::vector<const Type*> powArgs; + powArgs.push_back(Type::FloatTy); + powArgs.push_back(Type::FloatTy); + AttrListPtr powPal; + FunctionType* powType = FunctionType::get( + /*Result=*/Type::FloatTy, + /*Params=*/powArgs, + /*isVarArg=*/false); + m_llvmPow = Function::Create( + /*Type=*/powType, + /*Linkage=*/GlobalValue::ExternalLinkage, + /*Name=*/"llvm.pow.f32", m_mod); + m_llvmPow->setCallingConv(CallingConv::C); + m_llvmPow->setAttributes(powPal); + } + std::vector<Value*> params; + params.push_back(val1); + params.push_back(val2); + CallInst *call = m_builder.CreateCall(m_llvmPow, params.begin(), params.end(), + name("pow")); + call->setCallingConv(CallingConv::C); + call->setTailCall(false); + return call; +} + +llvm::Value * Instructions::vectorFromVals(llvm::Value *x, llvm::Value *y, + llvm::Value *z, llvm::Value *w) +{ + Constant *const_vec = Constant::getNullValue(m_floatVecType); + Value *res = m_builder.CreateInsertElement(const_vec, x, + m_storage->constantInt(0), + name("vecx")); + res = m_builder.CreateInsertElement(res, y, m_storage->constantInt(1), + name("vecxy")); + res = m_builder.CreateInsertElement(res, z, m_storage->constantInt(2), + name("vecxyz")); + if (w) + res = m_builder.CreateInsertElement(res, w, m_storage->constantInt(3), + name("vecxyzw")); + return res; +} + +llvm::Value * Instructions::constVector(float x, float y, float z, float w) +{ + std::vector<Constant*> vec(4); + vec[0] = ConstantFP::get(APFloat(x)); + vec[1] = ConstantFP::get(APFloat(y)); + vec[2] = ConstantFP::get(APFloat(z)); + vec[3] = ConstantFP::get(APFloat(w)); + return ConstantVector::get(m_floatVecType, vec); +} + +llvm::Function * Instructions::declarePrintf() +{ + std::vector<const Type*> args; + AttrListPtr params; + FunctionType* funcTy = FunctionType::get( + /*Result=*/IntegerType::get(32), + /*Params=*/args, + /*isVarArg=*/true); + Function* func_printf = Function::Create( + /*Type=*/funcTy, + /*Linkage=*/GlobalValue::ExternalLinkage, + /*Name=*/"printf", m_mod); + func_printf->setCallingConv(CallingConv::C); + func_printf->setAttributes(params); + return func_printf; } llvm::Function * Instructions::declareFunc(int label) @@ -763,7 +1148,7 @@ llvm::Function * Instructions::declareFunc(int label) args.push_back(vecPtr); args.push_back(vecPtr); args.push_back(vecPtr); - PAListPtr params; + AttrListPtr params; FunctionType *funcType = FunctionType::get( /*Result=*/Type::VoidTy, /*Params=*/args, @@ -774,31 +1159,10 @@ llvm::Function * Instructions::declareFunc(int label) /*Linkage=*/GlobalValue::ExternalLinkage, /*Name=*/name.c_str(), m_mod); func->setCallingConv(CallingConv::C); - func->setParamAttrs(params); + func->setAttributes(params); return func; } -void Instructions::bgnSub(unsigned label) -{ - llvm::Function *func = findFunction(label); - - Function::arg_iterator args = func->arg_begin(); - Value *ptr_INPUT = args++; - ptr_INPUT->setName("INPUT"); - m_storage->pushArguments(ptr_INPUT); - - llvm::BasicBlock *entry = BasicBlock::Create("entry", func, 0); - - m_func = func; - m_builder.SetInsertPoint(entry); -} - -void Instructions::endSub() -{ - m_func = 0; - m_builder.SetInsertPoint(0); -} - llvm::Function * Instructions::findFunction(int label) { llvm::Function *func = m_functions[label]; @@ -809,17 +1173,6 @@ llvm::Function * Instructions::findFunction(int label) return func; } -llvm::Value * Instructions::constVector(float x, float y, float z, float w) -{ - std::vector<Constant*> vec(4); - vec[0] = ConstantFP::get(APFloat(x)); - vec[1] = ConstantFP::get(APFloat(y)); - vec[2] = ConstantFP::get(APFloat(z)); - vec[3] = ConstantFP::get(APFloat(w)); - return ConstantVector::get(m_floatVecType, vec); -} - - std::vector<llvm::Value*> Instructions::extractVector(llvm::Value *vec) { std::vector<llvm::Value*> elems(4); @@ -834,69 +1187,7 @@ std::vector<llvm::Value*> Instructions::extractVector(llvm::Value *vec) return elems; } -llvm::Value * Instructions::cmp(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3) -{ - llvm::Function *func = m_mod->getFunction("cmp"); - assert(func); - - std::vector<Value*> params; - params.push_back(in1); - params.push_back(in2); - params.push_back(in3); - CallInst *call = m_builder.CreateCall(func, params.begin(), params.end(), name("cmpres")); - call->setTailCall(false); - return call; -} - -llvm::Value * Instructions::cos(llvm::Value *in) -{ -#if 0 - llvm::Function *func = m_mod->getFunction("vcos"); - assert(func); - CallInst *call = m_builder.CreateCall(func, in, name("cosres")); - call->setTailCall(false); - return call; -#else - std::vector<llvm::Value*> elems = extractVector(in); - Function *func = m_mod->getFunction("cosf"); - assert(func); - CallInst *cos = m_builder.CreateCall(func, elems[0], name("cosres")); - cos->setCallingConv(CallingConv::C); - cos->setTailCall(true); - return vectorFromVals(cos, cos, cos, cos); -#endif -} - -llvm::Value * Instructions::scs(llvm::Value *in) -{ - llvm::Function *func = m_mod->getFunction("scs"); - assert(func); - - CallInst *call = m_builder.CreateCall(func, in, name("scsres")); - call->setTailCall(false); - return call; -} - -llvm::Value * Instructions::kil(llvm::Value *in) -{ - llvm::Function *func = m_mod->getFunction("kil"); - assert(func); - - CallInst *call = m_builder.CreateCall(func, in, name("kilpres")); - call->setTailCall(false); - return call; -} - -llvm::Value * Instructions::sin(llvm::Value *in) -{ - llvm::Function *func = m_mod->getFunction("vsin"); - assert(func); - - CallInst *call = m_builder.CreateCall(func, in, name("sinres")); - call->setTailCall(false); - return call; -} #endif //MESA_LLVM diff --git a/src/gallium/auxiliary/gallivm/instructions.h b/src/gallium/auxiliary/gallivm/instructions.h index d286ce80c7..e18571251e 100644 --- a/src/gallium/auxiliary/gallivm/instructions.h +++ b/src/gallium/auxiliary/gallivm/instructions.h @@ -57,15 +57,24 @@ public: llvm::BasicBlock *currentBlock() const; llvm::Value *abs(llvm::Value *in1); - llvm::Value *arl(llvm::Value *in1); llvm::Value *add(llvm::Value *in1, llvm::Value *in2); + llvm::Value *arl(llvm::Value *in1); void beginLoop(); void bgnSub(unsigned); void brk(); void cal(int label, llvm::Value *input); + llvm::Value *ceil(llvm::Value *in); + llvm::Value *clamp(llvm::Value *in); llvm::Value *cmp(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3); + llvm::Value *cnd(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3); + llvm::Value *cnd0(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3); llvm::Value *cos(llvm::Value *in); llvm::Value *cross(llvm::Value *in1, llvm::Value *in2); + llvm::Value *ddx(llvm::Value *in); + llvm::Value *ddy(llvm::Value *in); + llvm::Value *div(llvm::Value *in1, llvm::Value *in2); + llvm::Value *dot2add(llvm::Value *in, llvm::Value *in2, llvm::Value *in3); + llvm::Value *dp2(llvm::Value *in1, llvm::Value *in2); llvm::Value *dp3(llvm::Value *in1, llvm::Value *in2); llvm::Value *dp4(llvm::Value *in1, llvm::Value *in2); llvm::Value *dph(llvm::Value *in1, llvm::Value *in2); @@ -75,6 +84,7 @@ public: void endLoop(); void end(); void endSub(); + llvm::Value *exp(llvm::Value *in); llvm::Value *ex2(llvm::Value *in); llvm::Value *floor(llvm::Value *in); llvm::Value *frc(llvm::Value *in); @@ -82,32 +92,43 @@ public: llvm::Value *kil(llvm::Value *in); llvm::Value *lerp(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3); - llvm::Value *lit(llvm::Value *in); llvm::Value *lg2(llvm::Value *in); + llvm::Value *lit(llvm::Value *in); + llvm::Value *log(llvm::Value *in); llvm::Value *madd(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3); - llvm::Value *min(llvm::Value *in1, llvm::Value *in2); llvm::Value *max(llvm::Value *in1, llvm::Value *in2); + llvm::Value *min(llvm::Value *in1, llvm::Value *in2); llvm::Value *mul(llvm::Value *in1, llvm::Value *in2); + llvm::Value *neg(llvm::Value *in); + llvm::Value *nrm(llvm::Value *in); llvm::Value *pow(llvm::Value *in1, llvm::Value *in2); llvm::Value *rcp(llvm::Value *in); llvm::Value *rsq(llvm::Value *in); llvm::Value *scs(llvm::Value *in); + llvm::Value *seq(llvm::Value *in1, llvm::Value *in2); + llvm::Value *sfl(llvm::Value *in1, llvm::Value *in2); llvm::Value *sge(llvm::Value *in1, llvm::Value *in2); llvm::Value *sgt(llvm::Value *in1, llvm::Value *in2); llvm::Value *sin(llvm::Value *in); + llvm::Value *sle(llvm::Value *in1, llvm::Value *in2); llvm::Value *slt(llvm::Value *in1, llvm::Value *in2); + llvm::Value *sne(llvm::Value *in1, llvm::Value *in2); + llvm::Value *str(llvm::Value *in1, llvm::Value *in2); llvm::Value *sub(llvm::Value *in1, llvm::Value *in2); llvm::Value *trunc(llvm::Value *in); + llvm::Value *x2d(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3); void printVector(llvm::Value *val); private: const char *name(const char *prefix); + llvm::Value *callCeil(llvm::Value *val); llvm::Value *callFAbs(llvm::Value *val); + llvm::Value *callFExp(llvm::Value *val); + llvm::Value *callFLog(llvm::Value *val); llvm::Value *callFloor(llvm::Value *val); llvm::Value *callFSqrt(llvm::Value *val); - llvm::Value *callFLog(llvm::Value *val); llvm::Value *callPow(llvm::Value *val1, llvm::Value *val2); llvm::Value *vectorFromVals(llvm::Value *x, llvm::Value *y, @@ -125,16 +146,18 @@ private: llvm::Module *m_mod; llvm::Function *m_func; char m_name[32]; - llvm::IRBuilder m_builder; + llvm::IRBuilder<> m_builder; int m_idx; llvm::VectorType *m_floatVecType; + llvm::Function *m_llvmCeil; llvm::Function *m_llvmFSqrt; llvm::Function *m_llvmFAbs; llvm::Function *m_llvmPow; llvm::Function *m_llvmFloor; llvm::Function *m_llvmFlog; + llvm::Function *m_llvmFexp; llvm::Function *m_llvmLit; llvm::Constant *m_fmtPtr; diff --git a/src/gallium/auxiliary/gallivm/instructionssoa.cpp b/src/gallium/auxiliary/gallivm/instructionssoa.cpp index efddc04e81..d5600fd22d 100644 --- a/src/gallium/auxiliary/gallivm/instructionssoa.cpp +++ b/src/gallium/auxiliary/gallivm/instructionssoa.cpp @@ -90,68 +90,11 @@ llvm::Value * InstructionsSoa::vectorFromVals(llvm::Value *x, llvm::Value *y, return res; } -std::vector<llvm::Value*> InstructionsSoa::arl(const std::vector<llvm::Value*> in) -{ - std::vector<llvm::Value*> res(4); - - //Extract x's - llvm::Value *x1 = m_builder.CreateExtractElement(in[0], - m_storage->constantInt(0), - name("extractX")); - //cast it to an unsigned int - x1 = m_builder.CreateFPToUI(x1, IntegerType::get(32), name("x1IntCast")); - - res[0] = x1;//vectorFromVals(x1, x2, x3, x4); - //only x is valid. the others shouldn't be necessary - /* - res[1] = Constant::getNullValue(m_floatVecType); - res[2] = Constant::getNullValue(m_floatVecType); - res[3] = Constant::getNullValue(m_floatVecType); - */ - - return res; -} - - -std::vector<llvm::Value*> InstructionsSoa::add(const std::vector<llvm::Value*> in1, - const std::vector<llvm::Value*> in2) -{ - std::vector<llvm::Value*> res(4); - - res[0] = m_builder.CreateAdd(in1[0], in2[0], name("addx")); - res[1] = m_builder.CreateAdd(in1[1], in2[1], name("addy")); - res[2] = m_builder.CreateAdd(in1[2], in2[2], name("addz")); - res[3] = m_builder.CreateAdd(in1[3], in2[3], name("addw")); - - return res; -} - -std::vector<llvm::Value*> InstructionsSoa::mul(const std::vector<llvm::Value*> in1, - const std::vector<llvm::Value*> in2) -{ - std::vector<llvm::Value*> res(4); - - res[0] = m_builder.CreateMul(in1[0], in2[0], name("mulx")); - res[1] = m_builder.CreateMul(in1[1], in2[1], name("muly")); - res[2] = m_builder.CreateMul(in1[2], in2[2], name("mulz")); - res[3] = m_builder.CreateMul(in1[3], in2[3], name("mulw")); - - return res; -} - void InstructionsSoa::end() { m_builder.CreateRetVoid(); } -std::vector<llvm::Value*> InstructionsSoa::madd(const std::vector<llvm::Value*> in1, - const std::vector<llvm::Value*> in2, - const std::vector<llvm::Value*> in3) -{ - std::vector<llvm::Value*> res = mul(in1, in2); - return add(res, in3); -} - std::vector<llvm::Value*> InstructionsSoa::extractVector(llvm::Value *vector) { std::vector<llvm::Value*> res(4); @@ -171,6 +114,11 @@ std::vector<llvm::Value*> InstructionsSoa::extractVector(llvm::Value *vector) return res; } +llvm::IRBuilder<>* InstructionsSoa::getIRBuilder() +{ + return &m_builder; +} + void InstructionsSoa::createFunctionMap() { m_functionsMap[TGSI_OPCODE_ABS] = "abs"; @@ -181,6 +129,7 @@ void InstructionsSoa::createFunctionMap() m_functionsMap[TGSI_OPCODE_POWER] = "pow"; m_functionsMap[TGSI_OPCODE_LIT] = "lit"; m_functionsMap[TGSI_OPCODE_RSQ] = "rsq"; + m_functionsMap[TGSI_OPCODE_SLT] = "slt"; } void InstructionsSoa::createDependencies() @@ -273,6 +222,41 @@ std::vector<llvm::Value*> InstructionsSoa::abs(const std::vector<llvm::Value*> i return callBuiltin(func, in1); } +std::vector<llvm::Value*> InstructionsSoa::add(const std::vector<llvm::Value*> in1, + const std::vector<llvm::Value*> in2) +{ + std::vector<llvm::Value*> res(4); + + res[0] = m_builder.CreateAdd(in1[0], in2[0], name("addx")); + res[1] = m_builder.CreateAdd(in1[1], in2[1], name("addy")); + res[2] = m_builder.CreateAdd(in1[2], in2[2], name("addz")); + res[3] = m_builder.CreateAdd(in1[3], in2[3], name("addw")); + + return res; +} + +std::vector<llvm::Value*> InstructionsSoa::arl(const std::vector<llvm::Value*> in) +{ + std::vector<llvm::Value*> res(4); + + //Extract x's + llvm::Value *x1 = m_builder.CreateExtractElement(in[0], + m_storage->constantInt(0), + name("extractX")); + //cast it to an unsigned int + x1 = m_builder.CreateFPToUI(x1, IntegerType::get(32), name("x1IntCast")); + + res[0] = x1;//vectorFromVals(x1, x2, x3, x4); + //only x is valid. the others shouldn't be necessary + /* + res[1] = Constant::getNullValue(m_floatVecType); + res[2] = Constant::getNullValue(m_floatVecType); + res[3] = Constant::getNullValue(m_floatVecType); + */ + + return res; +} + std::vector<llvm::Value*> InstructionsSoa::dp3(const std::vector<llvm::Value*> in1, const std::vector<llvm::Value*> in2) { @@ -280,6 +264,98 @@ std::vector<llvm::Value*> InstructionsSoa::dp3(const std::vector<llvm::Value*> i return callBuiltin(func, in1, in2); } +std::vector<llvm::Value*> InstructionsSoa::lit(const std::vector<llvm::Value*> in) +{ + llvm::Function *func = function(TGSI_OPCODE_LIT); + return callBuiltin(func, in); +} + +std::vector<llvm::Value*> InstructionsSoa::madd(const std::vector<llvm::Value*> in1, + const std::vector<llvm::Value*> in2, + const std::vector<llvm::Value*> in3) +{ + std::vector<llvm::Value*> res = mul(in1, in2); + return add(res, in3); +} + +std::vector<llvm::Value*> InstructionsSoa::max(const std::vector<llvm::Value*> in1, + const std::vector<llvm::Value*> in2) +{ + llvm::Function *func = function(TGSI_OPCODE_MAX); + return callBuiltin(func, in1, in2); +} + +std::vector<llvm::Value*> InstructionsSoa::min(const std::vector<llvm::Value*> in1, + const std::vector<llvm::Value*> in2) +{ + llvm::Function *func = function(TGSI_OPCODE_MIN); + return callBuiltin(func, in1, in2); +} + +std::vector<llvm::Value*> InstructionsSoa::mul(const std::vector<llvm::Value*> in1, + const std::vector<llvm::Value*> in2) +{ + std::vector<llvm::Value*> res(4); + + res[0] = m_builder.CreateMul(in1[0], in2[0], name("mulx")); + res[1] = m_builder.CreateMul(in1[1], in2[1], name("muly")); + res[2] = m_builder.CreateMul(in1[2], in2[2], name("mulz")); + res[3] = m_builder.CreateMul(in1[3], in2[3], name("mulw")); + + return res; +} + +std::vector<llvm::Value*> InstructionsSoa::pow(const std::vector<llvm::Value*> in1, + const std::vector<llvm::Value*> in2) +{ + llvm::Function *func = function(TGSI_OPCODE_POWER); + return callBuiltin(func, in1, in2); +} + +std::vector<llvm::Value*> InstructionsSoa::rsq(const std::vector<llvm::Value*> in) +{ + llvm::Function *func = function(TGSI_OPCODE_RSQ); + return callBuiltin(func, in); +} + +std::vector<llvm::Value*> InstructionsSoa::slt(const std::vector<llvm::Value*> in1, + const std::vector<llvm::Value*> in2) +{ + llvm::Function *func = function(TGSI_OPCODE_SLT); + return callBuiltin(func, in1, in2); +} + +std::vector<llvm::Value*> InstructionsSoa::sub(const std::vector<llvm::Value*> in1, + const std::vector<llvm::Value*> in2) +{ + std::vector<llvm::Value*> res(4); + + res[0] = m_builder.CreateSub(in1[0], in2[0], name("subx")); + res[1] = m_builder.CreateSub(in1[1], in2[1], name("suby")); + res[2] = m_builder.CreateSub(in1[2], in2[2], name("subz")); + res[3] = m_builder.CreateSub(in1[3], in2[3], name("subw")); + + return res; +} + +void checkFunction(Function *func) +{ + for (Function::const_iterator BI = func->begin(), BE = func->end(); + BI != BE; ++BI) { + const BasicBlock &BB = *BI; + for (BasicBlock::const_iterator II = BB.begin(), IE = BB.end(); + II != IE; ++II) { + const Instruction &I = *II; + std::cout<< "Instr = "<<I; + for (unsigned op = 0, E = I.getNumOperands(); op != E; ++op) { + const Value *Op = I.getOperand(op); + std::cout<< "\top = "<<Op<<"("<<op<<")"<<std::endl; + //I->setOperand(op, V); + } + } + } +} + llvm::Value * InstructionsSoa::allocaTemp() { VectorType *vector = VectorType::get(Type::FloatTy, 4); @@ -399,46 +475,6 @@ std::vector<Value*> InstructionsSoa::callBuiltin(llvm::Function *func, const std return allocaToResult(allocaPtr); } -std::vector<llvm::Value*> InstructionsSoa::pow(const std::vector<llvm::Value*> in1, - const std::vector<llvm::Value*> in2) -{ - llvm::Function *func = function(TGSI_OPCODE_POWER); - return callBuiltin(func, in1, in2); -} - -std::vector<llvm::Value*> InstructionsSoa::min(const std::vector<llvm::Value*> in1, - const std::vector<llvm::Value*> in2) -{ - llvm::Function *func = function(TGSI_OPCODE_MIN); - return callBuiltin(func, in1, in2); -} - - -std::vector<llvm::Value*> InstructionsSoa::max(const std::vector<llvm::Value*> in1, - const std::vector<llvm::Value*> in2) -{ - llvm::Function *func = function(TGSI_OPCODE_MAX); - return callBuiltin(func, in1, in2); -} - -void checkFunction(Function *func) -{ - for (Function::const_iterator BI = func->begin(), BE = func->end(); - BI != BE; ++BI) { - const BasicBlock &BB = *BI; - for (BasicBlock::const_iterator II = BB.begin(), IE = BB.end(); - II != IE; ++II) { - const Instruction &I = *II; - std::cout<< "Instr = "<<I; - for (unsigned op = 0, E = I.getNumOperands(); op != E; ++op) { - const Value *Op = I.getOperand(op); - std::cout<< "\top = "<<Op<<"("<<op<<")"<<std::endl; - //I->setOperand(op, V); - } - } - } -} - void InstructionsSoa::injectFunction(llvm::Function *originalFunc, int op) { assert(originalFunc); @@ -458,8 +494,8 @@ void InstructionsSoa::injectFunction(llvm::Function *originalFunc, int op) func = Function::Create(originalFunc->getFunctionType(), GlobalValue::ExternalLinkage, originalFunc->getName(), currentModule()); func->setCallingConv(CallingConv::C); - const PAListPtr pal; - func->setParamAttrs(pal); + const AttrListPtr pal; + func->setAttributes(pal); currentModule()->dump(); } else { DenseMap<const Value*, Value *> val; @@ -483,28 +519,4 @@ void InstructionsSoa::injectFunction(llvm::Function *originalFunc, int op) } } -std::vector<llvm::Value*> InstructionsSoa::sub(const std::vector<llvm::Value*> in1, - const std::vector<llvm::Value*> in2) -{ - std::vector<llvm::Value*> res(4); - - res[0] = m_builder.CreateSub(in1[0], in2[0], name("subx")); - res[1] = m_builder.CreateSub(in1[1], in2[1], name("suby")); - res[2] = m_builder.CreateSub(in1[2], in2[2], name("subz")); - res[3] = m_builder.CreateSub(in1[3], in2[3], name("subw")); - - return res; -} - -std::vector<llvm::Value*> InstructionsSoa::lit(const std::vector<llvm::Value*> in) -{ - llvm::Function *func = function(TGSI_OPCODE_LIT); - return callBuiltin(func, in); -} - -std::vector<llvm::Value*> InstructionsSoa::rsq(const std::vector<llvm::Value*> in) -{ - llvm::Function *func = function(TGSI_OPCODE_RSQ); - return callBuiltin(func, in); -} diff --git a/src/gallium/auxiliary/gallivm/instructionssoa.h b/src/gallium/auxiliary/gallivm/instructionssoa.h index 3e20b652dd..d6831e0a6b 100644 --- a/src/gallium/auxiliary/gallivm/instructionssoa.h +++ b/src/gallium/auxiliary/gallivm/instructionssoa.h @@ -69,11 +69,14 @@ public: std::vector<llvm::Value*> pow(const std::vector<llvm::Value*> in1, const std::vector<llvm::Value*> in2); std::vector<llvm::Value*> rsq(const std::vector<llvm::Value*> in1); + std::vector<llvm::Value*> slt(const std::vector<llvm::Value*> in1, + const std::vector<llvm::Value*> in2); std::vector<llvm::Value*> sub(const std::vector<llvm::Value*> in1, const std::vector<llvm::Value*> in2); void end(); std::vector<llvm::Value*> extractVector(llvm::Value *vector); + llvm::IRBuilder<>* getIRBuilder(); private: const char * name(const char *prefix) const; llvm::Value *vectorFromVals(llvm::Value *x, llvm::Value *y, @@ -96,7 +99,7 @@ private: const std::vector<llvm::Value*> in3); void injectFunction(llvm::Function *originalFunc, int op = TGSI_OPCODE_LAST); private: - llvm::IRBuilder m_builder; + llvm::IRBuilder<> m_builder; StorageSoa *m_storage; std::map<int, std::string> m_functionsMap; diff --git a/src/gallium/auxiliary/gallivm/soabuiltins.c b/src/gallium/auxiliary/gallivm/soabuiltins.c index 78f84510e2..cb85e1734e 100644 --- a/src/gallium/auxiliary/gallivm/soabuiltins.c +++ b/src/gallium/auxiliary/gallivm/soabuiltins.c @@ -36,6 +36,8 @@ typedef __attribute__(( ext_vector_type(4) )) float float4; extern float fabsf(float val); +/* helpers */ + float4 absvec(float4 vec) { float4 res; @@ -47,6 +49,58 @@ float4 absvec(float4 vec) return res; } +float4 maxvec(float4 a, float4 b) +{ + return (float4){(a.x > b.x) ? a.x : b.x, + (a.y > b.y) ? a.y : b.y, + (a.z > b.z) ? a.z : b.z, + (a.w > b.w) ? a.w : b.w}; +} + +float4 minvec(float4 a, float4 b) +{ + return (float4){(a.x < b.x) ? a.x : b.x, + (a.y < b.y) ? a.y : b.y, + (a.z < b.z) ? a.z : b.z, + (a.w < b.w) ? a.w : b.w}; +} + +extern float powf(float num, float p); +extern float sqrtf(float x); + +float4 powvec(float4 vec, float4 q) +{ + float4 p; + p.x = powf(vec.x, q.x); + p.y = powf(vec.y, q.y); + p.z = powf(vec.z, q.z); + p.w = powf(vec.w, q.w); + return p; +} + +float4 sqrtvec(float4 vec) +{ + float4 p; + p.x = sqrtf(vec.x); + p.y = sqrtf(vec.y); + p.z = sqrtf(vec.z); + p.w = sqrtf(vec.w); + return p; +} + +float4 sltvec(float4 v1, float4 v2) +{ + float4 p; + p.x = (v1.x < v2.x) ? 1.0 : 0.0; + p.y = (v1.y < v2.y) ? 1.0 : 0.0; + p.z = (v1.z < v2.z) ? 1.0 : 0.0; + p.w = (v1.w < v2.w) ? 1.0 : 0.0; + return p; +} + + +/* instructions */ + void abs(float4 *res, float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w) { @@ -69,7 +123,6 @@ void dp3(float4 *res, res[3] = dot; } - void dp4(float4 *res, float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w, float4 tmp1x, float4 tmp1y, float4 tmp1z, float4 tmp1w) @@ -83,35 +136,25 @@ void dp4(float4 *res, res[3] = dot; } -extern float powf(float num, float p); -extern float sqrtf(float x); - -float4 powvec(float4 vec, float4 q) -{ - float4 p; - p.x = powf(vec.x, q.x); - p.y = powf(vec.y, q.y); - p.z = powf(vec.z, q.z); - p.w = powf(vec.w, q.w); - return p; -} - -void pow(float4 *res, - float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w, - float4 tmp1x, float4 tmp1y, float4 tmp1z, float4 tmp1w) +void lit(float4 *res, + float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w) { - res[0] = powvec(tmp0x, tmp1x); - res[1] = res[0]; - res[2] = res[0]; - res[3] = res[0]; -} + const float4 zerovec = (float4) {0.0, 0.0, 0.0, 0.0}; + const float4 min128 = (float4) {-128.f, -128.f, -128.f, -128.f}; + const float4 plus128 = (float4) {128.f, 128.f, 128.f, 128.f}; -float4 minvec(float4 a, float4 b) -{ - return (float4){(a.x < b.x) ? a.x : b.x, - (a.y < b.y) ? a.y : b.y, - (a.z < b.z) ? a.z : b.z, - (a.w < b.w) ? a.w : b.w}; + res[0] = (float4){1.0, 1.0, 1.0, 1.0}; + if (tmp0x.x > 0) { + float4 tmpy = maxvec(tmp0y, zerovec); + float4 tmpw = minvec(tmp0w, plus128); + tmpw = maxvec(tmpw, min128); + res[1] = tmp0x; + res[2] = powvec(tmpy, tmpw); + } else { + res[1] = zerovec; + res[2] = zerovec; + } + res[3] = (float4){1.0, 1.0, 1.0, 1.0}; } void min(float4 *res, @@ -125,14 +168,6 @@ void min(float4 *res, } -float4 maxvec(float4 a, float4 b) -{ - return (float4){(a.x > b.x) ? a.x : b.x, - (a.y > b.y) ? a.y : b.y, - (a.z > b.z) ? a.z : b.z, - (a.w > b.w) ? a.w : b.w}; -} - void max(float4 *res, float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w, float4 tmp1x, float4 tmp1y, float4 tmp1z, float4 tmp1w) @@ -143,37 +178,14 @@ void max(float4 *res, res[3] = maxvec(tmp0w, tmp1w); } - -void lit(float4 *res, - float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w) -{ - const float4 zerovec = (float4) {0.0, 0.0, 0.0, 0.0}; - const float4 min128 = (float4) {-128.f, -128.f, -128.f, -128.f}; - const float4 plus128 = (float4) {128.f, 128.f, 128.f, 128.f}; - - res[0] = (float4){1.0, 1.0, 1.0, 1.0}; - if (tmp0x.x > 0) { - float4 tmpy = maxvec(tmp0y, zerovec); - float4 tmpw = minvec(tmp0w, plus128); - tmpw = maxvec(tmpw, min128); - res[1] = tmp0x; - res[2] = powvec(tmpy, tmpw); - } else { - res[1] = zerovec; - res[2] = zerovec; - } - res[3] = (float4){1.0, 1.0, 1.0, 1.0}; -} - - -float4 sqrtvec(float4 vec) +void pow(float4 *res, + float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w, + float4 tmp1x, float4 tmp1y, float4 tmp1z, float4 tmp1w) { - float4 p; - p.x = sqrtf(vec.x); - p.y = sqrtf(vec.y); - p.z = sqrtf(vec.z); - p.w = sqrtf(vec.w); - return p; + res[0] = powvec(tmp0x, tmp1x); + res[1] = res[0]; + res[2] = res[0]; + res[3] = res[0]; } void rsq(float4 *res, @@ -185,3 +197,14 @@ void rsq(float4 *res, res[2] = onevec/sqrtvec(absvec(tmp0z)); res[3] = onevec/sqrtvec(absvec(tmp0w)); } + +void slt(float4 *res, + float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w, + float4 tmp1x, float4 tmp1y, float4 tmp1z, float4 tmp1w) +{ + res[0] = sltvec(tmp0x, tmp1x); + res[1] = sltvec(tmp0y, tmp1y); + res[2] = sltvec(tmp0z, tmp1z); + res[3] = sltvec(tmp0w, tmp1w); +} + diff --git a/src/gallium/auxiliary/gallivm/storagesoa.cpp b/src/gallium/auxiliary/gallivm/storagesoa.cpp index 78d754371f..4fc075cf6d 100644 --- a/src/gallium/auxiliary/gallivm/storagesoa.cpp +++ b/src/gallium/auxiliary/gallivm/storagesoa.cpp @@ -93,7 +93,7 @@ void StorageSoa::declareImmediates() std::vector<float> vals(4); std::vector<Constant*> channelArray; - vals[0] = vec[0]; vals[1] = vec[0]; vals[2] = vec[0]; vals[3] = vec[0]; + vals[0] = vec[0]; vals[1] = vec[1]; vals[2] = vec[2]; vals[3] = vec[3]; llvm::Constant *xChannel = createConstGlobalVector(vals); vals[0] = vec[1]; vals[1] = vec[1]; vals[2] = vec[1]; vals[3] = vec[1]; @@ -144,22 +144,43 @@ std::vector<llvm::Value*> StorageSoa::inputElement(llvm::Value *idx) return res; } -std::vector<llvm::Value*> StorageSoa::constElement(llvm::Value *idx) +llvm::Value* StorageSoa::unpackConstElement(llvm::IRBuilder<>* m_builder, llvm::Value* vector, int cc) { - std::vector<llvm::Value*> res(4); + std::vector<llvm::Value*> x(4); + x[0] = m_builder->CreateExtractElement(vector, + constantInt(cc), + name("x")); + + VectorType *vectorType = VectorType::get(Type::FloatTy, 4); + Constant *constVector = Constant::getNullValue(vectorType); + Value *res = m_builder->CreateInsertElement(constVector, x[0], + constantInt(0), + name("vecx")); + res = m_builder->CreateInsertElement(res, x[0], constantInt(1), + name("vecxx")); + res = m_builder->CreateInsertElement(res, x[0], constantInt(2), + name("vecxxx")); + res = m_builder->CreateInsertElement(res, x[0], constantInt(3), + name("vecxxxx")); + return res; +} + +std::vector<llvm::Value*> StorageSoa::constElement(llvm::IRBuilder<>* m_builder, llvm::Value *idx) +{ + llvm::Value* res; + std::vector<llvm::Value*> res2(4); llvm::Value *xChannel, *yChannel, *zChannel, *wChannel; xChannel = elementPointer(m_consts, idx, 0); - yChannel = elementPointer(m_consts, idx, 1); - zChannel = elementPointer(m_consts, idx, 2); - wChannel = elementPointer(m_consts, idx, 3); - res[0] = alignedArrayLoad(xChannel); - res[1] = alignedArrayLoad(yChannel); - res[2] = alignedArrayLoad(zChannel); - res[3] = alignedArrayLoad(wChannel); + res = alignedArrayLoad(xChannel); - return res; + res2[0]=unpackConstElement(m_builder, res,0); + res2[1]=unpackConstElement(m_builder, res,1); + res2[2]=unpackConstElement(m_builder, res,2); + res2[3]=unpackConstElement(m_builder, res,3); + + return res2; } std::vector<llvm::Value*> StorageSoa::outputElement(llvm::Value *idx) @@ -260,6 +281,12 @@ llvm::Module * StorageSoa::currentModule() const return m_block->getParent()->getParent(); } +llvm::Constant * StorageSoa::createConstGlobalFloat(const float val) +{ + Constant*c = ConstantFP::get(APFloat(val)); + return c; +} + llvm::Constant * StorageSoa::createConstGlobalVector(const std::vector<float> &vec) { VectorType *vectorType = VectorType::get(Type::FloatTy, 4); @@ -278,7 +305,7 @@ llvm::Constant * StorageSoa::createConstGlobalVector(const std::vector<float> &v } std::vector<llvm::Value*> StorageSoa::load(enum tgsi_file_type type, int idx, int swizzle, - llvm::Value *indIdx) + llvm::IRBuilder<>* m_builder,llvm::Value *indIdx) { std::vector<llvm::Value*> val(4); @@ -302,7 +329,7 @@ std::vector<llvm::Value*> StorageSoa::load(enum tgsi_file_type type, int idx, in val = tempElement(realIndex); break; case TGSI_FILE_CONSTANT: - val = constElement(realIndex); + val = constElement(m_builder, realIndex); break; case TGSI_FILE_IMMEDIATE: val = immediateElement(realIndex); diff --git a/src/gallium/auxiliary/gallivm/storagesoa.h b/src/gallium/auxiliary/gallivm/storagesoa.h index ae2fc7c6ae..f21ca6ec43 100644 --- a/src/gallium/auxiliary/gallivm/storagesoa.h +++ b/src/gallium/auxiliary/gallivm/storagesoa.h @@ -29,6 +29,7 @@ #define STORAGESOA_H #include <pipe/p_shader_tokens.h> +#include <llvm/Support/IRBuilder.h> #include <vector> #include <list> @@ -56,7 +57,7 @@ public: std::vector<llvm::Value*> load(enum tgsi_file_type type, int idx, int swizzle, - llvm::Value *indIdx =0); + llvm::IRBuilder<>* m_builder, llvm::Value *indIdx =0); void store(enum tgsi_file_type type, int idx, const std::vector<llvm::Value*> &val, int mask); @@ -76,10 +77,12 @@ private: const char *name(const char *prefix) const; llvm::Value *alignedArrayLoad(llvm::Value *val); llvm::Module *currentModule() const; + llvm::Constant *createConstGlobalFloat(const float val); llvm::Constant *createConstGlobalVector(const std::vector<float> &vec); std::vector<llvm::Value*> inputElement(llvm::Value *indIdx); - std::vector<llvm::Value*> constElement(llvm::Value *indIdx); + llvm::Value* unpackConstElement(llvm::IRBuilder<>* m_builder, llvm::Value *indIdx, int cc); + std::vector<llvm::Value*> constElement(llvm::IRBuilder<>* m_builder, llvm::Value *indIdx); std::vector<llvm::Value*> outputElement(llvm::Value *indIdx); std::vector<llvm::Value*> tempElement(llvm::Value *indIdx); std::vector<llvm::Value*> immediateElement(llvm::Value *indIdx); diff --git a/src/gallium/auxiliary/gallivm/tgsitollvm.cpp b/src/gallium/auxiliary/gallivm/tgsitollvm.cpp index cc1516a45e..1191a6cae9 100644 --- a/src/gallium/auxiliary/gallivm/tgsitollvm.cpp +++ b/src/gallium/auxiliary/gallivm/tgsitollvm.cpp @@ -52,7 +52,7 @@ static inline FunctionType *vertexShaderFunctionType() // pass are castable to the following: // [4 x <4 x float>] inputs, // [4 x <4 x float>] output, - // [4 x [4 x float]] consts, + // [4 x [1 x float]] consts, // [4 x <4 x float>] temps std::vector<const Type*> funcArgs; @@ -61,7 +61,7 @@ static inline FunctionType *vertexShaderFunctionType() PointerType *vectorArrayPtr = PointerType::get(vectorArray, 0); ArrayType *floatArray = ArrayType::get(Type::FloatTy, 4); - ArrayType *constsArray = ArrayType::get(floatArray, 4); + ArrayType *constsArray = ArrayType::get(floatArray, 1); PointerType *constsArrayPtr = PointerType::get(constsArray, 0); funcArgs.push_back(vectorArrayPtr);//inputs @@ -246,6 +246,7 @@ translate_instruction(llvm::Module *module, val = storage->constElement(src->SrcRegister.Index, indIdx); } else if (src->SrcRegister.File == TGSI_FILE_INPUT) { val = storage->inputElement(src->SrcRegister.Index, indIdx); + // FIXME we should not be generating elements for temporaries, this creates useless memory writes } else if (src->SrcRegister.File == TGSI_FILE_TEMPORARY) { val = storage->tempElement(src->SrcRegister.Index); } else if (src->SrcRegister.File == TGSI_FILE_OUTPUT) { @@ -286,9 +287,13 @@ translate_instruction(llvm::Module *module, out = instr->rsq(inputs[0]); } break; - case TGSI_OPCODE_EXP: + case TGSI_OPCODE_EXP: { + out = instr->exp(inputs[0]); + } break; - case TGSI_OPCODE_LOG: + case TGSI_OPCODE_LOG: { + out = instr->log(inputs[0]); + } break; case TGSI_OPCODE_MUL: { out = instr->mul(inputs[0], inputs[1]); @@ -338,21 +343,31 @@ translate_instruction(llvm::Module *module, out = instr->lerp(inputs[0], inputs[1], inputs[2]); } break; - case TGSI_OPCODE_CND: + case TGSI_OPCODE_CND: { + out = instr->cnd(inputs[0], inputs[1], inputs[2]); + } break; - case TGSI_OPCODE_CND0: + case TGSI_OPCODE_CND0: { + out = instr->cnd0(inputs[0], inputs[1], inputs[2]); + } break; - case TGSI_OPCODE_DOT2ADD: + case TGSI_OPCODE_DOT2ADD: { + out = instr->dot2add(inputs[0], inputs[1], inputs[2]); + } break; case TGSI_OPCODE_INDEX: break; - case TGSI_OPCODE_NEGATE: + case TGSI_OPCODE_NEGATE: { + out = instr->neg(inputs[0]); + } break; case TGSI_OPCODE_FRAC: { out = instr->frc(inputs[0]); } break; - case TGSI_OPCODE_CLAMP: + case TGSI_OPCODE_CLAMP: { + out = instr->clamp(inputs[0]); + } break; case TGSI_OPCODE_FLOOR: { out = instr->floor(inputs[0]); @@ -392,9 +407,13 @@ translate_instruction(llvm::Module *module, out = instr->cos(inputs[0]); } break; - case TGSI_OPCODE_DDX: + case TGSI_OPCODE_DDX: { + out = instr->ddx(inputs[0]); + } break; - case TGSI_OPCODE_DDY: + case TGSI_OPCODE_DDY: { + out = instr->ddy(inputs[0]); + } break; case TGSI_OPCODE_KILP: break; @@ -408,9 +427,13 @@ translate_instruction(llvm::Module *module, break; case TGSI_OPCODE_RFL: break; - case TGSI_OPCODE_SEQ: + case TGSI_OPCODE_SEQ: { + out = instr->seq(inputs[0], inputs[1]); + } break; - case TGSI_OPCODE_SFL: + case TGSI_OPCODE_SFL: { + out = instr->sfl(inputs[0], inputs[1]); + } break; case TGSI_OPCODE_SGT: { out = instr->sgt(inputs[0], inputs[1]); @@ -420,11 +443,17 @@ translate_instruction(llvm::Module *module, out = instr->sin(inputs[0]); } break; - case TGSI_OPCODE_SLE: + case TGSI_OPCODE_SLE: { + out = instr->sle(inputs[0], inputs[1]); + } break; - case TGSI_OPCODE_SNE: + case TGSI_OPCODE_SNE: { + out = instr->sne(inputs[0], inputs[1]); + } break; - case TGSI_OPCODE_STR: + case TGSI_OPCODE_STR: { + out = instr->str(inputs[0], inputs[1]); + } break; case TGSI_OPCODE_TEX: break; @@ -438,7 +467,9 @@ translate_instruction(llvm::Module *module, break; case TGSI_OPCODE_UP4UB: break; - case TGSI_OPCODE_X2D: + case TGSI_OPCODE_X2D: { + out = instr->x2d(inputs[0], inputs[1], inputs[2]); + } break; case TGSI_OPCODE_ARA: break; @@ -468,11 +499,18 @@ translate_instruction(llvm::Module *module, break; case TGSI_OPCODE_TXB: break; - case TGSI_OPCODE_NRM: + case TGSI_OPCODE_NRM4: + case TGSI_OPCODE_NRM: { + out = instr->nrm(inputs[0]); + } break; - case TGSI_OPCODE_DIV: + case TGSI_OPCODE_DIV: { + out = instr->div(inputs[0], inputs[1]); + } break; - case TGSI_OPCODE_DP2: + case TGSI_OPCODE_DP2: { + out = instr->dp2(inputs[0], inputs[1]); + } break; case TGSI_OPCODE_TXL: break; @@ -590,8 +628,6 @@ translate_instruction(llvm::Module *module, break; case TGSI_OPCODE_M3X2: break; - case TGSI_OPCODE_NRM4: - break; case TGSI_OPCODE_CALLNZ: break; case TGSI_OPCODE_IFC: @@ -641,6 +677,7 @@ translate_instruction(llvm::Module *module, if (dst->DstRegister.File == TGSI_FILE_OUTPUT) { storage->setOutputElement(dst->DstRegister.Index, out, dst->DstRegister.WriteMask); + // FIXME we should not be generating elements for temporaries, this creates useless memory writes } else if (dst->DstRegister.File == TGSI_FILE_TEMPORARY) { storage->setTempElement(dst->DstRegister.Index, out, dst->DstRegister.WriteMask); } else if (dst->DstRegister.File == TGSI_FILE_ADDRESS) { @@ -672,9 +709,8 @@ translate_instructionir(llvm::Module *module, if (src->SrcRegister.Indirect) { indIdx = storage->addrElement(src->SrcRegisterInd.Index); } - val = storage->load((enum tgsi_file_type)src->SrcRegister.File, - src->SrcRegister.Index, swizzle, indIdx); + src->SrcRegister.Index, swizzle, instr->getIRBuilder(), indIdx); inputs[i] = val; } @@ -732,6 +768,7 @@ translate_instructionir(llvm::Module *module, } break; case TGSI_OPCODE_SLT: { + out = instr->slt(inputs[0], inputs[1]); } break; case TGSI_OPCODE_SGE: { @@ -989,7 +1026,6 @@ translate_instructionir(llvm::Module *module, /* store results */ for (int i = 0; i < inst->Instruction.NumDstRegs; ++i) { struct tgsi_full_dst_register *dst = &inst->FullDstRegisters[i]; - storage->store((enum tgsi_file_type)dst->DstRegister.File, dst->DstRegister.Index, out, dst->DstRegister.WriteMask); } diff --git a/src/gallium/auxiliary/pipebuffer/pb_buffer_malloc.c b/src/gallium/auxiliary/pipebuffer/pb_buffer_malloc.c index 20fc87b39d..1bf22a2ec0 100644 --- a/src/gallium/auxiliary/pipebuffer/pb_buffer_malloc.c +++ b/src/gallium/auxiliary/pipebuffer/pb_buffer_malloc.c @@ -129,7 +129,7 @@ pb_malloc_buffer_create(size_t size, static struct pb_buffer * -pb_malloc_buffer_create_buffer(struct pb_manager *mgr, +pb_malloc_bufmgr_create_buffer(struct pb_manager *mgr, size_t size, const struct pb_desc *desc) { @@ -138,6 +138,13 @@ pb_malloc_buffer_create_buffer(struct pb_manager *mgr, static void +pb_malloc_bufmgr_flush(struct pb_manager *mgr) +{ + /* No-op */ +} + + +static void pb_malloc_bufmgr_destroy(struct pb_manager *mgr) { /* No-op */ @@ -146,8 +153,9 @@ pb_malloc_bufmgr_destroy(struct pb_manager *mgr) static struct pb_manager pb_malloc_bufmgr = { - pb_malloc_buffer_create_buffer, - pb_malloc_bufmgr_destroy + pb_malloc_bufmgr_destroy, + pb_malloc_bufmgr_create_buffer, + pb_malloc_bufmgr_flush }; diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr.h b/src/gallium/auxiliary/pipebuffer/pb_bufmgr.h index 32867029ee..cafbee045a 100644 --- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr.h +++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr.h @@ -69,13 +69,22 @@ struct pipe_winsys; */ struct pb_manager { + void + (*destroy)( struct pb_manager *mgr ); + struct pb_buffer * (*create_buffer)( struct pb_manager *mgr, size_t size, const struct pb_desc *desc); + /** + * Flush all temporary-held buffers. + * + * Used mostly to aid debugging memory issues or to clean up resources when + * the drivers are long lived. + */ void - (*destroy)( struct pb_manager *mgr ); + (*flush)( struct pb_manager *mgr ); }; @@ -153,9 +162,6 @@ struct pb_manager * pb_cache_manager_create(struct pb_manager *provider, unsigned usecs); -void -pb_cache_flush(struct pb_manager *mgr); - /** * Fenced buffer manager. diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_alt.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_alt.c index 2afaeafa1a..c956924cc7 100644 --- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_alt.c +++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_alt.c @@ -76,6 +76,21 @@ pb_alt_manager_create_buffer(struct pb_manager *_mgr, static void +pb_alt_manager_flush(struct pb_manager *_mgr) +{ + struct pb_alt_manager *mgr = pb_alt_manager(_mgr); + + assert(mgr->provider1->flush); + if(mgr->provider1->flush) + mgr->provider1->flush(mgr->provider1); + + assert(mgr->provider2->flush); + if(mgr->provider2->flush) + mgr->provider2->flush(mgr->provider2); +} + + +static void pb_alt_manager_destroy(struct pb_manager *mgr) { FREE(mgr); @@ -97,6 +112,7 @@ pb_alt_manager_create(struct pb_manager *provider1, mgr->base.destroy = pb_alt_manager_destroy; mgr->base.create_buffer = pb_alt_manager_create_buffer; + mgr->base.flush = pb_alt_manager_flush; mgr->provider1 = provider1; mgr->provider2 = provider2; diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c index 1ec422fb19..8f118874ec 100644 --- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c +++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c @@ -306,8 +306,8 @@ pb_cache_manager_create_buffer(struct pb_manager *_mgr, } -void -pb_cache_flush(struct pb_manager *_mgr) +static void +pb_cache_manager_flush(struct pb_manager *_mgr) { struct pb_cache_manager *mgr = pb_cache_manager(_mgr); struct list_head *curr, *next; @@ -323,13 +323,17 @@ pb_cache_flush(struct pb_manager *_mgr) next = curr->next; } pipe_mutex_unlock(mgr->mutex); + + assert(mgr->provider->flush); + if(mgr->provider->flush) + mgr->provider->flush(mgr->provider); } static void pb_cache_manager_destroy(struct pb_manager *mgr) { - pb_cache_flush(mgr); + pb_cache_manager_flush(mgr); FREE(mgr); } @@ -349,6 +353,7 @@ pb_cache_manager_create(struct pb_manager *provider, mgr->base.destroy = pb_cache_manager_destroy; mgr->base.create_buffer = pb_cache_manager_create_buffer; + mgr->base.flush = pb_cache_manager_flush; mgr->provider = provider; mgr->usecs = usecs; LIST_INITHEAD(&mgr->delayed); diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c index 5f1ed3e5a8..1675e6e182 100644 --- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c +++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c @@ -314,6 +314,16 @@ pb_debug_manager_create_buffer(struct pb_manager *_mgr, static void +pb_debug_manager_flush(struct pb_manager *_mgr) +{ + struct pb_debug_manager *mgr = pb_debug_manager(_mgr); + assert(mgr->provider->flush); + if(mgr->provider->flush) + mgr->provider->flush(mgr->provider); +} + + +static void pb_debug_manager_destroy(struct pb_manager *_mgr) { struct pb_debug_manager *mgr = pb_debug_manager(_mgr); @@ -336,6 +346,7 @@ pb_debug_manager_create(struct pb_manager *provider, size_t band_size) mgr->base.destroy = pb_debug_manager_destroy; mgr->base.create_buffer = pb_debug_manager_create_buffer; + mgr->base.flush = pb_debug_manager_flush; mgr->provider = provider; mgr->band_size = band_size; diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_fenced.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_fenced.c index 8fc63ce648..633ee70a75 100644 --- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_fenced.c +++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_fenced.c @@ -95,6 +95,19 @@ fenced_bufmgr_create_buffer(struct pb_manager *mgr, static void +fenced_bufmgr_flush(struct pb_manager *mgr) +{ + struct fenced_pb_manager *fenced_mgr = fenced_pb_manager(mgr); + + fenced_buffer_list_check_free(fenced_mgr->fenced_list, TRUE); + + assert(fenced_mgr->provider->flush); + if(fenced_mgr->provider->flush) + fenced_mgr->provider->flush(fenced_mgr->provider); +} + + +static void fenced_bufmgr_destroy(struct pb_manager *mgr) { struct fenced_pb_manager *fenced_mgr = fenced_pb_manager(mgr); @@ -123,6 +136,7 @@ fenced_bufmgr_create(struct pb_manager *provider, fenced_mgr->base.destroy = fenced_bufmgr_destroy; fenced_mgr->base.create_buffer = fenced_bufmgr_create_buffer; + fenced_mgr->base.flush = fenced_bufmgr_flush; fenced_mgr->provider = provider; fenced_mgr->fenced_list = fenced_buffer_list_create(winsys); diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c index e8c7f8e1f8..fe80ca30ee 100644 --- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c +++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c @@ -200,6 +200,13 @@ mm_bufmgr_create_buffer(struct pb_manager *mgr, static void +mm_bufmgr_flush(struct pb_manager *mgr) +{ + /* No-op */ +} + + +static void mm_bufmgr_destroy(struct pb_manager *mgr) { struct mm_pb_manager *mm = mm_pb_manager(mgr); @@ -230,8 +237,9 @@ mm_bufmgr_create_from_buffer(struct pb_buffer *buffer, if (!mm) return NULL; - mm->base.create_buffer = mm_bufmgr_create_buffer; mm->base.destroy = mm_bufmgr_destroy; + mm->base.create_buffer = mm_bufmgr_create_buffer; + mm->base.flush = mm_bufmgr_flush; mm->size = size; mm->align2 = align2; /* 64-byte alignment */ diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c index 3ef72c5bbb..61ac291ed7 100644 --- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c +++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c @@ -203,6 +203,13 @@ pool_bufmgr_create_buffer(struct pb_manager *mgr, static void +pool_bufmgr_flush(struct pb_manager *mgr) +{ + /* No-op */ +} + + +static void pool_bufmgr_destroy(struct pb_manager *mgr) { struct pool_pb_manager *pool = pool_pb_manager(mgr); @@ -238,6 +245,7 @@ pool_bufmgr_create(struct pb_manager *provider, pool->base.destroy = pool_bufmgr_destroy; pool->base.create_buffer = pool_bufmgr_create_buffer; + pool->base.flush = pool_bufmgr_flush; LIST_INITHEAD(&pool->free); diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c index 8698c4cff6..2a80154920 100644 --- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c +++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c @@ -407,6 +407,17 @@ pb_slab_manager_create_buffer(struct pb_manager *_mgr, static void +pb_slab_manager_flush(struct pb_manager *_mgr) +{ + struct pb_slab_manager *mgr = pb_slab_manager(_mgr); + + assert(mgr->provider->flush); + if(mgr->provider->flush) + mgr->provider->flush(mgr->provider); +} + + +static void pb_slab_manager_destroy(struct pb_manager *_mgr) { struct pb_slab_manager *mgr = pb_slab_manager(_mgr); @@ -430,6 +441,7 @@ pb_slab_manager_create(struct pb_manager *provider, mgr->base.destroy = pb_slab_manager_destroy; mgr->base.create_buffer = pb_slab_manager_create_buffer; + mgr->base.flush = pb_slab_manager_flush; mgr->provider = provider; mgr->bufSize = bufSize; @@ -466,6 +478,19 @@ pb_slab_range_manager_create_buffer(struct pb_manager *_mgr, static void +pb_slab_range_manager_flush(struct pb_manager *_mgr) +{ + struct pb_slab_range_manager *mgr = pb_slab_range_manager(_mgr); + + /* Individual slabs don't hold any temporary buffers so no need to call them */ + + assert(mgr->provider->flush); + if(mgr->provider->flush) + mgr->provider->flush(mgr->provider); +} + + +static void pb_slab_range_manager_destroy(struct pb_manager *_mgr) { struct pb_slab_range_manager *mgr = pb_slab_range_manager(_mgr); @@ -499,6 +524,7 @@ pb_slab_range_manager_create(struct pb_manager *provider, mgr->base.destroy = pb_slab_range_manager_destroy; mgr->base.create_buffer = pb_slab_range_manager_create_buffer; + mgr->base.flush = pb_slab_range_manager_flush; mgr->provider = provider; mgr->minBufSize = minBufSize; diff --git a/src/gallium/auxiliary/rtasm/Makefile b/src/gallium/auxiliary/rtasm/Makefile index 39b8a4dbd7..252dc5274a 100644 --- a/src/gallium/auxiliary/rtasm/Makefile +++ b/src/gallium/auxiliary/rtasm/Makefile @@ -7,6 +7,7 @@ C_SOURCES = \ rtasm_cpu.c \ rtasm_execmem.c \ rtasm_x86sse.c \ + rtasm_ppc.c \ rtasm_ppc_spe.c include ../../Makefile.template diff --git a/src/gallium/auxiliary/rtasm/SConscript b/src/gallium/auxiliary/rtasm/SConscript index 8ea25922aa..eb48368acc 100644 --- a/src/gallium/auxiliary/rtasm/SConscript +++ b/src/gallium/auxiliary/rtasm/SConscript @@ -6,6 +6,7 @@ rtasm = env.ConvenienceLibrary( 'rtasm_cpu.c', 'rtasm_execmem.c', 'rtasm_x86sse.c', + 'rtasm_ppc.c', 'rtasm_ppc_spe.c', ]) diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.c b/src/gallium/auxiliary/rtasm/rtasm_ppc.c new file mode 100644 index 0000000000..7dd8263749 --- /dev/null +++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.c @@ -0,0 +1,924 @@ +/************************************************************************** + * + * Copyright (C) 2008 Tungsten Graphics, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * PPC code generation. + * For reference, see http://www.power.org/resources/reading/PowerISA_V2.05.pdf + * ABI info: http://www.cs.utsa.edu/~whaley/teach/cs6463FHPO/LEC/lec12_ho.pdf + * + * Other PPC refs: + * http://www-01.ibm.com/chips/techlib/techlib.nsf/techdocs/852569B20050FF778525699600719DF2 + * http://www.ibm.com/developerworks/eserver/library/es-archguide-v2.html + * http://www.freescale.com/files/product/doc/MPCFPE32B.pdf + * + * \author Brian Paul + */ + + +#include <stdio.h> +#include "util/u_memory.h" +#include "pipe/p_debug.h" +#include "rtasm_ppc.h" + + +void +ppc_init_func(struct ppc_function *p, unsigned max_inst) +{ + uint i; + + p->store = align_malloc(max_inst * PPC_INST_SIZE, 16); + p->num_inst = 0; + p->max_inst = max_inst; + p->reg_used = 0x0; + p->fp_used = 0x0; + p->vec_used = 0x0; + + /* only allow using gp registers 3..12 for now */ + for (i = 0; i < 3; i++) + ppc_reserve_register(p, i); + for (i = 12; i < PPC_NUM_REGS; i++) + ppc_reserve_register(p, i); +} + + +void +ppc_release_func(struct ppc_function *p) +{ + assert(p->num_inst <= p->max_inst); + if (p->store != NULL) { + align_free(p->store); + } + p->store = NULL; +} + + +void (*ppc_get_func(struct ppc_function *p))(void) +{ +#if 0 + DUMP_END(); + if (DISASSEM && p->store) + debug_printf("disassemble %p %p\n", p->store, p->csr); + + if (p->store == p->error_overflow) + return (void (*)(void)) NULL; + else +#endif + return (void (*)(void)) p->store; +} + + +void +ppc_dump_func(const struct ppc_function *p) +{ + uint i; + for (i = 0; i < p->num_inst; i++) { + debug_printf("%3u: 0x%08x\n", i, p->store[i]); + } +} + + +/** + * Mark a register as being unavailable. + */ +int +ppc_reserve_register(struct ppc_function *p, int reg) +{ + assert(reg < PPC_NUM_REGS); + p->reg_used |= (1 << reg); + return reg; +} + + +/** + * Allocate a general purpose register. + * \return register index or -1 if none left. + */ +int +ppc_allocate_register(struct ppc_function *p) +{ + unsigned i; + for (i = 0; i < PPC_NUM_REGS; i++) { + const uint64_t mask = 1 << i; + if ((p->reg_used & mask) == 0) { + p->reg_used |= mask; + return i; + } + } + return -1; +} + + +/** + * Mark the given general purpose register as "unallocated". + */ +void +ppc_release_register(struct ppc_function *p, int reg) +{ + assert(reg < PPC_NUM_REGS); + assert(p->reg_used & (1 << reg)); + p->reg_used &= ~(1 << reg); +} + + +/** + * Allocate a floating point register. + * \return register index or -1 if none left. + */ +int +ppc_allocate_fp_register(struct ppc_function *p) +{ + unsigned i; + for (i = 0; i < PPC_NUM_FP_REGS; i++) { + const uint64_t mask = 1 << i; + if ((p->fp_used & mask) == 0) { + p->fp_used |= mask; + return i; + } + } + return -1; +} + + +/** + * Mark the given floating point register as "unallocated". + */ +void +ppc_release_fp_register(struct ppc_function *p, int reg) +{ + assert(reg < PPC_NUM_FP_REGS); + assert(p->fp_used & (1 << reg)); + p->fp_used &= ~(1 << reg); +} + + +/** + * Allocate a vector register. + * \return register index or -1 if none left. + */ +int +ppc_allocate_vec_register(struct ppc_function *p) +{ + unsigned i; + for (i = 0; i < PPC_NUM_VEC_REGS; i++) { + const uint64_t mask = 1 << i; + if ((p->vec_used & mask) == 0) { + p->vec_used |= mask; + return i; + } + } + return -1; +} + + +/** + * Mark the given vector register as "unallocated". + */ +void +ppc_release_vec_register(struct ppc_function *p, int reg) +{ + assert(reg < PPC_NUM_VEC_REGS); + assert(p->vec_used & (1 << reg)); + p->vec_used &= ~(1 << reg); +} + + + +union vx_inst { + uint32_t bits; + struct { + unsigned op:6; + unsigned vD:5; + unsigned vA:5; + unsigned vB:5; + unsigned op2:11; + } inst; +}; + +static inline void +emit_vx(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB) +{ + union vx_inst inst; + inst.inst.op = 4; + inst.inst.vD = vD; + inst.inst.vA = vA; + inst.inst.vB = vB; + inst.inst.op2 = op2; + p->store[p->num_inst++] = inst.bits; + assert(p->num_inst <= p->max_inst); +}; + + +union vxr_inst { + uint32_t bits; + struct { + unsigned op:6; + unsigned vD:5; + unsigned vA:5; + unsigned vB:5; + unsigned rC:1; + unsigned op2:10; + } inst; +}; + +static inline void +emit_vxr(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB) +{ + union vxr_inst inst; + inst.inst.op = 4; + inst.inst.vD = vD; + inst.inst.vA = vA; + inst.inst.vB = vB; + inst.inst.rC = 0; + inst.inst.op2 = op2; + p->store[p->num_inst++] = inst.bits; + assert(p->num_inst <= p->max_inst); +}; + + +union va_inst { + uint32_t bits; + struct { + unsigned op:6; + unsigned vD:5; + unsigned vA:5; + unsigned vB:5; + unsigned vC:5; + unsigned op2:6; + } inst; +}; + +static inline void +emit_va(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB, uint vC) +{ + union va_inst inst; + inst.inst.op = 4; + inst.inst.vD = vD; + inst.inst.vA = vA; + inst.inst.vB = vB; + inst.inst.vC = vC; + inst.inst.op2 = op2; + p->store[p->num_inst++] = inst.bits; + assert(p->num_inst <= p->max_inst); +}; + + +union i_inst { + uint32_t bits; + struct { + unsigned op:6; + unsigned li:24; + unsigned aa:1; + unsigned lk:1; + } inst; +}; + +static INLINE void +emit_i(struct ppc_function *p, uint op, uint li, uint aa, uint lk) +{ + union i_inst inst; + inst.inst.op = op; + inst.inst.li = li; + inst.inst.aa = aa; + inst.inst.lk = lk; + p->store[p->num_inst++] = inst.bits; + assert(p->num_inst <= p->max_inst); +} + + +union xl_inst { + uint32_t bits; + struct { + unsigned op:6; + unsigned bo:5; + unsigned bi:5; + unsigned unused:3; + unsigned bh:2; + unsigned op2:10; + unsigned lk:1; + } inst; +}; + +static INLINE void +emit_xl(struct ppc_function *p, uint op, uint bo, uint bi, uint bh, + uint op2, uint lk) +{ + union xl_inst inst; + inst.inst.op = op; + inst.inst.bo = bo; + inst.inst.bi = bi; + inst.inst.unused = 0x0; + inst.inst.bh = bh; + inst.inst.op2 = op2; + inst.inst.lk = lk; + p->store[p->num_inst++] = inst.bits; + assert(p->num_inst <= p->max_inst); +} + +static INLINE void +dump_xl(const char *name, uint inst) +{ + union xl_inst i; + + i.bits = inst; + debug_printf("%s = 0x%08x\n", name, inst); + debug_printf(" op: %d 0x%x\n", i.inst.op, i.inst.op); + debug_printf(" bo: %d 0x%x\n", i.inst.bo, i.inst.bo); + debug_printf(" bi: %d 0x%x\n", i.inst.bi, i.inst.bi); + debug_printf(" unused: %d 0x%x\n", i.inst.unused, i.inst.unused); + debug_printf(" bh: %d 0x%x\n", i.inst.bh, i.inst.bh); + debug_printf(" op2: %d 0x%x\n", i.inst.op2, i.inst.op2); + debug_printf(" lk: %d 0x%x\n", i.inst.lk, i.inst.lk); +} + + +union x_inst { + uint32_t bits; + struct { + unsigned op:6; + unsigned vrs:5; + unsigned ra:5; + unsigned rb:5; + unsigned op2:10; + unsigned unused:1; + } inst; +}; + +static INLINE void +emit_x(struct ppc_function *p, uint op, uint vrs, uint ra, uint rb, uint op2) +{ + union x_inst inst; + inst.inst.op = op; + inst.inst.vrs = vrs; + inst.inst.ra = ra; + inst.inst.rb = rb; + inst.inst.op2 = op2; + inst.inst.unused = 0x0; + p->store[p->num_inst++] = inst.bits; + assert(p->num_inst <= p->max_inst); +} + + +union d_inst { + uint32_t bits; + struct { + unsigned op:6; + unsigned rt:5; + unsigned ra:5; + unsigned si:16; + } inst; +}; + +static inline void +emit_d(struct ppc_function *p, uint op, uint rt, uint ra, int si) +{ + union d_inst inst; + assert(si >= -32768); + assert(si <= 32767); + inst.inst.op = op; + inst.inst.rt = rt; + inst.inst.ra = ra; + inst.inst.si = (unsigned) (si & 0xffff); + p->store[p->num_inst++] = inst.bits; + assert(p->num_inst <= p->max_inst); +}; + + +union a_inst { + uint32_t bits; + struct { + unsigned op:6; + unsigned frt:5; + unsigned fra:5; + unsigned frb:5; + unsigned unused:5; + unsigned op2:5; + unsigned rc:1; + } inst; +}; + +static inline void +emit_a(struct ppc_function *p, uint op, uint frt, uint fra, uint frb, uint op2, + uint rc) +{ + union a_inst inst; + inst.inst.op = op; + inst.inst.frt = frt; + inst.inst.fra = fra; + inst.inst.frb = frb; + inst.inst.unused = 0x0; + inst.inst.op2 = op2; + inst.inst.rc = rc; + p->store[p->num_inst++] = inst.bits; + assert(p->num_inst <= p->max_inst); +}; + + +union xo_inst { + uint32_t bits; + struct { + unsigned op:6; + unsigned rt:5; + unsigned ra:5; + unsigned rb:5; + unsigned oe:1; + unsigned op2:9; + unsigned rc:1; + } inst; +}; + +static INLINE void +emit_xo(struct ppc_function *p, uint op, uint rt, uint ra, uint rb, uint oe, + uint op2, uint rc) +{ + union xo_inst inst; + inst.inst.op = op; + inst.inst.rt = rt; + inst.inst.ra = ra; + inst.inst.rb = rb; + inst.inst.oe = oe; + inst.inst.op2 = op2; + inst.inst.rc = rc; + p->store[p->num_inst++] = inst.bits; + assert(p->num_inst <= p->max_inst); +} + + + + + +/** + ** float vector arithmetic + **/ + +/** vector float add */ +void +ppc_vaddfp(struct ppc_function *p, uint vD, uint vA, uint vB) +{ + emit_vx(p, 10, vD, vA, vB); +} + +/** vector float substract */ +void +ppc_vsubfp(struct ppc_function *p, uint vD, uint vA, uint vB) +{ + emit_vx(p, 74, vD, vA, vB); +} + +/** vector float min */ +void +ppc_vminfp(struct ppc_function *p, uint vD, uint vA, uint vB) +{ + emit_vx(p, 1098, vD, vA, vB); +} + +/** vector float max */ +void +ppc_vmaxfp(struct ppc_function *p, uint vD, uint vA, uint vB) +{ + emit_vx(p, 1034, vD, vA, vB); +} + +/** vector float mult add: vD = vA * vB + vC */ +void +ppc_vmaddfp(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC) +{ + emit_va(p, 46, vD, vA, vC, vB); /* note arg order */ +} + +/** vector float compare greater than */ +void +ppc_vcmpgtfpx(struct ppc_function *p, uint vD, uint vA, uint vB) +{ + emit_vxr(p, 710, vD, vA, vB); +} + +/** vector float compare greater than or equal to */ +void +ppc_vcmpgefpx(struct ppc_function *p, uint vD, uint vA, uint vB) +{ + emit_vxr(p, 454, vD, vA, vB); +} + +/** vector float compare equal */ +void +ppc_vcmpeqfpx(struct ppc_function *p, uint vD, uint vA, uint vB) +{ + emit_vxr(p, 198, vD, vA, vB); +} + +/** vector float 2^x */ +void +ppc_vexptefp(struct ppc_function *p, uint vD, uint vB) +{ + emit_vx(p, 394, vD, 0, vB); +} + +/** vector float log2(x) */ +void +ppc_vlogefp(struct ppc_function *p, uint vD, uint vB) +{ + emit_vx(p, 458, vD, 0, vB); +} + +/** vector float reciprocol */ +void +ppc_vrefp(struct ppc_function *p, uint vD, uint vB) +{ + emit_vx(p, 266, vD, 0, vB); +} + +/** vector float reciprocol sqrt estimate */ +void +ppc_vrsqrtefp(struct ppc_function *p, uint vD, uint vB) +{ + emit_vx(p, 330, vD, 0, vB); +} + +/** vector float round to negative infinity */ +void +ppc_vrfim(struct ppc_function *p, uint vD, uint vB) +{ + emit_vx(p, 714, vD, 0, vB); +} + +/** vector float round to positive infinity */ +void +ppc_vrfip(struct ppc_function *p, uint vD, uint vB) +{ + emit_vx(p, 650, vD, 0, vB); +} + +/** vector float round to nearest int */ +void +ppc_vrfin(struct ppc_function *p, uint vD, uint vB) +{ + emit_vx(p, 522, vD, 0, vB); +} + +/** vector float round to int toward zero */ +void +ppc_vrfiz(struct ppc_function *p, uint vD, uint vB) +{ + emit_vx(p, 586, vD, 0, vB); +} + +/** vector store: store vR at mem[vA+vB] */ +void +ppc_stvx(struct ppc_function *p, uint vR, uint vA, uint vB) +{ + emit_x(p, 31, vR, vA, vB, 231); +} + +/** vector load: vR = mem[vA+vB] */ +void +ppc_lvx(struct ppc_function *p, uint vR, uint vA, uint vB) +{ + emit_x(p, 31, vR, vA, vB, 103); +} + +/** load vector element word: vR = mem_word[ra+rb] */ +void +ppc_lvewx(struct ppc_function *p, uint vr, uint ra, uint rb) +{ + emit_x(p, 31, vr, ra, rb, 71); +} + + + + +/** + ** vector bitwise operations + **/ + +/** vector and */ +void +ppc_vand(struct ppc_function *p, uint vD, uint vA, uint vB) +{ + emit_vx(p, 1028, vD, vA, vB); +} + +/** vector and complement */ +void +ppc_vandc(struct ppc_function *p, uint vD, uint vA, uint vB) +{ + emit_vx(p, 1092, vD, vA, vB); +} + +/** vector or */ +void +ppc_vor(struct ppc_function *p, uint vD, uint vA, uint vB) +{ + emit_vx(p, 1156, vD, vA, vB); +} + +/** vector nor */ +void +ppc_vnor(struct ppc_function *p, uint vD, uint vA, uint vB) +{ + emit_vx(p, 1284, vD, vA, vB); +} + +/** vector xor */ +void +ppc_vxor(struct ppc_function *p, uint vD, uint vA, uint vB) +{ + emit_vx(p, 1220, vD, vA, vB); +} + +/** Pseudo-instruction: vector move */ +void +ppc_vmove(struct ppc_function *p, uint vD, uint vA) +{ + ppc_vor(p, vD, vA, vA); +} + +/** Set vector register to {0,0,0,0} */ +void +ppc_vzero(struct ppc_function *p, uint vr) +{ + ppc_vxor(p, vr, vr, vr); +} + + + + +/** + ** Vector shuffle / select / splat / etc + **/ + +/** vector permute */ +void +ppc_vperm(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC) +{ + emit_va(p, 43, vD, vA, vB, vC); +} + +/** vector select */ +void +ppc_vsel(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC) +{ + emit_va(p, 42, vD, vA, vB, vC); +} + +/** vector splat byte */ +void +ppc_vspltb(struct ppc_function *p, uint vD, uint vB, uint imm) +{ + emit_vx(p, 42, vD, imm, vB); +} + +/** vector splat half word */ +void +ppc_vsplthw(struct ppc_function *p, uint vD, uint vB, uint imm) +{ + emit_vx(p, 588, vD, imm, vB); +} + +/** vector splat word */ +void +ppc_vspltw(struct ppc_function *p, uint vD, uint vB, uint imm) +{ + emit_vx(p, 652, vD, imm, vB); +} + +/** vector splat signed immediate word */ +void +ppc_vspltisw(struct ppc_function *p, uint vD, int imm) +{ + assert(imm >= -16); + assert(imm < 15); + emit_vx(p, 908, vD, imm, 0); +} + +/** vector shift left word: vD[word] = vA[word] << (vB[word] & 0x1f) */ +void +ppc_vslw(struct ppc_function *p, uint vD, uint vA, uint vB) +{ + emit_vx(p, 388, vD, vA, vB); +} + + + + +/** + ** integer arithmetic + **/ + +/** rt = ra + imm */ +void +ppc_addi(struct ppc_function *p, uint rt, uint ra, int imm) +{ + emit_d(p, 14, rt, ra, imm); +} + +/** rt = ra + (imm << 16) */ +void +ppc_addis(struct ppc_function *p, uint rt, uint ra, int imm) +{ + emit_d(p, 15, rt, ra, imm); +} + +/** rt = ra + rb */ +void +ppc_add(struct ppc_function *p, uint rt, uint ra, uint rb) +{ + emit_xo(p, 31, rt, ra, rb, 0, 266, 0); +} + +/** rt = ra AND ra */ +void +ppc_and(struct ppc_function *p, uint rt, uint ra, uint rb) +{ + emit_x(p, 31, ra, rt, rb, 28); /* note argument order */ +} + +/** rt = ra AND imm */ +void +ppc_andi(struct ppc_function *p, uint rt, uint ra, int imm) +{ + emit_d(p, 28, ra, rt, imm); /* note argument order */ +} + +/** rt = ra OR ra */ +void +ppc_or(struct ppc_function *p, uint rt, uint ra, uint rb) +{ + emit_x(p, 31, ra, rt, rb, 444); /* note argument order */ +} + +/** rt = ra OR imm */ +void +ppc_ori(struct ppc_function *p, uint rt, uint ra, int imm) +{ + emit_d(p, 24, ra, rt, imm); /* note argument order */ +} + +/** rt = ra XOR ra */ +void +ppc_xor(struct ppc_function *p, uint rt, uint ra, uint rb) +{ + emit_x(p, 31, ra, rt, rb, 316); /* note argument order */ +} + +/** rt = ra XOR imm */ +void +ppc_xori(struct ppc_function *p, uint rt, uint ra, int imm) +{ + emit_d(p, 26, ra, rt, imm); /* note argument order */ +} + +/** pseudo instruction: move: rt = ra */ +void +ppc_mr(struct ppc_function *p, uint rt, uint ra) +{ + ppc_or(p, rt, ra, ra); +} + +/** pseudo instruction: load immediate: rt = imm */ +void +ppc_li(struct ppc_function *p, uint rt, int imm) +{ + ppc_addi(p, rt, 0, imm); +} + +/** rt = imm << 16 */ +void +ppc_lis(struct ppc_function *p, uint rt, int imm) +{ + ppc_addis(p, rt, 0, imm); +} + +/** rt = imm */ +void +ppc_load_int(struct ppc_function *p, uint rt, int imm) +{ + ppc_lis(p, rt, (imm >> 16)); /* rt = imm >> 16 */ + ppc_ori(p, rt, rt, (imm & 0xffff)); /* rt = rt | (imm & 0xffff) */ +} + + + + +/** + ** integer load/store + **/ + +/** store rs at memory[(ra)+d], + * then update ra = (ra)+d + */ +void +ppc_stwu(struct ppc_function *p, uint rs, uint ra, int d) +{ + emit_d(p, 37, rs, ra, d); +} + +/** store rs at memory[(ra)+d] */ +void +ppc_stw(struct ppc_function *p, uint rs, uint ra, int d) +{ + emit_d(p, 36, rs, ra, d); +} + +/** Load rt = mem[(ra)+d]; then zero set high 32 bits to zero. */ +void +ppc_lwz(struct ppc_function *p, uint rt, uint ra, int d) +{ + emit_d(p, 32, rt, ra, d); +} + + + +/** + ** Float (non-vector) arithmetic + **/ + +/** add: frt = fra + frb */ +void +ppc_fadd(struct ppc_function *p, uint frt, uint fra, uint frb) +{ + emit_a(p, 63, frt, fra, frb, 21, 0); +} + +/** sub: frt = fra - frb */ +void +ppc_fsub(struct ppc_function *p, uint frt, uint fra, uint frb) +{ + emit_a(p, 63, frt, fra, frb, 20, 0); +} + +/** convert to int: rt = (int) ra */ +void +ppc_fctiwz(struct ppc_function *p, uint rt, uint fra) +{ + emit_x(p, 63, rt, 0, fra, 15); +} + +/** store frs at mem[(ra)+offset] */ +void +ppc_stfs(struct ppc_function *p, uint frs, uint ra, int offset) +{ + emit_d(p, 52, frs, ra, offset); +} + +/** store frs at mem[(ra)+(rb)] */ +void +ppc_stfiwx(struct ppc_function *p, uint frs, uint ra, uint rb) +{ + emit_x(p, 31, frs, ra, rb, 983); +} + +/** load frt = mem[(ra)+offset] */ +void +ppc_lfs(struct ppc_function *p, uint frt, uint ra, int offset) +{ + emit_d(p, 48, frt, ra, offset); +} + + + + + +/** + ** branch instructions + **/ + +/** BLR: Branch to link register (p. 35) */ +void +ppc_blr(struct ppc_function *p) +{ + emit_i(p, 18, 0, 0, 1); +} + +/** Branch Conditional to Link Register (p. 36) */ +void +ppc_bclr(struct ppc_function *p, uint condOp, uint branchHint, uint condReg) +{ + emit_xl(p, 19, condOp, condReg, branchHint, 16, 0); +} + +/** Pseudo instruction: return from subroutine */ +void +ppc_return(struct ppc_function *p) +{ + ppc_bclr(p, BRANCH_COND_ALWAYS, BRANCH_HINT_SUB_RETURN, 0); +} diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.h b/src/gallium/auxiliary/rtasm/rtasm_ppc.h new file mode 100644 index 0000000000..f938d8d759 --- /dev/null +++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.h @@ -0,0 +1,324 @@ +/************************************************************************** + * + * Copyright (C) 2008 Tungsten Graphics, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * PPC code generation. + * \author Brian Paul + */ + + +#ifndef RTASM_PPC_H +#define RTASM_PPC_H + + +#include "pipe/p_compiler.h" + + +#define PPC_INST_SIZE 4 /**< 4 bytes / instruction */ + +#define PPC_NUM_REGS 32 +#define PPC_NUM_FP_REGS 32 +#define PPC_NUM_VEC_REGS 32 + +/** Stack pointer register */ +#define PPC_REG_SP 1 + +/** Branch conditions */ +#define BRANCH_COND_ALWAYS 0x14 /* binary 1z1zz (z=ignored) */ + +/** Branch hints */ +#define BRANCH_HINT_SUB_RETURN 0x0 /* binary 00 */ + + +struct ppc_function +{ + uint32_t *store; /**< instruction buffer */ + uint num_inst; + uint max_inst; + uint32_t reg_used; /** used/free general-purpose registers bitmask */ + uint32_t fp_used; /** used/free floating point registers bitmask */ + uint32_t vec_used; /** used/free vector registers bitmask */ +}; + + + +extern void ppc_init_func(struct ppc_function *p, unsigned max_inst); +extern void ppc_release_func(struct ppc_function *p); +extern void (*ppc_get_func( struct ppc_function *p ))( void ); +extern void ppc_dump_func(const struct ppc_function *p); + +extern int ppc_reserve_register(struct ppc_function *p, int reg); +extern int ppc_allocate_register(struct ppc_function *p); +extern void ppc_release_register(struct ppc_function *p, int reg); +extern int ppc_allocate_fp_register(struct ppc_function *p); +extern void ppc_release_fp_register(struct ppc_function *p, int reg); +extern int ppc_allocate_vec_register(struct ppc_function *p); +extern void ppc_release_vec_register(struct ppc_function *p, int reg); + + + +/** + ** float vector arithmetic + **/ + +/** vector float add */ +extern void +ppc_vaddfp(struct ppc_function *p,uint vD, uint vA, uint vB); + +/** vector float substract */ +extern void +ppc_vsubfp(struct ppc_function *p, uint vD, uint vA, uint vB); + +/** vector float min */ +extern void +ppc_vminfp(struct ppc_function *p, uint vD, uint vA, uint vB); + +/** vector float max */ +extern void +ppc_vmaxfp(struct ppc_function *p, uint vD, uint vA, uint vB); + +/** vector float mult add */ +extern void +ppc_vmaddfp(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC); + +/** vector float compare greater than */ +extern void +ppc_vcmpgtfpx(struct ppc_function *p, uint vD, uint vA, uint vB); + +/** vector float compare greater than or equal to */ +extern void +ppc_vcmpgefpx(struct ppc_function *p, uint vD, uint vA, uint vB); + +/** vector float compare equal */ +extern void +ppc_vcmpeqfpx(struct ppc_function *p, uint vD, uint vA, uint vB); + +/** vector float 2^x */ +extern void +ppc_vexptefp(struct ppc_function *p, uint vD, uint vB); + +/** vector float log2(x) */ +extern void +ppc_vlogefp(struct ppc_function *p, uint vD, uint vB); + +/** vector float reciprocol */ +extern void +ppc_vrefp(struct ppc_function *p, uint vD, uint vB); + +/** vector float reciprocol sqrt estimate */ +extern void +ppc_vrsqrtefp(struct ppc_function *p, uint vD, uint vB); + +/** vector float round to negative infinity */ +extern void +ppc_vrfim(struct ppc_function *p, uint vD, uint vB); + +/** vector float round to positive infinity */ +extern void +ppc_vrfip(struct ppc_function *p, uint vD, uint vB); + +/** vector float round to nearest int */ +extern void +ppc_vrfin(struct ppc_function *p, uint vD, uint vB); + +/** vector float round to int toward zero */ +extern void +ppc_vrfiz(struct ppc_function *p, uint vD, uint vB); + + +/** vector store: store vR at mem[vA+vB] */ +extern void +ppc_stvx(struct ppc_function *p, uint vR, uint vA, uint vB); + +/** vector load: vR = mem[vA+vB] */ +extern void +ppc_lvx(struct ppc_function *p, uint vR, uint vA, uint vB); + +/** load vector element word: vR = mem_word[vA+vB] */ +extern void +ppc_lvewx(struct ppc_function *p, uint vR, uint vA, uint vB); + + + +/** + ** vector bitwise operations + **/ + + +/** vector and */ +extern void +ppc_vand(struct ppc_function *p, uint vD, uint vA, uint vB); + +/** vector and complement */ +extern void +ppc_vandc(struct ppc_function *p, uint vD, uint vA, uint vB); + +/** vector or */ +extern void +ppc_vor(struct ppc_function *p, uint vD, uint vA, uint vB); + +/** vector nor */ +extern void +ppc_vnor(struct ppc_function *p, uint vD, uint vA, uint vB); + +/** vector xor */ +extern void +ppc_vxor(struct ppc_function *p, uint vD, uint vA, uint vB); + +/** Pseudo-instruction: vector move */ +extern void +ppc_vmove(struct ppc_function *p, uint vD, uint vA); + +/** Set vector register to {0,0,0,0} */ +extern void +ppc_vzero(struct ppc_function *p, uint vr); + + + +/** + ** Vector shuffle / select / splat / etc + **/ + +/** vector permute */ +extern void +ppc_vperm(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC); + +/** vector select */ +extern void +ppc_vsel(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC); + +/** vector splat byte */ +extern void +ppc_vspltb(struct ppc_function *p, uint vD, uint vB, uint imm); + +/** vector splat half word */ +extern void +ppc_vsplthw(struct ppc_function *p, uint vD, uint vB, uint imm); + +/** vector splat word */ +extern void +ppc_vspltw(struct ppc_function *p, uint vD, uint vB, uint imm); + +/** vector splat signed immediate word */ +extern void +ppc_vspltisw(struct ppc_function *p, uint vD, int imm); + +/** vector shift left word: vD[word] = vA[word] << (vB[word] & 0x1f) */ +extern void +ppc_vslw(struct ppc_function *p, uint vD, uint vA, uint vB); + + + +/** + ** scalar arithmetic + **/ + +extern void +ppc_add(struct ppc_function *p, uint rt, uint ra, uint rb); + +extern void +ppc_addi(struct ppc_function *p, uint rt, uint ra, int imm); + +extern void +ppc_and(struct ppc_function *p, uint rt, uint ra, uint rb); + +extern void +ppc_andi(struct ppc_function *p, uint rt, uint ra, int imm); + +extern void +ppc_or(struct ppc_function *p, uint rt, uint ra, uint rb); + +extern void +ppc_ori(struct ppc_function *p, uint rt, uint ra, int imm); + +extern void +ppc_xor(struct ppc_function *p, uint rt, uint ra, uint rb); + +extern void +ppc_xori(struct ppc_function *p, uint rt, uint ra, int imm); + +extern void +ppc_mr(struct ppc_function *p, uint rt, uint ra); + +extern void +ppc_li(struct ppc_function *p, uint rt, int imm); + +extern void +ppc_lis(struct ppc_function *p, uint rt, int imm); + +extern void +ppc_load_int(struct ppc_function *p, uint rt, int imm); + + + +/** + ** scalar load/store + **/ + +extern void +ppc_stwu(struct ppc_function *p, uint rs, uint ra, int d); + +extern void +ppc_stw(struct ppc_function *p, uint rs, uint ra, int d); + +extern void +ppc_lwz(struct ppc_function *p, uint rs, uint ra, int d); + + + +/** + ** Float (non-vector) arithmetic + **/ + +extern void +ppc_fadd(struct ppc_function *p, uint frt, uint fra, uint frb); + +extern void +ppc_fsub(struct ppc_function *p, uint frt, uint fra, uint frb); + +extern void +ppc_fctiwz(struct ppc_function *p, uint rt, uint ra); + +extern void +ppc_stfs(struct ppc_function *p, uint frs, uint ra, int offset); + +extern void +ppc_stfiwx(struct ppc_function *p, uint frs, uint ra, uint rb); + + + +/** + ** branch instructions + **/ + +extern void +ppc_blr(struct ppc_function *p); + +void +ppc_bclr(struct ppc_function *p, uint condOp, uint branchHint, uint condReg); + +extern void +ppc_return(struct ppc_function *p); + + +#endif /* RTASM_PPC_H */ diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c index a04cc6c4ff..dea1aed032 100644 --- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c +++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c @@ -27,12 +27,16 @@ * Real-time assembly generation interface for Cell B.E. SPEs. * * \author Ian Romanick <idr@us.ibm.com> + * \author Brian Paul */ + +#include <stdio.h> #include "pipe/p_compiler.h" #include "util/u_memory.h" #include "rtasm_ppc_spe.h" + #ifdef GALLIUM_CELL /** * SPE instruction types @@ -143,8 +147,46 @@ union spe_inst_RI18 { /*@}*/ +static void +indent(const struct spe_function *p) +{ + int i; + for (i = 0; i < p->indent; i++) { + putchar(' '); + } +} + + +static const char * +rem_prefix(const char *longname) +{ + return longname + 4; +} + + +static const char * +reg_name(int reg) +{ + switch (reg) { + case SPE_REG_SP: + return "$sp"; + case SPE_REG_RA: + return "$lr"; + default: + { + /* cycle through four buffers to handle multiple calls per printf */ + static char buf[4][10]; + static int b = 0; + b = (b + 1) % 4; + sprintf(buf[b], "$%d", reg); + return buf[b]; + } + } +} + + static void emit_RR(struct spe_function *p, unsigned op, unsigned rT, - unsigned rA, unsigned rB) + unsigned rA, unsigned rB, const char *name) { union spe_inst_RR inst; inst.inst.op = op; @@ -153,11 +195,16 @@ static void emit_RR(struct spe_function *p, unsigned op, unsigned rT, inst.inst.rT = rT; p->store[p->num_inst++] = inst.bits; assert(p->num_inst <= p->max_inst); + if (p->print) { + indent(p); + printf("%s\t%s, %s, %s\n", + rem_prefix(name), reg_name(rT), reg_name(rA), reg_name(rB)); + } } static void emit_RRR(struct spe_function *p, unsigned op, unsigned rT, - unsigned rA, unsigned rB, unsigned rC) + unsigned rA, unsigned rB, unsigned rC, const char *name) { union spe_inst_RRR inst; inst.inst.op = op; @@ -167,11 +214,16 @@ static void emit_RRR(struct spe_function *p, unsigned op, unsigned rT, inst.inst.rC = rC; p->store[p->num_inst++] = inst.bits; assert(p->num_inst <= p->max_inst); + if (p->print) { + indent(p); + printf("%s\t%s, %s, %s, %s\n", rem_prefix(name), reg_name(rT), + reg_name(rA), reg_name(rB), reg_name(rC)); + } } static void emit_RI7(struct spe_function *p, unsigned op, unsigned rT, - unsigned rA, int imm) + unsigned rA, int imm, const char *name) { union spe_inst_RI7 inst; inst.inst.op = op; @@ -180,12 +232,17 @@ static void emit_RI7(struct spe_function *p, unsigned op, unsigned rT, inst.inst.rT = rT; p->store[p->num_inst++] = inst.bits; assert(p->num_inst <= p->max_inst); + if (p->print) { + indent(p); + printf("%s\t%s, %s, 0x%x\n", + rem_prefix(name), reg_name(rT), reg_name(rA), imm); + } } static void emit_RI8(struct spe_function *p, unsigned op, unsigned rT, - unsigned rA, int imm) + unsigned rA, int imm, const char *name) { union spe_inst_RI8 inst; inst.inst.op = op; @@ -194,12 +251,17 @@ static void emit_RI8(struct spe_function *p, unsigned op, unsigned rT, inst.inst.rT = rT; p->store[p->num_inst++] = inst.bits; assert(p->num_inst <= p->max_inst); + if (p->print) { + indent(p); + printf("%s\t%s, %s, 0x%x\n", + rem_prefix(name), reg_name(rT), reg_name(rA), imm); + } } static void emit_RI10(struct spe_function *p, unsigned op, unsigned rT, - unsigned rA, int imm) + unsigned rA, int imm, const char *name) { union spe_inst_RI10 inst; inst.inst.op = op; @@ -208,11 +270,26 @@ static void emit_RI10(struct spe_function *p, unsigned op, unsigned rT, inst.inst.rT = rT; p->store[p->num_inst++] = inst.bits; assert(p->num_inst <= p->max_inst); + if (p->print) { + indent(p); + printf("%s\t%s, %s, 0x%x\n", + rem_prefix(name), reg_name(rT), reg_name(rA), imm); + } +} + + +/** As above, but do range checking on signed immediate value */ +static void emit_RI10s(struct spe_function *p, unsigned op, unsigned rT, + unsigned rA, int imm, const char *name) +{ + assert(imm <= 511); + assert(imm >= -512); + emit_RI10(p, op, rT, rA, imm, name); } static void emit_RI16(struct spe_function *p, unsigned op, unsigned rT, - int imm) + int imm, const char *name) { union spe_inst_RI16 inst; inst.inst.op = op; @@ -220,11 +297,15 @@ static void emit_RI16(struct spe_function *p, unsigned op, unsigned rT, inst.inst.rT = rT; p->store[p->num_inst++] = inst.bits; assert(p->num_inst <= p->max_inst); + if (p->print) { + indent(p); + printf("%s\t%s, 0x%x\n", rem_prefix(name), reg_name(rT), imm); + } } static void emit_RI18(struct spe_function *p, unsigned op, unsigned rT, - int imm) + int imm, const char *name) { union spe_inst_RI18 inst; inst.inst.op = op; @@ -232,6 +313,10 @@ static void emit_RI18(struct spe_function *p, unsigned op, unsigned rT, inst.inst.rT = rT; p->store[p->num_inst++] = inst.bits; assert(p->num_inst <= p->max_inst); + if (p->print) { + indent(p); + printf("%s\t%s, 0x%x\n", rem_prefix(name), reg_name(rT), imm); + } } @@ -240,80 +325,97 @@ static void emit_RI18(struct spe_function *p, unsigned op, unsigned rT, #define EMIT_(_name, _op) \ void _name (struct spe_function *p, unsigned rT) \ { \ - emit_RR(p, _op, rT, 0, 0); \ + emit_RR(p, _op, rT, 0, 0, __FUNCTION__); \ } #define EMIT_R(_name, _op) \ void _name (struct spe_function *p, unsigned rT, unsigned rA) \ { \ - emit_RR(p, _op, rT, rA, 0); \ + emit_RR(p, _op, rT, rA, 0, __FUNCTION__); \ } #define EMIT_RR(_name, _op) \ void _name (struct spe_function *p, unsigned rT, unsigned rA, unsigned rB) \ { \ - emit_RR(p, _op, rT, rA, rB); \ + emit_RR(p, _op, rT, rA, rB, __FUNCTION__); \ } #define EMIT_RRR(_name, _op) \ void _name (struct spe_function *p, unsigned rT, unsigned rA, unsigned rB, unsigned rC) \ { \ - emit_RRR(p, _op, rT, rA, rB, rC); \ + emit_RRR(p, _op, rT, rA, rB, rC, __FUNCTION__); \ } #define EMIT_RI7(_name, _op) \ void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \ { \ - emit_RI7(p, _op, rT, rA, imm); \ + emit_RI7(p, _op, rT, rA, imm, __FUNCTION__); \ } #define EMIT_RI8(_name, _op, bias) \ void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \ { \ - emit_RI8(p, _op, rT, rA, bias - imm); \ + emit_RI8(p, _op, rT, rA, bias - imm, __FUNCTION__); \ } #define EMIT_RI10(_name, _op) \ void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \ { \ - emit_RI10(p, _op, rT, rA, imm); \ + emit_RI10(p, _op, rT, rA, imm, __FUNCTION__); \ +} + +#define EMIT_RI10s(_name, _op) \ +void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \ +{ \ + emit_RI10s(p, _op, rT, rA, imm, __FUNCTION__); \ } #define EMIT_RI16(_name, _op) \ void _name (struct spe_function *p, unsigned rT, int imm) \ { \ - emit_RI16(p, _op, rT, imm); \ + emit_RI16(p, _op, rT, imm, __FUNCTION__); \ } #define EMIT_RI18(_name, _op) \ void _name (struct spe_function *p, unsigned rT, int imm) \ { \ - emit_RI18(p, _op, rT, imm); \ + emit_RI18(p, _op, rT, imm, __FUNCTION__); \ } #define EMIT_I16(_name, _op) \ void _name (struct spe_function *p, int imm) \ { \ - emit_RI16(p, _op, 0, imm); \ + emit_RI16(p, _op, 0, imm, __FUNCTION__); \ } #include "rtasm_ppc_spe.h" + /** * Initialize an spe_function. * \param code_size size of instruction buffer to allocate, in bytes. */ void spe_init_func(struct spe_function *p, unsigned code_size) { + unsigned int i; + p->store = align_malloc(code_size, 16); p->num_inst = 0; p->max_inst = code_size / SPE_INST_SIZE; + p->set_count = 0; + memset(p->regs, 0, SPE_NUM_REGS * sizeof(p->regs[0])); + /* Conservatively treat R0 - R2 and R80 - R127 as non-volatile. */ - p->regs[0] = ~7; - p->regs[1] = (1U << (80 - 64)) - 1; + p->regs[0] = p->regs[1] = p->regs[2] = 1; + for (i = 80; i <= 127; i++) { + p->regs[i] = 1; + } + + p->print = false; + p->indent = 0; } @@ -327,20 +429,23 @@ void spe_release_func(struct spe_function *p) } +/** Return current code size in bytes. */ +unsigned spe_code_size(const struct spe_function *p) +{ + return p->num_inst * SPE_INST_SIZE; +} + + /** - * Alloate a SPE register. + * Allocate a SPE register. * \return register index or -1 if none left. */ int spe_allocate_available_register(struct spe_function *p) { unsigned i; for (i = 0; i < SPE_NUM_REGS; i++) { - const uint64_t mask = (1ULL << (i % 64)); - const unsigned idx = i / 64; - - assert(idx < 2); - if ((p->regs[idx] & mask) != 0) { - p->regs[idx] &= ~mask; + if (p->regs[i] == 0) { + p->regs[i] = 1; return i; } } @@ -354,31 +459,160 @@ int spe_allocate_available_register(struct spe_function *p) */ int spe_allocate_register(struct spe_function *p, int reg) { - const unsigned idx = reg / 64; - const unsigned bit = reg % 64; - assert(reg < SPE_NUM_REGS); - assert((p->regs[idx] & (1ULL << bit)) != 0); - - p->regs[idx] &= ~(1ULL << bit); + assert(p->regs[reg] == 0); + p->regs[reg] = 1; return reg; } /** - * Mark the given SPE register as "unallocated". + * Mark the given SPE register as "unallocated". Note that this should + * only be used on registers allocated in the current register set; an + * assertion will fail if an attempt is made to deallocate a register + * allocated in an earlier register set. */ void spe_release_register(struct spe_function *p, int reg) { - const unsigned idx = reg / 64; - const unsigned bit = reg % 64; + assert(reg < SPE_NUM_REGS); + assert(p->regs[reg] == 1); - assert(idx < 2); + p->regs[reg] = 0; +} + +/** + * Start a new set of registers. This can be called if + * it will be difficult later to determine exactly what + * registers were actually allocated during a code generation + * sequence, and you really just want to deallocate all of them. + */ +void spe_allocate_register_set(struct spe_function *p) +{ + unsigned int i; + + /* Keep track of the set count. If it ever wraps around to 0, + * we're in trouble. + */ + p->set_count++; + assert(p->set_count > 0); + + /* Increment the allocation count of all registers currently + * allocated. Then any registers that are allocated in this set + * will be the only ones with a count of 1; they'll all be released + * when the register set is released. + */ + for (i = 0; i < SPE_NUM_REGS; i++) { + if (p->regs[i] > 0) + p->regs[i]++; + } +} + +void spe_release_register_set(struct spe_function *p) +{ + unsigned int i; + + /* If the set count drops below zero, we're in trouble. */ + assert(p->set_count > 0); + p->set_count--; + + /* Drop the allocation level of all registers. Any allocated + * during this register set will drop to 0 and then become + * available. + */ + for (i = 0; i < SPE_NUM_REGS; i++) { + if (p->regs[i] > 0) + p->regs[i]--; + } +} + + +unsigned +spe_get_registers_used(const struct spe_function *p, ubyte used[]) +{ + unsigned i, num = 0; + /* only count registers in the range available to callers */ + for (i = 2; i < 80; i++) { + if (p->regs[i]) { + used[num++] = i; + } + } + return num; +} + + +void +spe_print_code(struct spe_function *p, boolean enable) +{ + p->print = enable; +} + + +void +spe_indent(struct spe_function *p, int spaces) +{ + p->indent += spaces; +} + + +void +spe_comment(struct spe_function *p, int rel_indent, const char *s) +{ + if (p->print) { + p->indent += rel_indent; + indent(p); + p->indent -= rel_indent; + printf("# %s\n", s); + } +} - assert(reg < SPE_NUM_REGS); - assert((p->regs[idx] & (1ULL << bit)) == 0); - p->regs[idx] |= (1ULL << bit); +/** + * Load quad word. + * NOTE: offset is in bytes and the least significant 4 bits must be zero! + */ +void spe_lqd(struct spe_function *p, unsigned rT, unsigned rA, int offset) +{ + const boolean pSave = p->print; + + /* offset must be a multiple of 16 */ + assert(offset % 16 == 0); + /* offset must fit in 10-bit signed int field, after shifting */ + assert((offset >> 4) <= 511); + assert((offset >> 4) >= -512); + + p->print = FALSE; + emit_RI10(p, 0x034, rT, rA, offset >> 4, "spe_lqd"); + p->print = pSave; + + if (p->print) { + indent(p); + printf("lqd\t%s, %d(%s)\n", reg_name(rT), offset, reg_name(rA)); + } +} + + +/** + * Store quad word. + * NOTE: offset is in bytes and the least significant 4 bits must be zero! + */ +void spe_stqd(struct spe_function *p, unsigned rT, unsigned rA, int offset) +{ + const boolean pSave = p->print; + + /* offset must be a multiple of 16 */ + assert(offset % 16 == 0); + /* offset must fit in 10-bit signed int field, after shifting */ + assert((offset >> 4) <= 511); + assert((offset >> 4) >= -512); + + p->print = FALSE; + emit_RI10(p, 0x024, rT, rA, offset >> 4, "spe_stqd"); + p->print = pSave; + + if (p->print) { + indent(p); + printf("stqd\t%s, %d(%s)\n", reg_name(rT), offset, reg_name(rA)); + } } @@ -392,51 +626,51 @@ void spe_release_register(struct spe_function *p, int reg) /** Branch Indirect to address in rA */ void spe_bi(struct spe_function *p, unsigned rA, int d, int e) { - emit_RI7(p, 0x1a8, 0, rA, (d << 5) | (e << 4)); + emit_RI7(p, 0x1a8, 0, rA, (d << 5) | (e << 4), __FUNCTION__); } /** Interupt Return */ void spe_iret(struct spe_function *p, unsigned rA, int d, int e) { - emit_RI7(p, 0x1aa, 0, rA, (d << 5) | (e << 4)); + emit_RI7(p, 0x1aa, 0, rA, (d << 5) | (e << 4), __FUNCTION__); } /** Branch indirect and set link on external data */ void spe_bisled(struct spe_function *p, unsigned rT, unsigned rA, int d, int e) { - emit_RI7(p, 0x1ab, rT, rA, (d << 5) | (e << 4)); + emit_RI7(p, 0x1ab, rT, rA, (d << 5) | (e << 4), __FUNCTION__); } /** Branch indirect and set link. Save PC in rT, jump to rA. */ void spe_bisl(struct spe_function *p, unsigned rT, unsigned rA, int d, int e) { - emit_RI7(p, 0x1a9, rT, rA, (d << 5) | (e << 4)); + emit_RI7(p, 0x1a9, rT, rA, (d << 5) | (e << 4), __FUNCTION__); } /** Branch indirect if zero word. If rT.word[0]==0, jump to rA. */ void spe_biz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e) { - emit_RI7(p, 0x128, rT, rA, (d << 5) | (e << 4)); + emit_RI7(p, 0x128, rT, rA, (d << 5) | (e << 4), __FUNCTION__); } /** Branch indirect if non-zero word. If rT.word[0]!=0, jump to rA. */ void spe_binz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e) { - emit_RI7(p, 0x129, rT, rA, (d << 5) | (e << 4)); + emit_RI7(p, 0x129, rT, rA, (d << 5) | (e << 4), __FUNCTION__); } /** Branch indirect if zero halfword. If rT.halfword[1]==0, jump to rA. */ void spe_bihz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e) { - emit_RI7(p, 0x12a, rT, rA, (d << 5) | (e << 4)); + emit_RI7(p, 0x12a, rT, rA, (d << 5) | (e << 4), __FUNCTION__); } /** Branch indirect if non-zero halfword. If rT.halfword[1]!=0, jump to rA. */ void spe_bihnz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e) { - emit_RI7(p, 0x12b, rT, rA, (d << 5) | (e << 4)); + emit_RI7(p, 0x12b, rT, rA, (d << 5) | (e << 4), __FUNCTION__); } @@ -505,30 +739,223 @@ spe_load_int(struct spe_function *p, unsigned rT, int i) } else { spe_ilhu(p, rT, i >> 16); - spe_iohl(p, rT, i & 0xffff); + if (i & 0xffff) + spe_iohl(p, rT, i & 0xffff); + } +} + +void spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui) +{ + /* If the whole value is in the lower 18 bits, use ila, which + * doesn't sign-extend. Otherwise, if the two halfwords of + * the constant are identical, use ilh. Otherwise, if every byte of + * the desired value is 0x00 or 0xff, we can use Form Select Mask for + * Bytes Immediate (fsmbi) to load the value in a single instruction. + * Otherwise, in the general case, we have to use ilhu followed by iohl. + */ + if ((ui & 0x0003ffff) == ui) { + spe_ila(p, rT, ui); + } + else if ((ui >> 16) == (ui & 0xffff)) { + spe_ilh(p, rT, ui & 0xffff); + } + else if ( + ((ui & 0x000000ff) == 0 || (ui & 0x000000ff) == 0x000000ff) && + ((ui & 0x0000ff00) == 0 || (ui & 0x0000ff00) == 0x0000ff00) && + ((ui & 0x00ff0000) == 0 || (ui & 0x00ff0000) == 0x00ff0000) && + ((ui & 0xff000000) == 0 || (ui & 0xff000000) == 0xff000000) + ) { + unsigned int mask = 0; + /* fsmbi duplicates each bit in the given mask eight times, + * using a 16-bit value to initialize a 16-byte quadword. + * Each 4-bit nybble of the mask corresponds to a full word + * of the result; look at the value and figure out the mask + * (replicated for each word in the quadword), and then + * form the "select mask" to get the value. + */ + if ((ui & 0x000000ff) == 0x000000ff) mask |= 0x1111; + if ((ui & 0x0000ff00) == 0x0000ff00) mask |= 0x2222; + if ((ui & 0x00ff0000) == 0x00ff0000) mask |= 0x4444; + if ((ui & 0xff000000) == 0xff000000) mask |= 0x8888; + spe_fsmbi(p, rT, mask); + } + else { + /* The general case: this usually uses two instructions, but + * may use only one if the low-order 16 bits of each word are 0. + */ + spe_ilhu(p, rT, ui >> 16); + if (ui & 0xffff) + spe_iohl(p, rT, ui & 0xffff); } } +/** + * This function is constructed identically to spe_xor_uint() below. + * Changes to one should be made in the other. + */ +void +spe_and_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui) +{ + /* If we can, emit a single instruction, either And Byte Immediate + * (which uses the same constant across each byte), And Halfword Immediate + * (which sign-extends a 10-bit immediate to 16 bits and uses that + * across each halfword), or And Word Immediate (which sign-extends + * a 10-bit immediate to 32 bits). + * + * Otherwise, we'll need to use a temporary register. + */ + unsigned int tmp; + + /* If the upper 23 bits are all 0s or all 1s, sign extension + * will work and we can use And Word Immediate + */ + tmp = ui & 0xfffffe00; + if (tmp == 0xfffffe00 || tmp == 0) { + spe_andi(p, rT, rA, ui & 0x000003ff); + return; + } + + /* If the ui field is symmetric along halfword boundaries and + * the upper 7 bits of each halfword are all 0s or 1s, we + * can use And Halfword Immediate + */ + tmp = ui & 0xfe00fe00; + if ((tmp == 0xfe00fe00 || tmp == 0) && ((ui >> 16) == (ui & 0x0000ffff))) { + spe_andhi(p, rT, rA, ui & 0x000003ff); + return; + } + + /* If the ui field is symmetric in each byte, then we can use + * the And Byte Immediate instruction. + */ + tmp = ui & 0x000000ff; + if ((ui >> 24) == tmp && ((ui >> 16) & 0xff) == tmp && ((ui >> 8) & 0xff) == tmp) { + spe_andbi(p, rT, rA, tmp); + return; + } + + /* Otherwise, we'll have to use a temporary register. */ + unsigned int tmp_reg = spe_allocate_available_register(p); + spe_load_uint(p, tmp_reg, ui); + spe_and(p, rT, rA, tmp_reg); + spe_release_register(p, tmp_reg); +} + + +/** + * This function is constructed identically to spe_and_uint() above. + * Changes to one should be made in the other. + */ +void +spe_xor_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui) +{ + /* If we can, emit a single instruction, either Exclusive Or Byte + * Immediate (which uses the same constant across each byte), Exclusive + * Or Halfword Immediate (which sign-extends a 10-bit immediate to + * 16 bits and uses that across each halfword), or Exclusive Or Word + * Immediate (which sign-extends a 10-bit immediate to 32 bits). + * + * Otherwise, we'll need to use a temporary register. + */ + unsigned int tmp; + + /* If the upper 23 bits are all 0s or all 1s, sign extension + * will work and we can use Exclusive Or Word Immediate + */ + tmp = ui & 0xfffffe00; + if (tmp == 0xfffffe00 || tmp == 0) { + spe_xori(p, rT, rA, ui & 0x000003ff); + return; + } + + /* If the ui field is symmetric along halfword boundaries and + * the upper 7 bits of each halfword are all 0s or 1s, we + * can use Exclusive Or Halfword Immediate + */ + tmp = ui & 0xfe00fe00; + if ((tmp == 0xfe00fe00 || tmp == 0) && ((ui >> 16) == (ui & 0x0000ffff))) { + spe_xorhi(p, rT, rA, ui & 0x000003ff); + return; + } + + /* If the ui field is symmetric in each byte, then we can use + * the Exclusive Or Byte Immediate instruction. + */ + tmp = ui & 0x000000ff; + if ((ui >> 24) == tmp && ((ui >> 16) & 0xff) == tmp && ((ui >> 8) & 0xff) == tmp) { + spe_xorbi(p, rT, rA, tmp); + return; + } + + /* Otherwise, we'll have to use a temporary register. */ + unsigned int tmp_reg = spe_allocate_available_register(p); + spe_load_uint(p, tmp_reg, ui); + spe_xor(p, rT, rA, tmp_reg); + spe_release_register(p, tmp_reg); +} + +void +spe_compare_equal_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui) +{ + /* If the comparison value is 9 bits or less, it fits inside a + * Compare Equal Word Immediate instruction. + */ + if ((ui & 0x000001ff) == ui) { + spe_ceqi(p, rT, rA, ui); + } + /* Otherwise, we're going to have to load a word first. */ + else { + unsigned int tmp_reg = spe_allocate_available_register(p); + spe_load_uint(p, tmp_reg, ui); + spe_ceq(p, rT, rA, tmp_reg); + spe_release_register(p, tmp_reg); + } +} + +void +spe_compare_greater_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui) +{ + /* If the comparison value is 10 bits or less, it fits inside a + * Compare Logical Greater Than Word Immediate instruction. + */ + if ((ui & 0x000003ff) == ui) { + spe_clgti(p, rT, rA, ui); + } + /* Otherwise, we're going to have to load a word first. */ + else { + unsigned int tmp_reg = spe_allocate_available_register(p); + spe_load_uint(p, tmp_reg, ui); + spe_clgt(p, rT, rA, tmp_reg); + spe_release_register(p, tmp_reg); + } +} void spe_splat(struct spe_function *p, unsigned rT, unsigned rA) { - spe_ila(p, rT, 66051); + /* Duplicate bytes 0, 1, 2, and 3 across the whole register */ + spe_ila(p, rT, 0x00010203); spe_shufb(p, rT, rA, rA, rT); } void -spe_complement(struct spe_function *p, unsigned rT) +spe_complement(struct spe_function *p, unsigned rT, unsigned rA) { - spe_nor(p, rT, rT, rT); + spe_nor(p, rT, rA, rA); } void spe_move(struct spe_function *p, unsigned rT, unsigned rA) { - spe_ori(p, rT, rA, 0); + /* Use different instructions depending on the instruction address + * to take advantage of the dual pipelines. + */ + if (p->num_inst & 1) + spe_shlqbyi(p, rT, rA, 0); /* odd pipe */ + else + spe_ori(p, rT, rA, 0); /* even pipe */ } @@ -539,4 +966,70 @@ spe_zero(struct spe_function *p, unsigned rT) } +void +spe_splat_word(struct spe_function *p, unsigned rT, unsigned rA, int word) +{ + assert(word >= 0); + assert(word <= 3); + + if (word == 0) { + int tmp1 = rT; + spe_ila(p, tmp1, 66051); + spe_shufb(p, rT, rA, rA, tmp1); + } + else { + /* XXX review this, we may not need the rotqbyi instruction */ + int tmp1 = rT; + int tmp2 = spe_allocate_available_register(p); + + spe_ila(p, tmp1, 66051); + spe_rotqbyi(p, tmp2, rA, 4 * word); + spe_shufb(p, rT, tmp2, tmp2, tmp1); + + spe_release_register(p, tmp2); + } +} + +/** + * For each 32-bit float element of rA and rB, choose the smaller of the + * two, compositing them into the rT register. + * + * The Float Compare Greater Than (fcgt) instruction will put 1s into + * compare_reg where rA > rB, and 0s where rA <= rB. + * + * Then the Select Bits (selb) instruction will take bits from rA where + * compare_reg is 0, and from rB where compare_reg is 1; i.e., from rA + * where rA <= rB and from rB where rB > rA, which is exactly the + * "min" operation. + * + * The compare_reg could in many cases be the same as rT, unless + * rT == rA || rt == rB. But since this is common in constructions + * like "x = min(x, a)", we always allocate a new register to be safe. + */ +void +spe_float_min(struct spe_function *p, unsigned rT, unsigned rA, unsigned rB) +{ + unsigned int compare_reg = spe_allocate_available_register(p); + spe_fcgt(p, compare_reg, rA, rB); + spe_selb(p, rT, rA, rB, compare_reg); + spe_release_register(p, compare_reg); +} + +/** + * For each 32-bit float element of rA and rB, choose the greater of the + * two, compositing them into the rT register. + * + * The logic is similar to that of spe_float_min() above; the only + * difference is that the registers on spe_selb() have been reversed, + * so that the larger of the two is selected instead of the smaller. + */ +void +spe_float_max(struct spe_function *p, unsigned rT, unsigned rA, unsigned rB) +{ + unsigned int compare_reg = spe_allocate_available_register(p); + spe_fcgt(p, compare_reg, rA, rB); + spe_selb(p, rT, rB, rA, compare_reg); + spe_release_register(p, compare_reg); +} + #endif /* GALLIUM_CELL */ diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h index d95e5aace3..d6a3c02f20 100644 --- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h +++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h @@ -28,6 +28,7 @@ * For details, see /opt/cell/sdk/docs/arch/SPU_ISA_v1.2_27Jan2007_pub.pdf * * \author Ian Romanick <idr@us.ibm.com> + * \author Brian Paul */ #ifndef RTASM_PPC_SPE_H @@ -39,10 +40,10 @@ /** number of general-purpose SIMD registers */ #define SPE_NUM_REGS 128 -/** Return Address register */ +/** Return Address register (aka $lr / Link Register) */ #define SPE_REG_RA 0 -/** Stack Pointer register */ +/** Stack Pointer register (aka $sp) */ #define SPE_REG_SP 1 @@ -52,25 +53,49 @@ struct spe_function uint num_inst; uint max_inst; - /** - * Mask of used / unused registers - * - * Each set bit corresponds to an available register. Each cleared bit - * corresponds to an allocated register. + /** + * The "set count" reflects the number of nested register sets + * are allowed. In the unlikely case that we exceed the set count, + * register allocation will start to be confused, which is critical + * enough that we check for it. + */ + unsigned char set_count; + + /** + * Flags for used and unused registers. Each byte corresponds to a + * register; a 0 in that byte means that the register is available. + * A value of 1 means that the register was allocated in the current + * register set. Any other value N means that the register was allocated + * N register sets ago. * * \sa * spe_allocate_register, spe_allocate_available_register, - * spe_release_register + * spe_allocate_register_set, spe_release_register_set, spe_release_register, */ - uint64_t regs[SPE_NUM_REGS / 64]; + unsigned char regs[SPE_NUM_REGS]; + + boolean print; /**< print/dump instructions as they're emitted? */ + int indent; /**< number of spaces to indent */ }; + extern void spe_init_func(struct spe_function *p, unsigned code_size); extern void spe_release_func(struct spe_function *p); +extern unsigned spe_code_size(const struct spe_function *p); extern int spe_allocate_available_register(struct spe_function *p); extern int spe_allocate_register(struct spe_function *p, int reg); extern void spe_release_register(struct spe_function *p, int reg); +extern void spe_allocate_register_set(struct spe_function *p); +extern void spe_release_register_set(struct spe_function *p); + +extern unsigned +spe_get_registers_used(const struct spe_function *p, ubyte used[]); + +extern void spe_print_code(struct spe_function *p, boolean enable); +extern void spe_indent(struct spe_function *p, int spaces); +extern void spe_comment(struct spe_function *p, int rel_indent, const char *s); + #endif /* RTASM_PPC_SPE_H */ @@ -94,6 +119,9 @@ extern void spe_release_register(struct spe_function *p, int reg); #define EMIT_RI10(_name, _op) \ extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \ int imm) +#define EMIT_RI10s(_name, _op) \ + extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \ + int imm) #define EMIT_RI16(_name, _op) \ extern void _name (struct spe_function *p, unsigned rT, int imm) #define EMIT_RI18(_name, _op) \ @@ -106,11 +134,9 @@ extern void spe_release_register(struct spe_function *p, int reg); /* Memory load / store instructions */ -EMIT_RI10(spe_lqd, 0x034); EMIT_RR (spe_lqx, 0x1c4); EMIT_RI16(spe_lqa, 0x061); EMIT_RI16(spe_lqr, 0x067); -EMIT_RI10(spe_stqd, 0x024); EMIT_RR (spe_stqx, 0x144); EMIT_RI16(spe_stqa, 0x041); EMIT_RI16(spe_stqr, 0x047); @@ -140,7 +166,7 @@ EMIT_RI16(spe_fsmbi, 0x065); EMIT_RR (spe_ah, 0x0c8); EMIT_RI10(spe_ahi, 0x01d); EMIT_RR (spe_a, 0x0c0); -EMIT_RI10(spe_ai, 0x01c); +EMIT_RI10s(spe_ai, 0x01c); EMIT_RR (spe_sfh, 0x048); EMIT_RI10(spe_sfhi, 0x00d); EMIT_RR (spe_sf, 0x040); @@ -178,19 +204,19 @@ EMIT_R (spe_xshw, 0x2ae); EMIT_R (spe_xswd, 0x2a6); EMIT_RR (spe_and, 0x0c1); EMIT_RR (spe_andc, 0x2c1); -EMIT_RI10(spe_andbi, 0x016); -EMIT_RI10(spe_andhi, 0x015); -EMIT_RI10(spe_andi, 0x014); +EMIT_RI10s(spe_andbi, 0x016); +EMIT_RI10s(spe_andhi, 0x015); +EMIT_RI10s(spe_andi, 0x014); EMIT_RR (spe_or, 0x041); EMIT_RR (spe_orc, 0x2c9); -EMIT_RI10(spe_orbi, 0x006); -EMIT_RI10(spe_orhi, 0x005); -EMIT_RI10(spe_ori, 0x004); +EMIT_RI10s(spe_orbi, 0x006); +EMIT_RI10s(spe_orhi, 0x005); +EMIT_RI10s(spe_ori, 0x004); EMIT_R (spe_orx, 0x1f0); EMIT_RR (spe_xor, 0x241); -EMIT_RI10(spe_xorbi, 0x026); -EMIT_RI10(spe_xorhi, 0x025); -EMIT_RI10(spe_xori, 0x024); +EMIT_RI10s(spe_xorbi, 0x026); +EMIT_RI10s(spe_xorhi, 0x025); +EMIT_RI10s(spe_xori, 0x024); EMIT_RR (spe_nand, 0x0c9); EMIT_RR (spe_nor, 0x049); EMIT_RR (spe_eqv, 0x249); @@ -268,6 +294,12 @@ EMIT_RI16(spe_brz, 0x040); EMIT_RI16(spe_brhnz, 0x046); EMIT_RI16(spe_brhz, 0x044); +extern void +spe_lqd(struct spe_function *p, unsigned rT, unsigned rA, int offset); + +extern void +spe_stqd(struct spe_function *p, unsigned rT, unsigned rA, int offset); + extern void spe_bi(struct spe_function *p, unsigned rA, int d, int e); extern void spe_iret(struct spe_function *p, unsigned rA, int d, int e); extern void spe_bisled(struct spe_function *p, unsigned rT, unsigned rA, @@ -292,13 +324,33 @@ spe_load_float(struct spe_function *p, unsigned rT, float x); extern void spe_load_int(struct spe_function *p, unsigned rT, int i); +/** Load/splat immediate unsigned int into rT. */ +extern void +spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui); + +/** And immediate value into rT. */ +extern void +spe_and_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui); + +/** Xor immediate value into rT. */ +extern void +spe_xor_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui); + +/** Compare equal with immediate value. */ +extern void +spe_compare_equal_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui); + +/** Compare greater with immediate value. */ +extern void +spe_compare_greater_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui); + /** Replicate word 0 of rA across rT. */ extern void spe_splat(struct spe_function *p, unsigned rT, unsigned rA); -/** Complement/invert all bits in rT. */ +/** rT = complement_all_bits(rA). */ extern void -spe_complement(struct spe_function *p, unsigned rT); +spe_complement(struct spe_function *p, unsigned rT, unsigned rA); /** rT = rA. */ extern void @@ -308,6 +360,18 @@ spe_move(struct spe_function *p, unsigned rT, unsigned rA); extern void spe_zero(struct spe_function *p, unsigned rT); +/** rT = splat(rA, word) */ +extern void +spe_splat_word(struct spe_function *p, unsigned rT, unsigned rA, int word); + +/** rT = float min(rA, rB) */ +extern void +spe_float_min(struct spe_function *p, unsigned rT, unsigned rA, unsigned rB); + +/** rT = float max(rA, rB) */ +extern void +spe_float_max(struct spe_function *p, unsigned rT, unsigned rA, unsigned rB); + /* Floating-point instructions */ @@ -361,6 +425,7 @@ EMIT_R (spe_wrch, 0x10d); #undef EMIT_RI7 #undef EMIT_RI8 #undef EMIT_RI10 +#undef EMIT_RI10s #undef EMIT_RI16 #undef EMIT_RI18 #undef EMIT_I16 diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c index 6d4c081e04..99ee74cf14 100644 --- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c +++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c @@ -240,7 +240,8 @@ static void emit_modrm( struct x86_function *p, /* Oh-oh we've stumbled into the SIB thing. */ if (regmem.file == file_REG32 && - regmem.idx == reg_SP) { + regmem.idx == reg_SP && + regmem.mod != mod_REG) { emit_1ub(p, 0x24); /* simplistic! */ } @@ -370,7 +371,11 @@ void x86_jcc( struct x86_function *p, DUMP_I(cc); if (offset < 0) { - assert(p->csr - p->store > -offset); + /*assert(p->csr - p->store > -offset);*/ + if (p->csr - p->store <= -offset) { + /* probably out of memory (using the error_overflow buffer) */ + return; + } } if (offset <= 127 && offset >= -128) { @@ -435,25 +440,70 @@ void x86_call( struct x86_function *p, struct x86_reg reg) } -/* michal: - * Temporary. As I need immediate operands, and dont want to mess with the codegen, - * I load the immediate into general purpose register and use it. - */ void x86_mov_reg_imm( struct x86_function *p, struct x86_reg dst, int imm ) { DUMP_RI( dst, imm ); + assert(dst.file == file_REG32); assert(dst.mod == mod_REG); emit_1ub(p, 0xb8 + dst.idx); emit_1i(p, imm); } -void x86_add_reg_imm8( struct x86_function *p, struct x86_reg dst, ubyte imm ) +/** + * Immediate group 1 instructions. + */ +static INLINE void +x86_group1_imm( struct x86_function *p, + unsigned op, struct x86_reg dst, int imm ) { - DUMP_RI( dst, imm ); + assert(dst.file == file_REG32); assert(dst.mod == mod_REG); - emit_1ub(p, 0x80); - emit_modrm_noreg(p, 0, dst); - emit_1ub(p, imm); + if(-0x80 <= imm && imm < 0x80) { + emit_1ub(p, 0x83); + emit_modrm_noreg(p, op, dst); + emit_1b(p, (char)imm); + } + else { + emit_1ub(p, 0x81); + emit_modrm_noreg(p, op, dst); + emit_1i(p, imm); + } +} + +void x86_add_imm( struct x86_function *p, struct x86_reg dst, int imm ) +{ + DUMP_RI( dst, imm ); + x86_group1_imm(p, 0, dst, imm); +} + +void x86_or_imm( struct x86_function *p, struct x86_reg dst, int imm ) +{ + DUMP_RI( dst, imm ); + x86_group1_imm(p, 1, dst, imm); +} + +void x86_and_imm( struct x86_function *p, struct x86_reg dst, int imm ) +{ + DUMP_RI( dst, imm ); + x86_group1_imm(p, 4, dst, imm); +} + +void x86_sub_imm( struct x86_function *p, struct x86_reg dst, int imm ) +{ + DUMP_RI( dst, imm ); + x86_group1_imm(p, 5, dst, imm); +} + +void x86_xor_imm( struct x86_function *p, struct x86_reg dst, int imm ) +{ + DUMP_RI( dst, imm ); + x86_group1_imm(p, 6, dst, imm); +} + +void x86_cmp_imm( struct x86_function *p, struct x86_reg dst, int imm ) +{ + DUMP_RI( dst, imm ); + x86_group1_imm(p, 7, dst, imm); } @@ -629,6 +679,44 @@ void x86_and( struct x86_function *p, * SSE instructions */ +void sse_prefetchnta( struct x86_function *p, struct x86_reg ptr) +{ + DUMP_R( ptr ); + assert(ptr.mod != mod_REG); + emit_2ub(p, 0x0f, 0x18); + emit_modrm_noreg(p, 0, ptr); +} + +void sse_prefetch0( struct x86_function *p, struct x86_reg ptr) +{ + DUMP_R( ptr ); + assert(ptr.mod != mod_REG); + emit_2ub(p, 0x0f, 0x18); + emit_modrm_noreg(p, 1, ptr); +} + +void sse_prefetch1( struct x86_function *p, struct x86_reg ptr) +{ + DUMP_R( ptr ); + assert(ptr.mod != mod_REG); + emit_2ub(p, 0x0f, 0x18); + emit_modrm_noreg(p, 2, ptr); +} + +void sse_movntps( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src) +{ + DUMP_RR( dst, src ); + + assert(dst.mod != mod_REG); + assert(src.mod == mod_REG); + emit_2ub(p, 0x0f, 0x2b); + emit_modrm(p, src, dst); +} + + + void sse_movss( struct x86_function *p, struct x86_reg dst, diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h index af94577aab..1b5eaaca85 100644 --- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h +++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h @@ -152,12 +152,13 @@ void x86_jmp( struct x86_function *p, int label ); /* void x86_call( struct x86_function *p, void (*label)() ); */ void x86_call( struct x86_function *p, struct x86_reg reg); -/* michal: - * Temporary. As I need immediate operands, and dont want to mess with the codegen, - * I load the immediate into general purpose register and use it. - */ void x86_mov_reg_imm( struct x86_function *p, struct x86_reg dst, int imm ); -void x86_add_reg_imm8( struct x86_function *p, struct x86_reg dst, ubyte imm ); +void x86_add_imm( struct x86_function *p, struct x86_reg dst, int imm ); +void x86_or_imm( struct x86_function *p, struct x86_reg dst, int imm ); +void x86_and_imm( struct x86_function *p, struct x86_reg dst, int imm ); +void x86_sub_imm( struct x86_function *p, struct x86_reg dst, int imm ); +void x86_xor_imm( struct x86_function *p, struct x86_reg dst, int imm ); +void x86_cmp_imm( struct x86_function *p, struct x86_reg dst, int imm ); /* Macro for sse_shufps() and sse2_pshufd(): @@ -184,6 +185,13 @@ void sse2_pshufd( struct x86_function *p, struct x86_reg dest, struct x86_reg ar void sse2_rcpps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void sse2_rcpss( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); + +void sse_prefetchnta( struct x86_function *p, struct x86_reg ptr); +void sse_prefetch0( struct x86_function *p, struct x86_reg ptr); +void sse_prefetch1( struct x86_function *p, struct x86_reg ptr); + +void sse_movntps( struct x86_function *p, struct x86_reg dst, struct x86_reg src); + void sse_addps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void sse_addss( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void sse_cvtps2pi( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); diff --git a/src/gallium/auxiliary/tgsi/Makefile b/src/gallium/auxiliary/tgsi/Makefile index c5d2082087..d7df9490cf 100644 --- a/src/gallium/auxiliary/tgsi/Makefile +++ b/src/gallium/auxiliary/tgsi/Makefile @@ -11,8 +11,10 @@ C_SOURCES = \ tgsi_info.c \ tgsi_iterate.c \ tgsi_parse.c \ + tgsi_ppc.c \ tgsi_scan.c \ tgsi_sse2.c \ + tgsi_text.c \ tgsi_transform.c \ tgsi_util.c diff --git a/src/gallium/auxiliary/tgsi/SConscript b/src/gallium/auxiliary/tgsi/SConscript index 45bf3f6d57..8200cce42f 100644 --- a/src/gallium/auxiliary/tgsi/SConscript +++ b/src/gallium/auxiliary/tgsi/SConscript @@ -12,6 +12,7 @@ tgsi = env.ConvenienceLibrary( 'tgsi_parse.c', 'tgsi_sanity.c', 'tgsi_scan.c', + 'tgsi_ppc.c', 'tgsi_sse2.c', 'tgsi_text.c', 'tgsi_transform.c', diff --git a/src/gallium/auxiliary/tgsi/tgsi_build.c b/src/gallium/auxiliary/tgsi/tgsi_build.c index 74614d3688..38fcaf8829 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_build.c +++ b/src/gallium/auxiliary/tgsi/tgsi_build.c @@ -793,10 +793,14 @@ tgsi_default_instruction_ext_nv( void ) return instruction_ext_nv; } -union token_u32 + +/** test for inequality of 32-bit values pointed to by a and b */ +static INLINE boolean +compare32(const void *a, const void *b) { - unsigned u32; -}; + return *((uint32_t *) a) != *((uint32_t *) b); +} + unsigned tgsi_compare_instruction_ext_nv( @@ -805,7 +809,7 @@ tgsi_compare_instruction_ext_nv( { a.Padding = b.Padding = 0; a.Extended = b.Extended = 0; - return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32; + return compare32(&a, &b); } struct tgsi_instruction_ext_nv @@ -864,7 +868,7 @@ tgsi_compare_instruction_ext_label( { a.Padding = b.Padding = 0; a.Extended = b.Extended = 0; - return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32; + return compare32(&a, &b); } struct tgsi_instruction_ext_label @@ -905,7 +909,7 @@ tgsi_compare_instruction_ext_texture( { a.Padding = b.Padding = 0; a.Extended = b.Extended = 0; - return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32; + return compare32(&a, &b); } struct tgsi_instruction_ext_texture @@ -1027,7 +1031,7 @@ tgsi_compare_src_register_ext_swz( { a.Padding = b.Padding = 0; a.Extended = b.Extended = 0; - return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32; + return compare32(&a, &b); } struct tgsi_src_register_ext_swz @@ -1095,7 +1099,7 @@ tgsi_compare_src_register_ext_mod( { a.Padding = b.Padding = 0; a.Extended = b.Extended = 0; - return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32; + return compare32(&a, &b); } struct tgsi_src_register_ext_mod @@ -1241,7 +1245,7 @@ tgsi_compare_dst_register_ext_concode( { a.Padding = b.Padding = 0; a.Extended = b.Extended = 0; - return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32; + return compare32(&a, &b); } struct tgsi_dst_register_ext_concode @@ -1299,7 +1303,7 @@ tgsi_compare_dst_register_ext_modulate( { a.Padding = b.Padding = 0; a.Extended = b.Extended = 0; - return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32; + return compare32(&a, &b); } struct tgsi_dst_register_ext_modulate diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump.c b/src/gallium/auxiliary/tgsi/tgsi_dump.c index afc8ffa553..3177f54952 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_dump.c +++ b/src/gallium/auxiliary/tgsi/tgsi_dump.c @@ -68,6 +68,7 @@ dump_enum( #define CHR(C) ctx->printf( ctx, "%c", C ) #define UIX(I) ctx->printf( ctx, "0x%x", I ) #define UID(I) ctx->printf( ctx, "%u", I ) +#define INSTID(I) ctx->printf( ctx, "% 3u", I ) #define SID(I) ctx->printf( ctx, "%d", I ) #define FLT(F) ctx->printf( ctx, "%10.4f", F ) #define ENM(E,ENUMS) dump_enum( ctx, E, ENUMS, sizeof( ENUMS ) / sizeof( *ENUMS ) ) @@ -315,8 +316,8 @@ iter_instruction( uint i; boolean first_reg = TRUE; - UID( instno ); - CHR( ':' ); + INSTID( instno ); + TXT( ": " ); TXT( tgsi_get_opcode_info( inst->Instruction.Opcode )->mnemonic ); switch (inst->Instruction.Saturate) { diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c index df002939c6..1a5294eabc 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_exec.c +++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c @@ -1674,6 +1674,7 @@ exec_declaration( break; default: + eval = NULL; assert( 0 ); } diff --git a/src/gallium/auxiliary/tgsi/tgsi_parse.c b/src/gallium/auxiliary/tgsi/tgsi_parse.c index 3757486ba9..2cd56e413a 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_parse.c +++ b/src/gallium/auxiliary/tgsi/tgsi_parse.c @@ -88,16 +88,33 @@ tgsi_parse_end_of_tokens( 1 + ctx->FullHeader.Header.HeaderSize + ctx->FullHeader.Header.BodySize; } + +/** + * This function is used to avoid and work-around type punning/aliasing + * warnings. The warnings seem harmless on x86 but on PPC they cause + * real failures. + */ +static INLINE void +copy_token(void *dst, const void *src) +{ + memcpy(dst, src, 4); +} + + +/** + * Get next 4-byte token, return it at address specified by 'token' + */ static void next_token( struct tgsi_parse_context *ctx, void *token ) { assert( !tgsi_parse_end_of_tokens( ctx ) ); - - *(struct tgsi_token *) token = ctx->Tokens[ctx->Position++]; + copy_token(token, &ctx->Tokens[ctx->Position]); + ctx->Position++; } + void tgsi_parse_token( struct tgsi_parse_context *ctx ) @@ -116,7 +133,7 @@ tgsi_parse_token( struct tgsi_full_declaration *decl = &ctx->FullToken.FullDeclaration; *decl = tgsi_default_full_declaration(); - decl->Declaration = *(struct tgsi_declaration *) &token; + copy_token(&decl->Declaration, &token); next_token( ctx, &decl->DeclarationRange ); @@ -132,8 +149,7 @@ tgsi_parse_token( struct tgsi_full_immediate *imm = &ctx->FullToken.FullImmediate; *imm = tgsi_default_full_immediate(); - imm->Immediate = *(struct tgsi_immediate *) &token; - + copy_token(&imm->Immediate, &token); assert( !imm->Immediate.Extended ); switch (imm->Immediate.DataType) { @@ -158,8 +174,7 @@ tgsi_parse_token( unsigned extended; *inst = tgsi_default_full_instruction(); - inst->Instruction = *(struct tgsi_instruction *) &token; - + copy_token(&inst->Instruction, &token); extended = inst->Instruction.Extended; while( extended ) { @@ -169,18 +184,15 @@ tgsi_parse_token( switch( token.Type ) { case TGSI_INSTRUCTION_EXT_TYPE_NV: - inst->InstructionExtNv = - *(struct tgsi_instruction_ext_nv *) &token; + copy_token(&inst->InstructionExtNv, &token); break; case TGSI_INSTRUCTION_EXT_TYPE_LABEL: - inst->InstructionExtLabel = - *(struct tgsi_instruction_ext_label *) &token; + copy_token(&inst->InstructionExtLabel, &token); break; case TGSI_INSTRUCTION_EXT_TYPE_TEXTURE: - inst->InstructionExtTexture = - *(struct tgsi_instruction_ext_texture *) &token; + copy_token(&inst->InstructionExtTexture, &token); break; default: @@ -212,13 +224,13 @@ tgsi_parse_token( switch( token.Type ) { case TGSI_DST_REGISTER_EXT_TYPE_CONDCODE: - inst->FullDstRegisters[i].DstRegisterExtConcode = - *(struct tgsi_dst_register_ext_concode *) &token; + copy_token(&inst->FullDstRegisters[i].DstRegisterExtConcode, + &token); break; case TGSI_DST_REGISTER_EXT_TYPE_MODULATE: - inst->FullDstRegisters[i].DstRegisterExtModulate = - *(struct tgsi_dst_register_ext_modulate *) &token; + copy_token(&inst->FullDstRegisters[i].DstRegisterExtModulate, + &token); break; default: @@ -245,13 +257,13 @@ tgsi_parse_token( switch( token.Type ) { case TGSI_SRC_REGISTER_EXT_TYPE_SWZ: - inst->FullSrcRegisters[i].SrcRegisterExtSwz = - *(struct tgsi_src_register_ext_swz *) &token; + copy_token(&inst->FullSrcRegisters[i].SrcRegisterExtSwz, + &token); break; case TGSI_SRC_REGISTER_EXT_TYPE_MOD: - inst->FullSrcRegisters[i].SrcRegisterExtMod = - *(struct tgsi_src_register_ext_mod *) &token; + copy_token(&inst->FullSrcRegisters[i].SrcRegisterExtMod, + &token); break; default: diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.c b/src/gallium/auxiliary/tgsi/tgsi_ppc.c new file mode 100644 index 0000000000..9ad7ecd7cf --- /dev/null +++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.c @@ -0,0 +1,910 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * TGSI to PowerPC code generation. + */ + +#include "pipe/p_config.h" + +#if defined(PIPE_ARCH_PPC) + +#include "pipe/p_debug.h" +#include "pipe/p_shader_tokens.h" +#include "util/u_math.h" +#include "util/u_memory.h" +#include "util/u_sse.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_util.h" +#include "tgsi_exec.h" +#include "tgsi_ppc.h" +#include "rtasm/rtasm_ppc.h" + + +/** + * Since it's pretty much impossible to form PPC vector immediates, load + * them from memory here: + */ +const float ppc_builtin_constants[] ALIGN16_ATTRIB = { + 1.0f, -128.0f, 128.0, 0.0 +}; + + +#define FOR_EACH_CHANNEL( CHAN )\ + for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++) + +#define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\ + ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN))) + +#define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\ + if (IS_DST0_CHANNEL_ENABLED( INST, CHAN )) + +#define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\ + FOR_EACH_CHANNEL( CHAN )\ + IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN ) + +#define CHAN_X 0 +#define CHAN_Y 1 +#define CHAN_Z 2 +#define CHAN_W 3 + +#define TEMP_ONE_I TGSI_EXEC_TEMP_ONE_I +#define TEMP_ONE_C TGSI_EXEC_TEMP_ONE_C + +#define TEMP_R0 TGSI_EXEC_TEMP_R0 +#define TEMP_ADDR TGSI_EXEC_TEMP_ADDR + + +/** + * Context/state used during code gen. + */ +struct gen_context +{ + struct ppc_function *f; + int inputs_reg; /**< GP register pointing to input params */ + int outputs_reg; /**< GP register pointing to output params */ + int temps_reg; /**< GP register pointing to temporary "registers" */ + int immed_reg; /**< GP register pointing to immediates buffer */ + int const_reg; /**< GP register pointing to constants buffer */ + int builtins_reg; /**< GP register pointint to built-in constants */ + + int one_vec; /**< vector register with {1.0, 1.0, 1.0, 1.0} */ + int bit31_vec; /**< vector register with {1<<31, 1<<31, 1<<31, 1<<31} */ +}; + + +/** + * Load the given vector register with {value, value, value, value}. + * The value must be in the ppu_builtin_constants[] array. + * We wouldn't need this if there was a simple way to load PPC vector + * registers with immediate values! + */ +static void +load_constant_vec(struct gen_context *gen, int dst_vec, float value) +{ + uint pos; + for (pos = 0; pos < Elements(ppc_builtin_constants); pos++) { + if (ppc_builtin_constants[pos] == value) { + int offset_reg = ppc_allocate_register(gen->f); + int offset = pos * 4; + + ppc_li(gen->f, offset_reg, offset); + /* Load 4-byte word into vector register. + * The vector slot depends on the effective address we load from. + * We know that our builtins start at a 16-byte boundary so we + * know that 'swizzle' tells us which vector slot will have the + * loaded word. The other vector slots will be undefined. + */ + ppc_lvewx(gen->f, dst_vec, gen->builtins_reg, offset_reg); + /* splat word[pos % 4] across the vector reg */ + ppc_vspltw(gen->f, dst_vec, dst_vec, pos % 4); + ppc_release_register(gen->f, offset_reg); + return; + } + } + assert(0 && "Need to add new constant to ppc_builtin_constants array"); +} + + +/** + * Return index of vector register containing {1.0, 1.0, 1.0, 1.0}. + */ +static int +gen_one_vec(struct gen_context *gen) +{ + if (gen->one_vec < 0) { + gen->one_vec = ppc_allocate_vec_register(gen->f); + load_constant_vec(gen, gen->one_vec, 1.0f); + } + return gen->one_vec; +} + +/** + * Return index of vector register containing {1<<31, 1<<31, 1<<31, 1<<31}. + */ +static int +gen_get_bit31_vec(struct gen_context *gen) +{ + if (gen->bit31_vec < 0) { + gen->bit31_vec = ppc_allocate_vec_register(gen->f); + ppc_vspltisw(gen->f, gen->bit31_vec, -1); + ppc_vslw(gen->f, gen->bit31_vec, gen->bit31_vec, gen->bit31_vec); + } + return gen->bit31_vec; +} + + +/** + * Register fetch, put result in 'dst_vec'. + */ +static void +emit_fetch(struct gen_context *gen, + unsigned dst_vec, + const struct tgsi_full_src_register *reg, + const unsigned chan_index) +{ + uint swizzle = tgsi_util_get_full_src_register_extswizzle(reg, chan_index); + + switch (swizzle) { + case TGSI_EXTSWIZZLE_X: + case TGSI_EXTSWIZZLE_Y: + case TGSI_EXTSWIZZLE_Z: + case TGSI_EXTSWIZZLE_W: + switch (reg->SrcRegister.File) { + case TGSI_FILE_INPUT: + { + int offset_reg = ppc_allocate_register(gen->f); + int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16; + ppc_li(gen->f, offset_reg, offset); + ppc_lvx(gen->f, dst_vec, gen->inputs_reg, offset_reg); + ppc_release_register(gen->f, offset_reg); + } + break; + case TGSI_FILE_TEMPORARY: + { + int offset_reg = ppc_allocate_register(gen->f); + int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16; + ppc_li(gen->f, offset_reg, offset); + ppc_lvx(gen->f, dst_vec, gen->temps_reg, offset_reg); + ppc_release_register(gen->f, offset_reg); + } + break; + case TGSI_FILE_IMMEDIATE: + { + int offset_reg = ppc_allocate_register(gen->f); + int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16; + ppc_li(gen->f, offset_reg, offset); + ppc_lvx(gen->f, dst_vec, gen->immed_reg, offset_reg); + ppc_release_register(gen->f, offset_reg); + } + break; + case TGSI_FILE_CONSTANT: + { + int offset_reg = ppc_allocate_register(gen->f); + int offset = (reg->SrcRegister.Index * 4 + swizzle) * 4; + ppc_li(gen->f, offset_reg, offset); + /* Load 4-byte word into vector register. + * The vector slot depends on the effective address we load from. + * We know that our constants start at a 16-byte boundary so we + * know that 'swizzle' tells us which vector slot will have the + * loaded word. The other vector slots will be undefined. + */ + ppc_lvewx(gen->f, dst_vec, gen->const_reg, offset_reg); + /* splat word[swizzle] across the vector reg */ + ppc_vspltw(gen->f, dst_vec, dst_vec, swizzle); + ppc_release_register(gen->f, offset_reg); + } + break; + default: + assert( 0 ); + } + break; + case TGSI_EXTSWIZZLE_ZERO: + ppc_vzero(gen->f, dst_vec); + break; + case TGSI_EXTSWIZZLE_ONE: + { + int one_vec = gen_one_vec(gen); + ppc_vmove(gen->f, dst_vec, one_vec); + } + break; + default: + assert( 0 ); + } + + { + uint sign_op = tgsi_util_get_full_src_register_sign_mode(reg, chan_index); + if (sign_op != TGSI_UTIL_SIGN_KEEP) { + int bit31_vec = gen_get_bit31_vec(gen); + + switch (sign_op) { + case TGSI_UTIL_SIGN_CLEAR: + /* vec = vec & ~bit31 */ + ppc_vandc(gen->f, dst_vec, dst_vec, bit31_vec); + break; + case TGSI_UTIL_SIGN_SET: + /* vec = vec | bit31 */ + ppc_vor(gen->f, dst_vec, dst_vec, bit31_vec); + break; + case TGSI_UTIL_SIGN_TOGGLE: + /* vec = vec ^ bit31 */ + ppc_vxor(gen->f, dst_vec, dst_vec, bit31_vec); + break; + default: + assert(0); + } + } + } +} + +#define FETCH( GEN, INST, DST_VEC, SRC_REG, CHAN ) \ + emit_fetch( GEN, DST_VEC, &(INST).FullSrcRegisters[SRC_REG], CHAN ) + + + +/** + * Register store. Store 'src_vec' at location indicated by 'reg'. + */ +static void +emit_store(struct gen_context *gen, + unsigned src_vec, + const struct tgsi_full_dst_register *reg, + const struct tgsi_full_instruction *inst, + unsigned chan_index) +{ + switch (reg->DstRegister.File) { + case TGSI_FILE_OUTPUT: + { + int offset_reg = ppc_allocate_register(gen->f); + int offset = (reg->DstRegister.Index * 4 + chan_index) * 16; + ppc_li(gen->f, offset_reg, offset); + ppc_stvx(gen->f, src_vec, gen->outputs_reg, offset_reg); + ppc_release_register(gen->f, offset_reg); + } + break; + case TGSI_FILE_TEMPORARY: + { + int offset_reg = ppc_allocate_register(gen->f); + int offset = (reg->DstRegister.Index * 4 + chan_index) * 16; + ppc_li(gen->f, offset_reg, offset); + ppc_stvx(gen->f, src_vec, gen->temps_reg, offset_reg); + ppc_release_register(gen->f, offset_reg); + } + break; +#if 0 + case TGSI_FILE_ADDRESS: + emit_addrs( + func, + xmm, + reg->DstRegister.Index, + chan_index ); + break; +#endif + default: + assert( 0 ); + } + +#if 0 + switch( inst->Instruction.Saturate ) { + case TGSI_SAT_NONE: + break; + + case TGSI_SAT_ZERO_ONE: + /* assert( 0 ); */ + break; + + case TGSI_SAT_MINUS_PLUS_ONE: + assert( 0 ); + break; + } +#endif +} + + +#define STORE( GEN, INST, XMM, INDEX, CHAN )\ + emit_store( GEN, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN ) + + + +static void +emit_scalar_unaryop(struct gen_context *gen, struct tgsi_full_instruction *inst) +{ + int v0 = ppc_allocate_vec_register(gen->f); + int v1 = ppc_allocate_vec_register(gen->f); + uint chan_index; + + FETCH(gen, *inst, v0, 0, CHAN_X); + + switch (inst->Instruction.Opcode) { + case TGSI_OPCODE_RSQ: + /* v1 = 1.0 / sqrt(v0) */ + ppc_vrsqrtefp(gen->f, v1, v0); + break; + case TGSI_OPCODE_RCP: + /* v1 = 1.0 / v0 */ + ppc_vrefp(gen->f, v1, v0); + break; + default: + assert(0); + } + + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + STORE(gen, *inst, v1, 0, chan_index); + } + ppc_release_vec_register(gen->f, v0); + ppc_release_vec_register(gen->f, v1); +} + + +static void +emit_unaryop(struct gen_context *gen, struct tgsi_full_instruction *inst) +{ + int v0 = ppc_allocate_vec_register(gen->f); + uint chan_index; + FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) { + FETCH(gen, *inst, 0, 0, chan_index); /* v0 = srcreg[0] */ + switch (inst->Instruction.Opcode) { + case TGSI_OPCODE_ABS: + /* turn off the most significant bit of each vector float word */ + { + int v1 = ppc_allocate_vec_register(gen->f); + ppc_vspltisw(gen->f, v1, -1); /* v1 = {-1, -1, -1, -1} */ + ppc_vslw(gen->f, v1, v1, v1); /* v1 = {1<<31, 1<<31, 1<<31, 1<<31} */ + ppc_vandc(gen->f, v0, v0, v1); /* v0 = v0 & ~v1 */ + ppc_release_vec_register(gen->f, v1); + } + break; + case TGSI_OPCODE_FLOOR: + ppc_vrfim(gen->f, v0, v0); /* v0 = floor(v0) */ + break; + case TGSI_OPCODE_FRAC: + { + int v1 = ppc_allocate_vec_register(gen->f); + ppc_vrfim(gen->f, v1, v0); /* v1 = floor(v0) */ + ppc_vsubfp(gen->f, v0, v0, v1); /* v0 = v0 - v1 */ + ppc_release_vec_register(gen->f, v1); + } + break; + case TGSI_OPCODE_EXPBASE2: + ppc_vexptefp(gen->f, v0, v0); /* v0 = 2^v0 */ + break; + case TGSI_OPCODE_LOGBASE2: + /* XXX this may be broken! */ + ppc_vlogefp(gen->f, v0, v0); /* v0 = log2(v0) */ + break; + case TGSI_OPCODE_MOV: + /* nothing */ + break; + default: + assert(0); + } + STORE(gen, *inst, v0, 0, chan_index); /* store v0 */ + } + ppc_release_vec_register(gen->f, v0); +} + + +static void +emit_binop(struct gen_context *gen, struct tgsi_full_instruction *inst) +{ + int v0 = ppc_allocate_vec_register(gen->f); + int v1 = ppc_allocate_vec_register(gen->f); + int v2 = ppc_allocate_vec_register(gen->f); + uint chan_index; + FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) { + FETCH(gen, *inst, v0, 0, chan_index); /* v0 = srcreg[0] */ + FETCH(gen, *inst, v1, 1, chan_index); /* v1 = srcreg[1] */ + switch (inst->Instruction.Opcode) { + case TGSI_OPCODE_ADD: + ppc_vaddfp(gen->f, v2, v0, v1); + break; + case TGSI_OPCODE_SUB: + ppc_vsubfp(gen->f, v2, v0, v1); + break; + case TGSI_OPCODE_MUL: + ppc_vxor(gen->f, v2, v2, v2); /* v2 = {0, 0, 0, 0} */ + ppc_vmaddfp(gen->f, v2, v0, v1, v2); /* v2 = v0 * v1 + v0 */ + break; + case TGSI_OPCODE_MIN: + ppc_vminfp(gen->f, v2, v0, v1); + break; + case TGSI_OPCODE_MAX: + ppc_vmaxfp(gen->f, v2, v0, v1); + break; + default: + assert(0); + } + STORE(gen, *inst, v2, 0, chan_index); /* store v2 */ + } + ppc_release_vec_register(gen->f, v0); + ppc_release_vec_register(gen->f, v1); + ppc_release_vec_register(gen->f, v2); +} + + +/** + * Vector comparisons, resulting in 1.0 or 0.0 values. + */ +static void +emit_inequality(struct gen_context *gen, struct tgsi_full_instruction *inst) +{ + int v0 = ppc_allocate_vec_register(gen->f); + int v1 = ppc_allocate_vec_register(gen->f); + int v2 = ppc_allocate_vec_register(gen->f); + uint chan_index; + boolean complement = FALSE; + int one_vec = gen_one_vec(gen); + + FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) { + FETCH(gen, *inst, v0, 0, chan_index); /* v0 = srcreg[0] */ + FETCH(gen, *inst, v1, 1, chan_index); /* v1 = srcreg[1] */ + + switch (inst->Instruction.Opcode) { + case TGSI_OPCODE_SNE: + complement = TRUE; + /* fall-through */ + case TGSI_OPCODE_SEQ: + ppc_vcmpeqfpx(gen->f, v2, v0, v1); /* v2 = v0 == v1 ? ~0 : 0 */ + break; + + case TGSI_OPCODE_SGE: + complement = TRUE; + /* fall-through */ + case TGSI_OPCODE_SLT: + ppc_vcmpgtfpx(gen->f, v2, v1, v0); /* v2 = v1 > v0 ? ~0 : 0 */ + break; + + case TGSI_OPCODE_SLE: + complement = TRUE; + /* fall-through */ + case TGSI_OPCODE_SGT: + ppc_vcmpgtfpx(gen->f, v2, v0, v1); /* v2 = v0 > v1 ? ~0 : 0 */ + break; + default: + assert(0); + } + + /* v2 is now {0,0,0,0} or {~0,~0,~0,~0} */ + + if (complement) + ppc_vandc(gen->f, v2, one_vec, v2); /* v2 = one_vec & ~v2 */ + else + ppc_vand(gen->f, v2, one_vec, v2); /* v2 = one_vec & v2 */ + + STORE(gen, *inst, v2, 0, chan_index); /* store v2 */ + } + + ppc_release_vec_register(gen->f, v0); + ppc_release_vec_register(gen->f, v1); + ppc_release_vec_register(gen->f, v2); +} + + +static void +emit_dotprod(struct gen_context *gen, struct tgsi_full_instruction *inst) +{ + int v0 = ppc_allocate_vec_register(gen->f); + int v1 = ppc_allocate_vec_register(gen->f); + int v2 = ppc_allocate_vec_register(gen->f); + uint chan_index; + + ppc_vxor(gen->f, v2, v2, v2); /* v2 = {0, 0, 0, 0} */ + + FETCH(gen, *inst, v0, 0, CHAN_X); /* v0 = src0.XXXX */ + FETCH(gen, *inst, v1, 1, CHAN_X); /* v1 = src1.XXXX */ + ppc_vmaddfp(gen->f, v2, v0, v1, v2); /* v2 = v0 * v1 + v2 */ + + FETCH(gen, *inst, v0, 0, CHAN_Y); /* v0 = src0.YYYY */ + FETCH(gen, *inst, v1, 1, CHAN_Y); /* v1 = src1.YYYY */ + ppc_vmaddfp(gen->f, v2, v0, v1, v2); /* v2 = v0 * v1 + v2 */ + + FETCH(gen, *inst, v0, 0, CHAN_Z); /* v0 = src0.ZZZZ */ + FETCH(gen, *inst, v1, 1, CHAN_Z); /* v1 = src1.ZZZZ */ + ppc_vmaddfp(gen->f, v2, v0, v1, v2); /* v2 = v0 * v1 + v2 */ + + if (inst->Instruction.Opcode == TGSI_OPCODE_DP4) { + FETCH(gen, *inst, v0, 0, CHAN_W); /* v0 = src0.WWWW */ + FETCH(gen, *inst, v1, 1, CHAN_W); /* v1 = src1.WWWW */ + ppc_vmaddfp(gen->f, v2, v0, v1, v2); /* v2 = v0 * v1 + v2 */ + } + else if (inst->Instruction.Opcode == TGSI_OPCODE_DPH) { + FETCH(gen, *inst, v1, 1, CHAN_W); /* v1 = src1.WWWW */ + ppc_vaddfp(gen->f, v2, v2, v1); /* v2 = v2 + v1 */ + } + + FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) { + STORE(gen, *inst, v2, 0, chan_index); /* store v2 */ + } + ppc_release_vec_register(gen->f, v0); + ppc_release_vec_register(gen->f, v1); + ppc_release_vec_register(gen->f, v2); +} + + +static void +emit_triop(struct gen_context *gen, struct tgsi_full_instruction *inst) +{ + int v0 = ppc_allocate_vec_register(gen->f); + int v1 = ppc_allocate_vec_register(gen->f); + int v2 = ppc_allocate_vec_register(gen->f); + int v3 = ppc_allocate_vec_register(gen->f); + uint chan_index; + FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) { + FETCH(gen, *inst, v0, 0, chan_index); /* v0 = srcreg[0] */ + FETCH(gen, *inst, v1, 1, chan_index); /* v1 = srcreg[1] */ + FETCH(gen, *inst, v2, 2, chan_index); /* v2 = srcreg[2] */ + switch (inst->Instruction.Opcode) { + case TGSI_OPCODE_MAD: + ppc_vmaddfp(gen->f, v3, v0, v1, v2); /* v3 = v0 * v1 + v2 */ + break; + case TGSI_OPCODE_LRP: + ppc_vsubfp(gen->f, v3, v1, v2); /* v3 = v1 - v2 */ + ppc_vmaddfp(gen->f, v3, v0, v3, v2); /* v3 = v0 * v3 + v2 */ + break; + default: + assert(0); + } + STORE(gen, *inst, v3, 0, chan_index); /* store v3 */ + } + ppc_release_vec_register(gen->f, v0); + ppc_release_vec_register(gen->f, v1); + ppc_release_vec_register(gen->f, v2); + ppc_release_vec_register(gen->f, v3); +} + + + +/** Approximation for vr = pow(va, vb) */ +static void +ppc_vec_pow(struct ppc_function *f, int vr, int va, int vb) +{ + /* pow(a,b) ~= exp2(log2(a) * b) */ + int t_vec = ppc_allocate_vec_register(f); + int zero_vec = ppc_allocate_vec_register(f); + + ppc_vzero(f, zero_vec); + + ppc_vlogefp(f, t_vec, va); /* t = log2(va) */ + ppc_vmaddfp(f, t_vec, t_vec, vb, zero_vec); /* t = t * vb */ + ppc_vexptefp(f, vr, t_vec); /* vr = 2^t */ + + ppc_release_vec_register(f, t_vec); + ppc_release_vec_register(f, zero_vec); +} + + +static void +emit_lit(struct gen_context *gen, struct tgsi_full_instruction *inst) +{ + int one_vec = gen_one_vec(gen); + + /* Compute X */ + if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) { + STORE(gen, *inst, one_vec, 0, CHAN_X); + } + + /* Compute Y, Z */ + if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y) || + IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) { + int x_vec = ppc_allocate_vec_register(gen->f); + int zero_vec = ppc_allocate_vec_register(gen->f); + + FETCH(gen, *inst, x_vec, 0, CHAN_X); /* x_vec = src[0].x */ + + ppc_vzero(gen->f, zero_vec); /* zero = {0,0,0,0} */ + ppc_vmaxfp(gen->f, x_vec, x_vec, zero_vec); /* x_vec = max(x_vec, 0) */ + + if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) { + STORE(gen, *inst, x_vec, 0, CHAN_Y); /* store Y */ + } + + if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) { + int y_vec = ppc_allocate_vec_register(gen->f); + int z_vec = ppc_allocate_vec_register(gen->f); + int w_vec = ppc_allocate_vec_register(gen->f); + int pow_vec = ppc_allocate_vec_register(gen->f); + int pos_vec = ppc_allocate_vec_register(gen->f); + int p128_vec = ppc_allocate_vec_register(gen->f); + int n128_vec = ppc_allocate_vec_register(gen->f); + + FETCH(gen, *inst, y_vec, 0, CHAN_Y); /* y_vec = src[0].y */ + ppc_vmaxfp(gen->f, y_vec, y_vec, zero_vec); /* y_vec = max(y_vec, 0) */ + + FETCH(gen, *inst, w_vec, 0, CHAN_W); /* w_vec = src[0].w */ + + /* clamp Y to [-128, 128] */ + load_constant_vec(gen, p128_vec, 128.0f); + load_constant_vec(gen, n128_vec, -128.0f); + ppc_vmaxfp(gen->f, y_vec, y_vec, n128_vec); /* y = max(y, -128) */ + ppc_vminfp(gen->f, y_vec, y_vec, p128_vec); /* y = min(y, 128) */ + + /* if temp.x > 0 + * z = pow(tmp.y, tmp.w) + * else + * z = 0.0 + */ + ppc_vec_pow(gen->f, pow_vec, y_vec, w_vec); /* pow = pow(y, w) */ + ppc_vcmpgtfpx(gen->f, pos_vec, x_vec, zero_vec); /* pos = x > 0 */ + ppc_vand(gen->f, z_vec, pow_vec, pos_vec); /* z = pow & pos */ + + STORE(gen, *inst, z_vec, 0, CHAN_Z); /* store Z */ + + ppc_release_vec_register(gen->f, y_vec); + ppc_release_vec_register(gen->f, z_vec); + ppc_release_vec_register(gen->f, w_vec); + ppc_release_vec_register(gen->f, pow_vec); + ppc_release_vec_register(gen->f, pos_vec); + ppc_release_vec_register(gen->f, p128_vec); + ppc_release_vec_register(gen->f, n128_vec); + } + + ppc_release_vec_register(gen->f, x_vec); + ppc_release_vec_register(gen->f, zero_vec); + } + + /* Compute W */ + if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W)) { + STORE(gen, *inst, one_vec, 0, CHAN_W); + } +} + + +static int +emit_instruction(struct gen_context *gen, + struct tgsi_full_instruction *inst) +{ + switch (inst->Instruction.Opcode) { + case TGSI_OPCODE_MOV: + case TGSI_OPCODE_ABS: + case TGSI_OPCODE_FLOOR: + case TGSI_OPCODE_FRAC: + case TGSI_OPCODE_EXPBASE2: + case TGSI_OPCODE_LOGBASE2: + emit_unaryop(gen, inst); + break; + case TGSI_OPCODE_RSQ: + case TGSI_OPCODE_RCP: + emit_scalar_unaryop(gen, inst); + break; + case TGSI_OPCODE_ADD: + case TGSI_OPCODE_SUB: + case TGSI_OPCODE_MUL: + case TGSI_OPCODE_MIN: + case TGSI_OPCODE_MAX: + emit_binop(gen, inst); + break; + case TGSI_OPCODE_SEQ: + case TGSI_OPCODE_SNE: + case TGSI_OPCODE_SLT: + case TGSI_OPCODE_SGT: + case TGSI_OPCODE_SLE: + case TGSI_OPCODE_SGE: + emit_inequality(gen, inst); + break; + case TGSI_OPCODE_MAD: + case TGSI_OPCODE_LRP: + emit_triop(gen, inst); + break; + case TGSI_OPCODE_DP3: + case TGSI_OPCODE_DP4: + case TGSI_OPCODE_DPH: + emit_dotprod(gen, inst); + break; + case TGSI_OPCODE_LIT: + emit_lit(gen, inst); + break; + case TGSI_OPCODE_END: + /* normal end */ + return 1; + default: + return 0; + } + + + return 1; +} + +static void +emit_declaration( + struct ppc_function *func, + struct tgsi_full_declaration *decl ) +{ + if( decl->Declaration.File == TGSI_FILE_INPUT ) { +#if 0 + unsigned first, last, mask; + unsigned i, j; + + first = decl->DeclarationRange.First; + last = decl->DeclarationRange.Last; + mask = decl->Declaration.UsageMask; + + for( i = first; i <= last; i++ ) { + for( j = 0; j < NUM_CHANNELS; j++ ) { + if( mask & (1 << j) ) { + switch( decl->Declaration.Interpolate ) { + case TGSI_INTERPOLATE_CONSTANT: + emit_coef_a0( func, 0, i, j ); + emit_inputs( func, 0, i, j ); + break; + + case TGSI_INTERPOLATE_LINEAR: + emit_tempf( func, 0, 0, TGSI_SWIZZLE_X ); + emit_coef_dadx( func, 1, i, j ); + emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y ); + emit_coef_dady( func, 3, i, j ); + emit_mul( func, 0, 1 ); /* x * dadx */ + emit_coef_a0( func, 4, i, j ); + emit_mul( func, 2, 3 ); /* y * dady */ + emit_add( func, 0, 4 ); /* x * dadx + a0 */ + emit_add( func, 0, 2 ); /* x * dadx + y * dady + a0 */ + emit_inputs( func, 0, i, j ); + break; + + case TGSI_INTERPOLATE_PERSPECTIVE: + emit_tempf( func, 0, 0, TGSI_SWIZZLE_X ); + emit_coef_dadx( func, 1, i, j ); + emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y ); + emit_coef_dady( func, 3, i, j ); + emit_mul( func, 0, 1 ); /* x * dadx */ + emit_tempf( func, 4, 0, TGSI_SWIZZLE_W ); + emit_coef_a0( func, 5, i, j ); + emit_rcp( func, 4, 4 ); /* 1.0 / w */ + emit_mul( func, 2, 3 ); /* y * dady */ + emit_add( func, 0, 5 ); /* x * dadx + a0 */ + emit_add( func, 0, 2 ); /* x * dadx + y * dady + a0 */ + emit_mul( func, 0, 4 ); /* (x * dadx + y * dady + a0) / w */ + emit_inputs( func, 0, i, j ); + break; + + default: + assert( 0 ); + break; + } + } + } + } +#endif + } +} + + + +static void +emit_prologue(struct ppc_function *func) +{ + /* XXX set up stack frame */ +} + + +static void +emit_epilogue(struct ppc_function *func) +{ + ppc_return(func); + /* XXX restore prev stack frame */ +} + + + +/** + * Translate a TGSI vertex/fragment shader to PPC code. + * + * \param tokens the TGSI input shader + * \param func the output PPC code/function + * \param immediates buffer to place immediates, later passed to PPC func + * \return TRUE for success, FALSE if translation failed + */ +boolean +tgsi_emit_ppc(const struct tgsi_token *tokens, + struct ppc_function *func, + float (*immediates)[4], + boolean do_swizzles ) +{ + static int use_ppc_asm = -1; + struct tgsi_parse_context parse; + /*boolean instruction_phase = FALSE;*/ + unsigned ok = 1; + uint num_immediates = 0; + struct gen_context gen; + + if (use_ppc_asm < 0) { + /* If GALLIUM_NOPPC is set, don't use PPC codegen */ + use_ppc_asm = !debug_get_bool_option("GALLIUM_NOPPC", FALSE); + } + if (!use_ppc_asm) + return FALSE; + + util_init_math(); + + gen.f = func; + gen.inputs_reg = ppc_reserve_register(func, 3); /* first function param */ + gen.outputs_reg = ppc_reserve_register(func, 4); /* second function param */ + gen.temps_reg = ppc_reserve_register(func, 5); /* ... */ + gen.immed_reg = ppc_reserve_register(func, 6); + gen.const_reg = ppc_reserve_register(func, 7); + gen.builtins_reg = ppc_reserve_register(func, 8); + gen.one_vec = -1; + gen.bit31_vec = -1; + + emit_prologue(func); + + tgsi_parse_init( &parse, tokens ); + + while (!tgsi_parse_end_of_tokens(&parse) && ok) { + tgsi_parse_token(&parse); + + switch (parse.FullToken.Token.Type) { + case TGSI_TOKEN_TYPE_DECLARATION: + if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) { + emit_declaration(func, &parse.FullToken.FullDeclaration ); + } + break; + + case TGSI_TOKEN_TYPE_INSTRUCTION: + ok = emit_instruction(&gen, &parse.FullToken.FullInstruction); + + if (!ok) { + debug_printf("failed to translate tgsi opcode %d to PPC (%s)\n", + parse.FullToken.FullInstruction.Instruction.Opcode, + parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX ? + "vertex shader" : "fragment shader"); + } + break; + + case TGSI_TOKEN_TYPE_IMMEDIATE: + /* splat each immediate component into a float[4] vector for SoA */ + { + const uint size = parse.FullToken.FullImmediate.Immediate.Size - 1; + float *imm = (float *) immediates; + uint i; + assert(size <= 4); + assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES); + for (i = 0; i < size; i++) { + const float value = + parse.FullToken.FullImmediate.u.ImmediateFloat32[i].Float; + imm[num_immediates * 4 + 0] = + imm[num_immediates * 4 + 1] = + imm[num_immediates * 4 + 2] = + imm[num_immediates * 4 + 3] = value; + num_immediates++; + } + } + break; + + default: + ok = 0; + assert( 0 ); + } + } + + emit_epilogue(func); + + tgsi_parse_free( &parse ); + + return ok; +} + +#endif /* PIPE_ARCH_PPC */ diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.h b/src/gallium/auxiliary/tgsi/tgsi_ppc.h new file mode 100644 index 0000000000..829ec075e7 --- /dev/null +++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.h @@ -0,0 +1,51 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef TGSI_PPC_H +#define TGSI_PPC_H + +#if defined __cplusplus +extern "C" { +#endif + +struct tgsi_token; +struct ppc_function; + +extern const float ppc_builtin_constants[]; + + +boolean +tgsi_emit_ppc(const struct tgsi_token *tokens, + struct ppc_function *function, + float (*immediates)[4], + boolean do_swizzles); + +#if defined __cplusplus +} +#endif + +#endif /* TGSI_PPC_H */ diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/tgsi_sse2.c index 4681b29f52..f79170b9d6 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_sse2.c +++ b/src/gallium/auxiliary/tgsi/tgsi_sse2.c @@ -25,9 +25,14 @@ * **************************************************************************/ +#include "pipe/p_config.h" + +#if defined(PIPE_ARCH_X86) && defined(PIPE_ARCH_SSE) + #include "pipe/p_debug.h" #include "pipe/p_shader_tokens.h" #include "util/u_math.h" +#include "util/u_sse.h" #include "tgsi/tgsi_parse.h" #include "tgsi/tgsi_util.h" #include "tgsi_exec.h" @@ -35,8 +40,6 @@ #include "rtasm/rtasm_x86sse.h" -#ifdef PIPE_ARCH_X86 - /* for 1/sqrt() * * This costs about 100fps (close to 10%) in gears: @@ -480,10 +483,31 @@ emit_coef_dady( * Function call helpers. */ +/** + * NOTE: In gcc, if the destination uses the SSE intrinsics, then it must be + * defined with __attribute__((force_align_arg_pointer)), as we do not guarantee + * that the stack pointer is 16 byte aligned, as expected. + */ static void -emit_push_gp( - struct x86_function *func ) +emit_func_call_dst( + struct x86_function *func, + unsigned xmm_save, + unsigned xmm_dst, + void (PIPE_CDECL *code)() ) { + struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX ); + unsigned i, n, xmm; + unsigned xmm_mask; + + /* Bitmask of the xmm registers to save */ + xmm_mask = (1 << xmm_save) - 1; + xmm_mask &= ~(1 << xmm_dst); + + sse_movaps( + func, + get_temp( TEMP_R0, 0 ), + make_xmm( xmm_dst ) ); + x86_push( func, x86_make_reg( file_REG32, reg_AX) ); @@ -493,12 +517,49 @@ emit_push_gp( x86_push( func, x86_make_reg( file_REG32, reg_DX) ); -} + + for(i = 0, n = 0; i < 8; ++i) + if(xmm_mask & (1 << i)) + ++n; + + x86_sub_imm( + func, + x86_make_reg( file_REG32, reg_SP ), + n*16); + + for(i = 0, n = 0; i < 8; ++i) + if(xmm_mask & (1 << i)) { + sse_movups( + func, + x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ), + make_xmm( xmm ) ); + ++n; + } + + x86_lea( + func, + ecx, + get_temp( TEMP_R0, 0 ) ); + + x86_push( func, ecx ); + x86_mov_reg_imm( func, ecx, (unsigned long) code ); + x86_call( func, ecx ); + x86_pop(func, ecx ); + + for(i = 0, n = 0; i < 8; ++i) + if(xmm_mask & (1 << i)) { + sse_movups( + func, + make_xmm( xmm ), + x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ) ); + ++n; + } + + x86_add_imm( + func, + x86_make_reg( file_REG32, reg_SP ), + n*16); -static void -x86_pop_gp( - struct x86_function *func ) -{ /* Restore GP registers in a reverse order. */ x86_pop( @@ -510,39 +571,6 @@ x86_pop_gp( x86_pop( func, x86_make_reg( file_REG32, reg_AX) ); -} - -static void -emit_func_call_dst( - struct x86_function *func, - unsigned xmm_dst, - void (PIPE_CDECL *code)() ) -{ - sse_movaps( - func, - get_temp( TEMP_R0, 0 ), - make_xmm( xmm_dst ) ); - - emit_push_gp( - func ); - - { - struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX ); - - x86_lea( - func, - ecx, - get_temp( TEMP_R0, 0 ) ); - - x86_push( func, ecx ); - x86_mov_reg_imm( func, ecx, (unsigned long) code ); - x86_call( func, ecx ); - x86_pop(func, ecx ); - } - - - x86_pop_gp( - func ); sse_movaps( func, @@ -553,6 +581,7 @@ emit_func_call_dst( static void emit_func_call_dst_src( struct x86_function *func, + unsigned xmm_save, unsigned xmm_dst, unsigned xmm_src, void (PIPE_CDECL *code)() ) @@ -564,10 +593,111 @@ emit_func_call_dst_src( emit_func_call_dst( func, + xmm_save, xmm_dst, code ); } +/* + * Fast SSE2 implementation of special math functions. + */ + +#define POLY0(x, c0) _mm_set1_ps(c0) +#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0)) +#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0)) +#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0)) +#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0)) +#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0)) + +#define EXP_POLY_DEGREE 3 +#define LOG_POLY_DEGREE 5 + +/** + * See http://www.devmaster.net/forums/showthread.php?p=43580 + */ +static INLINE __m128 +exp2f4(__m128 x) +{ + __m128i ipart; + __m128 fpart, expipart, expfpart; + + x = _mm_min_ps(x, _mm_set1_ps( 129.00000f)); + x = _mm_max_ps(x, _mm_set1_ps(-126.99999f)); + + /* ipart = int(x - 0.5) */ + ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f))); + + /* fpart = x - ipart */ + fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart)); + + /* expipart = (float) (1 << ipart) */ + expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23)); + + /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */ +#if EXP_POLY_DEGREE == 5 + expfpart = POLY5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f); +#elif EXP_POLY_DEGREE == 4 + expfpart = POLY4(fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f); +#elif EXP_POLY_DEGREE == 3 + expfpart = POLY3(fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f); +#elif EXP_POLY_DEGREE == 2 + expfpart = POLY2(fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f); +#else +#error +#endif + + return _mm_mul_ps(expipart, expfpart); +} + +/** + * See http://www.devmaster.net/forums/showthread.php?p=43580 + */ +static INLINE __m128 +log2f4(__m128 x) +{ + __m128i expmask = _mm_set1_epi32(0x7f800000); + __m128i mantmask = _mm_set1_epi32(0x007fffff); + __m128 one = _mm_set1_ps(1.0f); + + __m128i i = _mm_castps_si128(x); + + /* exp = (float) exponent(x) */ + __m128 exp = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, expmask), 23), _mm_set1_epi32(127))); + + /* mant = (float) mantissa(x) */ + __m128 mant = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mantmask)), one); + + __m128 logmant; + + /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[ + * These coefficients can be generate with + * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html + */ +#if LOG_POLY_DEGREE == 6 + logmant = POLY5(mant, 3.11578814719469302614f, -3.32419399085241980044f, 2.59883907202499966007f, -1.23152682416275988241f, 0.318212422185251071475f, -0.0344359067839062357313f); +#elif LOG_POLY_DEGREE == 5 + logmant = POLY4(mant, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f); +#elif LOG_POLY_DEGREE == 4 + logmant = POLY3(mant, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f); +#elif LOG_POLY_DEGREE == 3 + logmant = POLY2(mant, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f); +#else +#error +#endif + + /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/ + logmant = _mm_mul_ps(logmant, _mm_sub_ps(mant, one)); + + return _mm_add_ps(logmant, exp); +} + +static INLINE __m128 +powf4(__m128 x, __m128 y) +{ + return exp2f4(_mm_mul_ps(log2f4(x), y)); +} + + /** * Low-level instruction translators. */ @@ -610,38 +740,35 @@ cos4f( static void emit_cos( struct x86_function *func, + unsigned xmm_save, unsigned xmm_dst ) { emit_func_call_dst( func, + xmm_save, xmm_dst, cos4f ); } static void PIPE_CDECL +#if defined(PIPE_CC_GCC) +__attribute__((force_align_arg_pointer)) +#endif ex24f( float *store ) { -#if FAST_MATH - store[0] = util_fast_exp2( store[0] ); - store[1] = util_fast_exp2( store[1] ); - store[2] = util_fast_exp2( store[2] ); - store[3] = util_fast_exp2( store[3] ); -#else - store[0] = powf( 2.0f, store[0] ); - store[1] = powf( 2.0f, store[1] ); - store[2] = powf( 2.0f, store[2] ); - store[3] = powf( 2.0f, store[3] ); -#endif + _mm_store_ps(&store[0], exp2f4( _mm_load_ps(&store[0]) )); } static void emit_ex2( struct x86_function *func, + unsigned xmm_save, unsigned xmm_dst ) { emit_func_call_dst( func, + xmm_save, xmm_dst, ex24f ); } @@ -670,10 +797,12 @@ flr4f( static void emit_flr( struct x86_function *func, + unsigned xmm_save, unsigned xmm_dst ) { emit_func_call_dst( func, + xmm_save, xmm_dst, flr4f ); } @@ -691,31 +820,35 @@ frc4f( static void emit_frc( struct x86_function *func, + unsigned xmm_save, unsigned xmm_dst ) { emit_func_call_dst( func, + xmm_save, xmm_dst, frc4f ); } static void PIPE_CDECL +#if defined(PIPE_CC_GCC) +__attribute__((force_align_arg_pointer)) +#endif lg24f( float *store ) { - store[0] = util_fast_log2( store[0] ); - store[1] = util_fast_log2( store[1] ); - store[2] = util_fast_log2( store[2] ); - store[3] = util_fast_log2( store[3] ); + _mm_store_ps(&store[0], log2f4( _mm_load_ps(&store[0]) )); } static void emit_lg2( struct x86_function *func, + unsigned xmm_save, unsigned xmm_dst ) { emit_func_call_dst( func, + xmm_save, xmm_dst, lg24f ); } @@ -757,14 +890,14 @@ emit_neg( } static void PIPE_CDECL +#if defined(PIPE_CC_GCC) +__attribute__((force_align_arg_pointer)) +#endif pow4f( float *store ) { -#if FAST_MATH - store[0] = util_fast_pow( store[0], store[4] ); - store[1] = util_fast_pow( store[1], store[5] ); - store[2] = util_fast_pow( store[2], store[6] ); - store[3] = util_fast_pow( store[3], store[7] ); +#if 1 + _mm_store_ps(&store[0], powf4( _mm_load_ps(&store[0]), _mm_load_ps(&store[4]) )); #else store[0] = powf( store[0], store[4] ); store[1] = powf( store[1], store[5] ); @@ -776,11 +909,13 @@ pow4f( static void emit_pow( struct x86_function *func, + unsigned xmm_save, unsigned xmm_dst, unsigned xmm_src ) { emit_func_call_dst_src( func, + xmm_save, xmm_dst, xmm_src, pow4f ); @@ -873,10 +1008,12 @@ sin4f( static void emit_sin (struct x86_function *func, + unsigned xmm_save, unsigned xmm_dst) { emit_func_call_dst( func, + xmm_save, xmm_dst, sin4f ); } @@ -1296,7 +1433,7 @@ emit_instruction( get_temp( TGSI_EXEC_TEMP_MINUS_128_I, TGSI_EXEC_TEMP_MINUS_128_C ) ); - emit_pow( func, 1, 2 ); + emit_pow( func, 3, 1, 2 ); FETCH( func, *inst, 0, 0, CHAN_X ); sse_xorps( func, @@ -1342,11 +1479,11 @@ emit_instruction( if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) || IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) { emit_MOV( func, 1, 0 ); - emit_flr( func, 1 ); + emit_flr( func, 2, 1 ); /* dst.x = ex2(floor(src.x)) */ if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) { emit_MOV( func, 2, 1 ); - emit_ex2( func, 2 ); + emit_ex2( func, 3, 2 ); STORE( func, *inst, 2, 0, CHAN_X ); } /* dst.y = src.x - floor(src.x) */ @@ -1358,7 +1495,7 @@ emit_instruction( } /* dst.z = ex2(src.x) */ if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) { - emit_ex2( func, 0 ); + emit_ex2( func, 3, 0 ); STORE( func, *inst, 0, 0, CHAN_Z ); } } @@ -1376,21 +1513,21 @@ emit_instruction( FETCH( func, *inst, 0, 0, CHAN_X ); emit_abs( func, 0 ); emit_MOV( func, 1, 0 ); - emit_lg2( func, 1 ); + emit_lg2( func, 2, 1 ); /* dst.z = lg2(abs(src.x)) */ if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) { STORE( func, *inst, 1, 0, CHAN_Z ); } if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) || IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) { - emit_flr( func, 1 ); + emit_flr( func, 2, 1 ); /* dst.x = floor(lg2(abs(src.x))) */ if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) { STORE( func, *inst, 1, 0, CHAN_X ); } /* dst.x = abs(src)/ex2(floor(lg2(abs(src.x)))) */ if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) { - emit_ex2( func, 1 ); + emit_ex2( func, 2, 1 ); emit_rcp( func, 1, 1 ); emit_mul( func, 0, 1 ); STORE( func, *inst, 0, 0, CHAN_Y ); @@ -1580,7 +1717,7 @@ emit_instruction( /* TGSI_OPCODE_FRC */ FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { FETCH( func, *inst, 0, 0, chan_index ); - emit_frc( func, 0 ); + emit_frc( func, 0, 0 ); STORE( func, *inst, 0, 0, chan_index ); } break; @@ -1593,7 +1730,7 @@ emit_instruction( /* TGSI_OPCODE_FLR */ FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { FETCH( func, *inst, 0, 0, chan_index ); - emit_flr( func, 0 ); + emit_flr( func, 0, 0 ); STORE( func, *inst, 0, 0, chan_index ); } break; @@ -1605,7 +1742,7 @@ emit_instruction( case TGSI_OPCODE_EXPBASE2: /* TGSI_OPCODE_EX2 */ FETCH( func, *inst, 0, 0, CHAN_X ); - emit_ex2( func, 0 ); + emit_ex2( func, 0, 0 ); FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { STORE( func, *inst, 0, 0, chan_index ); } @@ -1614,7 +1751,7 @@ emit_instruction( case TGSI_OPCODE_LOGBASE2: /* TGSI_OPCODE_LG2 */ FETCH( func, *inst, 0, 0, CHAN_X ); - emit_lg2( func, 0 ); + emit_lg2( func, 0, 0 ); FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { STORE( func, *inst, 0, 0, chan_index ); } @@ -1624,7 +1761,7 @@ emit_instruction( /* TGSI_OPCODE_POW */ FETCH( func, *inst, 0, 0, CHAN_X ); FETCH( func, *inst, 1, 1, CHAN_X ); - emit_pow( func, 0, 1 ); + emit_pow( func, 0, 0, 1 ); FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { STORE( func, *inst, 0, 0, chan_index ); } @@ -1715,7 +1852,7 @@ emit_instruction( case TGSI_OPCODE_COS: FETCH( func, *inst, 0, 0, CHAN_X ); - emit_cos( func, 0 ); + emit_cos( func, 0, 0 ); FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { STORE( func, *inst, 0, 0, chan_index ); } @@ -1774,7 +1911,7 @@ emit_instruction( case TGSI_OPCODE_SIN: FETCH( func, *inst, 0, 0, CHAN_X ); - emit_sin( func, 0 ); + emit_sin( func, 0, 0 ); FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { STORE( func, *inst, 0, 0, chan_index ); } @@ -1868,12 +2005,12 @@ emit_instruction( case TGSI_OPCODE_SCS: IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) { FETCH( func, *inst, 0, 0, CHAN_X ); - emit_cos( func, 0 ); + emit_cos( func, 0, 0 ); STORE( func, *inst, 0, 0, CHAN_X ); } IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) { FETCH( func, *inst, 0, 0, CHAN_X ); - emit_sin( func, 0 ); + emit_sin( func, 0, 0 ); STORE( func, *inst, 0, 0, CHAN_Y ); } IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) { diff --git a/src/gallium/auxiliary/util/Makefile b/src/gallium/auxiliary/util/Makefile index d3951e4e7d..b3d1045a8f 100644 --- a/src/gallium/auxiliary/util/Makefile +++ b/src/gallium/auxiliary/util/Makefile @@ -10,6 +10,7 @@ C_SOURCES = \ u_gen_mipmap.c \ u_handle_table.c \ u_hash_table.c \ + u_keymap.c \ u_math.c \ u_mm.c \ u_rect.c \ diff --git a/src/gallium/auxiliary/util/SConscript b/src/gallium/auxiliary/util/SConscript index e65c17b1cc..8a04955a16 100644 --- a/src/gallium/auxiliary/util/SConscript +++ b/src/gallium/auxiliary/util/SConscript @@ -11,13 +11,14 @@ util = env.ConvenienceLibrary( 'u_gen_mipmap.c', 'u_handle_table.c', 'u_hash_table.c', + 'u_keymap.c', 'u_math.c', 'u_mm.c', 'u_rect.c', 'u_simple_shaders.c', 'u_snprintf.c', - 'u_stream_stdc.c', - 'u_stream_wd.c', + 'u_stream_stdc.c', + 'u_stream_wd.c', 'u_tile.c', 'u_time.c', ]) diff --git a/src/gallium/auxiliary/util/p_debug.c b/src/gallium/auxiliary/util/p_debug.c index b6cff281e6..3ed8bdfdf3 100644 --- a/src/gallium/auxiliary/util/p_debug.c +++ b/src/gallium/auxiliary/util/p_debug.c @@ -306,6 +306,13 @@ debug_get_flags_option(const char *name, str = _debug_get_option(name); if(!str) result = dfault; + else if (!util_strcmp(str, "help")) { + result = dfault; + while (flags->name) { + debug_printf("%s: help for %s: %s [0x%lx]\n", __FUNCTION__, name, flags->name, flags->value); + flags++; + } + } else { result = 0; while( flags->name ) { @@ -315,7 +322,12 @@ debug_get_flags_option(const char *name, } } - debug_printf("%s: %s = 0x%lx\n", __FUNCTION__, name, result); + if (str) { + debug_printf("%s: %s = 0x%lx (%s)\n", __FUNCTION__, name, result, str); + } + else { + debug_printf("%s: %s = 0x%lx\n", __FUNCTION__, name, result); + } return result; } diff --git a/src/gallium/auxiliary/util/p_debug_mem.c b/src/gallium/auxiliary/util/p_debug_mem.c index ed18c6540e..9511479cbb 100644 --- a/src/gallium/auxiliary/util/p_debug_mem.c +++ b/src/gallium/auxiliary/util/p_debug_mem.c @@ -122,8 +122,12 @@ debug_malloc(const char *file, unsigned line, const char *function, struct debug_memory_footer *ftr; hdr = real_malloc(sizeof(*hdr) + size + sizeof(*ftr)); - if(!hdr) + if(!hdr) { + debug_printf("%s:%u:%s: out of memory when trying to allocate %lu bytes\n", + file, line, function, + (long unsigned)size); return NULL; + } hdr->no = last_no++; hdr->file = file; @@ -219,8 +223,12 @@ debug_realloc(const char *file, unsigned line, const char *function, /* alloc new */ new_hdr = real_malloc(sizeof(*new_hdr) + new_size + sizeof(*new_ftr)); - if(!new_hdr) + if(!new_hdr) { + debug_printf("%s:%u:%s: out of memory when trying to allocate %lu bytes\n", + file, line, function, + (long unsigned)new_size); return NULL; + } new_hdr->no = old_hdr->no; new_hdr->file = old_hdr->file; new_hdr->line = old_hdr->line; @@ -261,8 +269,19 @@ debug_memory_end(unsigned long start_no) for (; entry != &list; entry = entry->prev) { struct debug_memory_header *hdr; void *ptr; + struct debug_memory_footer *ftr; + hdr = LIST_ENTRY(struct debug_memory_header, entry, head); ptr = data_from_header(hdr); + ftr = footer_from_header(hdr); + + if(hdr->magic != DEBUG_MEMORY_MAGIC) { + debug_printf("%s:%u:%s: bad or corrupted memory %p\n", + hdr->file, hdr->line, hdr->function, + ptr); + debug_assert(0); + } + if((start_no <= hdr->no && hdr->no < last_no) || (last_no < start_no && (hdr->no < last_no || start_no <= hdr->no))) { debug_printf("%s:%u:%s: %u bytes at %p not freed\n", @@ -270,7 +289,15 @@ debug_memory_end(unsigned long start_no) hdr->size, ptr); total_size += hdr->size; } + + if(ftr->magic != DEBUG_MEMORY_MAGIC) { + debug_printf("%s:%u:%s: buffer overflow %p\n", + hdr->file, hdr->line, hdr->function, + ptr); + debug_assert(0); + } } + if(total_size) { debug_printf("Total of %u KB of system memory apparently leaked\n", (total_size + 1023)/1024); diff --git a/src/gallium/auxiliary/util/u_blit.c b/src/gallium/auxiliary/util/u_blit.c index 9adf72944e..d28201ac8d 100644 --- a/src/gallium/auxiliary/util/u_blit.c +++ b/src/gallium/auxiliary/util/u_blit.c @@ -104,6 +104,7 @@ util_create_blit(struct pipe_context *pipe, struct cso_context *cso) ctx->rasterizer.cull_mode = PIPE_WINDING_NONE; ctx->rasterizer.bypass_clipping = 1; /*ctx->rasterizer.bypass_vs = 1;*/ + ctx->rasterizer.gl_rasterization_rules = 1; /* samplers */ memset(&ctx->sampler, 0, sizeof(ctx->sampler)); diff --git a/src/gallium/auxiliary/util/u_gen_mipmap.c b/src/gallium/auxiliary/util/u_gen_mipmap.c index b19a649bbc..9d305ad763 100644 --- a/src/gallium/auxiliary/util/u_gen_mipmap.c +++ b/src/gallium/auxiliary/util/u_gen_mipmap.c @@ -725,6 +725,7 @@ util_create_gen_mipmap(struct pipe_context *pipe, ctx->rasterizer.cull_mode = PIPE_WINDING_NONE; ctx->rasterizer.bypass_clipping = 1; /*ctx->rasterizer.bypass_vs = 1;*/ + ctx->rasterizer.gl_rasterization_rules = 1; /* sampler state */ memset(&ctx->sampler, 0, sizeof(ctx->sampler)); diff --git a/src/gallium/auxiliary/util/u_keymap.c b/src/gallium/auxiliary/util/u_keymap.c new file mode 100644 index 0000000000..01b17ddb1b --- /dev/null +++ b/src/gallium/auxiliary/util/u_keymap.c @@ -0,0 +1,309 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * Key lookup/associative container. + * + * Like Jose's u_hash_table, based on CSO cache code for now. + * + * Author: Brian Paul + */ + + +#include "pipe/p_compiler.h" +#include "pipe/p_debug.h" +#include "pipe/p_error.h" + +#include "cso_cache/cso_hash.h" + +#include "util/u_memory.h" +#include "util/u_keymap.h" + + +struct keymap +{ + struct cso_hash *cso; + unsigned key_size; + unsigned max_entries; /* XXX not obeyed net */ + unsigned num_entries; + keymap_delete_func delete_func; +}; + + +struct keymap_item +{ + void *key, *value; +}; + + +/** + * This the default key-delete function used when the client doesn't + * provide one. + */ +static void +default_delete_func(const struct keymap *map, + const void *key, void *data, void *user) +{ + FREE((void*) data); +} + + +static INLINE struct keymap_item * +hash_table_item(struct cso_hash_iter iter) +{ + return (struct keymap_item *) cso_hash_iter_data(iter); +} + + +/** + * Return 4-byte hash key for a block of bytes. + */ +static unsigned +hash(const void *key, unsigned keySize) +{ + unsigned i, hash; + + keySize /= 4; /* convert from bytes to uints */ + + hash = 0; + for (i = 0; i < keySize; i++) { + hash ^= (i + 1) * ((const unsigned *) key)[i]; + } + + /*hash = hash ^ (hash >> 11) ^ (hash >> 22);*/ + + return hash; +} + + +/** + * Create a new map. + * \param keySize size of the keys in bytes + * \param maxEntries max number of entries to allow (~0 = infinity) + * \param deleteFunc optional callback to call when entries + * are deleted/replaced + */ +struct keymap * +util_new_keymap(unsigned keySize, unsigned maxEntries, + keymap_delete_func deleteFunc) +{ + struct keymap *map = MALLOC_STRUCT(keymap); + if (!map) + return NULL; + + map->cso = cso_hash_create(); + if (!map->cso) { + FREE(map); + return NULL; + } + + map->max_entries = maxEntries; + map->num_entries = 0; + map->key_size = keySize; + map->delete_func = deleteFunc ? deleteFunc : default_delete_func; + + return map; +} + + +/** + * Delete/free a keymap and all entries. The deleteFunc that was given at + * create time will be called for each entry. + * \param user user-provided pointer passed through to the delete callback + */ +void +util_delete_keymap(struct keymap *map, void *user) +{ + util_keymap_remove_all(map, user); + cso_hash_delete(map->cso); + FREE(map); +} + + +static INLINE struct cso_hash_iter +hash_table_find_iter(const struct keymap *map, const void *key, + unsigned key_hash) +{ + struct cso_hash_iter iter; + struct keymap_item *item; + + iter = cso_hash_find(map->cso, key_hash); + while (!cso_hash_iter_is_null(iter)) { + item = (struct keymap_item *) cso_hash_iter_data(iter); + if (!memcmp(item->key, key, map->key_size)) + break; + iter = cso_hash_iter_next(iter); + } + + return iter; +} + + +static INLINE struct keymap_item * +hash_table_find_item(const struct keymap *map, const void *key, + unsigned key_hash) +{ + struct cso_hash_iter iter = hash_table_find_iter(map, key, key_hash); + if (cso_hash_iter_is_null(iter)) { + return NULL; + } + else { + return hash_table_item(iter); + } +} + + +/** + * Insert a new key + data pointer into the table. + * Note: we create a copy of the key, but not the data! + * If the key is already present in the table, replace the existing + * entry (calling the delete callback on the previous entry). + * If the maximum capacity of the map is reached an old entry + * will be deleted (the delete callback will be called). + */ +boolean +util_keymap_insert(struct keymap *map, const void *key, + const void *data, void *user) +{ + unsigned key_hash; + struct keymap_item *item; + struct cso_hash_iter iter; + + assert(map); + + key_hash = hash(key, map->key_size); + + item = hash_table_find_item(map, key, key_hash); + if (item) { + /* call delete callback for old entry/item */ + map->delete_func(map, item->key, item->value, user); + item->value = (void *) data; + return TRUE; + } + + item = MALLOC_STRUCT(keymap_item); + if (!item) + return FALSE; + + item->key = mem_dup(key, map->key_size); + item->value = (void *) data; + + iter = cso_hash_insert(map->cso, key_hash, item); + if (cso_hash_iter_is_null(iter)) { + FREE(item); + return FALSE; + } + + map->num_entries++; + + return TRUE; +} + + +/** + * Look up a key in the map and return the associated data pointer. + */ +const void * +util_keymap_lookup(const struct keymap *map, const void *key) +{ + unsigned key_hash; + struct keymap_item *item; + + assert(map); + + key_hash = hash(key, map->key_size); + + item = hash_table_find_item(map, key, key_hash); + if (!item) + return NULL; + + return item->value; +} + + +/** + * Remove an entry from the map. + * The delete callback will be called if the given key/entry is found. + * \param user passed to the delete callback as the last param. + */ +void +util_keymap_remove(struct keymap *map, const void *key, void *user) +{ + unsigned key_hash; + struct cso_hash_iter iter; + struct keymap_item *item; + + assert(map); + + key_hash = hash(key, map->key_size); + + iter = hash_table_find_iter(map, key, key_hash); + if (cso_hash_iter_is_null(iter)) + return; + + item = hash_table_item(iter); + assert(item); + map->delete_func(map, item->key, item->value, user); + FREE(item->key); + FREE(item); + + map->num_entries--; + + cso_hash_erase(map->cso, iter); +} + + +/** + * Remove all entries from the map, calling the delete callback for each. + * \param user passed to the delete callback as the last param. + */ +void +util_keymap_remove_all(struct keymap *map, void *user) +{ + struct cso_hash_iter iter; + struct keymap_item *item; + + assert(map); + + iter = cso_hash_first_node(map->cso); + while (!cso_hash_iter_is_null(iter)) { + item = (struct keymap_item *) + cso_hash_take(map->cso, cso_hash_iter_key(iter)); + map->delete_func(map, item->key, item->value, user); + FREE(item->key); + FREE(item); + iter = cso_hash_first_node(map->cso); + } +} + + +extern void +util_keymap_info(const struct keymap *map) +{ + debug_printf("Keymap %p: %u of max %u entries\n", + (void *) map, map->num_entries, map->max_entries); +} diff --git a/src/gallium/auxiliary/util/u_keymap.h b/src/gallium/auxiliary/util/u_keymap.h new file mode 100644 index 0000000000..8d60a76fc3 --- /dev/null +++ b/src/gallium/auxiliary/util/u_keymap.h @@ -0,0 +1,68 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef U_KEYMAP_H +#define U_KEYMAP_H + +#include "pipe/p_compiler.h" + + +/** opaque keymap type */ +struct keymap; + + +/** Delete/callback function type */ +typedef void (*keymap_delete_func)(const struct keymap *map, + const void *key, void *data, + void *user); + + +extern struct keymap * +util_new_keymap(unsigned keySize, unsigned maxEntries, + keymap_delete_func deleteFunc); + +extern void +util_delete_keymap(struct keymap *map, void *user); + +extern boolean +util_keymap_insert(struct keymap *map, const void *key, + const void *data, void *user); + +extern const void * +util_keymap_lookup(const struct keymap *map, const void *key); + +extern void +util_keymap_remove(struct keymap *map, const void *key, void *user); + +extern void +util_keymap_remove_all(struct keymap *map, void *user); + +extern void +util_keymap_info(const struct keymap *map); + + +#endif /* U_KEYMAP_H */ diff --git a/src/gallium/auxiliary/util/u_math.c b/src/gallium/auxiliary/util/u_math.c index 0729114d6a..5b3cab4642 100644 --- a/src/gallium/auxiliary/util/u_math.c +++ b/src/gallium/auxiliary/util/u_math.c @@ -30,7 +30,7 @@ #include "util/u_math.h" - +/** 2^x, for x in [-1.0, 1.0[ */ float pow2_table[POW2_TABLE_SIZE]; @@ -38,9 +38,21 @@ static void init_pow2_table(void) { int i; - for (i = 0; i < POW2_TABLE_SIZE; i++) { - pow2_table[i] = (float) pow(2.0, i / POW2_TABLE_SCALE); - } + for (i = 0; i < POW2_TABLE_SIZE; i++) + pow2_table[i] = (float) pow(2.0, (i - POW2_TABLE_OFFSET) / POW2_TABLE_SCALE); +} + + +/** log2(x), for x in [1.0, 2.0[ */ +float log2_table[LOG2_TABLE_SIZE]; + + +static void +init_log2_table(void) +{ + unsigned i; + for (i = 0; i < LOG2_TABLE_SIZE; i++) + log2_table[i] = (float) log2(1.0 + i * (1.0 / LOG2_TABLE_SIZE)); } @@ -53,6 +65,7 @@ util_init_math(void) static boolean initialized = FALSE; if (!initialized) { init_pow2_table(); + init_log2_table(); initialized = TRUE; } } diff --git a/src/gallium/auxiliary/util/u_math.h b/src/gallium/auxiliary/util/u_math.h index 0b10622ee7..be7303e550 100644 --- a/src/gallium/auxiliary/util/u_math.h +++ b/src/gallium/auxiliary/util/u_math.h @@ -40,6 +40,7 @@ #include "pipe/p_compiler.h" +#include "pipe/p_debug.h" #ifdef __cplusplus @@ -173,8 +174,10 @@ static INLINE float logf( float f ) -#define POW2_TABLE_SIZE 256 -#define POW2_TABLE_SCALE ((float) (POW2_TABLE_SIZE-1)) +#define POW2_TABLE_SIZE_LOG2 9 +#define POW2_TABLE_SIZE (1 << POW2_TABLE_SIZE_LOG2) +#define POW2_TABLE_OFFSET (POW2_TABLE_SIZE/2) +#define POW2_TABLE_SCALE ((float)(POW2_TABLE_SIZE/2)) extern float pow2_table[POW2_TABLE_SIZE]; @@ -185,98 +188,78 @@ util_init_math(void); union fi { float f; - int i; - unsigned ui; + int32_t i; + uint32_t ui; }; /** - * Fast approximation to exp(x). - * Compute with base 2 exponents: exp(x) = exp2(log2(e) * x) - * Note: log2(e) is a constant, k = 1.44269 - * So, exp(x) = exp2(k * x); + * Fast version of 2^x * Identity: exp2(a + b) = exp2(a) * exp2(b) - * Let ipart = int(k*x) - * Let fpart = k*x - ipart; - * So, exp2(k*x) = exp2(ipart) * exp2(fpart) + * Let ipart = int(x) + * Let fpart = x - ipart; + * So, exp2(x) = exp2(ipart) * exp2(fpart) * Compute exp2(ipart) with i << ipart * Compute exp2(fpart) with lookup table. */ static INLINE float -util_fast_exp(float x) +util_fast_exp2(float x) { - if (x >= 0.0f) { - float k = 1.44269f; /* = log2(e) */ - float kx = k * x; - int ipart = (int) kx; - float fpart = kx - (float) ipart; - float y = (float) (1 << ipart) - * pow2_table[(int) (fpart * POW2_TABLE_SCALE)]; - return y; - } - else { - /* exp(-x) = 1.0 / exp(x) */ - float k = -1.44269f; - float kx = k * x; - int ipart = (int) kx; - float fpart = kx - (float) ipart; - float y = (float) (1 << ipart) - * pow2_table[(int) (fpart * POW2_TABLE_SCALE)]; - return 1.0f / y; - } + int32_t ipart; + float fpart, mpart; + union fi epart; + + if(x > 129.00000f) + return 3.402823466e+38f; + + if(x < -126.99999f) + return 0.0f; + + ipart = (int32_t) x; + fpart = x - (float) ipart; + + /* same as + * epart.f = (float) (1 << ipart) + * but faster and without integer overflow for ipart > 31 */ + epart.i = (ipart + 127 ) << 23; + + mpart = pow2_table[POW2_TABLE_OFFSET + (int)(fpart * POW2_TABLE_SCALE)]; + + return epart.f * mpart; } /** - * Fast version of 2^x - * XXX the above function could be implemented in terms of this one. + * Fast approximation to exp(x). */ static INLINE float -util_fast_exp2(float x) +util_fast_exp(float x) { - if (x >= 0.0f) { - int ipart = (int) x; - float fpart = x - (float) ipart; - float y = (float) (1 << ipart) - * pow2_table[(int) (fpart * POW2_TABLE_SCALE)]; - return y; - } - else { - /* exp(-x) = 1.0 / exp(x) */ - int ipart = (int) -x; - float fpart = -x - (float) ipart; - float y = (float) (1 << ipart) - * pow2_table[(int) (fpart * POW2_TABLE_SCALE)]; - return 1.0f / y; - } + const float k = 1.44269f; /* = log2(e) */ + return util_fast_exp2(k * x); } -/** - * Based on code from http://www.flipcode.com/totd/ - */ +#define LOG2_TABLE_SIZE_LOG2 8 +#define LOG2_TABLE_SIZE (1 << LOG2_TABLE_SIZE_LOG2) +extern float log2_table[LOG2_TABLE_SIZE]; + + static INLINE float -util_fast_log2(float val) +util_fast_log2(float x) { union fi num; - int log_2; - num.f = val; - log_2 = ((num.i >> 23) & 255) - 128; - num.i &= ~(255 << 23); - num.i += 127 << 23; - num.f = ((-1.0f/3) * num.f + 2) * num.f - 2.0f/3; - return num.f + log_2; + float epart, mpart; + num.f = x; + epart = (float)(((num.i & 0x7f800000) >> 23) - 127); + mpart = log2_table[(num.i & 0x007fffff) >> (23 - LOG2_TABLE_SIZE_LOG2)]; + return epart + mpart; } static INLINE float util_fast_pow(float x, float y) { - /* XXX these tests may need adjustment */ - if (y >= 3.0f && (-0.02f <= x && x <= 0.02f)) - return 0.0f; - if (y >= 50.0f && (-0.9f <= x && x <= 0.9f)) - return 0.0f; return util_fast_exp2(util_fast_log2(x) * y); } diff --git a/src/gallium/auxiliary/util/u_sse.h b/src/gallium/auxiliary/util/u_sse.h new file mode 100644 index 0000000000..e2a8491e62 --- /dev/null +++ b/src/gallium/auxiliary/util/u_sse.h @@ -0,0 +1,77 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * @file + * SSE intrinsics portability header. + * + * Although the SSE intrinsics are support by all modern x86 and x86-64 + * compilers, there are some intrisincs missing in some implementations + * (especially older MSVC versions). This header abstracts that away. + */ + +#ifndef U_SSE_H_ +#define U_SSE_H_ + +#include "pipe/p_config.h" + +#if defined(PIPE_ARCH_SSE) + +#include <xmmintrin.h> +#include <emmintrin.h> + + +/* MSVC before VC8 does not support the _mm_castxxx_yyy */ +#if defined(_MSC_VER) && _MSC_VER < 1500 + +union __declspec(align(16)) m128_types { + __m128 m128; + __m128i m128i; + __m128d m128d; +}; + +static __inline __m128 +_mm_castsi128_ps(__m128i a) +{ + union m128_types u; + u.m128i = a; + return u.m128; +} + +static __inline __m128i +_mm_castps_si128(__m128 a) +{ + union m128_types u; + u.m128 = a; + return u.m128i; +} + +#endif /* defined(_MSC_VER) && _MSC_VER < 1500 */ + +#endif /* PIPE_ARCH_X86 || PIPE_ARCH_X86_64 */ + +#endif /* U_SSE_H_ */ |