diff options
author | Christoph Bumiller <e0425955@student.tuwien.ac.at> | 2010-09-02 18:31:49 +0200 |
---|---|---|
committer | Christoph Bumiller <e0425955@student.tuwien.ac.at> | 2010-09-02 18:31:49 +0200 |
commit | 222d2f2ac2c7d93cbc0643082c78278ad2c8cfce (patch) | |
tree | b79152c238022b2a901201c22e5809ac520732bf /src/gallium/auxiliary | |
parent | 443abc80db9e1a288ce770e76cccd43664348098 (diff) | |
parent | e73c5501b2fe20290d1b691c85a5d82ac3a0431c (diff) |
Merge remote branch 'origin/master' into nv50-compiler
Conflicts:
src/gallium/drivers/nv50/nv50_program.c
Diffstat (limited to 'src/gallium/auxiliary')
98 files changed, 4680 insertions, 2838 deletions
diff --git a/src/gallium/auxiliary/Makefile b/src/gallium/auxiliary/Makefile index 9544e90a96..eb86d83d2a 100644 --- a/src/gallium/auxiliary/Makefile +++ b/src/gallium/auxiliary/Makefile @@ -4,8 +4,8 @@ include $(TOP)/configs/current LIBNAME = gallium C_SOURCES = \ - cso_cache/cso_context.c \ cso_cache/cso_cache.c \ + cso_cache/cso_context.c \ cso_cache/cso_hash.c \ draw/draw_context.c \ draw/draw_gs.c \ @@ -26,7 +26,6 @@ C_SOURCES = \ draw/draw_pipe_wide_line.c \ draw/draw_pipe_wide_point.c \ draw/draw_pt.c \ - draw/draw_pt_elts.c \ draw/draw_pt_emit.c \ draw/draw_pt_fetch.c \ draw/draw_pt_fetch_emit.c \ @@ -35,24 +34,24 @@ C_SOURCES = \ draw/draw_pt_post_vs.c \ draw/draw_pt_so_emit.c \ draw/draw_pt_util.c \ - draw/draw_pt_varray.c \ - draw/draw_pt_vcache.c \ + draw/draw_pt_vsplit.c \ draw/draw_vertex.c \ draw/draw_vs.c \ - draw/draw_vs_varient.c \ draw/draw_vs_aos.c \ draw/draw_vs_aos_io.c \ draw/draw_vs_aos_machine.c \ draw/draw_vs_exec.c \ draw/draw_vs_ppc.c \ draw/draw_vs_sse.c \ + draw/draw_vs_varient.c \ indices/u_indices_gen.c \ indices/u_unfilled_gen.c \ os/os_misc.c \ + os/os_stream.c \ os/os_stream_log.c \ + os/os_stream_null.c \ os/os_stream_stdc.c \ os/os_stream_str.c \ - os/os_stream_null.c \ os/os_time.c \ pipebuffer/pb_buffer_fenced.c \ pipebuffer/pb_buffer_malloc.c \ @@ -65,17 +64,16 @@ C_SOURCES = \ pipebuffer/pb_bufmgr_slab.c \ pipebuffer/pb_validate.c \ rbug/rbug_connection.c \ + rbug/rbug_context.c \ rbug/rbug_core.c \ + rbug/rbug_demarshal.c \ rbug/rbug_texture.c \ - rbug/rbug_context.c \ rbug/rbug_shader.c \ - rbug/rbug_demarshal.c \ rtasm/rtasm_cpu.c \ rtasm/rtasm_execmem.c \ - rtasm/rtasm_x86sse.c \ rtasm/rtasm_ppc.c \ rtasm/rtasm_ppc_spe.c \ - tgsi/tgsi_sanity.c \ + rtasm/rtasm_x86sse.c \ tgsi/tgsi_build.c \ tgsi/tgsi_dump.c \ tgsi/tgsi_exec.c \ @@ -83,19 +81,22 @@ C_SOURCES = \ tgsi/tgsi_iterate.c \ tgsi/tgsi_parse.c \ tgsi/tgsi_ppc.c \ + tgsi/tgsi_sanity.c \ tgsi/tgsi_scan.c \ tgsi/tgsi_sse2.c \ tgsi/tgsi_text.c \ tgsi/tgsi_transform.c \ tgsi/tgsi_ureg.c \ tgsi/tgsi_util.c \ - translate/translate_generic.c \ - translate/translate_sse.c \ translate/translate.c \ translate/translate_cache.c \ + translate/translate_generic.c \ + translate/translate_sse.c \ util/u_debug.c \ - util/u_debug_symbol.c \ + util/u_debug_describe.c \ + util/u_debug_refcnt.c \ util/u_debug_stack.c \ + util/u_debug_symbol.c \ util/u_dump_defines.c \ util/u_dump_state.c \ util/u_bitmask.c \ @@ -118,10 +119,11 @@ C_SOURCES = \ util/u_gen_mipmap.c \ util/u_half.c \ util/u_handle_table.c \ - util/u_hash_table.c \ util/u_hash.c \ + util/u_hash_table.c \ util/u_keymap.c \ util/u_linear.c \ + util/u_linkage.c \ util/u_network.c \ util/u_math.c \ util/u_mempool.c \ @@ -172,10 +174,10 @@ GALLIVM_SOURCES = \ gallivm/lp_bld_tgsi_soa.c \ gallivm/lp_bld_type.c \ draw/draw_llvm.c \ - draw/draw_vs_llvm.c \ - draw/draw_pt_fetch_shade_pipeline_llvm.c \ + draw/draw_llvm_sample.c \ draw/draw_llvm_translate.c \ - draw/draw_llvm_sample.c + draw/draw_vs_llvm.c \ + draw/draw_pt_fetch_shade_pipeline_llvm.c GALLIVM_CPP_SOURCES = \ gallivm/lp_bld_misc.cpp diff --git a/src/gallium/auxiliary/SConscript b/src/gallium/auxiliary/SConscript index 3124e20ce8..6210ada990 100644 --- a/src/gallium/auxiliary/SConscript +++ b/src/gallium/auxiliary/SConscript @@ -50,10 +50,11 @@ env.Depends('util/u_format_table.c', [ ]) source = [ - 'cso_cache/cso_context.c', 'cso_cache/cso_cache.c', + 'cso_cache/cso_context.c', 'cso_cache/cso_hash.c', 'draw/draw_context.c', + 'draw/draw_gs.c', 'draw/draw_pipe.c', 'draw/draw_pipe_aaline.c', 'draw/draw_pipe_aapoint.c', @@ -71,7 +72,6 @@ source = [ 'draw/draw_pipe_wide_line.c', 'draw/draw_pipe_wide_point.c', 'draw/draw_pt.c', - 'draw/draw_pt_elts.c', 'draw/draw_pt_emit.c', 'draw/draw_pt_fetch.c', 'draw/draw_pt_fetch_emit.c', @@ -80,8 +80,7 @@ source = [ 'draw/draw_pt_post_vs.c', 'draw/draw_pt_so_emit.c', 'draw/draw_pt_util.c', - 'draw/draw_pt_varray.c', - 'draw/draw_pt_vcache.c', + 'draw/draw_pt_vsplit.c', 'draw/draw_vertex.c', 'draw/draw_vs.c', 'draw/draw_vs_aos.c', @@ -91,16 +90,16 @@ source = [ 'draw/draw_vs_ppc.c', 'draw/draw_vs_sse.c', 'draw/draw_vs_varient.c', - 'draw/draw_gs.c', #'indices/u_indices.c', #'indices/u_unfilled_indices.c', 'indices/u_indices_gen.c', 'indices/u_unfilled_gen.c', 'os/os_misc.c', + 'os/os_stream.c', 'os/os_stream_log.c', + 'os/os_stream_null.c', 'os/os_stream_stdc.c', 'os/os_stream_str.c', - 'os/os_stream_null.c', 'os/os_time.c', 'pipebuffer/pb_buffer_fenced.c', 'pipebuffer/pb_buffer_malloc.c', @@ -112,35 +111,35 @@ source = [ 'pipebuffer/pb_bufmgr_pool.c', 'pipebuffer/pb_bufmgr_slab.c', 'pipebuffer/pb_validate.c', + 'rbug/rbug_connection.c', + 'rbug/rbug_context.c', 'rbug/rbug_core.c', + 'rbug/rbug_demarshal.c', 'rbug/rbug_shader.c', - 'rbug/rbug_context.c', 'rbug/rbug_texture.c', - 'rbug/rbug_demarshal.c', - 'rbug/rbug_connection.c', 'rtasm/rtasm_cpu.c', 'rtasm/rtasm_execmem.c', - 'rtasm/rtasm_x86sse.c', 'rtasm/rtasm_ppc.c', 'rtasm/rtasm_ppc_spe.c', + 'rtasm/rtasm_x86sse.c', 'tgsi/tgsi_build.c', 'tgsi/tgsi_dump.c', 'tgsi/tgsi_exec.c', 'tgsi/tgsi_info.c', 'tgsi/tgsi_iterate.c', 'tgsi/tgsi_parse.c', + 'tgsi/tgsi_ppc.c', 'tgsi/tgsi_sanity.c', 'tgsi/tgsi_scan.c', - 'tgsi/tgsi_ppc.c', 'tgsi/tgsi_sse2.c', 'tgsi/tgsi_text.c', 'tgsi/tgsi_transform.c', 'tgsi/tgsi_ureg.c', 'tgsi/tgsi_util.c', - 'translate/translate_generic.c', - 'translate/translate_sse.c', 'translate/translate.c', 'translate/translate_cache.c', + 'translate/translate_generic.c', + 'translate/translate_sse.c', 'util/u_bitmask.c', 'util/u_blit.c', 'util/u_blitter.c', @@ -148,7 +147,9 @@ source = [ 'util/u_caps.c', 'util/u_cpu_detect.c', 'util/u_debug.c', + 'util/u_debug_describe.c', 'util/u_debug_memory.c', + 'util/u_debug_refcnt.c', 'util/u_debug_stack.c', 'util/u_debug_symbol.c', 'util/u_dump_defines.c', @@ -170,6 +171,8 @@ source = [ 'util/u_hash.c', 'util/u_hash_table.c', 'util/u_keymap.c', + 'util/u_linear.c', + 'util/u_linkage.c', 'util/u_network.c', 'util/u_math.c', 'util/u_mempool.c', @@ -208,9 +211,9 @@ if env['llvm']: 'gallivm/lp_bld_format_soa.c', 'gallivm/lp_bld_format_yuv.c', 'gallivm/lp_bld_gather.c', + 'gallivm/lp_bld_init.c', 'gallivm/lp_bld_intr.c', 'gallivm/lp_bld_logic.c', - 'gallivm/lp_bld_init.c', 'gallivm/lp_bld_misc.cpp', 'gallivm/lp_bld_pack.c', 'gallivm/lp_bld_printf.c', @@ -222,10 +225,10 @@ if env['llvm']: 'gallivm/lp_bld_tgsi_soa.c', 'gallivm/lp_bld_type.c', 'draw/draw_llvm.c', - 'draw/draw_pt_fetch_shade_pipeline_llvm.c', + 'draw/draw_llvm_sample.c', 'draw/draw_llvm_translate.c', - 'draw/draw_vs_llvm.c', - 'draw/draw_llvm_sample.c' + 'draw/draw_pt_fetch_shade_pipeline_llvm.c', + 'draw/draw_vs_llvm.c' ] gallium = env.ConvenienceLibrary( diff --git a/src/gallium/auxiliary/draw/draw_cliptest_tmp.h b/src/gallium/auxiliary/draw/draw_cliptest_tmp.h new file mode 100644 index 0000000000..958ed20dc8 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_cliptest_tmp.h @@ -0,0 +1,114 @@ +/************************************************************************** + * + * Copyright 2010, VMware, inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + + +static boolean TAG(do_cliptest)( struct pt_post_vs *pvs, + struct draw_vertex_info *info ) +{ + struct vertex_header *out = info->verts; + const float *scale = pvs->draw->viewport.scale; + const float *trans = pvs->draw->viewport.translate; + /* const */ float (*plane)[4] = pvs->draw->plane; + const unsigned pos = draw_current_shader_position_output(pvs->draw); + const unsigned ef = pvs->draw->vs.edgeflag_output; + const unsigned nr = pvs->draw->nr_planes; + const unsigned flags = (FLAGS); + unsigned need_pipeline = 0; + unsigned j; + + for (j = 0; j < info->count; j++) { + float *position = out->data[pos]; + unsigned mask = 0x0; + + initialize_vertex_header(out); + + if (flags & (DO_CLIP_XY | DO_CLIP_FULL_Z | DO_CLIP_HALF_Z | DO_CLIP_USER)) { + out->clip[0] = position[0]; + out->clip[1] = position[1]; + out->clip[2] = position[2]; + out->clip[3] = position[3]; + + /* Do the hardwired planes first: + */ + if (flags & DO_CLIP_XY) { + if (-position[0] + position[3] < 0) mask |= (1<<0); + if ( position[0] + position[3] < 0) mask |= (1<<1); + if (-position[1] + position[3] < 0) mask |= (1<<2); + if ( position[1] + position[3] < 0) mask |= (1<<3); + } + + /* Clip Z planes according to full cube, half cube or none. + */ + if (flags & DO_CLIP_FULL_Z) { + if ( position[2] + position[3] < 0) mask |= (1<<4); + if (-position[2] + position[3] < 0) mask |= (1<<5); + } + else if (flags & DO_CLIP_HALF_Z) { + if ( position[2] < 0) mask |= (1<<4); + if (-position[2] + position[3] < 0) mask |= (1<<5); + } + + if (flags & DO_CLIP_USER) { + unsigned i; + for (i = 6; i < nr; i++) { + if (dot4(position, plane[i]) < 0) + mask |= (1<<i); + } + } + + out->clipmask = mask; + need_pipeline |= out->clipmask; + } + + if ((flags & DO_VIEWPORT) && mask == 0) + { + /* divide by w */ + float w = 1.0f / position[3]; + + /* Viewport mapping */ + position[0] = position[0] * w * scale[0] + trans[0]; + position[1] = position[1] * w * scale[1] + trans[1]; + position[2] = position[2] * w * scale[2] + trans[2]; + position[3] = w; + } + + if ((flags & DO_EDGEFLAG) && ef) { + const float *edgeflag = out->data[ef]; + out->edgeflag = !(edgeflag[0] != 1.0f); + need_pipeline |= !out->edgeflag; + } + + out = (struct vertex_header *)( (char *)out + info->stride ); + } + + return need_pipeline != 0; +} + + +#undef FLAGS +#undef TAG diff --git a/src/gallium/auxiliary/draw/draw_context.c b/src/gallium/auxiliary/draw/draw_context.c index 995b675b9a..937b093479 100644 --- a/src/gallium/auxiliary/draw/draw_context.c +++ b/src/gallium/auxiliary/draw/draw_context.c @@ -34,6 +34,7 @@ #include "pipe/p_context.h" #include "util/u_memory.h" #include "util/u_math.h" +#include "util/u_cpu_detect.h" #include "draw_context.h" #include "draw_vs.h" #include "draw_gs.h" @@ -41,6 +42,25 @@ #if HAVE_LLVM #include "gallivm/lp_bld_init.h" #include "draw_llvm.h" + +static boolean +draw_get_option_use_llvm(void) +{ + static boolean first = TRUE; + static boolean value; + if (first) { + first = FALSE; + value = debug_get_bool_option("DRAW_USE_LLVM", TRUE); + +#ifdef PIPE_ARCH_X86 + util_cpu_detect(); + /* require SSE2 due to LLVM PR6960. */ + if (!util_cpu_caps.has_sse2) + value = FALSE; +#endif + } + return value; +} #endif struct draw_context *draw_create( struct pipe_context *pipe ) @@ -50,10 +70,13 @@ struct draw_context *draw_create( struct pipe_context *pipe ) goto fail; #if HAVE_LLVM - lp_build_init(); - assert(lp_build_engine); - draw->engine = lp_build_engine; - draw->llvm = draw_llvm_create(draw); + if(draw_get_option_use_llvm()) + { + lp_build_init(); + assert(lp_build_engine); + draw->engine = lp_build_engine; + draw->llvm = draw_llvm_create(draw); + } #endif if (!draw_init(draw)) @@ -83,6 +106,8 @@ boolean draw_init(struct draw_context *draw) ASSIGN_4V( draw->plane[4], 0, 0, 1, 1 ); /* yes these are correct */ ASSIGN_4V( draw->plane[5], 0, 0, -1, 1 ); /* mesa's a bit wonky */ draw->nr_planes = 6; + draw->clip_xy = 1; + draw->clip_z = 1; draw->reduced_prim = ~0; /* != any of PIPE_PRIM_x */ @@ -135,7 +160,8 @@ void draw_destroy( struct draw_context *draw ) draw_vs_destroy( draw ); draw_gs_destroy( draw ); #ifdef HAVE_LLVM - draw_llvm_destroy( draw->llvm ); + if(draw->llvm) + draw_llvm_destroy( draw->llvm ); #endif FREE( draw ); @@ -162,6 +188,14 @@ void draw_set_mrd(struct draw_context *draw, double mrd) } +static void update_clip_flags( struct draw_context *draw ) +{ + draw->clip_xy = !draw->driver.bypass_clip_xy; + draw->clip_z = (!draw->driver.bypass_clip_z && + !draw->depth_clamp); + draw->clip_user = (draw->nr_planes > 6); +} + /** * Register new primitive rasterization/rendering state. * This causes the drawing pipeline to be rebuilt. @@ -176,18 +210,25 @@ void draw_set_rasterizer_state( struct draw_context *draw, draw->rasterizer = raster; draw->rast_handle = rast_handle; - draw->bypass_clipping = draw->driver.bypass_clipping; - } + } } - +/* With a little more work, llvmpipe will be able to turn this off and + * do its own x/y clipping. + * + * Some hardware can turn off clipping altogether - in particular any + * hardware with a TNL unit can do its own clipping, even if it is + * relying on the draw module for some other reason. + */ void draw_set_driver_clipping( struct draw_context *draw, - boolean bypass_clipping ) + boolean bypass_clip_xy, + boolean bypass_clip_z ) { draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE ); - draw->driver.bypass_clipping = bypass_clipping; - draw->bypass_clipping = draw->driver.bypass_clipping; + draw->driver.bypass_clip_xy = bypass_clip_xy; + draw->driver.bypass_clip_z = bypass_clip_z; + update_clip_flags(draw); } @@ -217,6 +258,8 @@ void draw_set_clip_state( struct draw_context *draw, memcpy(&draw->plane[6], clip->ucp, clip->nr * sizeof(clip->ucp[0])); draw->nr_planes = 6 + clip->nr; draw->depth_clamp = clip->depth_clamp; + + update_clip_flags(draw); } @@ -472,47 +515,28 @@ void draw_set_render( struct draw_context *draw, } - -/** - * Tell the drawing context about the index/element buffer to use - * (ala glDrawElements) - * If no element buffer is to be used (i.e. glDrawArrays) then this - * should be called with eltSize=0 and elements=NULL. - * - * \param draw the drawing context - * \param eltSize size of each element (1, 2 or 4 bytes) - * \param elements the element buffer ptr - */ void -draw_set_mapped_element_buffer_range( struct draw_context *draw, - unsigned eltSize, - int eltBias, - unsigned min_index, - unsigned max_index, - const void *elements ) +draw_set_index_buffer(struct draw_context *draw, + const struct pipe_index_buffer *ib) { - draw->pt.user.elts = elements; - draw->pt.user.eltSize = eltSize; - draw->pt.user.eltBias = eltBias; - draw->pt.user.min_index = min_index; - draw->pt.user.max_index = max_index; + if (ib) + memcpy(&draw->pt.index_buffer, ib, sizeof(draw->pt.index_buffer)); + else + memset(&draw->pt.index_buffer, 0, sizeof(draw->pt.index_buffer)); } +/** + * Tell drawing context where to find mapped index/element buffer. + */ void -draw_set_mapped_element_buffer( struct draw_context *draw, - unsigned eltSize, - int eltBias, - const void *elements ) +draw_set_mapped_index_buffer(struct draw_context *draw, + const void *elements) { - draw->pt.user.elts = elements; - draw->pt.user.eltSize = eltSize; - draw->pt.user.eltBias = eltBias; - draw->pt.user.min_index = 0; - draw->pt.user.max_index = 0xffffffff; + draw->pt.user.elts = elements; } - + /* Revamp me please: */ void draw_do_flush( struct draw_context *draw, unsigned flags ) @@ -659,7 +683,8 @@ draw_set_mapped_texture(struct draw_context *draw, const void *data[DRAW_MAX_TEXTURE_LEVELS]) { #ifdef HAVE_LLVM - draw_llvm_set_mapped_texture(draw, + if(draw->llvm) + draw_llvm_set_mapped_texture(draw, sampler_idx, width, height, depth, last_level, row_stride, img_stride, data); diff --git a/src/gallium/auxiliary/draw/draw_context.h b/src/gallium/auxiliary/draw/draw_context.h index 116716af6f..4c780e4dcb 100644 --- a/src/gallium/auxiliary/draw/draw_context.h +++ b/src/gallium/auxiliary/draw/draw_context.h @@ -160,18 +160,11 @@ void draw_set_vertex_elements(struct draw_context *draw, unsigned count, const struct pipe_vertex_element *elements); -void -draw_set_mapped_element_buffer_range( struct draw_context *draw, - unsigned eltSize, - int eltBias, - unsigned min_index, - unsigned max_index, - const void *elements ); - -void draw_set_mapped_element_buffer( struct draw_context *draw, - unsigned eltSize, - int eltBias, - const void *elements ); +void draw_set_index_buffer(struct draw_context *draw, + const struct pipe_index_buffer *ib); + +void draw_set_mapped_index_buffer(struct draw_context *draw, + const void *elements); void draw_set_mapped_vertex_buffer(struct draw_context *draw, unsigned attr, const void *buffer); @@ -196,6 +189,9 @@ draw_set_so_state(struct draw_context *draw, * draw_pt.c */ +void draw_vbo(struct draw_context *draw, + const struct pipe_draw_info *info); + void draw_arrays(struct draw_context *draw, unsigned prim, unsigned start, unsigned count); @@ -216,7 +212,8 @@ void draw_set_render( struct draw_context *draw, struct vbuf_render *render ); void draw_set_driver_clipping( struct draw_context *draw, - boolean bypass_clipping ); + boolean bypass_clip_xy, + boolean bypass_clip_z ); void draw_set_force_passthrough( struct draw_context *draw, boolean enable ); diff --git a/src/gallium/auxiliary/draw/draw_decompose_tmp.h b/src/gallium/auxiliary/draw/draw_decompose_tmp.h index a52d2b5058..a142563af9 100644 --- a/src/gallium/auxiliary/draw/draw_decompose_tmp.h +++ b/src/gallium/auxiliary/draw/draw_decompose_tmp.h @@ -54,10 +54,10 @@ FUNC(FUNC_VARS) FUNC_ENTER; - /* prim, count, and last_vertex_last should have been defined */ + /* prim, prim_flags, count, and last_vertex_last should have been defined */ if (0) { - debug_printf("%s: prim 0x%x, count %d, last_vertex_last %d\n", - __FUNCTION__, prim, count, last_vertex_last); + debug_printf("%s: prim 0x%x, prim_flags 0x%x, count %d, last_vertex_last %d\n", + __FUNCTION__, prim, prim_flags, count, last_vertex_last); } switch (prim) { @@ -80,7 +80,7 @@ FUNC(FUNC_VARS) case PIPE_PRIM_LINE_LOOP: case PIPE_PRIM_LINE_STRIP: if (count >= 2) { - flags = DRAW_PIPE_RESET_STIPPLE; + flags = (prim_flags & DRAW_SPLIT_BEFORE) ? 0 : DRAW_PIPE_RESET_STIPPLE; idx[1] = GET_ELT(0); idx[2] = idx[1]; @@ -90,7 +90,7 @@ FUNC(FUNC_VARS) LINE(flags, idx[0], idx[1]); } /* close the loop */ - if (prim == PIPE_PRIM_LINE_LOOP) + if (prim == PIPE_PRIM_LINE_LOOP && !prim_flags) LINE(flags, idx[1], idx[2]); } break; @@ -255,17 +255,23 @@ FUNC(FUNC_VARS) if (last_vertex_last) { flags = (DRAW_PIPE_RESET_STIPPLE | - DRAW_PIPE_EDGE_FLAG_2 | DRAW_PIPE_EDGE_FLAG_0); + if (!(prim_flags & DRAW_SPLIT_BEFORE)) + flags |= DRAW_PIPE_EDGE_FLAG_2; + edge_next = DRAW_PIPE_EDGE_FLAG_0; - edge_finish = DRAW_PIPE_EDGE_FLAG_1; + edge_finish = + (prim_flags & DRAW_SPLIT_AFTER) ? 0 : DRAW_PIPE_EDGE_FLAG_1; } else { flags = (DRAW_PIPE_RESET_STIPPLE | - DRAW_PIPE_EDGE_FLAG_0 | DRAW_PIPE_EDGE_FLAG_1); + if (!(prim_flags & DRAW_SPLIT_BEFORE)) + flags |= DRAW_PIPE_EDGE_FLAG_0; + edge_next = DRAW_PIPE_EDGE_FLAG_1; - edge_finish = DRAW_PIPE_EDGE_FLAG_2; + edge_finish = + (prim_flags & DRAW_SPLIT_AFTER) ? 0 : DRAW_PIPE_EDGE_FLAG_2; } idx[0] = GET_ELT(0); @@ -300,7 +306,7 @@ FUNC(FUNC_VARS) case PIPE_PRIM_LINE_STRIP_ADJACENCY: if (count >= 4) { - flags = DRAW_PIPE_RESET_STIPPLE; + flags = (prim_flags & DRAW_SPLIT_BEFORE) ? 0 : DRAW_PIPE_RESET_STIPPLE; idx[1] = GET_ELT(0); idx[2] = GET_ELT(1); idx[3] = GET_ELT(2); diff --git a/src/gallium/auxiliary/draw/draw_gs.c b/src/gallium/auxiliary/draw/draw_gs.c index 4a1013e79a..50a03ac95a 100644 --- a/src/gallium/auxiliary/draw/draw_gs.c +++ b/src/gallium/auxiliary/draw/draw_gs.c @@ -380,7 +380,7 @@ static void gs_tri_adj(struct draw_geometry_shader *shader, #define FUNC gs_run_elts #define LOCAL_VARS const ushort *elts = input_prims->elts; -#define GET_ELT(idx) (elts[idx] & ~DRAW_PIPE_FLAG_MASK) +#define GET_ELT(idx) (elts[idx]) #include "draw_gs_tmp.h" @@ -457,6 +457,7 @@ int draw_geometry_shader_run(struct draw_geometry_shader *shader, output_prims->start = 0; output_prims->count = shader->emitted_vertices; output_prims->prim = shader->output_primitive; + output_prims->flags = 0x0; output_prims->primitive_lengths = shader->primitive_lengths; output_prims->primitive_count = shader->emitted_primitives; output_verts->count = shader->emitted_vertices; diff --git a/src/gallium/auxiliary/draw/draw_gs_tmp.h b/src/gallium/auxiliary/draw/draw_gs_tmp.h index 4a17af0dea..de7b02655a 100644 --- a/src/gallium/auxiliary/draw/draw_gs_tmp.h +++ b/src/gallium/auxiliary/draw/draw_gs_tmp.h @@ -6,12 +6,10 @@ #define FUNC_ENTER \ /* declare more local vars */ \ - struct draw_context *draw = gs->draw; \ const unsigned prim = input_prims->prim; \ + const unsigned prim_flags = input_prims->flags; \ const unsigned count = input_prims->count; \ - const boolean last_vertex_last = \ - !(draw->rasterizer->flatshade && \ - draw->rasterizer->flatshade_first); \ + const boolean last_vertex_last = TRUE; \ do { \ debug_assert(input_prims->primitive_count == 1); \ switch (prim) { \ diff --git a/src/gallium/auxiliary/draw/draw_llvm.c b/src/gallium/auxiliary/draw/draw_llvm.c index 8d53601d19..8759c38cab 100644 --- a/src/gallium/auxiliary/draw/draw_llvm.c +++ b/src/gallium/auxiliary/draw/draw_llvm.c @@ -210,13 +210,6 @@ draw_llvm_create(struct draw_context *draw) { struct draw_llvm *llvm; -#ifdef PIPE_ARCH_X86 - util_cpu_detect(); - /* require SSE2 due to LLVM PR6960. */ - if (!util_cpu_caps.has_sse2) - return NULL; -#endif - llvm = CALLOC_STRUCT( draw_llvm ); if (!llvm) return NULL; @@ -292,15 +285,23 @@ draw_llvm_destroy(struct draw_llvm *llvm) } struct draw_llvm_variant * -draw_llvm_create_variant(struct draw_llvm *llvm, int num_inputs) +draw_llvm_create_variant(struct draw_llvm *llvm, + unsigned num_inputs, + const struct draw_llvm_variant_key *key) { - struct draw_llvm_variant *variant = MALLOC(sizeof(struct draw_llvm_variant)); + struct draw_llvm_variant *variant; struct llvm_vertex_shader *shader = llvm_vertex_shader(llvm->draw->vs.vertex_shader); + variant = MALLOC(sizeof *variant + + shader->variant_key_size - + sizeof variant->key); + if (variant == NULL) + return NULL; + variant->llvm = llvm; - draw_llvm_make_variant_key(llvm, &variant->key); + memcpy(&variant->key, key, shader->variant_key_size); llvm->vertex_header_ptr_type = create_vertex_header(llvm, num_inputs); @@ -738,8 +739,9 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant) step = LLVMConstInt(LLVMInt32Type(), max_vertices, 0); /* code generated texture sampling */ - sampler = draw_llvm_sampler_soa_create(variant->key.sampler, - context_ptr); + sampler = draw_llvm_sampler_soa_create( + draw_llvm_variant_key_samplers(&variant->key), + context_ptr); #if DEBUG_STORE lp_build_printf(builder, "start = %d, end = %d, step = %d\n", @@ -901,8 +903,9 @@ draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *varian step = LLVMConstInt(LLVMInt32Type(), max_vertices, 0); /* code generated texture sampling */ - sampler = draw_llvm_sampler_soa_create(variant->key.sampler, - context_ptr); + sampler = draw_llvm_sampler_soa_create( + draw_llvm_variant_key_samplers(&variant->key), + context_ptr); fetch_max = LLVMBuildSub(builder, fetch_count, LLVMConstInt(LLVMInt32Type(), 1, 0), @@ -1002,35 +1005,42 @@ draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *varian lp_func_delete_body(variant->function_elts); } -void -draw_llvm_make_variant_key(struct draw_llvm *llvm, - struct draw_llvm_variant_key *key) + +struct draw_llvm_variant_key * +draw_llvm_make_variant_key(struct draw_llvm *llvm, char *store) { unsigned i; + struct draw_llvm_variant_key *key; + struct lp_sampler_static_state *sampler; - memset(key, 0, sizeof(struct draw_llvm_variant_key)); + key = (struct draw_llvm_variant_key *)store; + /* Presumably all variants of the shader should have the same + * number of vertex elements - ie the number of shader inputs. + */ key->nr_vertex_elements = llvm->draw->pt.nr_vertex_elements; + /* All variants of this shader will have the same value for + * nr_samplers. Not yet trying to compact away holes in the + * sampler array. + */ + key->nr_samplers = llvm->draw->vs.vertex_shader->info.file_max[TGSI_FILE_SAMPLER] + 1; + + sampler = draw_llvm_variant_key_samplers(key); + memcpy(key->vertex_element, llvm->draw->pt.vertex_element, sizeof(struct pipe_vertex_element) * key->nr_vertex_elements); + + memset(sampler, 0, key->nr_samplers * sizeof *sampler); - memcpy(&key->vs, - &llvm->draw->vs.vertex_shader->state, - sizeof(struct pipe_shader_state)); - - /* if the driver implemented the sampling hooks then - * setup our sampling state */ - if (llvm->draw->num_sampler_views && llvm->draw->num_samplers) { - for(i = 0; i < PIPE_MAX_VERTEX_SAMPLERS; ++i) { - struct draw_vertex_shader *shader = llvm->draw->vs.vertex_shader; - if(shader->info.file_mask[TGSI_FILE_SAMPLER] & (1 << i)) - lp_sampler_static_state(&key->sampler[i], - llvm->draw->sampler_views[i], - llvm->draw->samplers[i]); - } + for (i = 0 ; i < key->nr_samplers; i++) { + lp_sampler_static_state(&sampler[i], + llvm->draw->sampler_views[i], + llvm->draw->samplers[i]); } + + return key; } void diff --git a/src/gallium/auxiliary/draw/draw_llvm.h b/src/gallium/auxiliary/draw/draw_llvm.h index 4addb47d2d..6196b2f983 100644 --- a/src/gallium/auxiliary/draw/draw_llvm.h +++ b/src/gallium/auxiliary/draw/draw_llvm.h @@ -151,12 +151,43 @@ typedef void struct draw_llvm_variant_key { - struct pipe_vertex_element vertex_element[PIPE_MAX_ATTRIBS]; - unsigned nr_vertex_elements; - struct pipe_shader_state vs; - struct lp_sampler_static_state sampler[PIPE_MAX_VERTEX_SAMPLERS]; + unsigned nr_vertex_elements:16; + unsigned nr_samplers:16; + + /* Variable number of vertex elements: + */ + struct pipe_vertex_element vertex_element[1]; + + /* Followed by variable number of samplers: + */ +/* struct lp_sampler_static_state sampler; */ }; +#define DRAW_LLVM_MAX_VARIANT_KEY_SIZE \ + (sizeof(struct draw_llvm_variant_key) + \ + PIPE_MAX_VERTEX_SAMPLERS * sizeof(struct lp_sampler_static_state) + \ + (PIPE_MAX_ATTRIBS-1) * sizeof(struct pipe_vertex_element)) + + +static INLINE size_t +draw_llvm_variant_key_size(unsigned nr_vertex_elements, + unsigned nr_samplers) +{ + return (sizeof(struct draw_llvm_variant_key) + + nr_samplers * sizeof(struct lp_sampler_static_state) + + (nr_vertex_elements - 1) * sizeof(struct pipe_vertex_element)); +} + + +static INLINE struct lp_sampler_static_state * +draw_llvm_variant_key_samplers(struct draw_llvm_variant_key *key) +{ + return (struct lp_sampler_static_state *) + &key->vertex_element[key->nr_vertex_elements]; +} + + + struct draw_llvm_variant_list_item { struct draw_llvm_variant *base; @@ -165,7 +196,6 @@ struct draw_llvm_variant_list_item struct draw_llvm_variant { - struct draw_llvm_variant_key key; LLVMValueRef function; LLVMValueRef function_elts; draw_jit_vert_func jit_func; @@ -176,11 +206,16 @@ struct draw_llvm_variant struct draw_llvm *llvm; struct draw_llvm_variant_list_item list_item_global; struct draw_llvm_variant_list_item list_item_local; + + /* key is variable-sized, must be last */ + struct draw_llvm_variant_key key; + /* key is variable-sized, must be last */ }; struct llvm_vertex_shader { struct draw_vertex_shader base; + unsigned variant_key_size; struct draw_llvm_variant_list_item variants; unsigned variants_created; unsigned variants_cached; @@ -220,14 +255,15 @@ void draw_llvm_destroy(struct draw_llvm *llvm); struct draw_llvm_variant * -draw_llvm_create_variant(struct draw_llvm *llvm, int num_inputs); +draw_llvm_create_variant(struct draw_llvm *llvm, + unsigned num_vertex_header_attribs, + const struct draw_llvm_variant_key *key); void draw_llvm_destroy_variant(struct draw_llvm_variant *variant); -void -draw_llvm_make_variant_key(struct draw_llvm *llvm, - struct draw_llvm_variant_key *key); +struct draw_llvm_variant_key * +draw_llvm_make_variant_key(struct draw_llvm *llvm, char *store); LLVMValueRef draw_llvm_translate_from(LLVMBuilderRef builder, diff --git a/src/gallium/auxiliary/draw/draw_pipe.c b/src/gallium/auxiliary/draw/draw_pipe.c index 58995e0724..6206197dae 100644 --- a/src/gallium/auxiliary/draw/draw_pipe.c +++ b/src/gallium/auxiliary/draw/draw_pipe.c @@ -169,35 +169,27 @@ static void do_triangle( struct draw_context *draw, /* * Set up macros for draw_pt_decompose.h template code. * This code uses vertex indexes / elements. - * - * Flags are needed by the stipple and unfilled stages. When the two stages - * are active, vcache_run_extras is called and the flags are stored in the - * higher bits of i0. Otherwise, flags do not matter. */ #define TRIANGLE(flags,i0,i1,i2) \ do { \ - assert(!((i1) & DRAW_PIPE_FLAG_MASK)); \ - assert(!((i2) & DRAW_PIPE_FLAG_MASK)); \ do_triangle( draw, \ - i0, /* flags */ \ - verts + stride * (i0 & ~DRAW_PIPE_FLAG_MASK), \ + flags, \ + verts + stride * (i0), \ verts + stride * (i1), \ verts + stride * (i2) ); \ } while (0) #define LINE(flags,i0,i1) \ do { \ - assert(!((i1) & DRAW_PIPE_FLAG_MASK)); \ do_line( draw, \ - i0, /* flags */ \ - verts + stride * (i0 & ~DRAW_PIPE_FLAG_MASK), \ + flags, \ + verts + stride * (i0), \ verts + stride * (i1) ); \ } while (0) #define POINT(i0) \ do { \ - assert(!((i0) & DRAW_PIPE_FLAG_MASK)); \ do_point( draw, verts + stride * (i0) ); \ } while (0) @@ -207,6 +199,7 @@ static void do_triangle( struct draw_context *draw, #define FUNC_VARS \ struct draw_context *draw, \ unsigned prim, \ + unsigned prim_flags, \ struct vertex_header *vertices, \ unsigned stride, \ const ushort *elts, \ @@ -245,22 +238,27 @@ void draw_pipeline_run( struct draw_context *draw, const unsigned count = prim_info->primitive_lengths[i]; #if DEBUG - /* make sure none of the element indexes go outside the vertex buffer */ + /* Warn if one of the element indexes go outside the vertex buffer */ { unsigned max_index = 0x0, i; /* find the largest element index */ for (i = 0; i < count; i++) { - unsigned int index = (prim_info->elts[start + i] - & ~DRAW_PIPE_FLAG_MASK); + unsigned int index = prim_info->elts[start + i]; if (index > max_index) max_index = index; } - assert(max_index <= vert_info->count); + if (max_index >= vert_info->count) { + debug_printf("%s: max_index (%u) outside vertex buffer (%u)\n", + __FUNCTION__, + max_index, + vert_info->count); + } } #endif pipe_run_elts(draw, prim_info->prim, + prim_info->flags, vert_info->verts, vert_info->stride, prim_info->elts + start, @@ -298,6 +296,7 @@ void draw_pipeline_run( struct draw_context *draw, #define FUNC_VARS \ struct draw_context *draw, \ unsigned prim, \ + unsigned prim_flags, \ struct vertex_header *vertices, \ unsigned stride, \ unsigned count @@ -330,6 +329,7 @@ void draw_pipeline_run_linear( struct draw_context *draw, pipe_run_linear(draw, prim_info->prim, + prim_info->flags, (struct vertex_header*)verts, vert_info->stride, count); diff --git a/src/gallium/auxiliary/draw/draw_pipe_validate.c b/src/gallium/auxiliary/draw/draw_pipe_validate.c index eafa29276f..8b92543987 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_validate.c +++ b/src/gallium/auxiliary/draw/draw_pipe_validate.c @@ -265,7 +265,7 @@ static struct draw_stage *validate_pipeline( struct draw_stage *stage ) /* Clip stage */ - if (!draw->bypass_clipping) + if (draw->clip_xy || draw->clip_z || draw->clip_user) { draw->pipeline.clip->next = next; next = draw->pipeline.clip; diff --git a/src/gallium/auxiliary/draw/draw_pipe_vbuf.c b/src/gallium/auxiliary/draw/draw_pipe_vbuf.c index 3c93c9014a..58c5858734 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_vbuf.c +++ b/src/gallium/auxiliary/draw/draw_pipe_vbuf.c @@ -353,9 +353,6 @@ vbuf_alloc_vertices( struct vbuf_stage *vbuf ) /* Allocate a new vertex buffer */ vbuf->max_vertices = vbuf->render->max_vertex_buffer_bytes / vbuf->vertex_size; - /* even number */ - vbuf->max_vertices = vbuf->max_vertices & ~1; - if(vbuf->max_vertices >= UNDEFINED_VERTEX_ID) vbuf->max_vertices = UNDEFINED_VERTEX_ID - 1; diff --git a/src/gallium/auxiliary/draw/draw_private.h b/src/gallium/auxiliary/draw/draw_private.h index 397d4bf653..362f563ba6 100644 --- a/src/gallium/auxiliary/draw/draw_private.h +++ b/src/gallium/auxiliary/draw/draw_private.h @@ -140,8 +140,7 @@ struct draw_context } middle; struct { - struct draw_pt_front_end *vcache; - struct draw_pt_front_end *varray; + struct draw_pt_front_end *vsplit; } front; struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS]; @@ -150,6 +149,8 @@ struct draw_context struct pipe_vertex_element vertex_element[PIPE_MAX_ATTRIBS]; unsigned nr_vertex_elements; + struct pipe_index_buffer index_buffer; + /* user-space vertex data, buffers */ struct { /** vertex element/index buffer (ex: glDrawElements) */ @@ -175,13 +176,19 @@ struct draw_context } pt; struct { - boolean bypass_clipping; - boolean bypass_vs; + boolean bypass_clip_xy; + boolean bypass_clip_z; } driver; boolean flushing; /**< debugging/sanity */ boolean suspend_flushing; /**< internally set */ - boolean bypass_clipping; /**< set if either api or driver bypass_clipping true */ + + /* Flags set if API requires clipping in these planes and the + * driver doesn't indicate that it can do it for us. + */ + boolean clip_xy; + boolean clip_z; + boolean clip_user; boolean force_passthrough; /**< never clip or shade */ @@ -296,6 +303,10 @@ struct draw_vertex_info { unsigned count; }; +/* these flags are set if the primitive is a segment of a larger one */ +#define DRAW_SPLIT_BEFORE 0x1 +#define DRAW_SPLIT_AFTER 0x2 + struct draw_prim_info { boolean linear; unsigned start; @@ -304,6 +315,7 @@ struct draw_prim_info { unsigned count; unsigned prim; + unsigned flags; unsigned *primitive_lengths; unsigned primitive_count; }; @@ -369,21 +381,15 @@ void draw_pipeline_destroy( struct draw_context *draw ); -/* We use the top few bits in the elts[] parameter to convey a little - * API information. This limits the number of vertices we can address - * to only 4096 -- if that becomes a problem, we can switch to 32-bit - * draw indices. - * - * These flags expected at first vertex of lines & triangles when - * unfilled and/or line stipple modes are operational. +/* + * These flags are used by the pipeline when unfilled and/or line stipple modes + * are operational. */ -#define DRAW_PIPE_MAX_VERTICES (0x1<<12) -#define DRAW_PIPE_EDGE_FLAG_0 (0x1<<12) -#define DRAW_PIPE_EDGE_FLAG_1 (0x2<<12) -#define DRAW_PIPE_EDGE_FLAG_2 (0x4<<12) -#define DRAW_PIPE_EDGE_FLAG_ALL (0x7<<12) -#define DRAW_PIPE_RESET_STIPPLE (0x8<<12) -#define DRAW_PIPE_FLAG_MASK (0xf<<12) +#define DRAW_PIPE_EDGE_FLAG_0 0x1 +#define DRAW_PIPE_EDGE_FLAG_1 0x2 +#define DRAW_PIPE_EDGE_FLAG_2 0x4 +#define DRAW_PIPE_EDGE_FLAG_ALL 0x7 +#define DRAW_PIPE_RESET_STIPPLE 0x8 void draw_pipeline_run( struct draw_context *draw, const struct draw_vertex_info *vert, diff --git a/src/gallium/auxiliary/draw/draw_pt.c b/src/gallium/auxiliary/draw/draw_pt.c index 248927505d..f44bf2507c 100644 --- a/src/gallium/auxiliary/draw/draw_pt.c +++ b/src/gallium/auxiliary/draw/draw_pt.c @@ -39,25 +39,14 @@ #include "util/u_math.h" #include "util/u_prim.h" #include "util/u_format.h" +#include "util/u_draw.h" DEBUG_GET_ONCE_BOOL_OPTION(draw_fse, "DRAW_FSE", FALSE) DEBUG_GET_ONCE_BOOL_OPTION(draw_no_fse, "DRAW_NO_FSE", FALSE) -#ifdef HAVE_LLVM -DEBUG_GET_ONCE_BOOL_OPTION(draw_use_llvm, "DRAW_USE_LLVM", TRUE) -#endif - -static unsigned trim( unsigned count, unsigned first, unsigned incr ) -{ - if (count < first) - return 0; - return count - (count - first) % incr; -} - - /* Overall we split things into: - * - frontend -- prepare fetch_elts, draw_elts - eg vcache + * - frontend -- prepare fetch_elts, draw_elts - eg vsplit * - middle -- fetch, shade, cliptest, viewport * - pipeline -- the prim pipeline: clipping, wide lines, etc * - backend -- the vbuf_render provided by the driver. @@ -77,7 +66,7 @@ draw_pt_arrays(struct draw_context *draw, { unsigned first, incr; draw_pt_split_prim(prim, &first, &incr); - count = trim(count, first, incr); + count = draw_pt_trim_count(count, first, incr); if (count < first) return TRUE; } @@ -97,7 +86,9 @@ draw_pt_arrays(struct draw_context *draw, opt |= PT_PIPELINE; } - if (!draw->bypass_clipping && !draw->pt.test_fse) { + if ((draw->clip_xy || + draw->clip_z || + draw->clip_user) && !draw->pt.test_fse) { opt |= PT_CLIPTEST; } @@ -115,22 +106,11 @@ draw_pt_arrays(struct draw_context *draw, middle = draw->pt.middle.general; } - - /* Pick the right frontend - */ - if (draw->pt.user.elts || (opt & PT_PIPELINE)) { - frontend = draw->pt.front.vcache; - } else { - frontend = draw->pt.front.varray; - } + frontend = draw->pt.front.vsplit; frontend->prepare( frontend, prim, middle, opt ); - frontend->run(frontend, - draw_pt_elt_func(draw), - draw_pt_elt_ptr(draw, start), - draw->pt.user.eltBias, - count); + frontend->run(frontend, start, count); frontend->finish( frontend ); @@ -143,12 +123,8 @@ boolean draw_pt_init( struct draw_context *draw ) draw->pt.test_fse = debug_get_option_draw_fse(); draw->pt.no_fse = debug_get_option_draw_no_fse(); - draw->pt.front.vcache = draw_pt_vcache( draw ); - if (!draw->pt.front.vcache) - return FALSE; - - draw->pt.front.varray = draw_pt_varray(draw); - if (!draw->pt.front.varray) + draw->pt.front.vsplit = draw_pt_vsplit(draw); + if (!draw->pt.front.vsplit) return FALSE; draw->pt.middle.fetch_emit = draw_pt_fetch_emit( draw ); @@ -164,7 +140,7 @@ boolean draw_pt_init( struct draw_context *draw ) return FALSE; #if HAVE_LLVM - if (debug_get_option_draw_use_llvm()) + if (draw->llvm) draw->pt.middle.llvm = draw_pt_fetch_pipeline_or_emit_llvm( draw ); #endif @@ -194,14 +170,9 @@ void draw_pt_destroy( struct draw_context *draw ) draw->pt.middle.fetch_shade_emit = NULL; } - if (draw->pt.front.vcache) { - draw->pt.front.vcache->destroy( draw->pt.front.vcache ); - draw->pt.front.vcache = NULL; - } - - if (draw->pt.front.varray) { - draw->pt.front.varray->destroy( draw->pt.front.varray ); - draw->pt.front.varray = NULL; + if (draw->pt.front.vsplit) { + draw->pt.front.vsplit->destroy( draw->pt.front.vsplit ); + draw->pt.front.vsplit = NULL; } } @@ -221,24 +192,29 @@ draw_print_arrays(struct draw_context *draw, uint prim, int start, uint count) uint ii = 0; uint j; - if (draw->pt.user.elts) { + if (draw->pt.user.eltSize) { + const char *elts; + /* indexed arrays */ + elts = (const char *) draw->pt.user.elts; + elts += draw->pt.index_buffer.offset; + switch (draw->pt.user.eltSize) { case 1: { - const ubyte *elem = (const ubyte *) draw->pt.user.elts; + const ubyte *elem = (const ubyte *) elts; ii = elem[start + i]; } break; case 2: { - const ushort *elem = (const ushort *) draw->pt.user.elts; + const ushort *elem = (const ushort *) elts; ii = elem[start + i]; } break; case 4: { - const uint *elem = (const uint *) draw->pt.user.elts; + const uint *elem = (const uint *) elts; ii = elem[start + i]; } break; @@ -324,17 +300,8 @@ draw_arrays(struct draw_context *draw, unsigned prim, /** - * Draw vertex arrays. - * This is the main entrypoint into the drawing module. - * If drawing an indexed primitive, the draw_set_mapped_element_buffer_range() - * function should have already been called to specify the element/index buffer - * information. - * - * \param prim one of PIPE_PRIM_x - * \param start index of first vertex to draw - * \param count number of vertices to draw - * \param startInstance number for the first primitive instance (usually 0). - * \param instanceCount number of instances to draw (1=non-instanced) + * Instanced drawing. + * \sa draw_vbo */ void draw_arrays_instanced(struct draw_context *draw, @@ -344,10 +311,50 @@ draw_arrays_instanced(struct draw_context *draw, unsigned startInstance, unsigned instanceCount) { - unsigned reduced_prim = u_reduced_prim(mode); + struct pipe_draw_info info; + + util_draw_init_info(&info); + + info.mode = mode; + info.start = start; + info.count = count; + info.start_instance = startInstance; + info.instance_count = instanceCount; + + info.indexed = (draw->pt.user.elts != NULL); + if (!info.indexed) { + info.min_index = start; + info.max_index = start + count - 1; + } + + draw_vbo(draw, &info); +} + + +/** + * Draw vertex arrays. + * This is the main entrypoint into the drawing module. If drawing an indexed + * primitive, the draw_set_index_buffer() and draw_set_mapped_index_buffer() + * functions should have already been called to specify the element/index + * buffer information. + */ +void +draw_vbo(struct draw_context *draw, + const struct pipe_draw_info *info) +{ + unsigned reduced_prim = u_reduced_prim(info->mode); unsigned instance; - assert(instanceCount > 0); + assert(info->instance_count > 0); + if (info->indexed) + assert(draw->pt.user.elts); + + draw->pt.user.eltSize = + (info->indexed) ? draw->pt.index_buffer.index_size : 0; + + draw->pt.user.eltBias = info->index_bias; + draw->pt.user.min_index = info->min_index; + draw->pt.user.max_index = info->max_index; if (reduced_prim != draw->reduced_prim) { draw_do_flush(draw, DRAW_FLUSH_STATE_CHANGE); @@ -355,8 +362,8 @@ draw_arrays_instanced(struct draw_context *draw, } if (0) - debug_printf("draw_arrays(mode=%u start=%u count=%u):\n", - mode, start, count); + debug_printf("draw_vbo(mode=%u start=%u count=%u):\n", + info->mode, info->start, info->count); if (0) tgsi_dump(draw->vs.vertex_shader->state.tokens, 0); @@ -384,10 +391,10 @@ draw_arrays_instanced(struct draw_context *draw, } if (0) - draw_print_arrays(draw, mode, start, MIN2(count, 20)); + draw_print_arrays(draw, info->mode, info->start, MIN2(info->count, 20)); - for (instance = 0; instance < instanceCount; instance++) { - draw->instance_id = instance + startInstance; - draw_pt_arrays(draw, mode, start, count); + for (instance = 0; instance < info->instance_count; instance++) { + draw->instance_id = instance + info->start_instance; + draw_pt_arrays(draw, info->mode, info->start, info->count); } } diff --git a/src/gallium/auxiliary/draw/draw_pt.h b/src/gallium/auxiliary/draw/draw_pt.h index 44356fba4c..5fbb424291 100644 --- a/src/gallium/auxiliary/draw/draw_pt.h +++ b/src/gallium/auxiliary/draw/draw_pt.h @@ -35,8 +35,6 @@ #include "pipe/p_compiler.h" -typedef unsigned (*pt_elt_func)( const void *elts, unsigned idx ); - struct draw_pt_middle_end; struct draw_context; struct draw_prim_info; @@ -52,13 +50,18 @@ struct draw_vertex_info; /* The "front end" - prepare sets of fetch, draw elements for the * middle end. * - * Currenly one version of this: - * - vcache - catchall implementation, decomposes to TRI/LINE/POINT prims - * Later: - * - varray, varray_split - * - velement, velement_split + * The fetch elements are indices to the vertices. The draw elements are + * indices to the fetched vertices. When both arrays of elements are both + * linear, middle->run_linear is called; When only the fetch elements are + * linear, middle->run_linear_elts is called; Otherwise, middle->run is + * called. + * + * When the number of the draw elements exceeds max_vertex of the middle end, + * the draw elements (as well as the fetch elements) are splitted and the + * middle end is called multiple times. * - * Currenly only using the vcache version. + * Currenly there is: + * - vsplit - catchall implementation, splits big prims */ struct draw_pt_front_end { void (*prepare)( struct draw_pt_front_end *, @@ -67,9 +70,7 @@ struct draw_pt_front_end { unsigned opt ); void (*run)( struct draw_pt_front_end *, - pt_elt_func elt_func, - const void *elt_ptr, - int elt_bias, + unsigned start, unsigned count ); void (*finish)( struct draw_pt_front_end * ); @@ -80,6 +81,8 @@ struct draw_pt_front_end { /* The "middle end" - prepares actual hardware vertices for the * hardware backend. * + * prim_flags is as defined by pipe_draw_info::flags. + * * Currently two versions of this: * - fetch, vertex shade, cliptest, prim-pipeline * - fetch, emit (ie passthrough) @@ -94,11 +97,13 @@ struct draw_pt_middle_end { const unsigned *fetch_elts, unsigned fetch_count, const ushort *draw_elts, - unsigned draw_count ); + unsigned draw_count, + unsigned prim_flags ); void (*run_linear)(struct draw_pt_middle_end *, unsigned start, - unsigned count); + unsigned count, + unsigned prim_flags ); /* Transform all vertices in a linear range and then draw them with * the supplied element list. May fail and return FALSE. @@ -107,7 +112,8 @@ struct draw_pt_middle_end { unsigned fetch_start, unsigned fetch_count, const ushort *draw_elts, - unsigned draw_count ); + unsigned draw_count, + unsigned prim_flags ); int (*get_max_vertex_count)( struct draw_pt_middle_end * ); @@ -122,19 +128,11 @@ struct vbuf_render; struct vertex_header; -/* Helper functions. - */ -pt_elt_func draw_pt_elt_func( struct draw_context *draw ); -const void *draw_pt_elt_ptr( struct draw_context *draw, - unsigned start ); - /* Frontends: * - * Currently only the general-purpose vcache implementation, could add - * a special case for tiny vertex buffers. + * Currently only the general-purpose vsplit implementation. */ -struct draw_pt_front_end *draw_pt_vcache( struct draw_context *draw ); -struct draw_pt_front_end *draw_pt_varray(struct draw_context *draw); +struct draw_pt_front_end *draw_pt_vsplit(struct draw_context *draw); /* Middle-ends: @@ -223,7 +221,9 @@ boolean draw_pt_post_vs_run( struct pt_post_vs *pvs, struct draw_vertex_info *info ); void draw_pt_post_vs_prepare( struct pt_post_vs *pvs, - boolean bypass_clipping, + boolean clip_xy, + boolean clip_z, + boolean clip_user, boolean bypass_viewport, boolean opengl, boolean need_edgeflags ); @@ -237,6 +237,7 @@ void draw_pt_post_vs_destroy( struct pt_post_vs *pvs ); * Utils: */ void draw_pt_split_prim(unsigned prim, unsigned *first, unsigned *incr); +unsigned draw_pt_trim_count(unsigned count, unsigned first, unsigned incr); #endif diff --git a/src/gallium/auxiliary/draw/draw_pt_elts.c b/src/gallium/auxiliary/draw/draw_pt_elts.c deleted file mode 100644 index 88f4d9f495..0000000000 --- a/src/gallium/auxiliary/draw/draw_pt_elts.c +++ /dev/null @@ -1,89 +0,0 @@ -/************************************************************************** - * - * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. - * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ - - /* - * Authors: - * Keith Whitwell <keith@tungstengraphics.com> - */ - -#include "draw/draw_pt.h" -#include "draw/draw_private.h" - -/* Neat get_elt func that also works for varrays drawing by encoding - * the start value into a pointer. - */ - -static unsigned elt_uint( const void *elts, unsigned idx ) -{ - return *(((const uint *)elts) + idx); -} - -static unsigned elt_ushort( const void *elts, unsigned idx ) -{ - return *(((const ushort *)elts) + idx); -} - -static unsigned elt_ubyte( const void *elts, unsigned idx ) -{ - return *(((const ubyte *)elts) + idx); -} - -static unsigned elt_vert( const void *elts, unsigned idx ) -{ - /* unsigned index is packed in the pointer */ - return (unsigned)(uintptr_t)elts + idx; -} - -pt_elt_func draw_pt_elt_func( struct draw_context *draw ) -{ - switch (draw->pt.user.eltSize) { - case 0: return &elt_vert; - case 1: return &elt_ubyte; - case 2: return &elt_ushort; - case 4: return &elt_uint; - default: return NULL; - } -} - -const void *draw_pt_elt_ptr( struct draw_context *draw, - unsigned start ) -{ - const char *elts = draw->pt.user.elts; - - switch (draw->pt.user.eltSize) { - case 0: - return (const void *)(((const ubyte *)NULL) + start); - case 1: - return (const void *)(((const ubyte *)elts) + start); - case 2: - return (const void *)(((const ushort *)elts) + start); - case 4: - return (const void *)(((const uint *)elts) + start); - default: - return NULL; - } -} diff --git a/src/gallium/auxiliary/draw/draw_pt_emit.c b/src/gallium/auxiliary/draw/draw_pt_emit.c index 5568fbb9f8..c8dfc16911 100644 --- a/src/gallium/auxiliary/draw/draw_pt_emit.c +++ b/src/gallium/auxiliary/draw/draw_pt_emit.c @@ -120,9 +120,6 @@ void draw_pt_emit_prepare( struct pt_emit *emit, *max_vertices = (draw->render->max_vertex_buffer_bytes / (vinfo->size * 4)); - - /* even number */ - *max_vertices = *max_vertices & ~1; } @@ -147,11 +144,6 @@ void draw_pt_emit( struct pt_emit *emit, if (vertex_count == 0) return; - if (vertex_count >= UNDEFINED_VERTEX_ID) { - assert(0); - return; - } - /* XXX: and work out some way to coordinate the render primitive * between vbuf.c and here... */ @@ -226,9 +218,6 @@ void draw_pt_emit_linear(struct pt_emit *emit, */ draw_do_flush( draw, DRAW_FLUSH_BACKEND ); - if (count >= UNDEFINED_VERTEX_ID) - goto fail; - /* XXX: and work out some way to coordinate the render primitive * between vbuf.c and here... */ diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c b/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c index 5c8af17c8e..e706b7796f 100644 --- a/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c +++ b/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c @@ -191,15 +191,6 @@ static void fetch_emit_prepare( struct draw_pt_middle_end *middle, *max_vertices = (draw->render->max_vertex_buffer_bytes / (vinfo->size * 4)); - - /* Return an even number of verts. - * This prevents "parity" errors when splitting long triangle strips which - * can lead to front/back culling mix-ups. - * Every other triangle in a strip has an alternate front/back orientation - * so splitting at an odd position can cause the orientation of subsequent - * triangles to get reversed. - */ - *max_vertices = *max_vertices & ~1; } @@ -210,7 +201,8 @@ static void fetch_emit_run( struct draw_pt_middle_end *middle, const unsigned *fetch_elts, unsigned fetch_count, const ushort *draw_elts, - unsigned draw_count ) + unsigned draw_count, + unsigned prim_flags ) { struct fetch_emit_middle_end *feme = (struct fetch_emit_middle_end *)middle; struct draw_context *draw = feme->draw; @@ -220,11 +212,6 @@ static void fetch_emit_run( struct draw_pt_middle_end *middle, */ draw_do_flush( draw, DRAW_FLUSH_BACKEND ); - if (fetch_count >= UNDEFINED_VERTEX_ID) { - assert(0); - return; - } - draw->render->allocate_vertices( draw->render, (ushort)feme->translate->key.output_stride, (ushort)fetch_count ); @@ -273,7 +260,8 @@ static void fetch_emit_run( struct draw_pt_middle_end *middle, static void fetch_emit_run_linear( struct draw_pt_middle_end *middle, unsigned start, - unsigned count ) + unsigned count, + unsigned prim_flags ) { struct fetch_emit_middle_end *feme = (struct fetch_emit_middle_end *)middle; struct draw_context *draw = feme->draw; @@ -283,9 +271,6 @@ static void fetch_emit_run_linear( struct draw_pt_middle_end *middle, */ draw_do_flush( draw, DRAW_FLUSH_BACKEND ); - if (count >= UNDEFINED_VERTEX_ID) - goto fail; - if (!draw->render->allocate_vertices( draw->render, (ushort)feme->translate->key.output_stride, (ushort)count )) @@ -334,7 +319,8 @@ static boolean fetch_emit_run_linear_elts( struct draw_pt_middle_end *middle, unsigned start, unsigned count, const ushort *draw_elts, - unsigned draw_count ) + unsigned draw_count, + unsigned prim_flags ) { struct fetch_emit_middle_end *feme = (struct fetch_emit_middle_end *)middle; struct draw_context *draw = feme->draw; @@ -344,9 +330,6 @@ static boolean fetch_emit_run_linear_elts( struct draw_pt_middle_end *middle, */ draw_do_flush( draw, DRAW_FLUSH_BACKEND ); - if (count >= UNDEFINED_VERTEX_ID) - return FALSE; - if (!draw->render->allocate_vertices( draw->render, (ushort)feme->translate->key.output_stride, (ushort)count )) diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c index b8270280b6..7c198c6026 100644 --- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c +++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c @@ -102,7 +102,7 @@ static void fse_prepare( struct draw_pt_middle_end *middle, fse->key.nr_inputs); /* inputs - fetch from api format */ fse->key.viewport = !draw->identity_viewport; - fse->key.clip = !draw->bypass_clipping; + fse->key.clip = draw->clip_xy || draw->clip_z || draw->clip_user; fse->key.const_vbuffers = 0; memset(fse->key.element, 0, @@ -175,15 +175,6 @@ static void fse_prepare( struct draw_pt_middle_end *middle, *max_vertices = (draw->render->max_vertex_buffer_bytes / (vinfo->size * 4)); - /* Return an even number of verts. - * This prevents "parity" errors when splitting long triangle strips which - * can lead to front/back culling mix-ups. - * Every other triangle in a strip has an alternate front/back orientation - * so splitting at an odd position can cause the orientation of subsequent - * triangles to get reversed. - */ - *max_vertices = *max_vertices & ~1; - /* Probably need to do this somewhere (or fix exec shader not to * need it): */ @@ -197,7 +188,8 @@ static void fse_prepare( struct draw_pt_middle_end *middle, static void fse_run_linear( struct draw_pt_middle_end *middle, unsigned start, - unsigned count ) + unsigned count, + unsigned prim_flags ) { struct fetch_shade_emit *fse = (struct fetch_shade_emit *)middle; struct draw_context *draw = fse->draw; @@ -207,9 +199,6 @@ static void fse_run_linear( struct draw_pt_middle_end *middle, */ draw_do_flush( draw, DRAW_FLUSH_BACKEND ); - if (count >= UNDEFINED_VERTEX_ID) - goto fail; - if (!draw->render->allocate_vertices( draw->render, (ushort)fse->key.output_stride, (ushort)count )) @@ -265,7 +254,8 @@ fse_run(struct draw_pt_middle_end *middle, const unsigned *fetch_elts, unsigned fetch_count, const ushort *draw_elts, - unsigned draw_count ) + unsigned draw_count, + unsigned prim_flags ) { struct fetch_shade_emit *fse = (struct fetch_shade_emit *)middle; struct draw_context *draw = fse->draw; @@ -275,9 +265,6 @@ fse_run(struct draw_pt_middle_end *middle, */ draw_do_flush( draw, DRAW_FLUSH_BACKEND ); - if (fetch_count >= UNDEFINED_VERTEX_ID) - goto fail; - if (!draw->render->allocate_vertices( draw->render, (ushort)fse->key.output_stride, (ushort)fetch_count )) @@ -327,7 +314,8 @@ static boolean fse_run_linear_elts( struct draw_pt_middle_end *middle, unsigned start, unsigned count, const ushort *draw_elts, - unsigned draw_count ) + unsigned draw_count, + unsigned prim_flags ) { struct fetch_shade_emit *fse = (struct fetch_shade_emit *)middle; struct draw_context *draw = fse->draw; @@ -337,9 +325,6 @@ static boolean fse_run_linear_elts( struct draw_pt_middle_end *middle, */ draw_do_flush( draw, DRAW_FLUSH_BACKEND ); - if (count >= UNDEFINED_VERTEX_ID) - return FALSE; - if (!draw->render->allocate_vertices( draw->render, (ushort)fse->key.output_stride, (ushort)count )) diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c index 5b16c3788e..b72fd61245 100644 --- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c +++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c @@ -100,8 +100,10 @@ static void fetch_pipeline_prepare( struct draw_pt_middle_end *middle, * but gl vs dx9 clip spaces. */ draw_pt_post_vs_prepare( fpme->post_vs, - (boolean)draw->bypass_clipping, - (boolean)draw->identity_viewport, + draw->clip_xy, + draw->clip_z, + draw->clip_user, + draw->identity_viewport, (boolean)draw->rasterizer->gl_rasterization_rules, (draw->vs.edgeflag_output ? TRUE : FALSE) ); @@ -112,16 +114,13 @@ static void fetch_pipeline_prepare( struct draw_pt_middle_end *middle, gs_out_prim, max_vertices ); - *max_vertices = MAX2( *max_vertices, - DRAW_PIPE_MAX_VERTICES ); + *max_vertices = MAX2( *max_vertices, 4096 ); } else { - *max_vertices = DRAW_PIPE_MAX_VERTICES; + /* limit max fetches by limiting max_vertices */ + *max_vertices = 4096; } - /* return even number */ - *max_vertices = *max_vertices & ~1; - /* No need to prepare the shader. */ vs->prepare(vs, draw); @@ -295,7 +294,8 @@ static void fetch_pipeline_run( struct draw_pt_middle_end *middle, const unsigned *fetch_elts, unsigned fetch_count, const ushort *draw_elts, - unsigned draw_count ) + unsigned draw_count, + unsigned prim_flags ) { struct fetch_pipeline_middle_end *fpme = (struct fetch_pipeline_middle_end *)middle; struct draw_fetch_info fetch_info; @@ -311,6 +311,7 @@ static void fetch_pipeline_run( struct draw_pt_middle_end *middle, prim_info.count = draw_count; prim_info.elts = draw_elts; prim_info.prim = fpme->input_prim; + prim_info.flags = prim_flags; prim_info.primitive_count = 1; prim_info.primitive_lengths = &draw_count; @@ -320,7 +321,8 @@ static void fetch_pipeline_run( struct draw_pt_middle_end *middle, static void fetch_pipeline_linear_run( struct draw_pt_middle_end *middle, unsigned start, - unsigned count) + unsigned count, + unsigned prim_flags) { struct fetch_pipeline_middle_end *fpme = (struct fetch_pipeline_middle_end *)middle; struct draw_fetch_info fetch_info; @@ -336,6 +338,7 @@ static void fetch_pipeline_linear_run( struct draw_pt_middle_end *middle, prim_info.count = count; prim_info.elts = NULL; prim_info.prim = fpme->input_prim; + prim_info.flags = prim_flags; prim_info.primitive_count = 1; prim_info.primitive_lengths = &count; @@ -348,7 +351,8 @@ static boolean fetch_pipeline_linear_run_elts( struct draw_pt_middle_end *middle unsigned start, unsigned count, const ushort *draw_elts, - unsigned draw_count ) + unsigned draw_count, + unsigned prim_flags ) { struct fetch_pipeline_middle_end *fpme = (struct fetch_pipeline_middle_end *)middle; struct draw_fetch_info fetch_info; @@ -364,6 +368,7 @@ static boolean fetch_pipeline_linear_run_elts( struct draw_pt_middle_end *middle prim_info.count = draw_count; prim_info.elts = draw_elts; prim_info.prim = fpme->input_prim; + prim_info.flags = prim_flags; prim_info.primitive_count = 1; prim_info.primitive_lengths = &draw_count; diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c index 4b99bee86a..77291e304e 100644 --- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c +++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c @@ -66,7 +66,8 @@ llvm_middle_end_prepare( struct draw_pt_middle_end *middle, struct draw_context *draw = fpme->draw; struct llvm_vertex_shader *shader = llvm_vertex_shader(draw->vs.vertex_shader); - struct draw_llvm_variant_key key; + char store[DRAW_LLVM_MAX_VARIANT_KEY_SIZE]; + struct draw_llvm_variant_key *key; struct draw_llvm_variant *variant = NULL; struct draw_llvm_variant_list_item *li; unsigned i; @@ -106,8 +107,10 @@ llvm_middle_end_prepare( struct draw_pt_middle_end *middle, * but gl vs dx9 clip spaces. */ draw_pt_post_vs_prepare( fpme->post_vs, - (boolean)draw->bypass_clipping, - (boolean)(draw->identity_viewport), + draw->clip_xy, + draw->clip_z, + draw->clip_user, + draw->identity_viewport, (boolean)draw->rasterizer->gl_rasterization_rules, (draw->vs.edgeflag_output ? TRUE : FALSE) ); @@ -118,21 +121,21 @@ llvm_middle_end_prepare( struct draw_pt_middle_end *middle, out_prim, max_vertices ); - *max_vertices = MAX2( *max_vertices, - DRAW_PIPE_MAX_VERTICES ); + *max_vertices = MAX2( *max_vertices, 4096 ); } else { - *max_vertices = DRAW_PIPE_MAX_VERTICES; + /* limit max fetches by limiting max_vertices */ + *max_vertices = 4096; } /* return even number */ *max_vertices = *max_vertices & ~1; - - draw_llvm_make_variant_key(fpme->llvm, &key); + + key = draw_llvm_make_variant_key(fpme->llvm, store); li = first_elem(&shader->variants); while(!at_end(&shader->variants, li)) { - if(memcmp(&li->base->key, &key, sizeof key) == 0) { + if(memcmp(&li->base->key, key, shader->variant_key_size) == 0) { variant = li->base; break; } @@ -155,7 +158,7 @@ llvm_middle_end_prepare( struct draw_pt_middle_end *middle, } } - variant = draw_llvm_create_variant(fpme->llvm, nr); + variant = draw_llvm_create_variant(fpme->llvm, nr, key); if (variant) { insert_at_head(&shader->variants, &variant->list_item_local); @@ -294,7 +297,8 @@ static void llvm_middle_end_run( struct draw_pt_middle_end *middle, const unsigned *fetch_elts, unsigned fetch_count, const ushort *draw_elts, - unsigned draw_count ) + unsigned draw_count, + unsigned prim_flags ) { struct llvm_middle_end *fpme = (struct llvm_middle_end *)middle; struct draw_fetch_info fetch_info; @@ -310,6 +314,7 @@ static void llvm_middle_end_run( struct draw_pt_middle_end *middle, prim_info.count = draw_count; prim_info.elts = draw_elts; prim_info.prim = fpme->input_prim; + prim_info.flags = prim_flags; prim_info.primitive_count = 1; prim_info.primitive_lengths = &draw_count; @@ -319,7 +324,8 @@ static void llvm_middle_end_run( struct draw_pt_middle_end *middle, static void llvm_middle_end_linear_run( struct draw_pt_middle_end *middle, unsigned start, - unsigned count) + unsigned count, + unsigned prim_flags) { struct llvm_middle_end *fpme = (struct llvm_middle_end *)middle; struct draw_fetch_info fetch_info; @@ -335,6 +341,7 @@ static void llvm_middle_end_linear_run( struct draw_pt_middle_end *middle, prim_info.count = count; prim_info.elts = NULL; prim_info.prim = fpme->input_prim; + prim_info.flags = prim_flags; prim_info.primitive_count = 1; prim_info.primitive_lengths = &count; @@ -348,7 +355,8 @@ llvm_middle_end_linear_run_elts( struct draw_pt_middle_end *middle, unsigned start, unsigned count, const ushort *draw_elts, - unsigned draw_count ) + unsigned draw_count, + unsigned prim_flags ) { struct llvm_middle_end *fpme = (struct llvm_middle_end *)middle; struct draw_fetch_info fetch_info; @@ -364,6 +372,7 @@ llvm_middle_end_linear_run_elts( struct draw_pt_middle_end *middle, prim_info.count = draw_count; prim_info.elts = draw_elts; prim_info.prim = fpme->input_prim; + prim_info.flags = prim_flags; prim_info.primitive_count = 1; prim_info.primitive_lengths = &draw_count; diff --git a/src/gallium/auxiliary/draw/draw_pt_post_vs.c b/src/gallium/auxiliary/draw/draw_pt_post_vs.c index 308f927b77..769409cfd6 100644 --- a/src/gallium/auxiliary/draw/draw_pt_post_vs.c +++ b/src/gallium/auxiliary/draw/draw_pt_post_vs.c @@ -26,14 +26,26 @@ **************************************************************************/ #include "util/u_memory.h" +#include "util/u_math.h" #include "pipe/p_context.h" #include "draw/draw_context.h" #include "draw/draw_private.h" #include "draw/draw_pt.h" + +#define DO_CLIP_XY 0x1 +#define DO_CLIP_FULL_Z 0x2 +#define DO_CLIP_HALF_Z 0x4 +#define DO_CLIP_USER 0x8 +#define DO_VIEWPORT 0x10 +#define DO_EDGEFLAG 0x20 + + struct pt_post_vs { struct draw_context *draw; + unsigned flags; + boolean (*run)( struct pt_post_vs *pvs, struct draw_vertex_info *info ); }; @@ -56,186 +68,47 @@ dot4(const float *a, const float *b) a[3]*b[3]); } -static INLINE unsigned -compute_clipmask_gl(const float *clip, /*const*/ float plane[][4], unsigned nr, - boolean clip_depth) -{ - unsigned mask = 0x0; - unsigned i; +#define FLAGS (0) +#define TAG(x) x##_none +#include "draw_cliptest_tmp.h" -#if 0 - debug_printf("compute clipmask %f %f %f %f\n", - clip[0], clip[1], clip[2], clip[3]); - assert(clip[3] != 0.0); -#endif +#define FLAGS (DO_CLIP_XY | DO_CLIP_FULL_Z | DO_VIEWPORT) +#define TAG(x) x##_xy_fullz_viewport +#include "draw_cliptest_tmp.h" - /* Do the hardwired planes first: - */ - if (-clip[0] + clip[3] < 0) mask |= (1<<0); - if ( clip[0] + clip[3] < 0) mask |= (1<<1); - if (-clip[1] + clip[3] < 0) mask |= (1<<2); - if ( clip[1] + clip[3] < 0) mask |= (1<<3); - if (clip_depth) { - if ( clip[2] + clip[3] < 0) mask |= (1<<4); /* match mesa clipplane numbering - for now */ - if (-clip[2] + clip[3] < 0) mask |= (1<<5); /* match mesa clipplane numbering - for now */ - } +#define FLAGS (DO_CLIP_XY | DO_CLIP_HALF_Z | DO_VIEWPORT) +#define TAG(x) x##_xy_halfz_viewport +#include "draw_cliptest_tmp.h" - /* Followed by any remaining ones: - */ - for (i = 6; i < nr; i++) { - if (dot4(clip, plane[i]) < 0) - mask |= (1<<i); - } +#define FLAGS (DO_CLIP_FULL_Z | DO_VIEWPORT) +#define TAG(x) x##_fullz_viewport +#include "draw_cliptest_tmp.h" - return mask; -} +#define FLAGS (DO_CLIP_HALF_Z | DO_VIEWPORT) +#define TAG(x) x##_halfz_viewport +#include "draw_cliptest_tmp.h" +#define FLAGS (DO_CLIP_XY | DO_CLIP_FULL_Z | DO_CLIP_USER | DO_VIEWPORT) +#define TAG(x) x##_xy_fullz_user_viewport +#include "draw_cliptest_tmp.h" -/* The normal case - cliptest, rhw divide, viewport transform. - * - * Also handle identity viewport here at the expense of a few wasted - * instructions - */ -static boolean post_vs_cliptest_viewport_gl( struct pt_post_vs *pvs, - struct draw_vertex_info *info ) -{ - struct vertex_header *out = info->verts; - const float *scale = pvs->draw->viewport.scale; - const float *trans = pvs->draw->viewport.translate; - const unsigned pos = draw_current_shader_position_output(pvs->draw); - unsigned clipped = 0; - unsigned j; - - if (0) debug_printf("%s count, %d\n", __FUNCTION__, info->count); - - for (j = 0; j < info->count; j++) { - float *position = out->data[pos]; - - initialize_vertex_header(out); -#if 0 - debug_printf("%d) io = %p, data = %p = [%f, %f, %f, %f]\n", - j, out, position, position[0], position[1], position[2], position[3]); -#endif - - out->clip[0] = position[0]; - out->clip[1] = position[1]; - out->clip[2] = position[2]; - out->clip[3] = position[3]; - - out->vertex_id = 0xffff; - /* Disable depth clipping if depth clamping is enabled. */ - out->clipmask = compute_clipmask_gl(out->clip, - pvs->draw->plane, - pvs->draw->nr_planes, - !pvs->draw->depth_clamp); - clipped += out->clipmask; - - if (out->clipmask == 0) - { - /* divide by w */ - float w = 1.0f / position[3]; - - /* Viewport mapping */ - position[0] = position[0] * w * scale[0] + trans[0]; - position[1] = position[1] * w * scale[1] + trans[1]; - position[2] = position[2] * w * scale[2] + trans[2]; - position[3] = w; -#if 0 - debug_printf("post viewport: %f %f %f %f\n", - position[0], - position[1], - position[2], - position[3]); -#endif - } - - out = (struct vertex_header *)( (char *)out + info->stride ); - } - - return clipped != 0; -} +#define FLAGS (DO_CLIP_XY | DO_CLIP_FULL_Z | DO_CLIP_USER | DO_VIEWPORT | DO_EDGEFLAG) +#define TAG(x) x##_xy_fullz_user_viewport_edgeflag +#include "draw_cliptest_tmp.h" -/* As above plus edgeflags +/* Don't want to create 64 versions of this function, so catch the + * less common ones here. This is looking like something which should + * be code-generated, perhaps appended to the end of the vertex + * shader. */ -static boolean -post_vs_cliptest_viewport_gl_edgeflag(struct pt_post_vs *pvs, - struct draw_vertex_info *info) -{ - unsigned j; - boolean needpipe; - - needpipe = post_vs_cliptest_viewport_gl(pvs, info); - - /* If present, copy edgeflag VS output into vertex header. - * Otherwise, leave header as is. - */ - if (pvs->draw->vs.edgeflag_output) { - struct vertex_header *out = info->verts; - int ef = pvs->draw->vs.edgeflag_output; - - for (j = 0; j < info->count; j++) { - const float *edgeflag = out->data[ef]; - out->edgeflag = !(edgeflag[0] != 1.0f); - needpipe |= !out->edgeflag; - out = (struct vertex_header *)( (char *)out + info->stride ); - } - } - return needpipe; -} - +#define FLAGS (pvs->flags) +#define TAG(x) x##_generic +#include "draw_cliptest_tmp.h" -/* If bypass_clipping is set, skip cliptest and rhw divide. - */ -static boolean post_vs_viewport( struct pt_post_vs *pvs, - struct draw_vertex_info *info ) -{ - struct vertex_header *out = info->verts; - const float *scale = pvs->draw->viewport.scale; - const float *trans = pvs->draw->viewport.translate; - const unsigned pos = draw_current_shader_position_output(pvs->draw); - unsigned j; - - if (0) debug_printf("%s\n", __FUNCTION__); - for (j = 0; j < info->count; j++) { - float *position = out->data[pos]; - - initialize_vertex_header(out); - /* Viewport mapping only, no cliptest/rhw divide - */ - position[0] = position[0] * scale[0] + trans[0]; - position[1] = position[1] * scale[1] + trans[1]; - position[2] = position[2] * scale[2] + trans[2]; - - out = (struct vertex_header *)((char *)out + info->stride); - } - - return FALSE; -} - - -/* If bypass_clipping is set and we have an identity viewport, nothing - * to do. - */ -static boolean post_vs_none( struct pt_post_vs *pvs, - struct draw_vertex_info *info ) -{ - struct vertex_header *out = info->verts; - unsigned j; - - if (0) debug_printf("%s\n", __FUNCTION__); - /* just initialize the vertex_id in all headers */ - for (j = 0; j < info->count; j++) { - initialize_vertex_header(out); - - out = (struct vertex_header *)((char *)out + info->stride); - } - return FALSE; -} - boolean draw_pt_post_vs_run( struct pt_post_vs *pvs, struct draw_vertex_info *info ) { @@ -244,31 +117,72 @@ boolean draw_pt_post_vs_run( struct pt_post_vs *pvs, void draw_pt_post_vs_prepare( struct pt_post_vs *pvs, - boolean bypass_clipping, + boolean clip_xy, + boolean clip_z, + boolean clip_user, boolean bypass_viewport, boolean opengl, boolean need_edgeflags ) { - if (!need_edgeflags) { - if (bypass_clipping) { - if (bypass_viewport) - pvs->run = post_vs_none; - else - pvs->run = post_vs_viewport; - } - else { - /* if (opengl) */ - pvs->run = post_vs_cliptest_viewport_gl; - } + pvs->flags = 0; + + if (clip_xy) + pvs->flags |= DO_CLIP_XY; + + if (clip_z && opengl) { + pvs->flags |= DO_CLIP_FULL_Z; + ASSIGN_4V( pvs->draw->plane[4], 0, 0, 1, 1 ); + } + + if (clip_z && !opengl) { + pvs->flags |= DO_CLIP_HALF_Z; + ASSIGN_4V( pvs->draw->plane[4], 0, 0, 1, 0 ); } - else { - /* If we need to copy edgeflags to the vertex header, it should - * mean we're running the primitive pipeline. Hence the bypass - * flags should be false. - */ - assert(!bypass_clipping); - assert(!bypass_viewport); - pvs->run = post_vs_cliptest_viewport_gl_edgeflag; + + if (clip_user) + pvs->flags |= DO_CLIP_USER; + + if (!bypass_viewport) + pvs->flags |= DO_VIEWPORT; + + if (need_edgeflags) + pvs->flags |= DO_EDGEFLAG; + + /* Now select the relevant function: + */ + switch (pvs->flags) { + case 0: + pvs->run = do_cliptest_none; + break; + + case DO_CLIP_XY | DO_CLIP_FULL_Z | DO_VIEWPORT: + pvs->run = do_cliptest_xy_fullz_viewport; + break; + + case DO_CLIP_XY | DO_CLIP_HALF_Z | DO_VIEWPORT: + pvs->run = do_cliptest_xy_halfz_viewport; + break; + + case DO_CLIP_FULL_Z | DO_VIEWPORT: + pvs->run = do_cliptest_fullz_viewport; + break; + + case DO_CLIP_HALF_Z | DO_VIEWPORT: + pvs->run = do_cliptest_halfz_viewport; + break; + + case DO_CLIP_XY | DO_CLIP_FULL_Z | DO_CLIP_USER | DO_VIEWPORT: + pvs->run = do_cliptest_xy_fullz_user_viewport; + break; + + case (DO_CLIP_XY | DO_CLIP_FULL_Z | DO_CLIP_USER | + DO_VIEWPORT | DO_EDGEFLAG): + pvs->run = do_cliptest_xy_fullz_user_viewport_edgeflag; + break; + + default: + pvs->run = do_cliptest_generic; + break; } } diff --git a/src/gallium/auxiliary/draw/draw_pt_so_emit.c b/src/gallium/auxiliary/draw/draw_pt_so_emit.c index f7f4f24d35..c86bdd99a3 100644 --- a/src/gallium/auxiliary/draw/draw_pt_so_emit.c +++ b/src/gallium/auxiliary/draw/draw_pt_so_emit.c @@ -225,7 +225,7 @@ static void so_tri(struct pt_so_emit *so, int i0, int i1, int i2) #define FUNC so_run_elts #define LOCAL_VARS const ushort *elts = input_prims->elts; -#define GET_ELT(idx) (elts[start + (idx)] & ~DRAW_PIPE_FLAG_MASK) +#define GET_ELT(idx) (elts[start + (idx)]) #include "draw_so_emit_tmp.h" diff --git a/src/gallium/auxiliary/draw/draw_pt_util.c b/src/gallium/auxiliary/draw/draw_pt_util.c index 182a597cca..513bbbed21 100644 --- a/src/gallium/auxiliary/draw/draw_pt_util.c +++ b/src/gallium/auxiliary/draw/draw_pt_util.c @@ -92,3 +92,10 @@ void draw_pt_split_prim(unsigned prim, unsigned *first, unsigned *incr) break; } } + +unsigned draw_pt_trim_count(unsigned count, unsigned first, unsigned incr) +{ + if (count < first) + return 0; + return count - (count - first) % incr; +} diff --git a/src/gallium/auxiliary/draw/draw_pt_varray.c b/src/gallium/auxiliary/draw/draw_pt_varray.c deleted file mode 100644 index cd7bb7bf25..0000000000 --- a/src/gallium/auxiliary/draw/draw_pt_varray.c +++ /dev/null @@ -1,200 +0,0 @@ -/************************************************************************** - * - * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. - * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ - -#include "util/u_math.h" -#include "util/u_memory.h" - -#include "draw/draw_context.h" -#include "draw/draw_private.h" -#include "draw/draw_pt.h" - -#define FETCH_MAX 256 -#define DRAW_MAX (FETCH_MAX+8) - -struct varray_frontend { - struct draw_pt_front_end base; - struct draw_context *draw; - - ushort draw_elts[DRAW_MAX]; - unsigned fetch_elts[FETCH_MAX]; - - unsigned driver_fetch_max; - unsigned fetch_max; - - struct draw_pt_middle_end *middle; - - unsigned input_prim; - unsigned output_prim; -}; - - -static void varray_flush_linear(struct varray_frontend *varray, - unsigned start, unsigned count) -{ - if (count) { - assert(varray->middle->run_linear); - varray->middle->run_linear(varray->middle, start, count); - } -} - -static void varray_line_loop_segment(struct varray_frontend *varray, - unsigned start, - unsigned segment_start, - unsigned segment_count, - boolean end ) -{ - assert(segment_count < varray->fetch_max); - if (segment_count >= 1) { - unsigned nr = 0, i; - - for (i = 0; i < segment_count; i++) - varray->fetch_elts[nr++] = start + segment_start + i; - - if (end) - varray->fetch_elts[nr++] = start; - - assert(nr <= FETCH_MAX); - - varray->middle->run(varray->middle, - varray->fetch_elts, - nr, - varray->draw_elts, /* ie. linear */ - nr); - } -} - - - -static void varray_fan_segment(struct varray_frontend *varray, - unsigned start, - unsigned segment_start, - unsigned segment_count ) -{ - assert(segment_count < varray->fetch_max); - if (segment_count >= 2) { - unsigned nr = 0, i; - - if (segment_start != 0) - varray->fetch_elts[nr++] = start; - - for (i = 0 ; i < segment_count; i++) - varray->fetch_elts[nr++] = start + segment_start + i; - - assert(nr <= FETCH_MAX); - - varray->middle->run(varray->middle, - varray->fetch_elts, - nr, - varray->draw_elts, /* ie. linear */ - nr); - } -} - - - - -#define FUNC varray_run -#include "draw_pt_varray_tmp_linear.h" - -static unsigned decompose_prim[PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY + 1] = { - PIPE_PRIM_POINTS, - PIPE_PRIM_LINES, - PIPE_PRIM_LINE_STRIP, /* decomposed LINELOOP */ - PIPE_PRIM_LINE_STRIP, - PIPE_PRIM_TRIANGLES, - PIPE_PRIM_TRIANGLE_STRIP, - PIPE_PRIM_TRIANGLE_FAN, - PIPE_PRIM_QUADS, - PIPE_PRIM_QUAD_STRIP, - PIPE_PRIM_POLYGON, - PIPE_PRIM_LINES_ADJACENCY, - PIPE_PRIM_LINE_STRIP_ADJACENCY, - PIPE_PRIM_TRIANGLES_ADJACENCY, - PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY -}; - - - -static void varray_prepare(struct draw_pt_front_end *frontend, - unsigned in_prim, - struct draw_pt_middle_end *middle, - unsigned opt) -{ - struct varray_frontend *varray = (struct varray_frontend *)frontend; - - varray->base.run = varray_run; - - varray->input_prim = in_prim; - assert(in_prim < Elements(decompose_prim)); - varray->output_prim = decompose_prim[in_prim]; - - varray->middle = middle; - middle->prepare(middle, - varray->output_prim, - opt, &varray->driver_fetch_max ); - - /* check that the max is even */ - assert((varray->driver_fetch_max & 1) == 0); - - varray->fetch_max = MIN2(FETCH_MAX, varray->driver_fetch_max); -} - - - - -static void varray_finish(struct draw_pt_front_end *frontend) -{ - struct varray_frontend *varray = (struct varray_frontend *)frontend; - varray->middle->finish(varray->middle); - varray->middle = NULL; -} - -static void varray_destroy(struct draw_pt_front_end *frontend) -{ - FREE(frontend); -} - - -struct draw_pt_front_end *draw_pt_varray(struct draw_context *draw) -{ - ushort i; - struct varray_frontend *varray = CALLOC_STRUCT(varray_frontend); - if (varray == NULL) - return NULL; - - varray->base.prepare = varray_prepare; - varray->base.run = NULL; - varray->base.finish = varray_finish; - varray->base.destroy = varray_destroy; - varray->draw = draw; - - for (i = 0; i < DRAW_MAX; i++) { - varray->draw_elts[i] = i; - } - - return &varray->base; -} diff --git a/src/gallium/auxiliary/draw/draw_pt_varray_tmp.h b/src/gallium/auxiliary/draw/draw_pt_varray_tmp.h deleted file mode 100644 index 7c722457c3..0000000000 --- a/src/gallium/auxiliary/draw/draw_pt_varray_tmp.h +++ /dev/null @@ -1,238 +0,0 @@ - -static void FUNC(struct draw_pt_front_end *frontend, - pt_elt_func get_elt, - const void *elts, - unsigned count) -{ - struct varray_frontend *varray = (struct varray_frontend *)frontend; - struct draw_context *draw = varray->draw; - unsigned start = (unsigned)elts; - - boolean flatfirst = (draw->rasterizer->flatshade && - draw->rasterizer->flatshade_first); - unsigned i, j; - ushort flags; - unsigned first, incr; - - varray->fetch_start = start; - - draw_pt_split_prim(varray->input_prim, &first, &incr); - -#if 0 - debug_printf("%s (%d) %d/%d\n", __FUNCTION__, - varray->input_prim, - start, count); -#endif - - switch (varray->input_prim) { - case PIPE_PRIM_POINTS: - for (j = 0; j + first <= count; j += i) { - unsigned end = MIN2(FETCH_MAX, count - j); - end -= (end % incr); - for (i = 0; i < end; i++) { - POINT(varray, i + 0); - } - i = end; - fetch_init(varray, end); - varray_flush(varray); - } - break; - - case PIPE_PRIM_LINES: - for (j = 0; j + first <= count; j += i) { - unsigned end = MIN2(FETCH_MAX, count - j); - end -= (end % incr); - for (i = 0; i+1 < end; i += 2) { - LINE(varray, DRAW_PIPE_RESET_STIPPLE, - i + 0, i + 1); - } - i = end; - fetch_init(varray, end); - varray_flush(varray); - } - break; - - case PIPE_PRIM_LINE_LOOP: - if (count >= 2) { - flags = DRAW_PIPE_RESET_STIPPLE; - - for (j = 0; j + first <= count; j += i) { - unsigned end = MIN2(FETCH_MAX, count - j); - end -= (end % incr); - for (i = 1; i < end; i++, flags = 0) { - LINE(varray, flags, i - 1, i); - } - LINE(varray, flags, i - 1, 0); - i = end; - fetch_init(varray, end); - varray_flush(varray); - } - } - break; - - case PIPE_PRIM_LINE_STRIP: - flags = DRAW_PIPE_RESET_STIPPLE; - for (j = 0; j + first <= count; j += i) { - unsigned end = MIN2(FETCH_MAX, count - j); - end -= (end % incr); - for (i = 1; i < end; i++, flags = 0) { - LINE(varray, flags, i - 1, i); - } - i = end; - fetch_init(varray, end); - varray_flush(varray); - } - break; - - case PIPE_PRIM_TRIANGLES: - for (j = 0; j + first <= count; j += i) { - unsigned end = MIN2(FETCH_MAX, count - j); - end -= (end % incr); - for (i = 0; i+2 < end; i += 3) { - TRIANGLE(varray, DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL, - i + 0, i + 1, i + 2); - } - i = end; - fetch_init(varray, end); - varray_flush(varray); - } - break; - - case PIPE_PRIM_TRIANGLE_STRIP: - if (flatfirst) { - for (j = 0; j + first <= count; j += i) { - unsigned end = MIN2(FETCH_MAX, count - j); - end -= (end % incr); - for (i = 0; i+2 < end; i++) { - TRIANGLE(varray, DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL, - i + 0, i + 1 + (i&1), i + 2 - (i&1)); - } - i = end; - fetch_init(varray, end); - varray_flush(varray); - if (j + first + i <= count) { - varray->fetch_start -= 2; - i -= 2; - } - } - } - else { - for (j = 0; j + first <= count; j += i) { - unsigned end = MIN2(FETCH_MAX, count - j); - end -= (end % incr); - for (i = 0; i + 2 < end; i++) { - TRIANGLE(varray, DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL, - i + 0 + (i&1), i + 1 - (i&1), i + 2); - } - i = end; - fetch_init(varray, end); - varray_flush(varray); - if (j + first + i <= count) { - varray->fetch_start -= 2; - i -= 2; - } - } - } - break; - - case PIPE_PRIM_TRIANGLE_FAN: - if (count >= 3) { - if (flatfirst) { - flags = DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL; - for (j = 0; j + first <= count; j += i) { - unsigned end = MIN2(FETCH_MAX, count - j); - end -= (end % incr); - for (i = 0; i+2 < end; i++) { - TRIANGLE(varray, flags, i + 1, i + 2, 0); - } - i = end; - fetch_init(varray, end); - varray_flush(varray); - } - } - else { - flags = DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL; - for (j = 0; j + first <= count; j += i) { - unsigned end = MIN2(FETCH_MAX, count - j); - end -= (end % incr); - for (i = 0; i+2 < end; i++) { - TRIANGLE(varray, flags, 0, i + 1, i + 2); - } - i = end; - fetch_init(varray, end); - varray_flush(varray); - } - } - } - break; - - case PIPE_PRIM_QUADS: - for (j = 0; j + first <= count; j += i) { - unsigned end = MIN2(FETCH_MAX, count - j); - end -= (end % incr); - for (i = 0; i+3 < end; i += 4) { - QUAD(varray, i + 0, i + 1, i + 2, i + 3); - } - i = end; - fetch_init(varray, end); - varray_flush(varray); - } - break; - - case PIPE_PRIM_QUAD_STRIP: - for (j = 0; j + first <= count; j += i) { - unsigned end = MIN2(FETCH_MAX, count - j); - end -= (end % incr); - for (i = 0; i+3 < end; i += 2) { - QUAD(varray, i + 2, i + 0, i + 1, i + 3); - } - i = end; - fetch_init(varray, end); - varray_flush(varray); - if (j + first + i <= count) { - varray->fetch_start -= 2; - i -= 2; - } - } - break; - - case PIPE_PRIM_POLYGON: - { - /* These bitflags look a little odd because we submit the - * vertices as (1,2,0) to satisfy flatshade requirements. - */ - const ushort edge_first = DRAW_PIPE_EDGE_FLAG_2; - const ushort edge_middle = DRAW_PIPE_EDGE_FLAG_0; - const ushort edge_last = DRAW_PIPE_EDGE_FLAG_1; - - flags = DRAW_PIPE_RESET_STIPPLE | edge_first | edge_middle; - for (j = 0; j + first <= count; j += i) { - unsigned end = MIN2(FETCH_MAX, count - j); - end -= (end % incr); - for (i = 0; i+2 < end; i++, flags = edge_middle) { - - if (i + 3 == count) - flags |= edge_last; - - TRIANGLE(varray, flags, i + 1, i + 2, 0); - } - i = end; - fetch_init(varray, end); - varray_flush(varray); - } - } - break; - - default: - assert(0); - break; - } - - varray_flush(varray); -} - -#undef TRIANGLE -#undef QUAD -#undef POINT -#undef LINE -#undef FUNC diff --git a/src/gallium/auxiliary/draw/draw_pt_varray_tmp_linear.h b/src/gallium/auxiliary/draw/draw_pt_varray_tmp_linear.h deleted file mode 100644 index 55e43b2a71..0000000000 --- a/src/gallium/auxiliary/draw/draw_pt_varray_tmp_linear.h +++ /dev/null @@ -1,103 +0,0 @@ -static unsigned trim( unsigned count, unsigned first, unsigned incr ) -{ - /* - * count either has been trimmed in draw_pt_arrays or is set to - * (driver)_fetch_max which is hopefully always larger than first. - */ - assert(count >= first); - return count - (count - first) % incr; -} - -static void FUNC(struct draw_pt_front_end *frontend, - pt_elt_func get_elt, - const void *elts, - int elt_bias, - unsigned count) -{ - struct varray_frontend *varray = (struct varray_frontend *)frontend; - unsigned start = (unsigned) ((char *) elts - (char *) NULL); - - unsigned j; - unsigned first, incr; - - assert(elt_bias == 0); - - draw_pt_split_prim(varray->input_prim, &first, &incr); - - /* Sanitize primitive length: - */ - count = trim(count, first, incr); - if (count < first) - return; - -#if 0 - debug_printf("%s (%d) %d/%d\n", __FUNCTION__, - varray->input_prim, - start, count); -#endif - - switch (varray->input_prim) { - case PIPE_PRIM_POINTS: - case PIPE_PRIM_LINES: - case PIPE_PRIM_TRIANGLES: - case PIPE_PRIM_LINE_STRIP: - case PIPE_PRIM_TRIANGLE_STRIP: - case PIPE_PRIM_QUADS: - case PIPE_PRIM_QUAD_STRIP: - case PIPE_PRIM_LINES_ADJACENCY: - case PIPE_PRIM_LINE_STRIP_ADJACENCY: - case PIPE_PRIM_TRIANGLES_ADJACENCY: - case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY: - for (j = 0; j < count;) { - unsigned remaining = count - j; - unsigned nr = trim( MIN2(varray->driver_fetch_max, remaining), first, incr ); - varray_flush_linear(varray, start + j, nr); - j += nr; - if (nr != remaining) - j -= (first - incr); - } - break; - - case PIPE_PRIM_LINE_LOOP: - /* Always have to decompose as we've stated that this will be - * emitted as a line-strip. - */ - for (j = 0; j < count;) { - unsigned remaining = count - j; - unsigned nr = trim( MIN2(varray->fetch_max-1, remaining), first, incr ); - varray_line_loop_segment(varray, start, j, nr, nr == remaining); - j += nr; - if (nr != remaining) - j -= (first - incr); - } - break; - - - case PIPE_PRIM_POLYGON: - case PIPE_PRIM_TRIANGLE_FAN: - if (count < varray->driver_fetch_max) { - varray_flush_linear(varray, start, count); - } - else { - for ( j = 0; j < count;) { - unsigned remaining = count - j; - unsigned nr = trim( MIN2(varray->fetch_max-1, remaining), first, incr ); - varray_fan_segment(varray, start, j, nr); - j += nr; - if (nr != remaining) - j -= (first - incr); - } - } - break; - - default: - assert(0); - break; - } -} - -#undef TRIANGLE -#undef QUAD -#undef POINT -#undef LINE -#undef FUNC diff --git a/src/gallium/auxiliary/draw/draw_pt_vcache.c b/src/gallium/auxiliary/draw/draw_pt_vcache.c deleted file mode 100644 index a848b54f7d..0000000000 --- a/src/gallium/auxiliary/draw/draw_pt_vcache.c +++ /dev/null @@ -1,610 +0,0 @@ -/************************************************************************** - * - * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. - * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ - - /* - * Authors: - * Keith Whitwell <keith@tungstengraphics.com> - */ - -#include "util/u_memory.h" -#include "util/u_prim.h" -#include "draw/draw_context.h" -#include "draw/draw_private.h" -#include "draw/draw_pt.h" - - -#define CACHE_MAX 256 -#define FETCH_MAX 256 -#define DRAW_MAX (16*1024) - - -struct vcache_frontend { - struct draw_pt_front_end base; - struct draw_context *draw; - - unsigned in[CACHE_MAX]; - ushort out[CACHE_MAX]; - - ushort draw_elts[DRAW_MAX]; - unsigned fetch_elts[FETCH_MAX]; - - unsigned draw_count; - unsigned fetch_count; - unsigned fetch_max; - - struct draw_pt_middle_end *middle; - - unsigned input_prim; - unsigned output_prim; - - unsigned middle_prim; - unsigned opt; -}; - - -static INLINE void -vcache_flush( struct vcache_frontend *vcache ) -{ - if (vcache->middle_prim != vcache->output_prim) { - vcache->middle_prim = vcache->output_prim; - vcache->middle->prepare( vcache->middle, - vcache->middle_prim, - vcache->opt, - &vcache->fetch_max ); - } - - if (vcache->draw_count) { - vcache->middle->run( vcache->middle, - vcache->fetch_elts, - vcache->fetch_count, - vcache->draw_elts, - vcache->draw_count ); - } - - memset(vcache->in, ~0, sizeof(vcache->in)); - vcache->fetch_count = 0; - vcache->draw_count = 0; -} - - -static INLINE void -vcache_check_flush( struct vcache_frontend *vcache ) -{ - if (vcache->draw_count + 6 >= DRAW_MAX || - vcache->fetch_count + 6 >= FETCH_MAX) { - vcache_flush( vcache ); - } -} - - -static INLINE void -vcache_elt( struct vcache_frontend *vcache, - unsigned felt, - ushort flags ) -{ - unsigned idx = felt % CACHE_MAX; - - if (vcache->in[idx] != felt) { - assert(vcache->fetch_count < FETCH_MAX); - - vcache->in[idx] = felt; - vcache->out[idx] = (ushort)vcache->fetch_count; - vcache->fetch_elts[vcache->fetch_count++] = felt; - } - - vcache->draw_elts[vcache->draw_count++] = vcache->out[idx] | flags; -} - - - -static INLINE void -vcache_triangle( struct vcache_frontend *vcache, - unsigned i0, - unsigned i1, - unsigned i2 ) -{ - vcache_elt(vcache, i0, 0); - vcache_elt(vcache, i1, 0); - vcache_elt(vcache, i2, 0); - vcache_check_flush(vcache); -} - - -static INLINE void -vcache_triangle_flags( struct vcache_frontend *vcache, - ushort flags, - unsigned i0, - unsigned i1, - unsigned i2 ) -{ - vcache_elt(vcache, i0, flags); - vcache_elt(vcache, i1, 0); - vcache_elt(vcache, i2, 0); - vcache_check_flush(vcache); -} - - -static INLINE void -vcache_line( struct vcache_frontend *vcache, - unsigned i0, - unsigned i1 ) -{ - vcache_elt(vcache, i0, 0); - vcache_elt(vcache, i1, 0); - vcache_check_flush(vcache); -} - - -static INLINE void -vcache_line_flags( struct vcache_frontend *vcache, - ushort flags, - unsigned i0, - unsigned i1 ) -{ - vcache_elt(vcache, i0, flags); - vcache_elt(vcache, i1, 0); - vcache_check_flush(vcache); -} - - -static INLINE void -vcache_point( struct vcache_frontend *vcache, - unsigned i0 ) -{ - vcache_elt(vcache, i0, 0); - vcache_check_flush(vcache); -} - - -static INLINE void -vcache_line_adj_flags( struct vcache_frontend *vcache, - unsigned flags, - unsigned a0, unsigned i0, unsigned i1, unsigned a1 ) -{ - vcache_elt(vcache, a0, 0); - vcache_elt(vcache, i0, flags); - vcache_elt(vcache, i1, 0); - vcache_elt(vcache, a1, 0); - vcache_check_flush(vcache); -} - - -static INLINE void -vcache_line_adj( struct vcache_frontend *vcache, - unsigned a0, unsigned i0, unsigned i1, unsigned a1 ) -{ - vcache_elt(vcache, a0, 0); - vcache_elt(vcache, i0, 0); - vcache_elt(vcache, i1, 0); - vcache_elt(vcache, a1, 0); - vcache_check_flush(vcache); -} - - -static INLINE void -vcache_triangle_adj_flags( struct vcache_frontend *vcache, - unsigned flags, - unsigned i0, unsigned a0, - unsigned i1, unsigned a1, - unsigned i2, unsigned a2 ) -{ - vcache_elt(vcache, i0, flags); - vcache_elt(vcache, a0, 0); - vcache_elt(vcache, i1, 0); - vcache_elt(vcache, a1, 0); - vcache_elt(vcache, i2, 0); - vcache_elt(vcache, a2, 0); - vcache_check_flush(vcache); -} - - -static INLINE void -vcache_triangle_adj( struct vcache_frontend *vcache, - unsigned i0, unsigned a0, - unsigned i1, unsigned a1, - unsigned i2, unsigned a2 ) -{ - vcache_elt(vcache, i0, 0); - vcache_elt(vcache, a0, 0); - vcache_elt(vcache, i1, 0); - vcache_elt(vcache, a1, 0); - vcache_elt(vcache, i2, 0); - vcache_elt(vcache, a2, 0); - vcache_check_flush(vcache); -} - - -/* At least for now, we're back to using a template include file for - * this. The two paths aren't too different though - it may be - * possible to reunify them. - */ -#define TRIANGLE(flags,i0,i1,i2) vcache_triangle_flags(vcache,flags,i0,i1,i2) -#define LINE(flags,i0,i1) vcache_line_flags(vcache,flags,i0,i1) -#define POINT(i0) vcache_point(vcache,i0) -#define LINE_ADJ(flags,a0,i0,i1,a1) \ - vcache_line_adj_flags(vcache,flags,a0,i0,i1,a1) -#define TRIANGLE_ADJ(flags,i0,a0,i1,a1,i2,a2) \ - vcache_triangle_adj_flags(vcache,flags,i0,a0,i1,a1,i2,a2) -#define FUNC vcache_run_extras -#include "draw_pt_vcache_tmp.h" - -#define TRIANGLE(flags,i0,i1,i2) vcache_triangle(vcache,i0,i1,i2) -#define LINE(flags,i0,i1) vcache_line(vcache,i0,i1) -#define POINT(i0) vcache_point(vcache,i0) -#define LINE_ADJ(flags,a0,i0,i1,a1) \ - vcache_line_adj(vcache,a0,i0,i1,a1) -#define TRIANGLE_ADJ(flags,i0,a0,i1,a1,i2,a2) \ - vcache_triangle_adj(vcache,i0,a0,i1,a1,i2,a2) -#define FUNC vcache_run -#include "draw_pt_vcache_tmp.h" - -static INLINE void -rebase_uint_elts( const unsigned *src, - unsigned count, - int delta, - ushort *dest ) -{ - unsigned i; - for (i = 0; i < count; i++) - dest[i] = (ushort)(src[i] + delta); -} - - -static INLINE void -rebase_ushort_elts( const ushort *src, - unsigned count, - int delta, - ushort *dest ) -{ - unsigned i; - for (i = 0; i < count; i++) - dest[i] = (ushort)(src[i] + delta); -} - - -static INLINE void -rebase_ubyte_elts( const ubyte *src, - unsigned count, - int delta, - ushort *dest ) -{ - unsigned i; - for (i = 0; i < count; i++) - dest[i] = (ushort)(src[i] + delta); -} - - -static INLINE void -translate_uint_elts( const unsigned *src, - unsigned count, - ushort *dest ) -{ - unsigned i; - for (i = 0; i < count; i++) - dest[i] = (ushort)(src[i]); -} - - -static INLINE void -translate_ushort_elts( const ushort *src, - unsigned count, - ushort *dest ) -{ - unsigned i; - for (i = 0; i < count; i++) - dest[i] = (ushort)(src[i]); -} - - -static INLINE void -translate_ubyte_elts( const ubyte *src, - unsigned count, - ushort *dest ) -{ - unsigned i; - for (i = 0; i < count; i++) - dest[i] = (ushort)(src[i]); -} - - - - -#if 0 -static INLINE enum pipe_format -format_from_get_elt( pt_elt_func get_elt ) -{ - switch (draw->pt.user.eltSize) { - case 1: return PIPE_FORMAT_R8_UNORM; - case 2: return PIPE_FORMAT_R16_UNORM; - case 4: return PIPE_FORMAT_R32_UNORM; - default: return PIPE_FORMAT_NONE; - } -} -#endif - - -/** - * Check if any vertex attributes use instance divisors. - * Note that instance divisors complicate vertex fetching so we need - * to take the vcache path when they're in use. - */ -static boolean -any_instance_divisors(const struct draw_context *draw) -{ - uint i; - - for (i = 0; i < draw->pt.nr_vertex_elements; i++) { - uint div = draw->pt.vertex_element[i].instance_divisor; - if (div) - return TRUE; - } - return FALSE; -} - - -static INLINE void -vcache_check_run( struct draw_pt_front_end *frontend, - pt_elt_func get_elt, - const void *elts, - int elt_bias, - unsigned draw_count ) -{ - struct vcache_frontend *vcache = (struct vcache_frontend *)frontend; - struct draw_context *draw = vcache->draw; - const unsigned min_index = draw->pt.user.min_index; - const unsigned max_index = draw->pt.user.max_index; - const unsigned index_size = draw->pt.user.eltSize; - unsigned fetch_count; - const ushort *transformed_elts; - ushort *storage = NULL; - boolean ok = FALSE; - - /* debug: verify indexes are in range [min_index, max_index] */ - if (0) { - unsigned i; - for (i = 0; i < draw_count; i++) { - if (index_size == 1) { - assert( ((const ubyte *) elts)[i] >= min_index); - assert( ((const ubyte *) elts)[i] <= max_index); - } - else if (index_size == 2) { - assert( ((const ushort *) elts)[i] >= min_index); - assert( ((const ushort *) elts)[i] <= max_index); - } - else { - assert(index_size == 4); - assert( ((const uint *) elts)[i] >= min_index); - assert( ((const uint *) elts)[i] <= max_index); - } - } - } - - /* Note: max_index is frequently 0xffffffff so we have to be sure - * that any arithmetic involving max_index doesn't overflow! - */ - if (max_index >= (unsigned) DRAW_PIPE_MAX_VERTICES) - goto fail; - - if (any_instance_divisors(draw)) - goto fail; - - fetch_count = max_index + 1 - min_index; - - if (0) - debug_printf("fetch_count %d fetch_max %d draw_count %d\n", fetch_count, - vcache->fetch_max, - draw_count); - - if (elt_bias + max_index >= DRAW_PIPE_MAX_VERTICES || - fetch_count >= UNDEFINED_VERTEX_ID || - fetch_count > draw_count) { - if (0) debug_printf("fail\n"); - goto fail; - } - - if (vcache->middle_prim != vcache->input_prim) { - vcache->middle_prim = vcache->input_prim; - vcache->middle->prepare( vcache->middle, - vcache->middle_prim, - vcache->opt, - &vcache->fetch_max ); - } - - assert((elt_bias >= 0 && min_index + elt_bias >= min_index) || - (elt_bias < 0 && min_index + elt_bias < min_index)); - - if (min_index == 0 && - index_size == 2) { - transformed_elts = (const ushort *)elts; - } - else { - storage = MALLOC( draw_count * sizeof(ushort) ); - if (!storage) - goto fail; - - if (min_index == 0) { - switch(index_size) { - case 1: - translate_ubyte_elts( (const ubyte *)elts, - draw_count, - storage ); - break; - - case 2: - translate_ushort_elts( (const ushort *)elts, - draw_count, - storage ); - break; - - case 4: - translate_uint_elts( (const uint *)elts, - draw_count, - storage ); - break; - - default: - assert(0); - FREE(storage); - return; - } - } - else { - switch(index_size) { - case 1: - rebase_ubyte_elts( (const ubyte *)elts, - draw_count, - 0 - (int)min_index, - storage ); - break; - - case 2: - rebase_ushort_elts( (const ushort *)elts, - draw_count, - 0 - (int)min_index, - storage ); - break; - - case 4: - rebase_uint_elts( (const uint *)elts, - draw_count, - 0 - (int)min_index, - storage ); - break; - - default: - assert(0); - FREE(storage); - return; - } - } - transformed_elts = storage; - } - - if (fetch_count < UNDEFINED_VERTEX_ID) - ok = vcache->middle->run_linear_elts( vcache->middle, - min_index + elt_bias, /* start */ - fetch_count, - transformed_elts, - draw_count ); - - FREE(storage); - - if (ok) - return; - - debug_printf("failed to execute atomic draw elts for %d/%d, splitting up\n", - fetch_count, draw_count); - -fail: - vcache_run( frontend, get_elt, elts, elt_bias, draw_count ); -} - - - - -static void -vcache_prepare( struct draw_pt_front_end *frontend, - unsigned in_prim, - struct draw_pt_middle_end *middle, - unsigned opt ) -{ - struct vcache_frontend *vcache = (struct vcache_frontend *)frontend; - - if (opt & PT_PIPELINE) { - vcache->base.run = vcache_run_extras; - } - else { - vcache->base.run = vcache_check_run; - } - - /* VCache will always emit the reduced version of its input - * primitive, ie STRIP/FANS become TRIS, etc. - * - * This is not to be confused with what the GS might be up to, - * which is a separate issue. - */ - vcache->input_prim = in_prim; - switch (in_prim) { - case PIPE_PRIM_LINES_ADJACENCY: - case PIPE_PRIM_LINE_STRIP_ADJACENCY: - vcache->output_prim = PIPE_PRIM_LINES_ADJACENCY; - break; - case PIPE_PRIM_TRIANGLES_ADJACENCY: - case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY: - vcache->output_prim = PIPE_PRIM_TRIANGLES_ADJACENCY; - break; - default: - vcache->output_prim = u_reduced_prim(in_prim); - } - - vcache->middle = middle; - vcache->opt = opt; - - /* Have to run prepare here, but try and guess a good prim for - * doing so: - */ - vcache->middle_prim = (opt & PT_PIPELINE) - ? vcache->output_prim : vcache->input_prim; - - middle->prepare( middle, - vcache->middle_prim, - opt, &vcache->fetch_max ); -} - - -static void -vcache_finish( struct draw_pt_front_end *frontend ) -{ - struct vcache_frontend *vcache = (struct vcache_frontend *)frontend; - vcache->middle->finish( vcache->middle ); - vcache->middle = NULL; -} - - -static void -vcache_destroy( struct draw_pt_front_end *frontend ) -{ - FREE(frontend); -} - - -struct draw_pt_front_end *draw_pt_vcache( struct draw_context *draw ) -{ - struct vcache_frontend *vcache = CALLOC_STRUCT( vcache_frontend ); - if (vcache == NULL) - return NULL; - - vcache->base.prepare = vcache_prepare; - vcache->base.run = NULL; - vcache->base.finish = vcache_finish; - vcache->base.destroy = vcache_destroy; - vcache->draw = draw; - - memset(vcache->in, ~0, sizeof(vcache->in)); - - return &vcache->base; -} diff --git a/src/gallium/auxiliary/draw/draw_pt_vcache_tmp.h b/src/gallium/auxiliary/draw/draw_pt_vcache_tmp.h deleted file mode 100644 index 1a3748d5f0..0000000000 --- a/src/gallium/auxiliary/draw/draw_pt_vcache_tmp.h +++ /dev/null @@ -1,19 +0,0 @@ -#define FUNC_VARS \ - struct draw_pt_front_end *frontend, \ - pt_elt_func get_elt, \ - const void *elts, \ - int elt_bias, \ - unsigned count - -#define LOCAL_VARS \ - struct vcache_frontend *vcache = (struct vcache_frontend *) frontend; \ - struct draw_context *draw = vcache->draw; \ - const unsigned prim = vcache->input_prim; \ - const boolean last_vertex_last = !(draw->rasterizer->flatshade && \ - draw->rasterizer->flatshade_first); - -#define GET_ELT(idx) (get_elt(elts, idx) + elt_bias) - -#define FUNC_EXIT do { vcache_flush(vcache); } while (0) - -#include "draw_decompose_tmp.h" diff --git a/src/gallium/auxiliary/draw/draw_pt_vsplit.c b/src/gallium/auxiliary/draw/draw_pt_vsplit.c new file mode 100644 index 0000000000..a687525309 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_pt_vsplit.c @@ -0,0 +1,208 @@ +/* + * Mesa 3-D graphics library + * Version: 7.9 + * + * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * Copyright (C) 2010 LunarG Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#include "util/u_math.h" +#include "util/u_memory.h" + +#include "draw/draw_context.h" +#include "draw/draw_private.h" +#include "draw/draw_pt.h" + +#define SEGMENT_SIZE 1024 +#define MAP_SIZE 256 + +struct vsplit_frontend { + struct draw_pt_front_end base; + struct draw_context *draw; + + unsigned prim; + + struct draw_pt_middle_end *middle; + + unsigned max_vertices; + ushort segment_size; + + /* buffers for splitting */ + unsigned fetch_elts[SEGMENT_SIZE]; + ushort draw_elts[SEGMENT_SIZE]; + ushort identity_draw_elts[SEGMENT_SIZE]; + + struct { + /* map a fetch element to a draw element */ + unsigned fetches[MAP_SIZE]; + ushort draws[MAP_SIZE]; + boolean has_max_fetch; + + ushort num_fetch_elts; + ushort num_draw_elts; + } cache; +}; + + +static void +vsplit_clear_cache(struct vsplit_frontend *vsplit) +{ + memset(vsplit->cache.fetches, 0xff, sizeof(vsplit->cache.fetches)); + vsplit->cache.has_max_fetch = FALSE; + vsplit->cache.num_fetch_elts = 0; + vsplit->cache.num_draw_elts = 0; +} + +static void +vsplit_flush_cache(struct vsplit_frontend *vsplit, unsigned flags) +{ + vsplit->middle->run(vsplit->middle, + vsplit->fetch_elts, vsplit->cache.num_fetch_elts, + vsplit->draw_elts, vsplit->cache.num_draw_elts, flags); +} + +/** + * Add a fetch element and add it to the draw elements. + */ +static INLINE void +vsplit_add_cache(struct vsplit_frontend *vsplit, unsigned fetch) +{ + unsigned hash = fetch % MAP_SIZE; + + if (vsplit->cache.fetches[hash] != fetch) { + /* update cache */ + vsplit->cache.fetches[hash] = fetch; + vsplit->cache.draws[hash] = vsplit->cache.num_fetch_elts; + + /* add fetch */ + assert(vsplit->cache.num_fetch_elts < vsplit->segment_size); + vsplit->fetch_elts[vsplit->cache.num_fetch_elts++] = fetch; + } + + vsplit->draw_elts[vsplit->cache.num_draw_elts++] = vsplit->cache.draws[hash]; +} + + +/** + * Add a fetch element and add it to the draw elements. The fetch element is + * in full range (uint). + */ +static INLINE void +vsplit_add_cache_uint(struct vsplit_frontend *vsplit, unsigned fetch) +{ + /* special care for 0xffffffff */ + if (fetch == 0xffffffff && !vsplit->cache.has_max_fetch) { + unsigned hash = fetch % MAP_SIZE; + vsplit->cache.fetches[hash] = fetch - 1; /* force update */ + vsplit->cache.has_max_fetch = TRUE; + } + + vsplit_add_cache(vsplit, fetch); +} + + +#define FUNC vsplit_run_linear +#include "draw_pt_vsplit_tmp.h" + +#define FUNC vsplit_run_ubyte +#define ELT_TYPE ubyte +#define ADD_CACHE(vsplit, fetch) vsplit_add_cache(vsplit, fetch) +#include "draw_pt_vsplit_tmp.h" + +#define FUNC vsplit_run_ushort +#define ELT_TYPE ushort +#define ADD_CACHE(vsplit, fetch) vsplit_add_cache(vsplit, fetch) +#include "draw_pt_vsplit_tmp.h" + +#define FUNC vsplit_run_uint +#define ELT_TYPE uint +#define ADD_CACHE(vsplit, fetch) vsplit_add_cache_uint(vsplit, fetch) +#include "draw_pt_vsplit_tmp.h" + + +static void vsplit_prepare(struct draw_pt_front_end *frontend, + unsigned in_prim, + struct draw_pt_middle_end *middle, + unsigned opt) +{ + struct vsplit_frontend *vsplit = (struct vsplit_frontend *) frontend; + + switch (vsplit->draw->pt.user.eltSize) { + case 0: + vsplit->base.run = vsplit_run_linear; + break; + case 1: + vsplit->base.run = vsplit_run_ubyte; + break; + case 2: + vsplit->base.run = vsplit_run_ushort; + break; + case 4: + vsplit->base.run = vsplit_run_uint; + break; + default: + assert(0); + break; + } + + /* split only */ + vsplit->prim = in_prim; + + vsplit->middle = middle; + middle->prepare(middle, vsplit->prim, opt, &vsplit->max_vertices); + + vsplit->segment_size = MIN2(SEGMENT_SIZE, vsplit->max_vertices); +} + + +static void vsplit_finish(struct draw_pt_front_end *frontend) +{ + struct vsplit_frontend *vsplit = (struct vsplit_frontend *) frontend; + vsplit->middle->finish(vsplit->middle); + vsplit->middle = NULL; +} + + +static void vsplit_destroy(struct draw_pt_front_end *frontend) +{ + FREE(frontend); +} + + +struct draw_pt_front_end *draw_pt_vsplit(struct draw_context *draw) +{ + struct vsplit_frontend *vsplit = CALLOC_STRUCT(vsplit_frontend); + ushort i; + + if (!vsplit) + return NULL; + + vsplit->base.prepare = vsplit_prepare; + vsplit->base.run = NULL; + vsplit->base.finish = vsplit_finish; + vsplit->base.destroy = vsplit_destroy; + vsplit->draw = draw; + + for (i = 0; i < SEGMENT_SIZE; i++) + vsplit->identity_draw_elts[i] = i; + + return &vsplit->base; +} diff --git a/src/gallium/auxiliary/draw/draw_pt_vsplit_tmp.h b/src/gallium/auxiliary/draw/draw_pt_vsplit_tmp.h new file mode 100644 index 0000000000..3f66f962e1 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_pt_vsplit_tmp.h @@ -0,0 +1,309 @@ +/* + * Mesa 3-D graphics library + * Version: 7.9 + * + * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * Copyright (C) 2010 LunarG Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#define CONCAT2(name, elt_type) name ## elt_type +#define CONCAT(name, elt_type) CONCAT2(name, elt_type) + +#ifdef ELT_TYPE + +/** + * Fetch all elements in [min_index, max_index] with bias, and use the + * (rebased) index buffer as the draw elements. + */ +static boolean +CONCAT(vsplit_primitive_, ELT_TYPE)(struct vsplit_frontend *vsplit, + unsigned istart, unsigned icount) +{ + struct draw_context *draw = vsplit->draw; + const ELT_TYPE *ib = (const ELT_TYPE *) + ((const char *) draw->pt.user.elts + draw->pt.index_buffer.offset); + const unsigned min_index = draw->pt.user.min_index; + const unsigned max_index = draw->pt.user.max_index; + const int elt_bias = draw->pt.user.eltBias; + unsigned fetch_start, fetch_count; + const ushort *draw_elts = NULL; + unsigned i; + + /* use the ib directly */ + if (min_index == 0 && sizeof(ib[0]) == sizeof(draw_elts[0])) { + if (icount > vsplit->max_vertices) + return FALSE; + + for (i = 0; i < icount; i++) { + ELT_TYPE idx = ib[istart + i]; + assert(idx >= min_index && idx <= max_index); + } + draw_elts = (const ushort *) ib; + } + else { + /* have to go through vsplit->draw_elts */ + if (icount > vsplit->segment_size) + return FALSE; + } + + /* this is faster only when we fetch less elements than the normal path */ + if (max_index - min_index > icount - 1) + return FALSE; + + if (elt_bias < 0 && min_index < -elt_bias) + return FALSE; + + /* why this check? */ + for (i = 0; i < draw->pt.nr_vertex_elements; i++) { + if (draw->pt.vertex_element[i].instance_divisor) + return FALSE; + } + + fetch_start = min_index + elt_bias; + fetch_count = max_index - min_index + 1; + + if (!draw_elts) { + if (min_index == 0) { + for (i = 0; i < icount; i++) { + ELT_TYPE idx = ib[istart + i]; + + assert(idx >= min_index && idx <= max_index); + vsplit->draw_elts[i] = (ushort) idx; + } + } + else { + for (i = 0; i < icount; i++) { + ELT_TYPE idx = ib[istart + i]; + + assert(idx >= min_index && idx <= max_index); + vsplit->draw_elts[i] = (ushort) (idx - min_index); + } + } + + draw_elts = vsplit->draw_elts; + } + + return vsplit->middle->run_linear_elts(vsplit->middle, + fetch_start, fetch_count, + draw_elts, icount, 0x0); +} + +/** + * Use the cache to prepare the fetch and draw elements, and flush. + * + * When spoken is TRUE, ispoken replaces istart; When close is TRUE, iclose is + * appended. + */ +static INLINE void +CONCAT(vsplit_segment_cache_, ELT_TYPE)(struct vsplit_frontend *vsplit, + unsigned flags, + unsigned istart, unsigned icount, + boolean spoken, unsigned ispoken, + boolean close, unsigned iclose) +{ + struct draw_context *draw = vsplit->draw; + const ELT_TYPE *ib = (const ELT_TYPE *) + ((const char *) draw->pt.user.elts + draw->pt.index_buffer.offset); + const int ibias = draw->pt.user.eltBias; + unsigned i; + + assert(icount + !!close <= vsplit->segment_size); + + vsplit_clear_cache(vsplit); + + spoken = !!spoken; + if (ibias == 0) { + if (spoken) + ADD_CACHE(vsplit, ib[ispoken]); + + for (i = spoken; i < icount; i++) + ADD_CACHE(vsplit, ib[istart + i]); + + if (close) + ADD_CACHE(vsplit, ib[iclose]); + } + else if (ibias > 0) { + if (spoken) + ADD_CACHE(vsplit, (uint) ib[ispoken] + ibias); + + for (i = spoken; i < icount; i++) + ADD_CACHE(vsplit, (uint) ib[istart + i] + ibias); + + if (close) + ADD_CACHE(vsplit, (uint) ib[iclose] + ibias); + } + else { + if (spoken) { + if (ib[ispoken] < -ibias) + return; + ADD_CACHE(vsplit, ib[ispoken] + ibias); + } + + for (i = spoken; i < icount; i++) { + if (ib[istart + i] < -ibias) + return; + ADD_CACHE(vsplit, ib[istart + i] + ibias); + } + + if (close) { + if (ib[iclose] < -ibias) + return; + ADD_CACHE(vsplit, ib[iclose] + ibias); + } + } + + vsplit_flush_cache(vsplit, flags); +} + +static void +CONCAT(vsplit_segment_simple_, ELT_TYPE)(struct vsplit_frontend *vsplit, + unsigned flags, + unsigned istart, + unsigned icount) +{ + CONCAT(vsplit_segment_cache_, ELT_TYPE)(vsplit, + flags, istart, icount, FALSE, 0, FALSE, 0); +} + +static void +CONCAT(vsplit_segment_loop_, ELT_TYPE)(struct vsplit_frontend *vsplit, + unsigned flags, + unsigned istart, + unsigned icount, + unsigned i0) +{ + const boolean close_loop = ((flags) == DRAW_SPLIT_BEFORE); + + CONCAT(vsplit_segment_cache_, ELT_TYPE)(vsplit, + flags, istart, icount, FALSE, 0, close_loop, i0); +} + +static void +CONCAT(vsplit_segment_fan_, ELT_TYPE)(struct vsplit_frontend *vsplit, + unsigned flags, + unsigned istart, + unsigned icount, + unsigned i0) +{ + const boolean use_spoken = (((flags) & DRAW_SPLIT_BEFORE) != 0); + + CONCAT(vsplit_segment_cache_, ELT_TYPE)(vsplit, + flags, istart, icount, use_spoken, i0, FALSE, 0); +} + +#define LOCAL_VARS \ + struct vsplit_frontend *vsplit = (struct vsplit_frontend *) frontend; \ + const unsigned prim = vsplit->prim; \ + const unsigned max_count_simple = vsplit->segment_size; \ + const unsigned max_count_loop = vsplit->segment_size - 1; \ + const unsigned max_count_fan = vsplit->segment_size; + +#define PRIMITIVE(istart, icount) \ + CONCAT(vsplit_primitive_, ELT_TYPE)(vsplit, istart, icount) + +#else /* ELT_TYPE */ + +static void +vsplit_segment_simple_linear(struct vsplit_frontend *vsplit, unsigned flags, + unsigned istart, unsigned icount) +{ + assert(icount <= vsplit->max_vertices); + vsplit->middle->run_linear(vsplit->middle, istart, icount, flags); +} + +static void +vsplit_segment_loop_linear(struct vsplit_frontend *vsplit, unsigned flags, + unsigned istart, unsigned icount, unsigned i0) +{ + boolean close_loop = (flags == DRAW_SPLIT_BEFORE); + unsigned nr; + + assert(icount + !!close_loop <= vsplit->segment_size); + + if (close_loop) { + for (nr = 0; nr < icount; nr++) + vsplit->fetch_elts[nr] = istart + nr; + vsplit->fetch_elts[nr++] = i0; + + vsplit->middle->run(vsplit->middle, vsplit->fetch_elts, nr, + vsplit->identity_draw_elts, nr, flags); + } + else { + vsplit->middle->run_linear(vsplit->middle, istart, icount, flags); + } +} + +static void +vsplit_segment_fan_linear(struct vsplit_frontend *vsplit, unsigned flags, + unsigned istart, unsigned icount, unsigned i0) +{ + boolean use_spoken = ((flags & DRAW_SPLIT_BEFORE) != 0); + unsigned nr = 0, i; + + assert(icount + !!use_spoken <= vsplit->segment_size); + + if (use_spoken) { + vsplit->fetch_elts[nr++] = i0; + for (i = 1 ; i < icount; i++) + vsplit->fetch_elts[nr++] = istart + i; + + vsplit->middle->run(vsplit->middle, vsplit->fetch_elts, nr, + vsplit->identity_draw_elts, nr, flags); + } + else { + vsplit->middle->run_linear(vsplit->middle, istart, icount, flags); + } +} + +#define LOCAL_VARS \ + struct vsplit_frontend *vsplit = (struct vsplit_frontend *) frontend; \ + const unsigned prim = vsplit->prim; \ + const unsigned max_count_simple = vsplit->max_vertices; \ + const unsigned max_count_loop = vsplit->segment_size - 1; \ + const unsigned max_count_fan = vsplit->segment_size; + +#define PRIMITIVE(istart, icount) FALSE + +#define ELT_TYPE linear + +#endif /* ELT_TYPE */ + +#define FUNC_VARS \ + struct draw_pt_front_end *frontend, \ + unsigned start, \ + unsigned count + +#define SEGMENT_SIMPLE(flags, istart, icount) \ + CONCAT(vsplit_segment_simple_, ELT_TYPE)(vsplit, flags, istart, icount) + +#define SEGMENT_LOOP(flags, istart, icount, i0) \ + CONCAT(vsplit_segment_loop_, ELT_TYPE)(vsplit, flags, istart, icount, i0) + +#define SEGMENT_FAN(flags, istart, icount, i0) \ + CONCAT(vsplit_segment_fan_, ELT_TYPE)(vsplit, flags, istart, icount, i0) + +#include "draw_split_tmp.h" + +#undef CONCAT2 +#undef CONCAT + +#undef ELT_TYPE +#undef ADD_CACHE diff --git a/src/gallium/auxiliary/draw/draw_so_emit_tmp.h b/src/gallium/auxiliary/draw/draw_so_emit_tmp.h index 6d8937a0b4..7fafde9d5e 100644 --- a/src/gallium/auxiliary/draw/draw_so_emit_tmp.h +++ b/src/gallium/auxiliary/draw/draw_so_emit_tmp.h @@ -7,11 +7,9 @@ #define FUNC_ENTER \ /* declare more local vars */ \ - struct draw_context *draw = so->draw; \ const unsigned prim = input_prims->prim; \ - const boolean last_vertex_last = \ - !(draw->rasterizer->flatshade && \ - draw->rasterizer->flatshade_first); \ + const unsigned prim_flags = input_prims->flags; \ + const boolean last_vertex_last = TRUE; \ do { \ debug_assert(input_prims->primitive_count == 1); \ switch (prim) { \ diff --git a/src/gallium/auxiliary/draw/draw_split_tmp.h b/src/gallium/auxiliary/draw/draw_split_tmp.h new file mode 100644 index 0000000000..47defc62b9 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_split_tmp.h @@ -0,0 +1,176 @@ +/* + * Mesa 3-D graphics library + * Version: 7.9 + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * Copyright (C) 2010 LunarG Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +static void +FUNC(FUNC_VARS) +{ + unsigned first, incr; + LOCAL_VARS + + /* + * prim, start, count, and max_count_{simple,loop,fan} should have been + * defined + */ + if (0) { + debug_printf("%s: prim 0x%x, start %d, count %d, max_count_simple %d, " + "max_count_loop %d, max_count_fan %d\n", + __FUNCTION__, prim, start, count, max_count_simple, + max_count_loop, max_count_fan); + } + + draw_pt_split_prim(prim, &first, &incr); + /* sanitize primitive length */ + count = draw_pt_trim_count(count, first, incr); + if (count < first) + return; + + /* try flushing the entire primitive */ + if (PRIMITIVE(start, count)) + return; + + /* must be able to at least flush two complete primitives */ + assert(max_count_simple >= first + incr && + max_count_loop >= first + incr && + max_count_fan >= first + incr); + + /* no splitting required */ + if (count <= max_count_simple) { + SEGMENT_SIMPLE(0x0, start, count); + } + else { + const unsigned rollback = first - incr; + unsigned flags = DRAW_SPLIT_AFTER, seg_start = 0, seg_max; + + /* + * Both count and seg_max below are explicitly trimmed. Because + * + * seg_start = N * (seg_max - rollback) = N' * incr, + * + * we have + * + * remaining = count - seg_start = first + N'' * incr. + * + * That is, remaining is implicitly trimmed. + */ + switch (prim) { + case PIPE_PRIM_POINTS: + case PIPE_PRIM_LINES: + case PIPE_PRIM_LINE_STRIP: + case PIPE_PRIM_TRIANGLES: + case PIPE_PRIM_TRIANGLE_STRIP: + case PIPE_PRIM_QUADS: + case PIPE_PRIM_QUAD_STRIP: + case PIPE_PRIM_LINES_ADJACENCY: + case PIPE_PRIM_LINE_STRIP_ADJACENCY: + case PIPE_PRIM_TRIANGLES_ADJACENCY: + case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY: + seg_max = + draw_pt_trim_count(MIN2(max_count_simple, count), first, incr); + if (prim == PIPE_PRIM_TRIANGLE_STRIP || + prim == PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY) { + /* make sure we flush even number of triangles at a time */ + if (seg_max < count && !(((seg_max - first) / incr) & 1)) + seg_max -= incr; + } + + do { + const unsigned remaining = count - seg_start; + + if (remaining > seg_max) { + SEGMENT_SIMPLE(flags, start + seg_start, seg_max); + seg_start += seg_max - rollback; + + flags |= DRAW_SPLIT_BEFORE; + } + else { + flags &= ~DRAW_SPLIT_AFTER; + + SEGMENT_SIMPLE(flags, start + seg_start, remaining); + seg_start += remaining; + } + } while (seg_start < count); + break; + + case PIPE_PRIM_LINE_LOOP: + seg_max = + draw_pt_trim_count(MIN2(max_count_loop, count), first, incr); + + do { + const unsigned remaining = count - seg_start; + + if (remaining > seg_max) { + SEGMENT_LOOP(flags, start + seg_start, seg_max, start); + seg_start += seg_max - rollback; + + flags |= DRAW_SPLIT_BEFORE; + } + else { + flags &= ~DRAW_SPLIT_AFTER; + + SEGMENT_LOOP(flags, start + seg_start, remaining, start); + seg_start += remaining; + } + } while (seg_start < count); + break; + + case PIPE_PRIM_TRIANGLE_FAN: + case PIPE_PRIM_POLYGON: + seg_max = + draw_pt_trim_count(MIN2(max_count_fan, count), first, incr); + + do { + const unsigned remaining = count - seg_start; + + if (remaining > seg_max) { + SEGMENT_FAN(flags, start + seg_start, seg_max, start); + seg_start += seg_max - rollback; + + flags |= DRAW_SPLIT_BEFORE; + } + else { + flags &= ~DRAW_SPLIT_AFTER; + + SEGMENT_FAN(flags, start + seg_start, remaining, start); + seg_start += remaining; + } + } while (seg_start < count); + break; + + default: + assert(0); + break; + } + } +} + +#undef FUNC +#undef FUNC_VARS +#undef LOCAL_VARS + +#undef PRIMITIVE +#undef SEGMENT_SIMPLE +#undef SEGMENT_LOOP +#undef SEGMENT_FAN diff --git a/src/gallium/auxiliary/draw/draw_vs_llvm.c b/src/gallium/auxiliary/draw/draw_vs_llvm.c index d13ad24fff..fa9992db78 100644 --- a/src/gallium/auxiliary/draw/draw_vs_llvm.c +++ b/src/gallium/auxiliary/draw/draw_vs_llvm.c @@ -28,6 +28,7 @@ #include "util/u_math.h" #include "util/u_memory.h" #include "pipe/p_shader_tokens.h" +#include "pipe/p_screen.h" #include "draw_private.h" #include "draw_context.h" @@ -109,6 +110,11 @@ draw_create_vs_llvm(struct draw_context *draw, tgsi_scan_shader(state->tokens, &vs->base.info); + vs->variant_key_size = + draw_llvm_variant_key_size( + vs->base.info.file_max[TGSI_FILE_INPUT]+1, + vs->base.info.file_max[TGSI_FILE_SAMPLER]+1); + vs->base.draw = draw; vs->base.prepare = vs_llvm_prepare; vs->base.run_linear = vs_llvm_run_linear; diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c index 7b35dd4bb4..e0d30be98d 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c @@ -59,14 +59,6 @@ #include "lp_bld_arit.h" -/* - * XXX: Increasing eliminates some artifacts, but adds others, most - * noticeably corruption in the Earth halo in Google Earth. - */ -#define RCP_NEWTON_STEPS 0 - -#define RSQRT_NEWTON_STEPS 0 - #define EXP_POLY_DEGREE 3 #define LOG_POLY_DEGREE 5 @@ -267,7 +259,7 @@ lp_build_add(struct lp_build_context *bld, } -/** Return the sum of the elements of a */ +/** Return the scalar sum of the elements of a */ LLVMValueRef lp_build_sum_vector(struct lp_build_context *bld, LLVMValueRef a) @@ -278,11 +270,9 @@ lp_build_sum_vector(struct lp_build_context *bld, assert(lp_check_value(type, a)); - if (a == bld->zero) - return bld->zero; - if (a == bld->undef) - return bld->undef; - assert(type.length > 1); + if (type.length == 1) { + return a; + } assert(!bld->type.norm); @@ -546,7 +536,7 @@ lp_build_mul_imm(struct lp_build_context *bld, if(b == 2 && bld->type.floating) return lp_build_add(bld, a, a); - if(util_is_pot(b)) { + if(util_is_power_of_two(b)) { unsigned shift = ffs(b) - 1; if(bld->type.floating) { @@ -1266,6 +1256,11 @@ lp_build_sqrt(struct lp_build_context *bld, * * x_{i+1} = x_i * (2 - a * x_i) * + * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or + * +/-Inf, giving NaN instead. Certain applications rely on this behavior, + * such as Google Earth, which does RCP(RSQRT(0.0) when drawing the Earth's + * halo. It would be necessary to clamp the argument to prevent this. + * * See also: * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division * - http://softwarecommunity.intel.com/articles/eng/1818.htm @@ -1306,13 +1301,27 @@ lp_build_rcp(struct lp_build_context *bld, if(LLVMIsConstant(a)) return LLVMConstFDiv(bld->one, a); - if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4) { + /* + * We don't use RCPPS because: + * - it only has 10bits of precision + * - it doesn't even get the reciprocate of 1.0 exactly + * - doing Newton-Rapshon steps yields wrong (NaN) values for 0.0 or Inf + * - for recent processors the benefit over DIVPS is marginal, a case + * depedent + * + * We could still use it on certain processors if benchmarks show that the + * RCPPS plus necessary workarounds are still preferrable to DIVPS; or for + * particular uses that require less workarounds. + */ + + if (FALSE && util_cpu_caps.has_sse && type.width == 32 && type.length == 4) { + const unsigned num_iterations = 0; LLVMValueRef res; unsigned i; res = lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", bld->vec_type, a); - for (i = 0; i < RCP_NEWTON_STEPS; ++i) { + for (i = 0; i < num_iterations; ++i) { res = lp_build_rcp_refine(bld, a, res); } @@ -1363,13 +1372,14 @@ lp_build_rsqrt(struct lp_build_context *bld, assert(type.floating); - if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4) { + if (util_cpu_caps.has_sse && type.width == 32 && type.length == 4) { + const unsigned num_iterations = 0; LLVMValueRef res; unsigned i; res = lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rsqrt.ps", bld->vec_type, a); - for (i = 0; i < RSQRT_NEWTON_STEPS; ++i) { + for (i = 0; i < num_iterations; ++i) { res = lp_build_rsqrt_refine(bld, a, res); } diff --git a/src/gallium/auxiliary/gallivm/lp_bld_debug.c b/src/gallium/auxiliary/gallivm/lp_bld_debug.c index 39dfc51e50..d3a5afff8c 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_debug.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_debug.c @@ -46,7 +46,7 @@ boolean lp_check_alignment(const void *ptr, unsigned alignment) { - assert(util_is_pot(alignment)); + assert(util_is_power_of_two(alignment)); return ((uintptr_t)ptr & (alignment - 1)) == 0; } diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c index 247cb83ce6..92123e09d3 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c @@ -388,7 +388,7 @@ lp_build_fetch_rgba_aos(LLVMBuilderRef builder, if (format_matches_type(format_desc, type) && format_desc->block.bits <= type.width * 4 && - util_is_pot(format_desc->block.bits)) { + util_is_power_of_two(format_desc->block.bits)) { LLVMValueRef packed; /* @@ -416,7 +416,7 @@ lp_build_fetch_rgba_aos(LLVMBuilderRef builder, format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) && format_desc->block.width == 1 && format_desc->block.height == 1 && - util_is_pot(format_desc->block.bits) && + util_is_power_of_two(format_desc->block.bits) && format_desc->block.bits <= 32 && format_desc->is_bitmask && !format_desc->is_mixed && diff --git a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp index 6d5410d970..48baf7c425 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp +++ b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp @@ -40,6 +40,7 @@ #include <llvm/ExecutionEngine/ExecutionEngine.h> #include <llvm/ExecutionEngine/JITEventListener.h> #include <llvm/Support/CommandLine.h> +#include <llvm/Support/PrettyStackTrace.h> #include "pipe/p_config.h" #include "util/u_debug.h" @@ -143,7 +144,6 @@ lp_set_target_options(void) llvm::UnsafeFPMath = true; #endif -#if 0 /* * LLVM will generate MMX instructions for vectors <= 64 bits, leading to * innefficient code, and in 32bit systems, to the corruption of the FPU @@ -152,10 +152,8 @@ lp_set_target_options(void) * See also: * - http://llvm.org/bugs/show_bug.cgi?id=3287 * - http://l4.me.uk/post/2009/06/07/llvm-wrinkle-3-configuration-what-configuration/ - * - * XXX: Unfortunately this is not working. */ - static boolean first = FALSE; + static boolean first = TRUE; if (first) { static const char* options[] = { "prog", @@ -164,7 +162,13 @@ lp_set_target_options(void) llvm::cl::ParseCommandLineOptions(2, const_cast<char**>(options)); first = FALSE; } -#endif + + /* + * By default LLVM adds a signal handler to output a pretty stack trace. + * This signal handler is never removed, causing problems when unloading the + * shared object where the gallium driver resides. + */ + llvm::DisablePrettyStackTrace = true; } diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.h b/src/gallium/auxiliary/gallivm/lp_bld_pack.h index e470082b97..e947b90d16 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_pack.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.h @@ -37,6 +37,8 @@ #define LP_BLD_PACK_H +#include "pipe/p_compiler.h" + #include "gallivm/lp_bld.h" diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c index 0fd014ab9b..259b1142e3 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c @@ -82,9 +82,9 @@ lp_sampler_static_state(struct lp_sampler_static_state *state, state->swizzle_a = view->swizzle_a; state->target = texture->target; - state->pot_width = util_is_pot(texture->width0); - state->pot_height = util_is_pot(texture->height0); - state->pot_depth = util_is_pot(texture->depth0); + state->pot_width = util_is_power_of_two(texture->width0); + state->pot_height = util_is_power_of_two(texture->height0); + state->pot_depth = util_is_power_of_two(texture->depth0); state->wrap_s = sampler->wrap_s; state->wrap_t = sampler->wrap_t; @@ -124,6 +124,52 @@ lp_sampler_static_state(struct lp_sampler_static_state *state, /** + * Compute the partial offset of a pixel block along an arbitrary axis. + * + * @param coord coordinate in pixels + * @param stride number of bytes between rows of successive pixel blocks + * @param block_length number of pixels in a pixels block along the coordinate + * axis + * @param out_offset resulting relative offset of the pixel block in bytes + * @param out_subcoord resulting sub-block pixel coordinate + */ +void +lp_build_sample_partial_offset(struct lp_build_context *bld, + unsigned block_length, + LLVMValueRef coord, + LLVMValueRef stride, + LLVMValueRef *out_offset, + LLVMValueRef *out_subcoord) +{ + LLVMValueRef offset; + LLVMValueRef subcoord; + + if (block_length == 1) { + subcoord = bld->zero; + } + else { + /* + * Pixel blocks have power of two dimensions. LLVM should convert the + * rem/div to bit arithmetic. + * TODO: Verify this. + */ + + LLVMValueRef block_width = lp_build_const_int_vec(bld->type, block_length); + subcoord = LLVMBuildURem(bld->builder, coord, block_width, ""); + coord = LLVMBuildUDiv(bld->builder, coord, block_width, ""); + } + + offset = lp_build_mul(bld, coord, stride); + + assert(out_offset); + assert(out_subcoord); + + *out_offset = offset; + *out_subcoord = subcoord; +} + + +/** * Compute the offset of a pixel block. * * x, y, z, y_stride, z_stride are vectors, and they refer to pixels. @@ -144,48 +190,35 @@ lp_build_sample_offset(struct lp_build_context *bld, { LLVMValueRef x_stride; LLVMValueRef offset; - LLVMValueRef i; - LLVMValueRef j; - - /* - * Describe the coordinates in terms of pixel blocks. - * - * TODO: pixel blocks are power of two. LLVM should convert rem/div to - * bit arithmetic. Verify this. - */ - - if (format_desc->block.width == 1) { - i = bld->zero; - } - else { - LLVMValueRef block_width = lp_build_const_int_vec(bld->type, format_desc->block.width); - i = LLVMBuildURem(bld->builder, x, block_width, ""); - x = LLVMBuildUDiv(bld->builder, x, block_width, ""); - } - - if (format_desc->block.height == 1) { - j = bld->zero; - } - else { - LLVMValueRef block_height = lp_build_const_int_vec(bld->type, format_desc->block.height); - j = LLVMBuildURem(bld->builder, y, block_height, ""); - y = LLVMBuildUDiv(bld->builder, y, block_height, ""); - } x_stride = lp_build_const_vec(bld->type, format_desc->block.bits/8); - offset = lp_build_mul(bld, x, x_stride); + + lp_build_sample_partial_offset(bld, + format_desc->block.width, + x, x_stride, + &offset, out_i); if (y && y_stride) { - LLVMValueRef y_offset = lp_build_mul(bld, y, y_stride); + LLVMValueRef y_offset; + lp_build_sample_partial_offset(bld, + format_desc->block.height, + y, y_stride, + &y_offset, out_j); offset = lp_build_add(bld, offset, y_offset); } + else { + *out_j = bld->zero; + } if (z && z_stride) { - LLVMValueRef z_offset = lp_build_mul(bld, z, z_stride); + LLVMValueRef z_offset; + LLVMValueRef k; + lp_build_sample_partial_offset(bld, + 1, /* pixel blocks are always 2D */ + z, z_stride, + &z_offset, &k); offset = lp_build_add(bld, offset, z_offset); } *out_offset = offset; - *out_i = i; - *out_j = j; } diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.h b/src/gallium/auxiliary/gallivm/lp_bld_sample.h index 5b8f478094..caafc4eca0 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.h @@ -36,6 +36,8 @@ #define LP_BLD_SAMPLE_H +#include "pipe/p_format.h" + #include "gallivm/lp_bld.h" struct pipe_resource; @@ -147,6 +149,15 @@ lp_sampler_static_state(struct lp_sampler_static_state *state, void +lp_build_sample_partial_offset(struct lp_build_context *bld, + unsigned block_length, + LLVMValueRef coord, + LLVMValueRef stride, + LLVMValueRef *out_offset, + LLVMValueRef *out_i); + + +void lp_build_sample_offset(struct lp_build_context *bld, const struct util_format_description *format_desc, LLVMValueRef x, diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c index 806c7d56a8..1f39d9c98b 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c @@ -176,6 +176,7 @@ texture_dims(enum pipe_texture_target tex) case PIPE_TEXTURE_1D: return 1; case PIPE_TEXTURE_2D: + case PIPE_TEXTURE_RECT: case PIPE_TEXTURE_CUBE: return 2; case PIPE_TEXTURE_3D: @@ -322,59 +323,6 @@ lp_build_sample_texel_soa(struct lp_build_sample_context *bld, /** - * Fetch the texels as <4n x i8> in AoS form. - */ -static LLVMValueRef -lp_build_sample_packed(struct lp_build_sample_context *bld, - LLVMValueRef x, - LLVMValueRef y, - LLVMValueRef y_stride, - LLVMValueRef data_array) -{ - LLVMValueRef offset, i, j; - LLVMValueRef data_ptr; - LLVMValueRef res; - - /* convert x,y,z coords to linear offset from start of texture, in bytes */ - lp_build_sample_offset(&bld->uint_coord_bld, - bld->format_desc, - x, y, NULL, y_stride, NULL, - &offset, &i, &j); - - /* get pointer to mipmap level 0 data */ - data_ptr = lp_build_get_const_mipmap_level(bld, data_array, 0); - - if (util_format_is_rgba8_variant(bld->format_desc)) { - /* Just fetch the data directly without swizzling */ - assert(bld->format_desc->block.width == 1); - assert(bld->format_desc->block.height == 1); - assert(bld->format_desc->block.bits <= bld->texel_type.width); - - res = lp_build_gather(bld->builder, - bld->texel_type.length, - bld->format_desc->block.bits, - bld->texel_type.width, - data_ptr, offset); - } - else { - struct lp_type type; - - assert(bld->texel_type.width == 32); - - memset(&type, 0, sizeof type); - type.width = 8; - type.length = bld->texel_type.length*4; - type.norm = TRUE; - - res = lp_build_fetch_rgba_aos(bld->builder, bld->format_desc, type, - data_ptr, offset, i, j); - } - - return res; -} - - -/** * Helper to compute the mirror function for the PIPE_WRAP_MIRROR modes. */ static LLVMValueRef @@ -408,7 +356,7 @@ lp_build_coord_mirror(struct lp_build_sample_context *bld, /** - * We only support a few wrap modes in lp_build_sample_wrap_int() at this time. + * We only support a few wrap modes in lp_build_sample_wrap_linear_int() at this time. * Return whether the given mode is supported by that function. */ static boolean @@ -430,13 +378,18 @@ is_simple_wrap_mode(unsigned mode) * \param length the texture size along one dimension * \param is_pot if TRUE, length is a power of two * \param wrap_mode one of PIPE_TEX_WRAP_x + * \param i0 resulting sub-block pixel coordinate for coord0 */ -static LLVMValueRef -lp_build_sample_wrap_int(struct lp_build_sample_context *bld, - LLVMValueRef coord, - LLVMValueRef length, - boolean is_pot, - unsigned wrap_mode) +static void +lp_build_sample_wrap_nearest_int(struct lp_build_sample_context *bld, + unsigned block_length, + LLVMValueRef coord, + LLVMValueRef length, + LLVMValueRef stride, + boolean is_pot, + unsigned wrap_mode, + LLVMValueRef *out_offset, + LLVMValueRef *out_i) { struct lp_build_context *uint_coord_bld = &bld->uint_coord_bld; struct lp_build_context *int_coord_bld = &bld->int_coord_bld; @@ -469,7 +422,134 @@ lp_build_sample_wrap_int(struct lp_build_sample_context *bld, assert(0); } - return coord; + lp_build_sample_partial_offset(uint_coord_bld, block_length, coord, stride, + out_offset, out_i); +} + + +/** + * Build LLVM code for texture wrap mode, for scaled integer texcoords. + * \param coord0 the incoming texcoord (s,t,r or q) scaled to the texture size + * \param length the texture size along one dimension + * \param stride pixel stride along the coordinate axis + * \param block_length is the length of the pixel block along the + * coordinate axis + * \param is_pot if TRUE, length is a power of two + * \param wrap_mode one of PIPE_TEX_WRAP_x + * \param offset0 resulting relative offset for coord0 + * \param offset1 resulting relative offset for coord0 + 1 + * \param i0 resulting sub-block pixel coordinate for coord0 + * \param i1 resulting sub-block pixel coordinate for coord0 + 1 + */ +static void +lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld, + unsigned block_length, + LLVMValueRef coord0, + LLVMValueRef length, + LLVMValueRef stride, + boolean is_pot, + unsigned wrap_mode, + LLVMValueRef *offset0, + LLVMValueRef *offset1, + LLVMValueRef *i0, + LLVMValueRef *i1) +{ + struct lp_build_context *uint_coord_bld = &bld->uint_coord_bld; + struct lp_build_context *int_coord_bld = &bld->int_coord_bld; + LLVMValueRef length_minus_one; + LLVMValueRef lmask, umask, mask; + + if (block_length != 1) { + /* + * If the pixel block covers more than one pixel then there is no easy + * way to calculate offset1 relative to offset0. Instead, compute them + * independently. + */ + + LLVMValueRef coord1; + + lp_build_sample_wrap_nearest_int(bld, + block_length, + coord0, + length, + stride, + is_pot, + wrap_mode, + offset0, i0); + + coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one); + + lp_build_sample_wrap_nearest_int(bld, + block_length, + coord1, + length, + stride, + is_pot, + wrap_mode, + offset1, i1); + + return; + } + + /* + * Scalar pixels -- try to compute offset0 and offset1 with a single stride + * multiplication. + */ + + *i0 = uint_coord_bld->zero; + *i1 = uint_coord_bld->zero; + + length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one); + + switch(wrap_mode) { + case PIPE_TEX_WRAP_REPEAT: + if (is_pot) { + coord0 = LLVMBuildAnd(bld->builder, coord0, length_minus_one, ""); + } + else { + /* Signed remainder won't give the right results for negative + * dividends but unsigned remainder does.*/ + coord0 = LLVMBuildURem(bld->builder, coord0, length, ""); + } + + mask = lp_build_compare(bld->builder, int_coord_bld->type, + PIPE_FUNC_NOTEQUAL, coord0, length_minus_one); + + *offset0 = lp_build_mul(uint_coord_bld, coord0, stride); + *offset1 = LLVMBuildAnd(bld->builder, + lp_build_add(uint_coord_bld, *offset0, stride), + mask, ""); + break; + + case PIPE_TEX_WRAP_CLAMP_TO_EDGE: + lmask = lp_build_compare(int_coord_bld->builder, int_coord_bld->type, + PIPE_FUNC_GEQUAL, coord0, int_coord_bld->zero); + umask = lp_build_compare(int_coord_bld->builder, int_coord_bld->type, + PIPE_FUNC_LESS, coord0, length_minus_one); + + coord0 = lp_build_select(int_coord_bld, lmask, coord0, int_coord_bld->zero); + coord0 = lp_build_select(int_coord_bld, umask, coord0, length_minus_one); + + mask = LLVMBuildAnd(bld->builder, lmask, umask, ""); + + *offset0 = lp_build_mul(uint_coord_bld, coord0, stride); + *offset1 = lp_build_add(uint_coord_bld, + *offset0, + LLVMBuildAnd(bld->builder, stride, mask, "")); + break; + + case PIPE_TEX_WRAP_CLAMP: + case PIPE_TEX_WRAP_CLAMP_TO_BORDER: + case PIPE_TEX_WRAP_MIRROR_REPEAT: + case PIPE_TEX_WRAP_MIRROR_CLAMP: + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: + default: + assert(0); + *offset0 = uint_coord_bld->zero; + *offset1 = uint_coord_bld->zero; + break; + } } @@ -1740,16 +1820,21 @@ lp_build_sample_2d_linear_aos(struct lp_build_sample_context *bld, LLVMValueRef i32_c8, i32_c128, i32_c255; LLVMValueRef s_ipart, s_fpart, s_fpart_lo, s_fpart_hi; LLVMValueRef t_ipart, t_fpart, t_fpart_lo, t_fpart_hi; - LLVMValueRef x0, x1; - LLVMValueRef y0, y1; - LLVMValueRef neighbors[2][2]; + LLVMValueRef data_ptr; + LLVMValueRef x_stride, y_stride; + LLVMValueRef x_offset0, x_offset1; + LLVMValueRef y_offset0, y_offset1; + LLVMValueRef offset[2][2]; + LLVMValueRef x_subcoord[2], y_subcoord[2]; LLVMValueRef neighbors_lo[2][2]; LLVMValueRef neighbors_hi[2][2]; LLVMValueRef packed, packed_lo, packed_hi; LLVMValueRef unswizzled[4]; - LLVMValueRef stride; + const unsigned level = 0; + unsigned i, j; - assert(bld->static_state->target == PIPE_TEXTURE_2D); + assert(bld->static_state->target == PIPE_TEXTURE_2D + || bld->static_state->target == PIPE_TEXTURE_RECT); assert(bld->static_state->min_img_filter == PIPE_TEX_FILTER_LINEAR); assert(bld->static_state->mag_img_filter == PIPE_TEX_FILTER_LINEAR); assert(bld->static_state->min_mip_filter == PIPE_TEX_MIPFILTER_NONE); @@ -1793,21 +1878,30 @@ lp_build_sample_2d_linear_aos(struct lp_build_sample_context *bld, s_fpart = LLVMBuildAnd(builder, s, i32_c255, ""); t_fpart = LLVMBuildAnd(builder, t, i32_c255, ""); - x0 = s_ipart; - y0 = t_ipart; - - x1 = lp_build_add(&bld->int_coord_bld, x0, bld->int_coord_bld.one); - y1 = lp_build_add(&bld->int_coord_bld, y0, bld->int_coord_bld.one); - - x0 = lp_build_sample_wrap_int(bld, x0, width, bld->static_state->pot_width, - bld->static_state->wrap_s); - y0 = lp_build_sample_wrap_int(bld, y0, height, bld->static_state->pot_height, - bld->static_state->wrap_t); - - x1 = lp_build_sample_wrap_int(bld, x1, width, bld->static_state->pot_width, - bld->static_state->wrap_s); - y1 = lp_build_sample_wrap_int(bld, y1, height, bld->static_state->pot_height, - bld->static_state->wrap_t); + x_stride = lp_build_const_vec(bld->uint_coord_bld.type, + bld->format_desc->block.bits/8); + + y_stride = lp_build_get_const_level_stride_vec(bld, stride_array, level); + + lp_build_sample_wrap_linear_int(bld, + bld->format_desc->block.width, + s_ipart, width, x_stride, + bld->static_state->pot_width, + bld->static_state->wrap_s, + &x_offset0, &x_offset1, + &x_subcoord[0], &x_subcoord[1]); + lp_build_sample_wrap_linear_int(bld, + bld->format_desc->block.height, + t_ipart, height, y_stride, + bld->static_state->pot_height, + bld->static_state->wrap_t, + &y_offset0, &y_offset1, + &y_subcoord[0], &y_subcoord[1]); + + offset[0][0] = lp_build_add(&bld->uint_coord_bld, x_offset0, y_offset0); + offset[0][1] = lp_build_add(&bld->uint_coord_bld, x_offset1, y_offset0); + offset[1][0] = lp_build_add(&bld->uint_coord_bld, x_offset0, y_offset1); + offset[1][1] = lp_build_add(&bld->uint_coord_bld, x_offset1, y_offset1); /* * Transform 4 x i32 in @@ -1836,7 +1930,6 @@ lp_build_sample_2d_linear_aos(struct lp_build_sample_context *bld, LLVMValueRef shuffles_hi[LP_MAX_VECTOR_LENGTH]; LLVMValueRef shuffle_lo; LLVMValueRef shuffle_hi; - unsigned i, j; for(j = 0; j < h16.type.length; j += 4) { #ifdef PIPE_ARCH_LITTLE_ENDIAN @@ -1864,7 +1957,10 @@ lp_build_sample_2d_linear_aos(struct lp_build_sample_context *bld, t_fpart_hi = LLVMBuildShuffleVector(builder, t_fpart, h16.undef, shuffle_hi, ""); } - stride = lp_build_get_const_level_stride_vec(bld, stride_array, 0); + /* + * get pointer to mipmap level 0 data + */ + data_ptr = lp_build_get_const_mipmap_level(bld, data_array, level); /* * Fetch the pixels as 4 x 32bit (rgba order might differ): @@ -1883,20 +1979,38 @@ lp_build_sample_2d_linear_aos(struct lp_build_sample_context *bld, * The higher 8 bits of the resulting elements will be zero. */ - neighbors[0][0] = lp_build_sample_packed(bld, x0, y0, stride, data_array); - neighbors[0][1] = lp_build_sample_packed(bld, x1, y0, stride, data_array); - neighbors[1][0] = lp_build_sample_packed(bld, x0, y1, stride, data_array); - neighbors[1][1] = lp_build_sample_packed(bld, x1, y1, stride, data_array); + for (j = 0; j < 2; ++j) { + for (i = 0; i < 2; ++i) { + LLVMValueRef rgba8; - neighbors[0][0] = LLVMBuildBitCast(builder, neighbors[0][0], u8n_vec_type, ""); - neighbors[0][1] = LLVMBuildBitCast(builder, neighbors[0][1], u8n_vec_type, ""); - neighbors[1][0] = LLVMBuildBitCast(builder, neighbors[1][0], u8n_vec_type, ""); - neighbors[1][1] = LLVMBuildBitCast(builder, neighbors[1][1], u8n_vec_type, ""); + if (util_format_is_rgba8_variant(bld->format_desc)) { + /* + * Given the format is a rgba8, just read the pixels as is, + * without any swizzling. Swizzling will be done later. + */ + rgba8 = lp_build_gather(bld->builder, + bld->texel_type.length, + bld->format_desc->block.bits, + bld->texel_type.width, + data_ptr, offset[j][i]); - lp_build_unpack2(builder, u8n.type, h16.type, neighbors[0][0], &neighbors_lo[0][0], &neighbors_hi[0][0]); - lp_build_unpack2(builder, u8n.type, h16.type, neighbors[0][1], &neighbors_lo[0][1], &neighbors_hi[0][1]); - lp_build_unpack2(builder, u8n.type, h16.type, neighbors[1][0], &neighbors_lo[1][0], &neighbors_hi[1][0]); - lp_build_unpack2(builder, u8n.type, h16.type, neighbors[1][1], &neighbors_lo[1][1], &neighbors_hi[1][1]); + rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, ""); + + } + else { + rgba8 = lp_build_fetch_rgba_aos(bld->builder, + bld->format_desc, + u8n.type, + data_ptr, offset[j][i], + x_subcoord[i], + y_subcoord[j]); + } + + lp_build_unpack2(builder, u8n.type, h16.type, + rgba8, + &neighbors_lo[j][i], &neighbors_hi[j][i]); + } + } /* * Linear interpolate with 8.8 fixed point. @@ -2077,7 +2191,8 @@ lp_build_sample_soa(LLVMBuilderRef builder, } else if (util_format_fits_8unorm(bld.format_desc) && bld.format_desc->nr_channels > 1 && - static_state->target == PIPE_TEXTURE_2D && + (static_state->target == PIPE_TEXTURE_2D || + static_state->target == PIPE_TEXTURE_RECT) && static_state->min_img_filter == PIPE_TEX_FILTER_LINEAR && static_state->mag_img_filter == PIPE_TEX_FILTER_LINEAR && static_state->min_mip_filter == PIPE_TEX_MIPFILTER_NONE && diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c index 0aa64affac..0e07f7f3f3 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c @@ -200,8 +200,10 @@ static void lp_exec_mask_cond_push(struct lp_exec_mask *mask, } mask->cond_stack[mask->cond_stack_size++] = mask->cond_mask; assert(LLVMTypeOf(val) == mask->int_vec_type); - mask->cond_mask = val; - + mask->cond_mask = LLVMBuildAnd(mask->bld->builder, + mask->cond_mask, + val, + ""); lp_exec_mask_update(mask); } @@ -802,7 +804,7 @@ emit_store( case TGSI_FILE_PREDICATE: lp_exec_mask_store(&bld->exec_mask, pred, value, - bld->preds[index][chan_index]); + bld->preds[reg->Register.Index][chan_index]); break; default: diff --git a/src/gallium/auxiliary/gallivm/lp_bld_type.h b/src/gallium/auxiliary/gallivm/lp_bld_type.h index 3ffe916f8e..fec1d3dfbc 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_type.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_type.h @@ -128,16 +128,16 @@ struct lp_build_context */ struct lp_type type; - /** Same as lp_build_undef(type) */ + /** Same as lp_build_elem_type(type) */ LLVMTypeRef elem_type; - /** Same as lp_build_undef(type) */ + /** Same as lp_build_vec_type(type) */ LLVMTypeRef vec_type; - /** Same as lp_build_undef(type) */ + /** Same as lp_build_int_elem_type(type) */ LLVMTypeRef int_elem_type; - /** Same as lp_build_undef(type) */ + /** Same as lp_build_int_vec_type(type) */ LLVMTypeRef int_vec_type; /** Same as lp_build_undef(type) */ diff --git a/src/gallium/auxiliary/os/os_stream.c b/src/gallium/auxiliary/os/os_stream.c new file mode 100644 index 0000000000..3c55fc00d9 --- /dev/null +++ b/src/gallium/auxiliary/os/os_stream.c @@ -0,0 +1,58 @@ +/************************************************************************** + * + * Copyright 2010 Luca Barbieri + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include "pipe/p_config.h" + +#include "os_stream.h" +#include "util/u_memory.h" +#include "util/u_string.h" + +int +os_default_stream_vprintf (struct os_stream* stream, const char *format, va_list ap) +{ + char buf[1024]; + int retval; + va_list ap2; + va_copy(ap2, ap); + retval = util_vsnprintf(buf, sizeof(buf), format, ap2); + va_end(ap2); + if(retval <= 0) + {} + else if(retval < sizeof(buf)) + stream->write(stream, buf, retval); + else + { + char* str = MALLOC(retval + 1); + if(!str) + return -1; + retval = util_vsnprintf(str, retval + 1, format, ap); + if(retval > 0) + stream->write(stream, str, retval); + FREE(str); + } + + return retval; +} diff --git a/src/gallium/auxiliary/os/os_stream.h b/src/gallium/auxiliary/os/os_stream.h index 693a0621e2..6c6050bb02 100644 --- a/src/gallium/auxiliary/os/os_stream.h +++ b/src/gallium/auxiliary/os/os_stream.h @@ -50,6 +50,9 @@ struct os_stream void (*flush)(struct os_stream *stream); + + int + (*vprintf)(struct os_stream *stream, const char* format, va_list ap); }; @@ -90,6 +93,27 @@ os_stream_flush(struct os_stream *stream) stream->flush(stream); } +int +os_default_stream_vprintf (struct os_stream* stream, const char *format, va_list ap); + +static INLINE int +os_stream_vprintf (struct os_stream* stream, const char *format, va_list ap) +{ + return stream->vprintf(stream, format, ap); +} + +static INLINE int +os_stream_printf (struct os_stream* stream, const char *format, ...) +{ + int retval; + va_list args; + + va_start (args, format); + retval = stream->vprintf(stream, format, args); + va_end (args); + + return retval; +} struct os_stream * os_file_stream_create(const char *filename); @@ -118,5 +142,4 @@ os_str_stream_get_and_close(struct os_stream *stream); #define os_file_stream_create(_filename) os_null_stream_create() #endif - #endif /* _OS_STREAM_H_ */ diff --git a/src/gallium/auxiliary/os/os_stream_log.c b/src/gallium/auxiliary/os/os_stream_log.c index 7cc2028a22..b01377c346 100644 --- a/src/gallium/auxiliary/os/os_stream_log.c +++ b/src/gallium/auxiliary/os/os_stream_log.c @@ -73,7 +73,8 @@ static struct os_stream os_log_stream_struct = { &os_log_stream_close, &os_log_stream_write, - &os_log_stream_flush + &os_log_stream_flush, + &os_default_stream_vprintf, }; diff --git a/src/gallium/auxiliary/os/os_stream_null.c b/src/gallium/auxiliary/os/os_stream_null.c index 128c4e8f0e..a549a789e6 100644 --- a/src/gallium/auxiliary/os/os_stream_null.c +++ b/src/gallium/auxiliary/os/os_stream_null.c @@ -56,12 +56,18 @@ os_null_stream_flush(struct os_stream *stream) (void)stream; } +static int +os_null_stream_vprintf (struct os_stream* stream, const char *format, va_list ap) +{ + return 0; +} static struct os_stream os_null_stream = { &os_null_stream_close, &os_null_stream_write, - &os_null_stream_flush + &os_null_stream_flush, + &os_null_stream_vprintf }; diff --git a/src/gallium/auxiliary/os/os_stream_stdc.c b/src/gallium/auxiliary/os/os_stream_stdc.c index 9e7ed71107..37e7d063e2 100644 --- a/src/gallium/auxiliary/os/os_stream_stdc.c +++ b/src/gallium/auxiliary/os/os_stream_stdc.c @@ -83,6 +83,14 @@ os_stdc_stream_flush(struct os_stream *_stream) fflush(stream->file); } +static int +os_stdc_stream_vprintf (struct os_stream* _stream, const char *format, va_list ap) +{ + struct os_stdc_stream *stream = os_stdc_stream(_stream); + + return vfprintf(stream->file, format, ap); +} + struct os_stream * os_file_stream_create(const char *filename) @@ -96,6 +104,7 @@ os_file_stream_create(const char *filename) stream->base.close = &os_stdc_stream_close; stream->base.write = &os_stdc_stream_write; stream->base.flush = &os_stdc_stream_flush; + stream->base.vprintf = &os_stdc_stream_vprintf; stream->file = fopen(filename, "w"); if(!stream->file) diff --git a/src/gallium/auxiliary/os/os_stream_str.c b/src/gallium/auxiliary/os/os_stream_str.c index b5c7270d2a..be9478b2a1 100644 --- a/src/gallium/auxiliary/os/os_stream_str.c +++ b/src/gallium/auxiliary/os/os_stream_str.c @@ -118,6 +118,7 @@ os_str_stream_create(size_t size) stream->base.close = &os_str_stream_close; stream->base.write = &os_str_stream_write; stream->base.flush = &os_str_stream_flush; + stream->base.vprintf = &os_default_stream_vprintf; stream->str = os_malloc(size); if(!stream->str) diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr.h b/src/gallium/auxiliary/pipebuffer/pb_bufmgr.h index cec2524da2..2ef02160f2 100644 --- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr.h +++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr.h @@ -50,8 +50,7 @@ #define PB_BUFMGR_H_ -#include "pipe/p_compiler.h" -#include "pipe/p_defines.h" +#include "pb_buffer.h" #ifdef __cplusplus diff --git a/src/gallium/auxiliary/rtasm/rtasm_cpu.c b/src/gallium/auxiliary/rtasm/rtasm_cpu.c index 2e15751e50..0461c81550 100644 --- a/src/gallium/auxiliary/rtasm/rtasm_cpu.c +++ b/src/gallium/auxiliary/rtasm/rtasm_cpu.c @@ -30,7 +30,7 @@ #include "rtasm_cpu.h" -#if defined(PIPE_ARCH_X86) +#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) static boolean rtasm_sse_enabled(void) { static boolean firsttime = 1; @@ -49,7 +49,7 @@ static boolean rtasm_sse_enabled(void) int rtasm_cpu_has_sse(void) { /* FIXME: actually detect this at run-time */ -#if defined(PIPE_ARCH_X86) +#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) return rtasm_sse_enabled(); #else return 0; @@ -59,7 +59,7 @@ int rtasm_cpu_has_sse(void) int rtasm_cpu_has_sse2(void) { /* FIXME: actually detect this at run-time */ -#if defined(PIPE_ARCH_X86) +#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) return rtasm_sse_enabled(); #else return 0; diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c index 9f70b73698..75b0f6a68e 100644 --- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c +++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c @@ -22,8 +22,9 @@ **************************************************************************/ #include "pipe/p_config.h" +#include "util/u_cpu_detect.h" -#if defined(PIPE_ARCH_X86) +#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) #include "pipe/p_compiler.h" #include "util/u_debug.h" @@ -231,6 +232,10 @@ static void emit_modrm( struct x86_function *p, assert(reg.mod == mod_REG); + /* TODO: support extended x86-64 registers */ + assert(reg.idx < 8); + assert(regmem.idx < 8); + val |= regmem.mod << 6; /* mod field */ val |= reg.idx << 3; /* reg field */ val |= regmem.idx; /* r/m field */ @@ -363,6 +368,12 @@ int x86_get_label( struct x86_function *p ) */ +void x64_rexw(struct x86_function *p) +{ + if(x86_target(p) != X86_32) + emit_1ub(p, 0x48); +} + void x86_jcc( struct x86_function *p, enum x86_cc cc, int label ) @@ -449,6 +460,52 @@ void x86_mov_reg_imm( struct x86_function *p, struct x86_reg dst, int imm ) emit_1i(p, imm); } +void x86_mov_imm( struct x86_function *p, struct x86_reg dst, int imm ) +{ + DUMP_RI( dst, imm ); + if(dst.mod == mod_REG) + x86_mov_reg_imm(p, dst, imm); + else + { + emit_1ub(p, 0xc7); + emit_modrm_noreg(p, 0, dst); + emit_1i(p, imm); + } +} + +void x86_mov16_imm( struct x86_function *p, struct x86_reg dst, uint16_t imm ) +{ + DUMP_RI( dst, imm ); + emit_1ub(p, 0x66); + if(dst.mod == mod_REG) + { + emit_1ub(p, 0xb8 + dst.idx); + emit_2ub(p, imm & 0xff, imm >> 8); + } + else + { + emit_1ub(p, 0xc7); + emit_modrm_noreg(p, 0, dst); + emit_2ub(p, imm & 0xff, imm >> 8); + } +} + +void x86_mov8_imm( struct x86_function *p, struct x86_reg dst, uint8_t imm ) +{ + DUMP_RI( dst, imm ); + if(dst.mod == mod_REG) + { + emit_1ub(p, 0xb0 + dst.idx); + emit_1ub(p, imm); + } + else + { + emit_1ub(p, 0xc6); + emit_modrm_noreg(p, 0, dst); + emit_1ub(p, imm); + } +} + /** * Immediate group 1 instructions. */ @@ -520,7 +577,7 @@ void x86_push( struct x86_function *p, } - p->stack_offset += 4; + p->stack_offset += sizeof(void*); } void x86_push_imm32( struct x86_function *p, @@ -530,7 +587,7 @@ void x86_push_imm32( struct x86_function *p, emit_1ub(p, 0x68); emit_1i(p, imm32); - p->stack_offset += 4; + p->stack_offset += sizeof(void*); } @@ -540,23 +597,33 @@ void x86_pop( struct x86_function *p, DUMP_R( reg ); assert(reg.mod == mod_REG); emit_1ub(p, 0x58 + reg.idx); - p->stack_offset -= 4; + p->stack_offset -= sizeof(void*); } void x86_inc( struct x86_function *p, struct x86_reg reg ) { DUMP_R( reg ); - assert(reg.mod == mod_REG); - emit_1ub(p, 0x40 + reg.idx); + if(x86_target(p) == X86_32 && reg.mod == mod_REG) + { + emit_1ub(p, 0x40 + reg.idx); + return; + } + emit_1ub(p, 0xff); + emit_modrm_noreg(p, 0, reg); } void x86_dec( struct x86_function *p, struct x86_reg reg ) { DUMP_R( reg ); - assert(reg.mod == mod_REG); - emit_1ub(p, 0x48 + reg.idx); + if(x86_target(p) == X86_32 && reg.mod == mod_REG) + { + emit_1ub(p, 0x48 + reg.idx); + return; + } + emit_1ub(p, 0xff); + emit_modrm_noreg(p, 1, reg); } void x86_ret( struct x86_function *p ) @@ -583,9 +650,82 @@ void x86_mov( struct x86_function *p, struct x86_reg src ) { DUMP_RR( dst, src ); + /* special hack for reading arguments until we support x86-64 registers everywhere */ + if(src.mod == mod_REG && dst.mod == mod_REG && (src.idx >= 8 || dst.idx >= 8)) + { + uint8_t rex = 0x40; + if(dst.idx >= 8) + { + rex |= 4; + dst.idx -= 8; + } + if(src.idx >= 8) + { + rex |= 1; + src.idx -= 8; + } + emit_1ub(p, rex); + } + emit_op_modrm( p, 0x8b, 0x89, dst, src ); +} + +void x86_mov16( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_1ub(p, 0x66); + emit_op_modrm( p, 0x8b, 0x89, dst, src ); +} + +void x86_mov8( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_op_modrm( p, 0x8a, 0x88, dst, src ); +} + +void x64_mov64( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + uint8_t rex = 0x48; + DUMP_RR( dst, src ); + assert(x86_target(p) != X86_32); + + /* special hack for reading arguments until we support x86-64 registers everywhere */ + if(src.mod == mod_REG && dst.mod == mod_REG && (src.idx >= 8 || dst.idx >= 8)) + { + if(dst.idx >= 8) + { + rex |= 4; + dst.idx -= 8; + } + if(src.idx >= 8) + { + rex |= 1; + src.idx -= 8; + } + } + emit_1ub(p, rex); emit_op_modrm( p, 0x8b, 0x89, dst, src ); } +void x86_movzx8(struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_2ub(p, 0x0f, 0xb6); + emit_modrm(p, dst, src); +} + +void x86_movzx16(struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_2ub(p, 0x0f, 0xb7); + emit_modrm(p, dst, src); +} + void x86_xor( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) @@ -680,6 +820,61 @@ void x86_div( struct x86_function *p, emit_op_modrm(p, 0xf7, 0, x86_make_reg(file_REG32, 6), src); } +void x86_bswap( struct x86_function *p, struct x86_reg reg ) +{ + DUMP_R(reg); + assert(reg.file == file_REG32); + assert(reg.mod == mod_REG); + emit_2ub(p, 0x0f, 0xc8 + reg.idx); +} + +void x86_shr_imm( struct x86_function *p, struct x86_reg reg, unsigned imm ) +{ + DUMP_RI(reg, imm); + if(imm == 1) + { + emit_1ub(p, 0xd1); + emit_modrm_noreg(p, 5, reg); + } + else + { + emit_1ub(p, 0xc1); + emit_modrm_noreg(p, 5, reg); + emit_1ub(p, imm); + } +} + +void x86_sar_imm( struct x86_function *p, struct x86_reg reg, unsigned imm ) +{ + DUMP_RI(reg, imm); + if(imm == 1) + { + emit_1ub(p, 0xd1); + emit_modrm_noreg(p, 7, reg); + } + else + { + emit_1ub(p, 0xc1); + emit_modrm_noreg(p, 7, reg); + emit_1ub(p, imm); + } +} + +void x86_shl_imm( struct x86_function *p, struct x86_reg reg, unsigned imm ) +{ + DUMP_RI(reg, imm); + if(imm == 1) + { + emit_1ub(p, 0xd1); + emit_modrm_noreg(p, 4, reg); + } + else + { + emit_1ub(p, 0xc1); + emit_modrm_noreg(p, 4, reg); + emit_1ub(p, imm); + } +} /*********************************************************************** @@ -1013,6 +1208,77 @@ void sse_movmskps( struct x86_function *p, * SSE2 instructions */ +void sse2_movd( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR(dst, src); + emit_2ub(p, 0x66, 0x0f); + if(dst.mod == mod_REG && dst.file == file_REG32) + { + emit_1ub(p, 0x7e); + emit_modrm(p, src, dst); + } + else + { + emit_op_modrm(p, 0x6e, 0x7e, dst, src); + } +} + +void sse2_movq( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR(dst, src); + switch (dst.mod) { + case mod_REG: + emit_3ub(p, 0xf3, 0x0f, 0x7e); + emit_modrm(p, dst, src); + break; + case mod_INDIRECT: + case mod_DISP32: + case mod_DISP8: + assert(src.mod == mod_REG); + emit_3ub(p, 0x66, 0x0f, 0xd6); + emit_modrm(p, src, dst); + break; + default: + assert(0); + break; + } +} + +void sse2_movdqu( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR(dst, src); + emit_2ub(p, 0xf3, 0x0f); + emit_op_modrm(p, 0x6f, 0x7f, dst, src); +} + +void sse2_movdqa( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR(dst, src); + emit_2ub(p, 0x66, 0x0f); + emit_op_modrm(p, 0x6f, 0x7f, dst, src); +} + +void sse2_movsd( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR(dst, src); + emit_2ub(p, 0xf2, 0x0f); + emit_op_modrm(p, 0x10, 0x11, dst, src); +} + +void sse2_movupd( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR(dst, src); + emit_2ub(p, 0x66, 0x0f); + emit_op_modrm(p, 0x10, 0x11, dst, src); +} + +void sse2_movapd( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR(dst, src); + emit_2ub(p, 0x66, 0x0f); + emit_op_modrm(p, 0x28, 0x29, dst, src); +} + /** * Perform a reduced swizzle: */ @@ -1027,6 +1293,28 @@ void sse2_pshufd( struct x86_function *p, emit_1ub(p, shuf); } +void sse2_pshuflw( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src, + unsigned char shuf) +{ + DUMP_RRI( dst, src, shuf ); + emit_3ub(p, 0xf2, X86_TWOB, 0x70); + emit_modrm(p, dst, src); + emit_1ub(p, shuf); +} + +void sse2_pshufhw( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src, + unsigned char shuf) +{ + DUMP_RRI( dst, src, shuf ); + emit_3ub(p, 0xf3, X86_TWOB, 0x70); + emit_modrm(p, dst, src); + emit_1ub(p, shuf); +} + void sse2_cvttps2dq( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) @@ -1045,6 +1333,24 @@ void sse2_cvtps2dq( struct x86_function *p, emit_modrm( p, dst, src ); } +void sse2_cvtsd2ss( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_3ub(p, 0xf2, 0x0f, 0x5a); + emit_modrm( p, dst, src ); +} + +void sse2_cvtpd2ps( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_3ub(p, 0x66, 0x0f, 0x5a); + emit_modrm( p, dst, src ); +} + void sse2_packssdw( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) @@ -1081,6 +1387,97 @@ void sse2_punpcklbw( struct x86_function *p, emit_modrm( p, dst, src ); } +void sse2_punpcklwd( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_3ub(p, 0x66, 0x0f, 0x61); + emit_modrm( p, dst, src ); +} + +void sse2_punpckldq( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_3ub(p, 0x66, 0x0f, 0x62); + emit_modrm( p, dst, src ); +} + +void sse2_punpcklqdq( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_3ub(p, 0x66, 0x0f, 0x6c); + emit_modrm( p, dst, src ); +} + +void sse2_psllw_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ) +{ + DUMP_RI(dst, imm); + emit_3ub(p, 0x66, 0x0f, 0x71); + emit_modrm_noreg(p, 6, dst); + emit_1ub(p, imm); +} + +void sse2_pslld_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ) +{ + DUMP_RI(dst, imm); + emit_3ub(p, 0x66, 0x0f, 0x72); + emit_modrm_noreg(p, 6, dst); + emit_1ub(p, imm); +} + +void sse2_psllq_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ) +{ + DUMP_RI(dst, imm); + emit_3ub(p, 0x66, 0x0f, 0x73); + emit_modrm_noreg(p, 6, dst); + emit_1ub(p, imm); +} + +void sse2_psrlw_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ) +{ + DUMP_RI(dst, imm); + emit_3ub(p, 0x66, 0x0f, 0x71); + emit_modrm_noreg(p, 2, dst); + emit_1ub(p, imm); +} + +void sse2_psrld_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ) +{ + DUMP_RI(dst, imm); + emit_3ub(p, 0x66, 0x0f, 0x72); + emit_modrm_noreg(p, 2, dst); + emit_1ub(p, imm); +} + +void sse2_psrlq_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ) +{ + DUMP_RI(dst, imm); + emit_3ub(p, 0x66, 0x0f, 0x73); + emit_modrm_noreg(p, 2, dst); + emit_1ub(p, imm); +} + +void sse2_psraw_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ) +{ + DUMP_RI(dst, imm); + emit_3ub(p, 0x66, 0x0f, 0x71); + emit_modrm_noreg(p, 4, dst); + emit_1ub(p, imm); +} + +void sse2_psrad_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ) +{ + DUMP_RI(dst, imm); + emit_3ub(p, 0x66, 0x0f, 0x72); + emit_modrm_noreg(p, 4, dst); + emit_1ub(p, imm); +} + +void sse2_por( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR(dst, src); + emit_3ub(p, 0x66, 0x0f, 0xeb); + emit_modrm(p, dst, src); +} void sse2_rcpps( struct x86_function *p, struct x86_reg dst, @@ -1100,18 +1497,6 @@ void sse2_rcpss( struct x86_function *p, emit_modrm( p, dst, src ); } -void sse2_movd( struct x86_function *p, - struct x86_reg dst, - struct x86_reg src ) -{ - DUMP_RR( dst, src ); - emit_2ub(p, 0x66, X86_TWOB); - emit_op_modrm( p, 0x6e, 0x7e, dst, src ); -} - - - - /*********************************************************************** * x87 instructions */ @@ -1702,23 +2087,80 @@ void x86_cdecl_caller_pop_regs( struct x86_function *p ) } -/* Retreive a reference to one of the function arguments, taking into - * account any push/pop activity: - */ struct x86_reg x86_fn_arg( struct x86_function *p, - unsigned arg ) + unsigned arg ) { - return x86_make_disp(x86_make_reg(file_REG32, reg_SP), + switch(x86_target(p)) + { + case X86_64_WIN64_ABI: + /* Microsoft uses a different calling convention than the rest of the world */ + switch(arg) + { + case 1: + return x86_make_reg(file_REG32, reg_CX); + case 2: + return x86_make_reg(file_REG32, reg_DX); + case 3: + return x86_make_reg(file_REG32, reg_R8); + case 4: + return x86_make_reg(file_REG32, reg_R9); + default: + /* Win64 allocates stack slots as if it pushed the first 4 arguments too */ + return x86_make_disp(x86_make_reg(file_REG32, reg_SP), + p->stack_offset + arg * 8); + } + case X86_64_STD_ABI: + switch(arg) + { + case 1: + return x86_make_reg(file_REG32, reg_DI); + case 2: + return x86_make_reg(file_REG32, reg_SI); + case 3: + return x86_make_reg(file_REG32, reg_DX); + case 4: + return x86_make_reg(file_REG32, reg_CX); + case 5: + return x86_make_reg(file_REG32, reg_R8); + case 6: + return x86_make_reg(file_REG32, reg_R9); + default: + return x86_make_disp(x86_make_reg(file_REG32, reg_SP), + p->stack_offset + (arg - 6) * 8); /* ??? */ + } + case X86_32: + return x86_make_disp(x86_make_reg(file_REG32, reg_SP), p->stack_offset + arg * 4); /* ??? */ + default: + abort(); + } } +static void x86_init_func_common( struct x86_function *p ) +{ + util_cpu_detect(); + p->caps = 0; + if(util_cpu_caps.has_mmx) + p->caps |= X86_MMX; + if(util_cpu_caps.has_mmx2) + p->caps |= X86_MMX2; + if(util_cpu_caps.has_sse) + p->caps |= X86_SSE; + if(util_cpu_caps.has_sse2) + p->caps |= X86_SSE2; + if(util_cpu_caps.has_sse3) + p->caps |= X86_SSE3; + if(util_cpu_caps.has_sse4_1) + p->caps |= X86_SSE4_1; + p->csr = p->store; + DUMP_START(); +} void x86_init_func( struct x86_function *p ) { p->size = 0; p->store = NULL; - p->csr = p->store; - DUMP_START(); + x86_init_func_common(p); } void x86_init_func_size( struct x86_function *p, unsigned code_size ) @@ -1728,8 +2170,7 @@ void x86_init_func_size( struct x86_function *p, unsigned code_size ) if (p->store == NULL) { p->store = p->error_overflow; } - p->csr = p->store; - DUMP_START(); + x86_init_func_common(p); } void x86_release_func( struct x86_function *p ) diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h index 6208e8f707..2b9678b176 100644 --- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h +++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h @@ -24,22 +24,31 @@ #ifndef _RTASM_X86SSE_H_ #define _RTASM_X86SSE_H_ +#include "pipe/p_compiler.h" #include "pipe/p_config.h" -#if defined(PIPE_ARCH_X86) +#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) /* It is up to the caller to ensure that instructions issued are * suitable for the host cpu. There are no checks made in this module * for mmx/sse/sse2 support on the cpu. */ struct x86_reg { - unsigned file:3; - unsigned idx:3; + unsigned file:2; + unsigned idx:4; unsigned mod:2; /* mod_REG if this is just a register */ int disp:24; /* only +/- 23bits of offset - should be enough... */ }; +#define X86_MMX 1 +#define X86_MMX2 2 +#define X86_SSE 4 +#define X86_SSE2 8 +#define X86_SSE3 0x10 +#define X86_SSE4_1 0x20 + struct x86_function { + unsigned caps; unsigned size; unsigned char *store; unsigned char *csr; @@ -75,7 +84,15 @@ enum x86_reg_name { reg_SP, reg_BP, reg_SI, - reg_DI + reg_DI, + reg_R8, + reg_R9, + reg_R10, + reg_R11, + reg_R12, + reg_R13, + reg_R14, + reg_R15 }; @@ -110,6 +127,29 @@ typedef void (*x86_func)(void); /* Begin/end/retrieve function creation: */ +enum x86_target +{ + X86_32, + X86_64_STD_ABI, + X86_64_WIN64_ABI +}; + +/* make this read a member of x86_function if target != host is desired */ +static INLINE enum x86_target x86_target( struct x86_function* p ) +{ +#ifdef PIPE_ARCH_X86 + return X86_32; +#elif defined(_WIN64) + return X86_64_WIN64_ABI; +#elif defined(PIPE_ARCH_X86_64) + return X86_64_STD_ABI; +#endif +} + +static INLINE unsigned x86_target_caps( struct x86_function* p ) +{ + return p->caps; +} void x86_init_func( struct x86_function *p ); void x86_init_func_size( struct x86_function *p, unsigned code_size ); @@ -138,6 +178,8 @@ struct x86_reg x86_get_base_reg( struct x86_reg reg ); */ int x86_get_label( struct x86_function *p ); +void x64_rexw(struct x86_function *p); + void x86_jcc( struct x86_function *p, enum x86_cc cc, int label ); @@ -178,18 +220,54 @@ void mmx_movq( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void mmx_packssdw( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void mmx_packuswb( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_movd( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_movq( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_movdqu( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_movdqa( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_movsd( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_movupd( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_movapd( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); + void sse2_cvtps2dq( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void sse2_cvttps2dq( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void sse2_cvtdq2ps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_cvtsd2ss( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_cvtpd2ps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); + void sse2_movd( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void sse2_packssdw( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void sse2_packsswb( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void sse2_packuswb( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void sse2_pshufd( struct x86_function *p, struct x86_reg dest, struct x86_reg arg0, unsigned char shuf ); +void sse2_pshuflw( struct x86_function *p, struct x86_reg dest, struct x86_reg arg0, + unsigned char shuf ); +void sse2_pshufhw( struct x86_function *p, struct x86_reg dest, struct x86_reg arg0, + unsigned char shuf ); void sse2_rcpps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void sse2_rcpss( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_punpcklbw( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_punpcklwd( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_punpckldq( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_punpcklqdq( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); + +void sse2_psllw_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ); +void sse2_pslld_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ); +void sse2_psllq_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ); + +void sse2_psrlw_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ); +void sse2_psrld_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ); +void sse2_psrlq_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ); + +void sse2_psraw_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ); +void sse2_psrad_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ); + +void sse2_por( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); + +void sse2_pshuflw( struct x86_function *p, struct x86_reg dst, struct x86_reg src, uint8_t imm ); +void sse2_pshufhw( struct x86_function *p, struct x86_reg dst, struct x86_reg src, uint8_t imm ); +void sse2_pshufd( struct x86_function *p, struct x86_reg dst, struct x86_reg src, uint8_t imm ); void sse_prefetchnta( struct x86_function *p, struct x86_reg ptr); void sse_prefetch0( struct x86_function *p, struct x86_reg ptr); @@ -227,7 +305,6 @@ void sse_shufps( struct x86_function *p, struct x86_reg dest, struct x86_reg arg void sse_unpckhps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void sse_unpcklps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void sse_pmovmskb( struct x86_function *p, struct x86_reg dest, struct x86_reg src ); -void sse2_punpcklbw( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void sse_movmskps( struct x86_function *p, struct x86_reg dst, struct x86_reg src); void x86_add( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); @@ -237,6 +314,14 @@ void x86_dec( struct x86_function *p, struct x86_reg reg ); void x86_inc( struct x86_function *p, struct x86_reg reg ); void x86_lea( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void x86_mov( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void x64_mov64( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void x86_mov8( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void x86_mov16( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void x86_movzx8(struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void x86_movzx16(struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void x86_mov_imm(struct x86_function *p, struct x86_reg dst, int imm ); +void x86_mov8_imm(struct x86_function *p, struct x86_reg dst, uint8_t imm ); +void x86_mov16_imm(struct x86_function *p, struct x86_reg dst, uint16_t imm ); void x86_mul( struct x86_function *p, struct x86_reg src ); void x86_imul( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void x86_or( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); @@ -250,7 +335,10 @@ void x86_test( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void x86_xor( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void x86_sahf( struct x86_function *p ); void x86_div( struct x86_function *p, struct x86_reg src ); - +void x86_bswap( struct x86_function *p, struct x86_reg src ); +void x86_shr_imm( struct x86_function *p, struct x86_reg reg, unsigned imm ); +void x86_sar_imm( struct x86_function *p, struct x86_reg reg, unsigned imm ); +void x86_shl_imm( struct x86_function *p, struct x86_reg reg, unsigned imm ); void x86_cdecl_caller_push_regs( struct x86_function *p ); void x86_cdecl_caller_pop_regs( struct x86_function *p ); diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump.h b/src/gallium/auxiliary/tgsi/tgsi_dump.h index 4cd27317b3..dd78b36100 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_dump.h +++ b/src/gallium/auxiliary/tgsi/tgsi_dump.h @@ -28,6 +28,7 @@ #ifndef TGSI_DUMP_H #define TGSI_DUMP_H +#include "pipe/p_compiler.h" #include "pipe/p_shader_tokens.h" #if defined __cplusplus diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c index 298f3d0a8b..0757f05dfa 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_exec.c +++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c @@ -3239,6 +3239,8 @@ exec_instruction( if (mach->CallStackTop == 0) { /* returning from main() */ + mach->CondStackTop = 0; + mach->LoopStackTop = 0; *pc = -1; return; } @@ -3767,6 +3769,9 @@ tgsi_exec_machine_run( struct tgsi_exec_machine *mach ) } #endif + /* Strictly speaking, these assertions aren't really needed but they + * can potentially catch some bugs in the control flow code. + */ assert(mach->CondStackTop == 0); assert(mach->LoopStackTop == 0); assert(mach->ContStackTop == 0); diff --git a/src/gallium/auxiliary/tgsi/tgsi_info.h b/src/gallium/auxiliary/tgsi/tgsi_info.h index 50248884fd..1992d11bbe 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_info.h +++ b/src/gallium/auxiliary/tgsi/tgsi_info.h @@ -28,6 +28,7 @@ #ifndef TGSI_INFO_H #define TGSI_INFO_H +#include "pipe/p_compiler.h" #include "pipe/p_shader_tokens.h" #if defined __cplusplus diff --git a/src/gallium/auxiliary/tgsi/tgsi_parse.c b/src/gallium/auxiliary/tgsi/tgsi_parse.c index db9a342220..1891203abe 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_parse.c +++ b/src/gallium/auxiliary/tgsi/tgsi_parse.c @@ -282,17 +282,6 @@ tgsi_parse_token( } -unsigned -tgsi_num_tokens(const struct tgsi_token *tokens) -{ - struct tgsi_parse_context ctx; - if (tgsi_parse_init(&ctx, tokens) == TGSI_PARSE_OK) { - unsigned len = (ctx.FullHeader.Header.HeaderSize + - ctx.FullHeader.Header.BodySize); - return len; - } - return 0; -} /** @@ -319,3 +308,19 @@ tgsi_alloc_tokens(unsigned num_tokens) unsigned bytes = num_tokens * sizeof(struct tgsi_token); return (struct tgsi_token *) MALLOC(bytes); } + + +void +tgsi_dump_tokens(const struct tgsi_token *tokens) +{ + const unsigned *dwords = (const unsigned *)tokens; + int nr = tgsi_num_tokens(tokens); + int i; + + assert(sizeof(*tokens) == sizeof(unsigned)); + + debug_printf("const unsigned tokens[%d] = {\n", nr); + for (i = 0; i < nr; i++) + debug_printf("0x%08x,\n", dwords[i]); + debug_printf("};\n"); +} diff --git a/src/gallium/auxiliary/tgsi/tgsi_parse.h b/src/gallium/auxiliary/tgsi/tgsi_parse.h index 36de8807b4..d4df585176 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_parse.h +++ b/src/gallium/auxiliary/tgsi/tgsi_parse.h @@ -28,6 +28,7 @@ #ifndef TGSI_PARSE_H #define TGSI_PARSE_H +#include "pipe/p_compiler.h" #include "pipe/p_shader_tokens.h" #if defined __cplusplus @@ -132,8 +133,15 @@ void tgsi_parse_token( struct tgsi_parse_context *ctx ); -unsigned -tgsi_num_tokens(const struct tgsi_token *tokens); +static INLINE unsigned +tgsi_num_tokens(const struct tgsi_token *tokens) +{ + struct tgsi_header header = *(const struct tgsi_header *) tokens; + return header.HeaderSize + header.BodySize; +} + +void +tgsi_dump_tokens(const struct tgsi_token *tokens); struct tgsi_token * tgsi_dup_tokens(const struct tgsi_token *tokens); diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.h b/src/gallium/auxiliary/tgsi/tgsi_sse2.h index d81ee3d00e..00aa8b84fe 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_sse2.h +++ b/src/gallium/auxiliary/tgsi/tgsi_sse2.h @@ -32,9 +32,12 @@ extern "C" { #endif +#include "pipe/p_compiler.h" + +struct tgsi_exec_machine; +struct tgsi_interp_coef; struct tgsi_token; struct x86_function; -struct tgsi_interp_coef; unsigned tgsi_emit_sse2( diff --git a/src/gallium/auxiliary/translate/translate.c b/src/gallium/auxiliary/translate/translate.c index fe638e211f..73287b667d 100644 --- a/src/gallium/auxiliary/translate/translate.c +++ b/src/gallium/auxiliary/translate/translate.c @@ -38,7 +38,7 @@ struct translate *translate_create( const struct translate_key *key ) { struct translate *translate = NULL; -#if defined(PIPE_ARCH_X86) +#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) translate = translate_sse2_create( key ); if (translate) return translate; diff --git a/src/gallium/auxiliary/translate/translate.h b/src/gallium/auxiliary/translate/translate.h index eb6f2cc486..a75380228b 100644 --- a/src/gallium/auxiliary/translate/translate.h +++ b/src/gallium/auxiliary/translate/translate.h @@ -85,6 +85,18 @@ struct translate { unsigned instance_id, void *output_buffer); + void (PIPE_CDECL *run_elts16)( struct translate *, + const uint16_t *elts, + unsigned count, + unsigned instance_id, + void *output_buffer); + + void (PIPE_CDECL *run_elts8)( struct translate *, + const uint8_t *elts, + unsigned count, + unsigned instance_id, + void *output_buffer); + void (PIPE_CDECL *run)( struct translate *, unsigned start, unsigned count, diff --git a/src/gallium/auxiliary/translate/translate_generic.c b/src/gallium/auxiliary/translate/translate_generic.c index 42cfd763e9..ad809db720 100644 --- a/src/gallium/auxiliary/translate/translate_generic.c +++ b/src/gallium/auxiliary/translate/translate_generic.c @@ -64,6 +64,14 @@ struct translate_generic { unsigned input_stride; unsigned max_index; + /* this value is set to -1 if this is a normal element with output_format != input_format: + * in this case, u_format is used to do a full conversion + * + * this value is set to the format size in bytes if output_format == input_format or for 32-bit instance ids: + * in this case, memcpy is used to copy this amount of bytes + */ + int copy_size; + } attrib[PIPE_MAX_ATTRIBS]; unsigned nr_attrib; @@ -354,7 +362,65 @@ static emit_func get_emit_func( enum pipe_format format ) } } +static ALWAYS_INLINE void PIPE_CDECL generic_run_one( struct translate_generic *tg, + unsigned elt, + unsigned instance_id, + void *vert ) +{ + unsigned nr_attrs = tg->nr_attrib; + unsigned attr; + + for (attr = 0; attr < nr_attrs; attr++) { + float data[4]; + uint8_t *dst = (uint8_t *)vert + tg->attrib[attr].output_offset; + + if (tg->attrib[attr].type == TRANSLATE_ELEMENT_NORMAL) { + const uint8_t *src; + unsigned index; + int copy_size; + + if (tg->attrib[attr].instance_divisor) { + index = instance_id / tg->attrib[attr].instance_divisor; + } + else { + index = elt; + } + + /* clamp to void going out of bounds */ + index = MIN2(index, tg->attrib[attr].max_index); + src = tg->attrib[attr].input_ptr + + tg->attrib[attr].input_stride * index; + + copy_size = tg->attrib[attr].copy_size; + if(likely(copy_size >= 0)) + memcpy(dst, src, copy_size); + else + { + tg->attrib[attr].fetch( data, src, 0, 0 ); + + if (0) + debug_printf("Fetch linear attr %d from %p stride %d index %d: " + " %f, %f, %f, %f \n", + attr, + tg->attrib[attr].input_ptr, + tg->attrib[attr].input_stride, + index, + data[0], data[1],data[2], data[3]); + + tg->attrib[attr].emit( data, dst ); + } + } else { + if(likely(tg->attrib[attr].copy_size >= 0)) + memcpy(data, &instance_id, 4); + else + { + data[0] = (float)instance_id; + tg->attrib[attr].emit( data, dst ); + } + } + } +} /** * Fetch vertex attributes for 'count' vertices. @@ -367,62 +433,45 @@ static void PIPE_CDECL generic_run_elts( struct translate *translate, { struct translate_generic *tg = translate_generic(translate); char *vert = output_buffer; - unsigned nr_attrs = tg->nr_attrib; - unsigned attr; unsigned i; - /* loop over vertex attributes (vertex shader inputs) - */ for (i = 0; i < count; i++) { - const unsigned elt = *elts++; - - for (attr = 0; attr < nr_attrs; attr++) { - float data[4]; - char *dst = vert + tg->attrib[attr].output_offset; - - if (tg->attrib[attr].type == TRANSLATE_ELEMENT_NORMAL) { - const uint8_t *src; - unsigned index; - - if (tg->attrib[attr].instance_divisor) { - index = instance_id / tg->attrib[attr].instance_divisor; - } else { - index = elt; - } - - /* clamp to void going out of bounds */ - index = MIN2(index, tg->attrib[attr].max_index); - - src = tg->attrib[attr].input_ptr + - tg->attrib[attr].input_stride * index; - - tg->attrib[attr].fetch( data, src, 0, 0 ); - - if (0) - debug_printf("Fetch elt attr %d from %p stride %d div %u max %u index %d: " - " %f, %f, %f, %f \n", - attr, - tg->attrib[attr].input_ptr, - tg->attrib[attr].input_stride, - tg->attrib[attr].instance_divisor, - tg->attrib[attr].max_index, - index, - data[0], data[1],data[2], data[3]); - } else { - data[0] = (float)instance_id; - } + generic_run_one(tg, *elts++, instance_id, vert); + vert += tg->translate.key.output_stride; + } +} - if (0) - debug_printf("vert %d/%d attr %d: %f %f %f %f\n", - i, elt, attr, data[0], data[1], data[2], data[3]); +static void PIPE_CDECL generic_run_elts16( struct translate *translate, + const uint16_t *elts, + unsigned count, + unsigned instance_id, + void *output_buffer ) +{ + struct translate_generic *tg = translate_generic(translate); + char *vert = output_buffer; + unsigned i; - tg->attrib[attr].emit( data, dst ); - } + for (i = 0; i < count; i++) { + generic_run_one(tg, *elts++, instance_id, vert); vert += tg->translate.key.output_stride; } } +static void PIPE_CDECL generic_run_elts8( struct translate *translate, + const uint8_t *elts, + unsigned count, + unsigned instance_id, + void *output_buffer ) +{ + struct translate_generic *tg = translate_generic(translate); + char *vert = output_buffer; + unsigned i; + for (i = 0; i < count; i++) { + generic_run_one(tg, *elts++, instance_id, vert); + vert += tg->translate.key.output_stride; + } +} static void PIPE_CDECL generic_run( struct translate *translate, unsigned start, @@ -432,57 +481,10 @@ static void PIPE_CDECL generic_run( struct translate *translate, { struct translate_generic *tg = translate_generic(translate); char *vert = output_buffer; - unsigned nr_attrs = tg->nr_attrib; - unsigned attr; unsigned i; - /* loop over vertex attributes (vertex shader inputs) - */ for (i = 0; i < count; i++) { - unsigned elt = start + i; - - for (attr = 0; attr < nr_attrs; attr++) { - float data[4]; - char *dst = vert + tg->attrib[attr].output_offset; - - if (tg->attrib[attr].type == TRANSLATE_ELEMENT_NORMAL) { - const uint8_t *src; - unsigned index; - - if (tg->attrib[attr].instance_divisor) { - index = instance_id / tg->attrib[attr].instance_divisor; - } - else { - index = elt; - } - - /* clamp to void going out of bounds */ - index = MIN2(index, tg->attrib[attr].max_index); - - src = tg->attrib[attr].input_ptr + - tg->attrib[attr].input_stride * index; - - tg->attrib[attr].fetch( data, src, 0, 0 ); - - if (0) - debug_printf("Fetch linear attr %d from %p stride %d index %d: " - " %f, %f, %f, %f \n", - attr, - tg->attrib[attr].input_ptr, - tg->attrib[attr].input_stride, - index, - data[0], data[1],data[2], data[3]); - } else { - data[0] = (float)instance_id; - } - - if (0) - debug_printf("vert %d attr %d: %f %f %f %f\n", - i, attr, data[0], data[1], data[2], data[3]); - - tg->attrib[attr].emit( data, dst ); - } - + generic_run_one(tg, start + i, instance_id, vert); vert += tg->translate.key.output_stride; } } @@ -528,6 +530,8 @@ struct translate *translate_generic_create( const struct translate_key *key ) tg->translate.release = generic_release; tg->translate.set_buffer = generic_set_buffer; tg->translate.run_elts = generic_run_elts; + tg->translate.run_elts16 = generic_run_elts16; + tg->translate.run_elts8 = generic_run_elts8; tg->translate.run = generic_run; for (i = 0; i < key->nr_elements; i++) { @@ -544,9 +548,28 @@ struct translate *translate_generic_create( const struct translate_key *key ) tg->attrib[i].input_offset = key->element[i].input_offset; tg->attrib[i].instance_divisor = key->element[i].instance_divisor; - tg->attrib[i].emit = get_emit_func(key->element[i].output_format); tg->attrib[i].output_offset = key->element[i].output_offset; + tg->attrib[i].copy_size = -1; + if (tg->attrib[i].type == TRANSLATE_ELEMENT_INSTANCE_ID) + { + if(key->element[i].output_format == PIPE_FORMAT_R32_USCALED + || key->element[i].output_format == PIPE_FORMAT_R32_SSCALED) + tg->attrib[i].copy_size = 4; + } + else + { + if(key->element[i].input_format == key->element[i].output_format + && format_desc->block.width == 1 + && format_desc->block.height == 1 + && !(format_desc->block.bits & 7)) + tg->attrib[i].copy_size = format_desc->block.bits >> 3; + } + + if(tg->attrib[i].copy_size < 0) + tg->attrib[i].emit = get_emit_func(key->element[i].output_format); + else + tg->attrib[i].emit = NULL; } tg->nr_attrib = key->nr_elements; diff --git a/src/gallium/auxiliary/translate/translate_sse.c b/src/gallium/auxiliary/translate/translate_sse.c index ef3aa674a3..f8bf5b4669 100644 --- a/src/gallium/auxiliary/translate/translate_sse.c +++ b/src/gallium/auxiliary/translate/translate_sse.c @@ -30,11 +30,12 @@ #include "pipe/p_compiler.h" #include "util/u_memory.h" #include "util/u_math.h" +#include "util/u_format.h" #include "translate.h" -#if defined(PIPE_ARCH_X86) +#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) #include "rtasm/rtasm_cpu.h" #include "rtasm/rtasm_x86sse.h" @@ -46,21 +47,9 @@ #define W 3 -typedef void (PIPE_CDECL *run_func)( struct translate *translate, - unsigned start, - unsigned count, - unsigned instance_id, - void *output_buffer); - -typedef void (PIPE_CDECL *run_elts_func)( struct translate *translate, - const unsigned *elts, - unsigned count, - unsigned instance_id, - void *output_buffer); - struct translate_buffer { const void *base_ptr; - unsigned stride; + uintptr_t stride; unsigned max_index; }; @@ -73,21 +62,43 @@ struct translate_buffer_varient { #define ELEMENT_BUFFER_INSTANCE_ID 1001 +#define NUM_CONSTS 7 + +enum +{ + CONST_IDENTITY, + CONST_INV_127, + CONST_INV_255, + CONST_INV_32767, + CONST_INV_65535, + CONST_INV_2147483647, + CONST_255 +}; + +#define C(v) {(float)(v), (float)(v), (float)(v), (float)(v)} +static float consts[NUM_CONSTS][4] = { + {0, 0, 0, 1}, + C(1.0 / 127.0), + C(1.0 / 255.0), + C(1.0 / 32767.0), + C(1.0 / 65535.0), + C(1.0 / 2147483647.0), + C(255.0) +}; +#undef C struct translate_sse { struct translate translate; struct x86_function linear_func; struct x86_function elt_func; + struct x86_function elt16_func; + struct x86_function elt8_func; struct x86_function *func; - boolean loaded_identity; - boolean loaded_255; - boolean loaded_inv_255; - - float identity[4]; - float float_255[4]; - float inv_255[4]; + PIPE_ALIGN_VAR(16) float consts[NUM_CONSTS][4]; + int8_t reg_to_const[16]; + int8_t const_to_reg[NUM_CONSTS]; struct translate_buffer buffer[PIPE_MAX_ATTRIBS]; unsigned nr_buffers; @@ -102,17 +113,16 @@ struct translate_sse { boolean use_instancing; unsigned instance_id; - run_func gen_run; - run_elts_func gen_run_elts; - /* these are actually known values, but putting them in a struct * like this is helpful to keep them in sync across the file. */ struct x86_reg tmp_EAX; - struct x86_reg idx_EBX; /* either start+i or &elt[i] */ - struct x86_reg outbuf_ECX; - struct x86_reg machine_EDX; - struct x86_reg count_ESI; /* decrements to zero */ + struct x86_reg tmp2_EDX; + struct x86_reg src_ECX; + struct x86_reg idx_ESI; /* either start+i or &elt[i] */ + struct x86_reg machine_EDI; + struct x86_reg outbuf_EBX; + struct x86_reg count_EBP; /* decrements to zero */ }; static int get_offset( const void *a, const void *b ) @@ -120,281 +130,950 @@ static int get_offset( const void *a, const void *b ) return (const char *)b - (const char *)a; } +static struct x86_reg get_const( struct translate_sse *p, unsigned id) +{ + struct x86_reg reg; + unsigned i; + if(p->const_to_reg[id] >= 0) + return x86_make_reg(file_XMM, p->const_to_reg[id]); -static struct x86_reg get_identity( struct translate_sse *p ) -{ - struct x86_reg reg = x86_make_reg(file_XMM, 6); - - if (!p->loaded_identity) { - p->loaded_identity = TRUE; - p->identity[0] = 0; - p->identity[1] = 0; - p->identity[2] = 0; - p->identity[3] = 1; - - sse_movups(p->func, reg, - x86_make_disp(p->machine_EDX, - get_offset(p, &p->identity[0]))); + for(i = 2; i < 8; ++i) + { + if(p->reg_to_const[i] < 0) + break; } + /* TODO: be smarter here */ + if(i == 8) + --i; + + reg = x86_make_reg(file_XMM, i); + + if(p->reg_to_const[i] >= 0) + p->const_to_reg[p->reg_to_const[i]] = -1; + + p->reg_to_const[i] = id; + p->const_to_reg[id] = i; + + /* TODO: this should happen outside the loop, if possible */ + sse_movaps(p->func, reg, + x86_make_disp(p->machine_EDI, + get_offset(p, &p->consts[id][0]))); + return reg; } -static struct x86_reg get_255( struct translate_sse *p ) +/* load the data in a SSE2 register, padding with zeros */ +static boolean emit_load_sse2( struct translate_sse *p, + struct x86_reg data, + struct x86_reg src, + unsigned size) { - struct x86_reg reg = x86_make_reg(file_XMM, 7); - - if (!p->loaded_255) { - p->loaded_255 = TRUE; - p->float_255[0] = - p->float_255[1] = - p->float_255[2] = - p->float_255[3] = 255.0f; - - sse_movups(p->func, reg, - x86_make_disp(p->machine_EDX, - get_offset(p, &p->float_255[0]))); + struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1); + struct x86_reg tmp = p->tmp_EAX; + switch(size) + { + case 1: + x86_movzx8(p->func, tmp, src); + sse2_movd(p->func, data, tmp); + break; + case 2: + x86_movzx16(p->func, tmp, src); + sse2_movd(p->func, data, tmp); + break; + case 3: + x86_movzx8(p->func, tmp, x86_make_disp(src, 2)); + x86_shl_imm(p->func, tmp, 16); + x86_mov16(p->func, tmp, src); + sse2_movd(p->func, data, tmp); + break; + case 4: + sse2_movd(p->func, data, src); + break; + case 6: + sse2_movd(p->func, data, src); + x86_movzx16(p->func, tmp, x86_make_disp(src, 4)); + sse2_movd(p->func, tmpXMM, tmp); + sse2_punpckldq(p->func, data, tmpXMM); + break; + case 8: + sse2_movq(p->func, data, src); + break; + case 12: + sse2_movq(p->func, data, src); + sse2_movd(p->func, tmpXMM, x86_make_disp(src, 8)); + sse2_punpcklqdq(p->func, data, tmpXMM); + break; + case 16: + sse2_movdqu(p->func, data, src); + break; + default: + return FALSE; } - - return reg; + return TRUE; } -static struct x86_reg get_inv_255( struct translate_sse *p ) +/* this value can be passed for the out_chans argument */ +#define CHANNELS_0001 5 + +/* this function will load #chans float values, and will + * pad the register with zeroes at least up to out_chans. + * + * If out_chans is set to CHANNELS_0001, then the fourth + * value will be padded with 1. Only pass this value if + * chans < 4 or results are undefined. + */ +static void emit_load_float32( struct translate_sse *p, + struct x86_reg data, + struct x86_reg arg0, + unsigned out_chans, + unsigned chans) { - struct x86_reg reg = x86_make_reg(file_XMM, 5); - - if (!p->loaded_inv_255) { - p->loaded_inv_255 = TRUE; - p->inv_255[0] = - p->inv_255[1] = - p->inv_255[2] = - p->inv_255[3] = 1.0f / 255.0f; - - sse_movups(p->func, reg, - x86_make_disp(p->machine_EDX, - get_offset(p, &p->inv_255[0]))); + switch(chans) + { + case 1: + /* a 0 0 0 + * a 0 0 1 + */ + sse_movss(p->func, data, arg0); + if(out_chans == CHANNELS_0001) + sse_orps(p->func, data, get_const(p, CONST_IDENTITY) ); + break; + case 2: + /* 0 0 0 1 + * a b 0 1 + */ + if(out_chans == CHANNELS_0001) + sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X, Y, Z, W) ); + else if(out_chans > 2) + sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY) ); + sse_movlps(p->func, data, arg0); + break; + case 3: + /* Have to jump through some hoops: + * + * c 0 0 0 + * c 0 0 1 if out_chans == CHANNELS_0001 + * 0 0 c 0/1 + * a b c 0/1 + */ + sse_movss(p->func, data, x86_make_disp(arg0, 8)); + if(out_chans == CHANNELS_0001) + sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X,Y,Z,W) ); + sse_shufps(p->func, data, data, SHUF(Y,Z,X,W) ); + sse_movlps(p->func, data, arg0); + break; + case 4: + sse_movups(p->func, data, arg0); + break; } - - return reg; } +/* this function behaves like emit_load_float32, but loads + 64-bit floating point numbers, converting them to 32-bit + ones */ +static void emit_load_float64to32( struct translate_sse *p, + struct x86_reg data, + struct x86_reg arg0, + unsigned out_chans, + unsigned chans) +{ + struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1); + switch(chans) + { + case 1: + sse2_movsd(p->func, data, arg0); + if(out_chans > 1) + sse2_cvtpd2ps(p->func, data, data); + else + sse2_cvtsd2ss(p->func, data, data); + if(out_chans == CHANNELS_0001) + sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X, Y, Z, W) ); + break; + case 2: + sse2_movupd(p->func, data, arg0); + sse2_cvtpd2ps(p->func, data, data); + if(out_chans == CHANNELS_0001) + sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X, Y, Z, W) ); + else if(out_chans > 2) + sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY) ); + break; + case 3: + sse2_movupd(p->func, data, arg0); + sse2_cvtpd2ps(p->func, data, data); + sse2_movsd(p->func, tmpXMM, x86_make_disp(arg0, 16)); + if(out_chans > 3) + sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM); + else + sse2_cvtsd2ss(p->func, tmpXMM, tmpXMM); + sse_movlhps(p->func, data, tmpXMM); + if(out_chans == CHANNELS_0001) + sse_orps(p->func, data, get_const(p, CONST_IDENTITY) ); + break; + case 4: + sse2_movupd(p->func, data, arg0); + sse2_cvtpd2ps(p->func, data, data); + sse2_movupd(p->func, tmpXMM, x86_make_disp(arg0, 16)); + sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM); + sse_movlhps(p->func, data, tmpXMM); + break; + } +} -static void emit_load_R32G32B32A32( struct translate_sse *p, - struct x86_reg data, - struct x86_reg arg0 ) +static void emit_mov64(struct translate_sse *p, struct x86_reg dst_gpr, struct x86_reg dst_xmm, struct x86_reg src_gpr, struct x86_reg src_xmm) { - sse_movups(p->func, data, arg0); + if(x86_target(p->func) != X86_32) + x64_mov64(p->func, dst_gpr, src_gpr); + else + { + /* TODO: when/on which CPUs is SSE2 actually better than SSE? */ + if(x86_target_caps(p->func) & X86_SSE2) + sse2_movq(p->func, dst_xmm, src_xmm); + else + sse_movlps(p->func, dst_xmm, src_xmm); + } } -static void emit_load_R32G32B32( struct translate_sse *p, - struct x86_reg data, - struct x86_reg arg0 ) +static void emit_load64(struct translate_sse *p, struct x86_reg dst_gpr, struct x86_reg dst_xmm, struct x86_reg src) { - /* Have to jump through some hoops: - * - * c 0 0 0 - * c 0 0 1 - * 0 0 c 1 - * a b c 1 - */ - sse_movss(p->func, data, x86_make_disp(arg0, 8)); - sse_shufps(p->func, data, get_identity(p), SHUF(X,Y,Z,W) ); - sse_shufps(p->func, data, data, SHUF(Y,Z,X,W) ); - sse_movlps(p->func, data, arg0); + emit_mov64(p, dst_gpr, dst_xmm, src, src); } -static void emit_load_R32G32( struct translate_sse *p, - struct x86_reg data, - struct x86_reg arg0 ) +static void emit_store64(struct translate_sse *p, struct x86_reg dst, struct x86_reg src_gpr, struct x86_reg src_xmm) { - /* 0 0 0 1 - * a b 0 1 - */ - sse_movups(p->func, data, get_identity(p) ); - sse_movlps(p->func, data, arg0); + emit_mov64(p, dst, dst, src_gpr, src_xmm); } +static void emit_mov128(struct translate_sse *p, struct x86_reg dst, struct x86_reg src) +{ + if(x86_target_caps(p->func) & X86_SSE2) + sse2_movdqu(p->func, dst, src); + else + sse_movups(p->func, dst, src); +} -static void emit_load_R32( struct translate_sse *p, - struct x86_reg data, - struct x86_reg arg0 ) +/* TODO: this uses unaligned accesses liberally, which is great on Nehalem, + * but may or may not be good on older processors + * TODO: may perhaps want to use non-temporal stores here if possible + */ +static void emit_memcpy(struct translate_sse *p, struct x86_reg dst, struct x86_reg src, unsigned size) { - /* a 0 0 0 - * a 0 0 1 - */ - sse_movss(p->func, data, arg0); - sse_orps(p->func, data, get_identity(p) ); + struct x86_reg dataXMM = x86_make_reg(file_XMM, 0); + struct x86_reg dataXMM2 = x86_make_reg(file_XMM, 1); + struct x86_reg dataGPR = p->tmp_EAX; + struct x86_reg dataGPR2 = p->tmp2_EDX; + + if(size < 8) + { + switch (size) + { + case 1: + x86_mov8(p->func, dataGPR, src); + x86_mov8(p->func, dst, dataGPR); + break; + case 2: + x86_mov16(p->func, dataGPR, src); + x86_mov16(p->func, dst, dataGPR); + break; + case 3: + x86_mov16(p->func, dataGPR, src); + x86_mov8(p->func, dataGPR2, x86_make_disp(src, 2)); + x86_mov16(p->func, dst, dataGPR); + x86_mov8(p->func, x86_make_disp(dst, 2), dataGPR2); + break; + case 4: + x86_mov(p->func, dataGPR, src); + x86_mov(p->func, dst, dataGPR); + break; + case 6: + x86_mov(p->func, dataGPR, src); + x86_mov16(p->func, dataGPR2, x86_make_disp(src, 4)); + x86_mov(p->func, dst, dataGPR); + x86_mov16(p->func, x86_make_disp(dst, 4), dataGPR2); + break; + } + } + else if(!(x86_target_caps(p->func) & X86_SSE)) + { + unsigned i = 0; + assert((size & 3) == 0); + for(i = 0; i < size; i += 4) + { + x86_mov(p->func, dataGPR, x86_make_disp(src, i)); + x86_mov(p->func, x86_make_disp(dst, i), dataGPR); + } + } + else + { + switch(size) + { + case 8: + emit_load64(p, dataGPR, dataXMM, src); + emit_store64(p, dst, dataGPR, dataXMM); + break; + case 12: + emit_load64(p, dataGPR2, dataXMM, src); + x86_mov(p->func, dataGPR, x86_make_disp(src, 8)); + emit_store64(p, dst, dataGPR2, dataXMM); + x86_mov(p->func, x86_make_disp(dst, 8), dataGPR); + break; + case 16: + emit_mov128(p, dataXMM, src); + emit_mov128(p, dst, dataXMM); + break; + case 24: + emit_mov128(p, dataXMM, src); + emit_load64(p, dataGPR, dataXMM2, x86_make_disp(src, 16)); + emit_mov128(p, dst, dataXMM); + emit_store64(p, x86_make_disp(dst, 16), dataGPR, dataXMM2); + break; + case 32: + emit_mov128(p, dataXMM, src); + emit_mov128(p, dataXMM2, x86_make_disp(src, 16)); + emit_mov128(p, dst, dataXMM); + emit_mov128(p, x86_make_disp(dst, 16), dataXMM2); + break; + default: + assert(0); + } + } } +static boolean translate_attr_convert( struct translate_sse *p, + const struct translate_element *a, + struct x86_reg src, + struct x86_reg dst) -static void emit_load_R8G8B8A8_UNORM( struct translate_sse *p, - struct x86_reg data, - struct x86_reg src ) { + const struct util_format_description* input_desc = util_format_description(a->input_format); + const struct util_format_description* output_desc = util_format_description(a->output_format); + unsigned i; + boolean id_swizzle = TRUE; + unsigned swizzle[4] = {UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE}; + unsigned needed_chans = 0; + unsigned imms[2] = {0, 0x3f800000}; - /* Load and unpack twice: - */ - sse_movss(p->func, data, src); - sse2_punpcklbw(p->func, data, get_identity(p)); - sse2_punpcklbw(p->func, data, get_identity(p)); + if(a->output_format == PIPE_FORMAT_NONE || a->input_format == PIPE_FORMAT_NONE) + return FALSE; - /* Convert to float: - */ - sse2_cvtdq2ps(p->func, data, data); + if(input_desc->channel[0].size & 7) + return FALSE; + if(input_desc->colorspace != output_desc->colorspace) + return FALSE; - /* Scale by 1/255.0 - */ - sse_mulps(p->func, data, get_inv_255(p)); -} + for(i = 1; i < input_desc->nr_channels; ++i) + { + if(memcmp(&input_desc->channel[i], &input_desc->channel[0], sizeof(input_desc->channel[0]))) + return FALSE; + } + for(i = 1; i < output_desc->nr_channels; ++i) + { + if(memcmp(&output_desc->channel[i], &output_desc->channel[0], sizeof(output_desc->channel[0]))) + return FALSE; + } + for(i = 0; i < output_desc->nr_channels; ++i) + { + if(output_desc->swizzle[i] < 4) + swizzle[output_desc->swizzle[i]] = input_desc->swizzle[i]; + } + if((x86_target_caps(p->func) & X86_SSE) && (0 + || a->output_format == PIPE_FORMAT_R32_FLOAT + || a->output_format == PIPE_FORMAT_R32G32_FLOAT + || a->output_format == PIPE_FORMAT_R32G32B32_FLOAT + || a->output_format == PIPE_FORMAT_R32G32B32A32_FLOAT)) + { + struct x86_reg dataXMM = x86_make_reg(file_XMM, 0); -static void emit_store_R32G32B32A32( struct translate_sse *p, - struct x86_reg dest, - struct x86_reg dataXMM ) -{ - sse_movups(p->func, dest, dataXMM); -} + for(i = 0; i < output_desc->nr_channels; ++i) + { + if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0 && i >= input_desc->nr_channels) + swizzle[i] = i; + } -static void emit_store_R32G32B32( struct translate_sse *p, - struct x86_reg dest, - struct x86_reg dataXMM ) -{ - /* Emit two, shuffle, emit one. - */ - sse_movlps(p->func, dest, dataXMM); - sse_shufps(p->func, dataXMM, dataXMM, SHUF(Z,Z,Z,Z) ); /* NOTE! destructive */ - sse_movss(p->func, x86_make_disp(dest,8), dataXMM); -} + for(i = 0; i < output_desc->nr_channels; ++i) + { + if(swizzle[i] < 4) + needed_chans = MAX2(needed_chans, swizzle[i] + 1); + if(swizzle[i] < UTIL_FORMAT_SWIZZLE_0 && swizzle[i] != i) + id_swizzle = FALSE; + } -static void emit_store_R32G32( struct translate_sse *p, - struct x86_reg dest, - struct x86_reg dataXMM ) -{ - sse_movlps(p->func, dest, dataXMM); -} + if(needed_chans > 0) + { + switch(input_desc->channel[0].type) + { + case UTIL_FORMAT_TYPE_UNSIGNED: + if(!(x86_target_caps(p->func) & X86_SSE2)) + return FALSE; + emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3); + + /* TODO: add support for SSE4.1 pmovzx */ + switch(input_desc->channel[0].size) + { + case 8: + /* TODO: this may be inefficient due to get_identity() being used both as a float and integer register */ + sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY)); + sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY)); + break; + case 16: + sse2_punpcklwd(p->func, dataXMM, get_const(p, CONST_IDENTITY)); + break; + case 32: /* we lose precision here */ + sse2_psrld_imm(p->func, dataXMM, 1); + break; + default: + return FALSE; + } + sse2_cvtdq2ps(p->func, dataXMM, dataXMM); + if(input_desc->channel[0].normalized) + { + struct x86_reg factor; + switch(input_desc->channel[0].size) + { + case 8: + factor = get_const(p, CONST_INV_255); + break; + case 16: + factor = get_const(p, CONST_INV_65535); + break; + case 32: + factor = get_const(p, CONST_INV_2147483647); + break; + default: + assert(0); + factor.disp = 0; + factor.file = 0; + factor.idx = 0; + factor.mod = 0; + break; + } + sse_mulps(p->func, dataXMM, factor); + } + else if(input_desc->channel[0].size == 32) + sse_addps(p->func, dataXMM, dataXMM); /* compensate for the bit we threw away to fit u32 into s32 */ + break; + case UTIL_FORMAT_TYPE_SIGNED: + if(!(x86_target_caps(p->func) & X86_SSE2)) + return FALSE; + emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3); + + /* TODO: add support for SSE4.1 pmovsx */ + switch(input_desc->channel[0].size) + { + case 8: + sse2_punpcklbw(p->func, dataXMM, dataXMM); + sse2_punpcklbw(p->func, dataXMM, dataXMM); + sse2_psrad_imm(p->func, dataXMM, 24); + break; + case 16: + sse2_punpcklwd(p->func, dataXMM, dataXMM); + sse2_psrad_imm(p->func, dataXMM, 16); + break; + case 32: /* we lose precision here */ + break; + default: + return FALSE; + } + sse2_cvtdq2ps(p->func, dataXMM, dataXMM); + if(input_desc->channel[0].normalized) + { + struct x86_reg factor; + switch(input_desc->channel[0].size) + { + case 8: + factor = get_const(p, CONST_INV_127); + break; + case 16: + factor = get_const(p, CONST_INV_32767); + break; + case 32: + factor = get_const(p, CONST_INV_2147483647); + break; + default: + assert(0); + factor.disp = 0; + factor.file = 0; + factor.idx = 0; + factor.mod = 0; + break; + } + sse_mulps(p->func, dataXMM, factor); + } + break; + + break; + case UTIL_FORMAT_TYPE_FLOAT: + if(input_desc->channel[0].size != 32 && input_desc->channel[0].size != 64) + return FALSE; + if(swizzle[3] == UTIL_FORMAT_SWIZZLE_1 && input_desc->nr_channels <= 3) + { + swizzle[3] = UTIL_FORMAT_SWIZZLE_W; + needed_chans = CHANNELS_0001; + } + switch(input_desc->channel[0].size) + { + case 32: + emit_load_float32(p, dataXMM, src, needed_chans, input_desc->nr_channels); + break; + case 64: /* we lose precision here */ + if(!(x86_target_caps(p->func) & X86_SSE2)) + return FALSE; + emit_load_float64to32(p, dataXMM, src, needed_chans, input_desc->nr_channels); + break; + default: + return FALSE; + } + break; + default: + return FALSE; + } -static void emit_store_R32( struct translate_sse *p, - struct x86_reg dest, - struct x86_reg dataXMM ) -{ - sse_movss(p->func, dest, dataXMM); -} + if(!id_swizzle) + sse_shufps(p->func, dataXMM, dataXMM, SHUF(swizzle[0], swizzle[1], swizzle[2], swizzle[3]) ); + } + if(output_desc->nr_channels >= 4 + && swizzle[0] < UTIL_FORMAT_SWIZZLE_0 + && swizzle[1] < UTIL_FORMAT_SWIZZLE_0 + && swizzle[2] < UTIL_FORMAT_SWIZZLE_0 + && swizzle[3] < UTIL_FORMAT_SWIZZLE_0 + ) + sse_movups(p->func, dst, dataXMM); + else + { + if(output_desc->nr_channels >= 2 + && swizzle[0] < UTIL_FORMAT_SWIZZLE_0 + && swizzle[1] < UTIL_FORMAT_SWIZZLE_0) + sse_movlps(p->func, dst, dataXMM); + else + { + if(swizzle[0] < UTIL_FORMAT_SWIZZLE_0) + sse_movss(p->func, dst, dataXMM); + else + x86_mov_imm(p->func, dst, imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]); + + if(output_desc->nr_channels >= 2) + { + if(swizzle[1] < UTIL_FORMAT_SWIZZLE_0) + { + sse_shufps(p->func, dataXMM, dataXMM, SHUF(1, 1, 2, 3)); + sse_movss(p->func, x86_make_disp(dst, 4), dataXMM); + } + else + x86_mov_imm(p->func, x86_make_disp(dst, 4), imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0]); + } + } + if(output_desc->nr_channels >= 3) + { + if(output_desc->nr_channels >= 4 + && swizzle[2] < UTIL_FORMAT_SWIZZLE_0 + && swizzle[3] < UTIL_FORMAT_SWIZZLE_0) + sse_movhps(p->func, x86_make_disp(dst, 8), dataXMM); + else + { + if(swizzle[2] < UTIL_FORMAT_SWIZZLE_0) + { + sse_shufps(p->func, dataXMM, dataXMM, SHUF(2, 2, 2, 3)); + sse_movss(p->func, x86_make_disp(dst, 8), dataXMM); + } + else + x86_mov_imm(p->func, x86_make_disp(dst, 8), imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]); + + if(output_desc->nr_channels >= 4) + { + if(swizzle[3] < UTIL_FORMAT_SWIZZLE_0) + { + sse_shufps(p->func, dataXMM, dataXMM, SHUF(3, 3, 3, 3)); + sse_movss(p->func, x86_make_disp(dst, 12), dataXMM); + } + else + x86_mov_imm(p->func, x86_make_disp(dst, 12), imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0]); + } + } + } + } + return TRUE; + } + else if((x86_target_caps(p->func) & X86_SSE2) && input_desc->channel[0].size == 8 && output_desc->channel[0].size == 16 + && output_desc->channel[0].normalized == input_desc->channel[0].normalized + && (0 + || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) + || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) + || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) + )) + { + struct x86_reg dataXMM = x86_make_reg(file_XMM, 0); + struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1); + struct x86_reg tmp = p->tmp_EAX; + unsigned imms[2] = {0, 1}; + + for(i = 0; i < output_desc->nr_channels; ++i) + { + if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0 && i >= input_desc->nr_channels) + swizzle[i] = i; + } -static void emit_store_R8G8B8A8_UNORM( struct translate_sse *p, - struct x86_reg dest, - struct x86_reg dataXMM ) -{ - /* Scale by 255.0 - */ - sse_mulps(p->func, dataXMM, get_255(p)); + for(i = 0; i < output_desc->nr_channels; ++i) + { + if(swizzle[i] < 4) + needed_chans = MAX2(needed_chans, swizzle[i] + 1); + if(swizzle[i] < UTIL_FORMAT_SWIZZLE_0 && swizzle[i] != i) + id_swizzle = FALSE; + } - /* Pack and emit: - */ - sse2_cvtps2dq(p->func, dataXMM, dataXMM); - sse2_packssdw(p->func, dataXMM, dataXMM); - sse2_packuswb(p->func, dataXMM, dataXMM); - sse_movss(p->func, dest, dataXMM); -} + if(needed_chans > 0) + { + emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3); + + switch(input_desc->channel[0].type) + { + case UTIL_FORMAT_TYPE_UNSIGNED: + if(input_desc->channel[0].normalized) + { + sse2_punpcklbw(p->func, dataXMM, dataXMM); + if(output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) + sse2_psrlw_imm(p->func, dataXMM, 1); + } + else + sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY)); + break; + case UTIL_FORMAT_TYPE_SIGNED: + if(input_desc->channel[0].normalized) + { + sse2_movq(p->func, tmpXMM, get_const(p, CONST_IDENTITY)); + sse2_punpcklbw(p->func, tmpXMM, dataXMM); + sse2_psllw_imm(p->func, dataXMM, 9); + sse2_psrlw_imm(p->func, dataXMM, 8); + sse2_por(p->func, tmpXMM, dataXMM); + sse2_psrlw_imm(p->func, dataXMM, 7); + sse2_por(p->func, tmpXMM, dataXMM); + { + struct x86_reg t = dataXMM; + dataXMM = tmpXMM; + tmpXMM = t; + } + } + else + { + sse2_punpcklbw(p->func, dataXMM, dataXMM); + sse2_psraw_imm(p->func, dataXMM, 8); + } + break; + default: + assert(0); + } + if(output_desc->channel[0].normalized) + imms[1] = (output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) ? 0xffff : 0x7ffff; + if(!id_swizzle) + sse2_pshuflw(p->func, dataXMM, dataXMM, (swizzle[0] & 3) | ((swizzle[1] & 3) << 2) | ((swizzle[2] & 3) << 4) | ((swizzle[3] & 3) << 6)); + } + if(output_desc->nr_channels >= 4 + && swizzle[0] < UTIL_FORMAT_SWIZZLE_0 + && swizzle[1] < UTIL_FORMAT_SWIZZLE_0 + && swizzle[2] < UTIL_FORMAT_SWIZZLE_0 + && swizzle[3] < UTIL_FORMAT_SWIZZLE_0 + ) + sse2_movq(p->func, dst, dataXMM); + else + { + if(swizzle[0] < UTIL_FORMAT_SWIZZLE_0) + { + if(output_desc->nr_channels >= 2 && swizzle[1] < UTIL_FORMAT_SWIZZLE_0) + sse2_movd(p->func, dst, dataXMM); + else + { + sse2_movd(p->func, tmp, dataXMM); + x86_mov16(p->func, dst, tmp); + if(output_desc->nr_channels >= 2) + x86_mov16_imm(p->func, x86_make_disp(dst, 2), imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0]); + } + } + else + { + if(output_desc->nr_channels >= 2 && swizzle[1] >= UTIL_FORMAT_SWIZZLE_0) + x86_mov_imm(p->func, dst, (imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0] << 16) | imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]); + else + { + x86_mov16_imm(p->func, dst, imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]); + if(output_desc->nr_channels >= 2) + { + sse2_movd(p->func, tmp, dataXMM); + x86_shr_imm(p->func, tmp, 16); + x86_mov16(p->func, x86_make_disp(dst, 2), tmp); + } + } + } + if(output_desc->nr_channels >= 3) + { + if(swizzle[2] < UTIL_FORMAT_SWIZZLE_0) + { + if(output_desc->nr_channels >= 4 && swizzle[3] < UTIL_FORMAT_SWIZZLE_0) + { + sse2_psrlq_imm(p->func, dataXMM, 32); + sse2_movd(p->func, x86_make_disp(dst, 4), dataXMM); + } + else + { + sse2_psrlq_imm(p->func, dataXMM, 32); + sse2_movd(p->func, tmp, dataXMM); + x86_mov16(p->func, x86_make_disp(dst, 4), tmp); + if(output_desc->nr_channels >= 4) + { + x86_mov16_imm(p->func, x86_make_disp(dst, 6), imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0]); + } + } + } + else + { + if(output_desc->nr_channels >= 4 && swizzle[3] >= UTIL_FORMAT_SWIZZLE_0) + x86_mov_imm(p->func, x86_make_disp(dst, 4), (imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0] << 16) | imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]); + else + { + x86_mov16_imm(p->func, x86_make_disp(dst, 4), imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]); + + if(output_desc->nr_channels >= 4) + { + sse2_psrlq_imm(p->func, dataXMM, 48); + sse2_movd(p->func, tmp, dataXMM); + x86_mov16(p->func, x86_make_disp(dst, 6), tmp); + } + } + } + } + } + return TRUE; + } + else if(!memcmp(&output_desc->channel[0], &input_desc->channel[0], sizeof(output_desc->channel[0]))) + { + struct x86_reg tmp = p->tmp_EAX; + unsigned i; + if(input_desc->channel[0].size == 8 && input_desc->nr_channels == 4 && output_desc->nr_channels == 4 + && swizzle[0] == UTIL_FORMAT_SWIZZLE_W + && swizzle[1] == UTIL_FORMAT_SWIZZLE_Z + && swizzle[2] == UTIL_FORMAT_SWIZZLE_Y + && swizzle[3] == UTIL_FORMAT_SWIZZLE_X) + { + /* TODO: support movbe */ + x86_mov(p->func, tmp, src); + x86_bswap(p->func, tmp); + x86_mov(p->func, dst, tmp); + return TRUE; + } -/* Extended swizzles? Maybe later. - */ -static void emit_swizzle( struct translate_sse *p, - struct x86_reg dest, - struct x86_reg src, - unsigned char shuffle ) -{ - sse_shufps(p->func, dest, src, shuffle); -} + for(i = 0; i < output_desc->nr_channels; ++i) + { + switch(output_desc->channel[0].size) + { + case 8: + if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0) + { + unsigned v = 0; + if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1) + { + switch(output_desc->channel[0].type) + { + case UTIL_FORMAT_TYPE_UNSIGNED: + v = output_desc->channel[0].normalized ? 0xff : 1; + break; + case UTIL_FORMAT_TYPE_SIGNED: + v = output_desc->channel[0].normalized ? 0x7f : 1; + break; + default: + return FALSE; + } + } + x86_mov8_imm(p->func, x86_make_disp(dst, i * 1), v); + } + else + { + x86_mov8(p->func, tmp, x86_make_disp(src, swizzle[i] * 1)); + x86_mov8(p->func, x86_make_disp(dst, i * 1), tmp); + } + break; + case 16: + if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0) + { + unsigned v = 0; + if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1) + { + switch(output_desc->channel[1].type) + { + case UTIL_FORMAT_TYPE_UNSIGNED: + v = output_desc->channel[1].normalized ? 0xffff : 1; + break; + case UTIL_FORMAT_TYPE_SIGNED: + v = output_desc->channel[1].normalized ? 0x7fff : 1; + break; + case UTIL_FORMAT_TYPE_FLOAT: + v = 0x3c00; + break; + default: + return FALSE; + } + } + x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), v); + } + else if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0) + x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), 0); + else + { + x86_mov16(p->func, tmp, x86_make_disp(src, swizzle[i] * 2)); + x86_mov16(p->func, x86_make_disp(dst, i * 2), tmp); + } + break; + case 32: + if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0) + { + unsigned v = 0; + if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1) + { + switch(output_desc->channel[1].type) + { + case UTIL_FORMAT_TYPE_UNSIGNED: + v = output_desc->channel[1].normalized ? 0xffffffff : 1; + break; + case UTIL_FORMAT_TYPE_SIGNED: + v = output_desc->channel[1].normalized ? 0x7fffffff : 1; + break; + case UTIL_FORMAT_TYPE_FLOAT: + v = 0x3f800000; + break; + default: + return FALSE; + } + } + x86_mov_imm(p->func, x86_make_disp(dst, i * 4), v); + } + else + { + x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 4)); + x86_mov(p->func, x86_make_disp(dst, i * 4), tmp); + } + break; + case 64: + if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0) + { + unsigned l = 0; + unsigned h = 0; + if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1) + { + switch(output_desc->channel[1].type) + { + case UTIL_FORMAT_TYPE_UNSIGNED: + h = output_desc->channel[1].normalized ? 0xffffffff : 0; + l = output_desc->channel[1].normalized ? 0xffffffff : 1; + break; + case UTIL_FORMAT_TYPE_SIGNED: + h = output_desc->channel[1].normalized ? 0x7fffffff : 0; + l = output_desc->channel[1].normalized ? 0xffffffff : 1; + break; + case UTIL_FORMAT_TYPE_FLOAT: + h = 0x3ff00000; + l = 0; + break; + default: + return FALSE; + } + } + x86_mov_imm(p->func, x86_make_disp(dst, i * 8), l); + x86_mov_imm(p->func, x86_make_disp(dst, i * 8 + 4), h); + } + else + { + if(x86_target_caps(p->func) & X86_SSE) + { + struct x86_reg tmpXMM = x86_make_reg(file_XMM, 0); + emit_load64(p, tmp, tmpXMM, x86_make_disp(src, swizzle[i] * 8)); + emit_store64(p, x86_make_disp(dst, i * 8), tmp, tmpXMM); + } + else + { + x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 8)); + x86_mov(p->func, x86_make_disp(dst, i * 8), tmp); + x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 8 + 4)); + x86_mov(p->func, x86_make_disp(dst, i * 8 + 4), tmp); + } + } + break; + default: + return FALSE; + } + } + return TRUE; + } + /* special case for draw's EMIT_4UB (RGBA) and EMIT_4UB_BGRA */ + else if((x86_target_caps(p->func) & X86_SSE2) && + a->input_format == PIPE_FORMAT_R32G32B32A32_FLOAT && (0 + || a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM + || a->output_format == PIPE_FORMAT_R8G8B8A8_UNORM + )) + { + struct x86_reg dataXMM = x86_make_reg(file_XMM, 0); + /* load */ + sse_movups(p->func, dataXMM, src); -static boolean translate_attr( struct translate_sse *p, - const struct translate_element *a, - struct x86_reg srcECX, - struct x86_reg dstEAX) -{ - struct x86_reg dataXMM = x86_make_reg(file_XMM, 0); + if (a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM) + sse_shufps(p->func, dataXMM, dataXMM, SHUF(2,1,0,3)); - switch (a->input_format) { - case PIPE_FORMAT_R32_FLOAT: - emit_load_R32(p, dataXMM, srcECX); - break; - case PIPE_FORMAT_R32G32_FLOAT: - emit_load_R32G32(p, dataXMM, srcECX); - break; - case PIPE_FORMAT_R32G32B32_FLOAT: - emit_load_R32G32B32(p, dataXMM, srcECX); - break; - case PIPE_FORMAT_R32G32B32A32_FLOAT: - emit_load_R32G32B32A32(p, dataXMM, srcECX); - break; - case PIPE_FORMAT_B8G8R8A8_UNORM: - emit_load_R8G8B8A8_UNORM(p, dataXMM, srcECX); - emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W)); - break; - case PIPE_FORMAT_R8G8B8A8_UNORM: - emit_load_R8G8B8A8_UNORM(p, dataXMM, srcECX); - break; - default: - return FALSE; - } + /* scale by 255.0 */ + sse_mulps(p->func, dataXMM, get_const(p, CONST_255)); - switch (a->output_format) { - case PIPE_FORMAT_R32_FLOAT: - emit_store_R32(p, dstEAX, dataXMM); - break; - case PIPE_FORMAT_R32G32_FLOAT: - emit_store_R32G32(p, dstEAX, dataXMM); - break; - case PIPE_FORMAT_R32G32B32_FLOAT: - emit_store_R32G32B32(p, dstEAX, dataXMM); - break; - case PIPE_FORMAT_R32G32B32A32_FLOAT: - emit_store_R32G32B32A32(p, dstEAX, dataXMM); - break; - case PIPE_FORMAT_B8G8R8A8_UNORM: - emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W)); - emit_store_R8G8B8A8_UNORM(p, dstEAX, dataXMM); - break; - case PIPE_FORMAT_R8G8B8A8_UNORM: - emit_store_R8G8B8A8_UNORM(p, dstEAX, dataXMM); - break; - default: - return FALSE; + /* pack and emit */ + sse2_cvtps2dq(p->func, dataXMM, dataXMM); + sse2_packssdw(p->func, dataXMM, dataXMM); + sse2_packuswb(p->func, dataXMM, dataXMM); + sse2_movd(p->func, dst, dataXMM); + + return TRUE; } - return TRUE; + return FALSE; } +static boolean translate_attr( struct translate_sse *p, + const struct translate_element *a, + struct x86_reg src, + struct x86_reg dst) +{ + if(a->input_format == a->output_format) + { + emit_memcpy(p, dst, src, util_format_get_stride(a->input_format, 1)); + return TRUE; + } + + return translate_attr_convert(p, a, src, dst); +} static boolean init_inputs( struct translate_sse *p, - boolean linear ) + unsigned index_size ) { unsigned i; - struct x86_reg instance_id = x86_make_disp(p->machine_EDX, + struct x86_reg instance_id = x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id)); for (i = 0; i < p->nr_buffer_varients; i++) { struct translate_buffer_varient *varient = &p->buffer_varient[i]; struct translate_buffer *buffer = &p->buffer[varient->buffer_index]; - if (linear || varient->instance_divisor) { - struct x86_reg buf_stride = x86_make_disp(p->machine_EDX, + if (!index_size || varient->instance_divisor) { + struct x86_reg buf_stride = x86_make_disp(p->machine_EDI, get_offset(p, &buffer->stride)); - struct x86_reg buf_ptr = x86_make_disp(p->machine_EDX, + struct x86_reg buf_ptr = x86_make_disp(p->machine_EDI, get_offset(p, &varient->ptr)); - struct x86_reg buf_base_ptr = x86_make_disp(p->machine_EDX, + struct x86_reg buf_base_ptr = x86_make_disp(p->machine_EDI, get_offset(p, &buffer->base_ptr)); - struct x86_reg elt = p->idx_EBX; + struct x86_reg elt = p->idx_ESI; struct x86_reg tmp_EAX = p->tmp_EAX; /* Calculate pointer to first attrib: @@ -406,20 +1085,16 @@ static boolean init_inputs( struct translate_sse *p, x86_mov(p->func, tmp_EAX, instance_id); if (varient->instance_divisor != 1) { - struct x86_reg tmp_EDX = p->machine_EDX; - struct x86_reg tmp_ECX = p->outbuf_ECX; + struct x86_reg tmp_EDX = p->tmp2_EDX; + struct x86_reg tmp_ECX = p->src_ECX; /* TODO: Add x86_shr() to rtasm and use it whenever * instance divisor is power of two. */ - x86_push(p->func, tmp_EDX); - x86_push(p->func, tmp_ECX); x86_xor(p->func, tmp_EDX, tmp_EDX); x86_mov_reg_imm(p->func, tmp_ECX, varient->instance_divisor); x86_div(p->func, tmp_ECX); /* EAX = EDX:EAX / ECX */ - x86_pop(p->func, tmp_ECX); - x86_pop(p->func, tmp_EDX); } } else { x86_mov(p->func, tmp_EAX, elt); @@ -430,16 +1105,23 @@ static boolean init_inputs( struct translate_sse *p, */ x86_imul(p->func, tmp_EAX, buf_stride); + x64_rexw(p->func); x86_add(p->func, tmp_EAX, buf_base_ptr); /* In the linear case, keep the buffer pointer instead of the * index number. */ - if (linear && p->nr_buffer_varients == 1) + if (!index_size && p->nr_buffer_varients == 1) + { + x64_rexw(p->func); x86_mov(p->func, elt, tmp_EAX); + } else + { + x64_rexw(p->func); x86_mov(p->func, buf_ptr, tmp_EAX); + } } } @@ -448,44 +1130,57 @@ static boolean init_inputs( struct translate_sse *p, static struct x86_reg get_buffer_ptr( struct translate_sse *p, - boolean linear, + unsigned index_size, unsigned var_idx, struct x86_reg elt ) { if (var_idx == ELEMENT_BUFFER_INSTANCE_ID) { - return x86_make_disp(p->machine_EDX, + return x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id)); } - if (linear && p->nr_buffer_varients == 1) { - return p->idx_EBX; + if (!index_size && p->nr_buffer_varients == 1) { + return p->idx_ESI; } - else if (linear || p->buffer_varient[var_idx].instance_divisor) { - struct x86_reg ptr = p->tmp_EAX; + else if (!index_size || p->buffer_varient[var_idx].instance_divisor) { + struct x86_reg ptr = p->src_ECX; struct x86_reg buf_ptr = - x86_make_disp(p->machine_EDX, + x86_make_disp(p->machine_EDI, get_offset(p, &p->buffer_varient[var_idx].ptr)); + x64_rexw(p->func); x86_mov(p->func, ptr, buf_ptr); return ptr; } else { - struct x86_reg ptr = p->tmp_EAX; + struct x86_reg ptr = p->src_ECX; const struct translate_buffer_varient *varient = &p->buffer_varient[var_idx]; struct x86_reg buf_stride = - x86_make_disp(p->machine_EDX, + x86_make_disp(p->machine_EDI, get_offset(p, &p->buffer[varient->buffer_index].stride)); struct x86_reg buf_base_ptr = - x86_make_disp(p->machine_EDX, + x86_make_disp(p->machine_EDI, get_offset(p, &p->buffer[varient->buffer_index].base_ptr)); /* Calculate pointer to current attrib: */ - x86_mov(p->func, ptr, buf_stride); - x86_imul(p->func, ptr, elt); + switch(index_size) + { + case 1: + x86_movzx8(p->func, ptr, elt); + break; + case 2: + x86_movzx16(p->func, ptr, elt); + break; + case 4: + x86_mov(p->func, ptr, elt); + break; + } + x86_imul(p->func, ptr, buf_stride); + x64_rexw(p->func); x86_add(p->func, ptr, buf_base_ptr); return ptr; } @@ -494,39 +1189,43 @@ static struct x86_reg get_buffer_ptr( struct translate_sse *p, static boolean incr_inputs( struct translate_sse *p, - boolean linear ) + unsigned index_size ) { - if (linear && p->nr_buffer_varients == 1) { - struct x86_reg stride = x86_make_disp(p->machine_EDX, + if (!index_size && p->nr_buffer_varients == 1) { + struct x86_reg stride = x86_make_disp(p->machine_EDI, get_offset(p, &p->buffer[0].stride)); if (p->buffer_varient[0].instance_divisor == 0) { - x86_add(p->func, p->idx_EBX, stride); - sse_prefetchnta(p->func, x86_make_disp(p->idx_EBX, 192)); + x64_rexw(p->func); + x86_add(p->func, p->idx_ESI, stride); + sse_prefetchnta(p->func, x86_make_disp(p->idx_ESI, 192)); } } - else if (linear) { + else if (!index_size) { unsigned i; /* Is this worthwhile?? */ for (i = 0; i < p->nr_buffer_varients; i++) { struct translate_buffer_varient *varient = &p->buffer_varient[i]; - struct x86_reg buf_ptr = x86_make_disp(p->machine_EDX, + struct x86_reg buf_ptr = x86_make_disp(p->machine_EDI, get_offset(p, &varient->ptr)); - struct x86_reg buf_stride = x86_make_disp(p->machine_EDX, + struct x86_reg buf_stride = x86_make_disp(p->machine_EDI, get_offset(p, &p->buffer[varient->buffer_index].stride)); if (varient->instance_divisor == 0) { - x86_mov(p->func, p->tmp_EAX, buf_ptr); - x86_add(p->func, p->tmp_EAX, buf_stride); + x86_mov(p->func, p->tmp_EAX, buf_stride); + x64_rexw(p->func); + x86_add(p->func, p->tmp_EAX, buf_ptr); if (i == 0) sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, 192)); + x64_rexw(p->func); x86_mov(p->func, buf_ptr, p->tmp_EAX); } } } else { - x86_lea(p->func, p->idx_EBX, x86_make_disp(p->idx_EBX, 4)); + x64_rexw(p->func); + x86_lea(p->func, p->idx_ESI, x86_make_disp(p->idx_ESI, index_size)); } return TRUE; @@ -551,35 +1250,52 @@ static boolean incr_inputs( struct translate_sse *p, */ static boolean build_vertex_emit( struct translate_sse *p, struct x86_function *func, - boolean linear ) + unsigned index_size ) { int fixup, label; unsigned j; + memset(p->reg_to_const, 0xff, sizeof(p->reg_to_const)); + memset(p->const_to_reg, 0xff, sizeof(p->const_to_reg)); + p->tmp_EAX = x86_make_reg(file_REG32, reg_AX); - p->idx_EBX = x86_make_reg(file_REG32, reg_BX); - p->outbuf_ECX = x86_make_reg(file_REG32, reg_CX); - p->machine_EDX = x86_make_reg(file_REG32, reg_DX); - p->count_ESI = x86_make_reg(file_REG32, reg_SI); + p->idx_ESI = x86_make_reg(file_REG32, reg_SI); + p->outbuf_EBX = x86_make_reg(file_REG32, reg_BX); + p->machine_EDI = x86_make_reg(file_REG32, reg_DI); + p->count_EBP = x86_make_reg(file_REG32, reg_BP); + p->tmp2_EDX = x86_make_reg(file_REG32, reg_DX); + p->src_ECX = x86_make_reg(file_REG32, reg_CX); p->func = func; - p->loaded_inv_255 = FALSE; - p->loaded_255 = FALSE; - p->loaded_identity = FALSE; x86_init_func(p->func); - /* Push a few regs? - */ - x86_push(p->func, p->idx_EBX); - x86_push(p->func, p->count_ESI); + if(x86_target(p->func) == X86_64_WIN64_ABI) + { + /* the ABI guarantees a 16-byte aligned 32-byte "shadow space" above the return address */ + sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8), x86_make_reg(file_XMM, 6)); + sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24), x86_make_reg(file_XMM, 7)); + } - /* Load arguments into regs: - */ - x86_mov(p->func, p->machine_EDX, x86_fn_arg(p->func, 1)); - x86_mov(p->func, p->idx_EBX, x86_fn_arg(p->func, 2)); - x86_mov(p->func, p->count_ESI, x86_fn_arg(p->func, 3)); - x86_mov(p->func, p->outbuf_ECX, x86_fn_arg(p->func, 5)); + x86_push(p->func, p->outbuf_EBX); + x86_push(p->func, p->count_EBP); + +/* on non-Win64 x86-64, these are already in the right registers */ + if(x86_target(p->func) != X86_64_STD_ABI) + { + x86_push(p->func, p->machine_EDI); + x86_push(p->func, p->idx_ESI); + + x86_mov(p->func, p->machine_EDI, x86_fn_arg(p->func, 1)); + x86_mov(p->func, p->idx_ESI, x86_fn_arg(p->func, 2)); + } + + x86_mov(p->func, p->count_EBP, x86_fn_arg(p->func, 3)); + + if(x86_target(p->func) != X86_32) + x64_mov64(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 5)); + else + x86_mov(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 5)); /* Load instance ID. */ @@ -588,25 +1304,25 @@ static boolean build_vertex_emit( struct translate_sse *p, p->tmp_EAX, x86_fn_arg(p->func, 4)); x86_mov(p->func, - x86_make_disp(p->machine_EDX, get_offset(p, &p->instance_id)), + x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id)), p->tmp_EAX); } /* Get vertex count, compare to zero */ x86_xor(p->func, p->tmp_EAX, p->tmp_EAX); - x86_cmp(p->func, p->count_ESI, p->tmp_EAX); + x86_cmp(p->func, p->count_EBP, p->tmp_EAX); fixup = x86_jcc_forward(p->func, cc_E); /* always load, needed or not: */ - init_inputs(p, linear); + init_inputs(p, index_size); /* Note address for loop jump */ label = x86_get_label(p->func); { - struct x86_reg elt = linear ? p->idx_EBX : x86_deref(p->idx_EBX); + struct x86_reg elt = !index_size ? p->idx_ESI : x86_deref(p->idx_ESI); int last_varient = -1; struct x86_reg vb; @@ -618,30 +1334,31 @@ static boolean build_vertex_emit( struct translate_sse *p, */ if (varient != last_varient) { last_varient = varient; - vb = get_buffer_ptr(p, linear, varient, elt); + vb = get_buffer_ptr(p, index_size, varient, elt); } if (!translate_attr( p, a, x86_make_disp(vb, a->input_offset), - x86_make_disp(p->outbuf_ECX, a->output_offset))) + x86_make_disp(p->outbuf_EBX, a->output_offset))) return FALSE; } /* Next output vertex: */ + x64_rexw(p->func); x86_lea(p->func, - p->outbuf_ECX, - x86_make_disp(p->outbuf_ECX, + p->outbuf_EBX, + x86_make_disp(p->outbuf_EBX, p->translate.key.output_stride)); /* Incr index */ - incr_inputs( p, linear ); + incr_inputs( p, index_size ); } /* decr count, loop if not zero */ - x86_dec(p->func, p->count_ESI); + x86_dec(p->func, p->count_EBP); x86_jcc(p->func, cc_NZ, label); /* Exit mmx state? @@ -656,8 +1373,20 @@ static boolean build_vertex_emit( struct translate_sse *p, /* Pop regs and return */ - x86_pop(p->func, p->count_ESI); - x86_pop(p->func, p->idx_EBX); + if(x86_target(p->func) != X86_64_STD_ABI) + { + x86_pop(p->func, p->idx_ESI); + x86_pop(p->func, p->machine_EDI); + } + + x86_pop(p->func, p->count_EBP); + x86_pop(p->func, p->outbuf_EBX); + + if(x86_target(p->func) == X86_64_WIN64_ABI) + { + sse2_movdqa(p->func, x86_make_reg(file_XMM, 6), x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8)); + sse2_movdqa(p->func, x86_make_reg(file_XMM, 7), x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24)); + } x86_ret(p->func); return TRUE; @@ -697,37 +1426,7 @@ static void translate_sse_release( struct translate *translate ) x86_release_func( &p->linear_func ); x86_release_func( &p->elt_func ); - FREE(p); -} - -static void PIPE_CDECL translate_sse_run_elts( struct translate *translate, - const unsigned *elts, - unsigned count, - unsigned instance_id, - void *output_buffer ) -{ - struct translate_sse *p = (struct translate_sse *)translate; - - p->gen_run_elts( translate, - elts, - count, - instance_id, - output_buffer); -} - -static void PIPE_CDECL translate_sse_run( struct translate *translate, - unsigned start, - unsigned count, - unsigned instance_id, - void *output_buffer ) -{ - struct translate_sse *p = (struct translate_sse *)translate; - - p->gen_run( translate, - start, - count, - instance_id, - output_buffer); + os_free_aligned(p); } @@ -736,18 +1435,19 @@ struct translate *translate_sse2_create( const struct translate_key *key ) struct translate_sse *p = NULL; unsigned i; - if (!rtasm_cpu_has_sse() || !rtasm_cpu_has_sse2()) + /* this is misnamed, it actually refers to whether rtasm is enabled or not */ + if (!rtasm_cpu_has_sse()) goto fail; - p = CALLOC_STRUCT( translate_sse ); + p = os_malloc_aligned(sizeof(struct translate_sse), 16); if (p == NULL) goto fail; + memset(p, 0, sizeof(*p)); + memcpy(p->consts, consts, sizeof(consts)); p->translate.key = *key; p->translate.release = translate_sse_release; p->translate.set_buffer = translate_sse_set_buffer; - p->translate.run_elts = translate_sse_run_elts; - p->translate.run = translate_sse_run; for (i = 0; i < key->nr_elements; i++) { if (key->element[i].type == TRANSLATE_ELEMENT_NORMAL) { @@ -783,18 +1483,32 @@ struct translate *translate_sse2_create( const struct translate_key *key ) if (0) debug_printf("nr_buffers: %d\n", p->nr_buffers); - if (!build_vertex_emit(p, &p->linear_func, TRUE)) + if (!build_vertex_emit(p, &p->linear_func, 0)) + goto fail; + + if (!build_vertex_emit(p, &p->elt_func, 4)) + goto fail; + + if (!build_vertex_emit(p, &p->elt16_func, 2)) + goto fail; + + if (!build_vertex_emit(p, &p->elt8_func, 1)) + goto fail; + + p->translate.run = (void*)x86_get_func(&p->linear_func); + if (p->translate.run == NULL) goto fail; - if (!build_vertex_emit(p, &p->elt_func, FALSE)) + p->translate.run_elts = (void*)x86_get_func(&p->elt_func); + if (p->translate.run_elts == NULL) goto fail; - p->gen_run = (run_func)x86_get_func(&p->linear_func); - if (p->gen_run == NULL) + p->translate.run_elts16 = (void*)x86_get_func(&p->elt16_func); + if (p->translate.run_elts16 == NULL) goto fail; - p->gen_run_elts = (run_elts_func)x86_get_func(&p->elt_func); - if (p->gen_run_elts == NULL) + p->translate.run_elts8 = (void*)x86_get_func(&p->elt8_func); + if (p->translate.run_elts8 == NULL) goto fail; return &p->translate; diff --git a/src/gallium/auxiliary/util/u_bitmask.h b/src/gallium/auxiliary/util/u_bitmask.h index 87f1110296..98b85ddecd 100644 --- a/src/gallium/auxiliary/util/u_bitmask.h +++ b/src/gallium/auxiliary/util/u_bitmask.h @@ -36,6 +36,9 @@ #define U_HANDLE_BITMASK_H_ +#include "pipe/p_compiler.h" + + #ifdef __cplusplus extern "C" { #endif diff --git a/src/gallium/auxiliary/util/u_blit.c b/src/gallium/auxiliary/util/u_blit.c index 97fa99ec65..dfb142b9e1 100644 --- a/src/gallium/auxiliary/util/u_blit.c +++ b/src/gallium/auxiliary/util/u_blit.c @@ -42,6 +42,7 @@ #include "util/u_blit.h" #include "util/u_draw_quad.h" +#include "util/u_format.h" #include "util/u_math.h" #include "util/u_memory.h" #include "util/u_sampler.h" @@ -56,15 +57,18 @@ struct blit_state struct cso_context *cso; struct pipe_blend_state blend; - struct pipe_depth_stencil_alpha_state depthstencil; + struct pipe_depth_stencil_alpha_state depthstencil_keep; + struct pipe_depth_stencil_alpha_state depthstencil_write; struct pipe_rasterizer_state rasterizer; struct pipe_sampler_state sampler; struct pipe_viewport_state viewport; struct pipe_clip_state clip; struct pipe_vertex_element velem[2]; + enum pipe_texture_target internal_target; void *vs; void *fs[TGSI_WRITEMASK_XYZW + 1]; + void *fs_depth; struct pipe_resource *vbuf; /**< quad vertices */ unsigned vbuf_slot; @@ -95,7 +99,11 @@ util_create_blit(struct pipe_context *pipe, struct cso_context *cso) ctx->blend.rt[0].colormask = PIPE_MASK_RGBA; /* no-op depth/stencil/alpha */ - memset(&ctx->depthstencil, 0, sizeof(ctx->depthstencil)); + memset(&ctx->depthstencil_keep, 0, sizeof(ctx->depthstencil_keep)); + memset(&ctx->depthstencil_write, 0, sizeof(ctx->depthstencil_write)); + ctx->depthstencil_write.depth.enabled = 1; + ctx->depthstencil_write.depth.writemask = 1; + ctx->depthstencil_write.depth.func = PIPE_FUNC_ALWAYS; /* rasterizer */ memset(&ctx->rasterizer, 0, sizeof(ctx->rasterizer)); @@ -110,7 +118,6 @@ util_create_blit(struct pipe_context *pipe, struct cso_context *cso) ctx->sampler.min_mip_filter = PIPE_TEX_MIPFILTER_NONE; ctx->sampler.min_img_filter = 0; /* set later */ ctx->sampler.mag_img_filter = 0; /* set later */ - ctx->sampler.normalized_coords = 1; /* vertex elements state */ memset(&ctx->velem[0], 0, sizeof(ctx->velem[0]) * 2); @@ -145,6 +152,11 @@ util_create_blit(struct pipe_context *pipe, struct cso_context *cso) ctx->vertices[i][1][3] = 1.0f; /* q */ } + if(pipe->screen->get_param(pipe->screen, PIPE_CAP_NPOT_TEXTURES)) + ctx->internal_target = PIPE_TEXTURE_2D; + else + ctx->internal_target = PIPE_TEXTURE_RECT; + return ctx; } @@ -164,6 +176,9 @@ util_destroy_blit(struct blit_state *ctx) if (ctx->fs[i]) pipe->delete_fs_state(pipe, ctx->fs[i]); + if (ctx->fs_depth) + pipe->delete_fs_state(pipe, ctx->fs_depth); + pipe_resource_reference(&ctx->vbuf, NULL); FREE(ctx); @@ -271,7 +286,7 @@ regions_overlap(int srcX0, int srcY0, * \param writemask controls which channels in the dest surface are sourced * from the src surface. Disabled channels are sourced * from (0,0,0,1). - * XXX need some control over blitting Z and/or stencil. + * XXX need some control over blitting stencil. */ void util_blit_pixels_writemask(struct blit_state *ctx, @@ -294,8 +309,9 @@ util_blit_pixels_writemask(struct blit_state *ctx, const int srcW = abs(srcX1 - srcX0); const int srcH = abs(srcY1 - srcY0); unsigned offset; - boolean overlap; + boolean overlap, dst_is_depth; float s0, t0, s1, t1; + boolean normalized; assert(filter == PIPE_TEX_MIPFILTER_NEAREST || filter == PIPE_TEX_MIPFILTER_LINEAR); @@ -335,7 +351,6 @@ util_blit_pixels_writemask(struct blit_state *ctx, return; } - /* Create a temporary texture when src and dest alias or when src * is anything other than a 2d texture. * XXX should just use appropriate shader to access 1d / 3d slice / cube face, @@ -347,7 +362,8 @@ util_blit_pixels_writemask(struct blit_state *ctx, dst->face == srcsub.face && dst->level == srcsub.level && dst->zslice == srcZ0) || - src_tex->target != PIPE_TEXTURE_2D) + (src_tex->target != PIPE_TEXTURE_2D && + src_tex->target != PIPE_TEXTURE_RECT)) { struct pipe_resource texTemp; struct pipe_resource *tex; @@ -372,7 +388,7 @@ util_blit_pixels_writemask(struct blit_state *ctx, /* create temp texture */ memset(&texTemp, 0, sizeof(texTemp)); - texTemp.target = PIPE_TEXTURE_2D; + texTemp.target = ctx->internal_target; texTemp.format = src_tex->format; texTemp.last_level = 0; texTemp.width0 = srcW; @@ -392,10 +408,19 @@ util_blit_pixels_writemask(struct blit_state *ctx, src_tex, srcsub, srcLeft, srcTop, srcZ0, /* src */ srcW, srcH); /* size */ - s0 = 0.0f; - s1 = 1.0f; - t0 = 0.0f; - t1 = 1.0f; + normalized = tex->target != PIPE_TEXTURE_RECT; + if(normalized) { + s0 = 0.0f; + s1 = 1.0f; + t0 = 0.0f; + t1 = 1.0f; + } + else { + s0 = 0; + s1 = srcW; + t0 = 0; + t1 = srcH; + } u_sampler_view_default_template(&sv_templ, tex, tex->format); sampler_view = pipe->create_sampler_view(pipe, tex, &sv_templ); @@ -415,20 +440,29 @@ util_blit_pixels_writemask(struct blit_state *ctx, return; } - s0 = srcX0 / (float)(u_minify(sampler_view->texture->width0, srcsub.level)); - s1 = srcX1 / (float)(u_minify(sampler_view->texture->width0, srcsub.level)); - t0 = srcY0 / (float)(u_minify(sampler_view->texture->height0, srcsub.level)); - t1 = srcY1 / (float)(u_minify(sampler_view->texture->height0, srcsub.level)); + s0 = srcX0; + s1 = srcX1; + t0 = srcY0; + t1 = srcY1; + normalized = sampler_view->texture->target != PIPE_TEXTURE_RECT; + if(normalized) + { + s0 /= (float)(u_minify(sampler_view->texture->width0, srcsub.level)); + s1 /= (float)(u_minify(sampler_view->texture->width0, srcsub.level)); + t0 /= (float)(u_minify(sampler_view->texture->height0, srcsub.level)); + t1 /= (float)(u_minify(sampler_view->texture->height0, srcsub.level)); + } } + dst_is_depth = util_format_is_depth_or_stencil(dst->format); - assert(screen->is_format_supported(screen, sampler_view->format, PIPE_TEXTURE_2D, + assert(screen->is_format_supported(screen, sampler_view->format, ctx->internal_target, sampler_view->texture->nr_samples, PIPE_BIND_SAMPLER_VIEW, 0)); - assert(screen->is_format_supported(screen, dst->format, PIPE_TEXTURE_2D, + assert(screen->is_format_supported(screen, dst->format, ctx->internal_target, dst->texture->nr_samples, - PIPE_BIND_RENDER_TARGET, 0)); - + dst_is_depth ? PIPE_BIND_DEPTH_STENCIL : + PIPE_BIND_RENDER_TARGET, 0)); /* save state (restored below) */ cso_save_blend(ctx->cso); cso_save_depth_stencil_alpha(ctx->cso); @@ -444,12 +478,15 @@ util_blit_pixels_writemask(struct blit_state *ctx, /* set misc state we care about */ cso_set_blend(ctx->cso, &ctx->blend); - cso_set_depth_stencil_alpha(ctx->cso, &ctx->depthstencil); + cso_set_depth_stencil_alpha(ctx->cso, + dst_is_depth ? &ctx->depthstencil_write : + &ctx->depthstencil_keep); cso_set_rasterizer(ctx->cso, &ctx->rasterizer); cso_set_clip(ctx->cso, &ctx->clip); cso_set_vertex_elements(ctx->cso, 2, ctx->velem); /* sampler */ + ctx->sampler.normalized_coords = normalized; ctx->sampler.min_img_filter = filter; ctx->sampler.mag_img_filter = filter; /* we've limited this already with the sampler view but you never know... */ @@ -472,22 +509,35 @@ util_blit_pixels_writemask(struct blit_state *ctx, /* texture */ cso_set_fragment_sampler_views(ctx->cso, 1, &sampler_view); - if (ctx->fs[writemask] == NULL) - ctx->fs[writemask] = - util_make_fragment_tex_shader_writemask(pipe, TGSI_TEXTURE_2D, - TGSI_INTERPOLATE_LINEAR, - writemask); - /* shaders */ - cso_set_fragment_shader_handle(ctx->cso, ctx->fs[writemask]); + if (dst_is_depth) { + if (ctx->fs_depth == NULL) + ctx->fs_depth = + util_make_fragment_tex_shader_writedepth(pipe, TGSI_TEXTURE_2D, + TGSI_INTERPOLATE_LINEAR); + + cso_set_fragment_shader_handle(ctx->cso, ctx->fs_depth); + } else { + if (ctx->fs[writemask] == NULL) + ctx->fs[writemask] = + util_make_fragment_tex_shader_writemask(pipe, TGSI_TEXTURE_2D, + TGSI_INTERPOLATE_LINEAR, + writemask); + + cso_set_fragment_shader_handle(ctx->cso, ctx->fs[writemask]); + } cso_set_vertex_shader_handle(ctx->cso, ctx->vs); /* drawing dest */ memset(&fb, 0, sizeof(fb)); fb.width = dst->width; fb.height = dst->height; - fb.nr_cbufs = 1; - fb.cbufs[0] = dst; + if (dst_is_depth) { + fb.zsbuf = dst; + } else { + fb.nr_cbufs = 1; + fb.cbufs[0] = dst; + } cso_set_framebuffer(ctx->cso, &fb); /* draw quad */ @@ -574,6 +624,7 @@ util_blit_pixels_tex(struct blit_state *ctx, int dstX1, int dstY1, float z, uint filter) { + boolean normalized = src_sampler_view->texture->target != PIPE_TEXTURE_RECT; struct pipe_framebuffer_state fb; float s0, t0, s1, t1; unsigned offset; @@ -586,10 +637,18 @@ util_blit_pixels_tex(struct blit_state *ctx, assert(tex->width0 != 0); assert(tex->height0 != 0); - s0 = srcX0 / (float)tex->width0; - s1 = srcX1 / (float)tex->width0; - t0 = srcY0 / (float)tex->height0; - t1 = srcY1 / (float)tex->height0; + s0 = srcX0; + s1 = srcX1; + t0 = srcY0; + t1 = srcY1; + + if(normalized) + { + s0 /= (float)tex->width0; + s1 /= (float)tex->width0; + t0 /= (float)tex->height0; + t1 /= (float)tex->height0; + } assert(ctx->pipe->screen->is_format_supported(ctx->pipe->screen, dst->format, PIPE_TEXTURE_2D, @@ -611,12 +670,13 @@ util_blit_pixels_tex(struct blit_state *ctx, /* set misc state we care about */ cso_set_blend(ctx->cso, &ctx->blend); - cso_set_depth_stencil_alpha(ctx->cso, &ctx->depthstencil); + cso_set_depth_stencil_alpha(ctx->cso, &ctx->depthstencil_keep); cso_set_rasterizer(ctx->cso, &ctx->rasterizer); cso_set_clip(ctx->cso, &ctx->clip); cso_set_vertex_elements(ctx->cso, 2, ctx->velem); /* sampler */ + ctx->sampler.normalized_coords = normalized; ctx->sampler.min_img_filter = filter; ctx->sampler.mag_img_filter = filter; cso_single_sampler(ctx->cso, 0, &ctx->sampler); diff --git a/src/gallium/auxiliary/util/u_blit.h b/src/gallium/auxiliary/util/u_blit.h index ef95134f32..b8a0dfce13 100644 --- a/src/gallium/auxiliary/util/u_blit.h +++ b/src/gallium/auxiliary/util/u_blit.h @@ -30,18 +30,20 @@ #define U_BLIT_H +#include "pipe/p_compiler.h" + + #ifdef __cplusplus extern "C" { #endif +struct cso_context; struct pipe_context; -struct pipe_surface; struct pipe_resource; -struct cso_context; - - -struct blit_state; +struct pipe_sampler_view; +struct pipe_subresource; +struct pipe_surface; extern struct blit_state * diff --git a/src/gallium/auxiliary/util/u_blitter.c b/src/gallium/auxiliary/util/u_blitter.c index b5b86b7214..f93ef26ae7 100644 --- a/src/gallium/auxiliary/util/u_blitter.c +++ b/src/gallium/auxiliary/util/u_blitter.c @@ -92,7 +92,7 @@ struct blitter_context_priv void *velem_state; /* Sampler state for clamping to a miplevel. */ - void *sampler_state[PIPE_MAX_TEXTURE_LEVELS]; + void *sampler_state[PIPE_MAX_TEXTURE_LEVELS * 2]; /* Rasterizer state. */ void *rs_state; @@ -254,6 +254,7 @@ void util_blitter_destroy(struct blitter_context *blitter) ctx->dsa_write_depth_keep_stencil); pipe->delete_depth_stencil_alpha_state(pipe, ctx->dsa_write_depth_stencil); pipe->delete_depth_stencil_alpha_state(pipe, ctx->dsa_keep_depth_write_stencil); + pipe->delete_depth_stencil_alpha_state(pipe, ctx->dsa_flush_depth_stencil); pipe->delete_rasterizer_state(pipe, ctx->rs_state); pipe->delete_vs_state(pipe, ctx->vs_col); @@ -271,7 +272,7 @@ void util_blitter_destroy(struct blitter_context *blitter) if (ctx->fs_col[i]) pipe->delete_fs_state(pipe, ctx->fs_col[i]); - for (i = 0; i < PIPE_MAX_TEXTURE_LEVELS; i++) + for (i = 0; i < PIPE_MAX_TEXTURE_LEVELS * 2; i++) if (ctx->sampler_state[i]) pipe->delete_sampler_state(pipe, ctx->sampler_state[i]); @@ -319,7 +320,7 @@ static void blitter_restore_CSOs(struct blitter_context_priv *ctx) */ if (ctx->base.saved_fb_state.nr_cbufs != ~0) { pipe->set_framebuffer_state(pipe, &ctx->base.saved_fb_state); - util_assign_framebuffer_state(&ctx->base.saved_fb_state, NULL); + util_unreference_framebuffer_state(&ctx->base.saved_fb_state); ctx->base.saved_fb_state.nr_cbufs = ~0; } @@ -417,16 +418,26 @@ static void blitter_set_clear_color(struct blitter_context_priv *ctx, } } -static void get_normalized_texcoords(struct pipe_resource *src, +static void get_texcoords(struct pipe_resource *src, struct pipe_subresource subsrc, unsigned x1, unsigned y1, unsigned x2, unsigned y2, - float out[4]) + boolean normalized, float out[4]) { - out[0] = x1 / (float)u_minify(src->width0, subsrc.level); - out[1] = y1 / (float)u_minify(src->height0, subsrc.level); - out[2] = x2 / (float)u_minify(src->width0, subsrc.level); - out[3] = y2 / (float)u_minify(src->height0, subsrc.level); + if(normalized) + { + out[0] = x1 / (float)u_minify(src->width0, subsrc.level); + out[1] = y1 / (float)u_minify(src->height0, subsrc.level); + out[2] = x2 / (float)u_minify(src->width0, subsrc.level); + out[3] = y2 / (float)u_minify(src->height0, subsrc.level); + } + else + { + out[0] = x1; + out[1] = y1; + out[2] = x2; + out[3] = y2; + } } static void set_texcoords_in_vertices(const float coord[4], @@ -454,7 +465,7 @@ static void blitter_set_texcoords_2d(struct blitter_context_priv *ctx, unsigned i; float coord[4]; - get_normalized_texcoords(src, subsrc, x1, y1, x2, y2, coord); + get_texcoords(src, subsrc, x1, y1, x2, y2, TRUE, coord); set_texcoords_in_vertices(coord, &ctx->vertices[0][1][0], 8); for (i = 0; i < 4; i++) { @@ -489,7 +500,7 @@ static void blitter_set_texcoords_cube(struct blitter_context_priv *ctx, float coord[4]; float st[4][2]; - get_normalized_texcoords(src, subsrc, x1, y1, x2, y2, coord); + get_texcoords(src, subsrc, x1, y1, x2, y2, TRUE, coord); set_texcoords_in_vertices(coord, &st[0][0], 2); util_map_texcoords2d_onto_cubemap(subsrc.face, @@ -523,7 +534,7 @@ static void blitter_draw_quad(struct blitter_context_priv *ctx) static INLINE void **blitter_get_sampler_state(struct blitter_context_priv *ctx, - int miplevel) + int miplevel, boolean normalized) { struct pipe_context *pipe = ctx->base.pipe; struct pipe_sampler_state *sampler_state = &ctx->template_sampler_state; @@ -531,18 +542,19 @@ void **blitter_get_sampler_state(struct blitter_context_priv *ctx, assert(miplevel < PIPE_MAX_TEXTURE_LEVELS); /* Create the sampler state on-demand. */ - if (!ctx->sampler_state[miplevel]) { + if (!ctx->sampler_state[miplevel * 2 + normalized]) { sampler_state->lod_bias = miplevel; sampler_state->min_lod = miplevel; sampler_state->max_lod = miplevel; + sampler_state->normalized_coords = normalized; - ctx->sampler_state[miplevel] = pipe->create_sampler_state(pipe, + ctx->sampler_state[miplevel * 2 + normalized] = pipe->create_sampler_state(pipe, sampler_state); } /* Return void** so that it can be passed to bind_fragment_sampler_states * directly. */ - return &ctx->sampler_state[miplevel]; + return &ctx->sampler_state[miplevel * 2 + normalized]; } static INLINE @@ -568,6 +580,8 @@ pipe_tex_to_tgsi_tex(enum pipe_texture_target pipe_tex_target) return TGSI_TEXTURE_1D; case PIPE_TEXTURE_2D: return TGSI_TEXTURE_2D; + case PIPE_TEXTURE_RECT: + return TGSI_TEXTURE_RECT; case PIPE_TEXTURE_3D: return TGSI_TEXTURE_3D; case PIPE_TEXTURE_CUBE: @@ -716,6 +730,7 @@ void util_blitter_copy_region(struct blitter_context *blitter, struct pipe_sampler_view viewTempl, *view; unsigned bind; boolean is_stencil, is_depth; + boolean normalized; /* Give up if textures are not set. */ assert(dst && src); @@ -787,6 +802,8 @@ void util_blitter_copy_region(struct blitter_context *blitter, fb_state.zsbuf = 0; } + normalized = src->target != PIPE_TEXTURE_RECT; + /* Initialize sampler view. */ u_sampler_view_default_template(&viewTempl, src, src->format); view = pipe->create_sampler_view(pipe, src, &viewTempl); @@ -795,7 +812,7 @@ void util_blitter_copy_region(struct blitter_context *blitter, pipe->bind_rasterizer_state(pipe, ctx->rs_state); pipe->bind_vs_state(pipe, ctx->vs_tex); pipe->bind_fragment_sampler_states(pipe, 1, - blitter_get_sampler_state(ctx, subsrc.level)); + blitter_get_sampler_state(ctx, subsrc.level, normalized)); pipe->bind_vertex_elements_state(pipe, ctx->velem_state); pipe->set_fragment_sampler_views(pipe, 1, &view); pipe->set_framebuffer_state(pipe, &fb_state); @@ -806,11 +823,12 @@ void util_blitter_copy_region(struct blitter_context *blitter, /* Draw the quad with the draw_rectangle callback. */ case PIPE_TEXTURE_1D: case PIPE_TEXTURE_2D: + case PIPE_TEXTURE_RECT: { /* Set texture coordinates. */ float coord[4]; - get_normalized_texcoords(src, subsrc, srcx, srcy, - srcx+width, srcy+height, coord); + get_texcoords(src, subsrc, srcx, srcy, + srcx+width, srcy+height, normalized, coord); /* Draw. */ blitter->draw_rectangle(blitter, dstx, dsty, dstx+width, dsty+height, 0, diff --git a/src/gallium/auxiliary/util/u_blitter.h b/src/gallium/auxiliary/util/u_blitter.h index f316587dea..e33d2e283f 100644 --- a/src/gallium/auxiliary/util/u_blitter.h +++ b/src/gallium/auxiliary/util/u_blitter.h @@ -27,6 +27,7 @@ #ifndef U_BLITTER_H #define U_BLITTER_H +#include "util/u_framebuffer.h" #include "util/u_inlines.h" #include "util/u_memory.h" @@ -258,45 +259,12 @@ void util_blitter_save_vertex_shader(struct blitter_context *blitter, blitter->saved_vs = vs; } -/* XXX This should probably be moved elsewhere. */ -static INLINE -void util_assign_framebuffer_state(struct pipe_framebuffer_state *dst, - const struct pipe_framebuffer_state *src) -{ - unsigned i; - - if (src) { - /* Reference all surfaces. */ - for (i = 0; i < src->nr_cbufs; i++) { - pipe_surface_reference(&dst->cbufs[i], src->cbufs[i]); - } - for (; i < dst->nr_cbufs; i++) { - pipe_surface_reference(&dst->cbufs[i], NULL); - } - - pipe_surface_reference(&dst->zsbuf, src->zsbuf); - - dst->nr_cbufs = src->nr_cbufs; - dst->width = src->width; - dst->height = src->height; - } else { - /* Set all surfaces to NULL. */ - for (i = 0; i < dst->nr_cbufs; i++) { - pipe_surface_reference(&dst->cbufs[i], NULL); - } - - pipe_surface_reference(&dst->zsbuf, NULL); - - dst->nr_cbufs = 0; - } -} - static INLINE void util_blitter_save_framebuffer(struct blitter_context *blitter, const struct pipe_framebuffer_state *state) { blitter->saved_fb_state.nr_cbufs = 0; /* It's ~0 now, meaning it's unsaved. */ - util_assign_framebuffer_state(&blitter->saved_fb_state, state); + util_copy_framebuffer_state(&blitter->saved_fb_state, state); } static INLINE diff --git a/src/gallium/auxiliary/util/u_cpu_detect.c b/src/gallium/auxiliary/util/u_cpu_detect.c index 5056351307..32519b148b 100644 --- a/src/gallium/auxiliary/util/u_cpu_detect.c +++ b/src/gallium/auxiliary/util/u_cpu_detect.c @@ -73,7 +73,9 @@ #endif +#ifdef DEBUG DEBUG_GET_ONCE_BOOL_OPTION(dump_cpu, "GALLIUM_DUMP_CPU", FALSE) +#endif struct util_cpu_caps util_cpu_caps; @@ -83,61 +85,6 @@ static int has_cpuid(void); #endif -#if defined(PIPE_ARCH_X86) - -/* The sigill handlers */ -#if defined(PIPE_OS_LINUX) /*&& defined(_POSIX_SOURCE) && defined(X86_FXSR_MAGIC)*/ -static void -sigill_handler_sse(int signal, struct sigcontext sc) -{ - /* Both the "xorps %%xmm0,%%xmm0" and "divps %xmm0,%%xmm1" - * instructions are 3 bytes long. We must increment the instruction - * pointer manually to avoid repeated execution of the offending - * instruction. - * - * If the SIGILL is caused by a divide-by-zero when unmasked - * exceptions aren't supported, the SIMD FPU status and control - * word will be restored at the end of the test, so we don't need - * to worry about doing it here. Besides, we may not be able to... - */ - sc.eip += 3; - - util_cpu_caps.has_sse=0; -} - -static void -sigfpe_handler_sse(int signal, struct sigcontext sc) -{ - if (sc.fpstate->magic != 0xffff) { - /* Our signal context has the extended FPU state, so reset the - * divide-by-zero exception mask and clear the divide-by-zero - * exception bit. - */ - sc.fpstate->mxcsr |= 0x00000200; - sc.fpstate->mxcsr &= 0xfffffffb; - } else { - /* If we ever get here, we're completely hosed. - */ - } -} -#endif /* PIPE_OS_LINUX && _POSIX_SOURCE && X86_FXSR_MAGIC */ - -#if defined(PIPE_OS_WINDOWS) -static LONG CALLBACK -win32_sig_handler_sse(EXCEPTION_POINTERS* ep) -{ - if(ep->ExceptionRecord->ExceptionCode==EXCEPTION_ILLEGAL_INSTRUCTION){ - ep->ContextRecord->Eip +=3; - util_cpu_caps.has_sse=0; - return EXCEPTION_CONTINUE_EXECUTION; - } - return EXCEPTION_CONTINUE_SEARCH; -} -#endif /* PIPE_OS_WINDOWS */ - -#endif /* PIPE_ARCH_X86 */ - - #if defined(PIPE_ARCH_PPC) && !defined(PIPE_OS_APPLE) static jmp_buf __lv_powerpc_jmpbuf; static volatile sig_atomic_t __lv_powerpc_canjump = 0; @@ -194,123 +141,8 @@ check_os_altivec_support(void) } #endif /* PIPE_ARCH_PPC */ -/* If we're running on a processor that can do SSE, let's see if we - * are allowed to or not. This will catch 2.4.0 or later kernels that - * haven't been configured for a Pentium III but are running on one, - * and RedHat patched 2.2 kernels that have broken exception handling - * support for user space apps that do SSE. - */ -#if defined(PIPE_ARCH_X86) || defined (PIPE_ARCH_X86_64) -static void -check_os_katmai_support(void) -{ -#if defined(PIPE_ARCH_X86) -#if defined(PIPE_OS_FREEBSD) - int has_sse=0, ret; - int len = sizeof (has_sse); - - ret = sysctlbyname("hw.instruction_sse", &has_sse, &len, NULL, 0); - if (ret || !has_sse) - util_cpu_caps.has_sse=0; - -#elif defined(PIPE_OS_NETBSD) || defined(PIPE_OS_OPENBSD) - int has_sse, has_sse2, ret, mib[2]; - int varlen; - - mib[0] = CTL_MACHDEP; - mib[1] = CPU_SSE; - varlen = sizeof (has_sse); - - ret = sysctl(mib, 2, &has_sse, &varlen, NULL, 0); - if (ret < 0 || !has_sse) { - util_cpu_caps.has_sse = 0; - } else { - util_cpu_caps.has_sse = 1; - } - - mib[1] = CPU_SSE2; - varlen = sizeof (has_sse2); - ret = sysctl(mib, 2, &has_sse2, &varlen, NULL, 0); - if (ret < 0 || !has_sse2) { - util_cpu_caps.has_sse2 = 0; - } else { - util_cpu_caps.has_sse2 = 1; - } - util_cpu_caps.has_sse = 0; /* FIXME ?!?!? */ - -#elif defined(PIPE_OS_WINDOWS) - LPTOP_LEVEL_EXCEPTION_FILTER exc_fil; - if (util_cpu_caps.has_sse) { - exc_fil = SetUnhandledExceptionFilter(win32_sig_handler_sse); -#if defined(PIPE_CC_GCC) - __asm __volatile ("xorps %xmm0, %xmm0"); -#elif defined(PIPE_CC_MSVC) - __asm { - xorps xmm0, xmm0 /* executing SSE instruction */ - } -#else -#error Unsupported compiler -#endif - SetUnhandledExceptionFilter(exc_fil); - } -#elif defined(PIPE_OS_LINUX) - struct sigaction saved_sigill; - struct sigaction saved_sigfpe; - - /* Save the original signal handlers. - */ - sigaction(SIGILL, NULL, &saved_sigill); - sigaction(SIGFPE, NULL, &saved_sigfpe); - - signal(SIGILL, (void (*)(int))sigill_handler_sse); - signal(SIGFPE, (void (*)(int))sigfpe_handler_sse); - - /* Emulate test for OSFXSR in CR4. The OS will set this bit if it - * supports the extended FPU save and restore required for SSE. If - * we execute an SSE instruction on a PIII and get a SIGILL, the OS - * doesn't support Streaming SIMD Exceptions, even if the processor - * does. - */ - if (util_cpu_caps.has_sse) { - __asm __volatile ("xorps %xmm1, %xmm0"); - } - - /* Emulate test for OSXMMEXCPT in CR4. The OS will set this bit if - * it supports unmasked SIMD FPU exceptions. If we unmask the - * exceptions, do a SIMD divide-by-zero and get a SIGILL, the OS - * doesn't support unmasked SIMD FPU exceptions. If we get a SIGFPE - * as expected, we're okay but we need to clean up after it. - * - * Are we being too stringent in our requirement that the OS support - * unmasked exceptions? Certain RedHat 2.2 kernels enable SSE by - * setting CR4.OSFXSR but don't support unmasked exceptions. Win98 - * doesn't even support them. We at least know the user-space SSE - * support is good in kernels that do support unmasked exceptions, - * and therefore to be safe I'm going to leave this test in here. - */ - if (util_cpu_caps.has_sse) { - /* test_os_katmai_exception_support(); */ - } - - /* Restore the original signal handlers. - */ - sigaction(SIGILL, &saved_sigill, NULL); - sigaction(SIGFPE, &saved_sigfpe, NULL); - -#else - /* We can't use POSIX signal handling to test the availability of - * SSE, so we disable it by default. - */ - util_cpu_caps.has_sse = 0; -#endif /* __linux__ */ -#endif - -#if defined(PIPE_ARCH_X86_64) - util_cpu_caps.has_sse = 1; -#endif -} - +#if defined(PIPE_ARCH_X86) || defined (PIPE_ARCH_X86_64) static int has_cpuid(void) { #if defined(PIPE_ARCH_X86) @@ -469,9 +301,6 @@ util_cpu_detect(void) util_cpu_caps.cacheline = regs2[2] & 0xFF; } - if (util_cpu_caps.has_sse) - check_os_katmai_support(); - if (!util_cpu_caps.has_sse) { util_cpu_caps.has_sse2 = 0; util_cpu_caps.has_sse3 = 0; diff --git a/src/gallium/auxiliary/util/u_debug_describe.c b/src/gallium/auxiliary/util/u_debug_describe.c new file mode 100644 index 0000000000..1c90ff3106 --- /dev/null +++ b/src/gallium/auxiliary/util/u_debug_describe.c @@ -0,0 +1,81 @@ +/************************************************************************** + * + * Copyright 2010 Luca Barbieri + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include <pipe/p_state.h> +#include <util/u_format.h> +#include <util/u_debug_describe.h> +#include <util/u_string.h> + +void +debug_describe_reference(char* buf, const struct pipe_reference*ptr) +{ + strcpy(buf, "pipe_object"); +} + +void +debug_describe_resource(char* buf, const struct pipe_resource *ptr) +{ + switch(ptr->target) + { + case PIPE_BUFFER: + util_sprintf(buf, "pipe_buffer<%u>", (unsigned)util_format_get_stride(ptr->format, ptr->width0)); + break; + case PIPE_TEXTURE_1D: + util_sprintf(buf, "pipe_texture1d<%u,%s,%u>", ptr->width0, util_format_short_name(ptr->format), ptr->last_level); + break; + case PIPE_TEXTURE_2D: + util_sprintf(buf, "pipe_texture2d<%u,%u,%s,%u>", ptr->width0, ptr->height0, util_format_short_name(ptr->format), ptr->last_level); + break; + case PIPE_TEXTURE_RECT: + util_sprintf(buf, "pipe_texture_rect<%u,%u,%s>", ptr->width0, ptr->height0, util_format_short_name(ptr->format)); + break; + case PIPE_TEXTURE_CUBE: + util_sprintf(buf, "pipe_texture_cube<%u,%u,%s,%u>", ptr->width0, ptr->height0, util_format_short_name(ptr->format), ptr->last_level); + break; + case PIPE_TEXTURE_3D: + util_sprintf(buf, "pipe_texture3d<%u,%u,%u,%s,%u>", ptr->width0, ptr->height0, ptr->depth0, util_format_short_name(ptr->format), ptr->last_level); + break; + default: + util_sprintf(buf, "pipe_martian_resource<%u>", ptr->target); + break; + } +} + +void +debug_describe_surface(char* buf, const struct pipe_surface *ptr) +{ + char res[128]; + debug_describe_resource(res, ptr->texture); + util_sprintf(buf, "pipe_surface<%s,%u,%u,%u>", res, ptr->face, ptr->level, ptr->zslice); +} + +void +debug_describe_sampler_view(char* buf, const struct pipe_sampler_view *ptr) +{ + char res[128]; + debug_describe_resource(res, ptr->texture); + util_sprintf(buf, "pipe_sampler_view<%s,%s>", res, util_format_short_name(ptr->format)); +} diff --git a/src/gallium/auxiliary/util/u_debug_describe.h b/src/gallium/auxiliary/util/u_debug_describe.h new file mode 100644 index 0000000000..26d1f803bf --- /dev/null +++ b/src/gallium/auxiliary/util/u_debug_describe.h @@ -0,0 +1,49 @@ +/************************************************************************** + * + * Copyright 2010 Luca Barbieri + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef U_DEBUG_DESCRIBE_H_ +#define U_DEBUG_DESCRIBE_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +struct pipe_reference; +struct pipe_resource; +struct pipe_surface; +struct pipe_sampler_view; + +/* a 256-byte buffer is necessary and sufficient */ +void debug_describe_reference(char* buf, const struct pipe_reference*ptr); +void debug_describe_resource(char* buf, const struct pipe_resource *ptr); +void debug_describe_surface(char* buf, const struct pipe_surface *ptr); +void debug_describe_sampler_view(char* buf, const struct pipe_sampler_view *ptr); + +#ifdef __cplusplus +} +#endif + +#endif /* U_DEBUG_DESCRIBE_H_ */ diff --git a/src/gallium/auxiliary/util/u_debug_refcnt.c b/src/gallium/auxiliary/util/u_debug_refcnt.c new file mode 100644 index 0000000000..40a26c9c69 --- /dev/null +++ b/src/gallium/auxiliary/util/u_debug_refcnt.c @@ -0,0 +1,181 @@ +/************************************************************************** + * + * Copyright 2010 Luca Barbieri + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#if defined(DEBUG) && (!defined(PIPE_OS_WINDOWS) || defined(PIPE_SUBSYSTEM_WINDOWS_USER)) + +/* see http://www.mozilla.org/performance/refcnt-balancer.html for what do with the output + * on Linux, use tools/addr2line.sh to postprocess it before anything else + **/ +#include <util/u_debug.h> +#include <util/u_debug_refcnt.h> +#include <util/u_debug_stack.h> +#include <util/u_debug_symbol.h> +#include <util/u_string.h> +#include <util/u_hash_table.h> +#include <os/os_thread.h> +#include <os/os_stream.h> + +int debug_refcnt_state; + +struct os_stream* stream; + +/* TODO: maybe move this serial machinery to a stand-alone module and expose it? */ +static pipe_mutex serials_mutex; +static struct util_hash_table* serials_hash; +static unsigned serials_last; + +static unsigned hash_ptr(void* p) +{ + return (unsigned)(uintptr_t)p; +} + +static int compare_ptr(void* a, void* b) +{ + if(a == b) + return 0; + else if(a < b) + return -1; + else + return 1; +} + +static boolean debug_serial(void* p, unsigned* pserial) +{ + unsigned serial; + boolean found = TRUE; + pipe_mutex_lock(serials_mutex); + if(!serials_hash) + serials_hash = util_hash_table_create(hash_ptr, compare_ptr); + serial = (unsigned)(uintptr_t)util_hash_table_get(serials_hash, p); + if(!serial) + { + /* time to stop logging... (you'll have a 100 GB logfile at least at this point) + * TODO: avoid this + */ + serial = ++serials_last; + if(!serial) + { + debug_error("More than 2^32 objects detected, aborting.\n"); + os_abort(); + } + + util_hash_table_set(serials_hash, p, (void*)(uintptr_t)serial); + found = FALSE; + } + pipe_mutex_unlock(serials_mutex); + *pserial = serial; + return found; +} + +static void debug_serial_delete(void* p) +{ + pipe_mutex_lock(serials_mutex); + util_hash_table_remove(serials_hash, p); + pipe_mutex_unlock(serials_mutex); +} + +#define STACK_LEN 64 + +static void dump_stack(const char* symbols[STACK_LEN]) +{ + unsigned i; + for(i = 0; i < STACK_LEN; ++i) + { + if(symbols[i]) + os_stream_printf(stream, "%s\n", symbols[i]); + } + os_stream_write(stream, "\n", 1); +} + +void debug_reference_slowpath(const struct pipe_reference* p, debug_reference_descriptor get_desc, int change) +{ + if(debug_refcnt_state < 0) + return; + + if(!debug_refcnt_state) + { + const char* filename = debug_get_option("GALLIUM_REFCNT_LOG", NULL); + if(filename && filename[0]) + stream = os_file_stream_create(filename); + + if(stream) + debug_refcnt_state = 1; + else + debug_refcnt_state = -1; + } + + if(debug_refcnt_state > 0) + { + struct debug_stack_frame frames[STACK_LEN]; + const char* symbols[STACK_LEN]; + char buf[1024]; + + unsigned i; + unsigned refcnt = p->count; + unsigned serial; + boolean existing = debug_serial((void*)p, &serial); + + debug_backtrace_capture(frames, 1, STACK_LEN); + for(i = 0; i < STACK_LEN; ++i) + { + if(frames[i].function) + symbols[i] = debug_symbol_name_cached(frames[i].function); + else + symbols[i] = 0; + } + + get_desc(buf, p); + + if(!existing) + { + os_stream_printf(stream, "<%s> %p %u Create\n", buf, p, serial); + dump_stack(symbols); + + /* this is there to provide a gradual change even if we don't see the initialization */ + for(i = 1; i <= refcnt - change; ++i) + { + os_stream_printf(stream, "<%s> %p %u AddRef %u\n", buf, p, serial, i); + dump_stack(symbols); + } + } + + if(change) + { + os_stream_printf(stream, "<%s> %p %u %s %u\n", buf, p, serial, change > 0 ? "AddRef" : "Release", refcnt); + dump_stack(symbols); + } + + if(!refcnt) + { + debug_serial_delete((void*)p); + os_stream_printf(stream, "<%s> %p %u Destroy\n", buf, p, serial); + dump_stack(symbols); + } + + os_stream_flush(stream); + } +} +#endif diff --git a/src/gallium/auxiliary/util/u_debug_refcnt.h b/src/gallium/auxiliary/util/u_debug_refcnt.h new file mode 100644 index 0000000000..bea2d1c478 --- /dev/null +++ b/src/gallium/auxiliary/util/u_debug_refcnt.h @@ -0,0 +1,63 @@ +/************************************************************************** + * + * Copyright 2010 Luca Barbieri + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef U_DEBUG_REFCNT_H_ +#define U_DEBUG_REFCNT_H_ + +#include <pipe/p_config.h> +#include <pipe/p_state.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef void (*debug_reference_descriptor)(char*, const struct pipe_reference*); + +#if defined(DEBUG) && (!defined(PIPE_OS_WINDOWS) || defined(PIPE_SUBSYSTEM_WINDOWS_USER)) + +extern int debug_refcnt_state; + +void debug_reference_slowpath(const struct pipe_reference* p, debug_reference_descriptor get_desc, int change); + +static INLINE void debug_reference(const struct pipe_reference* p, debug_reference_descriptor get_desc, int change) +{ + if (debug_refcnt_state >= 0) + debug_reference_slowpath(p, get_desc, change); +} + +#else + +static INLINE void debug_reference(const struct pipe_reference* p, debug_reference_descriptor get_desc, int change) +{ +} + +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* U_DEBUG_REFCNT_H_ */ diff --git a/src/gallium/auxiliary/util/u_debug_symbol.c b/src/gallium/auxiliary/util/u_debug_symbol.c index 6e250575d6..332952af88 100644 --- a/src/gallium/auxiliary/util/u_debug_symbol.c +++ b/src/gallium/auxiliary/util/u_debug_symbol.c @@ -33,9 +33,12 @@ */ #include "pipe/p_compiler.h" +#include "os/os_thread.h" +#include "u_string.h" #include "u_debug.h" #include "u_debug_symbol.h" +#include "u_hash_table.h" #if defined(PIPE_SUBSYSTEM_WINDOWS_USER) && defined(PIPE_ARCH_X86) @@ -113,8 +116,8 @@ BOOL WINAPI j_SymGetSymFromAddr(HANDLE hProcess, DWORD Address, PDWORD Displacem } -static INLINE boolean -debug_symbol_print_imagehlp(const void *addr) +static INLINE void +debug_symbol_name_imagehlp(const void *addr, char* buf, unsigned size) { HANDLE hProcess; BYTE symbolBuffer[1024]; @@ -131,25 +134,95 @@ debug_symbol_print_imagehlp(const void *addr) if(j_SymInitialize(hProcess, NULL, TRUE)) bSymInitialized = TRUE; } - + if(!j_SymGetSymFromAddr(hProcess, (DWORD)addr, &dwDisplacement, pSymbol)) - return FALSE; + buf[0] = 0; + else + { + strncpy(buf, pSymbol->Name, size); + buf[size - 1] = 0; + } +} +#endif - debug_printf("\t%s\n", pSymbol->Name); +#ifdef __GLIBC__ +#include <execinfo.h> - return TRUE; - +/* This can only provide dynamic symbols, or binary offsets into a file. + * + * To fix this, post-process the output with tools/addr2line.sh + */ +static INLINE void +debug_symbol_name_glibc(const void *addr, char* buf, unsigned size) +{ + char** syms = backtrace_symbols((void**)&addr, 1); + strncpy(buf, syms[0], size); + buf[size - 1] = 0; + free(syms); } #endif - void -debug_symbol_print(const void *addr) +debug_symbol_name(const void *addr, char* buf, unsigned size) { #if defined(PIPE_SUBSYSTEM_WINDOWS_USER) && defined(PIPE_ARCH_X86) - if(debug_symbol_print_imagehlp(addr)) + debug_symbol_name_imagehlp(addr, buf, size); + if(buf[0]) return; #endif - - debug_printf("\t%p\n", addr); + +#ifdef __GLIBC__ + debug_symbol_name_glibc(addr, buf, size); + if(buf[0]) + return; +#endif + + util_snprintf(buf, size, "%p", addr); + buf[size - 1] = 0; +} + +void +debug_symbol_print(const void *addr) +{ + char buf[1024]; + debug_symbol_name(addr, buf, sizeof(buf)); + debug_printf("\t%s\n", buf); +} + +struct util_hash_table* symbols_hash; +pipe_mutex symbols_mutex; + +static unsigned hash_ptr(void* p) +{ + return (unsigned)(uintptr_t)p; +} + +static int compare_ptr(void* a, void* b) +{ + if(a == b) + return 0; + else if(a < b) + return -1; + else + return 1; +} + +const char* +debug_symbol_name_cached(const void *addr) +{ + const char* name; + pipe_mutex_lock(symbols_mutex); + if(!symbols_hash) + symbols_hash = util_hash_table_create(hash_ptr, compare_ptr); + name = util_hash_table_get(symbols_hash, (void*)addr); + if(!name) + { + char buf[1024]; + debug_symbol_name(addr, buf, sizeof(buf)); + name = strdup(buf); + + util_hash_table_set(symbols_hash, (void*)addr, (void*)name); + } + pipe_mutex_unlock(symbols_mutex); + return name; } diff --git a/src/gallium/auxiliary/util/u_debug_symbol.h b/src/gallium/auxiliary/util/u_debug_symbol.h index 021586987b..b247706c2a 100644 --- a/src/gallium/auxiliary/util/u_debug_symbol.h +++ b/src/gallium/auxiliary/util/u_debug_symbol.h @@ -43,8 +43,13 @@ extern "C" { void -debug_symbol_print(const void *addr); +debug_symbol_name(const void *addr, char* buf, unsigned size); + +const char* +debug_symbol_name_cached(const void *addr); +void +debug_symbol_print(const void *addr); #ifdef __cplusplus } diff --git a/src/gallium/auxiliary/util/u_dirty_surfaces.h b/src/gallium/auxiliary/util/u_dirty_surfaces.h index 99f260bf96..fd1bbe5ffd 100644 --- a/src/gallium/auxiliary/util/u_dirty_surfaces.h +++ b/src/gallium/auxiliary/util/u_dirty_surfaces.h @@ -1,9 +1,39 @@ +/************************************************************************** + * + * Copyright 2010 Luca Barbieri + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + #ifndef U_DIRTY_SURFACES_H_ #define U_DIRTY_SURFACES_H_ +#include "pipe/p_state.h" + #include "util/u_double_list.h" #include "util/u_math.h" +struct pipe_context; + typedef void (*util_dirty_surface_flush_t) (struct pipe_context *, struct pipe_surface *); struct util_dirty_surfaces diff --git a/src/gallium/auxiliary/util/u_draw.h b/src/gallium/auxiliary/util/u_draw.h index 2a91ea0f9a..f06d09ef91 100644 --- a/src/gallium/auxiliary/util/u_draw.h +++ b/src/gallium/auxiliary/util/u_draw.h @@ -31,6 +31,7 @@ #include "pipe/p_compiler.h" #include "pipe/p_context.h" +#include "pipe/p_state.h" static INLINE void diff --git a/src/gallium/auxiliary/util/u_dynarray.h b/src/gallium/auxiliary/util/u_dynarray.h index 9d1c1713a7..980cadf22d 100644 --- a/src/gallium/auxiliary/util/u_dynarray.h +++ b/src/gallium/auxiliary/util/u_dynarray.h @@ -106,6 +106,9 @@ util_dynarray_trim(struct util_dynarray *buf) #define util_dynarray_pop_ptr(buf, type) (type*)((char*)(buf)->data + ((buf)->size -= sizeof(type))) #define util_dynarray_pop(buf, type) *util_dynarray_pop_ptr(buf, type) #define util_dynarray_contains(buf, type) ((buf)->size >= sizeof(type)) +#define util_dynarray_element(buf, type, idx) ((type*)(buf)->data + (idx)) +#define util_dynarray_begin(buf) ((buf)->data) +#define util_dynarray_end(buf) ((void*)util_dynarray_element((buf), char, (buf)->size)) #endif /* U_DYNARRAY_H */ diff --git a/src/gallium/auxiliary/util/u_gen_mipmap.c b/src/gallium/auxiliary/util/u_gen_mipmap.c index b7fe2d3003..6a931a9581 100644 --- a/src/gallium/auxiliary/util/u_gen_mipmap.c +++ b/src/gallium/auxiliary/util/u_gen_mipmap.c @@ -1255,6 +1255,7 @@ fallback_gen_mipmap(struct gen_mipmap_state *ctx, make_1d_mipmap(ctx, pt, face, baseLevel, lastLevel); break; case PIPE_TEXTURE_2D: + case PIPE_TEXTURE_RECT: case PIPE_TEXTURE_CUBE: make_2d_mipmap(ctx, pt, face, baseLevel, lastLevel); break; diff --git a/src/gallium/auxiliary/util/u_inlines.h b/src/gallium/auxiliary/util/u_inlines.h index 540305c146..78473bf35a 100644 --- a/src/gallium/auxiliary/util/u_inlines.h +++ b/src/gallium/auxiliary/util/u_inlines.h @@ -33,6 +33,8 @@ #include "pipe/p_state.h" #include "pipe/p_screen.h" #include "util/u_debug.h" +#include "util/u_debug_describe.h" +#include "util/u_debug_refcnt.h" #include "util/u_atomic.h" #include "util/u_box.h" #include "util/u_math.h" @@ -67,7 +69,9 @@ pipe_is_referenced(struct pipe_reference *reference) * \return TRUE if the object's refcount hits zero and should be destroyed. */ static INLINE boolean -pipe_reference(struct pipe_reference *ptr, struct pipe_reference *reference) +pipe_reference_described(struct pipe_reference *ptr, + struct pipe_reference *reference, + debug_reference_descriptor get_desc) { boolean destroy = FALSE; @@ -76,6 +80,7 @@ pipe_reference(struct pipe_reference *ptr, struct pipe_reference *reference) if (reference) { assert(pipe_is_referenced(reference)); p_atomic_inc(&reference->count); + debug_reference(reference, get_desc, 1); } if (ptr) { @@ -83,41 +88,49 @@ pipe_reference(struct pipe_reference *ptr, struct pipe_reference *reference) if (p_atomic_dec_zero(&ptr->count)) { destroy = TRUE; } + debug_reference(ptr, get_desc, -1); } } return destroy; } +static INLINE boolean +pipe_reference(struct pipe_reference *ptr, struct pipe_reference *reference) +{ + return pipe_reference_described(ptr, reference, + (debug_reference_descriptor)debug_describe_reference); +} static INLINE void pipe_surface_reference(struct pipe_surface **ptr, struct pipe_surface *surf) { struct pipe_surface *old_surf = *ptr; - if (pipe_reference(&(*ptr)->reference, &surf->reference)) + if (pipe_reference_described(&(*ptr)->reference, &surf->reference, + (debug_reference_descriptor)debug_describe_surface)) old_surf->texture->screen->tex_surface_destroy(old_surf); *ptr = surf; } - static INLINE void pipe_resource_reference(struct pipe_resource **ptr, struct pipe_resource *tex) { struct pipe_resource *old_tex = *ptr; - if (pipe_reference(&(*ptr)->reference, &tex->reference)) + if (pipe_reference_described(&(*ptr)->reference, &tex->reference, + (debug_reference_descriptor)debug_describe_resource)) old_tex->screen->resource_destroy(old_tex->screen, old_tex); *ptr = tex; } - static INLINE void pipe_sampler_view_reference(struct pipe_sampler_view **ptr, struct pipe_sampler_view *view) { struct pipe_sampler_view *old_view = *ptr; - if (pipe_reference(&(*ptr)->reference, &view->reference)) + if (pipe_reference_described(&(*ptr)->reference, &view->reference, + (debug_reference_descriptor)debug_describe_sampler_view)) old_view->context->sampler_view_destroy(old_view->context, old_view); *ptr = view; } diff --git a/src/gallium/auxiliary/util/u_linkage.c b/src/gallium/auxiliary/util/u_linkage.c new file mode 100644 index 0000000000..2f6f41ba84 --- /dev/null +++ b/src/gallium/auxiliary/util/u_linkage.c @@ -0,0 +1,149 @@ +/************************************************************************** + * + * Copyright 2010 Luca Barbieri + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include "util/u_debug.h" +#include "pipe/p_shader_tokens.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_scan.h" +#include "util/u_linkage.h" + +/* we must only record the registers that are actually used, not just declared */ +static INLINE boolean +util_semantic_set_test_and_set(struct util_semantic_set *set, unsigned value) +{ + unsigned mask = 1 << (value % (sizeof(long) * 8)); + unsigned long *p = &set->masks[value / (sizeof(long) * 8)]; + unsigned long v = *p & mask; + *p |= mask; + return !!v; +} + +unsigned +util_semantic_set_from_program_file(struct util_semantic_set *set, const struct tgsi_token *tokens, enum tgsi_file_type file) +{ + struct tgsi_shader_info info; + struct tgsi_parse_context parse; + unsigned count = 0; + ubyte *semantic_name; + ubyte *semantic_index; + + tgsi_scan_shader(tokens, &info); + + if(file == TGSI_FILE_INPUT) + { + semantic_name = info.input_semantic_name; + semantic_index = info.input_semantic_index; + } + else if(file == TGSI_FILE_OUTPUT) + { + semantic_name = info.output_semantic_name; + semantic_index = info.output_semantic_index; + } + else + { + assert(0); + semantic_name = NULL; + semantic_index = NULL; + } + + tgsi_parse_init(&parse, tokens); + + memset(set->masks, 0, sizeof(set->masks)); + while(!tgsi_parse_end_of_tokens(&parse)) + { + tgsi_parse_token(&parse); + + if(parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION) + { + const struct tgsi_full_instruction *finst = &parse.FullToken.FullInstruction; + unsigned i; + for(i = 0; i < finst->Instruction.NumDstRegs; ++i) + { + if(finst->Dst[i].Register.File == file) + { + unsigned idx = finst->Dst[i].Register.Index; + if(semantic_name[idx] == TGSI_SEMANTIC_GENERIC) + { + if(!util_semantic_set_test_and_set(set, semantic_index[idx])) + ++count; + } + } + } + + for(i = 0; i < finst->Instruction.NumSrcRegs; ++i) + { + if(finst->Src[i].Register.File == file) + { + unsigned idx = finst->Src[i].Register.Index; + if(semantic_name[idx] == TGSI_SEMANTIC_GENERIC) + { + if(!util_semantic_set_test_and_set(set, semantic_index[idx])) + ++count; + } + } + } + } + } + tgsi_parse_free(&parse); + + return count; +} + +#define UTIL_SEMANTIC_SET_FOR_EACH(i, set) for(i = 0; i < 256; ++i) if(set->masks[i / (sizeof(long) * 8)] & (1 << (i % (sizeof(long) * 8)))) + +void +util_semantic_layout_from_set(unsigned char *layout, const struct util_semantic_set *set, unsigned efficient_slots, unsigned num_slots) +{ + int first = -1; + int last = -1; + unsigned i; + + memset(layout, 0xff, num_slots); + + UTIL_SEMANTIC_SET_FOR_EACH(i, set) + { + if(first < 0) + first = i; + last = i; + } + + if(last < efficient_slots) + { + UTIL_SEMANTIC_SET_FOR_EACH(i, set) + layout[i] = i; + } + else if((last - first) < efficient_slots) + { + UTIL_SEMANTIC_SET_FOR_EACH(i, set) + layout[i - first] = i; + } + else + { + unsigned idx = 0; + UTIL_SEMANTIC_SET_FOR_EACH(i, set) + layout[idx++] = i; + } +} diff --git a/src/gallium/auxiliary/util/u_linkage.h b/src/gallium/auxiliary/util/u_linkage.h new file mode 100644 index 0000000000..4720e0ee60 --- /dev/null +++ b/src/gallium/auxiliary/util/u_linkage.h @@ -0,0 +1,66 @@ +/************************************************************************** + * + * Copyright 2010 Luca Barbieri + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef U_LINKAGE_H_ +#define U_LINKAGE_H_ + +#include "pipe/p_compiler.h" +#include "pipe/p_shader_tokens.h" + +struct util_semantic_set +{ + unsigned long masks[256 / 8 / sizeof(unsigned long)]; +}; + +static INLINE bool +util_semantic_set_contains(struct util_semantic_set *set, unsigned char value) +{ + return !!(set->masks[value / (sizeof(long) * 8)] & (1 << (value / (sizeof(long) * 8)))); +} + +unsigned util_semantic_set_from_program_file(struct util_semantic_set *set, const struct tgsi_token *tokens, enum tgsi_file_type file); + +/* efficient_slots is the number of slots such that hardware performance is + * the same for using that amount, with holes, or less slots but with less + * holes. + * + * num_slots is the size of the layout array and hardware limit instead. + * + * efficient_slots == 0 or efficient_solts == num_slots are typical settings. + */ +void util_semantic_layout_from_set(unsigned char *layout, const struct util_semantic_set *set, unsigned efficient_slots, unsigned num_slots); + +static INLINE void +util_semantic_table_from_layout(unsigned char *table, unsigned char *layout, unsigned char first_slot_value, unsigned char num_slots) +{ + int i; + memset(table, 0xff, sizeof(table)); + + for(i = 0; i < num_slots; ++i) + table[layout[i]] = first_slot_value + i; +} + +#endif /* U_LINKAGE_H_ */ diff --git a/src/gallium/auxiliary/util/u_math.h b/src/gallium/auxiliary/util/u_math.h index fe19466436..69a7681494 100644 --- a/src/gallium/auxiliary/util/u_math.h +++ b/src/gallium/auxiliary/util/u_math.h @@ -361,16 +361,6 @@ util_is_inf_or_nan(float x) /** - * Test whether x is a power of two. - */ -static INLINE boolean -util_is_pot(unsigned x) -{ - return (x & (x - 1)) == 0; -} - - -/** * Find first bit set in word. Least significant bit is 1. * Return 0 if no bits set. */ @@ -566,6 +556,9 @@ util_bswap16(uint16_t n) #define MIN3( A, B, C ) MIN2( MIN2( A, B ), C ) #define MAX3( A, B, C ) MAX2( MAX2( A, B ), C ) +#define MIN4( A, B, C, D ) MIN2( MIN2( A, B ), MIN2(C, D) ) +#define MAX4( A, B, C, D ) MAX2( MAX2( A, B ), MAX2(C, D) ) + /** * Align a value, only works pot alignemnts. diff --git a/src/gallium/auxiliary/util/u_pack_color.h b/src/gallium/auxiliary/util/u_pack_color.h index 5f113f742b..aae8b8bdf1 100644 --- a/src/gallium/auxiliary/util/u_pack_color.h +++ b/src/gallium/auxiliary/util/u_pack_color.h @@ -42,12 +42,18 @@ #include "util/u_math.h" - +/** + * Helper union for packing pixel values. + * Will often contain values in formats which are too complex to be described + * in simple terms, hence might just effectively contain a number of bytes. + * Must be big enough to hold data for all formats (currently 256 bits). + */ union util_color { ubyte ub; ushort us; uint ui; float f[4]; + double d[4]; }; /** diff --git a/src/gallium/auxiliary/util/u_rect.c b/src/gallium/auxiliary/util/u_rect.c index 9bbcf1c8c4..56fcfac069 100644 --- a/src/gallium/auxiliary/util/u_rect.c +++ b/src/gallium/auxiliary/util/u_rect.c @@ -32,6 +32,7 @@ #include "util/u_format.h" #include "util/u_rect.h" +#include "util/u_pack_color.h" /** @@ -94,7 +95,7 @@ util_fill_rect(ubyte * dst, unsigned dst_y, unsigned width, unsigned height, - uint32_t value) + union util_color *uc) { unsigned i, j; unsigned width_size; @@ -110,40 +111,54 @@ util_fill_rect(ubyte * dst, dst_y /= blockheight; width = (width + blockwidth - 1)/blockwidth; height = (height + blockheight - 1)/blockheight; - + dst += dst_x * blocksize; dst += dst_y * dst_stride; width_size = width * blocksize; - + switch (blocksize) { case 1: if(dst_stride == width_size) - memset(dst, (ubyte) value, height * width_size); + memset(dst, uc->ub, height * width_size); else { - for (i = 0; i < height; i++) { - memset(dst, (ubyte) value, width_size); - dst += dst_stride; - } + for (i = 0; i < height; i++) { + memset(dst, uc->ub, width_size); + dst += dst_stride; + } } break; case 2: for (i = 0; i < height; i++) { - uint16_t *row = (uint16_t *)dst; - for (j = 0; j < width; j++) - *row++ = (uint16_t) value; - dst += dst_stride; + uint16_t *row = (uint16_t *)dst; + for (j = 0; j < width; j++) + *row++ = uc->us; + dst += dst_stride; } break; case 4: for (i = 0; i < height; i++) { - uint32_t *row = (uint32_t *)dst; - for (j = 0; j < width; j++) - *row++ = value; - dst += dst_stride; + uint32_t *row = (uint32_t *)dst; + for (j = 0; j < width; j++) + *row++ = uc->ui; + dst += dst_stride; + } + break; + case 8: + case 12: + case 16: + case 24: + case 32: + for (i = 0; i < height; i++) { + ubyte *row = dst; + for (j = 0; j < width; j++) { + memcpy(row, uc, blocksize); + row += blocksize; + } + dst += dst_stride; } break; default: - assert(0); - break; + assert(0); + break; } } diff --git a/src/gallium/auxiliary/util/u_rect.h b/src/gallium/auxiliary/util/u_rect.h index 40d57e662d..4cb90d3c31 100644 --- a/src/gallium/auxiliary/util/u_rect.h +++ b/src/gallium/auxiliary/util/u_rect.h @@ -26,17 +26,67 @@ **************************************************************************/ -/** - * Pipe copy/fill rect helpers. +#ifndef U_RECT_H +#define U_RECT_H + +#include "pipe/p_compiler.h" + +struct u_rect { + int x0, x1; + int y0, y1; +}; + +/* Do two rectangles intersect? */ +static INLINE boolean +u_rect_test_intersection(const struct u_rect *a, + const struct u_rect *b) +{ + return (!(a->x1 < b->x0 || + b->x1 < a->x0 || + a->y1 < b->y0 || + b->y1 < a->y0)); +} +/* Find the intersection of two rectangles known to intersect. + */ +static INLINE void +u_rect_find_intersection(const struct u_rect *a, + struct u_rect *b) +{ + /* Caller should verify intersection exists before calling. + */ + if (b->x0 < a->x0) b->x0 = a->x0; + if (b->x1 > a->x1) b->x1 = a->x1; + if (b->y0 < a->y0) b->y0 = a->y0; + if (b->y1 > a->y1) b->y1 = a->y1; +} -#ifndef U_RECT_H -#define U_RECT_H +static INLINE void +u_rect_possible_intersection(const struct u_rect *a, + struct u_rect *b) +{ + if (u_rect_test_intersection(a,b)) { + u_rect_find_intersection(a,b); + } + else { + b->x0 = b->x1 = b->y0 = b->y1 = 0; + } +} #include "pipe/p_format.h" +#include "util/u_pack_color.h" + + + +/********************************************************************** + * Pipe copy/fill rect helpers. + */ +/* These really should move to a different file: + */ +#include "pipe/p_format.h" extern void util_copy_rect(ubyte * dst, enum pipe_format format, @@ -47,7 +97,7 @@ util_copy_rect(ubyte * dst, enum pipe_format format, extern void util_fill_rect(ubyte * dst, enum pipe_format format, unsigned dst_stride, unsigned dst_x, unsigned dst_y, - unsigned width, unsigned height, uint32_t value); + unsigned width, unsigned height, union util_color *uc); #endif /* U_RECT_H */ diff --git a/src/gallium/auxiliary/util/u_simple_shaders.c b/src/gallium/auxiliary/util/u_simple_shaders.c index 5b682f496c..58ef68377f 100644 --- a/src/gallium/auxiliary/util/u_simple_shaders.c +++ b/src/gallium/auxiliary/util/u_simple_shaders.c @@ -37,6 +37,7 @@ #include "pipe/p_context.h" #include "pipe/p_shader_tokens.h" +#include "pipe/p_state.h" #include "util/u_simple_shaders.h" #include "util/u_debug.h" #include "tgsi/tgsi_ureg.h" diff --git a/src/gallium/auxiliary/util/u_split_prim.h b/src/gallium/auxiliary/util/u_split_prim.h index 206e1ec311..7f80fc1270 100644 --- a/src/gallium/auxiliary/util/u_split_prim.h +++ b/src/gallium/auxiliary/util/u_split_prim.h @@ -1,5 +1,12 @@ /* Originally written by Ben Skeggs for the nv50 driver*/ -#include <pipe/p_defines.h> + +#ifndef U_SPLIT_PRIM_H +#define U_SPLIT_PRIM_H + +#include "pipe/p_defines.h" +#include "pipe/p_compiler.h" + +#include "util/u_debug.h" struct util_split_prim { void *priv; @@ -48,7 +55,7 @@ util_split_prim_next(struct util_split_prim *s, unsigned max_verts) } } - if (s->p_start + s->close_first + max_verts >= s->p_end) { + if ((s->p_end - s->p_start) + s->close_first <= max_verts) { s->emit(s->priv, s->p_start, s->p_end - s->p_start); if (s->close_first) s->emit(s->priv, s->start, 1); @@ -103,3 +110,5 @@ util_split_prim_next(struct util_split_prim *s, unsigned max_verts) s->p_start += (max_verts - repeat); return FALSE; } + +#endif /* U_SPLIT_PRIM_H */ diff --git a/src/gallium/auxiliary/util/u_staging.c b/src/gallium/auxiliary/util/u_staging.c index 607c31f5ee..c5d68f8df8 100644 --- a/src/gallium/auxiliary/util/u_staging.c +++ b/src/gallium/auxiliary/util/u_staging.c @@ -1,3 +1,29 @@ +/************************************************************************** + * + * Copyright 2010 Luca Barbieri + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + #include "util/u_staging.h" #include "pipe/p_context.h" #include "util/u_memory.h" @@ -8,7 +34,7 @@ util_staging_resource_template(struct pipe_resource *pt, unsigned width, unsigne { memset(template, 0, sizeof(struct pipe_resource)); if(pt->target != PIPE_BUFFER && depth <= 1) - template->target = PIPE_TEXTURE_2D; + template->target = PIPE_TEXTURE_RECT; else template->target = pt->target; template->format = pt->format; @@ -23,20 +49,16 @@ util_staging_resource_template(struct pipe_resource *pt, unsigned width, unsigne } struct util_staging_transfer * -util_staging_transfer_new(struct pipe_context *pipe, +util_staging_transfer_init(struct pipe_context *pipe, struct pipe_resource *pt, struct pipe_subresource sr, unsigned usage, const struct pipe_box *box, - bool direct) + bool direct, struct util_staging_transfer *tx) { struct pipe_screen *pscreen = pipe->screen; - struct util_staging_transfer *tx; - struct pipe_resource staging_resource_template; - tx = CALLOC_STRUCT(util_staging_transfer); - if (!tx) - return NULL; + struct pipe_resource staging_resource_template; pipe_resource_reference(&tx->base.resource, pt); tx->base.sr = sr; diff --git a/src/gallium/auxiliary/util/u_staging.h b/src/gallium/auxiliary/util/u_staging.h index 602faa2971..1aab78cc88 100644 --- a/src/gallium/auxiliary/util/u_staging.h +++ b/src/gallium/auxiliary/util/u_staging.h @@ -1,3 +1,29 @@ +/************************************************************************** + * + * Copyright 2010 Luca Barbieri + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + /* Direct3D 10/11 has no concept of transfers. Applications instead * create resources with a STAGING or DYNAMIC usage, copy between them * and the real resource and use Map to map the STAGING/DYNAMIC resource. @@ -21,15 +47,15 @@ struct util_staging_transfer { }; /* user must be stride, slice_stride and offset */ -/* pt->usage == PIPE_USAGE_DYNAMIC should be a good value to pass for direct */ -/* staging resource is currently created with PIPE_USAGE_DYNAMIC */ +/* pt->usage == PIPE_USAGE_DYNAMIC || pt->usage == PIPE_USAGE_STAGING should be a good value to pass for direct */ +/* staging resource is currently created with PIPE_USAGE_STAGING */ struct util_staging_transfer * -util_staging_transfer_new(struct pipe_context *pipe, +util_staging_transfer_init(struct pipe_context *pipe, struct pipe_resource *pt, struct pipe_subresource sr, unsigned usage, const struct pipe_box *box, - bool direct); + bool direct, struct util_staging_transfer *tx); void util_staging_transfer_destroy(struct pipe_context *pipe, struct pipe_transfer *ptx); diff --git a/src/gallium/auxiliary/util/u_surface.c b/src/gallium/auxiliary/util/u_surface.c index cab7691c70..af99163b2e 100644 --- a/src/gallium/auxiliary/util/u_surface.c +++ b/src/gallium/auxiliary/util/u_surface.c @@ -216,7 +216,7 @@ util_clear_render_target(struct pipe_context *pipe, assert(dst->texture); if (!dst->texture) return; - util_pack_color(rgba, dst->texture->format, &uc); + dst_trans = pipe_get_transfer(pipe, dst->texture, dst->face, @@ -232,46 +232,10 @@ util_clear_render_target(struct pipe_context *pipe, if (dst_map) { assert(dst_trans->stride > 0); - switch (util_format_get_blocksize(dst->texture->format)) { - case 1: - case 2: - case 4: - util_pack_color(rgba, dst->texture->format, &uc); - util_fill_rect(dst_map, dst->texture->format, - dst_trans->stride, - 0, 0, width, height, uc.ui); - break; - case 8: - { - /* expand the 4-byte clear value to an 8-byte value */ - /* should probably not convert back from ubyte but not - sure what this code really achieved since it doesn't even - check for format type... */ - ushort *row = (ushort *) dst_map; - ushort val0 = UBYTE_TO_USHORT((uc.ui >> 0) & 0xff); - ushort val1 = UBYTE_TO_USHORT((uc.ui >> 8) & 0xff); - ushort val2 = UBYTE_TO_USHORT((uc.ui >> 16) & 0xff); - ushort val3 = UBYTE_TO_USHORT((uc.ui >> 24) & 0xff); - unsigned i, j; - val0 = (val0 << 8) | val0; - val1 = (val1 << 8) | val1; - val2 = (val2 << 8) | val2; - val3 = (val3 << 8) | val3; - for (i = 0; i < height; i++) { - for (j = 0; j < width; j++) { - row[j*4+0] = val0; - row[j*4+1] = val1; - row[j*4+2] = val2; - row[j*4+3] = val3; - } - row += dst_trans->stride/2; - } - } - break; - default: - assert(0); - break; - } + util_pack_color(rgba, dst->texture->format, &uc); + util_fill_rect(dst_map, dst->texture->format, + dst_trans->stride, + 0, 0, width, height, &uc); } pipe->transfer_unmap(pipe, dst_trans); diff --git a/src/gallium/auxiliary/util/u_surfaces.c b/src/gallium/auxiliary/util/u_surfaces.c index 7733ad24d0..404e121995 100644 --- a/src/gallium/auxiliary/util/u_surfaces.c +++ b/src/gallium/auxiliary/util/u_surfaces.c @@ -1,3 +1,29 @@ +/************************************************************************** + * + * Copyright 2010 Luca Barbieri + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + #include "u_surfaces.h" #include "util/u_hash_table.h" #include "util/u_inlines.h" diff --git a/src/gallium/auxiliary/util/u_surfaces.h b/src/gallium/auxiliary/util/u_surfaces.h index af978c7057..17d8a5d3a5 100644 --- a/src/gallium/auxiliary/util/u_surfaces.h +++ b/src/gallium/auxiliary/util/u_surfaces.h @@ -1,3 +1,29 @@ +/************************************************************************** + * + * Copyright 2010 Luca Barbieri + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + #ifndef U_SURFACES_H_ #define U_SURFACES_H_ @@ -22,7 +48,7 @@ struct pipe_surface *util_surfaces_do_get(struct util_surfaces *us, unsigned sur static INLINE struct pipe_surface * util_surfaces_get(struct util_surfaces *us, unsigned surface_struct_size, struct pipe_screen *pscreen, struct pipe_resource *pt, unsigned face, unsigned level, unsigned zslice, unsigned flags) { - if(likely(pt->target == PIPE_TEXTURE_2D && us->u.array)) + if(likely((pt->target == PIPE_TEXTURE_2D || pt->target == PIPE_TEXTURE_RECT) && us->u.array)) { struct pipe_surface *ps = us->u.array[level]; if(ps) @@ -52,7 +78,7 @@ void util_surfaces_do_detach(struct util_surfaces *us, struct pipe_surface *ps); static INLINE void util_surfaces_detach(struct util_surfaces *us, struct pipe_surface *ps) { - if(likely(ps->texture->target == PIPE_TEXTURE_2D)) + if(likely(ps->texture->target == PIPE_TEXTURE_2D || ps->texture->target == PIPE_TEXTURE_RECT)) { us->u.array[ps->level] = 0; return; diff --git a/src/gallium/auxiliary/util/u_tile.h b/src/gallium/auxiliary/util/u_tile.h index 986eee0743..558351d0ce 100644 --- a/src/gallium/auxiliary/util/u_tile.h +++ b/src/gallium/auxiliary/util/u_tile.h @@ -29,7 +29,10 @@ #define P_TILE_H #include "pipe/p_compiler.h" +#include "pipe/p_format.h" +#include "pipe/p_state.h" +struct pipe_context; struct pipe_transfer; /** diff --git a/src/gallium/auxiliary/util/u_transfer.h b/src/gallium/auxiliary/util/u_transfer.h index eb07945d15..e3a38730f2 100644 --- a/src/gallium/auxiliary/util/u_transfer.h +++ b/src/gallium/auxiliary/util/u_transfer.h @@ -8,6 +8,7 @@ #include "pipe/p_state.h" struct pipe_context; +struct winsys_handle; boolean u_default_resource_get_handle(struct pipe_screen *screen, struct pipe_resource *resource, diff --git a/src/gallium/auxiliary/util/u_upload_mgr.h b/src/gallium/auxiliary/util/u_upload_mgr.h index a124924fc8..de016df02e 100644 --- a/src/gallium/auxiliary/util/u_upload_mgr.h +++ b/src/gallium/auxiliary/util/u_upload_mgr.h @@ -32,11 +32,8 @@ #ifndef U_UPLOAD_MGR_H #define U_UPLOAD_MGR_H -#include "pipe/p_defines.h" - -struct pipe_screen; +struct pipe_context; struct pipe_resource; -struct u_upload_mgr; struct u_upload_mgr *u_upload_create( struct pipe_context *pipe, |