diff options
Diffstat (limited to 'src/gallium/auxiliary')
59 files changed, 3346 insertions, 2242 deletions
diff --git a/src/gallium/auxiliary/Makefile b/src/gallium/auxiliary/Makefile index 843b72bc38..eb2a40cbaa 100644 --- a/src/gallium/auxiliary/Makefile +++ b/src/gallium/auxiliary/Makefile @@ -26,7 +26,6 @@ C_SOURCES = \ draw/draw_pipe_wide_line.c \ draw/draw_pipe_wide_point.c \ draw/draw_pt.c \ - draw/draw_pt_elts.c \ draw/draw_pt_emit.c \ draw/draw_pt_fetch.c \ draw/draw_pt_fetch_emit.c \ @@ -35,8 +34,7 @@ C_SOURCES = \ draw/draw_pt_post_vs.c \ draw/draw_pt_so_emit.c \ draw/draw_pt_util.c \ - draw/draw_pt_varray.c \ - draw/draw_pt_vcache.c \ + draw/draw_pt_vsplit.c \ draw/draw_vertex.c \ draw/draw_vs.c \ draw/draw_vs_varient.c \ @@ -131,6 +129,7 @@ C_SOURCES = \ util/u_sampler.c \ util/u_simple_shaders.c \ util/u_snprintf.c \ + util/u_staging.c \ util/u_surface.c \ util/u_surfaces.c \ util/u_texture.c \ diff --git a/src/gallium/auxiliary/SConscript b/src/gallium/auxiliary/SConscript index 1f09198721..30e5d02c9b 100644 --- a/src/gallium/auxiliary/SConscript +++ b/src/gallium/auxiliary/SConscript @@ -71,7 +71,6 @@ source = [ 'draw/draw_pipe_wide_line.c', 'draw/draw_pipe_wide_point.c', 'draw/draw_pt.c', - 'draw/draw_pt_elts.c', 'draw/draw_pt_emit.c', 'draw/draw_pt_fetch.c', 'draw/draw_pt_fetch_emit.c', @@ -80,8 +79,7 @@ source = [ 'draw/draw_pt_post_vs.c', 'draw/draw_pt_so_emit.c', 'draw/draw_pt_util.c', - 'draw/draw_pt_varray.c', - 'draw/draw_pt_vcache.c', + 'draw/draw_pt_vsplit.c', 'draw/draw_vertex.c', 'draw/draw_vs.c', 'draw/draw_vs_aos.c', @@ -180,6 +178,7 @@ source = [ 'util/u_sampler.c', 'util/u_simple_shaders.c', 'util/u_snprintf.c', + 'util/u_staging.c', 'util/u_surface.c', 'util/u_surfaces.c', 'util/u_texture.c', diff --git a/src/gallium/auxiliary/draw/draw_context.c b/src/gallium/auxiliary/draw/draw_context.c index 995b675b9a..d118a8db52 100644 --- a/src/gallium/auxiliary/draw/draw_context.c +++ b/src/gallium/auxiliary/draw/draw_context.c @@ -34,6 +34,7 @@ #include "pipe/p_context.h" #include "util/u_memory.h" #include "util/u_math.h" +#include 
"util/u_cpu_detect.h" #include "draw_context.h" #include "draw_vs.h" #include "draw_gs.h" @@ -41,6 +42,25 @@ #if HAVE_LLVM #include "gallivm/lp_bld_init.h" #include "draw_llvm.h" + +static boolean +draw_get_option_use_llvm(void) +{ + static boolean first = TRUE; + static boolean value; + if (first) { + first = FALSE; + value = debug_get_bool_option("DRAW_USE_LLVM", TRUE); + +#ifdef PIPE_ARCH_X86 + util_cpu_detect(); + /* require SSE2 due to LLVM PR6960. */ + if (!util_cpu_caps.has_sse2) + value = FALSE; +#endif + } + return value; +} #endif struct draw_context *draw_create( struct pipe_context *pipe ) @@ -50,10 +70,13 @@ struct draw_context *draw_create( struct pipe_context *pipe ) goto fail; #if HAVE_LLVM - lp_build_init(); - assert(lp_build_engine); - draw->engine = lp_build_engine; - draw->llvm = draw_llvm_create(draw); + if(draw_get_option_use_llvm()) + { + lp_build_init(); + assert(lp_build_engine); + draw->engine = lp_build_engine; + draw->llvm = draw_llvm_create(draw); + } #endif if (!draw_init(draw)) @@ -135,7 +158,8 @@ void draw_destroy( struct draw_context *draw ) draw_vs_destroy( draw ); draw_gs_destroy( draw ); #ifdef HAVE_LLVM - draw_llvm_destroy( draw->llvm ); + if(draw->llvm) + draw_llvm_destroy( draw->llvm ); #endif FREE( draw ); @@ -659,7 +683,8 @@ draw_set_mapped_texture(struct draw_context *draw, const void *data[DRAW_MAX_TEXTURE_LEVELS]) { #ifdef HAVE_LLVM - draw_llvm_set_mapped_texture(draw, + if(draw->llvm) + draw_llvm_set_mapped_texture(draw, sampler_idx, width, height, depth, last_level, row_stride, img_stride, data); diff --git a/src/gallium/auxiliary/draw/draw_decompose_tmp.h b/src/gallium/auxiliary/draw/draw_decompose_tmp.h index a52d2b5058..a142563af9 100644 --- a/src/gallium/auxiliary/draw/draw_decompose_tmp.h +++ b/src/gallium/auxiliary/draw/draw_decompose_tmp.h @@ -54,10 +54,10 @@ FUNC(FUNC_VARS) FUNC_ENTER; - /* prim, count, and last_vertex_last should have been defined */ + /* prim, prim_flags, count, and last_vertex_last should 
have been defined */ if (0) { - debug_printf("%s: prim 0x%x, count %d, last_vertex_last %d\n", - __FUNCTION__, prim, count, last_vertex_last); + debug_printf("%s: prim 0x%x, prim_flags 0x%x, count %d, last_vertex_last %d\n", + __FUNCTION__, prim, prim_flags, count, last_vertex_last); } switch (prim) { @@ -80,7 +80,7 @@ FUNC(FUNC_VARS) case PIPE_PRIM_LINE_LOOP: case PIPE_PRIM_LINE_STRIP: if (count >= 2) { - flags = DRAW_PIPE_RESET_STIPPLE; + flags = (prim_flags & DRAW_SPLIT_BEFORE) ? 0 : DRAW_PIPE_RESET_STIPPLE; idx[1] = GET_ELT(0); idx[2] = idx[1]; @@ -90,7 +90,7 @@ FUNC(FUNC_VARS) LINE(flags, idx[0], idx[1]); } /* close the loop */ - if (prim == PIPE_PRIM_LINE_LOOP) + if (prim == PIPE_PRIM_LINE_LOOP && !prim_flags) LINE(flags, idx[1], idx[2]); } break; @@ -255,17 +255,23 @@ FUNC(FUNC_VARS) if (last_vertex_last) { flags = (DRAW_PIPE_RESET_STIPPLE | - DRAW_PIPE_EDGE_FLAG_2 | DRAW_PIPE_EDGE_FLAG_0); + if (!(prim_flags & DRAW_SPLIT_BEFORE)) + flags |= DRAW_PIPE_EDGE_FLAG_2; + edge_next = DRAW_PIPE_EDGE_FLAG_0; - edge_finish = DRAW_PIPE_EDGE_FLAG_1; + edge_finish = + (prim_flags & DRAW_SPLIT_AFTER) ? 0 : DRAW_PIPE_EDGE_FLAG_1; } else { flags = (DRAW_PIPE_RESET_STIPPLE | - DRAW_PIPE_EDGE_FLAG_0 | DRAW_PIPE_EDGE_FLAG_1); + if (!(prim_flags & DRAW_SPLIT_BEFORE)) + flags |= DRAW_PIPE_EDGE_FLAG_0; + edge_next = DRAW_PIPE_EDGE_FLAG_1; - edge_finish = DRAW_PIPE_EDGE_FLAG_2; + edge_finish = + (prim_flags & DRAW_SPLIT_AFTER) ? 0 : DRAW_PIPE_EDGE_FLAG_2; } idx[0] = GET_ELT(0); @@ -300,7 +306,7 @@ FUNC(FUNC_VARS) case PIPE_PRIM_LINE_STRIP_ADJACENCY: if (count >= 4) { - flags = DRAW_PIPE_RESET_STIPPLE; + flags = (prim_flags & DRAW_SPLIT_BEFORE) ? 
0 : DRAW_PIPE_RESET_STIPPLE; idx[1] = GET_ELT(0); idx[2] = GET_ELT(1); idx[3] = GET_ELT(2); diff --git a/src/gallium/auxiliary/draw/draw_gs.c b/src/gallium/auxiliary/draw/draw_gs.c index 4a1013e79a..50a03ac95a 100644 --- a/src/gallium/auxiliary/draw/draw_gs.c +++ b/src/gallium/auxiliary/draw/draw_gs.c @@ -380,7 +380,7 @@ static void gs_tri_adj(struct draw_geometry_shader *shader, #define FUNC gs_run_elts #define LOCAL_VARS const ushort *elts = input_prims->elts; -#define GET_ELT(idx) (elts[idx] & ~DRAW_PIPE_FLAG_MASK) +#define GET_ELT(idx) (elts[idx]) #include "draw_gs_tmp.h" @@ -457,6 +457,7 @@ int draw_geometry_shader_run(struct draw_geometry_shader *shader, output_prims->start = 0; output_prims->count = shader->emitted_vertices; output_prims->prim = shader->output_primitive; + output_prims->flags = 0x0; output_prims->primitive_lengths = shader->primitive_lengths; output_prims->primitive_count = shader->emitted_primitives; output_verts->count = shader->emitted_vertices; diff --git a/src/gallium/auxiliary/draw/draw_gs_tmp.h b/src/gallium/auxiliary/draw/draw_gs_tmp.h index 4a17af0dea..de7b02655a 100644 --- a/src/gallium/auxiliary/draw/draw_gs_tmp.h +++ b/src/gallium/auxiliary/draw/draw_gs_tmp.h @@ -6,12 +6,10 @@ #define FUNC_ENTER \ /* declare more local vars */ \ - struct draw_context *draw = gs->draw; \ const unsigned prim = input_prims->prim; \ + const unsigned prim_flags = input_prims->flags; \ const unsigned count = input_prims->count; \ - const boolean last_vertex_last = \ - !(draw->rasterizer->flatshade && \ - draw->rasterizer->flatshade_first); \ + const boolean last_vertex_last = TRUE; \ do { \ debug_assert(input_prims->primitive_count == 1); \ switch (prim) { \ diff --git a/src/gallium/auxiliary/draw/draw_llvm.c b/src/gallium/auxiliary/draw/draw_llvm.c index de99b00a81..58d3e345e5 100644 --- a/src/gallium/auxiliary/draw/draw_llvm.c +++ b/src/gallium/auxiliary/draw/draw_llvm.c @@ -210,13 +210,6 @@ draw_llvm_create(struct draw_context *draw) { struct 
draw_llvm *llvm; -#ifdef PIPE_ARCH_X86 - util_cpu_detect(); - /* require SSE2 due to LLVM PR6960. */ - if (!util_cpu_caps.has_sse2) - return NULL; -#endif - llvm = CALLOC_STRUCT( draw_llvm ); if (!llvm) return NULL; @@ -683,7 +676,6 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant) unsigned i, j; struct lp_build_context bld; struct lp_build_loop_state lp_loop; - struct lp_type vs_type = lp_type_float_vec(32); const int max_vertices = 4; LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][NUM_CHANNELS]; void *code; @@ -732,7 +724,7 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant) builder = LLVMCreateBuilder(); LLVMPositionBuilderAtEnd(builder, block); - lp_build_context_init(&bld, builder, vs_type); + lp_build_context_init(&bld, builder, lp_type_int(32)); end = lp_build_add(&bld, start, count); @@ -845,9 +837,7 @@ draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *varian struct draw_context *draw = llvm->draw; unsigned i, j; struct lp_build_context bld; - struct lp_build_context bld_int; struct lp_build_loop_state lp_loop; - struct lp_type vs_type = lp_type_float_vec(32); const int max_vertices = 4; LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][NUM_CHANNELS]; LLVMValueRef fetch_max; @@ -899,8 +889,7 @@ draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *varian builder = LLVMCreateBuilder(); LLVMPositionBuilderAtEnd(builder, block); - lp_build_context_init(&bld, builder, vs_type); - lp_build_context_init(&bld_int, builder, lp_type_int(32)); + lp_build_context_init(&bld, builder, lp_type_int(32)); step = LLVMConstInt(LLVMInt32Type(), max_vertices, 0); @@ -935,7 +924,7 @@ draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *varian /* make sure we're not out of bounds which can happen * if fetch_count % 4 != 0, because on the last iteration * a few of the 4 vertex fetches will be out of bounds */ - true_index = lp_build_min(&bld_int, true_index, 
fetch_max); + true_index = lp_build_min(&bld, true_index, fetch_max); fetch_ptr = LLVMBuildGEP(builder, fetch_elts, &true_index, 1, ""); diff --git a/src/gallium/auxiliary/draw/draw_pipe.c b/src/gallium/auxiliary/draw/draw_pipe.c index 070ac803c8..b75262a357 100644 --- a/src/gallium/auxiliary/draw/draw_pipe.c +++ b/src/gallium/auxiliary/draw/draw_pipe.c @@ -169,27 +169,29 @@ static void do_triangle( struct draw_context *draw, /* * Set up macros for draw_pt_decompose.h template code. * This code uses vertex indexes / elements. - * - * Flags are needed by the stipple and unfilled stages. When the two stages - * are active, vcache_run_extras is called and the flags are stored in the - * higher bits of i0. Otherwise, flags do not matter. */ -#define TRIANGLE(flags,i0,i1,i2) \ - do_triangle( draw, \ - i0, /* flags */ \ - verts + stride * (i0 & ~DRAW_PIPE_FLAG_MASK), \ - verts + stride * (i1), \ - verts + stride * (i2) ) - -#define LINE(flags,i0,i1) \ - do_line( draw, \ - i0, /* flags */ \ - verts + stride * (i0 & ~DRAW_PIPE_FLAG_MASK), \ - verts + stride * (i1) ) +#define TRIANGLE(flags,i0,i1,i2) \ + do { \ + do_triangle( draw, \ + flags, \ + verts + stride * (i0), \ + verts + stride * (i1), \ + verts + stride * (i2) ); \ + } while (0) + +#define LINE(flags,i0,i1) \ + do { \ + do_line( draw, \ + flags, \ + verts + stride * (i0), \ + verts + stride * (i1) ); \ + } while (0) #define POINT(i0) \ - do_point( draw, verts + stride * (i0) ) + do { \ + do_point( draw, verts + stride * (i0) ); \ + } while (0) #define GET_ELT(idx) (elts[idx]) @@ -197,6 +199,7 @@ static void do_triangle( struct draw_context *draw, #define FUNC_VARS \ struct draw_context *draw, \ unsigned prim, \ + unsigned prim_flags, \ struct vertex_header *vertices, \ unsigned stride, \ const ushort *elts, \ @@ -240,8 +243,7 @@ void draw_pipeline_run( struct draw_context *draw, unsigned max_index = 0x0, i; /* find the largest element index */ for (i = 0; i < count; i++) { - unsigned int index = 
(prim_info->elts[start + i] - & ~DRAW_PIPE_FLAG_MASK); + unsigned int index = prim_info->elts[start + i]; if (index > max_index) max_index = index; } @@ -251,6 +253,7 @@ void draw_pipeline_run( struct draw_context *draw, pipe_run_elts(draw, prim_info->prim, + prim_info->flags, vert_info->verts, vert_info->stride, prim_info->elts + start, @@ -288,6 +291,7 @@ void draw_pipeline_run( struct draw_context *draw, #define FUNC_VARS \ struct draw_context *draw, \ unsigned prim, \ + unsigned prim_flags, \ struct vertex_header *vertices, \ unsigned stride, \ unsigned count @@ -320,6 +324,7 @@ void draw_pipeline_run_linear( struct draw_context *draw, pipe_run_linear(draw, prim_info->prim, + prim_info->flags, (struct vertex_header*)verts, vert_info->stride, count); diff --git a/src/gallium/auxiliary/draw/draw_pipe_vbuf.c b/src/gallium/auxiliary/draw/draw_pipe_vbuf.c index 3c93c9014a..58c5858734 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_vbuf.c +++ b/src/gallium/auxiliary/draw/draw_pipe_vbuf.c @@ -353,9 +353,6 @@ vbuf_alloc_vertices( struct vbuf_stage *vbuf ) /* Allocate a new vertex buffer */ vbuf->max_vertices = vbuf->render->max_vertex_buffer_bytes / vbuf->vertex_size; - /* even number */ - vbuf->max_vertices = vbuf->max_vertices & ~1; - if(vbuf->max_vertices >= UNDEFINED_VERTEX_ID) vbuf->max_vertices = UNDEFINED_VERTEX_ID - 1; diff --git a/src/gallium/auxiliary/draw/draw_private.h b/src/gallium/auxiliary/draw/draw_private.h index 397d4bf653..854c45f060 100644 --- a/src/gallium/auxiliary/draw/draw_private.h +++ b/src/gallium/auxiliary/draw/draw_private.h @@ -140,8 +140,7 @@ struct draw_context } middle; struct { - struct draw_pt_front_end *vcache; - struct draw_pt_front_end *varray; + struct draw_pt_front_end *vsplit; } front; struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS]; @@ -296,6 +295,10 @@ struct draw_vertex_info { unsigned count; }; +/* these flags are set if the primitive is a segment of a larger one */ +#define DRAW_SPLIT_BEFORE 0x1 +#define 
DRAW_SPLIT_AFTER 0x2 + struct draw_prim_info { boolean linear; unsigned start; @@ -304,6 +307,7 @@ struct draw_prim_info { unsigned count; unsigned prim; + unsigned flags; unsigned *primitive_lengths; unsigned primitive_count; }; @@ -369,21 +373,15 @@ void draw_pipeline_destroy( struct draw_context *draw ); -/* We use the top few bits in the elts[] parameter to convey a little - * API information. This limits the number of vertices we can address - * to only 4096 -- if that becomes a problem, we can switch to 32-bit - * draw indices. - * - * These flags expected at first vertex of lines & triangles when - * unfilled and/or line stipple modes are operational. +/* + * These flags are used by the pipeline when unfilled and/or line stipple modes + * are operational. */ -#define DRAW_PIPE_MAX_VERTICES (0x1<<12) -#define DRAW_PIPE_EDGE_FLAG_0 (0x1<<12) -#define DRAW_PIPE_EDGE_FLAG_1 (0x2<<12) -#define DRAW_PIPE_EDGE_FLAG_2 (0x4<<12) -#define DRAW_PIPE_EDGE_FLAG_ALL (0x7<<12) -#define DRAW_PIPE_RESET_STIPPLE (0x8<<12) -#define DRAW_PIPE_FLAG_MASK (0xf<<12) +#define DRAW_PIPE_EDGE_FLAG_0 0x1 +#define DRAW_PIPE_EDGE_FLAG_1 0x2 +#define DRAW_PIPE_EDGE_FLAG_2 0x4 +#define DRAW_PIPE_EDGE_FLAG_ALL 0x7 +#define DRAW_PIPE_RESET_STIPPLE 0x8 void draw_pipeline_run( struct draw_context *draw, const struct draw_vertex_info *vert, diff --git a/src/gallium/auxiliary/draw/draw_pt.c b/src/gallium/auxiliary/draw/draw_pt.c index 248927505d..feacd8258b 100644 --- a/src/gallium/auxiliary/draw/draw_pt.c +++ b/src/gallium/auxiliary/draw/draw_pt.c @@ -43,21 +43,9 @@ DEBUG_GET_ONCE_BOOL_OPTION(draw_fse, "DRAW_FSE", FALSE) DEBUG_GET_ONCE_BOOL_OPTION(draw_no_fse, "DRAW_NO_FSE", FALSE) -#ifdef HAVE_LLVM -DEBUG_GET_ONCE_BOOL_OPTION(draw_use_llvm, "DRAW_USE_LLVM", TRUE) -#endif - -static unsigned trim( unsigned count, unsigned first, unsigned incr ) -{ - if (count < first) - return 0; - return count - (count - first) % incr; -} - - /* Overall we split things into: - * - frontend -- prepare 
fetch_elts, draw_elts - eg vcache + * - frontend -- prepare fetch_elts, draw_elts - eg vsplit * - middle -- fetch, shade, cliptest, viewport * - pipeline -- the prim pipeline: clipping, wide lines, etc * - backend -- the vbuf_render provided by the driver. @@ -77,7 +65,7 @@ draw_pt_arrays(struct draw_context *draw, { unsigned first, incr; draw_pt_split_prim(prim, &first, &incr); - count = trim(count, first, incr); + count = draw_pt_trim_count(count, first, incr); if (count < first) return TRUE; } @@ -115,22 +103,11 @@ draw_pt_arrays(struct draw_context *draw, middle = draw->pt.middle.general; } - - /* Pick the right frontend - */ - if (draw->pt.user.elts || (opt & PT_PIPELINE)) { - frontend = draw->pt.front.vcache; - } else { - frontend = draw->pt.front.varray; - } + frontend = draw->pt.front.vsplit; frontend->prepare( frontend, prim, middle, opt ); - frontend->run(frontend, - draw_pt_elt_func(draw), - draw_pt_elt_ptr(draw, start), - draw->pt.user.eltBias, - count); + frontend->run(frontend, start, count); frontend->finish( frontend ); @@ -143,12 +120,8 @@ boolean draw_pt_init( struct draw_context *draw ) draw->pt.test_fse = debug_get_option_draw_fse(); draw->pt.no_fse = debug_get_option_draw_no_fse(); - draw->pt.front.vcache = draw_pt_vcache( draw ); - if (!draw->pt.front.vcache) - return FALSE; - - draw->pt.front.varray = draw_pt_varray(draw); - if (!draw->pt.front.varray) + draw->pt.front.vsplit = draw_pt_vsplit(draw); + if (!draw->pt.front.vsplit) return FALSE; draw->pt.middle.fetch_emit = draw_pt_fetch_emit( draw ); @@ -164,7 +137,7 @@ boolean draw_pt_init( struct draw_context *draw ) return FALSE; #if HAVE_LLVM - if (debug_get_option_draw_use_llvm()) + if (draw->llvm) draw->pt.middle.llvm = draw_pt_fetch_pipeline_or_emit_llvm( draw ); #endif @@ -194,14 +167,9 @@ void draw_pt_destroy( struct draw_context *draw ) draw->pt.middle.fetch_shade_emit = NULL; } - if (draw->pt.front.vcache) { - draw->pt.front.vcache->destroy( draw->pt.front.vcache ); - 
draw->pt.front.vcache = NULL; - } - - if (draw->pt.front.varray) { - draw->pt.front.varray->destroy( draw->pt.front.varray ); - draw->pt.front.varray = NULL; + if (draw->pt.front.vsplit) { + draw->pt.front.vsplit->destroy( draw->pt.front.vsplit ); + draw->pt.front.vsplit = NULL; } } diff --git a/src/gallium/auxiliary/draw/draw_pt.h b/src/gallium/auxiliary/draw/draw_pt.h index 44356fba4c..0db5666529 100644 --- a/src/gallium/auxiliary/draw/draw_pt.h +++ b/src/gallium/auxiliary/draw/draw_pt.h @@ -35,8 +35,6 @@ #include "pipe/p_compiler.h" -typedef unsigned (*pt_elt_func)( const void *elts, unsigned idx ); - struct draw_pt_middle_end; struct draw_context; struct draw_prim_info; @@ -52,13 +50,18 @@ struct draw_vertex_info; /* The "front end" - prepare sets of fetch, draw elements for the * middle end. * - * Currenly one version of this: - * - vcache - catchall implementation, decomposes to TRI/LINE/POINT prims - * Later: - * - varray, varray_split - * - velement, velement_split + * The fetch elements are indices to the vertices. The draw elements are + * indices to the fetched vertices. When both arrays of elements are + * linear, middle->run_linear is called; when only the fetch elements are + * linear, middle->run_linear_elts is called; otherwise, middle->run is + * called. + * + * When the number of the draw elements exceeds max_vertex of the middle end, + * the draw elements (as well as the fetch elements) are split and the + * middle end is called multiple times. * - * Currenly only using the vcache version. 
+ * Currently there is: + * - vsplit - catchall implementation, splits big prims */ struct draw_pt_front_end { void (*prepare)( struct draw_pt_front_end *, @@ -67,9 +70,7 @@ struct draw_pt_front_end { unsigned opt ); void (*run)( struct draw_pt_front_end *, - pt_elt_func elt_func, - const void *elt_ptr, - int elt_bias, + unsigned start, unsigned count ); void (*finish)( struct draw_pt_front_end * ); @@ -80,6 +81,8 @@ struct draw_pt_front_end { /* The "middle end" - prepares actual hardware vertices for the * hardware backend. * + * prim_flags is as defined by pipe_draw_info::flags. + * * Currently two versions of this: * - fetch, vertex shade, cliptest, prim-pipeline * - fetch, emit (ie passthrough) @@ -94,11 +97,13 @@ struct draw_pt_middle_end { const unsigned *fetch_elts, unsigned fetch_count, const ushort *draw_elts, - unsigned draw_count ); + unsigned draw_count, + unsigned prim_flags ); void (*run_linear)(struct draw_pt_middle_end *, unsigned start, - unsigned count); + unsigned count, + unsigned prim_flags ); /* Transform all vertices in a linear range and then draw them with * the supplied element list. May fail and return FALSE. @@ -107,7 +112,8 @@ struct draw_pt_middle_end { unsigned fetch_start, unsigned fetch_count, const ushort *draw_elts, - unsigned draw_count ); + unsigned draw_count, + unsigned prim_flags ); int (*get_max_vertex_count)( struct draw_pt_middle_end * ); @@ -122,19 +128,11 @@ struct vbuf_render; struct vertex_header; -/* Helper functions. - */ -pt_elt_func draw_pt_elt_func( struct draw_context *draw ); -const void *draw_pt_elt_ptr( struct draw_context *draw, - unsigned start ); - /* Frontends: * - * Currently only the general-purpose vcache implementation, could add - * a special case for tiny vertex buffers. + * Currently only the general-purpose vsplit implementation. 
*/ -struct draw_pt_front_end *draw_pt_vcache( struct draw_context *draw ); -struct draw_pt_front_end *draw_pt_varray(struct draw_context *draw); +struct draw_pt_front_end *draw_pt_vsplit(struct draw_context *draw); /* Middle-ends: @@ -237,6 +235,7 @@ void draw_pt_post_vs_destroy( struct pt_post_vs *pvs ); * Utils: */ void draw_pt_split_prim(unsigned prim, unsigned *first, unsigned *incr); +unsigned draw_pt_trim_count(unsigned count, unsigned first, unsigned incr); #endif diff --git a/src/gallium/auxiliary/draw/draw_pt_elts.c b/src/gallium/auxiliary/draw/draw_pt_elts.c deleted file mode 100644 index 88f4d9f495..0000000000 --- a/src/gallium/auxiliary/draw/draw_pt_elts.c +++ /dev/null @@ -1,89 +0,0 @@ -/************************************************************************** - * - * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ - - /* - * Authors: - * Keith Whitwell <keith@tungstengraphics.com> - */ - -#include "draw/draw_pt.h" -#include "draw/draw_private.h" - -/* Neat get_elt func that also works for varrays drawing by encoding - * the start value into a pointer. - */ - -static unsigned elt_uint( const void *elts, unsigned idx ) -{ - return *(((const uint *)elts) + idx); -} - -static unsigned elt_ushort( const void *elts, unsigned idx ) -{ - return *(((const ushort *)elts) + idx); -} - -static unsigned elt_ubyte( const void *elts, unsigned idx ) -{ - return *(((const ubyte *)elts) + idx); -} - -static unsigned elt_vert( const void *elts, unsigned idx ) -{ - /* unsigned index is packed in the pointer */ - return (unsigned)(uintptr_t)elts + idx; -} - -pt_elt_func draw_pt_elt_func( struct draw_context *draw ) -{ - switch (draw->pt.user.eltSize) { - case 0: return &elt_vert; - case 1: return &elt_ubyte; - case 2: return &elt_ushort; - case 4: return &elt_uint; - default: return NULL; - } -} - -const void *draw_pt_elt_ptr( struct draw_context *draw, - unsigned start ) -{ - const char *elts = draw->pt.user.elts; - - switch (draw->pt.user.eltSize) { - case 0: - return (const void *)(((const ubyte *)NULL) + start); - case 1: - return (const void *)(((const ubyte *)elts) + start); - case 2: - return (const void *)(((const ushort *)elts) + start); - case 4: - return (const void *)(((const uint *)elts) + start); - default: - return NULL; - } -} diff --git a/src/gallium/auxiliary/draw/draw_pt_emit.c b/src/gallium/auxiliary/draw/draw_pt_emit.c index 5568fbb9f8..89d96c4235 100644 --- a/src/gallium/auxiliary/draw/draw_pt_emit.c +++ 
b/src/gallium/auxiliary/draw/draw_pt_emit.c @@ -120,9 +120,6 @@ void draw_pt_emit_prepare( struct pt_emit *emit, *max_vertices = (draw->render->max_vertex_buffer_bytes / (vinfo->size * 4)); - - /* even number */ - *max_vertices = *max_vertices & ~1; } diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c b/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c index 5c8af17c8e..80a89428b6 100644 --- a/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c +++ b/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c @@ -191,15 +191,6 @@ static void fetch_emit_prepare( struct draw_pt_middle_end *middle, *max_vertices = (draw->render->max_vertex_buffer_bytes / (vinfo->size * 4)); - - /* Return an even number of verts. - * This prevents "parity" errors when splitting long triangle strips which - * can lead to front/back culling mix-ups. - * Every other triangle in a strip has an alternate front/back orientation - * so splitting at an odd position can cause the orientation of subsequent - * triangles to get reversed. 
- */ - *max_vertices = *max_vertices & ~1; } @@ -210,7 +201,8 @@ static void fetch_emit_run( struct draw_pt_middle_end *middle, const unsigned *fetch_elts, unsigned fetch_count, const ushort *draw_elts, - unsigned draw_count ) + unsigned draw_count, + unsigned prim_flags ) { struct fetch_emit_middle_end *feme = (struct fetch_emit_middle_end *)middle; struct draw_context *draw = feme->draw; @@ -273,7 +265,8 @@ static void fetch_emit_run( struct draw_pt_middle_end *middle, static void fetch_emit_run_linear( struct draw_pt_middle_end *middle, unsigned start, - unsigned count ) + unsigned count, + unsigned prim_flags ) { struct fetch_emit_middle_end *feme = (struct fetch_emit_middle_end *)middle; struct draw_context *draw = feme->draw; @@ -334,7 +327,8 @@ static boolean fetch_emit_run_linear_elts( struct draw_pt_middle_end *middle, unsigned start, unsigned count, const ushort *draw_elts, - unsigned draw_count ) + unsigned draw_count, + unsigned prim_flags ) { struct fetch_emit_middle_end *feme = (struct fetch_emit_middle_end *)middle; struct draw_context *draw = feme->draw; diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c index b8270280b6..a31d3feb16 100644 --- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c +++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c @@ -175,15 +175,6 @@ static void fse_prepare( struct draw_pt_middle_end *middle, *max_vertices = (draw->render->max_vertex_buffer_bytes / (vinfo->size * 4)); - /* Return an even number of verts. - * This prevents "parity" errors when splitting long triangle strips which - * can lead to front/back culling mix-ups. - * Every other triangle in a strip has an alternate front/back orientation - * so splitting at an odd position can cause the orientation of subsequent - * triangles to get reversed. 
- */ - *max_vertices = *max_vertices & ~1; - /* Probably need to do this somewhere (or fix exec shader not to * need it): */ @@ -197,7 +188,8 @@ static void fse_prepare( struct draw_pt_middle_end *middle, static void fse_run_linear( struct draw_pt_middle_end *middle, unsigned start, - unsigned count ) + unsigned count, + unsigned prim_flags ) { struct fetch_shade_emit *fse = (struct fetch_shade_emit *)middle; struct draw_context *draw = fse->draw; @@ -265,7 +257,8 @@ fse_run(struct draw_pt_middle_end *middle, const unsigned *fetch_elts, unsigned fetch_count, const ushort *draw_elts, - unsigned draw_count ) + unsigned draw_count, + unsigned prim_flags ) { struct fetch_shade_emit *fse = (struct fetch_shade_emit *)middle; struct draw_context *draw = fse->draw; @@ -327,7 +320,8 @@ static boolean fse_run_linear_elts( struct draw_pt_middle_end *middle, unsigned start, unsigned count, const ushort *draw_elts, - unsigned draw_count ) + unsigned draw_count, + unsigned prim_flags ) { struct fetch_shade_emit *fse = (struct fetch_shade_emit *)middle; struct draw_context *draw = fse->draw; diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c index 5b16c3788e..96b40fb363 100644 --- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c +++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c @@ -112,16 +112,13 @@ static void fetch_pipeline_prepare( struct draw_pt_middle_end *middle, gs_out_prim, max_vertices ); - *max_vertices = MAX2( *max_vertices, - DRAW_PIPE_MAX_VERTICES ); + *max_vertices = MAX2( *max_vertices, 4096 ); } else { - *max_vertices = DRAW_PIPE_MAX_VERTICES; + /* limit max fetches by limiting max_vertices */ + *max_vertices = 4096; } - /* return even number */ - *max_vertices = *max_vertices & ~1; - /* No need to prepare the shader. 
*/ vs->prepare(vs, draw); @@ -295,7 +292,8 @@ static void fetch_pipeline_run( struct draw_pt_middle_end *middle, const unsigned *fetch_elts, unsigned fetch_count, const ushort *draw_elts, - unsigned draw_count ) + unsigned draw_count, + unsigned prim_flags ) { struct fetch_pipeline_middle_end *fpme = (struct fetch_pipeline_middle_end *)middle; struct draw_fetch_info fetch_info; @@ -311,6 +309,7 @@ static void fetch_pipeline_run( struct draw_pt_middle_end *middle, prim_info.count = draw_count; prim_info.elts = draw_elts; prim_info.prim = fpme->input_prim; + prim_info.flags = prim_flags; prim_info.primitive_count = 1; prim_info.primitive_lengths = &draw_count; @@ -320,7 +319,8 @@ static void fetch_pipeline_run( struct draw_pt_middle_end *middle, static void fetch_pipeline_linear_run( struct draw_pt_middle_end *middle, unsigned start, - unsigned count) + unsigned count, + unsigned prim_flags) { struct fetch_pipeline_middle_end *fpme = (struct fetch_pipeline_middle_end *)middle; struct draw_fetch_info fetch_info; @@ -336,6 +336,7 @@ static void fetch_pipeline_linear_run( struct draw_pt_middle_end *middle, prim_info.count = count; prim_info.elts = NULL; prim_info.prim = fpme->input_prim; + prim_info.flags = prim_flags; prim_info.primitive_count = 1; prim_info.primitive_lengths = &count; @@ -348,7 +349,8 @@ static boolean fetch_pipeline_linear_run_elts( struct draw_pt_middle_end *middle unsigned start, unsigned count, const ushort *draw_elts, - unsigned draw_count ) + unsigned draw_count, + unsigned prim_flags ) { struct fetch_pipeline_middle_end *fpme = (struct fetch_pipeline_middle_end *)middle; struct draw_fetch_info fetch_info; @@ -364,6 +366,7 @@ static boolean fetch_pipeline_linear_run_elts( struct draw_pt_middle_end *middle prim_info.count = draw_count; prim_info.elts = draw_elts; prim_info.prim = fpme->input_prim; + prim_info.flags = prim_flags; prim_info.primitive_count = 1; prim_info.primitive_lengths = &draw_count; diff --git 
a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c index 4b99bee86a..78b1bf988c 100644 --- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c +++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c @@ -118,16 +118,13 @@ llvm_middle_end_prepare( struct draw_pt_middle_end *middle, out_prim, max_vertices ); - *max_vertices = MAX2( *max_vertices, - DRAW_PIPE_MAX_VERTICES ); + *max_vertices = MAX2( *max_vertices, 4096 ); } else { - *max_vertices = DRAW_PIPE_MAX_VERTICES; + /* limit max fetches by limiting max_vertices */ + *max_vertices = 4096; } - /* return even number */ - *max_vertices = *max_vertices & ~1; - draw_llvm_make_variant_key(fpme->llvm, &key); li = first_elem(&shader->variants); @@ -294,7 +291,8 @@ static void llvm_middle_end_run( struct draw_pt_middle_end *middle, const unsigned *fetch_elts, unsigned fetch_count, const ushort *draw_elts, - unsigned draw_count ) + unsigned draw_count, + unsigned prim_flags ) { struct llvm_middle_end *fpme = (struct llvm_middle_end *)middle; struct draw_fetch_info fetch_info; @@ -310,6 +308,7 @@ static void llvm_middle_end_run( struct draw_pt_middle_end *middle, prim_info.count = draw_count; prim_info.elts = draw_elts; prim_info.prim = fpme->input_prim; + prim_info.flags = prim_flags; prim_info.primitive_count = 1; prim_info.primitive_lengths = &draw_count; @@ -319,7 +318,8 @@ static void llvm_middle_end_run( struct draw_pt_middle_end *middle, static void llvm_middle_end_linear_run( struct draw_pt_middle_end *middle, unsigned start, - unsigned count) + unsigned count, + unsigned prim_flags) { struct llvm_middle_end *fpme = (struct llvm_middle_end *)middle; struct draw_fetch_info fetch_info; @@ -335,6 +335,7 @@ static void llvm_middle_end_linear_run( struct draw_pt_middle_end *middle, prim_info.count = count; prim_info.elts = NULL; prim_info.prim = fpme->input_prim; + prim_info.flags = prim_flags; 
prim_info.primitive_count = 1; prim_info.primitive_lengths = &count; @@ -348,7 +349,8 @@ llvm_middle_end_linear_run_elts( struct draw_pt_middle_end *middle, unsigned start, unsigned count, const ushort *draw_elts, - unsigned draw_count ) + unsigned draw_count, + unsigned prim_flags ) { struct llvm_middle_end *fpme = (struct llvm_middle_end *)middle; struct draw_fetch_info fetch_info; @@ -364,6 +366,7 @@ llvm_middle_end_linear_run_elts( struct draw_pt_middle_end *middle, prim_info.count = draw_count; prim_info.elts = draw_elts; prim_info.prim = fpme->input_prim; + prim_info.flags = prim_flags; prim_info.primitive_count = 1; prim_info.primitive_lengths = &draw_count; diff --git a/src/gallium/auxiliary/draw/draw_pt_so_emit.c b/src/gallium/auxiliary/draw/draw_pt_so_emit.c index f7f4f24d35..c86bdd99a3 100644 --- a/src/gallium/auxiliary/draw/draw_pt_so_emit.c +++ b/src/gallium/auxiliary/draw/draw_pt_so_emit.c @@ -225,7 +225,7 @@ static void so_tri(struct pt_so_emit *so, int i0, int i1, int i2) #define FUNC so_run_elts #define LOCAL_VARS const ushort *elts = input_prims->elts; -#define GET_ELT(idx) (elts[start + (idx)] & ~DRAW_PIPE_FLAG_MASK) +#define GET_ELT(idx) (elts[start + (idx)]) #include "draw_so_emit_tmp.h" diff --git a/src/gallium/auxiliary/draw/draw_pt_util.c b/src/gallium/auxiliary/draw/draw_pt_util.c index 3236d38e6a..513bbbed21 100644 --- a/src/gallium/auxiliary/draw/draw_pt_util.c +++ b/src/gallium/auxiliary/draw/draw_pt_util.c @@ -53,7 +53,7 @@ void draw_pt_split_prim(unsigned prim, unsigned *first, unsigned *incr) break; case PIPE_PRIM_LINES_ADJACENCY: *first = 4; - *incr = 2; + *incr = 4; break; case PIPE_PRIM_LINE_STRIP_ADJACENCY: *first = 4; @@ -65,7 +65,7 @@ void draw_pt_split_prim(unsigned prim, unsigned *first, unsigned *incr) break; case PIPE_PRIM_TRIANGLES_ADJACENCY: *first = 6; - *incr = 3; + *incr = 6; break; case PIPE_PRIM_TRIANGLE_STRIP: case PIPE_PRIM_TRIANGLE_FAN: @@ -75,7 +75,7 @@ void draw_pt_split_prim(unsigned prim, unsigned *first, 
unsigned *incr) break; case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY: *first = 6; - *incr = 1; + *incr = 2; break; case PIPE_PRIM_QUADS: *first = 4; @@ -92,3 +92,10 @@ void draw_pt_split_prim(unsigned prim, unsigned *first, unsigned *incr) break; } } + +unsigned draw_pt_trim_count(unsigned count, unsigned first, unsigned incr) +{ + if (count < first) + return 0; + return count - (count - first) % incr; +} diff --git a/src/gallium/auxiliary/draw/draw_pt_varray.c b/src/gallium/auxiliary/draw/draw_pt_varray.c deleted file mode 100644 index cd7bb7bf25..0000000000 --- a/src/gallium/auxiliary/draw/draw_pt_varray.c +++ /dev/null @@ -1,200 +0,0 @@ -/************************************************************************** - * - * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. - * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- * - **************************************************************************/ - -#include "util/u_math.h" -#include "util/u_memory.h" - -#include "draw/draw_context.h" -#include "draw/draw_private.h" -#include "draw/draw_pt.h" - -#define FETCH_MAX 256 -#define DRAW_MAX (FETCH_MAX+8) - -struct varray_frontend { - struct draw_pt_front_end base; - struct draw_context *draw; - - ushort draw_elts[DRAW_MAX]; - unsigned fetch_elts[FETCH_MAX]; - - unsigned driver_fetch_max; - unsigned fetch_max; - - struct draw_pt_middle_end *middle; - - unsigned input_prim; - unsigned output_prim; -}; - - -static void varray_flush_linear(struct varray_frontend *varray, - unsigned start, unsigned count) -{ - if (count) { - assert(varray->middle->run_linear); - varray->middle->run_linear(varray->middle, start, count); - } -} - -static void varray_line_loop_segment(struct varray_frontend *varray, - unsigned start, - unsigned segment_start, - unsigned segment_count, - boolean end ) -{ - assert(segment_count < varray->fetch_max); - if (segment_count >= 1) { - unsigned nr = 0, i; - - for (i = 0; i < segment_count; i++) - varray->fetch_elts[nr++] = start + segment_start + i; - - if (end) - varray->fetch_elts[nr++] = start; - - assert(nr <= FETCH_MAX); - - varray->middle->run(varray->middle, - varray->fetch_elts, - nr, - varray->draw_elts, /* ie. linear */ - nr); - } -} - - - -static void varray_fan_segment(struct varray_frontend *varray, - unsigned start, - unsigned segment_start, - unsigned segment_count ) -{ - assert(segment_count < varray->fetch_max); - if (segment_count >= 2) { - unsigned nr = 0, i; - - if (segment_start != 0) - varray->fetch_elts[nr++] = start; - - for (i = 0 ; i < segment_count; i++) - varray->fetch_elts[nr++] = start + segment_start + i; - - assert(nr <= FETCH_MAX); - - varray->middle->run(varray->middle, - varray->fetch_elts, - nr, - varray->draw_elts, /* ie. 
linear */ - nr); - } -} - - - - -#define FUNC varray_run -#include "draw_pt_varray_tmp_linear.h" - -static unsigned decompose_prim[PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY + 1] = { - PIPE_PRIM_POINTS, - PIPE_PRIM_LINES, - PIPE_PRIM_LINE_STRIP, /* decomposed LINELOOP */ - PIPE_PRIM_LINE_STRIP, - PIPE_PRIM_TRIANGLES, - PIPE_PRIM_TRIANGLE_STRIP, - PIPE_PRIM_TRIANGLE_FAN, - PIPE_PRIM_QUADS, - PIPE_PRIM_QUAD_STRIP, - PIPE_PRIM_POLYGON, - PIPE_PRIM_LINES_ADJACENCY, - PIPE_PRIM_LINE_STRIP_ADJACENCY, - PIPE_PRIM_TRIANGLES_ADJACENCY, - PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY -}; - - - -static void varray_prepare(struct draw_pt_front_end *frontend, - unsigned in_prim, - struct draw_pt_middle_end *middle, - unsigned opt) -{ - struct varray_frontend *varray = (struct varray_frontend *)frontend; - - varray->base.run = varray_run; - - varray->input_prim = in_prim; - assert(in_prim < Elements(decompose_prim)); - varray->output_prim = decompose_prim[in_prim]; - - varray->middle = middle; - middle->prepare(middle, - varray->output_prim, - opt, &varray->driver_fetch_max ); - - /* check that the max is even */ - assert((varray->driver_fetch_max & 1) == 0); - - varray->fetch_max = MIN2(FETCH_MAX, varray->driver_fetch_max); -} - - - - -static void varray_finish(struct draw_pt_front_end *frontend) -{ - struct varray_frontend *varray = (struct varray_frontend *)frontend; - varray->middle->finish(varray->middle); - varray->middle = NULL; -} - -static void varray_destroy(struct draw_pt_front_end *frontend) -{ - FREE(frontend); -} - - -struct draw_pt_front_end *draw_pt_varray(struct draw_context *draw) -{ - ushort i; - struct varray_frontend *varray = CALLOC_STRUCT(varray_frontend); - if (varray == NULL) - return NULL; - - varray->base.prepare = varray_prepare; - varray->base.run = NULL; - varray->base.finish = varray_finish; - varray->base.destroy = varray_destroy; - varray->draw = draw; - - for (i = 0; i < DRAW_MAX; i++) { - varray->draw_elts[i] = i; - } - - return &varray->base; -} diff --git 
a/src/gallium/auxiliary/draw/draw_pt_varray_tmp.h b/src/gallium/auxiliary/draw/draw_pt_varray_tmp.h deleted file mode 100644 index 7c722457c3..0000000000 --- a/src/gallium/auxiliary/draw/draw_pt_varray_tmp.h +++ /dev/null @@ -1,238 +0,0 @@ - -static void FUNC(struct draw_pt_front_end *frontend, - pt_elt_func get_elt, - const void *elts, - unsigned count) -{ - struct varray_frontend *varray = (struct varray_frontend *)frontend; - struct draw_context *draw = varray->draw; - unsigned start = (unsigned)elts; - - boolean flatfirst = (draw->rasterizer->flatshade && - draw->rasterizer->flatshade_first); - unsigned i, j; - ushort flags; - unsigned first, incr; - - varray->fetch_start = start; - - draw_pt_split_prim(varray->input_prim, &first, &incr); - -#if 0 - debug_printf("%s (%d) %d/%d\n", __FUNCTION__, - varray->input_prim, - start, count); -#endif - - switch (varray->input_prim) { - case PIPE_PRIM_POINTS: - for (j = 0; j + first <= count; j += i) { - unsigned end = MIN2(FETCH_MAX, count - j); - end -= (end % incr); - for (i = 0; i < end; i++) { - POINT(varray, i + 0); - } - i = end; - fetch_init(varray, end); - varray_flush(varray); - } - break; - - case PIPE_PRIM_LINES: - for (j = 0; j + first <= count; j += i) { - unsigned end = MIN2(FETCH_MAX, count - j); - end -= (end % incr); - for (i = 0; i+1 < end; i += 2) { - LINE(varray, DRAW_PIPE_RESET_STIPPLE, - i + 0, i + 1); - } - i = end; - fetch_init(varray, end); - varray_flush(varray); - } - break; - - case PIPE_PRIM_LINE_LOOP: - if (count >= 2) { - flags = DRAW_PIPE_RESET_STIPPLE; - - for (j = 0; j + first <= count; j += i) { - unsigned end = MIN2(FETCH_MAX, count - j); - end -= (end % incr); - for (i = 1; i < end; i++, flags = 0) { - LINE(varray, flags, i - 1, i); - } - LINE(varray, flags, i - 1, 0); - i = end; - fetch_init(varray, end); - varray_flush(varray); - } - } - break; - - case PIPE_PRIM_LINE_STRIP: - flags = DRAW_PIPE_RESET_STIPPLE; - for (j = 0; j + first <= count; j += i) { - unsigned end = 
MIN2(FETCH_MAX, count - j); - end -= (end % incr); - for (i = 1; i < end; i++, flags = 0) { - LINE(varray, flags, i - 1, i); - } - i = end; - fetch_init(varray, end); - varray_flush(varray); - } - break; - - case PIPE_PRIM_TRIANGLES: - for (j = 0; j + first <= count; j += i) { - unsigned end = MIN2(FETCH_MAX, count - j); - end -= (end % incr); - for (i = 0; i+2 < end; i += 3) { - TRIANGLE(varray, DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL, - i + 0, i + 1, i + 2); - } - i = end; - fetch_init(varray, end); - varray_flush(varray); - } - break; - - case PIPE_PRIM_TRIANGLE_STRIP: - if (flatfirst) { - for (j = 0; j + first <= count; j += i) { - unsigned end = MIN2(FETCH_MAX, count - j); - end -= (end % incr); - for (i = 0; i+2 < end; i++) { - TRIANGLE(varray, DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL, - i + 0, i + 1 + (i&1), i + 2 - (i&1)); - } - i = end; - fetch_init(varray, end); - varray_flush(varray); - if (j + first + i <= count) { - varray->fetch_start -= 2; - i -= 2; - } - } - } - else { - for (j = 0; j + first <= count; j += i) { - unsigned end = MIN2(FETCH_MAX, count - j); - end -= (end % incr); - for (i = 0; i + 2 < end; i++) { - TRIANGLE(varray, DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL, - i + 0 + (i&1), i + 1 - (i&1), i + 2); - } - i = end; - fetch_init(varray, end); - varray_flush(varray); - if (j + first + i <= count) { - varray->fetch_start -= 2; - i -= 2; - } - } - } - break; - - case PIPE_PRIM_TRIANGLE_FAN: - if (count >= 3) { - if (flatfirst) { - flags = DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL; - for (j = 0; j + first <= count; j += i) { - unsigned end = MIN2(FETCH_MAX, count - j); - end -= (end % incr); - for (i = 0; i+2 < end; i++) { - TRIANGLE(varray, flags, i + 1, i + 2, 0); - } - i = end; - fetch_init(varray, end); - varray_flush(varray); - } - } - else { - flags = DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL; - for (j = 0; j + first <= count; j += i) { - unsigned end = MIN2(FETCH_MAX, count - j); - end -= (end 
% incr); - for (i = 0; i+2 < end; i++) { - TRIANGLE(varray, flags, 0, i + 1, i + 2); - } - i = end; - fetch_init(varray, end); - varray_flush(varray); - } - } - } - break; - - case PIPE_PRIM_QUADS: - for (j = 0; j + first <= count; j += i) { - unsigned end = MIN2(FETCH_MAX, count - j); - end -= (end % incr); - for (i = 0; i+3 < end; i += 4) { - QUAD(varray, i + 0, i + 1, i + 2, i + 3); - } - i = end; - fetch_init(varray, end); - varray_flush(varray); - } - break; - - case PIPE_PRIM_QUAD_STRIP: - for (j = 0; j + first <= count; j += i) { - unsigned end = MIN2(FETCH_MAX, count - j); - end -= (end % incr); - for (i = 0; i+3 < end; i += 2) { - QUAD(varray, i + 2, i + 0, i + 1, i + 3); - } - i = end; - fetch_init(varray, end); - varray_flush(varray); - if (j + first + i <= count) { - varray->fetch_start -= 2; - i -= 2; - } - } - break; - - case PIPE_PRIM_POLYGON: - { - /* These bitflags look a little odd because we submit the - * vertices as (1,2,0) to satisfy flatshade requirements. - */ - const ushort edge_first = DRAW_PIPE_EDGE_FLAG_2; - const ushort edge_middle = DRAW_PIPE_EDGE_FLAG_0; - const ushort edge_last = DRAW_PIPE_EDGE_FLAG_1; - - flags = DRAW_PIPE_RESET_STIPPLE | edge_first | edge_middle; - for (j = 0; j + first <= count; j += i) { - unsigned end = MIN2(FETCH_MAX, count - j); - end -= (end % incr); - for (i = 0; i+2 < end; i++, flags = edge_middle) { - - if (i + 3 == count) - flags |= edge_last; - - TRIANGLE(varray, flags, i + 1, i + 2, 0); - } - i = end; - fetch_init(varray, end); - varray_flush(varray); - } - } - break; - - default: - assert(0); - break; - } - - varray_flush(varray); -} - -#undef TRIANGLE -#undef QUAD -#undef POINT -#undef LINE -#undef FUNC diff --git a/src/gallium/auxiliary/draw/draw_pt_varray_tmp_linear.h b/src/gallium/auxiliary/draw/draw_pt_varray_tmp_linear.h deleted file mode 100644 index a292346be9..0000000000 --- a/src/gallium/auxiliary/draw/draw_pt_varray_tmp_linear.h +++ /dev/null @@ -1,98 +0,0 @@ -static unsigned trim( unsigned 
count, unsigned first, unsigned incr ) -{ - return count - (count - first) % incr; -} - -static void FUNC(struct draw_pt_front_end *frontend, - pt_elt_func get_elt, - const void *elts, - int elt_bias, - unsigned count) -{ - struct varray_frontend *varray = (struct varray_frontend *)frontend; - unsigned start = (unsigned) ((char *) elts - (char *) NULL); - - unsigned j; - unsigned first, incr; - - assert(elt_bias == 0); - - draw_pt_split_prim(varray->input_prim, &first, &incr); - - /* Sanitize primitive length: - */ - count = trim(count, first, incr); - if (count < first) - return; - -#if 0 - debug_printf("%s (%d) %d/%d\n", __FUNCTION__, - varray->input_prim, - start, count); -#endif - - switch (varray->input_prim) { - case PIPE_PRIM_POINTS: - case PIPE_PRIM_LINES: - case PIPE_PRIM_TRIANGLES: - case PIPE_PRIM_LINE_STRIP: - case PIPE_PRIM_TRIANGLE_STRIP: - case PIPE_PRIM_QUADS: - case PIPE_PRIM_QUAD_STRIP: - case PIPE_PRIM_LINES_ADJACENCY: - case PIPE_PRIM_LINE_STRIP_ADJACENCY: - case PIPE_PRIM_TRIANGLES_ADJACENCY: - case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY: - for (j = 0; j < count;) { - unsigned remaining = count - j; - unsigned nr = trim( MIN2(varray->driver_fetch_max, remaining), first, incr ); - varray_flush_linear(varray, start + j, nr); - j += nr; - if (nr != remaining) - j -= (first - incr); - } - break; - - case PIPE_PRIM_LINE_LOOP: - /* Always have to decompose as we've stated that this will be - * emitted as a line-strip. 
- */ - for (j = 0; j < count;) { - unsigned remaining = count - j; - unsigned nr = trim( MIN2(varray->fetch_max-1, remaining), first, incr ); - varray_line_loop_segment(varray, start, j, nr, nr == remaining); - j += nr; - if (nr != remaining) - j -= (first - incr); - } - break; - - - case PIPE_PRIM_POLYGON: - case PIPE_PRIM_TRIANGLE_FAN: - if (count < varray->driver_fetch_max) { - varray_flush_linear(varray, start, count); - } - else { - for ( j = 0; j < count;) { - unsigned remaining = count - j; - unsigned nr = trim( MIN2(varray->fetch_max-1, remaining), first, incr ); - varray_fan_segment(varray, start, j, nr); - j += nr; - if (nr != remaining) - j -= (first - incr); - } - } - break; - - default: - assert(0); - break; - } -} - -#undef TRIANGLE -#undef QUAD -#undef POINT -#undef LINE -#undef FUNC diff --git a/src/gallium/auxiliary/draw/draw_pt_vcache.c b/src/gallium/auxiliary/draw/draw_pt_vcache.c deleted file mode 100644 index a848b54f7d..0000000000 --- a/src/gallium/auxiliary/draw/draw_pt_vcache.c +++ /dev/null @@ -1,610 +0,0 @@ -/************************************************************************** - * - * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. - * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ - - /* - * Authors: - * Keith Whitwell <keith@tungstengraphics.com> - */ - -#include "util/u_memory.h" -#include "util/u_prim.h" -#include "draw/draw_context.h" -#include "draw/draw_private.h" -#include "draw/draw_pt.h" - - -#define CACHE_MAX 256 -#define FETCH_MAX 256 -#define DRAW_MAX (16*1024) - - -struct vcache_frontend { - struct draw_pt_front_end base; - struct draw_context *draw; - - unsigned in[CACHE_MAX]; - ushort out[CACHE_MAX]; - - ushort draw_elts[DRAW_MAX]; - unsigned fetch_elts[FETCH_MAX]; - - unsigned draw_count; - unsigned fetch_count; - unsigned fetch_max; - - struct draw_pt_middle_end *middle; - - unsigned input_prim; - unsigned output_prim; - - unsigned middle_prim; - unsigned opt; -}; - - -static INLINE void -vcache_flush( struct vcache_frontend *vcache ) -{ - if (vcache->middle_prim != vcache->output_prim) { - vcache->middle_prim = vcache->output_prim; - vcache->middle->prepare( vcache->middle, - vcache->middle_prim, - vcache->opt, - &vcache->fetch_max ); - } - - if (vcache->draw_count) { - vcache->middle->run( vcache->middle, - vcache->fetch_elts, - vcache->fetch_count, - vcache->draw_elts, - vcache->draw_count ); - } - - memset(vcache->in, ~0, sizeof(vcache->in)); - vcache->fetch_count = 0; - vcache->draw_count = 0; -} - - -static INLINE void -vcache_check_flush( struct vcache_frontend *vcache ) -{ - if (vcache->draw_count + 6 >= DRAW_MAX || - vcache->fetch_count + 6 >= 
FETCH_MAX) { - vcache_flush( vcache ); - } -} - - -static INLINE void -vcache_elt( struct vcache_frontend *vcache, - unsigned felt, - ushort flags ) -{ - unsigned idx = felt % CACHE_MAX; - - if (vcache->in[idx] != felt) { - assert(vcache->fetch_count < FETCH_MAX); - - vcache->in[idx] = felt; - vcache->out[idx] = (ushort)vcache->fetch_count; - vcache->fetch_elts[vcache->fetch_count++] = felt; - } - - vcache->draw_elts[vcache->draw_count++] = vcache->out[idx] | flags; -} - - - -static INLINE void -vcache_triangle( struct vcache_frontend *vcache, - unsigned i0, - unsigned i1, - unsigned i2 ) -{ - vcache_elt(vcache, i0, 0); - vcache_elt(vcache, i1, 0); - vcache_elt(vcache, i2, 0); - vcache_check_flush(vcache); -} - - -static INLINE void -vcache_triangle_flags( struct vcache_frontend *vcache, - ushort flags, - unsigned i0, - unsigned i1, - unsigned i2 ) -{ - vcache_elt(vcache, i0, flags); - vcache_elt(vcache, i1, 0); - vcache_elt(vcache, i2, 0); - vcache_check_flush(vcache); -} - - -static INLINE void -vcache_line( struct vcache_frontend *vcache, - unsigned i0, - unsigned i1 ) -{ - vcache_elt(vcache, i0, 0); - vcache_elt(vcache, i1, 0); - vcache_check_flush(vcache); -} - - -static INLINE void -vcache_line_flags( struct vcache_frontend *vcache, - ushort flags, - unsigned i0, - unsigned i1 ) -{ - vcache_elt(vcache, i0, flags); - vcache_elt(vcache, i1, 0); - vcache_check_flush(vcache); -} - - -static INLINE void -vcache_point( struct vcache_frontend *vcache, - unsigned i0 ) -{ - vcache_elt(vcache, i0, 0); - vcache_check_flush(vcache); -} - - -static INLINE void -vcache_line_adj_flags( struct vcache_frontend *vcache, - unsigned flags, - unsigned a0, unsigned i0, unsigned i1, unsigned a1 ) -{ - vcache_elt(vcache, a0, 0); - vcache_elt(vcache, i0, flags); - vcache_elt(vcache, i1, 0); - vcache_elt(vcache, a1, 0); - vcache_check_flush(vcache); -} - - -static INLINE void -vcache_line_adj( struct vcache_frontend *vcache, - unsigned a0, unsigned i0, unsigned i1, unsigned a1 ) -{ - 
vcache_elt(vcache, a0, 0); - vcache_elt(vcache, i0, 0); - vcache_elt(vcache, i1, 0); - vcache_elt(vcache, a1, 0); - vcache_check_flush(vcache); -} - - -static INLINE void -vcache_triangle_adj_flags( struct vcache_frontend *vcache, - unsigned flags, - unsigned i0, unsigned a0, - unsigned i1, unsigned a1, - unsigned i2, unsigned a2 ) -{ - vcache_elt(vcache, i0, flags); - vcache_elt(vcache, a0, 0); - vcache_elt(vcache, i1, 0); - vcache_elt(vcache, a1, 0); - vcache_elt(vcache, i2, 0); - vcache_elt(vcache, a2, 0); - vcache_check_flush(vcache); -} - - -static INLINE void -vcache_triangle_adj( struct vcache_frontend *vcache, - unsigned i0, unsigned a0, - unsigned i1, unsigned a1, - unsigned i2, unsigned a2 ) -{ - vcache_elt(vcache, i0, 0); - vcache_elt(vcache, a0, 0); - vcache_elt(vcache, i1, 0); - vcache_elt(vcache, a1, 0); - vcache_elt(vcache, i2, 0); - vcache_elt(vcache, a2, 0); - vcache_check_flush(vcache); -} - - -/* At least for now, we're back to using a template include file for - * this. The two paths aren't too different though - it may be - * possible to reunify them. 
- */ -#define TRIANGLE(flags,i0,i1,i2) vcache_triangle_flags(vcache,flags,i0,i1,i2) -#define LINE(flags,i0,i1) vcache_line_flags(vcache,flags,i0,i1) -#define POINT(i0) vcache_point(vcache,i0) -#define LINE_ADJ(flags,a0,i0,i1,a1) \ - vcache_line_adj_flags(vcache,flags,a0,i0,i1,a1) -#define TRIANGLE_ADJ(flags,i0,a0,i1,a1,i2,a2) \ - vcache_triangle_adj_flags(vcache,flags,i0,a0,i1,a1,i2,a2) -#define FUNC vcache_run_extras -#include "draw_pt_vcache_tmp.h" - -#define TRIANGLE(flags,i0,i1,i2) vcache_triangle(vcache,i0,i1,i2) -#define LINE(flags,i0,i1) vcache_line(vcache,i0,i1) -#define POINT(i0) vcache_point(vcache,i0) -#define LINE_ADJ(flags,a0,i0,i1,a1) \ - vcache_line_adj(vcache,a0,i0,i1,a1) -#define TRIANGLE_ADJ(flags,i0,a0,i1,a1,i2,a2) \ - vcache_triangle_adj(vcache,i0,a0,i1,a1,i2,a2) -#define FUNC vcache_run -#include "draw_pt_vcache_tmp.h" - -static INLINE void -rebase_uint_elts( const unsigned *src, - unsigned count, - int delta, - ushort *dest ) -{ - unsigned i; - for (i = 0; i < count; i++) - dest[i] = (ushort)(src[i] + delta); -} - - -static INLINE void -rebase_ushort_elts( const ushort *src, - unsigned count, - int delta, - ushort *dest ) -{ - unsigned i; - for (i = 0; i < count; i++) - dest[i] = (ushort)(src[i] + delta); -} - - -static INLINE void -rebase_ubyte_elts( const ubyte *src, - unsigned count, - int delta, - ushort *dest ) -{ - unsigned i; - for (i = 0; i < count; i++) - dest[i] = (ushort)(src[i] + delta); -} - - -static INLINE void -translate_uint_elts( const unsigned *src, - unsigned count, - ushort *dest ) -{ - unsigned i; - for (i = 0; i < count; i++) - dest[i] = (ushort)(src[i]); -} - - -static INLINE void -translate_ushort_elts( const ushort *src, - unsigned count, - ushort *dest ) -{ - unsigned i; - for (i = 0; i < count; i++) - dest[i] = (ushort)(src[i]); -} - - -static INLINE void -translate_ubyte_elts( const ubyte *src, - unsigned count, - ushort *dest ) -{ - unsigned i; - for (i = 0; i < count; i++) - dest[i] = (ushort)(src[i]); -} - - - - 
-#if 0 -static INLINE enum pipe_format -format_from_get_elt( pt_elt_func get_elt ) -{ - switch (draw->pt.user.eltSize) { - case 1: return PIPE_FORMAT_R8_UNORM; - case 2: return PIPE_FORMAT_R16_UNORM; - case 4: return PIPE_FORMAT_R32_UNORM; - default: return PIPE_FORMAT_NONE; - } -} -#endif - - -/** - * Check if any vertex attributes use instance divisors. - * Note that instance divisors complicate vertex fetching so we need - * to take the vcache path when they're in use. - */ -static boolean -any_instance_divisors(const struct draw_context *draw) -{ - uint i; - - for (i = 0; i < draw->pt.nr_vertex_elements; i++) { - uint div = draw->pt.vertex_element[i].instance_divisor; - if (div) - return TRUE; - } - return FALSE; -} - - -static INLINE void -vcache_check_run( struct draw_pt_front_end *frontend, - pt_elt_func get_elt, - const void *elts, - int elt_bias, - unsigned draw_count ) -{ - struct vcache_frontend *vcache = (struct vcache_frontend *)frontend; - struct draw_context *draw = vcache->draw; - const unsigned min_index = draw->pt.user.min_index; - const unsigned max_index = draw->pt.user.max_index; - const unsigned index_size = draw->pt.user.eltSize; - unsigned fetch_count; - const ushort *transformed_elts; - ushort *storage = NULL; - boolean ok = FALSE; - - /* debug: verify indexes are in range [min_index, max_index] */ - if (0) { - unsigned i; - for (i = 0; i < draw_count; i++) { - if (index_size == 1) { - assert( ((const ubyte *) elts)[i] >= min_index); - assert( ((const ubyte *) elts)[i] <= max_index); - } - else if (index_size == 2) { - assert( ((const ushort *) elts)[i] >= min_index); - assert( ((const ushort *) elts)[i] <= max_index); - } - else { - assert(index_size == 4); - assert( ((const uint *) elts)[i] >= min_index); - assert( ((const uint *) elts)[i] <= max_index); - } - } - } - - /* Note: max_index is frequently 0xffffffff so we have to be sure - * that any arithmetic involving max_index doesn't overflow! 
- */ - if (max_index >= (unsigned) DRAW_PIPE_MAX_VERTICES) - goto fail; - - if (any_instance_divisors(draw)) - goto fail; - - fetch_count = max_index + 1 - min_index; - - if (0) - debug_printf("fetch_count %d fetch_max %d draw_count %d\n", fetch_count, - vcache->fetch_max, - draw_count); - - if (elt_bias + max_index >= DRAW_PIPE_MAX_VERTICES || - fetch_count >= UNDEFINED_VERTEX_ID || - fetch_count > draw_count) { - if (0) debug_printf("fail\n"); - goto fail; - } - - if (vcache->middle_prim != vcache->input_prim) { - vcache->middle_prim = vcache->input_prim; - vcache->middle->prepare( vcache->middle, - vcache->middle_prim, - vcache->opt, - &vcache->fetch_max ); - } - - assert((elt_bias >= 0 && min_index + elt_bias >= min_index) || - (elt_bias < 0 && min_index + elt_bias < min_index)); - - if (min_index == 0 && - index_size == 2) { - transformed_elts = (const ushort *)elts; - } - else { - storage = MALLOC( draw_count * sizeof(ushort) ); - if (!storage) - goto fail; - - if (min_index == 0) { - switch(index_size) { - case 1: - translate_ubyte_elts( (const ubyte *)elts, - draw_count, - storage ); - break; - - case 2: - translate_ushort_elts( (const ushort *)elts, - draw_count, - storage ); - break; - - case 4: - translate_uint_elts( (const uint *)elts, - draw_count, - storage ); - break; - - default: - assert(0); - FREE(storage); - return; - } - } - else { - switch(index_size) { - case 1: - rebase_ubyte_elts( (const ubyte *)elts, - draw_count, - 0 - (int)min_index, - storage ); - break; - - case 2: - rebase_ushort_elts( (const ushort *)elts, - draw_count, - 0 - (int)min_index, - storage ); - break; - - case 4: - rebase_uint_elts( (const uint *)elts, - draw_count, - 0 - (int)min_index, - storage ); - break; - - default: - assert(0); - FREE(storage); - return; - } - } - transformed_elts = storage; - } - - if (fetch_count < UNDEFINED_VERTEX_ID) - ok = vcache->middle->run_linear_elts( vcache->middle, - min_index + elt_bias, /* start */ - fetch_count, - transformed_elts, - 
draw_count ); - - FREE(storage); - - if (ok) - return; - - debug_printf("failed to execute atomic draw elts for %d/%d, splitting up\n", - fetch_count, draw_count); - -fail: - vcache_run( frontend, get_elt, elts, elt_bias, draw_count ); -} - - - - -static void -vcache_prepare( struct draw_pt_front_end *frontend, - unsigned in_prim, - struct draw_pt_middle_end *middle, - unsigned opt ) -{ - struct vcache_frontend *vcache = (struct vcache_frontend *)frontend; - - if (opt & PT_PIPELINE) { - vcache->base.run = vcache_run_extras; - } - else { - vcache->base.run = vcache_check_run; - } - - /* VCache will always emit the reduced version of its input - * primitive, ie STRIP/FANS become TRIS, etc. - * - * This is not to be confused with what the GS might be up to, - * which is a separate issue. - */ - vcache->input_prim = in_prim; - switch (in_prim) { - case PIPE_PRIM_LINES_ADJACENCY: - case PIPE_PRIM_LINE_STRIP_ADJACENCY: - vcache->output_prim = PIPE_PRIM_LINES_ADJACENCY; - break; - case PIPE_PRIM_TRIANGLES_ADJACENCY: - case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY: - vcache->output_prim = PIPE_PRIM_TRIANGLES_ADJACENCY; - break; - default: - vcache->output_prim = u_reduced_prim(in_prim); - } - - vcache->middle = middle; - vcache->opt = opt; - - /* Have to run prepare here, but try and guess a good prim for - * doing so: - */ - vcache->middle_prim = (opt & PT_PIPELINE) - ? 
vcache->output_prim : vcache->input_prim; - - middle->prepare( middle, - vcache->middle_prim, - opt, &vcache->fetch_max ); -} - - -static void -vcache_finish( struct draw_pt_front_end *frontend ) -{ - struct vcache_frontend *vcache = (struct vcache_frontend *)frontend; - vcache->middle->finish( vcache->middle ); - vcache->middle = NULL; -} - - -static void -vcache_destroy( struct draw_pt_front_end *frontend ) -{ - FREE(frontend); -} - - -struct draw_pt_front_end *draw_pt_vcache( struct draw_context *draw ) -{ - struct vcache_frontend *vcache = CALLOC_STRUCT( vcache_frontend ); - if (vcache == NULL) - return NULL; - - vcache->base.prepare = vcache_prepare; - vcache->base.run = NULL; - vcache->base.finish = vcache_finish; - vcache->base.destroy = vcache_destroy; - vcache->draw = draw; - - memset(vcache->in, ~0, sizeof(vcache->in)); - - return &vcache->base; -} diff --git a/src/gallium/auxiliary/draw/draw_pt_vcache_tmp.h b/src/gallium/auxiliary/draw/draw_pt_vcache_tmp.h deleted file mode 100644 index 1a3748d5f0..0000000000 --- a/src/gallium/auxiliary/draw/draw_pt_vcache_tmp.h +++ /dev/null @@ -1,19 +0,0 @@ -#define FUNC_VARS \ - struct draw_pt_front_end *frontend, \ - pt_elt_func get_elt, \ - const void *elts, \ - int elt_bias, \ - unsigned count - -#define LOCAL_VARS \ - struct vcache_frontend *vcache = (struct vcache_frontend *) frontend; \ - struct draw_context *draw = vcache->draw; \ - const unsigned prim = vcache->input_prim; \ - const boolean last_vertex_last = !(draw->rasterizer->flatshade && \ - draw->rasterizer->flatshade_first); - -#define GET_ELT(idx) (get_elt(elts, idx) + elt_bias) - -#define FUNC_EXIT do { vcache_flush(vcache); } while (0) - -#include "draw_decompose_tmp.h" diff --git a/src/gallium/auxiliary/draw/draw_pt_vsplit.c b/src/gallium/auxiliary/draw/draw_pt_vsplit.c new file mode 100644 index 0000000000..a687525309 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_pt_vsplit.c @@ -0,0 +1,208 @@ +/* + * Mesa 3-D graphics library + * Version: 7.9 
+ * + * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * Copyright (C) 2010 LunarG Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#include "util/u_math.h" +#include "util/u_memory.h" + +#include "draw/draw_context.h" +#include "draw/draw_private.h" +#include "draw/draw_pt.h" + +#define SEGMENT_SIZE 1024 +#define MAP_SIZE 256 + +struct vsplit_frontend { + struct draw_pt_front_end base; + struct draw_context *draw; + + unsigned prim; + + struct draw_pt_middle_end *middle; + + unsigned max_vertices; + ushort segment_size; + + /* buffers for splitting */ + unsigned fetch_elts[SEGMENT_SIZE]; + ushort draw_elts[SEGMENT_SIZE]; + ushort identity_draw_elts[SEGMENT_SIZE]; + + struct { + /* map a fetch element to a draw element */ + unsigned fetches[MAP_SIZE]; + ushort draws[MAP_SIZE]; + boolean has_max_fetch; + + ushort num_fetch_elts; + ushort num_draw_elts; + } cache; +}; + + +static void +vsplit_clear_cache(struct vsplit_frontend *vsplit) +{ + memset(vsplit->cache.fetches, 0xff, sizeof(vsplit->cache.fetches)); + vsplit->cache.has_max_fetch = FALSE; + vsplit->cache.num_fetch_elts = 0; + vsplit->cache.num_draw_elts = 0; +} + +static void +vsplit_flush_cache(struct vsplit_frontend *vsplit, unsigned flags) +{ + vsplit->middle->run(vsplit->middle, + vsplit->fetch_elts, vsplit->cache.num_fetch_elts, + vsplit->draw_elts, vsplit->cache.num_draw_elts, flags); +} + +/** + * Add a fetch element and add it to the draw elements. + */ +static INLINE void +vsplit_add_cache(struct vsplit_frontend *vsplit, unsigned fetch) +{ + unsigned hash = fetch % MAP_SIZE; + + if (vsplit->cache.fetches[hash] != fetch) { + /* update cache */ + vsplit->cache.fetches[hash] = fetch; + vsplit->cache.draws[hash] = vsplit->cache.num_fetch_elts; + + /* add fetch */ + assert(vsplit->cache.num_fetch_elts < vsplit->segment_size); + vsplit->fetch_elts[vsplit->cache.num_fetch_elts++] = fetch; + } + + vsplit->draw_elts[vsplit->cache.num_draw_elts++] = vsplit->cache.draws[hash]; +} + + +/** + * Add a fetch element and add it to the draw elements. The fetch element is + * in full range (uint). 
+ */ +static INLINE void +vsplit_add_cache_uint(struct vsplit_frontend *vsplit, unsigned fetch) +{ + /* special care for 0xffffffff */ + if (fetch == 0xffffffff && !vsplit->cache.has_max_fetch) { + unsigned hash = fetch % MAP_SIZE; + vsplit->cache.fetches[hash] = fetch - 1; /* force update */ + vsplit->cache.has_max_fetch = TRUE; + } + + vsplit_add_cache(vsplit, fetch); +} + + +#define FUNC vsplit_run_linear +#include "draw_pt_vsplit_tmp.h" + +#define FUNC vsplit_run_ubyte +#define ELT_TYPE ubyte +#define ADD_CACHE(vsplit, fetch) vsplit_add_cache(vsplit, fetch) +#include "draw_pt_vsplit_tmp.h" + +#define FUNC vsplit_run_ushort +#define ELT_TYPE ushort +#define ADD_CACHE(vsplit, fetch) vsplit_add_cache(vsplit, fetch) +#include "draw_pt_vsplit_tmp.h" + +#define FUNC vsplit_run_uint +#define ELT_TYPE uint +#define ADD_CACHE(vsplit, fetch) vsplit_add_cache_uint(vsplit, fetch) +#include "draw_pt_vsplit_tmp.h" + + +static void vsplit_prepare(struct draw_pt_front_end *frontend, + unsigned in_prim, + struct draw_pt_middle_end *middle, + unsigned opt) +{ + struct vsplit_frontend *vsplit = (struct vsplit_frontend *) frontend; + + switch (vsplit->draw->pt.user.eltSize) { + case 0: + vsplit->base.run = vsplit_run_linear; + break; + case 1: + vsplit->base.run = vsplit_run_ubyte; + break; + case 2: + vsplit->base.run = vsplit_run_ushort; + break; + case 4: + vsplit->base.run = vsplit_run_uint; + break; + default: + assert(0); + break; + } + + /* split only */ + vsplit->prim = in_prim; + + vsplit->middle = middle; + middle->prepare(middle, vsplit->prim, opt, &vsplit->max_vertices); + + vsplit->segment_size = MIN2(SEGMENT_SIZE, vsplit->max_vertices); +} + + +static void vsplit_finish(struct draw_pt_front_end *frontend) +{ + struct vsplit_frontend *vsplit = (struct vsplit_frontend *) frontend; + vsplit->middle->finish(vsplit->middle); + vsplit->middle = NULL; +} + + +static void vsplit_destroy(struct draw_pt_front_end *frontend) +{ + FREE(frontend); +} + + +struct draw_pt_front_end 
*draw_pt_vsplit(struct draw_context *draw) +{ + struct vsplit_frontend *vsplit = CALLOC_STRUCT(vsplit_frontend); + ushort i; + + if (!vsplit) + return NULL; + + vsplit->base.prepare = vsplit_prepare; + vsplit->base.run = NULL; + vsplit->base.finish = vsplit_finish; + vsplit->base.destroy = vsplit_destroy; + vsplit->draw = draw; + + for (i = 0; i < SEGMENT_SIZE; i++) + vsplit->identity_draw_elts[i] = i; + + return &vsplit->base; +} diff --git a/src/gallium/auxiliary/draw/draw_pt_vsplit_tmp.h b/src/gallium/auxiliary/draw/draw_pt_vsplit_tmp.h new file mode 100644 index 0000000000..4bb57b1493 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_pt_vsplit_tmp.h @@ -0,0 +1,307 @@ +/* + * Mesa 3-D graphics library + * Version: 7.9 + * + * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * Copyright (C) 2010 LunarG Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#define CONCAT2(name, elt_type) name ## elt_type +#define CONCAT(name, elt_type) CONCAT2(name, elt_type) + +#ifdef ELT_TYPE + +/** + * Fetch all elements in [min_index, max_index] with bias, and use the + * (rebased) index buffer as the draw elements. + */ +static boolean +CONCAT(vsplit_primitive_, ELT_TYPE)(struct vsplit_frontend *vsplit, + unsigned istart, unsigned icount) +{ + struct draw_context *draw = vsplit->draw; + const ELT_TYPE *ib = (const ELT_TYPE *) draw->pt.user.elts; + const unsigned min_index = draw->pt.user.min_index; + const unsigned max_index = draw->pt.user.max_index; + const int elt_bias = draw->pt.user.eltBias; + unsigned fetch_start, fetch_count; + const ushort *draw_elts = NULL; + unsigned i; + + /* use the ib directly */ + if (min_index == 0 && sizeof(ib[0]) == sizeof(draw_elts[0])) { + if (icount > vsplit->max_vertices) + return FALSE; + + for (i = 0; i < icount; i++) { + ELT_TYPE idx = ib[istart + i]; + assert(idx >= min_index && idx <= max_index); + } + draw_elts = (const ushort *) ib; + } + else { + /* have to go through vsplit->draw_elts */ + if (icount > vsplit->segment_size) + return FALSE; + } + + /* this is faster only when we fetch less elements than the normal path */ + if (max_index - min_index > icount - 1) + return FALSE; + + if (elt_bias < 0 && min_index < -elt_bias) + return FALSE; + + /* why this check? 
*/ + for (i = 0; i < draw->pt.nr_vertex_elements; i++) { + if (draw->pt.vertex_element[i].instance_divisor) + return FALSE; + } + + fetch_start = min_index + elt_bias; + fetch_count = max_index - min_index + 1; + + if (!draw_elts) { + if (min_index == 0) { + for (i = 0; i < icount; i++) { + ELT_TYPE idx = ib[istart + i]; + + assert(idx >= min_index && idx <= max_index); + vsplit->draw_elts[i] = (ushort) idx; + } + } + else { + for (i = 0; i < icount; i++) { + ELT_TYPE idx = ib[istart + i]; + + assert(idx >= min_index && idx <= max_index); + vsplit->draw_elts[i] = (ushort) (idx - min_index); + } + } + + draw_elts = vsplit->draw_elts; + } + + return vsplit->middle->run_linear_elts(vsplit->middle, + fetch_start, fetch_count, + draw_elts, icount, 0x0); +} + +/** + * Use the cache to prepare the fetch and draw elements, and flush. + * + * When spoken is TRUE, ispoken replaces istart; When close is TRUE, iclose is + * appended. + */ +static INLINE void +CONCAT(vsplit_segment_cache_, ELT_TYPE)(struct vsplit_frontend *vsplit, + unsigned flags, + unsigned istart, unsigned icount, + boolean spoken, unsigned ispoken, + boolean close, unsigned iclose) +{ + struct draw_context *draw = vsplit->draw; + const ELT_TYPE *ib = (const ELT_TYPE *) draw->pt.user.elts; + const int ibias = draw->pt.user.eltBias; + unsigned i; + + assert(icount + !!close <= vsplit->segment_size); + + vsplit_clear_cache(vsplit); + + spoken = !!spoken; + if (ibias == 0) { + if (spoken) + ADD_CACHE(vsplit, ib[ispoken]); + + for (i = spoken; i < icount; i++) + ADD_CACHE(vsplit, ib[istart + i]); + + if (close) + ADD_CACHE(vsplit, ib[iclose]); + } + else if (ibias > 0) { + if (spoken) + ADD_CACHE(vsplit, (uint) ib[ispoken] + ibias); + + for (i = spoken; i < icount; i++) + ADD_CACHE(vsplit, (uint) ib[istart + i] + ibias); + + if (close) + ADD_CACHE(vsplit, (uint) ib[iclose] + ibias); + } + else { + if (spoken) { + if (ib[ispoken] < -ibias) + return; + ADD_CACHE(vsplit, ib[ispoken] + ibias); + } + + for (i = 
spoken; i < icount; i++) { + if (ib[istart + i] < -ibias) + return; + ADD_CACHE(vsplit, ib[istart + i] + ibias); + } + + if (close) { + if (ib[iclose] < -ibias) + return; + ADD_CACHE(vsplit, ib[iclose] + ibias); + } + } + + vsplit_flush_cache(vsplit, flags); +} + +static void +CONCAT(vsplit_segment_simple_, ELT_TYPE)(struct vsplit_frontend *vsplit, + unsigned flags, + unsigned istart, + unsigned icount) +{ + CONCAT(vsplit_segment_cache_, ELT_TYPE)(vsplit, + flags, istart, icount, FALSE, 0, FALSE, 0); +} + +static void +CONCAT(vsplit_segment_loop_, ELT_TYPE)(struct vsplit_frontend *vsplit, + unsigned flags, + unsigned istart, + unsigned icount, + unsigned i0) +{ + const boolean close_loop = ((flags) == DRAW_SPLIT_BEFORE); + + CONCAT(vsplit_segment_cache_, ELT_TYPE)(vsplit, + flags, istart, icount, FALSE, 0, close_loop, i0); +} + +static void +CONCAT(vsplit_segment_fan_, ELT_TYPE)(struct vsplit_frontend *vsplit, + unsigned flags, + unsigned istart, + unsigned icount, + unsigned i0) +{ + const boolean use_spoken = (((flags) & DRAW_SPLIT_BEFORE) != 0); + + CONCAT(vsplit_segment_cache_, ELT_TYPE)(vsplit, + flags, istart, icount, use_spoken, i0, FALSE, 0); +} + +#define LOCAL_VARS \ + struct vsplit_frontend *vsplit = (struct vsplit_frontend *) frontend; \ + const unsigned prim = vsplit->prim; \ + const unsigned max_count_simple = vsplit->segment_size; \ + const unsigned max_count_loop = vsplit->segment_size - 1; \ + const unsigned max_count_fan = vsplit->segment_size; + +#define PRIMITIVE(istart, icount) \ + CONCAT(vsplit_primitive_, ELT_TYPE)(vsplit, istart, icount) + +#else /* ELT_TYPE */ + +static void +vsplit_segment_simple_linear(struct vsplit_frontend *vsplit, unsigned flags, + unsigned istart, unsigned icount) +{ + assert(icount <= vsplit->max_vertices); + vsplit->middle->run_linear(vsplit->middle, istart, icount, flags); +} + +static void +vsplit_segment_loop_linear(struct vsplit_frontend *vsplit, unsigned flags, + unsigned istart, unsigned icount, unsigned i0) 
+{ + boolean close_loop = (flags == DRAW_SPLIT_BEFORE); + unsigned nr; + + assert(icount + !!close_loop <= vsplit->segment_size); + + if (close_loop) { + for (nr = 0; nr < icount; nr++) + vsplit->fetch_elts[nr] = istart + nr; + vsplit->fetch_elts[nr++] = i0; + + vsplit->middle->run(vsplit->middle, vsplit->fetch_elts, nr, + vsplit->identity_draw_elts, nr, flags); + } + else { + vsplit->middle->run_linear(vsplit->middle, istart, icount, flags); + } +} + +static void +vsplit_segment_fan_linear(struct vsplit_frontend *vsplit, unsigned flags, + unsigned istart, unsigned icount, unsigned i0) +{ + boolean use_spoken = ((flags & DRAW_SPLIT_BEFORE) != 0); + unsigned nr = 0, i; + + assert(icount + !!use_spoken <= vsplit->segment_size); + + if (use_spoken) { + vsplit->fetch_elts[nr++] = i0; + for (i = 1 ; i < icount; i++) + vsplit->fetch_elts[nr++] = istart + i; + + vsplit->middle->run(vsplit->middle, vsplit->fetch_elts, nr, + vsplit->identity_draw_elts, nr, flags); + } + else { + vsplit->middle->run_linear(vsplit->middle, istart, icount, flags); + } +} + +#define LOCAL_VARS \ + struct vsplit_frontend *vsplit = (struct vsplit_frontend *) frontend; \ + const unsigned prim = vsplit->prim; \ + const unsigned max_count_simple = vsplit->max_vertices; \ + const unsigned max_count_loop = vsplit->segment_size - 1; \ + const unsigned max_count_fan = vsplit->segment_size; + +#define PRIMITIVE(istart, icount) FALSE + +#define ELT_TYPE linear + +#endif /* ELT_TYPE */ + +#define FUNC_VARS \ + struct draw_pt_front_end *frontend, \ + unsigned start, \ + unsigned count + +#define SEGMENT_SIMPLE(flags, istart, icount) \ + CONCAT(vsplit_segment_simple_, ELT_TYPE)(vsplit, flags, istart, icount) + +#define SEGMENT_LOOP(flags, istart, icount, i0) \ + CONCAT(vsplit_segment_loop_, ELT_TYPE)(vsplit, flags, istart, icount, i0) + +#define SEGMENT_FAN(flags, istart, icount, i0) \ + CONCAT(vsplit_segment_fan_, ELT_TYPE)(vsplit, flags, istart, icount, i0) + +#include "draw_split_tmp.h" + +#undef CONCAT2 
+#undef CONCAT + +#undef ELT_TYPE +#undef ADD_CACHE diff --git a/src/gallium/auxiliary/draw/draw_so_emit_tmp.h b/src/gallium/auxiliary/draw/draw_so_emit_tmp.h index 6d8937a0b4..7fafde9d5e 100644 --- a/src/gallium/auxiliary/draw/draw_so_emit_tmp.h +++ b/src/gallium/auxiliary/draw/draw_so_emit_tmp.h @@ -7,11 +7,9 @@ #define FUNC_ENTER \ /* declare more local vars */ \ - struct draw_context *draw = so->draw; \ const unsigned prim = input_prims->prim; \ - const boolean last_vertex_last = \ - !(draw->rasterizer->flatshade && \ - draw->rasterizer->flatshade_first); \ + const unsigned prim_flags = input_prims->flags; \ + const boolean last_vertex_last = TRUE; \ do { \ debug_assert(input_prims->primitive_count == 1); \ switch (prim) { \ diff --git a/src/gallium/auxiliary/draw/draw_split_tmp.h b/src/gallium/auxiliary/draw/draw_split_tmp.h new file mode 100644 index 0000000000..47defc62b9 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_split_tmp.h @@ -0,0 +1,176 @@ +/* + * Mesa 3-D graphics library + * Version: 7.9 + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * Copyright (C) 2010 LunarG Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +static void +FUNC(FUNC_VARS) +{ + unsigned first, incr; + LOCAL_VARS + + /* + * prim, start, count, and max_count_{simple,loop,fan} should have been + * defined + */ + if (0) { + debug_printf("%s: prim 0x%x, start %d, count %d, max_count_simple %d, " + "max_count_loop %d, max_count_fan %d\n", + __FUNCTION__, prim, start, count, max_count_simple, + max_count_loop, max_count_fan); + } + + draw_pt_split_prim(prim, &first, &incr); + /* sanitize primitive length */ + count = draw_pt_trim_count(count, first, incr); + if (count < first) + return; + + /* try flushing the entire primitive */ + if (PRIMITIVE(start, count)) + return; + + /* must be able to at least flush two complete primitives */ + assert(max_count_simple >= first + incr && + max_count_loop >= first + incr && + max_count_fan >= first + incr); + + /* no splitting required */ + if (count <= max_count_simple) { + SEGMENT_SIMPLE(0x0, start, count); + } + else { + const unsigned rollback = first - incr; + unsigned flags = DRAW_SPLIT_AFTER, seg_start = 0, seg_max; + + /* + * Both count and seg_max below are explicitly trimmed. Because + * + * seg_start = N * (seg_max - rollback) = N' * incr, + * + * we have + * + * remaining = count - seg_start = first + N'' * incr. + * + * That is, remaining is implicitly trimmed. 
+ */ + switch (prim) { + case PIPE_PRIM_POINTS: + case PIPE_PRIM_LINES: + case PIPE_PRIM_LINE_STRIP: + case PIPE_PRIM_TRIANGLES: + case PIPE_PRIM_TRIANGLE_STRIP: + case PIPE_PRIM_QUADS: + case PIPE_PRIM_QUAD_STRIP: + case PIPE_PRIM_LINES_ADJACENCY: + case PIPE_PRIM_LINE_STRIP_ADJACENCY: + case PIPE_PRIM_TRIANGLES_ADJACENCY: + case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY: + seg_max = + draw_pt_trim_count(MIN2(max_count_simple, count), first, incr); + if (prim == PIPE_PRIM_TRIANGLE_STRIP || + prim == PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY) { + /* make sure we flush even number of triangles at a time */ + if (seg_max < count && !(((seg_max - first) / incr) & 1)) + seg_max -= incr; + } + + do { + const unsigned remaining = count - seg_start; + + if (remaining > seg_max) { + SEGMENT_SIMPLE(flags, start + seg_start, seg_max); + seg_start += seg_max - rollback; + + flags |= DRAW_SPLIT_BEFORE; + } + else { + flags &= ~DRAW_SPLIT_AFTER; + + SEGMENT_SIMPLE(flags, start + seg_start, remaining); + seg_start += remaining; + } + } while (seg_start < count); + break; + + case PIPE_PRIM_LINE_LOOP: + seg_max = + draw_pt_trim_count(MIN2(max_count_loop, count), first, incr); + + do { + const unsigned remaining = count - seg_start; + + if (remaining > seg_max) { + SEGMENT_LOOP(flags, start + seg_start, seg_max, start); + seg_start += seg_max - rollback; + + flags |= DRAW_SPLIT_BEFORE; + } + else { + flags &= ~DRAW_SPLIT_AFTER; + + SEGMENT_LOOP(flags, start + seg_start, remaining, start); + seg_start += remaining; + } + } while (seg_start < count); + break; + + case PIPE_PRIM_TRIANGLE_FAN: + case PIPE_PRIM_POLYGON: + seg_max = + draw_pt_trim_count(MIN2(max_count_fan, count), first, incr); + + do { + const unsigned remaining = count - seg_start; + + if (remaining > seg_max) { + SEGMENT_FAN(flags, start + seg_start, seg_max, start); + seg_start += seg_max - rollback; + + flags |= DRAW_SPLIT_BEFORE; + } + else { + flags &= ~DRAW_SPLIT_AFTER; + + SEGMENT_FAN(flags, start + seg_start, remaining, 
start); + seg_start += remaining; + } + } while (seg_start < count); + break; + + default: + assert(0); + break; + } + } +} + +#undef FUNC +#undef FUNC_VARS +#undef LOCAL_VARS + +#undef PRIMITIVE +#undef SEGMENT_SIMPLE +#undef SEGMENT_LOOP +#undef SEGMENT_FAN diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c index f5f2623e46..7b35dd4bb4 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c @@ -1,6 +1,6 @@ /************************************************************************** * - * Copyright 2009 VMware, Inc. + * Copyright 2009-2010 VMware, Inc. * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a @@ -59,6 +59,19 @@ #include "lp_bld_arit.h" +/* + * XXX: Increasing eliminates some artifacts, but adds others, most + * noticeably corruption in the Earth halo in Google Earth. + */ +#define RCP_NEWTON_STEPS 0 + +#define RSQRT_NEWTON_STEPS 0 + +#define EXP_POLY_DEGREE 3 + +#define LOG_POLY_DEGREE 5 + + /** * Generate min(a, b) * No checks for special case values of a or b = 1 or 0 are done. 
@@ -72,6 +85,9 @@ lp_build_min_simple(struct lp_build_context *bld, const char *intrinsic = NULL; LLVMValueRef cond; + assert(lp_check_value(type, a)); + assert(lp_check_value(type, b)); + /* TODO: optimize the constant case */ if(type.width * type.length == 128) { @@ -118,6 +134,9 @@ lp_build_max_simple(struct lp_build_context *bld, const char *intrinsic = NULL; LLVMValueRef cond; + assert(lp_check_value(type, a)); + assert(lp_check_value(type, b)); + /* TODO: optimize the constant case */ if(type.width * type.length == 128) { @@ -160,6 +179,8 @@ lp_build_comp(struct lp_build_context *bld, { const struct lp_type type = bld->type; + assert(lp_check_value(type, a)); + if(a == bld->one) return bld->zero; if(a == bld->zero) @@ -173,9 +194,15 @@ lp_build_comp(struct lp_build_context *bld, } if(LLVMIsConstant(a)) - return LLVMConstSub(bld->one, a); + if (type.floating) + return LLVMConstFSub(bld->one, a); + else + return LLVMConstSub(bld->one, a); else - return LLVMBuildSub(bld->builder, bld->one, a, ""); + if (type.floating) + return LLVMBuildFSub(bld->builder, bld->one, a, ""); + else + return LLVMBuildSub(bld->builder, bld->one, a, ""); } @@ -190,6 +217,9 @@ lp_build_add(struct lp_build_context *bld, const struct lp_type type = bld->type; LLVMValueRef res; + assert(lp_check_value(type, a)); + assert(lp_check_value(type, b)); + if(a == bld->zero) return b; if(b == bld->zero) @@ -217,9 +247,15 @@ lp_build_add(struct lp_build_context *bld, } if(LLVMIsConstant(a) && LLVMIsConstant(b)) - res = LLVMConstAdd(a, b); + if (type.floating) + res = LLVMConstFAdd(a, b); + else + res = LLVMConstAdd(a, b); else - res = LLVMBuildAdd(bld->builder, a, b, ""); + if (type.floating) + res = LLVMBuildFAdd(bld->builder, a, b, ""); + else + res = LLVMBuildAdd(bld->builder, a, b, ""); /* clamp to ceiling of 1.0 */ if(bld->type.norm && (bld->type.floating || bld->type.fixed)) @@ -240,6 +276,8 @@ lp_build_sum_vector(struct lp_build_context *bld, LLVMValueRef index, res; unsigned i; + 
assert(lp_check_value(type, a)); + if (a == bld->zero) return bld->zero; if (a == bld->undef) @@ -253,9 +291,16 @@ lp_build_sum_vector(struct lp_build_context *bld, for (i = 1; i < type.length; i++) { index = LLVMConstInt(LLVMInt32Type(), i, 0); - res = LLVMBuildAdd(bld->builder, res, - LLVMBuildExtractElement(bld->builder, a, index, ""), - ""); + if (type.floating) + res = LLVMBuildFAdd(bld->builder, res, + LLVMBuildExtractElement(bld->builder, + a, index, ""), + ""); + else + res = LLVMBuildAdd(bld->builder, res, + LLVMBuildExtractElement(bld->builder, + a, index, ""), + ""); } return res; @@ -273,6 +318,9 @@ lp_build_sub(struct lp_build_context *bld, const struct lp_type type = bld->type; LLVMValueRef res; + assert(lp_check_value(type, a)); + assert(lp_check_value(type, b)); + if(b == bld->zero) return a; if(a == bld->undef || b == bld->undef) @@ -300,9 +348,15 @@ lp_build_sub(struct lp_build_context *bld, } if(LLVMIsConstant(a) && LLVMIsConstant(b)) - res = LLVMConstSub(a, b); + if (type.floating) + res = LLVMConstFSub(a, b); + else + res = LLVMConstSub(a, b); else - res = LLVMBuildSub(bld->builder, a, b, ""); + if (type.floating) + res = LLVMBuildFSub(bld->builder, a, b, ""); + else + res = LLVMBuildSub(bld->builder, a, b, ""); if(bld->type.norm && (bld->type.floating || bld->type.fixed)) res = lp_build_max_simple(bld, res, bld->zero); @@ -360,6 +414,10 @@ lp_build_mul_u8n(LLVMBuilderRef builder, LLVMValueRef c8; LLVMValueRef ab; + assert(!i16_type.floating); + assert(lp_check_value(i16_type, a)); + assert(lp_check_value(i16_type, b)); + c8 = lp_build_const_int_vec(i16_type, 8); #if 0 @@ -395,6 +453,9 @@ lp_build_mul(struct lp_build_context *bld, LLVMValueRef shift; LLVMValueRef res; + assert(lp_check_value(type, a)); + assert(lp_check_value(type, b)); + if(a == bld->zero) return bld->zero; if(a == bld->one) @@ -433,7 +494,10 @@ lp_build_mul(struct lp_build_context *bld, shift = NULL; if(LLVMIsConstant(a) && LLVMIsConstant(b)) { - res = LLVMConstMul(a, b); + 
if (type.floating) + res = LLVMConstFMul(a, b); + else + res = LLVMConstMul(a, b); if(shift) { if(type.sign) res = LLVMConstAShr(res, shift); @@ -442,7 +506,10 @@ lp_build_mul(struct lp_build_context *bld, } } else { - res = LLVMBuildMul(bld->builder, a, b, ""); + if (type.floating) + res = LLVMBuildFMul(bld->builder, a, b, ""); + else + res = LLVMBuildMul(bld->builder, a, b, ""); if(shift) { if(type.sign) res = LLVMBuildAShr(bld->builder, res, shift, ""); @@ -465,6 +532,8 @@ lp_build_mul_imm(struct lp_build_context *bld, { LLVMValueRef factor; + assert(lp_check_value(bld->type, a)); + if(b == 0) return bld->zero; @@ -472,7 +541,7 @@ lp_build_mul_imm(struct lp_build_context *bld, return a; if(b == -1) - return LLVMBuildNeg(bld->builder, a, ""); + return lp_build_negate(bld, a); if(b == 2 && bld->type.floating) return lp_build_add(bld, a, a); @@ -518,6 +587,9 @@ lp_build_div(struct lp_build_context *bld, { const struct lp_type type = bld->type; + assert(lp_check_value(type, a)); + assert(lp_check_value(type, b)); + if(a == bld->zero) return bld->zero; if(a == bld->one) @@ -529,13 +601,24 @@ lp_build_div(struct lp_build_context *bld, if(a == bld->undef || b == bld->undef) return bld->undef; - if(LLVMIsConstant(a) && LLVMIsConstant(b)) - return LLVMConstFDiv(a, b); + if(LLVMIsConstant(a) && LLVMIsConstant(b)) { + if (type.floating) + return LLVMConstFDiv(a, b); + else if (type.sign) + return LLVMConstSDiv(a, b); + else + return LLVMConstUDiv(a, b); + } if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4) return lp_build_mul(bld, a, lp_build_rcp(bld, b)); - return LLVMBuildFDiv(bld->builder, a, b, ""); + if (type.floating) + return LLVMBuildFDiv(bld->builder, a, b, ""); + else if (type.sign) + return LLVMBuildSDiv(bld->builder, a, b, ""); + else + return LLVMBuildUDiv(bld->builder, a, b, ""); } @@ -555,6 +638,10 @@ lp_build_lerp(struct lp_build_context *bld, LLVMValueRef delta; LLVMValueRef res; + assert(lp_check_value(bld->type, x)); + 
assert(lp_check_value(bld->type, v0)); + assert(lp_check_value(bld->type, v1)); + delta = lp_build_sub(bld, v1, v0); res = lp_build_mul(bld, x, delta); @@ -596,6 +683,9 @@ lp_build_min(struct lp_build_context *bld, LLVMValueRef a, LLVMValueRef b) { + assert(lp_check_value(bld->type, a)); + assert(lp_check_value(bld->type, b)); + if(a == bld->undef || b == bld->undef) return bld->undef; @@ -624,6 +714,9 @@ lp_build_max(struct lp_build_context *bld, LLVMValueRef a, LLVMValueRef b) { + assert(lp_check_value(bld->type, a)); + assert(lp_check_value(bld->type, b)); + if(a == bld->undef || b == bld->undef) return bld->undef; @@ -653,6 +746,10 @@ lp_build_clamp(struct lp_build_context *bld, LLVMValueRef min, LLVMValueRef max) { + assert(lp_check_value(bld->type, a)); + assert(lp_check_value(bld->type, min)); + assert(lp_check_value(bld->type, max)); + a = lp_build_min(bld, a, max); a = lp_build_max(bld, a, min); return a; @@ -669,6 +766,8 @@ lp_build_abs(struct lp_build_context *bld, const struct lp_type type = bld->type; LLVMTypeRef vec_type = lp_build_vec_type(type); + assert(lp_check_value(type, a)); + if(!type.sign) return a; @@ -702,7 +801,16 @@ LLVMValueRef lp_build_negate(struct lp_build_context *bld, LLVMValueRef a) { - return LLVMBuildNeg(bld->builder, a, ""); + assert(lp_check_value(bld->type, a)); + +#if HAVE_LLVM >= 0x0207 + if (bld->type.floating) + a = LLVMBuildFNeg(bld->builder, a, ""); + else +#endif + a = LLVMBuildNeg(bld->builder, a, ""); + + return a; } @@ -715,6 +823,8 @@ lp_build_sgn(struct lp_build_context *bld, LLVMValueRef cond; LLVMValueRef res; + assert(lp_check_value(type, a)); + /* Handle non-zero case */ if(!type.sign) { /* if not zero then sign must be positive */ @@ -773,6 +883,7 @@ lp_build_set_sign(struct lp_build_context *bld, LLVMValueRef val, res; assert(type.floating); + assert(lp_check_value(type, a)); /* val = reinterpret_cast<int>(a) */ val = LLVMBuildBitCast(bld->builder, a, int_vec_type, ""); @@ -1021,7 +1132,7 @@ 
lp_build_iround(struct lp_build_context *bld, half = LLVMBuildOr(bld->builder, sign, half, ""); half = LLVMBuildBitCast(bld->builder, half, vec_type, ""); - res = LLVMBuildAdd(bld->builder, a, half, ""); + res = LLVMBuildFAdd(bld->builder, a, half, ""); } res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, ""); @@ -1070,7 +1181,7 @@ lp_build_ifloor(struct lp_build_context *bld, offset = LLVMBuildAnd(bld->builder, offset, sign, ""); offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "ifloor.offset"); - res = LLVMBuildAdd(bld->builder, a, offset, "ifloor.res"); + res = LLVMBuildFAdd(bld->builder, a, offset, "ifloor.res"); } /* round to nearest (toward zero) */ @@ -1120,7 +1231,7 @@ lp_build_iceil(struct lp_build_context *bld, offset = LLVMBuildAnd(bld->builder, offset, sign, ""); offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "iceil.offset"); - res = LLVMBuildAdd(bld->builder, a, offset, "iceil.res"); + res = LLVMBuildFAdd(bld->builder, a, offset, "iceil.res"); } /* round to nearest (toward zero) */ @@ -1138,6 +1249,8 @@ lp_build_sqrt(struct lp_build_context *bld, LLVMTypeRef vec_type = lp_build_vec_type(type); char intrinsic[32]; + assert(lp_check_value(type, a)); + /* TODO: optimize the constant case */ /* TODO: optimize the constant case */ @@ -1148,12 +1261,39 @@ lp_build_sqrt(struct lp_build_context *bld, } +/** + * Do one Newton-Raphson step to improve reciprocate precision: + * + * x_{i+1} = x_i * (2 - a * x_i) + * + * See also: + * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division + * - http://softwarecommunity.intel.com/articles/eng/1818.htm + */ +static INLINE LLVMValueRef +lp_build_rcp_refine(struct lp_build_context *bld, + LLVMValueRef a, + LLVMValueRef rcp_a) +{ + LLVMValueRef two = lp_build_const_vec(bld->type, 2.0); + LLVMValueRef res; + + res = LLVMBuildFMul(bld->builder, a, rcp_a, ""); + res = LLVMBuildFSub(bld->builder, two, res, ""); + res = LLVMBuildFMul(bld->builder, rcp_a, res, ""); + + return 
res; +} + + LLVMValueRef lp_build_rcp(struct lp_build_context *bld, LLVMValueRef a) { const struct lp_type type = bld->type; + assert(lp_check_value(type, a)); + if(a == bld->zero) return bld->undef; if(a == bld->one) @@ -1167,32 +1307,16 @@ lp_build_rcp(struct lp_build_context *bld, return LLVMConstFDiv(bld->one, a); if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4) { - /* - * XXX: Added precision is not always necessary, so only enable this - * when we have a better system in place to track minimum precision. - */ - -#if 0 - /* - * Do one Newton-Raphson step to improve precision: - * - * x1 = (2 - a * rcp(a)) * rcp(a) - */ - - LLVMValueRef two = lp_build_const_vec(bld->type, 2.0); - LLVMValueRef rcp_a; LLVMValueRef res; + unsigned i; - rcp_a = lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", lp_build_vec_type(type), a); + res = lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", bld->vec_type, a); - res = LLVMBuildMul(bld->builder, a, rcp_a, ""); - res = LLVMBuildSub(bld->builder, two, res, ""); - res = LLVMBuildMul(bld->builder, res, rcp_a, ""); + for (i = 0; i < RCP_NEWTON_STEPS; ++i) { + res = lp_build_rcp_refine(bld, a, res); + } - return rcp_a; -#else - return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", lp_build_vec_type(type), a); -#endif + return res; } return LLVMBuildFDiv(bld->builder, bld->one, a, ""); @@ -1200,6 +1324,33 @@ lp_build_rcp(struct lp_build_context *bld, /** + * Do one Newton-Raphson step to improve rsqrt precision: + * + * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i) + * + * See also: + * - http://softwarecommunity.intel.com/articles/eng/1818.htm + */ +static INLINE LLVMValueRef +lp_build_rsqrt_refine(struct lp_build_context *bld, + LLVMValueRef a, + LLVMValueRef rsqrt_a) +{ + LLVMValueRef half = lp_build_const_vec(bld->type, 0.5); + LLVMValueRef three = lp_build_const_vec(bld->type, 3.0); + LLVMValueRef res; + + res = LLVMBuildFMul(bld->builder, rsqrt_a, rsqrt_a, ""); + res = 
LLVMBuildFMul(bld->builder, a, res, ""); + res = LLVMBuildFSub(bld->builder, three, res, ""); + res = LLVMBuildFMul(bld->builder, rsqrt_a, res, ""); + res = LLVMBuildFMul(bld->builder, half, res, ""); + + return res; +} + + +/** * Generate 1/sqrt(a) */ LLVMValueRef @@ -1208,10 +1359,22 @@ lp_build_rsqrt(struct lp_build_context *bld, { const struct lp_type type = bld->type; + assert(lp_check_value(type, a)); + assert(type.floating); - if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4) - return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rsqrt.ps", lp_build_vec_type(type), a); + if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4) { + LLVMValueRef res; + unsigned i; + + res = lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rsqrt.ps", bld->vec_type, a); + + for (i = 0; i < RSQRT_NEWTON_STEPS; ++i) { + res = lp_build_rsqrt_refine(bld, a, res); + } + + return res; + } return lp_build_rcp(bld, lp_build_sqrt(bld, a)); } @@ -1270,7 +1433,7 @@ lp_build_sin(struct lp_build_context *bld, */ LLVMValueRef FOPi = lp_build_const_v4sf(1.27323954473516); - LLVMValueRef scale_y = LLVMBuildMul(b, x_abs, FOPi, "scale_y"); + LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y"); /* * store the integer part of y in mm0 @@ -1344,9 +1507,9 @@ lp_build_sin(struct lp_build_context *bld, * xmm2 = _mm_mul_ps(y, xmm2); * xmm3 = _mm_mul_ps(y, xmm3); */ - LLVMValueRef xmm1 = LLVMBuildMul(b, y_2, DP1, "xmm1"); - LLVMValueRef xmm2 = LLVMBuildMul(b, y_2, DP2, "xmm2"); - LLVMValueRef xmm3 = LLVMBuildMul(b, y_2, DP3, "xmm3"); + LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1"); + LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2"); + LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3"); /* * x = _mm_add_ps(x, xmm1); @@ -1354,16 +1517,16 @@ lp_build_sin(struct lp_build_context *bld, * x = _mm_add_ps(x, xmm3); */ - LLVMValueRef x_1 = LLVMBuildAdd(b, x_abs, xmm1, "x_1"); - LLVMValueRef x_2 = LLVMBuildAdd(b, x_1, xmm2, "x_2"); - LLVMValueRef 
x_3 = LLVMBuildAdd(b, x_2, xmm3, "x_3"); + LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1"); + LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2"); + LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3"); /* * Evaluate the first polynom (0 <= x <= Pi/4) * * z = _mm_mul_ps(x,x); */ - LLVMValueRef z = LLVMBuildMul(b, x_3, x_3, "z"); + LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z"); /* * _PS_CONST(coscof_p0, 2.443315711809948E-005); @@ -1378,12 +1541,12 @@ lp_build_sin(struct lp_build_context *bld, * y = *(v4sf*)_ps_coscof_p0; * y = _mm_mul_ps(y, z); */ - LLVMValueRef y_3 = LLVMBuildMul(b, z, coscof_p0, "y_3"); - LLVMValueRef y_4 = LLVMBuildAdd(b, y_3, coscof_p1, "y_4"); - LLVMValueRef y_5 = LLVMBuildMul(b, y_4, z, "y_5"); - LLVMValueRef y_6 = LLVMBuildAdd(b, y_5, coscof_p2, "y_6"); - LLVMValueRef y_7 = LLVMBuildMul(b, y_6, z, "y_7"); - LLVMValueRef y_8 = LLVMBuildMul(b, y_7, z, "y_8"); + LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3"); + LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4"); + LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5"); + LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6"); + LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7"); + LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8"); /* @@ -1392,10 +1555,10 @@ lp_build_sin(struct lp_build_context *bld, * y = _mm_add_ps(y, *(v4sf*)_ps_1); */ LLVMValueRef half = lp_build_const_v4sf(0.5); - LLVMValueRef tmp = LLVMBuildMul(b, z, half, "tmp"); - LLVMValueRef y_9 = LLVMBuildSub(b, y_8, tmp, "y_8"); + LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp"); + LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8"); LLVMValueRef one = lp_build_const_v4sf(1.0); - LLVMValueRef y_10 = LLVMBuildAdd(b, y_9, one, "y_9"); + LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9"); /* * _PS_CONST(sincof_p0, -1.9515295891E-4); @@ -1419,13 +1582,13 @@ lp_build_sin(struct lp_build_context *bld, * y2 = _mm_add_ps(y2, x); */ - LLVMValueRef y2_3 = LLVMBuildMul(b, z, sincof_p0, "y2_3"); - 
LLVMValueRef y2_4 = LLVMBuildAdd(b, y2_3, sincof_p1, "y2_4"); - LLVMValueRef y2_5 = LLVMBuildMul(b, y2_4, z, "y2_5"); - LLVMValueRef y2_6 = LLVMBuildAdd(b, y2_5, sincof_p2, "y2_6"); - LLVMValueRef y2_7 = LLVMBuildMul(b, y2_6, z, "y2_7"); - LLVMValueRef y2_8 = LLVMBuildMul(b, y2_7, x_3, "y2_8"); - LLVMValueRef y2_9 = LLVMBuildAdd(b, y2_8, x_3, "y2_9"); + LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3"); + LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4"); + LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5"); + LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6"); + LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7"); + LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8"); + LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9"); /* * select the correct result from the two polynoms @@ -1481,7 +1644,7 @@ lp_build_cos(struct lp_build_context *bld, */ LLVMValueRef FOPi = lp_build_const_v4sf(1.27323954473516); - LLVMValueRef scale_y = LLVMBuildMul(b, x_abs, FOPi, "scale_y"); + LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y"); /* * store the integer part of y in mm0 @@ -1561,9 +1724,9 @@ lp_build_cos(struct lp_build_context *bld, * xmm2 = _mm_mul_ps(y, xmm2); * xmm3 = _mm_mul_ps(y, xmm3); */ - LLVMValueRef xmm1 = LLVMBuildMul(b, y_2, DP1, "xmm1"); - LLVMValueRef xmm2 = LLVMBuildMul(b, y_2, DP2, "xmm2"); - LLVMValueRef xmm3 = LLVMBuildMul(b, y_2, DP3, "xmm3"); + LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1"); + LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2"); + LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3"); /* * x = _mm_add_ps(x, xmm1); @@ -1571,16 +1734,16 @@ lp_build_cos(struct lp_build_context *bld, * x = _mm_add_ps(x, xmm3); */ - LLVMValueRef x_1 = LLVMBuildAdd(b, x_abs, xmm1, "x_1"); - LLVMValueRef x_2 = LLVMBuildAdd(b, x_1, xmm2, "x_2"); - LLVMValueRef x_3 = LLVMBuildAdd(b, x_2, xmm3, "x_3"); + LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1"); + LLVMValueRef x_2 = LLVMBuildFAdd(b, 
x_1, xmm2, "x_2"); + LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3"); /* * Evaluate the first polynom (0 <= x <= Pi/4) * * z = _mm_mul_ps(x,x); */ - LLVMValueRef z = LLVMBuildMul(b, x_3, x_3, "z"); + LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z"); /* * _PS_CONST(coscof_p0, 2.443315711809948E-005); @@ -1595,12 +1758,12 @@ lp_build_cos(struct lp_build_context *bld, * y = *(v4sf*)_ps_coscof_p0; * y = _mm_mul_ps(y, z); */ - LLVMValueRef y_3 = LLVMBuildMul(b, z, coscof_p0, "y_3"); - LLVMValueRef y_4 = LLVMBuildAdd(b, y_3, coscof_p1, "y_4"); - LLVMValueRef y_5 = LLVMBuildMul(b, y_4, z, "y_5"); - LLVMValueRef y_6 = LLVMBuildAdd(b, y_5, coscof_p2, "y_6"); - LLVMValueRef y_7 = LLVMBuildMul(b, y_6, z, "y_7"); - LLVMValueRef y_8 = LLVMBuildMul(b, y_7, z, "y_8"); + LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3"); + LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4"); + LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5"); + LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6"); + LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7"); + LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8"); /* @@ -1609,10 +1772,10 @@ lp_build_cos(struct lp_build_context *bld, * y = _mm_add_ps(y, *(v4sf*)_ps_1); */ LLVMValueRef half = lp_build_const_v4sf(0.5); - LLVMValueRef tmp = LLVMBuildMul(b, z, half, "tmp"); - LLVMValueRef y_9 = LLVMBuildSub(b, y_8, tmp, "y_8"); + LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp"); + LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8"); LLVMValueRef one = lp_build_const_v4sf(1.0); - LLVMValueRef y_10 = LLVMBuildAdd(b, y_9, one, "y_9"); + LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9"); /* * _PS_CONST(sincof_p0, -1.9515295891E-4); @@ -1636,13 +1799,13 @@ lp_build_cos(struct lp_build_context *bld, * y2 = _mm_add_ps(y2, x); */ - LLVMValueRef y2_3 = LLVMBuildMul(b, z, sincof_p0, "y2_3"); - LLVMValueRef y2_4 = LLVMBuildAdd(b, y2_3, sincof_p1, "y2_4"); - LLVMValueRef y2_5 = LLVMBuildMul(b, y2_4, z, "y2_5"); - LLVMValueRef y2_6 = 
LLVMBuildAdd(b, y2_5, sincof_p2, "y2_6"); - LLVMValueRef y2_7 = LLVMBuildMul(b, y2_6, z, "y2_7"); - LLVMValueRef y2_8 = LLVMBuildMul(b, y2_7, x_3, "y2_8"); - LLVMValueRef y2_9 = LLVMBuildAdd(b, y2_8, x_3, "y2_9"); + LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3"); + LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4"); + LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5"); + LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6"); + LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7"); + LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8"); + LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9"); /* * select the correct result from the two polynoms @@ -1695,6 +1858,8 @@ lp_build_exp(struct lp_build_context *bld, /* log2(e) = 1/log(2) */ LLVMValueRef log2e = lp_build_const_vec(bld->type, 1.4426950408889634); + assert(lp_check_value(bld->type, x)); + return lp_build_mul(bld, log2e, lp_build_exp2(bld, x)); } @@ -1709,14 +1874,12 @@ lp_build_log(struct lp_build_context *bld, /* log(2) */ LLVMValueRef log2 = lp_build_const_vec(bld->type, 0.69314718055994529); + assert(lp_check_value(bld->type, x)); + return lp_build_mul(bld, log2, lp_build_exp2(bld, x)); } -#define EXP_POLY_DEGREE 3 -#define LOG_POLY_DEGREE 5 - - /** * Generate polynomial. * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2]. 
@@ -1731,6 +1894,8 @@ lp_build_polynomial(struct lp_build_context *bld, LLVMValueRef res = NULL; unsigned i; + assert(lp_check_value(bld->type, x)); + /* TODO: optimize the constant case */ if(LLVMIsConstant(x)) debug_printf("%s: inefficient/imprecise constant arithmetic\n", @@ -1802,6 +1967,8 @@ lp_build_exp2_approx(struct lp_build_context *bld, LLVMValueRef expfpart = NULL; LLVMValueRef res = NULL; + assert(lp_check_value(bld->type, x)); + if(p_exp2_int_part || p_frac_part || p_exp2) { /* TODO: optimize the constant case */ if(LLVMIsConstant(x)) @@ -1817,7 +1984,7 @@ lp_build_exp2_approx(struct lp_build_context *bld, ipart = lp_build_floor(bld, x); /* fpart = x - ipart */ - fpart = LLVMBuildSub(bld->builder, x, ipart, ""); + fpart = LLVMBuildFSub(bld->builder, x, ipart, ""); } if(p_exp2_int_part || p_exp2) { @@ -1832,7 +1999,7 @@ lp_build_exp2_approx(struct lp_build_context *bld, expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial, Elements(lp_build_exp2_polynomial)); - res = LLVMBuildMul(bld->builder, expipart, expfpart, ""); + res = LLVMBuildFMul(bld->builder, expipart, expfpart, ""); } if(p_exp2_int_part) @@ -1915,6 +2082,8 @@ lp_build_log2_approx(struct lp_build_context *bld, LLVMValueRef logmant = NULL; LLVMValueRef res = NULL; + assert(lp_check_value(bld->type, x)); + if(p_exp || p_floor_log2 || p_log2) { /* TODO: optimize the constant case */ if(LLVMIsConstant(x)) @@ -1945,9 +2114,9 @@ lp_build_log2_approx(struct lp_build_context *bld, Elements(lp_build_log2_polynomial)); /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/ - logmant = LLVMBuildMul(bld->builder, logmant, LLVMBuildSub(bld->builder, mant, bld->one, ""), ""); + logmant = LLVMBuildFMul(bld->builder, logmant, LLVMBuildFSub(bld->builder, mant, bld->one, ""), ""); - res = LLVMBuildAdd(bld->builder, logmant, logexp, ""); + res = LLVMBuildFAdd(bld->builder, logmant, logexp, ""); } if(p_exp) { diff --git 
a/src/gallium/auxiliary/gallivm/lp_bld_conv.c b/src/gallium/auxiliary/gallivm/lp_bld_conv.c index 77012f1fac..8b477313d4 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c @@ -117,8 +117,8 @@ lp_build_clamped_float_to_unsigned_norm(LLVMBuilderRef builder, scale = (double)mask/ubound; bias = (double)((unsigned long long)1 << (mantissa - n)); - res = LLVMBuildMul(builder, src, lp_build_const_vec(src_type, scale), ""); - res = LLVMBuildAdd(builder, res, lp_build_const_vec(src_type, bias), ""); + res = LLVMBuildFMul(builder, src, lp_build_const_vec(src_type, scale), ""); + res = LLVMBuildFAdd(builder, res, lp_build_const_vec(src_type, bias), ""); res = LLVMBuildBitCast(builder, res, int_vec_type, ""); if(dst_width > n) { @@ -175,6 +175,8 @@ lp_build_unsigned_norm_to_float(LLVMBuilderRef builder, double scale; double bias; + assert(dst_type.floating); + mantissa = lp_mantissa(dst_type); n = MIN2(mantissa, src_width); @@ -199,8 +201,8 @@ lp_build_unsigned_norm_to_float(LLVMBuilderRef builder, res = LLVMBuildBitCast(builder, res, vec_type, ""); - res = LLVMBuildSub(builder, res, bias_, ""); - res = LLVMBuildMul(builder, res, lp_build_const_vec(dst_type, scale), ""); + res = LLVMBuildFSub(builder, res, bias_, ""); + res = LLVMBuildFMul(builder, res, lp_build_const_vec(dst_type, scale), ""); return res; } @@ -296,7 +298,7 @@ lp_build_conv(LLVMBuilderRef builder, if (dst_scale != 1.0) { LLVMValueRef scale = lp_build_const_vec(tmp_type, dst_scale); for(i = 0; i < num_tmps; ++i) - tmp[i] = LLVMBuildMul(builder, tmp[i], scale, ""); + tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, ""); } /* Use an equally sized integer for intermediate computations */ @@ -391,7 +393,7 @@ lp_build_conv(LLVMBuilderRef builder, if (src_scale != 1.0) { LLVMValueRef scale = lp_build_const_vec(tmp_type, 1.0/src_scale); for(i = 0; i < num_tmps; ++i) - tmp[i] = LLVMBuildMul(builder, tmp[i], scale, ""); + tmp[i] = LLVMBuildFMul(builder, tmp[i], 
scale, ""); } } } diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c index 0f01fc1d75..247cb83ce6 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c @@ -240,7 +240,7 @@ lp_build_unpack_arith_rgba_aos(LLVMBuilderRef builder, */ if (normalized) - scaled = LLVMBuildMul(builder, casted, LLVMConstVector(scales, 4), ""); + scaled = LLVMBuildFMul(builder, casted, LLVMConstVector(scales, 4), ""); else scaled = casted; @@ -322,7 +322,7 @@ lp_build_pack_rgba_aos(LLVMBuilderRef builder, } if (normalized) - scaled = LLVMBuildMul(builder, unswizzled, LLVMConstVector(scales, 4), ""); + scaled = LLVMBuildFMul(builder, unswizzled, LLVMConstVector(scales, 4), ""); else scaled = unswizzled; diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c index 9f405921b0..c724a4453e 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c @@ -197,7 +197,7 @@ lp_build_unpack_rgba_soa(LLVMBuilderRef builder, if (format_desc->channel[chan].normalized) { double scale = 1.0 / ((1 << (format_desc->channel[chan].size - 1)) - 1); LLVMValueRef scale_val = lp_build_const_vec(type, scale); - input = LLVMBuildMul(builder, input, scale_val, ""); + input = LLVMBuildFMul(builder, input, scale_val, ""); } } else { @@ -227,7 +227,7 @@ lp_build_unpack_rgba_soa(LLVMBuilderRef builder, double scale = 1.0 / ((1 << (format_desc->channel[chan].size/2)) - 1); LLVMValueRef scale_val = lp_build_const_vec(type, scale); input = LLVMBuildSIToFP(builder, input, lp_build_vec_type(type), ""); - input = LLVMBuildMul(builder, input, scale_val, ""); + input = LLVMBuildFMul(builder, input, scale_val, ""); } else { /* FIXME */ diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.c b/src/gallium/auxiliary/gallivm/lp_bld_init.c index ef0888079c..60d8bcfa55 100644 --- 
a/src/gallium/auxiliary/gallivm/lp_bld_init.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_init.c @@ -46,7 +46,7 @@ static const struct debug_named_value lp_bld_debug_flags[] = { DEBUG_NAMED_VALUE_END }; -DEBUG_GET_ONCE_FLAGS_OPTION(gallivm_debug, "GALLIVM_DEBUG", lp_bld_debug_flags, 0); +DEBUG_GET_ONCE_FLAGS_OPTION(gallivm_debug, "GALLIVM_DEBUG", lp_bld_debug_flags, 0) #endif diff --git a/src/gallium/auxiliary/gallivm/lp_bld_logic.c b/src/gallium/auxiliary/gallivm/lp_bld_logic.c index ab4ddb81c4..7d7db3b0d9 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_logic.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_logic.c @@ -83,6 +83,8 @@ lp_build_compare(LLVMBuilderRef builder, assert(func >= PIPE_FUNC_NEVER); assert(func <= PIPE_FUNC_ALWAYS); + assert(lp_check_value(type, a)); + assert(lp_check_value(type, b)); if(func == PIPE_FUNC_NEVER) return zeros; @@ -374,6 +376,9 @@ lp_build_select_bitwise(struct lp_build_context *bld, struct lp_type type = bld->type; LLVMValueRef res; + assert(lp_check_value(type, a)); + assert(lp_check_value(type, b)); + if (a == b) { return a; } @@ -419,6 +424,9 @@ lp_build_select(struct lp_build_context *bld, struct lp_type type = bld->type; LLVMValueRef res; + assert(lp_check_value(type, a)); + assert(lp_check_value(type, b)); + if(a == b) return a; @@ -484,6 +492,9 @@ lp_build_select_aos(struct lp_build_context *bld, const unsigned n = type.length; unsigned i, j; + assert(lp_check_value(type, a)); + assert(lp_check_value(type, b)); + if(a == b) return a; if(cond[0] && cond[1] && cond[2] && cond[3]) @@ -539,7 +550,22 @@ lp_build_select_aos(struct lp_build_context *bld, LLVMValueRef lp_build_andc(struct lp_build_context *bld, LLVMValueRef a, LLVMValueRef b) { + const struct lp_type type = bld->type; + + assert(lp_check_value(type, a)); + assert(lp_check_value(type, b)); + + /* can't do bitwise ops on floating-point values */ + if(type.floating) { + a = LLVMBuildBitCast(bld->builder, a, bld->int_vec_type, ""); + b = 
LLVMBuildBitCast(bld->builder, b, bld->int_vec_type, ""); + } + b = LLVMBuildNot(bld->builder, b, ""); b = LLVMBuildAnd(bld->builder, a, b, ""); + + if(type.floating) { + b = LLVMBuildBitCast(bld->builder, b, bld->vec_type, ""); + } return b; } diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.c b/src/gallium/auxiliary/gallivm/lp_bld_pack.c index 7748f8f099..b7b630f2e8 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_pack.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.c @@ -171,14 +171,13 @@ lp_build_unpack2(LLVMBuilderRef builder, msb = lp_build_zero(src_type); /* Interleave bits */ - if(util_cpu_caps.little_endian) { +#ifdef PIPE_ARCH_LITTLE_ENDIAN *dst_lo = lp_build_interleave2(builder, src_type, src, msb, 0); *dst_hi = lp_build_interleave2(builder, src_type, src, msb, 1); - } - else { +#else *dst_lo = lp_build_interleave2(builder, src_type, msb, src, 0); *dst_hi = lp_build_interleave2(builder, src_type, msb, src, 1); - } +#endif /* Cast the result into the new type (twice as wide) */ @@ -261,13 +260,14 @@ lp_build_pack2(LLVMBuilderRef builder, #endif LLVMTypeRef dst_vec_type = lp_build_vec_type(dst_type); LLVMValueRef shuffle; - LLVMValueRef res; + LLVMValueRef res = NULL; assert(!src_type.floating); assert(!dst_type.floating); assert(src_type.width == dst_type.width * 2); assert(src_type.length * 2 == dst_type.length); + /* Check for special cases first */ if(util_cpu_caps.has_sse2 && src_type.width * src_type.length == 128) { switch(src_type.width) { case 32: @@ -283,8 +283,8 @@ lp_build_pack2(LLVMBuilderRef builder, return lp_build_intrinsic_binary(builder, "llvm.x86.sse41.packusdw", dst_vec_type, lo, hi); } else { - assert(0); - return LLVMGetUndef(dst_vec_type); + /* use generic shuffle below */ + res = NULL; } } break; @@ -310,10 +310,13 @@ lp_build_pack2(LLVMBuilderRef builder, break; } - res = LLVMBuildBitCast(builder, res, dst_vec_type, ""); - return res; + if (res) { + res = LLVMBuildBitCast(builder, res, dst_vec_type, ""); + return res; + } 
} + /* generic shuffle */ lo = LLVMBuildBitCast(builder, lo, dst_vec_type, ""); hi = LLVMBuildBitCast(builder, hi, dst_vec_type, ""); diff --git a/src/gallium/auxiliary/gallivm/lp_bld_quad.c b/src/gallium/auxiliary/gallivm/lp_bld_quad.c index ca36046d22..7b1088939b 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_quad.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_quad.c @@ -85,7 +85,7 @@ lp_build_scalar_ddx(struct lp_build_context *bld, LLVMValueRef idx_right = LLVMConstInt(LLVMInt32Type(), LP_BLD_QUAD_TOP_RIGHT, 0); LLVMValueRef a_left = LLVMBuildExtractElement(bld->builder, a, idx_left, ""); LLVMValueRef a_right = LLVMBuildExtractElement(bld->builder, a, idx_right, ""); - return LLVMBuildSub(bld->builder, a_right, a_left, ""); + return lp_build_sub(bld, a_right, a_left); } @@ -97,5 +97,5 @@ lp_build_scalar_ddy(struct lp_build_context *bld, LLVMValueRef idx_bottom = LLVMConstInt(LLVMInt32Type(), LP_BLD_QUAD_BOTTOM_LEFT, 0); LLVMValueRef a_top = LLVMBuildExtractElement(bld->builder, a, idx_top, ""); LLVMValueRef a_bottom = LLVMBuildExtractElement(bld->builder, a, idx_bottom, ""); - return LLVMBuildSub(bld->builder, a_bottom, a_top, ""); + return lp_build_sub(bld, a_bottom, a_top); } diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c index 1a20d74cac..806c7d56a8 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c @@ -40,7 +40,6 @@ #include "util/u_memory.h" #include "util/u_math.h" #include "util/u_format.h" -#include "util/u_cpu_detect.h" #include "lp_bld_debug.h" #include "lp_bld_type.h" #include "lp_bld_const.h" @@ -811,7 +810,7 @@ lp_build_minify(struct lp_build_sample_context *bld, LLVMValueRef base_size, LLVMValueRef level) { - LLVMValueRef size = LLVMBuildAShr(bld->builder, base_size, level, "minify"); + LLVMValueRef size = LLVMBuildLShr(bld->builder, base_size, level, "minify"); size = lp_build_max(&bld->int_coord_bld, size, 
bld->int_coord_bld.one); return size; } @@ -888,17 +887,17 @@ lp_build_lod_selector(struct lp_build_sample_context *bld, /* Compute rho = max of all partial derivatives scaled by texture size. * XXX this could be vectorized somewhat */ - rho = LLVMBuildMul(bld->builder, + rho = LLVMBuildFMul(bld->builder, lp_build_max(float_bld, dsdx, dsdy), lp_build_int_to_float(float_bld, width), ""); if (dims > 1) { LLVMValueRef max; - max = LLVMBuildMul(bld->builder, + max = LLVMBuildFMul(bld->builder, lp_build_max(float_bld, dtdx, dtdy), lp_build_int_to_float(float_bld, height), ""); rho = lp_build_max(float_bld, rho, max); if (dims > 2) { - max = LLVMBuildMul(bld->builder, + max = LLVMBuildFMul(bld->builder, lp_build_max(float_bld, drdx, drdy), lp_build_int_to_float(float_bld, depth), ""); rho = lp_build_max(float_bld, rho, max); @@ -912,12 +911,12 @@ lp_build_lod_selector(struct lp_build_sample_context *bld, if (lod_bias) { lod_bias = LLVMBuildExtractElement(bld->builder, lod_bias, index0, ""); - lod = LLVMBuildAdd(bld->builder, lod, lod_bias, "shader_lod_bias"); + lod = LLVMBuildFAdd(bld->builder, lod, lod_bias, "shader_lod_bias"); } } /* add sampler lod bias */ - lod = LLVMBuildAdd(bld->builder, lod, sampler_lod_bias, "sampler_lod_bias"); + lod = LLVMBuildFAdd(bld->builder, lod, sampler_lod_bias, "sampler_lod_bias"); /* clamp lod */ lod = lp_build_clamp(float_bld, lod, min_lod, max_lod); @@ -1219,8 +1218,7 @@ lp_build_cube_ima(struct lp_build_context *coord_bld, LLVMValueRef coord) /* ima = -0.5 / abs(coord); */ LLVMValueRef negHalf = lp_build_const_vec(coord_bld->type, -0.5); LLVMValueRef absCoord = lp_build_abs(coord_bld, coord); - LLVMValueRef ima = lp_build_mul(coord_bld, negHalf, - lp_build_rcp(coord_bld, absCoord)); + LLVMValueRef ima = lp_build_div(coord_bld, negHalf, absCoord); return ima; } @@ -1841,7 +1839,11 @@ lp_build_sample_2d_linear_aos(struct lp_build_sample_context *bld, unsigned i, j; for(j = 0; j < h16.type.length; j += 4) { - unsigned subindex = 
util_cpu_caps.little_endian ? 0 : 1; +#ifdef PIPE_ARCH_LITTLE_ENDIAN + unsigned subindex = 0; +#else + unsigned subindex = 1; +#endif LLVMValueRef index; index = LLVMConstInt(elem_type, j/2 + subindex, 0); @@ -2029,6 +2031,8 @@ lp_build_sample_soa(LLVMBuilderRef builder, debug_printf("Sample from %s\n", util_format_name(fmt)); } + assert(type.floating); + /* Setup our build context */ memset(&bld, 0, sizeof bld); bld.builder = builder; diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c index 21236839fb..0aa64affac 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c @@ -489,7 +489,7 @@ get_indirect_offsets(struct lp_build_tgsi_soa_context *bld, int_vec_type, ""); /* addr_vec = addr_vec * 4 */ - addr_vec = lp_build_mul(&bld->base, addr_vec, vec4); + addr_vec = lp_build_mul(&bld->int_bld, addr_vec, vec4); return addr_vec; } @@ -533,7 +533,7 @@ emit_fetch( reg->Register.Index * 4 + swizzle); /* index_vec = index_vec + addr_vec */ - index_vec = lp_build_add(&bld->base, index_vec, addr_vec); + index_vec = lp_build_add(&bld->int_bld, index_vec, addr_vec); /* Gather values from the constant buffer */ res = build_gather(bld, bld->consts_ptr, index_vec); @@ -612,11 +612,9 @@ emit_fetch( case TGSI_UTIL_SIGN_SET: /* TODO: Use bitwese OR for floating point */ res = lp_build_abs( &bld->base, res ); - res = LLVMBuildNeg( bld->base.builder, res, "" ); - break; - + /* fall through */ case TGSI_UTIL_SIGN_TOGGLE: - res = LLVMBuildNeg( bld->base.builder, res, "" ); + res = lp_build_negate( &bld->base, res ); break; case TGSI_UTIL_SIGN_KEEP: @@ -773,7 +771,9 @@ emit_store( addr = LLVMBuildExtractElement(bld->base.builder, addr, LLVMConstInt(LLVMInt32Type(), 0, 0), ""); - addr = lp_build_mul(&bld->base, addr, LLVMConstInt(LLVMInt32Type(), 4, 0)); + addr = LLVMBuildMul(bld->base.builder, + addr, LLVMConstInt(LLVMInt32Type(), 4, 0), + ""); } switch( reg->Register.File 
) { diff --git a/src/gallium/auxiliary/rtasm/rtasm_cpu.c b/src/gallium/auxiliary/rtasm/rtasm_cpu.c index 2e15751e50..0461c81550 100644 --- a/src/gallium/auxiliary/rtasm/rtasm_cpu.c +++ b/src/gallium/auxiliary/rtasm/rtasm_cpu.c @@ -30,7 +30,7 @@ #include "rtasm_cpu.h" -#if defined(PIPE_ARCH_X86) +#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) static boolean rtasm_sse_enabled(void) { static boolean firsttime = 1; @@ -49,7 +49,7 @@ static boolean rtasm_sse_enabled(void) int rtasm_cpu_has_sse(void) { /* FIXME: actually detect this at run-time */ -#if defined(PIPE_ARCH_X86) +#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) return rtasm_sse_enabled(); #else return 0; @@ -59,7 +59,7 @@ int rtasm_cpu_has_sse(void) int rtasm_cpu_has_sse2(void) { /* FIXME: actually detect this at run-time */ -#if defined(PIPE_ARCH_X86) +#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) return rtasm_sse_enabled(); #else return 0; diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c index 9f70b73698..0fe6ebfcb4 100644 --- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c +++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c @@ -22,8 +22,9 @@ **************************************************************************/ #include "pipe/p_config.h" +#include "util/u_cpu_detect.h" -#if defined(PIPE_ARCH_X86) +#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) #include "pipe/p_compiler.h" #include "util/u_debug.h" @@ -231,6 +232,10 @@ static void emit_modrm( struct x86_function *p, assert(reg.mod == mod_REG); + /* TODO: support extended x86-64 registers */ + assert(reg.idx < 8); + assert(regmem.idx < 8); + val |= regmem.mod << 6; /* mod field */ val |= reg.idx << 3; /* reg field */ val |= regmem.idx; /* r/m field */ @@ -363,6 +368,12 @@ int x86_get_label( struct x86_function *p ) */ +void x64_rexw(struct x86_function *p) +{ + if(x86_target(p) != X86_32) + emit_1ub(p, 0x48); +} + void x86_jcc( struct x86_function *p, enum x86_cc cc, int 
label ) @@ -449,6 +460,52 @@ void x86_mov_reg_imm( struct x86_function *p, struct x86_reg dst, int imm ) emit_1i(p, imm); } +void x86_mov_imm( struct x86_function *p, struct x86_reg dst, int imm ) +{ + DUMP_RI( dst, imm ); + if(dst.mod == mod_REG) + x86_mov_reg_imm(p, dst, imm); + else + { + emit_1ub(p, 0xc7); + emit_modrm_noreg(p, 0, dst); + emit_1i(p, imm); + } +} + +void x86_mov16_imm( struct x86_function *p, struct x86_reg dst, uint16_t imm ) +{ + DUMP_RI( dst, imm ); + emit_1ub(p, 0x66); + if(dst.mod == mod_REG) + { + emit_1ub(p, 0xb8 + dst.idx); + emit_2ub(p, imm & 0xff, imm >> 8); + } + else + { + emit_1ub(p, 0xc7); + emit_modrm_noreg(p, 0, dst); + emit_2ub(p, imm & 0xff, imm >> 8); + } +} + +void x86_mov8_imm( struct x86_function *p, struct x86_reg dst, uint8_t imm ) +{ + DUMP_RI( dst, imm ); + if(dst.mod == mod_REG) + { + emit_1ub(p, 0xb0 + dst.idx); + emit_1ub(p, imm); + } + else + { + emit_1ub(p, 0xc6); + emit_modrm_noreg(p, 0, dst); + emit_1ub(p, imm); + } +} + /** * Immediate group 1 instructions. 
*/ @@ -520,7 +577,7 @@ void x86_push( struct x86_function *p, } - p->stack_offset += 4; + p->stack_offset += sizeof(void*); } void x86_push_imm32( struct x86_function *p, @@ -530,7 +587,7 @@ void x86_push_imm32( struct x86_function *p, emit_1ub(p, 0x68); emit_1i(p, imm32); - p->stack_offset += 4; + p->stack_offset += sizeof(void*); } @@ -540,23 +597,33 @@ void x86_pop( struct x86_function *p, DUMP_R( reg ); assert(reg.mod == mod_REG); emit_1ub(p, 0x58 + reg.idx); - p->stack_offset -= 4; + p->stack_offset -= sizeof(void*); } void x86_inc( struct x86_function *p, struct x86_reg reg ) { DUMP_R( reg ); - assert(reg.mod == mod_REG); - emit_1ub(p, 0x40 + reg.idx); + if(x86_target(p) == X86_32 && reg.mod == mod_REG) + { + emit_1ub(p, 0x40 + reg.idx); + return; + } + emit_1ub(p, 0xff); + emit_modrm_noreg(p, 0, reg); } void x86_dec( struct x86_function *p, struct x86_reg reg ) { DUMP_R( reg ); - assert(reg.mod == mod_REG); - emit_1ub(p, 0x48 + reg.idx); + if(x86_target(p) == X86_32 && reg.mod == mod_REG) + { + emit_1ub(p, 0x48 + reg.idx); + return; + } + emit_1ub(p, 0xff); + emit_modrm_noreg(p, 1, reg); } void x86_ret( struct x86_function *p ) @@ -583,9 +650,82 @@ void x86_mov( struct x86_function *p, struct x86_reg src ) { DUMP_RR( dst, src ); + /* special hack for reading arguments until we support x86-64 registers everywhere */ + if(src.mod == mod_REG && dst.mod == mod_REG && (src.idx >= 8 || dst.idx >= 8)) + { + uint8_t rex = 0x40; + if(dst.idx >= 8) + { + rex |= 4; + dst.idx -= 8; + } + if(src.idx >= 8) + { + rex |= 1; + src.idx -= 8; + } + emit_1ub(p, rex); + } + emit_op_modrm( p, 0x8b, 0x89, dst, src ); +} + +void x86_mov16( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_1ub(p, 0x66); + emit_op_modrm( p, 0x8b, 0x89, dst, src ); +} + +void x86_mov8( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_op_modrm( p, 0x8a, 0x88, dst, src ); +} + +void x64_mov64( struct 
x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + uint8_t rex = 0x48; + DUMP_RR( dst, src ); + assert(x86_target(p) != X86_32); + + /* special hack for reading arguments until we support x86-64 registers everywhere */ + if(src.mod == mod_REG && dst.mod == mod_REG && (src.idx >= 8 || dst.idx >= 8)) + { + if(dst.idx >= 8) + { + rex |= 4; + dst.idx -= 8; + } + if(src.idx >= 8) + { + rex |= 1; + src.idx -= 8; + } + } + emit_1ub(p, rex); emit_op_modrm( p, 0x8b, 0x89, dst, src ); } +void x86_movzx8(struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_2ub(p, 0x0f, 0xb6); + emit_modrm(p, dst, src); +} + +void x86_movzx16(struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_2ub(p, 0x0f, 0xb7); + emit_modrm(p, dst, src); +} + void x86_xor( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) @@ -680,6 +820,61 @@ void x86_div( struct x86_function *p, emit_op_modrm(p, 0xf7, 0, x86_make_reg(file_REG32, 6), src); } +void x86_bswap( struct x86_function *p, struct x86_reg reg ) +{ + DUMP_R(reg); + assert(reg.file == file_REG32); + assert(reg.mod == mod_REG); + emit_2ub(p, 0x0f, 0xc8 + reg.idx); +} + +void x86_shr_imm( struct x86_function *p, struct x86_reg reg, unsigned imm ) +{ + DUMP_RI(reg, imm); + if(imm == 1) + { + emit_1ub(p, 0xd1); + emit_modrm_noreg(p, 5, reg); + } + else + { + emit_1ub(p, 0xc1); + emit_modrm_noreg(p, 5, reg); + emit_1ub(p, imm); + } +} + +void x86_sar_imm( struct x86_function *p, struct x86_reg reg, unsigned imm ) +{ + DUMP_RI(reg, imm); + if(imm == 1) + { + emit_1ub(p, 0xd1); + emit_modrm_noreg(p, 7, reg); + } + else + { + emit_1ub(p, 0xc1); + emit_modrm_noreg(p, 7, reg); + emit_1ub(p, imm); + } +} + +void x86_shl_imm( struct x86_function *p, struct x86_reg reg, unsigned imm ) +{ + DUMP_RI(reg, imm); + if(imm == 1) + { + emit_1ub(p, 0xd1); + emit_modrm_noreg(p, 4, reg); + } + else + { + emit_1ub(p, 0xc1); + emit_modrm_noreg(p, 4, reg); + 
emit_1ub(p, imm); + } +} /*********************************************************************** @@ -1013,6 +1208,77 @@ void sse_movmskps( struct x86_function *p, * SSE2 instructions */ +void sse2_movd( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR(dst, src); + emit_2ub(p, 0x66, 0x0f); + if(dst.mod == mod_REG && dst.file == file_REG32) + { + emit_1ub(p, 0x7e); + emit_modrm(p, src, dst); + } + else + { + emit_op_modrm(p, 0x6e, 0x7e, dst, src); + } +} + +void sse2_movq( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR(dst, src); + switch (dst.mod) { + case mod_REG: + emit_3ub(p, 0xf3, 0x0f, 0x7e); + emit_modrm(p, dst, src); + break; + case mod_INDIRECT: + case mod_DISP32: + case mod_DISP8: + assert(src.mod == mod_REG); + emit_3ub(p, 0x66, 0x0f, 0xd6); + emit_modrm(p, src, dst); + break; + default: + assert(0); + break; + } +} + +void sse2_movdqu( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR(dst, src); + emit_2ub(p, 0xf3, 0x0f); + emit_op_modrm(p, 0x6f, 0x7f, dst, src); +} + +void sse2_movdqa( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR(dst, src); + emit_2ub(p, 0x66, 0x0f); + emit_op_modrm(p, 0x6f, 0x7f, dst, src); +} + +void sse2_movsd( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR(dst, src); + emit_2ub(p, 0xf2, 0x0f); + emit_op_modrm(p, 0x10, 0x11, dst, src); +} + +void sse2_movupd( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR(dst, src); + emit_2ub(p, 0x66, 0x0f); + emit_op_modrm(p, 0x10, 0x11, dst, src); +} + +void sse2_movapd( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR(dst, src); + emit_2ub(p, 0x66, 0x0f); + emit_op_modrm(p, 0x28, 0x29, dst, src); +} + /** * Perform a reduced swizzle: */ @@ -1027,6 +1293,28 @@ void sse2_pshufd( struct x86_function *p, emit_1ub(p, shuf); } +void sse2_pshuflw( struct x86_function *p, + struct x86_reg dst, + 
struct x86_reg src, + unsigned char shuf) +{ + DUMP_RRI( dst, src, shuf ); + emit_3ub(p, 0xf2, X86_TWOB, 0x70); + emit_modrm(p, dst, src); + emit_1ub(p, shuf); +} + +void sse2_pshufhw( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src, + unsigned char shuf) +{ + DUMP_RRI( dst, src, shuf ); + emit_3ub(p, 0xf3, X86_TWOB, 0x70); + emit_modrm(p, dst, src); + emit_1ub(p, shuf); +} + void sse2_cvttps2dq( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) @@ -1045,6 +1333,24 @@ void sse2_cvtps2dq( struct x86_function *p, emit_modrm( p, dst, src ); } +void sse2_cvtsd2ss( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_3ub(p, 0xf2, 0x0f, 0x5a); + emit_modrm( p, dst, src ); +} + +void sse2_cvtpd2ps( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_3ub(p, 0x66, 0x0f, 0x5a); + emit_modrm( p, dst, src ); +} + void sse2_packssdw( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) @@ -1081,6 +1387,97 @@ void sse2_punpcklbw( struct x86_function *p, emit_modrm( p, dst, src ); } +void sse2_punpcklwd( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_3ub(p, 0x66, 0x0f, 0x61); + emit_modrm( p, dst, src ); +} + +void sse2_punpckldq( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_3ub(p, 0x66, 0x0f, 0x62); + emit_modrm( p, dst, src ); +} + +void sse2_punpcklqdq( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_3ub(p, 0x66, 0x0f, 0x6c); + emit_modrm( p, dst, src ); +} + +void sse2_psllw_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ) +{ + DUMP_RI(dst, imm); + emit_3ub(p, 0x66, 0x0f, 0x71); + emit_modrm_noreg(p, 6, dst); + emit_1ub(p, imm); +} + +void sse2_pslld_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ) +{ + DUMP_RI(dst, imm); + emit_3ub(p, 0x66, 0x0f, 
0x72); + emit_modrm_noreg(p, 6, dst); + emit_1ub(p, imm); +} + +void sse2_psllq_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ) +{ + DUMP_RI(dst, imm); + emit_3ub(p, 0x66, 0x0f, 0x73); + emit_modrm_noreg(p, 6, dst); + emit_1ub(p, imm); +} + +void sse2_psrlw_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ) +{ + DUMP_RI(dst, imm); + emit_3ub(p, 0x66, 0x0f, 0x71); + emit_modrm_noreg(p, 2, dst); + emit_1ub(p, imm); +} + +void sse2_psrld_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ) +{ + DUMP_RI(dst, imm); + emit_3ub(p, 0x66, 0x0f, 0x72); + emit_modrm_noreg(p, 2, dst); + emit_1ub(p, imm); +} + +void sse2_psrlq_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ) +{ + DUMP_RI(dst, imm); + emit_3ub(p, 0x66, 0x0f, 0x73); + emit_modrm_noreg(p, 2, dst); + emit_1ub(p, imm); +} + +void sse2_psraw_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ) +{ + DUMP_RI(dst, imm); + emit_3ub(p, 0x66, 0x0f, 0x71); + emit_modrm_noreg(p, 4, dst); + emit_1ub(p, imm); +} + +void sse2_psrad_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ) +{ + DUMP_RI(dst, imm); + emit_3ub(p, 0x66, 0x0f, 0x72); + emit_modrm_noreg(p, 4, dst); + emit_1ub(p, imm); +} + +void sse2_por( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR(dst, src); + emit_3ub(p, 0x66, 0x0f, 0xeb); + emit_modrm(p, dst, src); +} void sse2_rcpps( struct x86_function *p, struct x86_reg dst, @@ -1100,18 +1497,6 @@ void sse2_rcpss( struct x86_function *p, emit_modrm( p, dst, src ); } -void sse2_movd( struct x86_function *p, - struct x86_reg dst, - struct x86_reg src ) -{ - DUMP_RR( dst, src ); - emit_2ub(p, 0x66, X86_TWOB); - emit_op_modrm( p, 0x6e, 0x7e, dst, src ); -} - - - - /*********************************************************************** * x87 instructions */ @@ -1702,23 +2087,79 @@ void x86_cdecl_caller_pop_regs( struct x86_function *p ) } -/* Retreive a reference to one of the function arguments, taking into - * 
account any push/pop activity: - */ struct x86_reg x86_fn_arg( struct x86_function *p, - unsigned arg ) + unsigned arg ) { - return x86_make_disp(x86_make_reg(file_REG32, reg_SP), + switch(x86_target(p)) + { + case X86_64_WIN64_ABI: + /* Microsoft uses a different calling convention than the rest of the world */ + switch(arg) + { + case 1: + return x86_make_reg(file_REG32, reg_CX); + case 2: + return x86_make_reg(file_REG32, reg_DX); + case 3: + return x86_make_reg(file_REG32, reg_R8); + case 4: + return x86_make_reg(file_REG32, reg_R9); + default: + return x86_make_disp(x86_make_reg(file_REG32, reg_SP), + p->stack_offset + (arg - 4) * 8); /* ??? */ + } + case X86_64_STD_ABI: + switch(arg) + { + case 1: + return x86_make_reg(file_REG32, reg_DI); + case 2: + return x86_make_reg(file_REG32, reg_SI); + case 3: + return x86_make_reg(file_REG32, reg_DX); + case 4: + return x86_make_reg(file_REG32, reg_CX); + case 5: + return x86_make_reg(file_REG32, reg_R8); + case 6: + return x86_make_reg(file_REG32, reg_R9); + default: + return x86_make_disp(x86_make_reg(file_REG32, reg_SP), + p->stack_offset + (arg - 6) * 8); /* ??? */ + } + case X86_32: + return x86_make_disp(x86_make_reg(file_REG32, reg_SP), p->stack_offset + arg * 4); /* ??? 
*/ + default: + abort(); + } } +static void x86_init_func_common( struct x86_function *p ) +{ + util_cpu_detect(); + p->caps = 0; + if(util_cpu_caps.has_mmx) + p->caps |= X86_MMX; + if(util_cpu_caps.has_mmx2) + p->caps |= X86_MMX2; + if(util_cpu_caps.has_sse) + p->caps |= X86_SSE; + if(util_cpu_caps.has_sse2) + p->caps |= X86_SSE2; + if(util_cpu_caps.has_sse3) + p->caps |= X86_SSE3; + if(util_cpu_caps.has_sse4_1) + p->caps |= X86_SSE4_1; + p->csr = p->store; + DUMP_START(); +} void x86_init_func( struct x86_function *p ) { p->size = 0; p->store = NULL; - p->csr = p->store; - DUMP_START(); + x86_init_func_common(p); } void x86_init_func_size( struct x86_function *p, unsigned code_size ) @@ -1728,8 +2169,7 @@ void x86_init_func_size( struct x86_function *p, unsigned code_size ) if (p->store == NULL) { p->store = p->error_overflow; } - p->csr = p->store; - DUMP_START(); + x86_init_func_common(p); } void x86_release_func( struct x86_function *p ) diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h index 6208e8f707..aa77892b2d 100644 --- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h +++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h @@ -26,20 +26,28 @@ #include "pipe/p_config.h" -#if defined(PIPE_ARCH_X86) +#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) /* It is up to the caller to ensure that instructions issued are * suitable for the host cpu. There are no checks made in this module * for mmx/sse/sse2 support on the cpu. */ struct x86_reg { - unsigned file:3; - unsigned idx:3; + unsigned file:2; + unsigned idx:4; unsigned mod:2; /* mod_REG if this is just a register */ int disp:24; /* only +/- 23bits of offset - should be enough... 
*/ }; +#define X86_MMX 1 +#define X86_MMX2 2 +#define X86_SSE 4 +#define X86_SSE2 8 +#define X86_SSE3 0x10 +#define X86_SSE4_1 0x20 + struct x86_function { + unsigned caps; unsigned size; unsigned char *store; unsigned char *csr; @@ -75,7 +83,15 @@ enum x86_reg_name { reg_SP, reg_BP, reg_SI, - reg_DI + reg_DI, + reg_R8, + reg_R9, + reg_R10, + reg_R11, + reg_R12, + reg_R13, + reg_R14, + reg_R15 }; @@ -110,6 +126,29 @@ typedef void (*x86_func)(void); /* Begin/end/retrieve function creation: */ +enum x86_target +{ + X86_32, + X86_64_STD_ABI, + X86_64_WIN64_ABI +}; + +/* make this read a member of x86_function if target != host is desired */ +static INLINE enum x86_target x86_target( struct x86_function* p ) +{ +#ifdef PIPE_ARCH_X86 + return X86_32; +#elif defined(_WIN64) + return X86_64_WIN64_ABI; +#elif defined(PIPE_ARCH_X86_64) + return X86_64_STD_ABI; +#endif +} + +static INLINE unsigned x86_target_caps( struct x86_function* p ) +{ + return p->caps; +} void x86_init_func( struct x86_function *p ); void x86_init_func_size( struct x86_function *p, unsigned code_size ); @@ -138,6 +177,8 @@ struct x86_reg x86_get_base_reg( struct x86_reg reg ); */ int x86_get_label( struct x86_function *p ); +void x64_rexw(struct x86_function *p); + void x86_jcc( struct x86_function *p, enum x86_cc cc, int label ); @@ -178,18 +219,54 @@ void mmx_movq( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void mmx_packssdw( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void mmx_packuswb( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_movd( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_movq( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_movdqu( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_movdqa( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_movsd( struct x86_function *p, struct x86_reg dst, struct 
x86_reg src ); +void sse2_movupd( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_movapd( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); + void sse2_cvtps2dq( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void sse2_cvttps2dq( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void sse2_cvtdq2ps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_cvtsd2ss( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_cvtpd2ps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); + void sse2_movd( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void sse2_packssdw( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void sse2_packsswb( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void sse2_packuswb( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void sse2_pshufd( struct x86_function *p, struct x86_reg dest, struct x86_reg arg0, unsigned char shuf ); +void sse2_pshuflw( struct x86_function *p, struct x86_reg dest, struct x86_reg arg0, + unsigned char shuf ); +void sse2_pshufhw( struct x86_function *p, struct x86_reg dest, struct x86_reg arg0, + unsigned char shuf ); void sse2_rcpps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void sse2_rcpss( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_punpcklbw( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_punpcklwd( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_punpckldq( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_punpcklqdq( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); + +void sse2_psllw_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ); +void sse2_pslld_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ); +void sse2_psllq_imm( 
struct x86_function *p, struct x86_reg dst, unsigned imm ); + +void sse2_psrlw_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ); +void sse2_psrld_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ); +void sse2_psrlq_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ); + +void sse2_psraw_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ); +void sse2_psrad_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ); + +void sse2_por( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); + +void sse2_pshuflw( struct x86_function *p, struct x86_reg dst, struct x86_reg src, uint8_t imm ); +void sse2_pshufhw( struct x86_function *p, struct x86_reg dst, struct x86_reg src, uint8_t imm ); +void sse2_pshufd( struct x86_function *p, struct x86_reg dst, struct x86_reg src, uint8_t imm ); void sse_prefetchnta( struct x86_function *p, struct x86_reg ptr); void sse_prefetch0( struct x86_function *p, struct x86_reg ptr); @@ -227,7 +304,6 @@ void sse_shufps( struct x86_function *p, struct x86_reg dest, struct x86_reg arg void sse_unpckhps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void sse_unpcklps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void sse_pmovmskb( struct x86_function *p, struct x86_reg dest, struct x86_reg src ); -void sse2_punpcklbw( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void sse_movmskps( struct x86_function *p, struct x86_reg dst, struct x86_reg src); void x86_add( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); @@ -237,6 +313,14 @@ void x86_dec( struct x86_function *p, struct x86_reg reg ); void x86_inc( struct x86_function *p, struct x86_reg reg ); void x86_lea( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void x86_mov( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void x64_mov64( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void x86_mov8( 
struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void x86_mov16( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void x86_movzx8(struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void x86_movzx16(struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void x86_mov_imm(struct x86_function *p, struct x86_reg dst, int imm ); +void x86_mov8_imm(struct x86_function *p, struct x86_reg dst, uint8_t imm ); +void x86_mov16_imm(struct x86_function *p, struct x86_reg dst, uint16_t imm ); void x86_mul( struct x86_function *p, struct x86_reg src ); void x86_imul( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void x86_or( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); @@ -250,7 +334,10 @@ void x86_test( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void x86_xor( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void x86_sahf( struct x86_function *p ); void x86_div( struct x86_function *p, struct x86_reg src ); - +void x86_bswap( struct x86_function *p, struct x86_reg src ); +void x86_shr_imm( struct x86_function *p, struct x86_reg reg, unsigned imm ); +void x86_sar_imm( struct x86_function *p, struct x86_reg reg, unsigned imm ); +void x86_shl_imm( struct x86_function *p, struct x86_reg reg, unsigned imm ); void x86_cdecl_caller_push_regs( struct x86_function *p ); void x86_cdecl_caller_pop_regs( struct x86_function *p ); diff --git a/src/gallium/auxiliary/tgsi/tgsi_sanity.c b/src/gallium/auxiliary/tgsi/tgsi_sanity.c index 9e02040f6c..acbff103ef 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_sanity.c +++ b/src/gallium/auxiliary/tgsi/tgsi_sanity.c @@ -34,7 +34,7 @@ #include "tgsi_iterate.h" -DEBUG_GET_ONCE_BOOL_OPTION(print_sanity, "TGSI_PRINT_SANITY", TRUE); +DEBUG_GET_ONCE_BOOL_OPTION(print_sanity, "TGSI_PRINT_SANITY", FALSE) typedef struct { diff --git a/src/gallium/auxiliary/tgsi/tgsi_sanity.h b/src/gallium/auxiliary/tgsi/tgsi_sanity.h index 
46d8d18419..73f0f414e3 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_sanity.h +++ b/src/gallium/auxiliary/tgsi/tgsi_sanity.h @@ -36,7 +36,7 @@ extern "C" { /* Check the given token stream for errors and common mistakes. * Diagnostic messages are printed out to the debug output, and is - * controlled by the debug option TGSI_PRINT_SANITY (default true). + * controlled by the debug option TGSI_PRINT_SANITY (default false). * Returns TRUE if there are no errors, even though there could be some warnings. */ boolean diff --git a/src/gallium/auxiliary/translate/translate.c b/src/gallium/auxiliary/translate/translate.c index a9b7253bf4..03a7f050aa 100644 --- a/src/gallium/auxiliary/translate/translate.c +++ b/src/gallium/auxiliary/translate/translate.c @@ -38,7 +38,8 @@ struct translate *translate_create( const struct translate_key *key ) { struct translate *translate = NULL; -#if defined(PIPE_ARCH_X86) +/* TODO: enable Win64 once it has actually been tested */ +#if defined(PIPE_ARCH_X86) || (defined(PIPE_ARCH_X86_64) && !defined(_WIN64)) translate = translate_sse2_create( key ); if (translate) return translate; @@ -48,3 +49,8 @@ struct translate *translate_create( const struct translate_key *key ) return translate_generic_create( key ); } + +boolean translate_is_output_format_supported(enum pipe_format format) +{ + return translate_generic_is_output_format_supported(format); +} diff --git a/src/gallium/auxiliary/translate/translate.h b/src/gallium/auxiliary/translate/translate.h index edd95e0788..a75380228b 100644 --- a/src/gallium/auxiliary/translate/translate.h +++ b/src/gallium/auxiliary/translate/translate.h @@ -85,6 +85,18 @@ struct translate { unsigned instance_id, void *output_buffer); + void (PIPE_CDECL *run_elts16)( struct translate *, + const uint16_t *elts, + unsigned count, + unsigned instance_id, + void *output_buffer); + + void (PIPE_CDECL *run_elts8)( struct translate *, + const uint8_t *elts, + unsigned count, + unsigned instance_id, + void 
*output_buffer); + void (PIPE_CDECL *run)( struct translate *, unsigned start, unsigned count, @@ -105,6 +117,8 @@ struct translate *translate_lookup_or_create( struct translate_context *tctx, struct translate *translate_create( const struct translate_key *key ); +boolean translate_is_output_format_supported(enum pipe_format format); + static INLINE int translate_keysize( const struct translate_key *key ) { return 2 * sizeof(int) + key->nr_elements * sizeof(struct translate_element); @@ -138,5 +152,6 @@ struct translate *translate_sse2_create( const struct translate_key *key ); struct translate *translate_generic_create( const struct translate_key *key ); +boolean translate_generic_is_output_format_supported(enum pipe_format format); #endif diff --git a/src/gallium/auxiliary/translate/translate_generic.c b/src/gallium/auxiliary/translate/translate_generic.c index 4d1977229e..ad809db720 100644 --- a/src/gallium/auxiliary/translate/translate_generic.c +++ b/src/gallium/auxiliary/translate/translate_generic.c @@ -64,6 +64,14 @@ struct translate_generic { unsigned input_stride; unsigned max_index; + /* this value is set to -1 if this is a normal element with output_format != input_format: + * in this case, u_format is used to do a full conversion + * + * this value is set to the format size in bytes if output_format == input_format or for 32-bit instance ids: + * in this case, memcpy is used to copy this amount of bytes + */ + int copy_size; + } attrib[PIPE_MAX_ATTRIBS]; unsigned nr_attrib; @@ -187,9 +195,15 @@ ATTRIB( R8G8B8_SNORM, 3, char, TO_8_SNORM ) ATTRIB( R8G8_SNORM, 2, char, TO_8_SNORM ) ATTRIB( R8_SNORM, 1, char, TO_8_SNORM ) -ATTRIB( A8R8G8B8_UNORM, 4, ubyte, TO_8_UNORM ) -/*ATTRIB( R8G8B8A8_UNORM, 4, ubyte, TO_8_UNORM )*/ - +static void +emit_A8R8G8B8_UNORM( const float *attrib, void *ptr) +{ + ubyte *out = (ubyte *)ptr; + out[0] = TO_8_UNORM(attrib[3]); + out[1] = TO_8_UNORM(attrib[0]); + out[2] = TO_8_UNORM(attrib[1]); + out[3] = TO_8_UNORM(attrib[2]); +} 
static void emit_B8G8R8A8_UNORM( const float *attrib, void *ptr) @@ -348,7 +362,65 @@ static emit_func get_emit_func( enum pipe_format format ) } } +/* Translate every attribute of one output vertex: attributes with copy_size >= 0 are memcpy'd straight into the vertex, all others go through fetch()/emit(). */ +static ALWAYS_INLINE void PIPE_CDECL generic_run_one( struct translate_generic *tg, + unsigned elt, + unsigned instance_id, + void *vert ) +{ + unsigned nr_attrs = tg->nr_attrib; + unsigned attr; + + for (attr = 0; attr < nr_attrs; attr++) { + float data[4]; + uint8_t *dst = (uint8_t *)vert + tg->attrib[attr].output_offset; + if (tg->attrib[attr].type == TRANSLATE_ELEMENT_NORMAL) { + const uint8_t *src; + unsigned index; + int copy_size; + + if (tg->attrib[attr].instance_divisor) { + index = instance_id / tg->attrib[attr].instance_divisor; + } + else { + index = elt; + } + + /* clamp to avoid going out of bounds */ + index = MIN2(index, tg->attrib[attr].max_index); + + src = tg->attrib[attr].input_ptr + + tg->attrib[attr].input_stride * index; + + copy_size = tg->attrib[attr].copy_size; + if(likely(copy_size >= 0)) + memcpy(dst, src, copy_size); + else + { + tg->attrib[attr].fetch( data, src, 0, 0 ); + + if (0) + debug_printf("Fetch linear attr %d from %p stride %d index %d: " + " %f, %f, %f, %f \n", + attr, + tg->attrib[attr].input_ptr, + tg->attrib[attr].input_stride, + index, + data[0], data[1],data[2], data[3]); + + tg->attrib[attr].emit( data, dst ); + } + } else { + if(likely(tg->attrib[attr].copy_size >= 0)) + memcpy(dst, &instance_id, 4); /* raw 32-bit instance id goes into the output vertex, not the scratch array */ + else + { + data[0] = (float)instance_id; + tg->attrib[attr].emit( data, dst ); + } + } + } +} /** * Fetch vertex attributes for 'count' vertices. 
@@ -361,62 +433,45 @@ static void PIPE_CDECL generic_run_elts( struct translate *translate, { struct translate_generic *tg = translate_generic(translate); char *vert = output_buffer; - unsigned nr_attrs = tg->nr_attrib; - unsigned attr; unsigned i; - /* loop over vertex attributes (vertex shader inputs) - */ for (i = 0; i < count; i++) { - const unsigned elt = *elts++; - - for (attr = 0; attr < nr_attrs; attr++) { - float data[4]; - char *dst = vert + tg->attrib[attr].output_offset; - - if (tg->attrib[attr].type == TRANSLATE_ELEMENT_NORMAL) { - const uint8_t *src; - unsigned index; - - if (tg->attrib[attr].instance_divisor) { - index = instance_id / tg->attrib[attr].instance_divisor; - } else { - index = elt; - } - - /* clamp to void going out of bounds */ - index = MIN2(index, tg->attrib[attr].max_index); - - src = tg->attrib[attr].input_ptr + - tg->attrib[attr].input_stride * index; - - tg->attrib[attr].fetch( data, src, 0, 0 ); - - if (0) - debug_printf("Fetch elt attr %d from %p stride %d div %u max %u index %d: " - " %f, %f, %f, %f \n", - attr, - tg->attrib[attr].input_ptr, - tg->attrib[attr].input_stride, - tg->attrib[attr].instance_divisor, - tg->attrib[attr].max_index, - index, - data[0], data[1],data[2], data[3]); - } else { - data[0] = (float)instance_id; - } + generic_run_one(tg, *elts++, instance_id, vert); + vert += tg->translate.key.output_stride; + } +} - if (0) - debug_printf("vert %d/%d attr %d: %f %f %f %f\n", - i, elt, attr, data[0], data[1], data[2], data[3]); +static void PIPE_CDECL generic_run_elts16( struct translate *translate, + const uint16_t *elts, + unsigned count, + unsigned instance_id, + void *output_buffer ) +{ + struct translate_generic *tg = translate_generic(translate); + char *vert = output_buffer; + unsigned i; - tg->attrib[attr].emit( data, dst ); - } + for (i = 0; i < count; i++) { + generic_run_one(tg, *elts++, instance_id, vert); vert += tg->translate.key.output_stride; } } +static void PIPE_CDECL generic_run_elts8( struct 
translate *translate, + const uint8_t *elts, + unsigned count, + unsigned instance_id, + void *output_buffer ) +{ + struct translate_generic *tg = translate_generic(translate); + char *vert = output_buffer; + unsigned i; + for (i = 0; i < count; i++) { + generic_run_one(tg, *elts++, instance_id, vert); + vert += tg->translate.key.output_stride; + } +} static void PIPE_CDECL generic_run( struct translate *translate, unsigned start, @@ -426,57 +481,10 @@ static void PIPE_CDECL generic_run( struct translate *translate, { struct translate_generic *tg = translate_generic(translate); char *vert = output_buffer; - unsigned nr_attrs = tg->nr_attrib; - unsigned attr; unsigned i; - /* loop over vertex attributes (vertex shader inputs) - */ for (i = 0; i < count; i++) { - unsigned elt = start + i; - - for (attr = 0; attr < nr_attrs; attr++) { - float data[4]; - char *dst = vert + tg->attrib[attr].output_offset; - - if (tg->attrib[attr].type == TRANSLATE_ELEMENT_NORMAL) { - const uint8_t *src; - unsigned index; - - if (tg->attrib[attr].instance_divisor) { - index = instance_id / tg->attrib[attr].instance_divisor; - } - else { - index = elt; - } - - /* clamp to void going out of bounds */ - index = MIN2(index, tg->attrib[attr].max_index); - - src = tg->attrib[attr].input_ptr + - tg->attrib[attr].input_stride * index; - - tg->attrib[attr].fetch( data, src, 0, 0 ); - - if (0) - debug_printf("Fetch linear attr %d from %p stride %d index %d: " - " %f, %f, %f, %f \n", - attr, - tg->attrib[attr].input_ptr, - tg->attrib[attr].input_stride, - index, - data[0], data[1],data[2], data[3]); - } else { - data[0] = (float)instance_id; - } - - if (0) - debug_printf("vert %d attr %d: %f %f %f %f\n", - i, attr, data[0], data[1], data[2], data[3]); - - tg->attrib[attr].emit( data, dst ); - } - + generic_run_one(tg, start + i, instance_id, vert); vert += tg->translate.key.output_stride; } } @@ -522,6 +530,8 @@ struct translate *translate_generic_create( const struct translate_key *key ) 
tg->translate.release = generic_release; tg->translate.set_buffer = generic_set_buffer; tg->translate.run_elts = generic_run_elts; + tg->translate.run_elts16 = generic_run_elts16; + tg->translate.run_elts8 = generic_run_elts8; tg->translate.run = generic_run; for (i = 0; i < key->nr_elements; i++) { @@ -538,9 +548,28 @@ struct translate *translate_generic_create( const struct translate_key *key ) tg->attrib[i].input_offset = key->element[i].input_offset; tg->attrib[i].instance_divisor = key->element[i].instance_divisor; - tg->attrib[i].emit = get_emit_func(key->element[i].output_format); tg->attrib[i].output_offset = key->element[i].output_offset; + tg->attrib[i].copy_size = -1; + if (tg->attrib[i].type == TRANSLATE_ELEMENT_INSTANCE_ID) + { + if(key->element[i].output_format == PIPE_FORMAT_R32_USCALED + || key->element[i].output_format == PIPE_FORMAT_R32_SSCALED) + tg->attrib[i].copy_size = 4; + } + else + { + if(key->element[i].input_format == key->element[i].output_format + && format_desc->block.width == 1 + && format_desc->block.height == 1 + && !(format_desc->block.bits & 7)) + tg->attrib[i].copy_size = format_desc->block.bits >> 3; + } + + if(tg->attrib[i].copy_size < 0) + tg->attrib[i].emit = get_emit_func(key->element[i].output_format); + else + tg->attrib[i].emit = NULL; } tg->nr_attrib = key->nr_elements; @@ -548,3 +577,83 @@ struct translate *translate_generic_create( const struct translate_key *key ) return &tg->translate; } + +boolean translate_generic_is_output_format_supported(enum pipe_format format) +{ + switch(format) + { + case PIPE_FORMAT_R64G64B64A64_FLOAT: return TRUE; + case PIPE_FORMAT_R64G64B64_FLOAT: return TRUE; + case PIPE_FORMAT_R64G64_FLOAT: return TRUE; + case PIPE_FORMAT_R64_FLOAT: return TRUE; + + case PIPE_FORMAT_R32G32B32A32_FLOAT: return TRUE; + case PIPE_FORMAT_R32G32B32_FLOAT: return TRUE; + case PIPE_FORMAT_R32G32_FLOAT: return TRUE; + case PIPE_FORMAT_R32_FLOAT: return TRUE; + + case PIPE_FORMAT_R32G32B32A32_USCALED: return 
TRUE; + case PIPE_FORMAT_R32G32B32_USCALED: return TRUE; + case PIPE_FORMAT_R32G32_USCALED: return TRUE; + case PIPE_FORMAT_R32_USCALED: return TRUE; + + case PIPE_FORMAT_R32G32B32A32_SSCALED: return TRUE; + case PIPE_FORMAT_R32G32B32_SSCALED: return TRUE; + case PIPE_FORMAT_R32G32_SSCALED: return TRUE; + case PIPE_FORMAT_R32_SSCALED: return TRUE; + + case PIPE_FORMAT_R32G32B32A32_UNORM: return TRUE; + case PIPE_FORMAT_R32G32B32_UNORM: return TRUE; + case PIPE_FORMAT_R32G32_UNORM: return TRUE; + case PIPE_FORMAT_R32_UNORM: return TRUE; + + case PIPE_FORMAT_R32G32B32A32_SNORM: return TRUE; + case PIPE_FORMAT_R32G32B32_SNORM: return TRUE; + case PIPE_FORMAT_R32G32_SNORM: return TRUE; + case PIPE_FORMAT_R32_SNORM: return TRUE; + + case PIPE_FORMAT_R16G16B16A16_USCALED: return TRUE; + case PIPE_FORMAT_R16G16B16_USCALED: return TRUE; + case PIPE_FORMAT_R16G16_USCALED: return TRUE; + case PIPE_FORMAT_R16_USCALED: return TRUE; + + case PIPE_FORMAT_R16G16B16A16_SSCALED: return TRUE; + case PIPE_FORMAT_R16G16B16_SSCALED: return TRUE; + case PIPE_FORMAT_R16G16_SSCALED: return TRUE; + case PIPE_FORMAT_R16_SSCALED: return TRUE; + + case PIPE_FORMAT_R16G16B16A16_UNORM: return TRUE; + case PIPE_FORMAT_R16G16B16_UNORM: return TRUE; + case PIPE_FORMAT_R16G16_UNORM: return TRUE; + case PIPE_FORMAT_R16_UNORM: return TRUE; + + case PIPE_FORMAT_R16G16B16A16_SNORM: return TRUE; + case PIPE_FORMAT_R16G16B16_SNORM: return TRUE; + case PIPE_FORMAT_R16G16_SNORM: return TRUE; + case PIPE_FORMAT_R16_SNORM: return TRUE; + + case PIPE_FORMAT_R8G8B8A8_USCALED: return TRUE; + case PIPE_FORMAT_R8G8B8_USCALED: return TRUE; + case PIPE_FORMAT_R8G8_USCALED: return TRUE; + case PIPE_FORMAT_R8_USCALED: return TRUE; + + case PIPE_FORMAT_R8G8B8A8_SSCALED: return TRUE; + case PIPE_FORMAT_R8G8B8_SSCALED: return TRUE; + case PIPE_FORMAT_R8G8_SSCALED: return TRUE; + case PIPE_FORMAT_R8_SSCALED: return TRUE; + + case PIPE_FORMAT_R8G8B8A8_UNORM: return TRUE; + case PIPE_FORMAT_R8G8B8_UNORM: return TRUE; + 
case PIPE_FORMAT_R8G8_UNORM: return TRUE; + case PIPE_FORMAT_R8_UNORM: return TRUE; + + case PIPE_FORMAT_R8G8B8A8_SNORM: return TRUE; + case PIPE_FORMAT_R8G8B8_SNORM: return TRUE; + case PIPE_FORMAT_R8G8_SNORM: return TRUE; + case PIPE_FORMAT_R8_SNORM: return TRUE; + + case PIPE_FORMAT_A8R8G8B8_UNORM: return TRUE; + case PIPE_FORMAT_B8G8R8A8_UNORM: return TRUE; + default: return FALSE; + } +} diff --git a/src/gallium/auxiliary/translate/translate_sse.c b/src/gallium/auxiliary/translate/translate_sse.c index ef3aa674a3..56c5b36ce2 100644 --- a/src/gallium/auxiliary/translate/translate_sse.c +++ b/src/gallium/auxiliary/translate/translate_sse.c @@ -30,11 +30,12 @@ #include "pipe/p_compiler.h" #include "util/u_memory.h" #include "util/u_math.h" +#include "util/u_format.h" #include "translate.h" -#if defined(PIPE_ARCH_X86) +#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) #include "rtasm/rtasm_cpu.h" #include "rtasm/rtasm_x86sse.h" @@ -46,21 +47,9 @@ #define W 3 -typedef void (PIPE_CDECL *run_func)( struct translate *translate, - unsigned start, - unsigned count, - unsigned instance_id, - void *output_buffer); - -typedef void (PIPE_CDECL *run_elts_func)( struct translate *translate, - const unsigned *elts, - unsigned count, - unsigned instance_id, - void *output_buffer); - struct translate_buffer { const void *base_ptr; - unsigned stride; + uintptr_t stride; unsigned max_index; }; @@ -79,15 +68,15 @@ struct translate_sse { struct x86_function linear_func; struct x86_function elt_func; + struct x86_function elt16_func; + struct x86_function elt8_func; struct x86_function *func; boolean loaded_identity; - boolean loaded_255; - boolean loaded_inv_255; + boolean loaded_const[5]; float identity[4]; - float float_255[4]; - float inv_255[4]; + float const_value[5][4]; struct translate_buffer buffer[PIPE_MAX_ATTRIBS]; unsigned nr_buffers; @@ -102,17 +91,16 @@ struct translate_sse { boolean use_instancing; unsigned instance_id; - run_func gen_run; - run_elts_func 
gen_run_elts; - /* these are actually known values, but putting them in a struct * like this is helpful to keep them in sync across the file. */ struct x86_reg tmp_EAX; - struct x86_reg idx_EBX; /* either start+i or &elt[i] */ - struct x86_reg outbuf_ECX; - struct x86_reg machine_EDX; - struct x86_reg count_ESI; /* decrements to zero */ + struct x86_reg tmp2_EDX; + struct x86_reg tmp3_ECX; + struct x86_reg idx_ESI; /* either start+i or &elt[i] */ + struct x86_reg machine_EDI; + struct x86_reg outbuf_EBX; + struct x86_reg count_EBP; /* decrements to zero */ }; static int get_offset( const void *a, const void *b ) @@ -124,7 +112,7 @@ static int get_offset( const void *a, const void *b ) static struct x86_reg get_identity( struct translate_sse *p ) { - struct x86_reg reg = x86_make_reg(file_XMM, 6); + struct x86_reg reg = x86_make_reg(file_XMM, 7); if (!p->loaded_identity) { p->loaded_identity = TRUE; @@ -134,267 +122,924 @@ static struct x86_reg get_identity( struct translate_sse *p ) p->identity[3] = 1; sse_movups(p->func, reg, - x86_make_disp(p->machine_EDX, + x86_make_disp(p->machine_EDI, get_offset(p, &p->identity[0]))); } return reg; } -static struct x86_reg get_255( struct translate_sse *p ) +static struct x86_reg get_const( struct translate_sse *p, unsigned i, float v) { - struct x86_reg reg = x86_make_reg(file_XMM, 7); - - if (!p->loaded_255) { - p->loaded_255 = TRUE; - p->float_255[0] = - p->float_255[1] = - p->float_255[2] = - p->float_255[3] = 255.0f; - - sse_movups(p->func, reg, - x86_make_disp(p->machine_EDX, - get_offset(p, &p->float_255[0]))); + struct x86_reg reg = x86_make_reg(file_XMM, 2 + i); + + if (!p->loaded_const[i]) { + p->loaded_const[i] = TRUE; + p->const_value[i][0] = + p->const_value[i][1] = + p->const_value[i][2] = + p->const_value[i][3] = v; + + sse_movups(p->func, reg, + x86_make_disp(p->machine_EDI, + get_offset(p, &p->const_value[i][0]))); } return reg; } -static struct x86_reg get_inv_255( struct translate_sse *p ) +static struct 
x86_reg get_inv_127( struct translate_sse *p ) { - struct x86_reg reg = x86_make_reg(file_XMM, 5); - - if (!p->loaded_inv_255) { - p->loaded_inv_255 = TRUE; - p->inv_255[0] = - p->inv_255[1] = - p->inv_255[2] = - p->inv_255[3] = 1.0f / 255.0f; - - sse_movups(p->func, reg, - x86_make_disp(p->machine_EDX, - get_offset(p, &p->inv_255[0]))); - } - - return reg; + return get_const(p, 0, 1.0f / 127.0f); } - -static void emit_load_R32G32B32A32( struct translate_sse *p, - struct x86_reg data, - struct x86_reg arg0 ) +static struct x86_reg get_inv_255( struct translate_sse *p ) { - sse_movups(p->func, data, arg0); + return get_const(p, 1, 1.0f / 255.0f); } -static void emit_load_R32G32B32( struct translate_sse *p, - struct x86_reg data, - struct x86_reg arg0 ) +static struct x86_reg get_inv_32767( struct translate_sse *p ) { - /* Have to jump through some hoops: - * - * c 0 0 0 - * c 0 0 1 - * 0 0 c 1 - * a b c 1 - */ - sse_movss(p->func, data, x86_make_disp(arg0, 8)); - sse_shufps(p->func, data, get_identity(p), SHUF(X,Y,Z,W) ); - sse_shufps(p->func, data, data, SHUF(Y,Z,X,W) ); - sse_movlps(p->func, data, arg0); + return get_const(p, 2, 1.0f / 32767.0f); } -static void emit_load_R32G32( struct translate_sse *p, - struct x86_reg data, - struct x86_reg arg0 ) +static struct x86_reg get_inv_65535( struct translate_sse *p ) { - /* 0 0 0 1 - * a b 0 1 - */ - sse_movups(p->func, data, get_identity(p) ); - sse_movlps(p->func, data, arg0); + return get_const(p, 3, 1.0f / 65535.0f); } - -static void emit_load_R32( struct translate_sse *p, - struct x86_reg data, - struct x86_reg arg0 ) +static struct x86_reg get_inv_2147483647( struct translate_sse *p ) { - /* a 0 0 0 - * a 0 0 1 - */ - sse_movss(p->func, data, arg0); - sse_orps(p->func, data, get_identity(p) ); + return get_const(p, 4, 1.0f / 2147483647.0f); } - -static void emit_load_R8G8B8A8_UNORM( struct translate_sse *p, +/* load the data in a SSE2 register, padding with zeros */ +static boolean emit_load_sse2( struct 
translate_sse *p, struct x86_reg data, - struct x86_reg src ) + struct x86_reg src, + unsigned size) { - - /* Load and unpack twice: - */ - sse_movss(p->func, data, src); - sse2_punpcklbw(p->func, data, get_identity(p)); - sse2_punpcklbw(p->func, data, get_identity(p)); - - /* Convert to float: - */ - sse2_cvtdq2ps(p->func, data, data); - - - /* Scale by 1/255.0 - */ - sse_mulps(p->func, data, get_inv_255(p)); + struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1); + struct x86_reg tmp = p->tmp_EAX; + /* size is the attribute size in bytes; unsupported sizes return FALSE. + * Every case must end in break: falling through to a wider case would + * overwrite the zero-padded value with a load that reads past the attribute. */ + switch(size) + { + case 1: + x86_movzx8(p->func, tmp, src); + sse2_movd(p->func, data, tmp); + break; + case 2: + x86_movzx16(p->func, tmp, src); + sse2_movd(p->func, data, tmp); + break; + case 3: + /* build byte 2 and the low 16 bits in the GPR, then move it to the XMM register */ + x86_movzx8(p->func, tmp, x86_make_disp(src, 2)); + x86_shl_imm(p->func, tmp, 16); + x86_mov16(p->func, tmp, src); + sse2_movd(p->func, data, tmp); + break; + case 4: + sse2_movd(p->func, data, src); + break; + case 6: + sse2_movd(p->func, data, src); + x86_movzx16(p->func, tmp, x86_make_disp(src, 4)); + sse2_movd(p->func, tmpXMM, tmp); + sse2_punpckldq(p->func, data, tmpXMM); + break; + case 8: + sse2_movq(p->func, data, src); + break; + case 12: + sse2_movq(p->func, data, src); + sse2_movd(p->func, tmpXMM, x86_make_disp(src, 8)); + sse2_punpcklqdq(p->func, data, tmpXMM); + break; + case 16: + sse2_movdqu(p->func, data, src); + break; + default: + return FALSE; + } + return TRUE; } +/* this value can be passed for the out_chans argument */ +#define CHANNELS_0001 5 +/* this function will load #chans float values, and will + * pad the register with zeroes at least up to out_chans. + * + * If out_chans is set to CHANNELS_0001, then the fourth + * value will be padded with 1. Only pass this value if + * chans < 4 or results are undefined. 
+ */ +static void emit_load_float32( struct translate_sse *p, + struct x86_reg data, + struct x86_reg arg0, + unsigned out_chans, + unsigned chans) +{ + switch(chans) + { + case 1: + /* a 0 0 0 + * a 0 0 1 + */ + sse_movss(p->func, data, arg0); + if(out_chans == CHANNELS_0001) + sse_orps(p->func, data, get_identity(p) ); + break; + case 2: + /* 0 0 0 1 + * a b 0 1 + */ + if(out_chans == CHANNELS_0001) + sse_shufps(p->func, data, get_identity(p), SHUF(X, Y, Z, W) ); + else if(out_chans > 2) + sse_movlhps(p->func, data, get_identity(p) ); + sse_movlps(p->func, data, arg0); + break; + case 3: + /* Have to jump through some hoops: + * + * c 0 0 0 + * c 0 0 1 if out_chans == CHANNELS_0001 + * 0 0 c 0/1 + * a b c 0/1 + */ + sse_movss(p->func, data, x86_make_disp(arg0, 8)); + if(out_chans == CHANNELS_0001) + sse_shufps(p->func, data, get_identity(p), SHUF(X,Y,Z,W) ); + sse_shufps(p->func, data, data, SHUF(Y,Z,X,W) ); + sse_movlps(p->func, data, arg0); + break; + case 4: + sse_movups(p->func, data, arg0); + break; + } +} +/* this function behaves like emit_load_float32, but loads + 64-bit floating point numbers, converting them to 32-bit + ones */ +static void emit_load_float64to32( struct translate_sse *p, + struct x86_reg data, + struct x86_reg arg0, + unsigned out_chans, + unsigned chans) +{ + struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1); + switch(chans) + { + case 1: + sse2_movsd(p->func, data, arg0); + if(out_chans > 1) + sse2_cvtpd2ps(p->func, data, data); + else + sse2_cvtsd2ss(p->func, data, data); + if(out_chans == CHANNELS_0001) + sse_shufps(p->func, data, get_identity(p), SHUF(X, Y, Z, W) ); + break; + case 2: + sse2_movupd(p->func, data, arg0); + sse2_cvtpd2ps(p->func, data, data); + if(out_chans == CHANNELS_0001) + sse_shufps(p->func, data, get_identity(p), SHUF(X, Y, Z, W) ); + else if(out_chans > 2) + sse_movlhps(p->func, data, get_identity(p) ); + break; + case 3: + sse2_movupd(p->func, data, arg0); + sse2_cvtpd2ps(p->func, data, data); + 
sse2_movsd(p->func, tmpXMM, x86_make_disp(arg0, 16)); + if(out_chans > 3) + sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM); + else + sse2_cvtsd2ss(p->func, tmpXMM, tmpXMM); + sse_movlhps(p->func, data, tmpXMM); + if(out_chans == CHANNELS_0001) + sse_orps(p->func, data, get_identity(p) ); + break; + case 4: + sse2_movupd(p->func, data, arg0); + sse2_cvtpd2ps(p->func, data, data); + sse2_movupd(p->func, tmpXMM, x86_make_disp(arg0, 16)); + sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM); + sse_movlhps(p->func, data, tmpXMM); + break; + } +} -static void emit_store_R32G32B32A32( struct translate_sse *p, - struct x86_reg dest, - struct x86_reg dataXMM ) +static void emit_mov64(struct translate_sse *p, struct x86_reg dst_gpr, struct x86_reg dst_xmm, struct x86_reg src_gpr, struct x86_reg src_xmm) { - sse_movups(p->func, dest, dataXMM); + if(x86_target(p->func) != X86_32) + x64_mov64(p->func, dst_gpr, src_gpr); + else + { + /* TODO: when/on which CPUs is SSE2 actually better than SSE? */ + if(x86_target_caps(p->func) & X86_SSE2) + sse2_movq(p->func, dst_xmm, src_xmm); + else + sse_movlps(p->func, dst_xmm, src_xmm); + } } -static void emit_store_R32G32B32( struct translate_sse *p, - struct x86_reg dest, - struct x86_reg dataXMM ) +static void emit_load64(struct translate_sse *p, struct x86_reg dst_gpr, struct x86_reg dst_xmm, struct x86_reg src) { - /* Emit two, shuffle, emit one. - */ - sse_movlps(p->func, dest, dataXMM); - sse_shufps(p->func, dataXMM, dataXMM, SHUF(Z,Z,Z,Z) ); /* NOTE! 
destructive */ - sse_movss(p->func, x86_make_disp(dest,8), dataXMM); + emit_mov64(p, dst_gpr, dst_xmm, src, src); } -static void emit_store_R32G32( struct translate_sse *p, - struct x86_reg dest, - struct x86_reg dataXMM ) +static void emit_store64(struct translate_sse *p, struct x86_reg dst, struct x86_reg src_gpr, struct x86_reg src_xmm) { - sse_movlps(p->func, dest, dataXMM); + emit_mov64(p, dst, dst, src_gpr, src_xmm); } -static void emit_store_R32( struct translate_sse *p, - struct x86_reg dest, - struct x86_reg dataXMM ) +static void emit_mov128(struct translate_sse *p, struct x86_reg dst, struct x86_reg src) { - sse_movss(p->func, dest, dataXMM); + if(x86_target_caps(p->func) & X86_SSE2) + sse2_movdqu(p->func, dst, src); + else + sse_movups(p->func, dst, src); } +/* TODO: this uses unaligned accesses liberally, which is great on Nehalem, + * but may or may not be good on older processors + * TODO: may perhaps want to use non-temporal stores here if possible + */ +static void emit_memcpy(struct translate_sse *p, struct x86_reg dst, struct x86_reg src, unsigned size) +{ + struct x86_reg dataXMM = x86_make_reg(file_XMM, 0); + struct x86_reg dataXMM2 = x86_make_reg(file_XMM, 1); + struct x86_reg dataGPR = p->tmp_EAX; + struct x86_reg dataGPR2 = p->tmp2_EDX; + + if(size < 8) + { + switch (size) + { + case 1: + x86_mov8(p->func, dataGPR, src); + x86_mov8(p->func, dst, dataGPR); + break; + case 2: + x86_mov16(p->func, dataGPR, src); + x86_mov16(p->func, dst, dataGPR); + break; + case 3: + x86_mov16(p->func, dataGPR, src); + x86_mov8(p->func, dataGPR2, x86_make_disp(src, 2)); + x86_mov16(p->func, dst, dataGPR); + x86_mov8(p->func, x86_make_disp(dst, 2), dataGPR2); + break; + case 4: + x86_mov(p->func, dataGPR, src); + x86_mov(p->func, dst, dataGPR); + break; + case 6: + x86_mov(p->func, dataGPR, src); + x86_mov16(p->func, dataGPR2, x86_make_disp(src, 4)); + x86_mov(p->func, dst, dataGPR); + x86_mov16(p->func, x86_make_disp(dst, 4), dataGPR2); + break; + } + } + else 
if(!(x86_target_caps(p->func) & X86_SSE)) + { + unsigned i = 0; + assert((size & 3) == 0); + for(i = 0; i < size; i += 4) + { + x86_mov(p->func, dataGPR, x86_make_disp(src, i)); + x86_mov(p->func, x86_make_disp(dst, i), dataGPR); + } + } + else + { + switch(size) + { + case 8: + emit_load64(p, dataGPR, dataXMM, src); + emit_store64(p, dst, dataGPR, dataXMM); + break; + case 12: + emit_load64(p, dataGPR2, dataXMM, src); + x86_mov(p->func, dataGPR, x86_make_disp(src, 8)); + emit_store64(p, dst, dataGPR2, dataXMM); + x86_mov(p->func, x86_make_disp(dst, 8), dataGPR); + break; + case 16: + emit_mov128(p, dataXMM, src); + emit_mov128(p, dst, dataXMM); + break; + case 24: + emit_mov128(p, dataXMM, src); + emit_load64(p, dataGPR, dataXMM2, x86_make_disp(src, 16)); + emit_mov128(p, dst, dataXMM); + emit_store64(p, x86_make_disp(dst, 16), dataGPR, dataXMM2); + break; + case 32: + emit_mov128(p, dataXMM, src); + emit_mov128(p, dataXMM2, x86_make_disp(src, 16)); + emit_mov128(p, dst, dataXMM); + emit_mov128(p, x86_make_disp(dst, 16), dataXMM2); + break; + default: + assert(0); + } + } +} +static boolean translate_attr_convert( struct translate_sse *p, + const struct translate_element *a, + struct x86_reg src, + struct x86_reg dst) -static void emit_store_R8G8B8A8_UNORM( struct translate_sse *p, - struct x86_reg dest, - struct x86_reg dataXMM ) { - /* Scale by 255.0 - */ - sse_mulps(p->func, dataXMM, get_255(p)); + const struct util_format_description* input_desc = util_format_description(a->input_format); + const struct util_format_description* output_desc = util_format_description(a->output_format); + unsigned i; + boolean id_swizzle = TRUE; + unsigned swizzle[4] = {UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE}; + unsigned needed_chans = 0; + unsigned imms[2] = {0, 0x3f800000}; - /* Pack and emit: - */ - sse2_cvtps2dq(p->func, dataXMM, dataXMM); - sse2_packssdw(p->func, dataXMM, dataXMM); - sse2_packuswb(p->func, 
dataXMM, dataXMM); - sse_movss(p->func, dest, dataXMM); -} + if(a->output_format == PIPE_FORMAT_NONE || a->input_format == PIPE_FORMAT_NONE) + return FALSE; + + if(input_desc->channel[0].size & 7) + return FALSE; + if(input_desc->colorspace != output_desc->colorspace) + return FALSE; + for(i = 1; i < input_desc->nr_channels; ++i) + { + if(memcmp(&input_desc->channel[i], &input_desc->channel[0], sizeof(input_desc->channel[0]))) + return FALSE; + } + for(i = 1; i < output_desc->nr_channels; ++i) + { + if(memcmp(&output_desc->channel[i], &output_desc->channel[0], sizeof(output_desc->channel[0]))) + return FALSE; + } + for(i = 0; i < output_desc->nr_channels; ++i) + { + if(output_desc->swizzle[i] < 4) + swizzle[output_desc->swizzle[i]] = input_desc->swizzle[i]; + } -/* Extended swizzles? Maybe later. - */ -static void emit_swizzle( struct translate_sse *p, - struct x86_reg dest, - struct x86_reg src, - unsigned char shuffle ) -{ - sse_shufps(p->func, dest, src, shuffle); -} + if((x86_target_caps(p->func) & X86_SSE) && (0 + || a->output_format == PIPE_FORMAT_R32_FLOAT + || a->output_format == PIPE_FORMAT_R32G32_FLOAT + || a->output_format == PIPE_FORMAT_R32G32B32_FLOAT + || a->output_format == PIPE_FORMAT_R32G32B32A32_FLOAT)) + { + struct x86_reg dataXMM = x86_make_reg(file_XMM, 0); + for(i = 0; i < output_desc->nr_channels; ++i) + { + if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0 && i >= input_desc->nr_channels) + swizzle[i] = i; + } -static boolean translate_attr( struct translate_sse *p, - const struct translate_element *a, - struct x86_reg srcECX, - struct x86_reg dstEAX) -{ - struct x86_reg dataXMM = x86_make_reg(file_XMM, 0); + for(i = 0; i < output_desc->nr_channels; ++i) + { + if(swizzle[i] < 4) + needed_chans = MAX2(needed_chans, swizzle[i] + 1); + if(swizzle[i] < UTIL_FORMAT_SWIZZLE_0 && swizzle[i] != i) + id_swizzle = FALSE; + } - switch (a->input_format) { - case PIPE_FORMAT_R32_FLOAT: - emit_load_R32(p, dataXMM, srcECX); - break; - case PIPE_FORMAT_R32G32_FLOAT: - 
emit_load_R32G32(p, dataXMM, srcECX); - break; - case PIPE_FORMAT_R32G32B32_FLOAT: - emit_load_R32G32B32(p, dataXMM, srcECX); - break; - case PIPE_FORMAT_R32G32B32A32_FLOAT: - emit_load_R32G32B32A32(p, dataXMM, srcECX); - break; - case PIPE_FORMAT_B8G8R8A8_UNORM: - emit_load_R8G8B8A8_UNORM(p, dataXMM, srcECX); - emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W)); - break; - case PIPE_FORMAT_R8G8B8A8_UNORM: - emit_load_R8G8B8A8_UNORM(p, dataXMM, srcECX); - break; - default: - return FALSE; + if(needed_chans > 0) + { + switch(input_desc->channel[0].type) + { + case UTIL_FORMAT_TYPE_UNSIGNED: + if(!(x86_target_caps(p->func) & X86_SSE2)) + return FALSE; + emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3); + + /* TODO: add support for SSE4.1 pmovzx */ + switch(input_desc->channel[0].size) + { + case 8: + /* TODO: this may be inefficient due to get_identity() being used both as a float and integer register */ + sse2_punpcklbw(p->func, dataXMM, get_identity(p)); + sse2_punpcklbw(p->func, dataXMM, get_identity(p)); + break; + case 16: + sse2_punpcklwd(p->func, dataXMM, get_identity(p)); + break; + case 32: /* we lose precision here */ + sse2_psrld_imm(p->func, dataXMM, 1); + break; + default: + return FALSE; + } + sse2_cvtdq2ps(p->func, dataXMM, dataXMM); + if(input_desc->channel[0].normalized) + { + struct x86_reg factor; + switch(input_desc->channel[0].size) + { + case 8: + factor = get_inv_255(p); + break; + case 16: + factor = get_inv_65535(p); + break; + case 32: + factor = get_inv_2147483647(p); + break; + } + sse_mulps(p->func, dataXMM, factor); + } + else if(input_desc->channel[0].size == 32) + sse_addps(p->func, dataXMM, dataXMM); /* compensate for the bit we threw away to fit u32 into s32 */ + break; + case UTIL_FORMAT_TYPE_SIGNED: + if(!(x86_target_caps(p->func) & X86_SSE2)) + return FALSE; + emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3); + + /* TODO: add support for SSE4.1 
pmovsx */ + switch(input_desc->channel[0].size) + { + case 8: + sse2_punpcklbw(p->func, dataXMM, dataXMM); + sse2_punpcklbw(p->func, dataXMM, dataXMM); + sse2_psrad_imm(p->func, dataXMM, 24); + break; + case 16: + sse2_punpcklwd(p->func, dataXMM, dataXMM); + sse2_psrad_imm(p->func, dataXMM, 16); + break; + case 32: /* we lose precision here */ + break; + default: + return FALSE; + } + sse2_cvtdq2ps(p->func, dataXMM, dataXMM); + if(input_desc->channel[0].normalized) + { + struct x86_reg factor; + switch(input_desc->channel[0].size) + { + case 8: + factor = get_inv_127(p); + break; + case 16: + factor = get_inv_32767(p); + break; + case 32: + factor = get_inv_2147483647(p); + break; + } + sse_mulps(p->func, dataXMM, factor); + } + break; + + break; + case UTIL_FORMAT_TYPE_FLOAT: + if(input_desc->channel[0].size != 32 && input_desc->channel[0].size != 64) + return FALSE; + if(swizzle[3] == UTIL_FORMAT_SWIZZLE_1 && input_desc->nr_channels <= 3) + { + swizzle[3] = UTIL_FORMAT_SWIZZLE_W; + needed_chans = CHANNELS_0001; + } + switch(input_desc->channel[0].size) + { + case 32: + emit_load_float32(p, dataXMM, src, needed_chans, input_desc->nr_channels); + break; + case 64: /* we lose precision here */ + if(!(x86_target_caps(p->func) & X86_SSE2)) + return FALSE; + emit_load_float64to32(p, dataXMM, src, needed_chans, input_desc->nr_channels); + break; + default: + return FALSE; + } + break; + default: + return FALSE; + } + + if(!id_swizzle) + sse_shufps(p->func, dataXMM, dataXMM, SHUF(swizzle[0], swizzle[1], swizzle[2], swizzle[3]) ); + } + + if(output_desc->nr_channels >= 4 + && swizzle[0] < UTIL_FORMAT_SWIZZLE_0 + && swizzle[1] < UTIL_FORMAT_SWIZZLE_0 + && swizzle[2] < UTIL_FORMAT_SWIZZLE_0 + && swizzle[3] < UTIL_FORMAT_SWIZZLE_0 + ) + sse_movups(p->func, dst, dataXMM); + else + { + if(output_desc->nr_channels >= 2 + && swizzle[0] < UTIL_FORMAT_SWIZZLE_0 + && swizzle[1] < UTIL_FORMAT_SWIZZLE_0) + sse_movlps(p->func, dst, dataXMM); + else + { + if(swizzle[0] < 
UTIL_FORMAT_SWIZZLE_0) + sse_movss(p->func, dst, dataXMM); + else + x86_mov_imm(p->func, dst, imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]); + + if(output_desc->nr_channels >= 2) + { + if(swizzle[1] < UTIL_FORMAT_SWIZZLE_0) + { + sse_shufps(p->func, dataXMM, dataXMM, SHUF(1, 1, 2, 3)); + sse_movss(p->func, x86_make_disp(dst, 4), dataXMM); + } + else + x86_mov_imm(p->func, x86_make_disp(dst, 4), imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0]); + } + } + + if(output_desc->nr_channels >= 3) + { + if(output_desc->nr_channels >= 4 + && swizzle[2] < UTIL_FORMAT_SWIZZLE_0 + && swizzle[3] < UTIL_FORMAT_SWIZZLE_0) + sse_movhps(p->func, x86_make_disp(dst, 8), dataXMM); + else + { + if(swizzle[2] < UTIL_FORMAT_SWIZZLE_0) + { + sse_shufps(p->func, dataXMM, dataXMM, SHUF(2, 2, 2, 3)); + sse_movss(p->func, x86_make_disp(dst, 8), dataXMM); + } + else + x86_mov_imm(p->func, x86_make_disp(dst, 8), imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]); + + if(output_desc->nr_channels >= 4) + { + if(swizzle[3] < UTIL_FORMAT_SWIZZLE_0) + { + sse_shufps(p->func, dataXMM, dataXMM, SHUF(3, 3, 3, 3)); + sse_movss(p->func, x86_make_disp(dst, 12), dataXMM); + } + else + x86_mov_imm(p->func, x86_make_disp(dst, 12), imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0]); + } + } + } + } + return TRUE; } + else if((x86_target_caps(p->func) & X86_SSE2) && input_desc->channel[0].size == 8 && output_desc->channel[0].size == 16 + && output_desc->channel[0].normalized == input_desc->channel[0].normalized + && (0 + || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) + || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) + || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) + )) + { + struct x86_reg dataXMM = x86_make_reg(file_XMM, 0); + struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1); + struct x86_reg tmp = p->tmp_EAX; + unsigned 
imms[2] = {0, 1}; + + for(i = 0; i < output_desc->nr_channels; ++i) + { + if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0 && i >= input_desc->nr_channels) + swizzle[i] = i; + } - switch (a->output_format) { - case PIPE_FORMAT_R32_FLOAT: - emit_store_R32(p, dstEAX, dataXMM); - break; - case PIPE_FORMAT_R32G32_FLOAT: - emit_store_R32G32(p, dstEAX, dataXMM); - break; - case PIPE_FORMAT_R32G32B32_FLOAT: - emit_store_R32G32B32(p, dstEAX, dataXMM); - break; - case PIPE_FORMAT_R32G32B32A32_FLOAT: - emit_store_R32G32B32A32(p, dstEAX, dataXMM); - break; - case PIPE_FORMAT_B8G8R8A8_UNORM: - emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W)); - emit_store_R8G8B8A8_UNORM(p, dstEAX, dataXMM); - break; - case PIPE_FORMAT_R8G8B8A8_UNORM: - emit_store_R8G8B8A8_UNORM(p, dstEAX, dataXMM); - break; - default: - return FALSE; + for(i = 0; i < output_desc->nr_channels; ++i) + { + if(swizzle[i] < 4) + needed_chans = MAX2(needed_chans, swizzle[i] + 1); + if(swizzle[i] < UTIL_FORMAT_SWIZZLE_0 && swizzle[i] != i) + id_swizzle = FALSE; + } + + if(needed_chans > 0) + { + emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3); + + switch(input_desc->channel[0].type) + { + case UTIL_FORMAT_TYPE_UNSIGNED: + if(input_desc->channel[0].normalized) + { + sse2_punpcklbw(p->func, dataXMM, dataXMM); + if(output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) + sse2_psrlw_imm(p->func, dataXMM, 1); + } + else + sse2_punpcklbw(p->func, dataXMM, get_identity(p)); + break; + case UTIL_FORMAT_TYPE_SIGNED: + if(input_desc->channel[0].normalized) + { + sse2_movq(p->func, tmpXMM, get_identity(p)); + sse2_punpcklbw(p->func, tmpXMM, dataXMM); + sse2_psllw_imm(p->func, dataXMM, 9); + sse2_psrlw_imm(p->func, dataXMM, 8); + sse2_por(p->func, tmpXMM, dataXMM); + sse2_psrlw_imm(p->func, dataXMM, 7); + sse2_por(p->func, tmpXMM, dataXMM); + { + struct x86_reg t = dataXMM; + dataXMM = tmpXMM; + tmpXMM = t; + } + } + else + { + sse2_punpcklbw(p->func, dataXMM, dataXMM); + 
sse2_psraw_imm(p->func, dataXMM, 8); + } + break; + default: + assert(0); + } + + /* for normalized outputs the constant "one" is the largest value of the + * 16-bit output channel: 0xffff unsigned, 0x7fff signed */ + if(output_desc->channel[0].normalized) + imms[1] = (output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) ? 0xffff : 0x7fff; + + if(!id_swizzle) + sse2_pshuflw(p->func, dataXMM, dataXMM, (swizzle[0] & 3) | ((swizzle[1] & 3) << 2) | ((swizzle[2] & 3) << 4) | ((swizzle[3] & 3) << 6)); + } + + if(output_desc->nr_channels >= 4 + && swizzle[0] < UTIL_FORMAT_SWIZZLE_0 + && swizzle[1] < UTIL_FORMAT_SWIZZLE_0 + && swizzle[2] < UTIL_FORMAT_SWIZZLE_0 + && swizzle[3] < UTIL_FORMAT_SWIZZLE_0 + ) + sse2_movq(p->func, dst, dataXMM); + else + { + if(swizzle[0] < UTIL_FORMAT_SWIZZLE_0) + { + if(output_desc->nr_channels >= 2 && swizzle[1] < UTIL_FORMAT_SWIZZLE_0) + sse2_movd(p->func, dst, dataXMM); + else + { + sse2_movd(p->func, tmp, dataXMM); + x86_mov16(p->func, dst, tmp); + if(output_desc->nr_channels >= 2) + x86_mov16_imm(p->func, x86_make_disp(dst, 2), imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0]); + } + } + else + { + if(output_desc->nr_channels >= 2 && swizzle[1] >= UTIL_FORMAT_SWIZZLE_0) + x86_mov_imm(p->func, dst, (imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0] << 16) | imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]); + else + { + x86_mov16_imm(p->func, dst, imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]); + if(output_desc->nr_channels >= 2) + { + sse2_movd(p->func, tmp, dataXMM); + x86_shr_imm(p->func, tmp, 16); + x86_mov16(p->func, x86_make_disp(dst, 2), tmp); + } + } + } + + if(output_desc->nr_channels >= 3) + { + if(swizzle[2] < UTIL_FORMAT_SWIZZLE_0) + { + if(output_desc->nr_channels >= 4 && swizzle[3] < UTIL_FORMAT_SWIZZLE_0) + { + sse2_psrlq_imm(p->func, dataXMM, 32); + sse2_movd(p->func, x86_make_disp(dst, 4), dataXMM); + } + else + { + sse2_psrlq_imm(p->func, dataXMM, 32); + sse2_movd(p->func, tmp, dataXMM); + x86_mov16(p->func, x86_make_disp(dst, 4), tmp); + if(output_desc->nr_channels >= 4) + { + x86_mov16_imm(p->func, x86_make_disp(dst, 6), imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0]); + } + } + } 
+ else + { + if(output_desc->nr_channels >= 4 && swizzle[3] >= UTIL_FORMAT_SWIZZLE_0) + x86_mov_imm(p->func, x86_make_disp(dst, 4), (imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0] << 16) | imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]); + else + { + x86_mov16_imm(p->func, x86_make_disp(dst, 4), imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]); + + if(output_desc->nr_channels >= 4) + { + sse2_psrlq_imm(p->func, dataXMM, 48); + sse2_movd(p->func, tmp, dataXMM); + x86_mov16(p->func, x86_make_disp(dst, 6), tmp); + } + } + } + } + } + return TRUE; } + else if(!memcmp(&output_desc->channel[0], &input_desc->channel[0], sizeof(output_desc->channel[0]))) + { + struct x86_reg tmp = p->tmp_EAX; + unsigned i; + if(input_desc->channel[0].size == 8 && input_desc->nr_channels == 4 && output_desc->nr_channels == 4 + && swizzle[0] == UTIL_FORMAT_SWIZZLE_W + && swizzle[1] == UTIL_FORMAT_SWIZZLE_Z + && swizzle[2] == UTIL_FORMAT_SWIZZLE_Y + && swizzle[3] == UTIL_FORMAT_SWIZZLE_X) + { + /* TODO: support movbe */ + x86_mov(p->func, tmp, src); + x86_bswap(p->func, tmp); + x86_mov(p->func, dst, tmp); + return TRUE; + } - return TRUE; + for(i = 0; i < output_desc->nr_channels; ++i) + { + switch(output_desc->channel[0].size) + { + case 8: + if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0) + { + unsigned v = 0; + if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1) + { + switch(output_desc->channel[0].type) + { + case UTIL_FORMAT_TYPE_UNSIGNED: + v = output_desc->channel[0].normalized ? 0xff : 1; + break; + case UTIL_FORMAT_TYPE_SIGNED: + v = output_desc->channel[0].normalized ? 
0x7f : 1; + break; + default: + return FALSE; + } + } + x86_mov8_imm(p->func, x86_make_disp(dst, i * 1), v); + } + else + { + x86_mov8(p->func, tmp, x86_make_disp(src, swizzle[i] * 1)); + x86_mov8(p->func, x86_make_disp(dst, i * 1), tmp); + } + break; + case 16: + if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0) + { + unsigned v = 0; + if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1) + { + switch(output_desc->channel[1].type) + { + case UTIL_FORMAT_TYPE_UNSIGNED: + v = output_desc->channel[1].normalized ? 0xffff : 1; + break; + case UTIL_FORMAT_TYPE_SIGNED: + v = output_desc->channel[1].normalized ? 0x7fff : 1; + break; + case UTIL_FORMAT_TYPE_FLOAT: + v = 0x3c00; + break; + default: + return FALSE; + } + } + x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), v); + } + else if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0) + x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), 0); + else + { + x86_mov16(p->func, tmp, x86_make_disp(src, swizzle[i] * 2)); + x86_mov16(p->func, x86_make_disp(dst, i * 2), tmp); + } + break; + case 32: + if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0) + { + unsigned v = 0; + if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1) + { + switch(output_desc->channel[1].type) + { + case UTIL_FORMAT_TYPE_UNSIGNED: + v = output_desc->channel[1].normalized ? 0xffffffff : 1; + break; + case UTIL_FORMAT_TYPE_SIGNED: + v = output_desc->channel[1].normalized ? 0x7fffffff : 1; + break; + case UTIL_FORMAT_TYPE_FLOAT: + v = 0x3f800000; + break; + default: + return FALSE; + } + } + x86_mov_imm(p->func, x86_make_disp(dst, i * 4), v); + } + else + { + x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 4)); + x86_mov(p->func, x86_make_disp(dst, i * 4), tmp); + } + break; + case 64: + if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0) + { + unsigned l = 0; + unsigned h = 0; + if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1) + { + switch(output_desc->channel[1].type) + { + case UTIL_FORMAT_TYPE_UNSIGNED: + h = output_desc->channel[1].normalized ? 0xffffffff : 0; + l = output_desc->channel[1].normalized ? 
0xffffffff : 1; + break; + case UTIL_FORMAT_TYPE_SIGNED: + h = output_desc->channel[1].normalized ? 0x7fffffff : 0; + l = output_desc->channel[1].normalized ? 0xffffffff : 1; + break; + case UTIL_FORMAT_TYPE_FLOAT: + h = 0x3ff00000; + l = 0; + break; + default: + return FALSE; + } + } + x86_mov_imm(p->func, x86_make_disp(dst, i * 8), l); + x86_mov_imm(p->func, x86_make_disp(dst, i * 8 + 4), h); + } + else + { + if(x86_target_caps(p->func) & X86_SSE) + { + struct x86_reg tmpXMM = x86_make_reg(file_XMM, 0); + emit_load64(p, tmp, tmpXMM, x86_make_disp(src, swizzle[i] * 8)); + emit_store64(p, x86_make_disp(dst, i * 8), tmp, tmpXMM); + } + else + { + x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 8)); + x86_mov(p->func, x86_make_disp(dst, i * 8), tmp); + x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 8 + 4)); + x86_mov(p->func, x86_make_disp(dst, i * 8 + 4), tmp); + } + } + break; + default: + return FALSE; + } + } + return TRUE; + } + return FALSE; } +static boolean translate_attr( struct translate_sse *p, + const struct translate_element *a, + struct x86_reg src, + struct x86_reg dst) +{ + if(a->input_format == a->output_format) + { + emit_memcpy(p, dst, src, util_format_get_stride(a->input_format, 1)); + return TRUE; + } + + return translate_attr_convert(p, a, src, dst); +} static boolean init_inputs( struct translate_sse *p, - boolean linear ) + unsigned index_size ) { unsigned i; - struct x86_reg instance_id = x86_make_disp(p->machine_EDX, + struct x86_reg instance_id = x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id)); for (i = 0; i < p->nr_buffer_varients; i++) { struct translate_buffer_varient *varient = &p->buffer_varient[i]; struct translate_buffer *buffer = &p->buffer[varient->buffer_index]; - if (linear || varient->instance_divisor) { - struct x86_reg buf_stride = x86_make_disp(p->machine_EDX, + if (!index_size || varient->instance_divisor) { + struct x86_reg buf_stride = x86_make_disp(p->machine_EDI, get_offset(p, 
&buffer->stride)); - struct x86_reg buf_ptr = x86_make_disp(p->machine_EDX, + struct x86_reg buf_ptr = x86_make_disp(p->machine_EDI, get_offset(p, &varient->ptr)); - struct x86_reg buf_base_ptr = x86_make_disp(p->machine_EDX, + struct x86_reg buf_base_ptr = x86_make_disp(p->machine_EDI, get_offset(p, &buffer->base_ptr)); - struct x86_reg elt = p->idx_EBX; + struct x86_reg elt = p->idx_ESI; struct x86_reg tmp_EAX = p->tmp_EAX; /* Calculate pointer to first attrib: @@ -406,20 +1051,16 @@ static boolean init_inputs( struct translate_sse *p, x86_mov(p->func, tmp_EAX, instance_id); if (varient->instance_divisor != 1) { - struct x86_reg tmp_EDX = p->machine_EDX; - struct x86_reg tmp_ECX = p->outbuf_ECX; + struct x86_reg tmp_EDX = p->tmp2_EDX; + struct x86_reg tmp_ECX = p->tmp3_ECX; /* TODO: Add x86_shr() to rtasm and use it whenever * instance divisor is power of two. */ - x86_push(p->func, tmp_EDX); - x86_push(p->func, tmp_ECX); x86_xor(p->func, tmp_EDX, tmp_EDX); x86_mov_reg_imm(p->func, tmp_ECX, varient->instance_divisor); x86_div(p->func, tmp_ECX); /* EAX = EDX:EAX / ECX */ - x86_pop(p->func, tmp_ECX); - x86_pop(p->func, tmp_EDX); } } else { x86_mov(p->func, tmp_EAX, elt); @@ -430,16 +1071,23 @@ static boolean init_inputs( struct translate_sse *p, */ x86_imul(p->func, tmp_EAX, buf_stride); + x64_rexw(p->func); x86_add(p->func, tmp_EAX, buf_base_ptr); /* In the linear case, keep the buffer pointer instead of the * index number. 
*/ - if (linear && p->nr_buffer_varients == 1) + if (!index_size && p->nr_buffer_varients == 1) + { + x64_rexw(p->func); x86_mov(p->func, elt, tmp_EAX); + } else + { + x64_rexw(p->func); x86_mov(p->func, buf_ptr, tmp_EAX); + } } } @@ -448,23 +1096,24 @@ static boolean init_inputs( struct translate_sse *p, static struct x86_reg get_buffer_ptr( struct translate_sse *p, - boolean linear, + unsigned index_size, unsigned var_idx, struct x86_reg elt ) { if (var_idx == ELEMENT_BUFFER_INSTANCE_ID) { - return x86_make_disp(p->machine_EDX, + return x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id)); } - if (linear && p->nr_buffer_varients == 1) { - return p->idx_EBX; + if (!index_size && p->nr_buffer_varients == 1) { + return p->idx_ESI; } - else if (linear || p->buffer_varient[var_idx].instance_divisor) { + else if (!index_size || p->buffer_varient[var_idx].instance_divisor) { struct x86_reg ptr = p->tmp_EAX; struct x86_reg buf_ptr = - x86_make_disp(p->machine_EDX, + x86_make_disp(p->machine_EDI, get_offset(p, &p->buffer_varient[var_idx].ptr)); + x64_rexw(p->func); x86_mov(p->func, ptr, buf_ptr); return ptr; } @@ -473,19 +1122,31 @@ static struct x86_reg get_buffer_ptr( struct translate_sse *p, const struct translate_buffer_varient *varient = &p->buffer_varient[var_idx]; struct x86_reg buf_stride = - x86_make_disp(p->machine_EDX, + x86_make_disp(p->machine_EDI, get_offset(p, &p->buffer[varient->buffer_index].stride)); struct x86_reg buf_base_ptr = - x86_make_disp(p->machine_EDX, + x86_make_disp(p->machine_EDI, get_offset(p, &p->buffer[varient->buffer_index].base_ptr)); /* Calculate pointer to current attrib: */ - x86_mov(p->func, ptr, buf_stride); - x86_imul(p->func, ptr, elt); + switch(index_size) + { + case 1: + x86_movzx8(p->func, ptr, elt); + break; + case 2: + x86_movzx16(p->func, ptr, elt); + break; + case 4: + x86_mov(p->func, ptr, elt); + break; + } + x86_imul(p->func, ptr, buf_stride); + x64_rexw(p->func); x86_add(p->func, ptr, buf_base_ptr); return ptr; 
} @@ -494,39 +1155,42 @@ static struct x86_reg get_buffer_ptr( struct translate_sse *p, static boolean incr_inputs( struct translate_sse *p, - boolean linear ) + unsigned index_size ) { - if (linear && p->nr_buffer_varients == 1) { - struct x86_reg stride = x86_make_disp(p->machine_EDX, + if (!index_size && p->nr_buffer_varients == 1) { + struct x86_reg stride = x86_make_disp(p->machine_EDI, get_offset(p, &p->buffer[0].stride)); if (p->buffer_varient[0].instance_divisor == 0) { - x86_add(p->func, p->idx_EBX, stride); - sse_prefetchnta(p->func, x86_make_disp(p->idx_EBX, 192)); + x64_rexw(p->func); + x86_add(p->func, p->idx_ESI, stride); + sse_prefetchnta(p->func, x86_make_disp(p->idx_ESI, 192)); } } - else if (linear) { + else if (!index_size) { unsigned i; /* Is this worthwhile?? */ for (i = 0; i < p->nr_buffer_varients; i++) { struct translate_buffer_varient *varient = &p->buffer_varient[i]; - struct x86_reg buf_ptr = x86_make_disp(p->machine_EDX, + struct x86_reg buf_ptr = x86_make_disp(p->machine_EDI, get_offset(p, &varient->ptr)); - struct x86_reg buf_stride = x86_make_disp(p->machine_EDX, + struct x86_reg buf_stride = x86_make_disp(p->machine_EDI, get_offset(p, &p->buffer[varient->buffer_index].stride)); if (varient->instance_divisor == 0) { - x86_mov(p->func, p->tmp_EAX, buf_ptr); - x86_add(p->func, p->tmp_EAX, buf_stride); + x86_mov(p->func, p->tmp_EAX, buf_stride); + x64_rexw(p->func); + x86_add(p->func, p->tmp_EAX, buf_ptr); if (i == 0) sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, 192)); + x64_rexw(p->func); x86_mov(p->func, buf_ptr, p->tmp_EAX); } } } else { - x86_lea(p->func, p->idx_EBX, x86_make_disp(p->idx_EBX, 4)); + x86_lea(p->func, p->idx_ESI, x86_make_disp(p->idx_ESI, index_size)); } return TRUE; @@ -551,35 +1215,51 @@ static boolean incr_inputs( struct translate_sse *p, */ static boolean build_vertex_emit( struct translate_sse *p, struct x86_function *func, - boolean linear ) + unsigned index_size ) { int fixup, label; unsigned j; p->tmp_EAX 
= x86_make_reg(file_REG32, reg_AX); - p->idx_EBX = x86_make_reg(file_REG32, reg_BX); - p->outbuf_ECX = x86_make_reg(file_REG32, reg_CX); - p->machine_EDX = x86_make_reg(file_REG32, reg_DX); - p->count_ESI = x86_make_reg(file_REG32, reg_SI); + p->idx_ESI = x86_make_reg(file_REG32, reg_SI); + p->outbuf_EBX = x86_make_reg(file_REG32, reg_BX); + p->machine_EDI = x86_make_reg(file_REG32, reg_DI); + p->count_EBP = x86_make_reg(file_REG32, reg_BP); + p->tmp2_EDX = x86_make_reg(file_REG32, reg_DX); + p->tmp3_ECX = x86_make_reg(file_REG32, reg_CX); p->func = func; - p->loaded_inv_255 = FALSE; - p->loaded_255 = FALSE; + memset(&p->loaded_const, 0, sizeof(p->loaded_const)); p->loaded_identity = FALSE; x86_init_func(p->func); - /* Push a few regs? - */ - x86_push(p->func, p->idx_EBX); - x86_push(p->func, p->count_ESI); + if(x86_target(p->func) == X86_64_WIN64_ABI) + { + /* the ABI guarantees a 16-byte aligned 32-byte "shadow space" above the return address */ + sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8), x86_make_reg(file_XMM, 6)); + sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24), x86_make_reg(file_XMM, 7)); + } - /* Load arguments into regs: - */ - x86_mov(p->func, p->machine_EDX, x86_fn_arg(p->func, 1)); - x86_mov(p->func, p->idx_EBX, x86_fn_arg(p->func, 2)); - x86_mov(p->func, p->count_ESI, x86_fn_arg(p->func, 3)); - x86_mov(p->func, p->outbuf_ECX, x86_fn_arg(p->func, 5)); + x86_push(p->func, p->outbuf_EBX); + x86_push(p->func, p->count_EBP); + +/* on non-Win64 x86-64, these are already in the right registers */ + if(x86_target(p->func) != X86_64_STD_ABI) + { + x86_push(p->func, p->machine_EDI); + x86_push(p->func, p->idx_ESI); + + x86_mov(p->func, p->machine_EDI, x86_fn_arg(p->func, 1)); + x86_mov(p->func, p->idx_ESI, x86_fn_arg(p->func, 2)); + } + + x86_mov(p->func, p->count_EBP, x86_fn_arg(p->func, 3)); + + if(x86_target(p->func) != X86_32) + x64_mov64(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 5)); + else + 
x86_mov(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 5)); /* Load instance ID. */ @@ -588,25 +1268,25 @@ static boolean build_vertex_emit( struct translate_sse *p, p->tmp_EAX, x86_fn_arg(p->func, 4)); x86_mov(p->func, - x86_make_disp(p->machine_EDX, get_offset(p, &p->instance_id)), + x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id)), p->tmp_EAX); } /* Get vertex count, compare to zero */ x86_xor(p->func, p->tmp_EAX, p->tmp_EAX); - x86_cmp(p->func, p->count_ESI, p->tmp_EAX); + x86_cmp(p->func, p->count_EBP, p->tmp_EAX); fixup = x86_jcc_forward(p->func, cc_E); /* always load, needed or not: */ - init_inputs(p, linear); + init_inputs(p, index_size); /* Note address for loop jump */ label = x86_get_label(p->func); { - struct x86_reg elt = linear ? p->idx_EBX : x86_deref(p->idx_EBX); + struct x86_reg elt = !index_size ? p->idx_ESI : x86_deref(p->idx_ESI); int last_varient = -1; struct x86_reg vb; @@ -618,30 +1298,31 @@ static boolean build_vertex_emit( struct translate_sse *p, */ if (varient != last_varient) { last_varient = varient; - vb = get_buffer_ptr(p, linear, varient, elt); + vb = get_buffer_ptr(p, index_size, varient, elt); } if (!translate_attr( p, a, x86_make_disp(vb, a->input_offset), - x86_make_disp(p->outbuf_ECX, a->output_offset))) + x86_make_disp(p->outbuf_EBX, a->output_offset))) return FALSE; } /* Next output vertex: */ + x64_rexw(p->func); x86_lea(p->func, - p->outbuf_ECX, - x86_make_disp(p->outbuf_ECX, + p->outbuf_EBX, + x86_make_disp(p->outbuf_EBX, p->translate.key.output_stride)); /* Incr index */ - incr_inputs( p, linear ); + incr_inputs( p, index_size ); } /* decr count, loop if not zero */ - x86_dec(p->func, p->count_ESI); + x86_dec(p->func, p->count_EBP); x86_jcc(p->func, cc_NZ, label); /* Exit mmx state? 
@@ -656,8 +1337,20 @@ static boolean build_vertex_emit( struct translate_sse *p, /* Pop regs and return */ - x86_pop(p->func, p->count_ESI); - x86_pop(p->func, p->idx_EBX); + if(x86_target(p->func) != X86_64_STD_ABI) + { + x86_pop(p->func, p->idx_ESI); + x86_pop(p->func, p->machine_EDI); + } + + x86_pop(p->func, p->count_EBP); + x86_pop(p->func, p->outbuf_EBX); + + if(x86_target(p->func) == X86_64_WIN64_ABI) + { + sse2_movdqa(p->func, x86_make_reg(file_XMM, 6), x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8)); + sse2_movdqa(p->func, x86_make_reg(file_XMM, 7), x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24)); + } x86_ret(p->func); return TRUE; @@ -700,43 +1393,14 @@ static void translate_sse_release( struct translate *translate ) FREE(p); } -static void PIPE_CDECL translate_sse_run_elts( struct translate *translate, - const unsigned *elts, - unsigned count, - unsigned instance_id, - void *output_buffer ) -{ - struct translate_sse *p = (struct translate_sse *)translate; - - p->gen_run_elts( translate, - elts, - count, - instance_id, - output_buffer); -} - -static void PIPE_CDECL translate_sse_run( struct translate *translate, - unsigned start, - unsigned count, - unsigned instance_id, - void *output_buffer ) -{ - struct translate_sse *p = (struct translate_sse *)translate; - - p->gen_run( translate, - start, - count, - instance_id, - output_buffer); -} - struct translate *translate_sse2_create( const struct translate_key *key ) { struct translate_sse *p = NULL; unsigned i; - if (!rtasm_cpu_has_sse() || !rtasm_cpu_has_sse2()) + /* this is misnamed, it actually refers to whether rtasm is enabled or not */ + if (!rtasm_cpu_has_sse()) goto fail; p = CALLOC_STRUCT( translate_sse ); @@ -746,8 +1410,6 @@ struct translate *translate_sse2_create( const struct translate_key *key ) p->translate.key = *key; p->translate.release = translate_sse_release; p->translate.set_buffer = translate_sse_set_buffer; - p->translate.run_elts = translate_sse_run_elts; - p->translate.run 
= translate_sse_run; for (i = 0; i < key->nr_elements; i++) { if (key->element[i].type == TRANSLATE_ELEMENT_NORMAL) { @@ -783,18 +1445,32 @@ struct translate *translate_sse2_create( const struct translate_key *key ) if (0) debug_printf("nr_buffers: %d\n", p->nr_buffers); - if (!build_vertex_emit(p, &p->linear_func, TRUE)) + if (!build_vertex_emit(p, &p->linear_func, 0)) + goto fail; + + if (!build_vertex_emit(p, &p->elt_func, 4)) + goto fail; + + if (!build_vertex_emit(p, &p->elt16_func, 2)) + goto fail; + + if (!build_vertex_emit(p, &p->elt8_func, 1)) + goto fail; + + p->translate.run = (void*)x86_get_func(&p->linear_func); + if (p->translate.run == NULL) goto fail; - if (!build_vertex_emit(p, &p->elt_func, FALSE)) + p->translate.run_elts = (void*)x86_get_func(&p->elt_func); + if (p->translate.run_elts == NULL) goto fail; - p->gen_run = (run_func)x86_get_func(&p->linear_func); - if (p->gen_run == NULL) + p->translate.run_elts16 = (void*)x86_get_func(&p->elt16_func); + if (p->translate.run_elts16 == NULL) goto fail; - p->gen_run_elts = (run_elts_func)x86_get_func(&p->elt_func); - if (p->gen_run_elts == NULL) + p->translate.run_elts8 = (void*)x86_get_func(&p->elt8_func); + if (p->translate.run_elts8 == NULL) goto fail; return &p->translate; diff --git a/src/gallium/auxiliary/util/u_cpu_detect.c b/src/gallium/auxiliary/util/u_cpu_detect.c index 6f38d22285..b9b9f9257a 100644 --- a/src/gallium/auxiliary/util/u_cpu_detect.c +++ b/src/gallium/auxiliary/util/u_cpu_detect.c @@ -73,7 +73,7 @@ #endif -DEBUG_GET_ONCE_BOOL_OPTION(dump_cpu, "GALLIUM_DUMP_CPU", TRUE); +DEBUG_GET_ONCE_BOOL_OPTION(dump_cpu, "GALLIUM_DUMP_CPU", FALSE) struct util_cpu_caps util_cpu_caps; @@ -194,123 +194,8 @@ check_os_altivec_support(void) } #endif /* PIPE_ARCH_PPC */ -/* If we're running on a processor that can do SSE, let's see if we - * are allowed to or not. 
This will catch 2.4.0 or later kernels that - * haven't been configured for a Pentium III but are running on one, - * and RedHat patched 2.2 kernels that have broken exception handling - * support for user space apps that do SSE. - */ -#if defined(PIPE_ARCH_X86) || defined (PIPE_ARCH_X86_64) -static void -check_os_katmai_support(void) -{ -#if defined(PIPE_ARCH_X86) -#if defined(PIPE_OS_FREEBSD) - int has_sse=0, ret; - int len = sizeof (has_sse); - - ret = sysctlbyname("hw.instruction_sse", &has_sse, &len, NULL, 0); - if (ret || !has_sse) - util_cpu_caps.has_sse=0; - -#elif defined(PIPE_OS_NETBSD) || defined(PIPE_OS_OPENBSD) - int has_sse, has_sse2, ret, mib[2]; - int varlen; - - mib[0] = CTL_MACHDEP; - mib[1] = CPU_SSE; - varlen = sizeof (has_sse); - - ret = sysctl(mib, 2, &has_sse, &varlen, NULL, 0); - if (ret < 0 || !has_sse) { - util_cpu_caps.has_sse = 0; - } else { - util_cpu_caps.has_sse = 1; - } - - mib[1] = CPU_SSE2; - varlen = sizeof (has_sse2); - ret = sysctl(mib, 2, &has_sse2, &varlen, NULL, 0); - if (ret < 0 || !has_sse2) { - util_cpu_caps.has_sse2 = 0; - } else { - util_cpu_caps.has_sse2 = 1; - } - util_cpu_caps.has_sse = 0; /* FIXME ?!?!? */ - -#elif defined(PIPE_OS_WINDOWS) - LPTOP_LEVEL_EXCEPTION_FILTER exc_fil; - if (util_cpu_caps.has_sse) { - exc_fil = SetUnhandledExceptionFilter(win32_sig_handler_sse); -#if defined(PIPE_CC_GCC) - __asm __volatile ("xorps %xmm0, %xmm0"); -#elif defined(PIPE_CC_MSVC) - __asm { - xorps xmm0, xmm0 /* executing SSE instruction */ - } -#else -#error Unsupported compiler -#endif - SetUnhandledExceptionFilter(exc_fil); - } -#elif defined(PIPE_OS_LINUX) - struct sigaction saved_sigill; - struct sigaction saved_sigfpe; - - /* Save the original signal handlers. - */ - sigaction(SIGILL, NULL, &saved_sigill); - sigaction(SIGFPE, NULL, &saved_sigfpe); - - signal(SIGILL, (void (*)(int))sigill_handler_sse); - signal(SIGFPE, (void (*)(int))sigfpe_handler_sse); - - /* Emulate test for OSFXSR in CR4. 
The OS will set this bit if it - * supports the extended FPU save and restore required for SSE. If - * we execute an SSE instruction on a PIII and get a SIGILL, the OS - * doesn't support Streaming SIMD Exceptions, even if the processor - * does. - */ - if (util_cpu_caps.has_sse) { - __asm __volatile ("xorps %xmm1, %xmm0"); - } - - /* Emulate test for OSXMMEXCPT in CR4. The OS will set this bit if - * it supports unmasked SIMD FPU exceptions. If we unmask the - * exceptions, do a SIMD divide-by-zero and get a SIGILL, the OS - * doesn't support unmasked SIMD FPU exceptions. If we get a SIGFPE - * as expected, we're okay but we need to clean up after it. - * - * Are we being too stringent in our requirement that the OS support - * unmasked exceptions? Certain RedHat 2.2 kernels enable SSE by - * setting CR4.OSFXSR but don't support unmasked exceptions. Win98 - * doesn't even support them. We at least know the user-space SSE - * support is good in kernels that do support unmasked exceptions, - * and therefore to be safe I'm going to leave this test in here. - */ - if (util_cpu_caps.has_sse) { - /* test_os_katmai_exception_support(); */ - } - - /* Restore the original signal handlers. - */ - sigaction(SIGILL, &saved_sigill, NULL); - sigaction(SIGFPE, &saved_sigfpe, NULL); - -#else - /* We can't use POSIX signal handling to test the availability of - * SSE, so we disable it by default. 
- */ - util_cpu_caps.has_sse = 0; -#endif /* __linux__ */ -#endif - -#if defined(PIPE_ARCH_X86_64) - util_cpu_caps.has_sse = 1; -#endif -} - +#if defined(PIPE_ARCH_X86) || defined (PIPE_ARCH_X86_64) static int has_cpuid(void) { #if defined(PIPE_ARCH_X86) @@ -391,23 +276,6 @@ util_cpu_detect(void) memset(&util_cpu_caps, 0, sizeof util_cpu_caps); - /* Check for arch type */ -#if defined(PIPE_ARCH_MIPS) - util_cpu_caps.arch = UTIL_CPU_ARCH_MIPS; -#elif defined(PIPE_ARCH_ALPHA) - util_cpu_caps.arch = UTIL_CPU_ARCH_ALPHA; -#elif defined(PIPE_ARCH_SPARC) - util_cpu_caps.arch = UTIL_CPU_ARCH_SPARC; -#elif defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) - util_cpu_caps.arch = UTIL_CPU_ARCH_X86; - util_cpu_caps.little_endian = 1; -#elif defined(PIPE_ARCH_PPC) - util_cpu_caps.arch = UTIL_CPU_ARCH_POWERPC; - util_cpu_caps.little_endian = 0; -#else - util_cpu_caps.arch = UTIL_CPU_ARCH_UNKNOWN; -#endif - /* Count the number of CPUs in system */ #if defined(PIPE_OS_WINDOWS) { @@ -486,9 +354,6 @@ util_cpu_detect(void) util_cpu_caps.cacheline = regs2[2] & 0xFF; } - if (util_cpu_caps.has_sse) - check_os_katmai_support(); - if (!util_cpu_caps.has_sse) { util_cpu_caps.has_sse2 = 0; util_cpu_caps.has_sse3 = 0; @@ -504,7 +369,6 @@ util_cpu_detect(void) #ifdef DEBUG if (debug_get_option_dump_cpu()) { - debug_printf("util_cpu_caps.arch = %i\n", util_cpu_caps.arch); debug_printf("util_cpu_caps.nr_cpus = %u\n", util_cpu_caps.nr_cpus); debug_printf("util_cpu_caps.x86_cpu_type = %u\n", util_cpu_caps.x86_cpu_type); diff --git a/src/gallium/auxiliary/util/u_cpu_detect.h b/src/gallium/auxiliary/util/u_cpu_detect.h index 4b3dc39c34..f3bef0993c 100644 --- a/src/gallium/auxiliary/util/u_cpu_detect.h +++ b/src/gallium/auxiliary/util/u_cpu_detect.h @@ -36,26 +36,15 @@ #define _UTIL_CPU_DETECT_H #include "pipe/p_compiler.h" - -enum util_cpu_arch { - UTIL_CPU_ARCH_UNKNOWN = 0, - UTIL_CPU_ARCH_MIPS, - UTIL_CPU_ARCH_ALPHA, - UTIL_CPU_ARCH_SPARC, - UTIL_CPU_ARCH_X86, - UTIL_CPU_ARCH_POWERPC -}; 
+#include "pipe/p_config.h" struct util_cpu_caps { - enum util_cpu_arch arch; unsigned nr_cpus; /* Feature flags */ int x86_cpu_type; unsigned cacheline; - unsigned little_endian:1; - unsigned has_tsc:1; unsigned has_mmx:1; unsigned has_mmx2:1; diff --git a/src/gallium/auxiliary/util/u_debug.c b/src/gallium/auxiliary/util/u_debug.c index ad162558bc..504e6d2a18 100644 --- a/src/gallium/auxiliary/util/u_debug.c +++ b/src/gallium/auxiliary/util/u_debug.c @@ -88,7 +88,7 @@ debug_get_option_should_print(void) * but its cool since we set first to false */ first = FALSE; - value = debug_get_bool_option("GALLIUM_PRINT_OPTIONS", TRUE); + value = debug_get_bool_option("GALLIUM_PRINT_OPTIONS", FALSE); /* XXX should we print this option? Currently it wont */ return value; } diff --git a/src/gallium/auxiliary/util/u_format.h b/src/gallium/auxiliary/util/u_format.h index 38254b1096..8e786a390a 100644 --- a/src/gallium/auxiliary/util/u_format.h +++ b/src/gallium/auxiliary/util/u_format.h @@ -631,6 +631,44 @@ util_format_has_alpha(enum pipe_format format) } /** + * Return the matching SRGB format, or PIPE_FORMAT_NONE if none. 
+ */ +static INLINE enum pipe_format +util_format_srgb(enum pipe_format format) +{ + switch (format) { + case PIPE_FORMAT_L8_UNORM: + return PIPE_FORMAT_L8_SRGB; + case PIPE_FORMAT_L8A8_UNORM: + return PIPE_FORMAT_L8A8_SRGB; + case PIPE_FORMAT_R8G8B8_UNORM: + return PIPE_FORMAT_R8G8B8_SRGB; + case PIPE_FORMAT_A8B8G8R8_UNORM: + return PIPE_FORMAT_A8B8G8R8_SRGB; + case PIPE_FORMAT_X8B8G8R8_UNORM: + return PIPE_FORMAT_X8B8G8R8_SRGB; + case PIPE_FORMAT_B8G8R8A8_UNORM: + return PIPE_FORMAT_B8G8R8A8_SRGB; + case PIPE_FORMAT_B8G8R8X8_UNORM: + return PIPE_FORMAT_B8G8R8X8_SRGB; + case PIPE_FORMAT_A8R8G8B8_UNORM: + return PIPE_FORMAT_A8R8G8B8_SRGB; + case PIPE_FORMAT_X8R8G8B8_UNORM: + return PIPE_FORMAT_X8R8G8B8_SRGB; + case PIPE_FORMAT_DXT1_RGB: + return PIPE_FORMAT_DXT1_SRGB; + case PIPE_FORMAT_DXT1_RGBA: + return PIPE_FORMAT_DXT1_SRGBA; + case PIPE_FORMAT_DXT3_RGBA: + return PIPE_FORMAT_DXT3_SRGBA; + case PIPE_FORMAT_DXT5_RGBA: + return PIPE_FORMAT_DXT5_SRGBA; + default: + return PIPE_FORMAT_NONE; + } +} + +/** * Return the number of components stored. * Formats with block size != 1x1 will always have 1 component (the block). 
*/ diff --git a/src/gallium/auxiliary/util/u_framebuffer.c b/src/gallium/auxiliary/util/u_framebuffer.c index 768ae9ceb5..7803ec6a8b 100644 --- a/src/gallium/auxiliary/util/u_framebuffer.c +++ b/src/gallium/auxiliary/util/u_framebuffer.c @@ -85,9 +85,11 @@ util_copy_framebuffer_state(struct pipe_framebuffer_state *dst, dst->width = src->width; dst->height = src->height; - for (i = 0; i < Elements(src->cbufs); i++) { + for (i = 0; i < src->nr_cbufs; i++) pipe_surface_reference(&dst->cbufs[i], src->cbufs[i]); - } + + for (i = src->nr_cbufs; i < dst->nr_cbufs; i++) + pipe_surface_reference(&dst->cbufs[i], NULL); dst->nr_cbufs = src->nr_cbufs; diff --git a/src/gallium/auxiliary/util/u_split_prim.h b/src/gallium/auxiliary/util/u_split_prim.h new file mode 100644 index 0000000000..206e1ec311 --- /dev/null +++ b/src/gallium/auxiliary/util/u_split_prim.h @@ -0,0 +1,105 @@ +/* Originally written by Ben Skeggs for the nv50 driver*/ +#include <pipe/p_defines.h> + +struct util_split_prim { + void *priv; + void (*emit)(void *priv, unsigned start, unsigned count); + void (*edge)(void *priv, boolean enabled); + + unsigned mode; + unsigned start; + unsigned p_start; + unsigned p_end; + + uint repeat_first:1; + uint close_first:1; + uint edgeflag_off:1; +}; + +static INLINE void +util_split_prim_init(struct util_split_prim *s, + unsigned mode, unsigned start, unsigned count) +{ + if (mode == PIPE_PRIM_LINE_LOOP) { + s->mode = PIPE_PRIM_LINE_STRIP; + s->close_first = 1; + } else { + s->mode = mode; + s->close_first = 0; + } + s->start = start; + s->p_start = start; + s->p_end = start + count; + s->edgeflag_off = 0; + s->repeat_first = 0; +} + +static INLINE boolean +util_split_prim_next(struct util_split_prim *s, unsigned max_verts) +{ + int repeat = 0; + + if (s->repeat_first) { + s->emit(s->priv, s->start, 1); + max_verts--; + if (s->edgeflag_off) { + s->edge(s->priv, TRUE); + s->edgeflag_off = FALSE; + } + } + + if (s->p_start + s->close_first + max_verts >= s->p_end) { + 
s->emit(s->priv, s->p_start, s->p_end - s->p_start); + if (s->close_first) + s->emit(s->priv, s->start, 1); + return TRUE; + } + + switch (s->mode) { + case PIPE_PRIM_LINES: + max_verts &= ~1; + break; + case PIPE_PRIM_LINE_STRIP: + repeat = 1; + break; + case PIPE_PRIM_POLYGON: + max_verts--; + s->emit(s->priv, s->p_start, max_verts); + s->edge(s->priv, FALSE); + s->emit(s->priv, s->p_start + max_verts, 1); + s->p_start += max_verts; + s->repeat_first = TRUE; + s->edgeflag_off = TRUE; + return FALSE; + case PIPE_PRIM_TRIANGLES: + max_verts = max_verts - (max_verts % 3); + break; + case PIPE_PRIM_TRIANGLE_STRIP: + /* to ensure winding stays correct, always split + * on an even number of generated triangles + */ + max_verts = max_verts & ~1; + repeat = 2; + break; + case PIPE_PRIM_TRIANGLE_FAN: + s->repeat_first = TRUE; + repeat = 1; + break; + case PIPE_PRIM_QUADS: + max_verts &= ~3; + break; + case PIPE_PRIM_QUAD_STRIP: + max_verts &= ~1; + repeat = 2; + break; + case PIPE_PRIM_POINTS: + break; + default: + /* TODO: implement adjacency primitives */ + assert(0); + } + + s->emit (s->priv, s->p_start, max_verts); + s->p_start += (max_verts - repeat); + return FALSE; +} diff --git a/src/gallium/auxiliary/util/u_sse.h b/src/gallium/auxiliary/util/u_sse.h index 6145e34aa3..87959ab0aa 100644 --- a/src/gallium/auxiliary/util/u_sse.h +++ b/src/gallium/auxiliary/util/u_sse.h @@ -71,6 +71,35 @@ _mm_castps_si128(__m128 a) #endif /* defined(_MSC_VER) && _MSC_VER < 1500 */ + +#if defined(PIPE_ARCH_SSSE3) + +#include <tmmintrin.h> + +#else /* !PIPE_ARCH_SSSE3 */ + +#include <emmintrin.h> + +/** + * Describe _mm_shuffle_epi8() with gcc extended inline assembly, for cases + * where -mssse3 is not supported/enabled. + * + * MSVC will never get in here as its intrinsics support do not rely on + * compiler command line options. 
+ */ +static __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shuffle_epi8(__m128i a, __m128i mask) +{ + __m128i result; + __asm__("pshufb %1, %0" + : "=x" (result) + : "xm" (mask), "0" (a)); + return result; +} + +#endif /* !PIPE_ARCH_SSSE3 */ + + #endif /* PIPE_ARCH_X86 || PIPE_ARCH_X86_64 */ #endif /* U_SSE_H_ */ diff --git a/src/gallium/auxiliary/util/u_staging.c b/src/gallium/auxiliary/util/u_staging.c new file mode 100644 index 0000000000..607c31f5ee --- /dev/null +++ b/src/gallium/auxiliary/util/u_staging.c @@ -0,0 +1,95 @@ +#include "util/u_staging.h" +#include "pipe/p_context.h" +#include "util/u_memory.h" +#include "util/u_inlines.h" + +static void +util_staging_resource_template(struct pipe_resource *pt, unsigned width, unsigned height, unsigned depth, struct pipe_resource *template) +{ + memset(template, 0, sizeof(struct pipe_resource)); + if(pt->target != PIPE_BUFFER && depth <= 1) + template->target = PIPE_TEXTURE_2D; + else + template->target = pt->target; + template->format = pt->format; + template->width0 = width; + template->height0 = height; + template->depth0 = depth; + template->last_level = 0; + template->nr_samples = pt->nr_samples; + template->bind = 0; + template->usage = PIPE_USAGE_STAGING; + template->flags = 0; +} + +struct util_staging_transfer * +util_staging_transfer_new(struct pipe_context *pipe, + struct pipe_resource *pt, + struct pipe_subresource sr, + unsigned usage, + const struct pipe_box *box, + bool direct) +{ + struct pipe_screen *pscreen = pipe->screen; + struct util_staging_transfer *tx; + struct pipe_resource staging_resource_template; + + tx = CALLOC_STRUCT(util_staging_transfer); + if (!tx) + return NULL; + + pipe_resource_reference(&tx->base.resource, pt); + tx->base.sr = sr; + tx->base.usage = usage; + tx->base.box = *box; + + if (direct) + { + tx->staging_resource = pt; + return tx; + } + + util_staging_resource_template(pt, box->width, box->height, box->depth, 
&staging_resource_template); + tx->staging_resource = pscreen->resource_create(pscreen, &staging_resource_template); + if (!tx->staging_resource) + { + pipe_resource_reference(&tx->base.resource, NULL); + FREE(tx); + return NULL; + } + + if (usage & PIPE_TRANSFER_READ) + { + struct pipe_subresource dstsr; + unsigned zi; + dstsr.face = 0; + dstsr.level = 0; + for(zi = 0; zi < box->depth; ++zi) + pipe->resource_copy_region(pipe, tx->staging_resource, dstsr, 0, 0, 0, tx->base.resource, sr, box->x, box->y, box->z + zi, box->width, box->height); + } + + return tx; +} + +void +util_staging_transfer_destroy(struct pipe_context *pipe, struct pipe_transfer *ptx) +{ + struct util_staging_transfer *tx = (struct util_staging_transfer *)ptx; + + if (tx->staging_resource != tx->base.resource) + { + if(tx->base.usage & PIPE_TRANSFER_WRITE) { + struct pipe_subresource srcsr; + unsigned zi; + srcsr.face = 0; + srcsr.level = 0; + for(zi = 0; zi < tx->base.box.depth; ++zi) + pipe->resource_copy_region(pipe, tx->base.resource, tx->base.sr, tx->base.box.x, tx->base.box.y, tx->base.box.z + zi, tx->staging_resource, srcsr, 0, 0, 0, tx->base.box.width, tx->base.box.height); + } + + pipe_resource_reference(&tx->staging_resource, NULL); + } + + pipe_resource_reference(&ptx->resource, NULL); + FREE(ptx); +} diff --git a/src/gallium/auxiliary/util/u_staging.h b/src/gallium/auxiliary/util/u_staging.h new file mode 100644 index 0000000000..602faa2971 --- /dev/null +++ b/src/gallium/auxiliary/util/u_staging.h @@ -0,0 +1,37 @@ +/* Direct3D 10/11 has no concept of transfers. Applications instead + * create resources with a STAGING or DYNAMIC usage, copy between them + * and the real resource and use Map to map the STAGING/DYNAMIC resource. + * + * This util module allows to implement Gallium drivers as a Direct3D + * driver would be implemented: transfers allocate a resource with + * PIPE_USAGE_STAGING, and copy the data between it and the real resource + * with resource_copy_region. 
+ */ + +#ifndef U_STAGING_H +#define U_STAGING_H + +#include "pipe/p_state.h" + +struct util_staging_transfer { + struct pipe_transfer base; + + /* if direct, same as base.resource, otherwise the temporary staging resource */ + struct pipe_resource *staging_resource; +}; + +/* user must be stride, slice_stride and offset */ +/* pt->usage == PIPE_USAGE_DYNAMIC should be a good value to pass for direct */ +/* staging resource is currently created with PIPE_USAGE_DYNAMIC */ +struct util_staging_transfer * +util_staging_transfer_new(struct pipe_context *pipe, + struct pipe_resource *pt, + struct pipe_subresource sr, + unsigned usage, + const struct pipe_box *box, + bool direct); + +void +util_staging_transfer_destroy(struct pipe_context *pipe, struct pipe_transfer *ptx); + +#endif diff --git a/src/gallium/auxiliary/util/u_surfaces.c b/src/gallium/auxiliary/util/u_surfaces.c index b5d21570d5..7733ad24d0 100644 --- a/src/gallium/auxiliary/util/u_surfaces.c +++ b/src/gallium/auxiliary/util/u_surfaces.c @@ -3,40 +3,22 @@ #include "util/u_inlines.h" #include "util/u_memory.h" -/* TODO: ouch, util_hash_table should do these by default when passed a null function pointer - * this indirect function call is quite bad - */ -static unsigned -hash(void *key) -{ - return (unsigned)(uintptr_t)key; -} - -static int -compare(void *key1, void *key2) -{ - return (unsigned)(uintptr_t)key1 - (unsigned)(uintptr_t)key2; -} - struct pipe_surface * util_surfaces_do_get(struct util_surfaces *us, unsigned surface_struct_size, struct pipe_screen *pscreen, struct pipe_resource *pt, unsigned face, unsigned level, unsigned zslice, unsigned flags) { struct pipe_surface *ps; - void *key = NULL; if(pt->target == PIPE_TEXTURE_3D || pt->target == PIPE_TEXTURE_CUBE) - { /* or 2D array */ - if(!us->u.table) - us->u.table = util_hash_table_create(hash, compare); - key = (void *)(uintptr_t)(((zslice + face) << 8) | level); - /* TODO: ouch, should have a get-reference function... 
- * also, shouldn't allocate a two-pointer structure for each item... */ - ps = util_hash_table_get(us->u.table, key); + { /* or 2D array */ + if(!us->u.hash) + us->u.hash = cso_hash_create(); + + ps = cso_hash_iter_data(cso_hash_find(us->u.hash, ((zslice + face) << 8) | level)); } else { if(!us->u.array) - us->u.array = CALLOC(pt->last_level + 1, sizeof(struct pipe_surface *)); + us->u.array = CALLOC(pt->last_level + 1, sizeof(struct pipe_surface *)); ps = us->u.array[level]; } @@ -54,7 +36,7 @@ util_surfaces_do_get(struct util_surfaces *us, unsigned surface_struct_size, str ps->offset = ~0; if(pt->target == PIPE_TEXTURE_3D || pt->target == PIPE_TEXTURE_CUBE) - util_hash_table_set(us->u.table, key, ps); + cso_hash_insert(us->u.hash, ((zslice + face) << 8) | level, ps); else us->u.array[level] = ps; @@ -66,47 +48,44 @@ util_surfaces_do_detach(struct util_surfaces *us, struct pipe_surface *ps) { struct pipe_resource *pt = ps->texture; if(pt->target == PIPE_TEXTURE_3D || pt->target == PIPE_TEXTURE_CUBE) - { /* or 2D array */ - void* key = (void*)(uintptr_t)(((ps->zslice + ps->face) << 8) | ps->level); - util_hash_table_remove(us->u.table, key); + { /* or 2D array */ + cso_hash_erase(us->u.hash, cso_hash_find(us->u.hash, ((ps->zslice + ps->face) << 8) | ps->level)); } else us->u.array[ps->level] = 0; } -static enum pipe_error -util_surfaces_destroy_callback(void *key, void *value, void *data) -{ - void (*destroy_surface) (struct pipe_surface * ps) = data; - destroy_surface((struct pipe_surface *)value); - return PIPE_OK; -} - void util_surfaces_destroy(struct util_surfaces *us, struct pipe_resource *pt, void (*destroy_surface) (struct pipe_surface *)) { if(pt->target == PIPE_TEXTURE_3D || pt->target == PIPE_TEXTURE_CUBE) - { /* or 2D array */ - if(us->u.table) + { /* or 2D array */ + if(us->u.hash) { - util_hash_table_foreach(us->u.table, util_surfaces_destroy_callback, destroy_surface); - util_hash_table_destroy(us->u.table); - us->u.table = NULL; + struct 
cso_hash_iter iter; + iter = cso_hash_first_node(us->u.hash); + while (!cso_hash_iter_is_null(iter)) { + destroy_surface(cso_hash_iter_data(iter)); + iter = cso_hash_iter_next(iter); + } + + cso_hash_delete(us->u.hash); + us->u.hash = NULL; } } else { if(us->u.array) { - unsigned i; - for(i = 0; i < pt->last_level; ++i) - { - struct pipe_surface *ps = us->u.array[i]; - if(ps) - destroy_surface(ps); - } - FREE(us->u.array); - us->u.array = NULL; + unsigned i; + for(i = 0; i <= pt->last_level; ++i) + { + struct pipe_surface *ps = us->u.array[i]; + if(ps) + destroy_surface(ps); + } + FREE(us->u.array); + us->u.array = NULL; } } } diff --git a/src/gallium/auxiliary/util/u_surfaces.h b/src/gallium/auxiliary/util/u_surfaces.h index 0195bf5afb..af978c7057 100644 --- a/src/gallium/auxiliary/util/u_surfaces.h +++ b/src/gallium/auxiliary/util/u_surfaces.h @@ -4,15 +4,15 @@ #include "pipe/p_compiler.h" #include "pipe/p_state.h" #include "util/u_atomic.h" - -struct util_hash_table; +#include "cso_cache/cso_hash.h" struct util_surfaces { union { - struct util_hash_table *table; + struct cso_hash *hash; struct pipe_surface **array; + void* pv; } u; }; @@ -35,6 +35,18 @@ util_surfaces_get(struct util_surfaces *us, unsigned surface_struct_size, struct return util_surfaces_do_get(us, surface_struct_size, pscreen, pt, face, level, zslice, flags); } +static INLINE struct pipe_surface * +util_surfaces_peek(struct util_surfaces *us, struct pipe_resource *pt, unsigned face, unsigned level, unsigned zslice) +{ + if(!us->u.pv) + return 0; + + if(unlikely(pt->target == PIPE_TEXTURE_3D || pt->target == PIPE_TEXTURE_CUBE)) + return cso_hash_iter_data(cso_hash_find(us->u.hash, ((zslice + face) << 8) | level)); + else + return us->u.array[level]; +} + void util_surfaces_do_detach(struct util_surfaces *us, struct pipe_surface *ps); static INLINE void |