diff options
author | Eric Anholt <eric@anholt.net> | 2010-07-26 17:47:59 -0700 |
---|---|---|
committer | Eric Anholt <eric@anholt.net> | 2010-07-26 17:53:27 -0700 |
commit | afe125e0a18ac3886c45c7e6b02b122fb2d327b5 (patch) | |
tree | 78621707e71154c0b388b0baacffc26432b7e992 /src/gallium/drivers/llvmpipe | |
parent | d64343f1ae84979bd154475badf11af8a9bfc2eb (diff) | |
parent | 5403ca79b225605c79f49866a6497c97da53be3b (diff) |
Merge remote branch 'origin/master' into glsl2
This pulls in multiple i965 driver fixes which will help ensure better
testing coverage during development, and also gets past the conflicts
of the src/mesa/shader -> src/mesa/program move.
Conflicts:
src/mesa/Makefile
src/mesa/main/shaderapi.c
src/mesa/main/shaderobj.h
Diffstat (limited to 'src/gallium/drivers/llvmpipe')
48 files changed, 2334 insertions, 1186 deletions
diff --git a/src/gallium/drivers/llvmpipe/.gitignore b/src/gallium/drivers/llvmpipe/.gitignore index a1b6f56e0d..6ebd2b8a63 100644 --- a/src/gallium/drivers/llvmpipe/.gitignore +++ b/src/gallium/drivers/llvmpipe/.gitignore @@ -3,3 +3,5 @@ lp_test_blend lp_test_conv lp_test_format lp_test_printf +lp_test_round +lp_test_sincos diff --git a/src/gallium/drivers/llvmpipe/Makefile b/src/gallium/drivers/llvmpipe/Makefile index ee28179c30..2892b62920 100644 --- a/src/gallium/drivers/llvmpipe/Makefile +++ b/src/gallium/drivers/llvmpipe/Makefile @@ -18,6 +18,7 @@ C_SOURCES = \ lp_fence.c \ lp_flush.c \ lp_jit.c \ + lp_memory.c \ lp_perf.c \ lp_query.c \ lp_rast.c \ @@ -53,8 +54,12 @@ PROGS := lp_test_format \ lp_test_blend \ lp_test_conv \ lp_test_printf \ + lp_test_round \ lp_test_sincos +# Need this for the lp_test_*.o files +CLEAN_EXTRA = *.o + lp_test_sincos.o : sse_mathfun.h PROGS_DEPS := ../../auxiliary/libgallium.a diff --git a/src/gallium/drivers/llvmpipe/SConscript b/src/gallium/drivers/llvmpipe/SConscript index a1ef71da89..fd6ba1561e 100644 --- a/src/gallium/drivers/llvmpipe/SConscript +++ b/src/gallium/drivers/llvmpipe/SConscript @@ -1,3 +1,5 @@ +import distutils.version + Import('*') if not env['llvm']: @@ -23,6 +25,16 @@ env.Depends('lp_tile_soa.c', [ '#src/gallium/auxiliary/util/u_format_pack.py', ]) + +# Only enable SSSE3 for lp_tile_soa_sse3.c +ssse3_env = env.Clone() +if env['gcc'] \ + and distutils.version.LooseVersion(env['CCVERSION']) >= distutils.version.LooseVersion('4.3') \ + and env['machine'] in ('x86', 'x86_64') : + ssse3_env.Append(CCFLAGS = ['-mssse3']) +lp_tile_soa_os = ssse3_env.SharedObject('lp_tile_soa.c') + + llvmpipe = env.ConvenienceLibrary( target = 'llvmpipe', source = [ @@ -38,6 +50,7 @@ llvmpipe = env.ConvenienceLibrary( 'lp_fence.c', 'lp_flush.c', 'lp_jit.c', + 'lp_memory.c', 'lp_perf.c', 'lp_query.c', 'lp_rast.c', @@ -65,7 +78,7 @@ llvmpipe = env.ConvenienceLibrary( 'lp_tex_sample.c', 'lp_texture.c', 'lp_tile_image.c', - 'lp_tile_soa.c', + lp_tile_soa_os, ]) @@ -82,6 +95,9 @@ if env['platform'] != 'embedded': 'sincos', ] + if not msvc: + tests.append('round') + for test in tests: target = env.Program( target = 'lp_test_' + test, diff --git a/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c b/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c index 70d08e71f6..09e9833057 100644 --- a/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c +++ b/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c @@ -190,30 +190,27 @@ lp_build_blend_swizzle(struct lp_build_blend_aos_context *bld, enum lp_build_blend_swizzle rgb_swizzle, unsigned alpha_swizzle) { - if(rgb == alpha) { - if(rgb_swizzle == LP_BUILD_BLEND_SWIZZLE_RGBA) - return rgb; - if(rgb_swizzle == LP_BUILD_BLEND_SWIZZLE_AAAA) - return lp_build_broadcast_aos(&bld->base, rgb, alpha_swizzle); + LLVMValueRef swizzled_rgb; + + switch (rgb_swizzle) { + case LP_BUILD_BLEND_SWIZZLE_RGBA: + swizzled_rgb = rgb; + break; + case LP_BUILD_BLEND_SWIZZLE_AAAA: + swizzled_rgb = lp_build_broadcast_aos(&bld->base, rgb, alpha_swizzle); + break; + default: + assert(0); + swizzled_rgb = bld->base.undef; } - else { - if(rgb_swizzle == LP_BUILD_BLEND_SWIZZLE_RGBA) { - boolean cond[4] = {0, 0, 0, 0}; - cond[alpha_swizzle] = 1; - return lp_build_select_aos(&bld->base, alpha, rgb, cond); - } - if(rgb_swizzle == LP_BUILD_BLEND_SWIZZLE_AAAA) { - unsigned char swizzle[4]; - swizzle[0] = alpha_swizzle; - swizzle[1] = alpha_swizzle; - swizzle[2] = alpha_swizzle; - swizzle[3] = alpha_swizzle; - swizzle[alpha_swizzle] += 4; - return lp_build_swizzle2_aos(&bld->base, rgb, alpha, swizzle); - } + + if (rgb != alpha) { + boolean cond[4] = {0, 0, 0, 0}; + cond[alpha_swizzle] = 1; + swizzled_rgb = lp_build_select_aos(&bld->base, alpha, swizzled_rgb, cond); } - assert(0); - return bld->base.undef; + + return swizzled_rgb; } diff --git a/src/gallium/drivers/llvmpipe/lp_bld_interp.c b/src/gallium/drivers/llvmpipe/lp_bld_interp.c index 90d2b26f9f..78744da500 100644 --- a/src/gallium/drivers/llvmpipe/lp_bld_interp.c +++ b/src/gallium/drivers/llvmpipe/lp_bld_interp.c @@ -261,7 +261,7 @@ attribs_update(struct lp_build_interp_soa_context *bld, int quad_index) const unsigned interp = bld->interp[attrib]; for(chan = 0; chan < NUM_CHANNELS; ++chan) { if(mask & (1 << chan)) { - LLVMValueRef a = coeff_bld->undef; + LLVMValueRef a; if (interp == LP_INTERP_CONSTANT || interp == LP_INTERP_FACING) { a = bld->a[attrib][chan]; diff --git a/src/gallium/drivers/llvmpipe/lp_context.h b/src/gallium/drivers/llvmpipe/lp_context.h index 986e604ce7..b2643ab33c 100644 --- a/src/gallium/drivers/llvmpipe/lp_context.h +++ b/src/gallium/drivers/llvmpipe/lp_context.h @@ -83,6 +83,7 @@ struct llvmpipe_context { int so_count[PIPE_MAX_SO_BUFFERS]; int num_buffers; } so_target; + struct pipe_resource *mapped_vs_tex[PIPE_MAX_VERTEX_SAMPLERS]; unsigned num_samplers; unsigned num_fragment_sampler_views; diff --git a/src/gallium/drivers/llvmpipe/lp_draw_arrays.c b/src/gallium/drivers/llvmpipe/lp_draw_arrays.c index 98780d7631..625d0c8a8c 100644 --- a/src/gallium/drivers/llvmpipe/lp_draw_arrays.c +++ b/src/gallium/drivers/llvmpipe/lp_draw_arrays.c @@ -43,18 +43,23 @@ /** - * Draw vertex arrays, with optional indexing. + * Draw vertex arrays, with optional indexing, optional instancing. + * All the other drawing functions are implemented in terms of this function. * Basically, map the vertex buffers (and drawing surfaces), then hand off * the drawing to the 'draw' module. */ static void -llvmpipe_draw_range_elements(struct pipe_context *pipe, - struct pipe_resource *indexBuffer, - unsigned indexSize, - int indexBias, - unsigned min_index, - unsigned max_index, - unsigned mode, unsigned start, unsigned count) +llvmpipe_draw_range_elements_instanced(struct pipe_context *pipe, + struct pipe_resource *indexBuffer, + unsigned indexSize, + int indexBias, + unsigned minIndex, + unsigned maxIndex, + unsigned mode, + unsigned start, + unsigned count, + unsigned startInstance, + unsigned instanceCount) { struct llvmpipe_context *lp = llvmpipe_context(pipe); struct draw_context *draw = lp->draw; @@ -74,9 +79,11 @@ llvmpipe_draw_range_elements(struct pipe_context *pipe, /* Map index buffer, if present */ if (indexBuffer) { void *mapped_indexes = llvmpipe_resource_data(indexBuffer); - draw_set_mapped_element_buffer_range(draw, indexSize, indexBias, - min_index, - max_index, + draw_set_mapped_element_buffer_range(draw, + indexSize, + indexBias, + minIndex, + maxIndex, mapped_indexes); } else { @@ -84,9 +91,13 @@ llvmpipe_draw_range_elements(struct pipe_context *pipe, draw_set_mapped_element_buffer_range(draw, 0, 0, start, start + count - 1, NULL); } + llvmpipe_prepare_vertex_sampling(lp, + lp->num_vertex_sampler_views, + lp->vertex_sampler_views); /* draw! */ - draw_arrays(draw, mode, start, count); + draw_arrays_instanced(draw, mode, start, count, + startInstance, instanceCount); /* * unmap vertex/index buffers @@ -97,6 +108,7 @@ llvmpipe_draw_range_elements(struct pipe_context *pipe, if (indexBuffer) { draw_set_mapped_element_buffer(draw, 0, 0, NULL); } + llvmpipe_cleanup_vertex_sampling(lp); /* * TODO: Flush only when a user vertex/index buffer is present @@ -108,24 +120,102 @@ llvmpipe_draw_range_elements(struct pipe_context *pipe, static void +llvmpipe_draw_arrays_instanced(struct pipe_context *pipe, + unsigned mode, + unsigned start, + unsigned count, + unsigned startInstance, + unsigned instanceCount) +{ + llvmpipe_draw_range_elements_instanced(pipe, + NULL, /* no indexBuffer */ + 0, 0, /* indexSize, indexBias */ + 0, ~0, /* minIndex, maxIndex */ + mode, + start, + count, + startInstance, + instanceCount); +} + + +static void +llvmpipe_draw_elements_instanced(struct pipe_context *pipe, + struct pipe_resource *indexBuffer, + unsigned indexSize, + int indexBias, + unsigned mode, + unsigned start, + unsigned count, + unsigned startInstance, + unsigned instanceCount) +{ + llvmpipe_draw_range_elements_instanced(pipe, + indexBuffer, + indexSize, indexBias, + 0, ~0, /* minIndex, maxIndex */ + mode, + start, + count, + startInstance, + instanceCount); +} + + +static void llvmpipe_draw_elements(struct pipe_context *pipe, struct pipe_resource *indexBuffer, unsigned indexSize, int indexBias, - unsigned mode, unsigned start, unsigned count) + unsigned mode, + unsigned start, + unsigned count) +{ + llvmpipe_draw_range_elements_instanced(pipe, + indexBuffer, + indexSize, indexBias, + 0, 0xffffffff, /* min, maxIndex */ + mode, start, count, + 0, /* startInstance */ + 1); /* instanceCount */ +} + + +static void +llvmpipe_draw_range_elements(struct pipe_context *pipe, + struct pipe_resource *indexBuffer, + unsigned indexSize, + int indexBias, + unsigned min_index, + unsigned max_index, + unsigned mode, + unsigned start, + unsigned count) { - llvmpipe_draw_range_elements( pipe, indexBuffer, - indexSize, indexBias, - 0, 0xffffffff, - mode, start, count ); + llvmpipe_draw_range_elements_instanced(pipe, + indexBuffer, + indexSize, indexBias, + min_index, max_index, + mode, start, count, + 0, /* startInstance */ + 1); /* instanceCount */ } static void -llvmpipe_draw_arrays(struct pipe_context *pipe, unsigned mode, - unsigned start, unsigned count) +llvmpipe_draw_arrays(struct pipe_context *pipe, + unsigned mode, + unsigned start, + unsigned count) { - llvmpipe_draw_elements(pipe, NULL, 0, 0, mode, start, count); + llvmpipe_draw_range_elements_instanced(pipe, + NULL, /* indexBuffer */ + 0, /* indexSize */ + 0, /* indexBias */ + 0, ~0, /* min, maxIndex */ + mode, start, count, + 0, /* startInstance */ + 1); /* instanceCount */ } @@ -135,4 +225,6 @@ llvmpipe_init_draw_funcs(struct llvmpipe_context *llvmpipe) llvmpipe->pipe.draw_arrays = llvmpipe_draw_arrays; llvmpipe->pipe.draw_elements = llvmpipe_draw_elements; llvmpipe->pipe.draw_range_elements = llvmpipe_draw_range_elements; + llvmpipe->pipe.draw_arrays_instanced = llvmpipe_draw_arrays_instanced; + llvmpipe->pipe.draw_elements_instanced = llvmpipe_draw_elements_instanced; } diff --git a/src/gallium/drivers/llvmpipe/lp_fence.c b/src/gallium/drivers/llvmpipe/lp_fence.c index 75d8d2b825..f9805e5d68 100644 --- a/src/gallium/drivers/llvmpipe/lp_fence.c +++ b/src/gallium/drivers/llvmpipe/lp_fence.c @@ -28,7 +28,6 @@ #include "pipe/p_screen.h" #include "util/u_memory.h" -#include "util/u_inlines.h" #include "lp_debug.h" #include "lp_fence.h" @@ -59,7 +58,7 @@ lp_fence_create(unsigned rank) /** Destroy a fence. Called when refcount hits zero. */ -static void +void lp_fence_destroy(struct lp_fence *fence) { pipe_mutex_destroy(fence->mutex); @@ -77,12 +76,10 @@ llvmpipe_fence_reference(struct pipe_screen *screen, struct pipe_fence_handle **ptr, struct pipe_fence_handle *fence) { - struct lp_fence *old = (struct lp_fence *) *ptr; + struct lp_fence **old = (struct lp_fence **) ptr; struct lp_fence *f = (struct lp_fence *) fence; - if (pipe_reference(&old->reference, &f->reference)) { - lp_fence_destroy(old); - } + lp_fence_reference(old, f); } diff --git a/src/gallium/drivers/llvmpipe/lp_fence.h b/src/gallium/drivers/llvmpipe/lp_fence.h index d9270f5784..13358fb99f 100644 --- a/src/gallium/drivers/llvmpipe/lp_fence.h +++ b/src/gallium/drivers/llvmpipe/lp_fence.h @@ -32,6 +32,7 @@ #include "os/os_thread.h" #include "pipe/p_state.h" +#include "util/u_inlines.h" struct pipe_screen; @@ -61,4 +62,21 @@ void llvmpipe_init_screen_fence_funcs(struct pipe_screen *screen); +void +lp_fence_destroy(struct lp_fence *fence); + +static INLINE void +lp_fence_reference(struct lp_fence **ptr, + struct lp_fence *f) +{ + struct lp_fence *old = *ptr; + + if (pipe_reference(&old->reference, &f->reference)) { + lp_fence_destroy(old); + } + + *ptr = f; +} + + #endif /* LP_FENCE_H */ diff --git a/src/gallium/drivers/llvmpipe/lp_flush.c b/src/gallium/drivers/llvmpipe/lp_flush.c index 0cd288bb73..845292f4ab 100644 --- a/src/gallium/drivers/llvmpipe/lp_flush.c +++ b/src/gallium/drivers/llvmpipe/lp_flush.c @@ -40,27 +40,19 @@ /** * \param flags bitmask of PIPE_FLUSH_x flags - * \param fence if non-null, returns pointer to a fench which can be waited on + * \param fence if non-null, returns pointer to a fence which can be waited on */ void llvmpipe_flush( struct pipe_context *pipe, - unsigned flags, + unsigned flags, struct pipe_fence_handle **fence ) { struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe); draw_flush(llvmpipe->draw); - if (fence) { - /* if we're going to flush the setup/rasterization modules, emit - * a fence. - * XXX this (and the code below) may need fine tuning... - */ - *fence = lp_setup_fence( llvmpipe->setup ); - } - /* ask the setup module to flush */ - lp_setup_flush(llvmpipe->setup, flags); + lp_setup_flush(llvmpipe->setup, flags, fence); /* Enable to dump BMPs of the color/depth buffers each frame */ if (0) { diff --git a/src/gallium/drivers/llvmpipe/lp_jit.c b/src/gallium/drivers/llvmpipe/lp_jit.c index 23aa34ddec..8e6dfb293d 100644 --- a/src/gallium/drivers/llvmpipe/lp_jit.c +++ b/src/gallium/drivers/llvmpipe/lp_jit.c @@ -103,10 +103,6 @@ lp_jit_init_globals(struct llvmpipe_screen *screen) elem_types[LP_JIT_CTX_ALPHA_REF] = LLVMFloatType(); elem_types[LP_JIT_CTX_STENCIL_REF_FRONT] = LLVMInt32Type(); elem_types[LP_JIT_CTX_STENCIL_REF_BACK] = LLVMInt32Type(); - elem_types[LP_JIT_CTX_SCISSOR_XMIN] = LLVMFloatType(); - elem_types[LP_JIT_CTX_SCISSOR_YMIN] = LLVMFloatType(); - elem_types[LP_JIT_CTX_SCISSOR_XMAX] = LLVMFloatType(); - elem_types[LP_JIT_CTX_SCISSOR_YMAX] = LLVMFloatType(); elem_types[LP_JIT_CTX_BLEND_COLOR] = LLVMPointerType(LLVMInt8Type(), 0); elem_types[LP_JIT_CTX_TEXTURES] = LLVMArrayType(texture_type, PIPE_MAX_SAMPLERS); @@ -125,18 +121,6 @@ lp_jit_init_globals(struct llvmpipe_screen *screen) LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, stencil_ref_back, screen->target, context_type, LP_JIT_CTX_STENCIL_REF_BACK); - LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, scissor_xmin, - screen->target, context_type, - LP_JIT_CTX_SCISSOR_XMIN); - LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, scissor_ymin, - screen->target, context_type, - LP_JIT_CTX_SCISSOR_YMIN); - LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, scissor_xmax, - screen->target, context_type, - LP_JIT_CTX_SCISSOR_XMAX); - LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, scissor_ymax, - screen->target, context_type, - LP_JIT_CTX_SCISSOR_YMAX); LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, blend_color, screen->target, context_type, LP_JIT_CTX_BLEND_COLOR); diff --git a/src/gallium/drivers/llvmpipe/lp_jit.h b/src/gallium/drivers/llvmpipe/lp_jit.h index 8d06e65725..c94189413a 100644 --- a/src/gallium/drivers/llvmpipe/lp_jit.h +++ b/src/gallium/drivers/llvmpipe/lp_jit.h @@ -89,9 +89,6 @@ struct lp_jit_context uint32_t stencil_ref_front, stencil_ref_back; - /** floats, not ints */ - float scissor_xmin, scissor_ymin, scissor_xmax, scissor_ymax; - /* FIXME: store (also?) in floats */ uint8_t *blend_color; @@ -108,10 +105,6 @@ enum { LP_JIT_CTX_ALPHA_REF, LP_JIT_CTX_STENCIL_REF_FRONT, LP_JIT_CTX_STENCIL_REF_BACK, - LP_JIT_CTX_SCISSOR_XMIN, - LP_JIT_CTX_SCISSOR_YMIN, - LP_JIT_CTX_SCISSOR_XMAX, - LP_JIT_CTX_SCISSOR_YMAX, LP_JIT_CTX_BLEND_COLOR, LP_JIT_CTX_TEXTURES, LP_JIT_CTX_COUNT @@ -130,18 +123,6 @@ enum { #define lp_jit_context_stencil_ref_back_value(_builder, _ptr) \ lp_build_struct_get(_builder, _ptr, LP_JIT_CTX_STENCIL_REF_BACK, "stencil_ref_back") -#define lp_jit_context_scissor_xmin_value(_builder, _ptr) \ - lp_build_struct_get(_builder, _ptr, LP_JIT_CTX_SCISSOR_XMIN, "scissor_xmin") - -#define lp_jit_context_scissor_ymin_value(_builder, _ptr) \ - lp_build_struct_get(_builder, _ptr, LP_JIT_CTX_SCISSOR_YMIN, "scissor_ymin") - -#define lp_jit_context_scissor_xmax_value(_builder, _ptr) \ - lp_build_struct_get(_builder, _ptr, LP_JIT_CTX_SCISSOR_XMAX, "scissor_xmax") - -#define lp_jit_context_scissor_ymax_value(_builder, _ptr) \ - lp_build_struct_get(_builder, _ptr, LP_JIT_CTX_SCISSOR_YMAX, "scissor_ymax") - #define lp_jit_context_blend_color(_builder, _ptr) \ lp_build_struct_get(_builder, _ptr, LP_JIT_CTX_BLEND_COLOR, "blend_color") @@ -160,12 +141,7 @@ typedef void const void *dady, uint8_t **color, void *depth, - const int32_t c1, - const int32_t c2, - const int32_t c3, - const int32_t *step1, - const int32_t *step2, - const int32_t *step3, + uint32_t mask, uint32_t *counter); diff --git a/src/gallium/drivers/llvmpipe/lp_memory.c b/src/gallium/drivers/llvmpipe/lp_memory.c new file mode 100644 index 0000000000..0f55d4a80a --- /dev/null +++ b/src/gallium/drivers/llvmpipe/lp_memory.c @@ -0,0 +1,45 @@ +/************************************************************************** + * + * Copyright 2010 VMware, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#include "util/u_debug.h" +#include "lp_limits.h" +#include "lp_memory.h" + +/** + * 32bpp RGBA swizzled tiles. One for for each thread and each + * possible colorbuf. Adds up to quite a bit 8*8*64*64*4 == 1MB. + * Several schemes exist to reduce this, such as scaling back the + * number of threads or using a smaller tilesize when multiple + * colorbuffers are bound. + */ +PIPE_ALIGN_VAR(16) uint8_t lp_swizzled_cbuf[LP_MAX_THREADS][PIPE_MAX_COLOR_BUFS][TILE_SIZE * TILE_SIZE * 4]; + + +/* A single dummy tile used in a couple of out-of-memory situations. + */ +PIPE_ALIGN_VAR(16) uint8_t lp_dummy_tile[TILE_SIZE * TILE_SIZE * 4]; + diff --git a/src/gallium/drivers/llvmpipe/lp_memory.h b/src/gallium/drivers/llvmpipe/lp_memory.h new file mode 100644 index 0000000000..f7418f5e08 --- /dev/null +++ b/src/gallium/drivers/llvmpipe/lp_memory.h @@ -0,0 +1,40 @@ +/************************************************************************** + * + * Copyright 2010 VMware, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#ifndef LP_MEMORY_H +#define LP_MEMORY_H + + +#include "pipe/p_compiler.h" +#include "pipe/p_state.h" +#include "lp_limits.h" + +extern PIPE_ALIGN_VAR(16) uint8_t lp_swizzled_cbuf[LP_MAX_THREADS][PIPE_MAX_COLOR_BUFS][TILE_SIZE * TILE_SIZE * 4]; + +extern PIPE_ALIGN_VAR(16) uint8_t lp_dummy_tile[TILE_SIZE * TILE_SIZE * 4]; + +#endif /* LP_MEMORY_H */ diff --git a/src/gallium/drivers/llvmpipe/lp_perf.c b/src/gallium/drivers/llvmpipe/lp_perf.c index a316597675..083e7e30a5 100644 --- a/src/gallium/drivers/llvmpipe/lp_perf.c +++ b/src/gallium/drivers/llvmpipe/lp_perf.c @@ -46,10 +46,10 @@ lp_print_counters(void) { if (LP_DEBUG & DEBUG_COUNTERS) { unsigned total_64, total_16, total_4; - float p1, p2, p3; + float p1, p2, p3, p4; - debug_printf("llvmpipe: nr_triangles: %9u\n", lp_count.nr_tris); - debug_printf("llvmpipe: nr_culled_triangles: %9u\n", lp_count.nr_culled_tris); + debug_printf("llvmpipe: nr_triangles: %9u\n", lp_count.nr_tris); + debug_printf("llvmpipe: nr_culled_triangles: %9u\n", lp_count.nr_culled_tris); total_64 = (lp_count.nr_empty_64 + lp_count.nr_fully_covered_64 + @@ -58,10 +58,13 @@ lp_print_counters(void) p1 = 100.0 * (float) lp_count.nr_empty_64 / (float) total_64; p2 = 100.0 * (float) lp_count.nr_fully_covered_64 / (float) total_64; p3 = 100.0 * (float) lp_count.nr_partially_covered_64 / (float) total_64; + p4 = 100.0 * (float) lp_count.nr_shade_opaque_64 / (float) total_64; - debug_printf("llvmpipe: nr_empty_64x64: %9u (%2.0f%% of %u)\n", lp_count.nr_empty_64, p1, total_64); - debug_printf("llvmpipe: nr_fully_covered_64x64: %9u (%2.0f%% of %u)\n", lp_count.nr_fully_covered_64, p2, total_64); - debug_printf("llvmpipe: nr_partially_covered_64x64: %9u (%2.0f%% of %u)\n", lp_count.nr_partially_covered_64, p3, total_64); + debug_printf("llvmpipe: nr_64x64: %9u\n", total_64); + debug_printf("llvmpipe: nr_fully_covered_64x64: %9u (%3.0f%% of %u)\n", lp_count.nr_fully_covered_64, p2, total_64); + debug_printf("llvmpipe: nr_shade_opaque_64x64: %9u (%3.0f%% of %u)\n", lp_count.nr_shade_opaque_64, p4, total_64); + debug_printf("llvmpipe: nr_partially_covered_64x64: %9u (%3.0f%% of %u)\n", lp_count.nr_partially_covered_64, p3, total_64); + debug_printf("llvmpipe: nr_empty_64x64: %9u (%3.0f%% of %u)\n", lp_count.nr_empty_64, p1, total_64); total_16 = (lp_count.nr_empty_16 + lp_count.nr_fully_covered_16 + @@ -71,25 +74,27 @@ lp_print_counters(void) p2 = 100.0 * (float) lp_count.nr_fully_covered_16 / (float) total_16; p3 = 100.0 * (float) lp_count.nr_partially_covered_16 / (float) total_16; - debug_printf("llvmpipe: nr_empty_16x16: %9u (%2.0f%% of %u)\n", lp_count.nr_empty_16, p1, total_16); - debug_printf("llvmpipe: nr_fully_covered_16x16: %9u (%2.0f%% of %u)\n", lp_count.nr_fully_covered_16, p2, total_16); - debug_printf("llvmpipe: nr_partially_covered_16x16: %9u (%2.0f%% of %u)\n", lp_count.nr_partially_covered_16, p3, total_16); + debug_printf("llvmpipe: nr_16x16: %9u\n", total_16); + debug_printf("llvmpipe: nr_fully_covered_16x16: %9u (%3.0f%% of %u)\n", lp_count.nr_fully_covered_16, p2, total_16); + debug_printf("llvmpipe: nr_partially_covered_16x16: %9u (%3.0f%% of %u)\n", lp_count.nr_partially_covered_16, p3, total_16); + debug_printf("llvmpipe: nr_empty_16x16: %9u (%3.0f%% of %u)\n", lp_count.nr_empty_16, p1, total_16); total_4 = (lp_count.nr_empty_4 + lp_count.nr_non_empty_4); p1 = 100.0 * (float) lp_count.nr_empty_4 / (float) total_4; p2 = 100.0 * (float) lp_count.nr_non_empty_4 / (float) total_4; - debug_printf("llvmpipe: nr_empty_4x4: %9u (%2.0f%% of %u)\n", lp_count.nr_empty_4, p1, total_4); - debug_printf("llvmpipe: nr_non_empty_4x4: %9u (%2.0f%% of %u)\n", lp_count.nr_non_empty_4, p2, total_4); + debug_printf("llvmpipe: nr_4x4: %9u\n", total_4); + debug_printf("llvmpipe: nr_empty_4x4: %9u (%3.0f%% of %u)\n", lp_count.nr_empty_4, p1, total_4); + debug_printf("llvmpipe: nr_non_empty_4x4: %9u (%3.0f%% of %u)\n", lp_count.nr_non_empty_4, p2, total_4); - debug_printf("llvmpipe: nr_color_tile_clear: %9u\n", lp_count.nr_color_tile_clear); - debug_printf("llvmpipe: nr_color_tile_load: %9u\n", lp_count.nr_color_tile_load); - debug_printf("llvmpipe: nr_color_tile_store: %9u\n", lp_count.nr_color_tile_store); + debug_printf("llvmpipe: nr_color_tile_clear: %9u\n", lp_count.nr_color_tile_clear); + debug_printf("llvmpipe: nr_color_tile_load: %9u\n", lp_count.nr_color_tile_load); + debug_printf("llvmpipe: nr_color_tile_store: %9u\n", lp_count.nr_color_tile_store); - debug_printf("llvmpipe: nr_llvm_compiles: %u\n", lp_count.nr_llvm_compiles); - debug_printf("llvmpipe: total LLVM compile time: %.2f sec\n", lp_count.llvm_compile_time / 1000000.0); - debug_printf("llvmpipe: average LLVM compile time: %.2f sec\n", lp_count.llvm_compile_time / 1000000.0 / lp_count.nr_llvm_compiles); + debug_printf("llvmpipe: nr_llvm_compiles: %u\n", lp_count.nr_llvm_compiles); + debug_printf("llvmpipe: total LLVM compile time: %.2f sec\n", lp_count.llvm_compile_time / 1000000.0); + debug_printf("llvmpipe: average LLVM compile time: %.2f sec\n", lp_count.llvm_compile_time / 1000000.0 / lp_count.nr_llvm_compiles); } } diff --git a/src/gallium/drivers/llvmpipe/lp_perf.h b/src/gallium/drivers/llvmpipe/lp_perf.h index a9629dae3c..4774f64550 100644 --- a/src/gallium/drivers/llvmpipe/lp_perf.h +++ b/src/gallium/drivers/llvmpipe/lp_perf.h @@ -44,6 +44,7 @@ struct lp_counters unsigned nr_empty_64; unsigned nr_fully_covered_64; unsigned nr_partially_covered_64; + unsigned nr_shade_opaque_64; unsigned nr_empty_16; unsigned nr_fully_covered_16; unsigned nr_partially_covered_16; diff --git a/src/gallium/drivers/llvmpipe/lp_query.c b/src/gallium/drivers/llvmpipe/lp_query.c index c902c04684..02eeaf6487 100644 --- a/src/gallium/drivers/llvmpipe/lp_query.c +++ b/src/gallium/drivers/llvmpipe/lp_query.c @@ -48,7 +48,7 @@ static struct llvmpipe_query *llvmpipe_query( struct pipe_query *p ) static struct pipe_query * llvmpipe_create_query(struct pipe_context *pipe, - unsigned type) + unsigned type) { struct llvmpipe_query *pq; @@ -67,6 +67,16 @@ static void llvmpipe_destroy_query(struct pipe_context *pipe, struct pipe_query *q) { struct llvmpipe_query *pq = llvmpipe_query(q); + /* query might still be in process if we never waited for the result */ + if (!pq->done) { + struct pipe_fence_handle *fence = NULL; + llvmpipe_flush(pipe, 0, &fence); + if (fence) { + pipe->screen->fence_finish(pipe->screen, fence, 0); + pipe->screen->fence_reference(pipe->screen, &fence, NULL); + } + } + pipe_mutex_destroy(pq->mutex); FREE(pq); } @@ -74,16 +84,26 @@ llvmpipe_destroy_query(struct pipe_context *pipe, struct pipe_query *q) static boolean llvmpipe_get_query_result(struct pipe_context *pipe, - struct pipe_query *q, - boolean wait, - void *vresult) + struct pipe_query *q, + boolean wait, + void *vresult) { - struct llvmpipe_context *llvmpipe = llvmpipe_context( pipe ); struct llvmpipe_query *pq = llvmpipe_query(q); uint64_t *result = (uint64_t *)vresult; if (!pq->done) { - lp_setup_flush(llvmpipe->setup, 0); + if (wait) { + struct pipe_fence_handle *fence = NULL; + llvmpipe_flush(pipe, 0, &fence); + if (fence) { + pipe->screen->fence_finish(pipe->screen, fence, 0); + pipe->screen->fence_reference(pipe->screen, &fence, NULL); + } + } + /* this is a bit inconsequent but should be ok */ + else { + llvmpipe_flush(pipe, 0, NULL); + } } if (pq->done) { diff --git a/src/gallium/drivers/llvmpipe/lp_rast.c b/src/gallium/drivers/llvmpipe/lp_rast.c index 50e44dcb2b..654f4ea48e 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast.c +++ b/src/gallium/drivers/llvmpipe/lp_rast.c @@ -28,6 +28,7 @@ #include <limits.h> #include "util/u_memory.h" #include "util/u_math.h" +#include "util/u_rect.h" #include "util/u_surface.h" #include "lp_scene_queue.h" @@ -66,7 +67,7 @@ lp_rast_begin( struct lp_rasterizer *rast, cbuf->level, cbuf->zslice, LP_TEX_USAGE_READ_WRITE, - LP_TEX_LAYOUT_NONE); + LP_TEX_LAYOUT_LINEAR); } if (fb->zsbuf) { @@ -81,7 +82,6 @@ lp_rast_begin( struct lp_rasterizer *rast, zsbuf->zslice, LP_TEX_USAGE_READ_WRITE, LP_TEX_LAYOUT_NONE); - assert(rast->zsbuf.map); } lp_scene_bin_iter_begin( scene ); @@ -137,7 +137,6 @@ lp_rast_tile_begin(struct lp_rasterizer_task *task, struct lp_rasterizer *rast = task->rast; struct lp_scene *scene = rast->curr_scene; enum lp_texture_usage usage; - unsigned buf; LP_DBG(DEBUG_RAST, "%s %d,%d\n", __FUNCTION__, x, y); @@ -147,24 +146,8 @@ lp_rast_tile_begin(struct lp_rasterizer_task *task, task->x = x; task->y = y; - if (scene->has_color_clear) - usage = LP_TEX_USAGE_WRITE_ALL; - else - usage = LP_TEX_USAGE_READ_WRITE; - - /* get pointers to color tile(s) */ - for (buf = 0; buf < rast->state.nr_cbufs; buf++) { - struct pipe_surface *cbuf = rast->curr_scene->fb.cbufs[buf]; - struct llvmpipe_resource *lpt; - assert(cbuf); - lpt = llvmpipe_resource(cbuf->texture); - task->color_tiles[buf] = llvmpipe_get_texture_tile(lpt, - cbuf->face + cbuf->zslice, - cbuf->level, - usage, - x, y); - assert(task->color_tiles[buf]); - } + /* reset pointers to color tile(s) */ + memset(task->color_tiles, 0, sizeof(task->color_tiles)); /* get pointer to depth/stencil tile */ { @@ -188,7 +171,7 @@ lp_rast_tile_begin(struct lp_rasterizer_task *task, /* Get actual pointer to the tile data. Note that depth/stencil * data is tiled differently than color data. */ - task->depth_tile = lp_rast_get_depth_block_pointer(rast, x, y); + task->depth_tile = lp_rast_get_depth_block_pointer(task, x, y); assert(task->depth_tile); } @@ -223,7 +206,8 @@ lp_rast_clear_color(struct lp_rasterizer_task *task, clear_color[2] == clear_color[3]) { /* clear to grayscale value {x, x, x, x} */ for (i = 0; i < rast->state.nr_cbufs; i++) { - uint8_t *ptr = task->color_tiles[i]; + uint8_t *ptr = + lp_rast_get_color_tile_pointer(task, i, LP_TEX_USAGE_WRITE_ALL); memset(ptr, clear_color[0], TILE_SIZE * TILE_SIZE * 4); } } @@ -235,7 +219,8 @@ lp_rast_clear_color(struct lp_rasterizer_task *task, */ const unsigned chunk = TILE_SIZE / 4; for (i = 0; i < rast->state.nr_cbufs; i++) { - uint8_t *c = task->color_tiles[i]; + uint8_t *c = + lp_rast_get_color_tile_pointer(task, i, LP_TEX_USAGE_WRITE_ALL); unsigned j; for (j = 0; j < 4 * TILE_SIZE; j++) { @@ -286,8 +271,6 @@ lp_rast_clear_zstencil(struct lp_rasterizer_task *task, dst = task->depth_tile; - assert(dst == lp_rast_get_depth_block_pointer(rast, task->x, task->y)); - switch (block_size) { case 1: memset(dst, (uint8_t) clear_value, height * width); @@ -376,8 +359,8 @@ lp_rast_load_color(struct lp_rasterizer_task *task, * This is a bin command which is stored in all bins. */ void -lp_rast_store_color( struct lp_rasterizer_task *task, - const union lp_rast_cmd_arg arg) +lp_rast_store_linear_color( struct lp_rasterizer_task *task, + const union lp_rast_cmd_arg arg) { struct lp_rasterizer *rast = task->rast; struct lp_scene *scene = rast->curr_scene; @@ -387,30 +370,20 @@ lp_rast_store_color( struct lp_rasterizer_task *task, struct pipe_surface *cbuf = scene->fb.cbufs[buf]; const unsigned face = cbuf->face, level = cbuf->level; struct llvmpipe_resource *lpt = llvmpipe_resource(cbuf->texture); - /* this will convert the tiled data to linear if needed */ - (void) llvmpipe_get_texture_tile_linear(lpt, face, level, - LP_TEX_USAGE_READ, - task->x, task->y); - } -} - -/** - * This is a bin command called during bin processing. - */ -void -lp_rast_set_state(struct lp_rasterizer_task *task, - const union lp_rast_cmd_arg arg) -{ - const struct lp_rast_state *state = arg.set_state; + if (!task->color_tiles[buf]) + continue; - LP_DBG(DEBUG_RAST, "%s %p\n", __FUNCTION__, (void *) state); - - /* just set the current state pointer for this rasterizer */ - task->current_state = state; + llvmpipe_unswizzle_cbuf_tile(lpt, + face, + level, + task->x, task->y, + task->color_tiles[buf]); + } } + /** * Run the shader on all blocks in a tile. This is used when a tile is * completely contained inside a triangle. @@ -421,8 +394,8 @@ lp_rast_shade_tile(struct lp_rasterizer_task *task, const union lp_rast_cmd_arg arg) { struct lp_rasterizer *rast = task->rast; - const struct lp_rast_state *state = task->current_state; const struct lp_rast_shader_inputs *inputs = arg.shade_tile; + const struct lp_rast_state *state = inputs->state; struct lp_fragment_shader_variant *variant = state->variant; const unsigned tile_x = task->x, tile_y = task->y; unsigned x, y; @@ -442,36 +415,60 @@ lp_rast_shade_tile(struct lp_rasterizer_task *task, tile_x + x, tile_y + y); /* depth buffer */ - depth = lp_rast_get_depth_block_pointer(rast, tile_x + x, tile_y + y); + depth = lp_rast_get_depth_block_pointer(task, tile_x + x, tile_y + y); /* run shader on 4x4 block */ variant->jit_function[RAST_WHOLE]( &state->jit_context, - tile_x + x, tile_y + y, - inputs->facing, - inputs->a0, - inputs->dadx, - inputs->dady, - color, - depth, - INT_MIN, INT_MIN, INT_MIN, - NULL, NULL, NULL, &task->vis_counter); + tile_x + x, tile_y + y, + inputs->facing, + inputs->a0, + inputs->dadx, + inputs->dady, + color, + depth, + 0xffff, + &task->vis_counter); } } } /** - * Compute shading for a 4x4 block of pixels. + * Run the shader on all blocks in a tile. This is used when a tile is + * completely contained inside a triangle, and the shader is opaque. + * This is a bin command called during bin processing. + */ +void +lp_rast_shade_tile_opaque(struct lp_rasterizer_task *task, + const union lp_rast_cmd_arg arg) +{ + struct lp_rasterizer *rast = task->rast; + unsigned i; + + LP_DBG(DEBUG_RAST, "%s\n", __FUNCTION__); + + /* this will prevent converting the layout from tiled to linear */ + for (i = 0; i < rast->state.nr_cbufs; i++) { + (void)lp_rast_get_color_tile_pointer(task, i, LP_TEX_USAGE_WRITE_ALL); + } + + lp_rast_shade_tile(task, arg); +} + + +/** + * Compute shading for a 4x4 block of pixels inside a triangle. * This is a bin command called during bin processing. * \param x X position of quad in window coords * \param y Y position of quad in window coords */ -void lp_rast_shade_quads( struct lp_rasterizer_task *task, - const struct lp_rast_shader_inputs *inputs, - unsigned x, unsigned y, - int32_t c1, int32_t c2, int32_t c3) +void +lp_rast_shade_quads_mask(struct lp_rasterizer_task *task, + const struct lp_rast_shader_inputs *inputs, + unsigned x, unsigned y, + unsigned mask) { - const struct lp_rast_state *state = task->current_state; + const struct lp_rast_state *state = inputs->state; struct lp_fragment_shader_variant *variant = state->variant; struct lp_rasterizer *rast = task->rast; uint8_t *color[PIPE_MAX_COLOR_BUFS]; @@ -494,32 +491,26 @@ void lp_rast_shade_quads( struct lp_rasterizer_task *task, } /* depth buffer */ - depth = lp_rast_get_depth_block_pointer(rast, x, y); + depth = lp_rast_get_depth_block_pointer(task, x, y); assert(lp_check_alignment(state->jit_context.blend_color, 16)); - assert(lp_check_alignment(inputs->step[0], 16)); - assert(lp_check_alignment(inputs->step[1], 16)); - assert(lp_check_alignment(inputs->step[2], 16)); - /* run shader on 4x4 block */ - variant->jit_function[RAST_EDGE_TEST]( &state->jit_context, - x, y, - inputs->facing, - inputs->a0, - inputs->dadx, - inputs->dady, - color, - depth, - c1, c2, c3, - inputs->step[0], - inputs->step[1], - inputs->step[2], - &task->vis_counter); + variant->jit_function[RAST_EDGE_TEST](&state->jit_context, + x, y, + inputs->facing, + inputs->a0, + inputs->dadx, + inputs->dady, + color, + depth, + mask, + &task->vis_counter); } + /** * Set top row and left column of the tile's pixels to white. For debugging. */ @@ -598,6 +589,11 @@ lp_rast_tile_end(struct lp_rasterizer_task *task) (void) outline_subtiles; #endif + { + union lp_rast_cmd_arg dummy = {0}; + lp_rast_store_linear_color(task, dummy); + } + /* debug */ memset(task->color_tiles, 0, sizeof(task->color_tiles)); task->depth_tile = NULL; @@ -627,7 +623,7 @@ void lp_rast_begin_query(struct lp_rasterizer_task *task, const union lp_rast_cmd_arg arg) { - /* Reset the the per-task counter */ + /* Reset the per-task counter */ task->vis_counter = 0; } @@ -715,10 +711,16 @@ static struct { { RAST(clear_color), RAST(clear_zstencil), - RAST(triangle), + RAST(triangle_1), + RAST(triangle_2), + RAST(triangle_3), + RAST(triangle_4), + RAST(triangle_5), + RAST(triangle_6), + RAST(triangle_7), RAST(shade_tile), - RAST(set_state), - RAST(store_color), + RAST(shade_tile_opaque), + RAST(store_linear_color), RAST(fence), RAST(begin_query), RAST(end_query), @@ -754,30 +756,8 @@ debug_bin( const struct cmd_bin *bin ) static boolean is_empty_bin( const struct cmd_bin *bin ) { - const struct cmd_block *head = bin->commands.head; - int i; - - if (0) - debug_bin(bin); - - /* We emit at most two load-tile commands at the start of the first - * command block. In addition we seem to emit a couple of - * set-state commands even in empty bins. - * - * As a heuristic, if a bin has more than 4 commands, consider it - * non-empty. - */ - if (head->next != NULL || - head->count > 4) { - return FALSE; - } - - for (i = 0; i < head->count; i++) - if (head->cmd[i] != lp_rast_set_state) { - return FALSE; - } - - return TRUE; + if (0) debug_bin(bin); + return bin->commands.head->count == 0; } @@ -813,6 +793,10 @@ rasterize_scene(struct lp_rasterizer_task *task, } } #endif + + if (scene->fence) { + lp_rast_fence(task, lp_rast_arg_fence(scene->fence)); + } } @@ -983,6 +967,10 @@ lp_rast_create( unsigned num_threads ) /* for synchronizing rasterization threads */ pipe_barrier_init( &rast->barrier, rast->num_threads ); + memset(lp_swizzled_cbuf, 0, sizeof lp_swizzled_cbuf); + + memset(lp_dummy_tile, 0, sizeof lp_dummy_tile); + return rast; } diff --git a/src/gallium/drivers/llvmpipe/lp_rast.h b/src/gallium/drivers/llvmpipe/lp_rast.h index 80ca68f5a2..eaf2a6f334 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast.h +++ b/src/gallium/drivers/llvmpipe/lp_rast.h @@ -84,8 +84,7 @@ struct lp_rast_shader_inputs { float (*dadx)[4]; float (*dady)[4]; - /* edge/step info for 3 edges and 4x4 block of pixels */ - PIPE_ALIGN_VAR(16) int step[3][16]; + const struct lp_rast_state *state; }; struct lp_rast_clearzs { @@ -93,6 +92,22 @@ struct lp_rast_clearzs { unsigned clearzs_mask; }; +struct lp_rast_plane { + /* one-pixel sized trivial accept offsets for each plane */ + int ei; + + /* one-pixel sized trivial reject offsets for each plane */ + int eo; + + /* edge function values at minx,miny ?? */ + int c; + + int dcdx; + int dcdy; + + /* edge/step info for 3 edges and 4x4 block of pixels */ + const int *step; +}; /** * Rasterization information for a triangle known to be in this bin, @@ -101,35 +116,16 @@ struct lp_rast_clearzs { * Objects of this type are put into the lp_setup_context::data buffer. */ struct lp_rast_triangle { + /* inputs for the shader */ + struct lp_rast_shader_inputs inputs; + + int step[3][16]; + #ifdef DEBUG float v[3][2]; #endif - /* one-pixel sized trivial accept offsets for each plane */ - int ei1; - int ei2; - int ei3; - - /* one-pixel sized trivial reject offsets for each plane */ - int eo1; - int eo2; - int eo3; - - /* y deltas for vertex pairs (in fixed pt) */ - int dy12; - int dy23; - int dy31; - - /* x deltas for vertex pairs (in fixed pt) */ - int dx12; - int dx23; - int dx31; - - /* edge function values at minx,miny ?? */ - int c1, c2, c3; - - /* inputs for the shader */ - PIPE_ALIGN_VAR(16) struct lp_rast_shader_inputs inputs; + struct lp_rast_plane plane[7]; /* NOTE: may allocate fewer planes */ }; @@ -153,7 +149,10 @@ lp_rast_finish( struct lp_rasterizer *rast ); union lp_rast_cmd_arg { const struct lp_rast_shader_inputs *shade_tile; - const struct lp_rast_triangle *triangle; + struct { + const struct lp_rast_triangle *tri; + unsigned plane_mask; + } triangle; const struct lp_rast_state *set_state; uint8_t clear_color[4]; const struct lp_rast_clearzs *clear_zstencil; @@ -173,10 +172,12 @@ lp_rast_arg_inputs( const struct lp_rast_shader_inputs *shade_tile ) } static INLINE union lp_rast_cmd_arg -lp_rast_arg_triangle( const struct lp_rast_triangle *triangle ) +lp_rast_arg_triangle( const struct lp_rast_triangle *triangle, + unsigned plane_mask) { union lp_rast_cmd_arg arg; - arg.triangle = triangle; + arg.triangle.tri = triangle; + arg.triangle.plane_mask = plane_mask; return arg; } @@ -226,19 +227,31 @@ void lp_rast_clear_color( struct lp_rasterizer_task *, void lp_rast_clear_zstencil( struct lp_rasterizer_task *, const union lp_rast_cmd_arg ); -void lp_rast_set_state( struct lp_rasterizer_task *, - const union lp_rast_cmd_arg ); - -void lp_rast_triangle( struct lp_rasterizer_task *, - const union lp_rast_cmd_arg ); +void lp_rast_triangle_1( struct lp_rasterizer_task *, + const union lp_rast_cmd_arg ); +void lp_rast_triangle_2( struct lp_rasterizer_task *, + const union lp_rast_cmd_arg ); +void lp_rast_triangle_3( struct lp_rasterizer_task *, + const union lp_rast_cmd_arg ); +void lp_rast_triangle_4( struct lp_rasterizer_task *, + const union lp_rast_cmd_arg ); +void lp_rast_triangle_5( struct lp_rasterizer_task *, + const union lp_rast_cmd_arg ); +void lp_rast_triangle_6( struct lp_rasterizer_task *, + const union lp_rast_cmd_arg ); +void lp_rast_triangle_7( struct lp_rasterizer_task *, + const union lp_rast_cmd_arg ); void lp_rast_shade_tile( struct lp_rasterizer_task *, const union lp_rast_cmd_arg ); +void lp_rast_shade_tile_opaque( struct lp_rasterizer_task *, + const union lp_rast_cmd_arg ); + void lp_rast_fence( struct lp_rasterizer_task *, const union lp_rast_cmd_arg ); -void lp_rast_store_color( struct lp_rasterizer_task *, +void lp_rast_store_linear_color( struct lp_rasterizer_task *, const union lp_rast_cmd_arg ); diff --git a/src/gallium/drivers/llvmpipe/lp_rast_priv.h b/src/gallium/drivers/llvmpipe/lp_rast_priv.h index d33dd49f3a..b4a48cfd02 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast_priv.h +++ b/src/gallium/drivers/llvmpipe/lp_rast_priv.h @@ -31,6 +31,7 @@ #include "os/os_thread.h" #include "util/u_format.h" #include "gallivm/lp_bld_debug.h" +#include "lp_memory.h" #include "lp_rast.h" #include "lp_scene.h" #include "lp_state.h" @@ -52,8 +53,6 @@ struct lp_rasterizer_task uint8_t *color_tiles[PIPE_MAX_COLOR_BUFS]; uint8_t *depth_tile; - const struct lp_rast_state *current_state; - /** "back" pointer */ struct lp_rasterizer *rast; @@ -118,10 +117,12 @@ struct lp_rasterizer }; -void lp_rast_shade_quads( struct lp_rasterizer_task *task, - const struct lp_rast_shader_inputs *inputs, - unsigned x, unsigned y, - int32_t c1, int32_t c2, int32_t c3); +void +lp_rast_shade_quads_mask(struct lp_rasterizer_task *task, + const struct lp_rast_shader_inputs *inputs, + unsigned x, unsigned y, + unsigned mask); + /** @@ -132,18 +133,23 @@ void lp_rast_shade_quads( struct lp_rasterizer_task *task, * \param x, y location of 4x4 block in window coords */ static INLINE void * -lp_rast_get_depth_block_pointer(const struct lp_rasterizer *rast, +lp_rast_get_depth_block_pointer(struct lp_rasterizer_task *task, unsigned x, unsigned y) { + const struct lp_rasterizer *rast = task->rast; void *depth; assert((x % TILE_VECTOR_WIDTH) == 0); assert((y % TILE_VECTOR_HEIGHT) == 0); - assert(rast->zsbuf.map || !rast->curr_scene->fb.zsbuf); - - if (!rast->zsbuf.map) - return NULL; + if (!rast->zsbuf.map) { + /* Either out of memory or no zsbuf. Can't tell without access + * to the state. Just use dummy tile memory, but don't print + * the oom warning as this most likely because there is no + * zsbuf. + */ + return lp_dummy_tile; + } depth = (rast->zsbuf.map + rast->zsbuf.stride * y + @@ -155,6 +161,39 @@ lp_rast_get_depth_block_pointer(const struct lp_rasterizer *rast, /** + * Get pointer to the swizzled color tile + */ +static INLINE uint8_t * +lp_rast_get_color_tile_pointer(struct lp_rasterizer_task *task, + unsigned buf, enum lp_texture_usage usage) +{ + struct lp_rasterizer *rast = task->rast; + + assert(task->x % TILE_SIZE == 0); + assert(task->y % TILE_SIZE == 0); + assert(buf < rast->state.nr_cbufs); + + if (!task->color_tiles[buf]) { + struct pipe_surface *cbuf = rast->curr_scene->fb.cbufs[buf]; + struct llvmpipe_resource *lpt; + assert(cbuf); + lpt = llvmpipe_resource(cbuf->texture); + task->color_tiles[buf] = lp_swizzled_cbuf[task->thread_index][buf]; + + if (usage != LP_TEX_USAGE_WRITE_ALL) { + llvmpipe_swizzle_cbuf_tile(lpt, + cbuf->face + cbuf->zslice, + cbuf->level, + task->x, task->y, + task->color_tiles[buf]); + } + } + + return task->color_tiles[buf]; +} + + +/** * Get the pointer to a 4x4 color block (within a 64x64 tile). * We'll map the color buffer on demand here. * Note that this may be called even when there's no color buffers - return @@ -171,7 +210,7 @@ lp_rast_get_color_block_pointer(struct lp_rasterizer_task *task, assert((x % TILE_VECTOR_WIDTH) == 0); assert((y % TILE_VECTOR_HEIGHT) == 0); - color = task->color_tiles[buf]; + color = lp_rast_get_color_tile_pointer(task, buf, LP_TEX_USAGE_READ_WRITE); assert(color); px = x % TILE_SIZE; @@ -196,8 +235,8 @@ lp_rast_shade_quads_all( struct lp_rasterizer_task *task, const struct lp_rast_shader_inputs *inputs, unsigned x, unsigned y ) { - struct lp_rasterizer *rast = task->rast; - const struct lp_rast_state *state = task->current_state; + const struct lp_rasterizer *rast = task->rast; + const struct lp_rast_state *state = inputs->state; struct lp_fragment_shader_variant *variant = state->variant; uint8_t *color[PIPE_MAX_COLOR_BUFS]; void *depth; @@ -207,19 +246,19 @@ lp_rast_shade_quads_all( struct lp_rasterizer_task *task, for (i = 0; i < rast->state.nr_cbufs; i++) color[i] = lp_rast_get_color_block_pointer(task, i, x, y); - depth = lp_rast_get_depth_block_pointer(rast, x, y); + depth = lp_rast_get_depth_block_pointer(task, x, y); /* run shader on 4x4 block */ variant->jit_function[RAST_WHOLE]( &state->jit_context, - x, y, - inputs->facing, - inputs->a0, - inputs->dadx, - inputs->dady, - color, - depth, - INT_MIN, INT_MIN, INT_MIN, - NULL, NULL, NULL, &task->vis_counter ); + x, y, + inputs->facing, + inputs->a0, + inputs->dadx, + inputs->dady, + color, + depth, + 0xffff, + &task->vis_counter ); } diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c b/src/gallium/drivers/llvmpipe/lp_rast_tri.c index a5f0d14c95..ebe9a8e92b 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c +++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c @@ -113,168 +113,31 @@ block_full_16(struct lp_rasterizer_task *task, block_full_4(task, tri, x + ix, y + iy); } +#define TAG(x) x##_1 +#define NR_PLANES 1 +#include "lp_rast_tri_tmp.h" -/** - * Pass the 4x4 pixel block to the shader function. - * Determination of which of the 16 pixels lies inside the triangle - * will be done as part of the fragment shader. - */ -static void -do_block_4(struct lp_rasterizer_task *task, - const struct lp_rast_triangle *tri, - int x, int y, - int c1, int c2, int c3) -{ - assert(x >= 0); - assert(y >= 0); - - lp_rast_shade_quads(task, &tri->inputs, x, y, -c1, -c2, -c3); -} - - -/** - * Evaluate a 16x16 block of pixels to determine which 4x4 subblocks are in/out - * of the triangle's bounds. - */ -static void -do_block_16(struct lp_rasterizer_task *task, - const struct lp_rast_triangle *tri, - int x, int y, - int c0, int c1, int c2) -{ - unsigned mask = 0; - int eo[3]; - int c[3]; - int i, j; - - assert(x >= 0); - assert(y >= 0); - assert(x % 16 == 0); - assert(y % 16 == 0); - - eo[0] = tri->eo1 * 4; - eo[1] = tri->eo2 * 4; - eo[2] = tri->eo3 * 4; - - c[0] = c0; - c[1] = c1; - c[2] = c2; - - for (j = 0; j < 3; j++) { - const int *step = tri->inputs.step[j]; - const int cx = c[j] + eo[j]; - - /* Mask has bits set whenever we are outside any of the edges. - */ - for (i = 0; i < 16; i++) { - int out = cx + step[i] * 4; - mask |= (out >> 31) & (1 << i); - } - } +#define TAG(x) x##_2 +#define NR_PLANES 2 +#include "lp_rast_tri_tmp.h" - mask = ~mask & 0xffff; - while (mask) { - int i = ffs(mask) - 1; - int px = x + pos_table4[i][0]; - int py = y + pos_table4[i][1]; - int cx1 = c0 + tri->inputs.step[0][i] * 4; - int cx2 = c1 + tri->inputs.step[1][i] * 4; - int cx3 = c2 + tri->inputs.step[2][i] * 4; +#define TAG(x) x##_3 +#define NR_PLANES 3 +#include "lp_rast_tri_tmp.h" - mask &= ~(1 << i); +#define TAG(x) x##_4 +#define NR_PLANES 4 +#include "lp_rast_tri_tmp.h" - /* Don't bother testing if the 4x4 block is entirely in/out of - * the triangle. It's a little faster to do it in the jit code. - */ - LP_COUNT(nr_non_empty_4); - do_block_4(task, tri, px, py, cx1, cx2, cx3); - } -} - - -/** - * Scan the tile in chunks and figure out which pixels to rasterize - * for this triangle. - */ -void -lp_rast_triangle(struct lp_rasterizer_task *task, - const union lp_rast_cmd_arg arg) -{ - const struct lp_rast_triangle *tri = arg.triangle; - const int x = task->x, y = task->y; - int ei[3], eo[3], c[3]; - unsigned outmask, inmask, partial_mask; - unsigned i, j; - - c[0] = tri->c1 + tri->dx12 * y - tri->dy12 * x; - c[1] = tri->c2 + tri->dx23 * y - tri->dy23 * x; - c[2] = tri->c3 + tri->dx31 * y - tri->dy31 * x; - - eo[0] = tri->eo1 * 16; - eo[1] = tri->eo2 * 16; - eo[2] = tri->eo3 * 16; - - ei[0] = tri->ei1 * 16; - ei[1] = tri->ei2 * 16; - ei[2] = tri->ei3 * 16; - - outmask = 0; - inmask = 0xffff; +#define TAG(x) x##_5 +#define NR_PLANES 5 +#include "lp_rast_tri_tmp.h" - for (j = 0; j < 3; j++) { - const int *step = tri->inputs.step[j]; - const int cox = c[j] + eo[j]; - const int cio = ei[j]- eo[j]; +#define TAG(x) x##_6 +#define NR_PLANES 6 +#include "lp_rast_tri_tmp.h" - /* Outmask has bits set whenever we are outside any of the - * edges. - */ - /* Inmask has bits set whenever we are inside all of the edges. - */ - for (i = 0; i < 16; i++) { - int out = cox + step[i] * 16; - int in = out + cio; - outmask |= (out >> 31) & (1 << i); - inmask &= ~((in >> 31) & (1 << i)); - } - } +#define TAG(x) x##_7 +#define NR_PLANES 7 +#include "lp_rast_tri_tmp.h" - assert((outmask & inmask) == 0); - - if (outmask == 0xffff) - return; - - /* Invert mask, so that bits are set whenever we are at least - * partially inside all of the edges: - */ - partial_mask = ~inmask & ~outmask & 0xffff; - - /* Iterate over partials: - */ - while (partial_mask) { - int i = ffs(partial_mask) - 1; - int px = x + pos_table16[i][0]; - int py = y + pos_table16[i][1]; - int cx1 = c[0] + tri->inputs.step[0][i] * 16; - int cx2 = c[1] + tri->inputs.step[1][i] * 16; - int cx3 = c[2] + tri->inputs.step[2][i] * 16; - - partial_mask &= ~(1 << i); - - LP_COUNT(nr_partially_covered_16); - do_block_16(task, tri, px, py, cx1, cx2, cx3); - } - - /* Iterate over fulls: - */ - while (inmask) { - int i = ffs(inmask) - 1; - int px = x + pos_table16[i][0]; - int py = y + pos_table16[i][1]; - - inmask &= ~(1 << i); - - LP_COUNT(nr_fully_covered_16); - block_full_16(task, tri, px, py); - } -} diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h b/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h new file mode 100644 index 0000000000..a410c611a3 --- /dev/null +++ b/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h @@ -0,0 +1,238 @@ +/************************************************************************** + * + * Copyright 2007-2010 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/* + * Rasterization for binned triangles within a tile + */ + + + +/** + * Prototype for a 7 plane rasterizer function. Will codegenerate + * several of these. + * + * XXX: Varients for more/fewer planes. + * XXX: Need ways of dropping planes as we descend. + * XXX: SIMD + */ +static void +TAG(do_block_4)(struct lp_rasterizer_task *task, + const struct lp_rast_triangle *tri, + const struct lp_rast_plane *plane, + int x, int y, + const int *c) +{ + unsigned mask = 0; + int i; + + for (i = 0; i < 16; i++) { + int any_negative = 0; + int j; + + for (j = 0; j < NR_PLANES; j++) + any_negative |= (c[j] - 1 + plane[j].step[i]); + + any_negative >>= 31; + + mask |= (~any_negative) & (1 << i); + } + + /* Now pass to the shader: + */ + if (mask) + lp_rast_shade_quads_mask(task, &tri->inputs, x, y, mask); +} + +/** + * Evaluate a 16x16 block of pixels to determine which 4x4 subblocks are in/out + * of the triangle's bounds. + */ +static void +TAG(do_block_16)(struct lp_rasterizer_task *task, + const struct lp_rast_triangle *tri, + const struct lp_rast_plane *plane, + int x, int y, + const int *c) +{ + unsigned outmask, inmask, partmask, partial_mask; + unsigned i, j; + + outmask = 0; /* outside one or more trivial reject planes */ + partmask = 0; /* outside one or more trivial accept planes */ + + for (j = 0; j < NR_PLANES; j++) { + const int *step = plane[j].step; + const int eo = plane[j].eo * 4; + const int ei = plane[j].ei * 4; + const int cox = c[j] + eo; + const int cio = ei - 1 - eo; + + for (i = 0; i < 16; i++) { + int out = cox + step[i] * 4; + int part = out + cio; + outmask |= (out >> 31) & (1 << i); + partmask |= (part >> 31) & (1 << i); + } + } + + if (outmask == 0xffff) + return; + + /* Mask of sub-blocks which are inside all trivial accept planes: + */ + inmask = ~partmask & 0xffff; + + /* Mask of sub-blocks which are inside all trivial reject planes, + * but outside at least one trivial accept plane: + */ + partial_mask = partmask & ~outmask; + + assert((partial_mask & inmask) == 0); + + /* Iterate over partials: + */ + while (partial_mask) { + int i = ffs(partial_mask) - 1; + int px = x + pos_table4[i][0]; + int py = y + pos_table4[i][1]; + int cx[NR_PLANES]; + + for (j = 0; j < NR_PLANES; j++) + cx[j] = c[j] + plane[j].step[i] * 4; + + partial_mask &= ~(1 << i); + + TAG(do_block_4)(task, tri, plane, px, py, cx); + } + + /* Iterate over fulls: + */ + while (inmask) { + int i = ffs(inmask) - 1; + int px = x + pos_table4[i][0]; + int py = y + pos_table4[i][1]; + + inmask &= ~(1 << i); + + block_full_4(task, tri, px, py); + } +} + + +/** + * Scan the tile in chunks and figure out which pixels to rasterize + * for this triangle. + */ +void +TAG(lp_rast_triangle)(struct lp_rasterizer_task *task, + const union lp_rast_cmd_arg arg) +{ + const struct lp_rast_triangle *tri = arg.triangle.tri; + unsigned plane_mask = arg.triangle.plane_mask; + const int x = task->x, y = task->y; + struct lp_rast_plane plane[NR_PLANES]; + int c[NR_PLANES]; + unsigned outmask, inmask, partmask, partial_mask; + unsigned i, j, nr_planes = 0; + + while (plane_mask) { + int i = ffs(plane_mask) - 1; + plane[nr_planes] = tri->plane[i]; + plane_mask &= ~(1 << i); + nr_planes++; + }; + + assert(nr_planes == NR_PLANES); + outmask = 0; /* outside one or more trivial reject planes */ + partmask = 0; /* outside one or more trivial accept planes */ + + for (j = 0; j < NR_PLANES; j++) { + const int *step = plane[j].step; + const int eo = plane[j].eo * 16; + const int ei = plane[j].ei * 16; + int cox, cio; + + c[j] = plane[j].c + plane[j].dcdy * y - plane[j].dcdx * x; + cox = c[j] + eo; + cio = ei - 1 - eo; + + for (i = 0; i < 16; i++) { + int out = cox + step[i] * 16; + int part = out + cio; + outmask |= (out >> 31) & (1 << i); + partmask |= (part >> 31) & (1 << i); + } + } + + if (outmask == 0xffff) + return; + + /* Mask of sub-blocks which are inside all trivial accept planes: + */ + inmask = ~partmask & 0xffff; + + /* Mask of sub-blocks which are inside all trivial reject planes, + * but outside at least one trivial accept plane: + */ + partial_mask = partmask & ~outmask; + + assert((partial_mask & inmask) == 0); + + /* Iterate over partials: + */ + while (partial_mask) { + int i = ffs(partial_mask) - 1; + int px = x + pos_table16[i][0]; + int py = y + pos_table16[i][1]; + int cx[NR_PLANES]; + + for (j = 0; j < NR_PLANES; j++) + cx[j] = c[j] + plane[j].step[i] * 16; + + partial_mask &= ~(1 << i); + + LP_COUNT(nr_partially_covered_16); + TAG(do_block_16)(task, tri, plane, px, py, cx); + } + + /* Iterate over fulls: + */ + while (inmask) { + int i = ffs(inmask) - 1; + int px = x + pos_table16[i][0]; + int py = y + pos_table16[i][1]; + + inmask &= ~(1 << i); + + LP_COUNT(nr_fully_covered_16); + block_full_16(task, tri, px, py); + } +} + +#undef TAG +#undef NR_PLANES + diff --git a/src/gallium/drivers/llvmpipe/lp_scene.c b/src/gallium/drivers/llvmpipe/lp_scene.c index 845c175cf2..f88a759fe7 100644 --- a/src/gallium/drivers/llvmpipe/lp_scene.c +++ b/src/gallium/drivers/llvmpipe/lp_scene.c @@ -32,6 +32,7 @@ #include "util/u_simple_list.h" #include "lp_scene.h" #include "lp_scene_queue.h" +#include "lp_fence.h" /** List of texture references */ @@ -162,8 +163,8 @@ lp_scene_reset(struct lp_scene *scene ) /* Free all but last binner command lists: */ - for (i = 0; i < scene->tiles_x; i++) { - for (j = 0; j < scene->tiles_y; j++) { + for (i = 0; i < TILES_X; i++) { + for (j = 0; j < TILES_Y; j++) { lp_scene_bin_reset(scene, i, j); } } @@ -198,6 +199,8 @@ lp_scene_reset(struct lp_scene *scene ) make_empty_list(ref_list); } + lp_fence_reference(&scene->fence, NULL); + scene->scene_size = 0; scene->has_color_clear = FALSE; @@ -303,60 +306,6 @@ lp_scene_is_resource_referenced(const struct lp_scene *scene, } -/** - * Return last command in the bin - */ -static lp_rast_cmd -lp_get_last_command( const struct cmd_bin *bin ) -{ - const struct cmd_block *tail = bin->commands.tail; - const unsigned i = tail->count; - if (i > 0) - return tail->cmd[i - 1]; - else - return NULL; -} - - -/** - * Replace the arg of the last command in the bin. - */ -static void -lp_replace_last_command_arg( struct cmd_bin *bin, - const union lp_rast_cmd_arg arg ) -{ - struct cmd_block *tail = bin->commands.tail; - const unsigned i = tail->count; - assert(i > 0); - tail->arg[i - 1] = arg; -} - - - -/** - * Put a state-change command into all bins. - * If we find that the last command in a bin was also a state-change - * command, we can simply replace that one with the new one. - */ -void -lp_scene_bin_state_command( struct lp_scene *scene, - lp_rast_cmd cmd, - const union lp_rast_cmd_arg arg ) -{ - unsigned i, j; - for (i = 0; i < scene->tiles_x; i++) { - for (j = 0; j < scene->tiles_y; j++) { - struct cmd_bin *bin = lp_scene_get_bin(scene, i, j); - lp_rast_cmd last_cmd = lp_get_last_command(bin); - if (last_cmd == cmd) { - lp_replace_last_command_arg(bin, arg); - } - else { - lp_scene_bin_command( scene, i, j, cmd, arg ); - } - } - } -} /** advance curr_x,y to the next bin */ diff --git a/src/gallium/drivers/llvmpipe/lp_scene.h b/src/gallium/drivers/llvmpipe/lp_scene.h index 4e55d43174..fa1b311fa1 100644 --- a/src/gallium/drivers/llvmpipe/lp_scene.h +++ b/src/gallium/drivers/llvmpipe/lp_scene.h @@ -112,6 +112,7 @@ struct resource_ref { */ struct lp_scene { struct pipe_context *pipe; + struct lp_fence *fence; /** the framebuffer to render the scene into */ struct pipe_framebuffer_state fb; diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c index 6432cea862..167cb2ee2e 100644 --- a/src/gallium/drivers/llvmpipe/lp_screen.c +++ b/src/gallium/drivers/llvmpipe/lp_screen.c @@ -43,6 +43,7 @@ #include "lp_debug.h" #include "lp_public.h" #include "lp_limits.h" +#include "lp_rast.h" #include "state_tracker/sw_winsys.h" @@ -86,7 +87,7 @@ llvmpipe_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS: return PIPE_MAX_SAMPLERS; case PIPE_CAP_MAX_VERTEX_TEXTURE_UNITS: - return 0; + return PIPE_MAX_VERTEX_SAMPLERS; case PIPE_CAP_MAX_COMBINED_SAMPLERS: return PIPE_MAX_SAMPLERS + PIPE_MAX_VERTEX_SAMPLERS; case PIPE_CAP_NPOT_TEXTURES: @@ -166,6 +167,10 @@ llvmpipe_get_param(struct pipe_screen *screen, enum pipe_cap param) return LP_MAX_TGSI_PREDS; case PIPE_CAP_DEPTHSTENCIL_CLEAR_SEPARATE: return 1; + case PIPE_CAP_GEOMETRY_SHADER4: + return 1; + case PIPE_CAP_DEPTH_CLAMP: + return 0; default: assert(0); return 0; @@ -294,11 +299,16 @@ llvmpipe_destroy_screen( struct pipe_screen *_screen ) struct llvmpipe_screen *screen = llvmpipe_screen(_screen); struct sw_winsys *winsys = screen->winsys; + if (screen->rast) + lp_rast_destroy(screen->rast); + lp_jit_screen_cleanup(screen); if(winsys->destroy) winsys->destroy(winsys); + pipe_mutex_destroy(screen->rast_mutex); + FREE(screen); } @@ -347,11 +357,6 @@ llvmpipe_create_screen(struct sw_winsys *winsys) lp_jit_screen_init(screen); -#ifdef PIPE_OS_WINDOWS - /* Multithreading not supported on windows until conditions and barriers are - * properly implemented. */ - screen->num_threads = 0; -#else #ifdef PIPE_OS_EMBEDDED screen->num_threads = 0; #else @@ -359,7 +364,14 @@ llvmpipe_create_screen(struct sw_winsys *winsys) #endif screen->num_threads = debug_get_num_option("LP_NUM_THREADS", screen->num_threads); screen->num_threads = MIN2(screen->num_threads, LP_MAX_THREADS); -#endif + + screen->rast = lp_rast_create(screen->num_threads); + if (!screen->rast) { + lp_jit_screen_cleanup(screen); + FREE(screen); + return NULL; + } + pipe_mutex_init(screen->rast_mutex); util_format_s3tc_init(); diff --git a/src/gallium/drivers/llvmpipe/lp_screen.h b/src/gallium/drivers/llvmpipe/lp_screen.h index eb40f6823f..731526dfab 100644 --- a/src/gallium/drivers/llvmpipe/lp_screen.h +++ b/src/gallium/drivers/llvmpipe/lp_screen.h @@ -37,6 +37,7 @@ #include "gallivm/lp_bld.h" #include <llvm-c/ExecutionEngine.h> +#include "os/os_thread.h" #include "pipe/p_screen.h" #include "pipe/p_defines.h" @@ -63,6 +64,9 @@ struct llvmpipe_screen /* Increments whenever textures are modified. Contexts can track this. */ unsigned timestamp; + + struct lp_rasterizer *rast; + pipe_mutex rast_mutex; }; diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c b/src/gallium/drivers/llvmpipe/lp_setup.c index e8aafee33f..556e571585 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup.c +++ b/src/gallium/drivers/llvmpipe/lp_setup.c @@ -40,6 +40,7 @@ #include "util/u_memory.h" #include "util/u_pack_color.h" #include "lp_context.h" +#include "lp_memory.h" #include "lp_scene.h" #include "lp_scene_queue.h" #include "lp_texture.h" @@ -63,15 +64,7 @@ struct lp_scene * lp_setup_get_current_scene(struct lp_setup_context *setup) { if (!setup->scene) { - - /* wait for a free/empty scene - */ - setup->scene = lp_scene_dequeue(setup->empty_scenes, TRUE); - - assert(lp_scene_is_empty(setup->scene)); - - lp_scene_begin_binning(setup->scene, - &setup->fb ); + set_scene_state( setup, SETUP_EMPTY ); } return setup->scene; } @@ -159,8 +152,11 @@ static void lp_setup_rasterize_scene( struct lp_setup_context *setup ) { struct lp_scene *scene = lp_setup_get_current_scene(setup); + struct llvmpipe_screen *screen = llvmpipe_screen(scene->pipe->screen); - lp_scene_rasterize(scene, setup->rast); + pipe_mutex_lock(screen->rast_mutex); + lp_scene_rasterize(scene, screen->rast); + pipe_mutex_unlock(screen->rast_mutex); reset_context( setup ); @@ -233,22 +229,36 @@ set_scene_state( struct lp_setup_context *setup, LP_DBG(DEBUG_SETUP, "%s old %d new %d\n", __FUNCTION__, old_state, new_state); switch (new_state) { - case SETUP_ACTIVE: - begin_binning( setup ); + case SETUP_EMPTY: + assert(old_state == SETUP_FLUSHED); + assert(setup->scene == NULL); + + /* wait for a free/empty scene + */ + setup->scene = lp_scene_dequeue(setup->empty_scenes, TRUE); + assert(lp_scene_is_empty(setup->scene)); + lp_scene_begin_binning(setup->scene, + &setup->fb ); break; case SETUP_CLEARED: - if (old_state == SETUP_ACTIVE) { - assert(0); - return; - } + assert(old_state == SETUP_EMPTY); + assert(setup->scene != NULL); break; - + + case SETUP_ACTIVE: + assert(old_state == SETUP_EMPTY || + old_state == SETUP_CLEARED); + assert(setup->scene != NULL); + begin_binning( setup ); + break; + case SETUP_FLUSHED: if (old_state == SETUP_CLEARED) execute_clears( setup ); else lp_setup_rasterize_scene( setup ); + assert(setup->scene == NULL); break; default: @@ -264,23 +274,19 @@ set_scene_state( struct lp_setup_context *setup, */ void lp_setup_flush( struct lp_setup_context *setup, - unsigned flags ) + unsigned flags, + struct pipe_fence_handle **fence) { LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__); if (setup->scene) { - struct lp_scene *scene = lp_setup_get_current_scene(setup); - union lp_rast_cmd_arg dummy = {0}; - - if (flags & (PIPE_FLUSH_SWAPBUFFERS | - PIPE_FLUSH_FRAME)) { - /* Store colors in the linear color buffer(s). - * If we don't do this here, we'll end up converting the tiled - * data to linear in the texture_unmap() function, which will - * not be a parallel/threaded operation as here. + if (fence) { + /* if we're going to flush the setup/rasterization modules, emit + * a fence. */ - lp_scene_bin_everywhere(scene, lp_rast_store_color, dummy); + *fence = lp_setup_fence( setup ); } + } set_scene_state( setup, SETUP_FLUSHED ); @@ -297,6 +303,11 @@ lp_setup_bind_framebuffer( struct lp_setup_context *setup, */ set_scene_state( setup, SETUP_FLUSHED ); + /* + * Ensure the old scene is not reused. + */ + assert(!setup->scene); + /* Set new state. This will be picked up later when we next need a * scene. */ @@ -421,24 +432,27 @@ lp_setup_clear( struct lp_setup_context *setup, struct pipe_fence_handle * lp_setup_fence( struct lp_setup_context *setup ) { - if (setup->num_threads == 0) { + if (setup->scene == NULL) return NULL; - } - else { + else if (setup->num_threads == 0) + return NULL; + else + { struct lp_scene *scene = lp_setup_get_current_scene(setup); - const unsigned rank = lp_scene_get_num_bins( scene ); /* xxx */ - struct lp_fence *fence = lp_fence_create(rank); - - LP_DBG(DEBUG_SETUP, "%s rank %u\n", __FUNCTION__, rank); + const unsigned rank = setup->num_threads; set_scene_state( setup, SETUP_ACTIVE ); + + assert(scene->fence == NULL); + + /* The caller gets a reference, we keep a copy too, so need to + * bump the refcount: + */ + lp_fence_reference(&scene->fence, lp_fence_create(rank)); - /* insert the fence into all command bins */ - lp_scene_bin_everywhere( scene, - lp_rast_fence, - lp_rast_arg_fence(fence) ); + LP_DBG(DEBUG_SETUP, "%s rank %u\n", __FUNCTION__, rank); - return (struct pipe_fence_handle *) fence; + return (struct pipe_fence_handle *) scene->fence; } } @@ -611,6 +625,17 @@ lp_setup_set_fragment_sampler_views(struct lp_setup_context *setup, LP_TEX_LAYOUT_LINEAR); jit_tex->row_stride[j] = lp_tex->row_stride[j]; jit_tex->img_stride[j] = lp_tex->img_stride[j]; + + if (!jit_tex->data[j]) { + /* out of memory - use dummy tile memory */ + jit_tex->data[j] = lp_dummy_tile; + jit_tex->width = TILE_SIZE; + jit_tex->height = TILE_SIZE; + jit_tex->depth = 1; + jit_tex->last_level = 0; + jit_tex->row_stride[j] = 0; + jit_tex->img_stride[j] = 0; + } } } else { @@ -618,7 +643,6 @@ lp_setup_set_fragment_sampler_views(struct lp_setup_context *setup, /* * XXX: Where should this be unmapped? */ - struct llvmpipe_screen *screen = llvmpipe_screen(tex->screen); struct sw_winsys *winsys = screen->winsys; jit_tex->data[0] = winsys->displaytarget_map(winsys, lp_tex->dt, @@ -717,28 +741,6 @@ lp_setup_update_state( struct lp_setup_context *setup ) setup->dirty |= LP_SETUP_NEW_FS; } - if (setup->dirty & LP_SETUP_NEW_SCISSOR) { - float *stored; - - stored = lp_scene_alloc_aligned(scene, 4 * sizeof(int32_t), 16); - - if (stored) { - stored[0] = (float) setup->scissor.current.minx; - stored[1] = (float) setup->scissor.current.miny; - stored[2] = (float) setup->scissor.current.maxx; - stored[3] = (float) setup->scissor.current.maxy; - - setup->scissor.stored = stored; - - setup->fs.current.jit_context.scissor_xmin = stored[0]; - setup->fs.current.jit_context.scissor_ymin = stored[1]; - setup->fs.current.jit_context.scissor_xmax = stored[2]; - setup->fs.current.jit_context.scissor_ymax = stored[3]; - } - - setup->dirty |= LP_SETUP_NEW_FS; - } - if(setup->dirty & LP_SETUP_NEW_CONSTANTS) { struct pipe_resource *buffer = setup->constants.current; @@ -792,11 +794,6 @@ lp_setup_update_state( struct lp_setup_context *setup ) &setup->fs.current, sizeof setup->fs.current); setup->fs.stored = stored; - - /* put the state-set command into all bins */ - lp_scene_bin_state_command( scene, - lp_rast_set_state, - lp_rast_arg_state(setup->fs.stored) ); } /* The scene now references the textures in the rasterization @@ -843,8 +840,6 @@ lp_setup_destroy( struct lp_setup_context *setup ) lp_scene_queue_destroy(setup->empty_scenes); - lp_rast_destroy( setup->rast ); - FREE( setup ); } @@ -871,13 +866,7 @@ lp_setup_create( struct pipe_context *pipe, if (!setup->empty_scenes) goto fail; - /* XXX: move this to the screen and share between contexts: - */ setup->num_threads = screen->num_threads; - setup->rast = lp_rast_create(screen->num_threads); - if (!setup->rast) - goto fail; - setup->vbuf = draw_vbuf_stage(draw, &setup->base); if (!setup->vbuf) goto fail; @@ -901,9 +890,6 @@ lp_setup_create( struct pipe_context *pipe, return setup; fail: - if (setup->rast) - lp_rast_destroy( setup->rast ); - if (setup->vbuf) ; @@ -933,6 +919,8 @@ lp_setup_begin_query(struct lp_setup_context *setup, memset(pq->count, 0, sizeof(pq->count)); /* reset all counters */ + set_scene_state( setup, SETUP_ACTIVE ); + cmd_arg.query_obj = pq; lp_scene_bin_everywhere(scene, lp_rast_begin_query, cmd_arg); pq->binned = TRUE; @@ -948,6 +936,8 @@ lp_setup_end_query(struct lp_setup_context *setup, struct llvmpipe_query *pq) struct lp_scene * scene = lp_setup_get_current_scene(setup); union lp_rast_cmd_arg cmd_arg; + set_scene_state( setup, SETUP_ACTIVE ); + cmd_arg.query_obj = pq; lp_scene_bin_everywhere(scene, lp_rast_end_query, cmd_arg); } diff --git a/src/gallium/drivers/llvmpipe/lp_setup.h b/src/gallium/drivers/llvmpipe/lp_setup.h index 6a0dc55129..73b1c85325 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup.h +++ b/src/gallium/drivers/llvmpipe/lp_setup.h @@ -84,7 +84,8 @@ lp_setup_fence( struct lp_setup_context *setup ); void lp_setup_flush( struct lp_setup_context *setup, - unsigned flags ); + unsigned flags, + struct pipe_fence_handle **fence); void diff --git a/src/gallium/drivers/llvmpipe/lp_setup_context.h b/src/gallium/drivers/llvmpipe/lp_setup_context.h index c8b8a2480b..a0606f5034 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup_context.h +++ b/src/gallium/drivers/llvmpipe/lp_setup_context.h @@ -81,7 +81,6 @@ struct lp_setup_context */ struct draw_stage *vbuf; unsigned num_threads; - struct lp_rasterizer *rast; struct lp_scene *scenes[MAX_SCENES]; /**< all the scenes */ struct lp_scene *scene; /**< current scene being built */ struct lp_scene_queue *empty_scenes; /**< queue of empty scenes */ @@ -101,9 +100,10 @@ struct lp_setup_context } clear; enum setup_state { - SETUP_FLUSHED, - SETUP_CLEARED, - SETUP_ACTIVE + SETUP_FLUSHED, /**< scene is null */ + SETUP_EMPTY, /**< scene exists but has only state changes */ + SETUP_CLEARED, /**< scene exists but has only clears */ + SETUP_ACTIVE /**< scene exists and has at least one draw/query */ } state; struct { @@ -129,7 +129,6 @@ struct lp_setup_context struct { struct pipe_scissor_state current; - const void *stored; } scissor; unsigned dirty; /**< bitmask of LP_SETUP_NEW_x bits */ diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c index 0557d35f8b..7e432503c1 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c +++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c @@ -38,12 +38,78 @@ #define NUM_CHANNELS 4 +struct tri_info { + + float pixel_offset; + + /* fixed point vertex coordinates */ + int x[3]; + int y[3]; + + /* float x,y deltas - all from the original coordinates + */ + float dy01, dy20; + float dx01, dx20; + float oneoverarea; + + const float (*v0)[4]; + const float (*v1)[4]; + const float (*v2)[4]; + + boolean frontfacing; +}; + + + +static const int step_scissor_minx[16] = { + 0, 1, 0, 1, + 2, 3, 2, 3, + 0, 1, 0, 1, + 2, 3, 2, 3 +}; + +static const int step_scissor_maxx[16] = { + 0, -1, 0, -1, + -2, -3, -2, -3, + 0, -1, 0, -1, + -2, -3, -2, -3 +}; + +static const int step_scissor_miny[16] = { + 0, 0, 1, 1, + 0, 0, 1, 1, + 2, 2, 3, 3, + 2, 2, 3, 3 +}; + +static const int step_scissor_maxy[16] = { + 0, 0, -1, -1, + 0, 0, -1, -1, + -2, -2, -3, -3, + -2, -2, -3, -3 +}; + + + + +static INLINE int +subpixel_snap(float a) +{ + return util_iround(FIXED_ONE * a); +} + +static INLINE float +fixed_to_float(int a) +{ + return a * (1.0 / FIXED_ONE); +} + + /** * Compute a0 for a constant-valued coefficient (GL_FLAT shading). */ -static void constant_coef( struct lp_setup_context *setup, - struct lp_rast_triangle *tri, +static void constant_coef( struct lp_rast_triangle *tri, unsigned slot, const float value, unsigned i ) @@ -54,28 +120,21 @@ static void constant_coef( struct lp_setup_context *setup, } -/** - * Compute a0, dadx and dady for a linearly interpolated coefficient, - * for a triangle. - */ -static void linear_coef( struct lp_setup_context *setup, - struct lp_rast_triangle *tri, - float oneoverarea, + +static void linear_coef( struct lp_rast_triangle *tri, + const struct tri_info *info, unsigned slot, - const float (*v1)[4], - const float (*v2)[4], - const float (*v3)[4], unsigned vert_attr, unsigned i) { - float a1 = v1[vert_attr][i]; - float a2 = v2[vert_attr][i]; - float a3 = v3[vert_attr][i]; + float a0 = info->v0[vert_attr][i]; + float a1 = info->v1[vert_attr][i]; + float a2 = info->v2[vert_attr][i]; - float da12 = a1 - a2; - float da31 = a3 - a1; - float dadx = (da12 * tri->dy31 - tri->dy12 * da31) * oneoverarea; - float dady = (da31 * tri->dx12 - tri->dx31 * da12) * oneoverarea; + float da01 = a0 - a1; + float da20 = a2 - a0; + float dadx = (da01 * info->dy20 - info->dy01 * da20) * info->oneoverarea; + float dady = (da20 * info->dx01 - info->dx20 * da01) * info->oneoverarea; tri->inputs.dadx[slot][i] = dadx; tri->inputs.dady[slot][i] = dady; @@ -92,9 +151,9 @@ static void linear_coef( struct lp_setup_context *setup, * to define a0 as the sample at a pixel center somewhere near vmin * instead - i'll switch to this later. */ - tri->inputs.a0[slot][i] = (a1 - - (dadx * (v1[0][0] - setup->pixel_offset) + - dady * (v1[0][1] - setup->pixel_offset))); + tri->inputs.a0[slot][i] = (a0 - + (dadx * (info->v0[0][0] - info->pixel_offset) + + dady * (info->v0[0][1] - info->pixel_offset))); } @@ -106,31 +165,27 @@ static void linear_coef( struct lp_setup_context *setup, * Later, when we compute the value at a particular fragment position we'll * divide the interpolated value by the interpolated W at that fragment. */ -static void perspective_coef( struct lp_setup_context *setup, - struct lp_rast_triangle *tri, - float oneoverarea, +static void perspective_coef( struct lp_rast_triangle *tri, + const struct tri_info *info, unsigned slot, - const float (*v1)[4], - const float (*v2)[4], - const float (*v3)[4], unsigned vert_attr, unsigned i) { /* premultiply by 1/w (v[0][3] is always 1/w): */ - float a1 = v1[vert_attr][i] * v1[0][3]; - float a2 = v2[vert_attr][i] * v2[0][3]; - float a3 = v3[vert_attr][i] * v3[0][3]; - float da12 = a1 - a2; - float da31 = a3 - a1; - float dadx = (da12 * tri->dy31 - tri->dy12 * da31) * oneoverarea; - float dady = (da31 * tri->dx12 - tri->dx31 * da12) * oneoverarea; + float a0 = info->v0[vert_attr][i] * info->v0[0][3]; + float a1 = info->v1[vert_attr][i] * info->v1[0][3]; + float a2 = info->v2[vert_attr][i] * info->v2[0][3]; + float da01 = a0 - a1; + float da20 = a2 - a0; + float dadx = (da01 * info->dy20 - info->dy01 * da20) * info->oneoverarea; + float dady = (da20 * info->dx01 - info->dx20 * da01) * info->oneoverarea; tri->inputs.dadx[slot][i] = dadx; tri->inputs.dady[slot][i] = dady; - tri->inputs.a0[slot][i] = (a1 - - (dadx * (v1[0][0] - setup->pixel_offset) + - dady * (v1[0][1] - setup->pixel_offset))); + tri->inputs.a0[slot][i] = (a0 - + (dadx * (info->v0[0][0] - info->pixel_offset) + + dady * (info->v0[0][1] - info->pixel_offset))); } @@ -141,13 +196,9 @@ static void perspective_coef( struct lp_setup_context *setup, * We could do a bit less work if we'd examine gl_FragCoord's swizzle mask. */ static void -setup_fragcoord_coef(struct lp_setup_context *setup, - struct lp_rast_triangle *tri, - float oneoverarea, +setup_fragcoord_coef(struct lp_rast_triangle *tri, + const struct tri_info *info, unsigned slot, - const float (*v1)[4], - const float (*v2)[4], - const float (*v3)[4], unsigned usage_mask) { /*X*/ @@ -166,12 +217,12 @@ setup_fragcoord_coef(struct lp_setup_context *setup, /*Z*/ if (usage_mask & TGSI_WRITEMASK_Z) { - linear_coef(setup, tri, oneoverarea, slot, v1, v2, v3, 0, 2); + linear_coef(tri, info, slot, 0, 2); } /*W*/ if (usage_mask & TGSI_WRITEMASK_W) { - linear_coef(setup, tri, oneoverarea, slot, v1, v2, v3, 0, 3); + linear_coef(tri, info, slot, 0, 3); } } @@ -180,24 +231,23 @@ setup_fragcoord_coef(struct lp_setup_context *setup, * Setup the fragment input attribute with the front-facing value. * \param frontface is the triangle front facing? */ -static void setup_facing_coef( struct lp_setup_context *setup, - struct lp_rast_triangle *tri, +static void setup_facing_coef( struct lp_rast_triangle *tri, unsigned slot, boolean frontface, unsigned usage_mask) { /* convert TRUE to 1.0 and FALSE to -1.0 */ if (usage_mask & TGSI_WRITEMASK_X) - constant_coef( setup, tri, slot, 2.0f * frontface - 1.0f, 0 ); + constant_coef( tri, slot, 2.0f * frontface - 1.0f, 0 ); if (usage_mask & TGSI_WRITEMASK_Y) - constant_coef( setup, tri, slot, 0.0f, 1 ); /* wasted */ + constant_coef( tri, slot, 0.0f, 1 ); /* wasted */ if (usage_mask & TGSI_WRITEMASK_Z) - constant_coef( setup, tri, slot, 0.0f, 2 ); /* wasted */ + constant_coef( tri, slot, 0.0f, 2 ); /* wasted */ if (usage_mask & TGSI_WRITEMASK_W) - constant_coef( setup, tri, slot, 0.0f, 3 ); /* wasted */ + constant_coef( tri, slot, 0.0f, 3 ); /* wasted */ } @@ -206,11 +256,7 @@ static void setup_facing_coef( struct lp_setup_context *setup, */ static void setup_tri_coefficients( struct lp_setup_context *setup, struct lp_rast_triangle *tri, - float oneoverarea, - const float (*v1)[4], - const float (*v2)[4], - const float (*v3)[4], - boolean frontface) + const struct tri_info *info) { unsigned fragcoord_usage_mask = TGSI_WRITEMASK_XYZ; unsigned slot; @@ -227,25 +273,25 @@ static void setup_tri_coefficients( struct lp_setup_context *setup, if (setup->flatshade_first) { for (i = 0; i < NUM_CHANNELS; i++) if (usage_mask & (1 << i)) - constant_coef(setup, tri, slot+1, v1[vert_attr][i], i); + constant_coef(tri, slot+1, info->v0[vert_attr][i], i); } else { for (i = 0; i < NUM_CHANNELS; i++) if (usage_mask & (1 << i)) - constant_coef(setup, tri, slot+1, v3[vert_attr][i], i); + constant_coef(tri, slot+1, info->v2[vert_attr][i], i); } break; case LP_INTERP_LINEAR: for (i = 0; i < NUM_CHANNELS; i++) if (usage_mask & (1 << i)) - linear_coef(setup, tri, oneoverarea, slot+1, v1, v2, v3, vert_attr, i); + linear_coef(tri, info, slot+1, vert_attr, i); break; case LP_INTERP_PERSPECTIVE: for (i = 0; i < NUM_CHANNELS; i++) if (usage_mask & (1 << i)) - perspective_coef(setup, tri, oneoverarea, slot+1, v1, v2, v3, vert_attr, i); + perspective_coef(tri, info, slot+1, vert_attr, i); fragcoord_usage_mask |= TGSI_WRITEMASK_W; break; @@ -259,7 +305,7 @@ static void setup_tri_coefficients( struct lp_setup_context *setup, break; case LP_INTERP_FACING: - setup_facing_coef(setup, tri, slot+1, frontface, usage_mask); + setup_facing_coef(tri, slot+1, info->frontfacing, usage_mask); break; default: @@ -269,16 +315,11 @@ static void setup_tri_coefficients( struct lp_setup_context *setup, /* The internal position input is in slot zero: */ - setup_fragcoord_coef(setup, tri, oneoverarea, 0, v1, v2, v3, - fragcoord_usage_mask); + setup_fragcoord_coef(tri, info, 0, fragcoord_usage_mask); } -static INLINE int subpixel_snap( float a ) -{ - return util_iround(FIXED_ONE * a - (FIXED_ONE / 2)); -} @@ -291,21 +332,23 @@ static INLINE int subpixel_snap( float a ) * \return pointer to triangle space */ static INLINE struct lp_rast_triangle * -alloc_triangle(struct lp_scene *scene, unsigned nr_inputs, unsigned *tri_size) +alloc_triangle(struct lp_scene *scene, + unsigned nr_inputs, + unsigned nr_planes, + unsigned *tri_size) { unsigned input_array_sz = NUM_CHANNELS * (nr_inputs + 1) * sizeof(float); struct lp_rast_triangle *tri; - unsigned bytes; + unsigned tri_bytes, bytes; char *inputs; - assert(sizeof(*tri) % 16 == 0); - - bytes = sizeof(*tri) + (3 * input_array_sz); + tri_bytes = align(Offset(struct lp_rast_triangle, plane[nr_planes]), 16); + bytes = tri_bytes + (3 * input_array_sz); tri = lp_scene_alloc_aligned( scene, bytes, 16 ); if (tri) { - inputs = (char *) (tri + 1); + inputs = ((char *)tri) + tri_bytes; tri->inputs.a0 = (float (*)[4]) inputs; tri->inputs.dadx = (float (*)[4]) (inputs + input_array_sz); tri->inputs.dady = (float (*)[4]) (inputs + 2 * input_array_sz); @@ -329,52 +372,71 @@ print_triangle(struct lp_setup_context *setup, uint i; debug_printf("llvmpipe triangle\n"); - for (i = 0; i < setup->fs.nr_inputs; i++) { + for (i = 0; i < 1 + setup->fs.nr_inputs; i++) { debug_printf(" v1[%d]: %f %f %f %f\n", i, v1[i][0], v1[i][1], v1[i][2], v1[i][3]); } - for (i = 0; i < setup->fs.nr_inputs; i++) { + for (i = 0; i < 1 + setup->fs.nr_inputs; i++) { debug_printf(" v2[%d]: %f %f %f %f\n", i, v2[i][0], v2[i][1], v2[i][2], v2[i][3]); } - for (i = 0; i < setup->fs.nr_inputs; i++) { + for (i = 0; i < 1 + setup->fs.nr_inputs; i++) { debug_printf(" v3[%d]: %f %f %f %f\n", i, v3[i][0], v3[i][1], v3[i][2], v3[i][3]); } } +lp_rast_cmd lp_rast_tri_tab[8] = { + NULL, /* should be impossible */ + lp_rast_triangle_1, + lp_rast_triangle_2, + lp_rast_triangle_3, + lp_rast_triangle_4, + lp_rast_triangle_5, + lp_rast_triangle_6, + lp_rast_triangle_7 +}; + /** * Do basic setup for triangle rasterization and determine which * framebuffer tiles are touched. Put the triangle in the scene's * bins for the tiles which we overlap. */ -static void +static void do_triangle_ccw(struct lp_setup_context *setup, const float (*v1)[4], const float (*v2)[4], const float (*v3)[4], boolean frontfacing ) { - /* x/y positions in fixed point */ - const int x1 = subpixel_snap(v1[0][0] + 0.5 - setup->pixel_offset); - const int x2 = subpixel_snap(v2[0][0] + 0.5 - setup->pixel_offset); - const int x3 = subpixel_snap(v3[0][0] + 0.5 - setup->pixel_offset); - const int y1 = subpixel_snap(v1[0][1] + 0.5 - setup->pixel_offset); - const int y2 = subpixel_snap(v2[0][1] + 0.5 - setup->pixel_offset); - const int y3 = subpixel_snap(v3[0][1] + 0.5 - setup->pixel_offset); struct lp_scene *scene = lp_setup_get_current_scene(setup); + struct lp_fragment_shader_variant *variant = setup->fs.current.variant; struct lp_rast_triangle *tri; + struct tri_info info; int area; - float oneoverarea; int minx, maxx, miny, maxy; + int ix0, ix1, iy0, iy1; unsigned tri_bytes; - + int i; + int nr_planes = 3; + if (0) print_triangle(setup, v1, v2, v3); - tri = alloc_triangle(scene, setup->fs.nr_inputs, &tri_bytes); + if (setup->scissor_test) { + nr_planes = 7; + } + else { + nr_planes = 3; + } + + + tri = alloc_triangle(scene, + setup->fs.nr_inputs, + nr_planes, + &tri_bytes); if (!tri) return; @@ -387,15 +449,24 @@ do_triangle_ccw(struct lp_setup_context *setup, tri->v[2][1] = v3[0][1]; #endif - tri->dx12 = x1 - x2; - tri->dx23 = x2 - x3; - tri->dx31 = x3 - x1; + /* x/y positions in fixed point */ + info.x[0] = subpixel_snap(v1[0][0] - setup->pixel_offset); + info.x[1] = subpixel_snap(v2[0][0] - setup->pixel_offset); + info.x[2] = subpixel_snap(v3[0][0] - setup->pixel_offset); + info.y[0] = subpixel_snap(v1[0][1] - setup->pixel_offset); + info.y[1] = subpixel_snap(v2[0][1] - setup->pixel_offset); + info.y[2] = subpixel_snap(v3[0][1] - setup->pixel_offset); + + tri->plane[0].dcdy = info.x[0] - info.x[1]; + tri->plane[1].dcdy = info.x[1] - info.x[2]; + tri->plane[2].dcdy = info.x[2] - info.x[0]; - tri->dy12 = y1 - y2; - tri->dy23 = y2 - y3; - tri->dy31 = y3 - y1; + tri->plane[0].dcdx = info.y[0] - info.y[1]; + tri->plane[1].dcdx = info.y[1] - info.y[2]; + tri->plane[2].dcdx = info.y[2] - info.y[0]; - area = (tri->dx12 * tri->dy31 - tri->dx31 * tri->dy12); + area = (tri->plane[0].dcdy * tri->plane[2].dcdx - + tri->plane[2].dcdy * tri->plane[0].dcdx); LP_COUNT(nr_tris); @@ -410,20 +481,35 @@ do_triangle_ccw(struct lp_setup_context *setup, } /* Bounding rectangle (in pixels) */ - minx = (MIN3(x1, x2, x3) + (FIXED_ONE-1)) >> FIXED_ORDER; - maxx = (MAX3(x1, x2, x3) + (FIXED_ONE-1)) >> FIXED_ORDER; - miny = (MIN3(y1, y2, y3) + (FIXED_ONE-1)) >> FIXED_ORDER; - maxy = (MAX3(y1, y2, y3) + (FIXED_ONE-1)) >> FIXED_ORDER; - + { + /* Yes this is necessary to accurately calculate bounding boxes + * with the two fill-conventions we support. GL (normally) ends + * up needing a bottom-left fill convention, which requires + * slightly different rounding. + */ + int adj = (setup->pixel_offset != 0) ? 1 : 0; + + minx = (MIN3(info.x[0], info.x[1], info.x[2]) + (FIXED_ONE-1)) >> FIXED_ORDER; + maxx = (MAX3(info.x[0], info.x[1], info.x[2]) + (FIXED_ONE-1)) >> FIXED_ORDER; + miny = (MIN3(info.y[0], info.y[1], info.y[2]) + (FIXED_ONE-1) + adj) >> FIXED_ORDER; + maxy = (MAX3(info.y[0], info.y[1], info.y[2]) + (FIXED_ONE-1) + adj) >> FIXED_ORDER; + } + if (setup->scissor_test) { minx = MAX2(minx, setup->scissor.current.minx); maxx = MIN2(maxx, setup->scissor.current.maxx); miny = MAX2(miny, setup->scissor.current.miny); maxy = MIN2(maxy, setup->scissor.current.maxy); } + else { + minx = MAX2(minx, 0); + miny = MAX2(miny, 0); + maxx = MIN2(maxx, scene->fb.width); + maxy = MIN2(maxy, scene->fb.height); + } + - if (miny == maxy || - minx == maxx) { + if (miny >= maxy || minx >= maxx) { lp_scene_putback_data( scene, tri_bytes ); LP_COUNT(nr_culled_tris); return; @@ -431,75 +517,88 @@ do_triangle_ccw(struct lp_setup_context *setup, /* */ - oneoverarea = ((float)FIXED_ONE) / (float)area; + info.pixel_offset = setup->pixel_offset; + info.v0 = v1; + info.v1 = v2; + info.v2 = v3; + info.dx01 = info.v0[0][0] - info.v1[0][0]; + info.dx20 = info.v2[0][0] - info.v0[0][0]; + info.dy01 = info.v0[0][1] - info.v1[0][1]; + info.dy20 = info.v2[0][1] - info.v0[0][1]; + info.oneoverarea = 1.0 / (info.dx01 * info.dy20 - info.dx20 * info.dy01); + info.frontfacing = frontfacing; /* Setup parameter interpolants: */ - setup_tri_coefficients( setup, tri, oneoverarea, v1, v2, v3, frontfacing ); + setup_tri_coefficients( setup, tri, &info ); tri->inputs.facing = frontfacing ? 1.0F : -1.0F; + tri->inputs.state = setup->fs.stored; - /* half-edge constants, will be interated over the whole render target. - */ - tri->c1 = tri->dy12 * x1 - tri->dx12 * y1; - tri->c2 = tri->dy23 * x2 - tri->dx23 * y2; - tri->c3 = tri->dy31 * x3 - tri->dx31 * y3; - /* correct for top-left fill convention: - */ - if (tri->dy12 < 0 || (tri->dy12 == 0 && tri->dx12 > 0)) tri->c1++; - if (tri->dy23 < 0 || (tri->dy23 == 0 && tri->dx23 > 0)) tri->c2++; - if (tri->dy31 < 0 || (tri->dy31 == 0 && tri->dx31 > 0)) tri->c3++; - - tri->dy12 *= FIXED_ONE; - tri->dy23 *= FIXED_ONE; - tri->dy31 *= FIXED_ONE; - - tri->dx12 *= FIXED_ONE; - tri->dx23 *= FIXED_ONE; - tri->dx31 *= FIXED_ONE; - - /* find trivial reject offsets for each edge for a single-pixel - * sized block. These will be scaled up at each recursive level to - * match the active blocksize. Scaling in this way works best if - * the blocks are square. - */ - tri->eo1 = 0; - if (tri->dy12 < 0) tri->eo1 -= tri->dy12; - if (tri->dx12 > 0) tri->eo1 += tri->dx12; + + for (i = 0; i < 3; i++) { + struct lp_rast_plane *plane = &tri->plane[i]; - tri->eo2 = 0; - if (tri->dy23 < 0) tri->eo2 -= tri->dy23; - if (tri->dx23 > 0) tri->eo2 += tri->dx23; + /* half-edge constants, will be interated over the whole render + * target. + */ + plane->c = plane->dcdx * info.x[i] - plane->dcdy * info.y[i]; + + /* correct for top-left vs. bottom-left fill convention. + * + * note that we're overloading gl_rasterization_rules to mean + * both (0.5,0.5) pixel centers *and* bottom-left filling + * convention. + * + * GL actually has a top-left filling convention, but GL's + * notion of "top" differs from gallium's... + * + * Also, sometimes (in FBO cases) GL will render upside down + * to its usual method, in which case it will probably want + * to use the opposite, top-left convention. + */ + if (plane->dcdx < 0) { + /* both fill conventions want this - adjust for left edges */ + plane->c++; + } + else if (plane->dcdx == 0) { + if (setup->pixel_offset == 0) { + /* correct for top-left fill convention: + */ + if (plane->dcdy > 0) plane->c++; + } + else { + /* correct for bottom-left fill convention: + */ + if (plane->dcdy < 0) plane->c++; + } + } - tri->eo3 = 0; - if (tri->dy31 < 0) tri->eo3 -= tri->dy31; - if (tri->dx31 > 0) tri->eo3 += tri->dx31; + plane->dcdx *= FIXED_ONE; + plane->dcdy *= FIXED_ONE; - /* Calculate trivial accept offsets from the above. - */ - tri->ei1 = tri->dx12 - tri->dy12 - tri->eo1; - tri->ei2 = tri->dx23 - tri->dy23 - tri->eo2; - tri->ei3 = tri->dx31 - tri->dy31 - tri->eo3; + /* find trivial reject offsets for each edge for a single-pixel + * sized block. These will be scaled up at each recursive level to + * match the active blocksize. Scaling in this way works best if + * the blocks are square. + */ + plane->eo = 0; + if (plane->dcdx < 0) plane->eo -= plane->dcdx; + if (plane->dcdy > 0) plane->eo += plane->dcdy; - /* Fill in the inputs.step[][] arrays. - * We've manually unrolled some loops here. - */ - { - const int xstep1 = -tri->dy12; - const int xstep2 = -tri->dy23; - const int xstep3 = -tri->dy31; - const int ystep1 = tri->dx12; - const int ystep2 = tri->dx23; - const int ystep3 = tri->dx31; - -#define SETUP_STEP(i, x, y) \ - do { \ - tri->inputs.step[0][i] = x * xstep1 + y * ystep1; \ - tri->inputs.step[1][i] = x * xstep2 + y * ystep2; \ - tri->inputs.step[2][i] = x * xstep3 + y * ystep3; \ - } while (0) + /* Calculate trivial accept offsets from the above. + */ + plane->ei = plane->dcdy - plane->dcdx - plane->eo; + plane->step = tri->step[i]; + + /* Fill in the inputs.step[][] arrays. + * We've manually unrolled some loops here. + */ +#define SETUP_STEP(j, x, y) \ + tri->step[i][j] = y * plane->dcdy - x * plane->dcdx + SETUP_STEP(0, 0, 0); SETUP_STEP(1, 1, 0); SETUP_STEP(2, 0, 1); @@ -522,63 +621,106 @@ do_triangle_ccw(struct lp_setup_context *setup, #undef STEP } + + /* + * When rasterizing scissored tris, use the intersection of the + * triangle bounding box and the scissor rect to generate the + * scissor planes. + * + * This permits us to cut off the triangle "tails" that are present + * in the intermediate recursive levels caused when two of the + * triangles edges don't diverge quickly enough to trivially reject + * exterior blocks from the triangle. + * + * It's not really clear if it's worth worrying about these tails, + * but since we generate the planes for each scissored tri, it's + * free to trim them in this case. + * + * Note that otherwise, the scissor planes only vary in 'C' value, + * and even then only on state-changes. Could alternatively store + * these planes elsewhere. + */ + if (nr_planes == 7) { + tri->plane[3].step = step_scissor_minx; + tri->plane[3].dcdx = -1; + tri->plane[3].dcdy = 0; + tri->plane[3].c = 1-minx; + tri->plane[3].ei = 0; + tri->plane[3].eo = 1; + + tri->plane[4].step = step_scissor_maxx; + tri->plane[4].dcdx = 1; + tri->plane[4].dcdy = 0; + tri->plane[4].c = maxx; + tri->plane[4].ei = -1; + tri->plane[4].eo = 0; + + tri->plane[5].step = step_scissor_miny; + tri->plane[5].dcdx = 0; + tri->plane[5].dcdy = 1; + tri->plane[5].c = 1-miny; + tri->plane[5].ei = 0; + tri->plane[5].eo = 1; + + tri->plane[6].step = step_scissor_maxy; + tri->plane[6].dcdx = 0; + tri->plane[6].dcdy = -1; + tri->plane[6].c = maxy; + tri->plane[6].ei = -1; + tri->plane[6].eo = 0; + } + + /* * All fields of 'tri' are now set. The remaining code here is * concerned with binning. */ - /* Convert to tile coordinates: + /* Convert to tile coordinates, and inclusive ranges: */ - minx = minx / TILE_SIZE; - miny = miny / TILE_SIZE; - maxx = maxx / TILE_SIZE; - maxy = maxy / TILE_SIZE; + ix0 = minx / TILE_SIZE; + iy0 = miny / TILE_SIZE; + ix1 = (maxx-1) / TILE_SIZE; + iy1 = (maxy-1) / TILE_SIZE; /* * Clamp to framebuffer size */ - minx = MAX2(minx, 0); - miny = MAX2(miny, 0); - maxx = MIN2(maxx, scene->tiles_x - 1); - maxy = MIN2(maxy, scene->tiles_y - 1); + assert(ix0 == MAX2(ix0, 0)); + assert(iy0 == MAX2(iy0, 0)); + assert(ix1 == MIN2(ix1, scene->tiles_x - 1)); + assert(iy1 == MIN2(iy1, scene->tiles_y - 1)); /* Determine which tile(s) intersect the triangle's bounding box */ - if (miny == maxy && minx == maxx) + if (iy0 == iy1 && ix0 == ix1) { /* Triangle is contained in a single tile: */ - lp_scene_bin_command( scene, minx, miny, lp_rast_triangle, - lp_rast_arg_triangle(tri) ); + lp_scene_bin_command( scene, ix0, iy0, + lp_rast_tri_tab[nr_planes], + lp_rast_arg_triangle(tri, (1<<nr_planes)-1) ); } - else + else { - int c1 = (tri->c1 + - tri->dx12 * miny * TILE_SIZE - - tri->dy12 * minx * TILE_SIZE); - int c2 = (tri->c2 + - tri->dx23 * miny * TILE_SIZE - - tri->dy23 * minx * TILE_SIZE); - int c3 = (tri->c3 + - tri->dx31 * miny * TILE_SIZE - - tri->dy31 * minx * TILE_SIZE); - - int ei1 = tri->ei1 << TILE_ORDER; - int ei2 = tri->ei2 << TILE_ORDER; - int ei3 = tri->ei3 << TILE_ORDER; - - int eo1 = tri->eo1 << TILE_ORDER; - int eo2 = tri->eo2 << TILE_ORDER; - int eo3 = tri->eo3 << TILE_ORDER; - - int xstep1 = -(tri->dy12 << TILE_ORDER); - int xstep2 = -(tri->dy23 << TILE_ORDER); - int xstep3 = -(tri->dy31 << TILE_ORDER); - - int ystep1 = tri->dx12 << TILE_ORDER; - int ystep2 = tri->dx23 << TILE_ORDER; - int ystep3 = tri->dx31 << TILE_ORDER; + int c[7]; + int ei[7]; + int eo[7]; + int xstep[7]; + int ystep[7]; int x, y; + + for (i = 0; i < nr_planes; i++) { + c[i] = (tri->plane[i].c + + tri->plane[i].dcdy * iy0 * TILE_SIZE - + tri->plane[i].dcdx * ix0 * TILE_SIZE); + + ei[i] = tri->plane[i].ei << TILE_ORDER; + eo[i] = tri->plane[i].eo << TILE_ORDER; + xstep[i] = -(tri->plane[i].dcdx << TILE_ORDER); + ystep[i] = tri->plane[i].dcdy << TILE_ORDER; + } + /* Test tile-sized blocks against the triangle. @@ -586,63 +728,67 @@ do_triangle_ccw(struct lp_setup_context *setup, * contained inside the tri, bin an lp_rast_shade_tile command. * Else, bin a lp_rast_triangle command. */ - for (y = miny; y <= maxy; y++) + for (y = iy0; y <= iy1; y++) { - int cx1 = c1; - int cx2 = c2; - int cx3 = c3; boolean in = FALSE; /* are we inside the triangle? */ + int cx[7]; + + for (i = 0; i < nr_planes; i++) + cx[i] = c[i]; - for (x = minx; x <= maxx; x++) + for (x = ix0; x <= ix1; x++) { - if (cx1 + eo1 < 0 || - cx2 + eo2 < 0 || - cx3 + eo3 < 0) - { - /* do nothing */ + int out = 0; + int partial = 0; + + for (i = 0; i < nr_planes; i++) { + int planeout = cx[i] + eo[i]; + int planepartial = cx[i] + ei[i] - 1; + out |= (planeout >> 31); + partial |= (planepartial >> 31) & (1<<i); + } + + if (out) { + /* do nothing */ + if (in) + break; /* exiting triangle, all done with this row */ LP_COUNT(nr_empty_64); - if (in) - break; /* exiting triangle, all done with this row */ - } - else if (cx1 + ei1 > 0 && - cx2 + ei2 > 0 && - cx3 + ei3 > 0) - { + } + else if (partial) { + /* Not trivially accepted by at least one plane - + * rasterize/shade partial tile + */ + int count = util_bitcount(partial); + in = TRUE; + lp_scene_bin_command( scene, x, y, + lp_rast_tri_tab[count], + lp_rast_arg_triangle(tri, partial) ); + + LP_COUNT(nr_partially_covered_64); + } + else { /* triangle covers the whole tile- shade whole tile */ LP_COUNT(nr_fully_covered_64); - in = TRUE; - if (setup->fs.current.variant->opaque) { + in = TRUE; + if (variant->opaque && + !setup->fb.zsbuf) { lp_scene_bin_reset( scene, x, y ); - lp_scene_bin_command( scene, x, y, - lp_rast_set_state, - lp_rast_arg_state(setup->fs.stored) ); } lp_scene_bin_command( scene, x, y, lp_rast_shade_tile, lp_rast_arg_inputs(&tri->inputs) ); - } - else - { - /* rasterizer/shade partial tile */ - LP_COUNT(nr_partially_covered_64); - in = TRUE; - lp_scene_bin_command( scene, x, y, - lp_rast_triangle, - lp_rast_arg_triangle(tri) ); - } + } /* Iterate cx values across the region: */ - cx1 += xstep1; - cx2 += xstep2; - cx3 += xstep3; + for (i = 0; i < nr_planes; i++) + cx[i] += xstep[i]; } /* Iterate c values down the region: */ - c1 += ystep1; - c2 += ystep2; - c3 += ystep3; + for (i = 0; i < nr_planes; i++) + c[i] += ystep[i]; } } } diff --git a/src/gallium/drivers/llvmpipe/lp_setup_vbuf.c b/src/gallium/drivers/llvmpipe/lp_setup_vbuf.c index f6a424f25a..51948f5bf2 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup_vbuf.c +++ b/src/gallium/drivers/llvmpipe/lp_setup_vbuf.c @@ -61,7 +61,9 @@ lp_setup_get_vertex_info(struct vbuf_render *vbr) { struct lp_setup_context *setup = lp_setup_context(vbr); - /* vertex size/info depends on the latest state */ + /* Vertex size/info depends on the latest state. + * The draw module may have issued additional state-change commands. + */ lp_setup_update_state(setup); return setup->vertex_info; diff --git a/src/gallium/drivers/llvmpipe/lp_state.h b/src/gallium/drivers/llvmpipe/lp_state.h index 05d1b93794..86313e1c48 100644 --- a/src/gallium/drivers/llvmpipe/lp_state.h +++ b/src/gallium/drivers/llvmpipe/lp_state.h @@ -130,6 +130,12 @@ llvmpipe_init_rasterizer_funcs(struct llvmpipe_context *llvmpipe); void llvmpipe_init_so_funcs(struct llvmpipe_context *llvmpipe); +void +llvmpipe_prepare_vertex_sampling(struct llvmpipe_context *ctx, + unsigned num, + struct pipe_sampler_view **views); +void +llvmpipe_cleanup_vertex_sampling(struct llvmpipe_context *ctx); #endif diff --git a/src/gallium/drivers/llvmpipe/lp_state_derived.c b/src/gallium/drivers/llvmpipe/lp_state_derived.c index d20a5218d4..77bec4640b 100644 --- a/src/gallium/drivers/llvmpipe/lp_state_derived.c +++ b/src/gallium/drivers/llvmpipe/lp_state_derived.c @@ -189,7 +189,7 @@ void llvmpipe_update_derived( struct llvmpipe_context *llvmpipe ) llvmpipe->constants[PIPE_SHADER_FRAGMENT][0]); if (llvmpipe->dirty & LP_NEW_SAMPLER_VIEW) - lp_setup_set_fragment_sampler_views(llvmpipe->setup, + lp_setup_set_fragment_sampler_views(llvmpipe->setup, llvmpipe->num_fragment_sampler_views, llvmpipe->fragment_sampler_views); diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c index 65115052cd..5953d690a4 100644 --- a/src/gallium/drivers/llvmpipe/lp_state_fs.c +++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c @@ -31,9 +31,6 @@ * Code generate the whole fragment pipeline. * * The fragment pipeline consists of the following stages: - * - triangle edge in/out testing - * - scissor test - * - stipple (TBI) * - early depth test * - fragment shader * - alpha test @@ -97,6 +94,7 @@ #include "lp_state.h" #include "lp_tex_sample.h" #include "lp_flush.h" +#include "lp_state_fs.h" #include <llvm-c/Analysis.h> @@ -170,177 +168,63 @@ generate_depth_stencil(LLVMBuilderRef builder, /** - * Generate the code to do inside/outside triangle testing for the + * Expand the relevent bits of mask_input to a 4-dword mask for the * four pixels in a 2x2 quad. This will set the four elements of the * quad mask vector to 0 or ~0. - * \param i which quad of the quad group to test, in [0,3] + * + * \param quad which quad of the quad group to test, in [0,3] + * \param mask_input bitwise mask for the whole 4x4 stamp */ -static void -generate_tri_edge_mask(LLVMBuilderRef builder, - unsigned i, - LLVMValueRef *mask, /* ivec4, out */ - LLVMValueRef c0, /* int32 */ - LLVMValueRef c1, /* int32 */ - LLVMValueRef c2, /* int32 */ - LLVMValueRef step0_ptr, /* ivec4 */ - LLVMValueRef step1_ptr, /* ivec4 */ - LLVMValueRef step2_ptr) /* ivec4 */ +static LLVMValueRef +generate_quad_mask(LLVMBuilderRef builder, + struct lp_type fs_type, + unsigned quad, + LLVMValueRef mask_input) /* int32 */ { -#define OPTIMIZE_IN_OUT_TEST 0 -#if OPTIMIZE_IN_OUT_TEST - struct lp_build_if_state ifctx; - LLVMValueRef not_draw_all; -#endif - struct lp_build_flow_context *flow; - struct lp_type i32_type; - LLVMTypeRef i32vec4_type; - LLVMValueRef c0_vec, c1_vec, c2_vec; - LLVMValueRef in_out_mask; - - assert(i < 4); - - /* int32 vector type */ - memset(&i32_type, 0, sizeof i32_type); - i32_type.floating = FALSE; /* values are integers */ - i32_type.sign = TRUE; /* values are signed */ - i32_type.norm = FALSE; /* values are not normalized */ - i32_type.width = 32; /* 32-bit int values */ - i32_type.length = 4; /* 4 elements per vector */ - - i32vec4_type = lp_build_int32_vec4_type(); + struct lp_type mask_type; + LLVMTypeRef i32t = LLVMInt32Type(); + LLVMValueRef bits[4]; + LLVMValueRef mask; /* - * Use a conditional here to do detailed pixel in/out testing. - * We only have to do this if c0 != INT_MIN. + * XXX: We'll need a different path for 16 x u8 */ - flow = lp_build_flow_create(builder); - lp_build_flow_scope_begin(flow); - - { -#if OPTIMIZE_IN_OUT_TEST - /* not_draw_all = (c0 != INT_MIN) */ - not_draw_all = LLVMBuildICmp(builder, - LLVMIntNE, - c0, - LLVMConstInt(LLVMInt32Type(), INT_MIN, 0), - ""); - - in_out_mask = lp_build_const_int_vec(i32_type, ~0); - - - lp_build_flow_scope_declare(flow, &in_out_mask); - - /* if (not_draw_all) {... */ - lp_build_if(&ifctx, flow, builder, not_draw_all); -#endif - { - LLVMValueRef step0_vec, step1_vec, step2_vec; - LLVMValueRef m0_vec, m1_vec, m2_vec; - LLVMValueRef index, m; - - /* c0_vec = {c0, c0, c0, c0} - * Note that we emit this code four times but LLVM optimizes away - * three instances of it. - */ - c0_vec = lp_build_broadcast(builder, i32vec4_type, c0); - c1_vec = lp_build_broadcast(builder, i32vec4_type, c1); - c2_vec = lp_build_broadcast(builder, i32vec4_type, c2); - lp_build_name(c0_vec, "edgeconst0vec"); - lp_build_name(c1_vec, "edgeconst1vec"); - lp_build_name(c2_vec, "edgeconst2vec"); - - /* load step0vec, step1, step2 vec from memory */ - index = LLVMConstInt(LLVMInt32Type(), i, 0); - step0_vec = LLVMBuildLoad(builder, LLVMBuildGEP(builder, step0_ptr, &index, 1, ""), ""); - step1_vec = LLVMBuildLoad(builder, LLVMBuildGEP(builder, step1_ptr, &index, 1, ""), ""); - step2_vec = LLVMBuildLoad(builder, LLVMBuildGEP(builder, step2_ptr, &index, 1, ""), ""); - lp_build_name(step0_vec, "step0vec"); - lp_build_name(step1_vec, "step1vec"); - lp_build_name(step2_vec, "step2vec"); - - /* m0_vec = step0_ptr[i] > c0_vec */ - m0_vec = lp_build_compare(builder, i32_type, PIPE_FUNC_GREATER, step0_vec, c0_vec); - m1_vec = lp_build_compare(builder, i32_type, PIPE_FUNC_GREATER, step1_vec, c1_vec); - m2_vec = lp_build_compare(builder, i32_type, PIPE_FUNC_GREATER, step2_vec, c2_vec); - - /* in_out_mask = m0_vec & m1_vec & m2_vec */ - m = LLVMBuildAnd(builder, m0_vec, m1_vec, ""); - in_out_mask = LLVMBuildAnd(builder, m, m2_vec, ""); - lp_build_name(in_out_mask, "inoutmaskvec"); - } -#if OPTIMIZE_IN_OUT_TEST - lp_build_endif(&ifctx); -#endif - - } - lp_build_flow_scope_end(flow); - lp_build_flow_destroy(flow); + assert(fs_type.width == 32); + assert(fs_type.length == 4); + mask_type = lp_int_type(fs_type); - /* This is the initial alive/dead pixel mask for a quad of four pixels. - * It's an int[4] vector with each word set to 0 or ~0. - * Words will get cleared when pixels faile the Z test, etc. + /* + * mask_input >>= (quad * 4) */ - *mask = in_out_mask; -} - - -static LLVMValueRef -generate_scissor_test(LLVMBuilderRef builder, - LLVMValueRef context_ptr, - const struct lp_build_interp_soa_context *interp, - struct lp_type type) -{ - LLVMTypeRef vec_type = lp_build_vec_type(type); - LLVMValueRef xpos = interp->pos[0], ypos = interp->pos[1]; - LLVMValueRef xmin, ymin, xmax, ymax; - LLVMValueRef m0, m1, m2, m3, m; - - /* xpos, ypos contain the window coords for the four pixels in the quad */ - assert(xpos); - assert(ypos); - - /* get the current scissor bounds, convert to vectors */ - xmin = lp_jit_context_scissor_xmin_value(builder, context_ptr); - xmin = lp_build_broadcast(builder, vec_type, xmin); - - ymin = lp_jit_context_scissor_ymin_value(builder, context_ptr); - ymin = lp_build_broadcast(builder, vec_type, ymin); - xmax = lp_jit_context_scissor_xmax_value(builder, context_ptr); - xmax = lp_build_broadcast(builder, vec_type, xmax); + mask_input = LLVMBuildLShr(builder, + mask_input, + LLVMConstInt(i32t, quad * 4, 0), + ""); - ymax = lp_jit_context_scissor_ymax_value(builder, context_ptr); - ymax = lp_build_broadcast(builder, vec_type, ymax); + /* + * mask = { mask_input & (1 << i), for i in [0,3] } + */ - /* compare the fragment's position coordinates against the scissor bounds */ - m0 = lp_build_compare(builder, type, PIPE_FUNC_GEQUAL, xpos, xmin); - m1 = lp_build_compare(builder, type, PIPE_FUNC_GEQUAL, ypos, ymin); - m2 = lp_build_compare(builder, type, PIPE_FUNC_LESS, xpos, xmax); - m3 = lp_build_compare(builder, type, PIPE_FUNC_LESS, ypos, ymax); + mask = lp_build_broadcast(builder, lp_build_vec_type(mask_type), mask_input); - /* AND all the masks together */ - m = LLVMBuildAnd(builder, m0, m1, ""); - m = LLVMBuildAnd(builder, m, m2, ""); - m = LLVMBuildAnd(builder, m, m3, ""); + bits[0] = LLVMConstInt(i32t, 1 << 0, 0); + bits[1] = LLVMConstInt(i32t, 1 << 1, 0); + bits[2] = LLVMConstInt(i32t, 1 << 2, 0); + bits[3] = LLVMConstInt(i32t, 1 << 3, 0); - lp_build_name(m, "scissormask"); + mask = LLVMBuildAnd(builder, mask, LLVMConstVector(bits, 4), ""); - return m; -} + /* + * mask = mask != 0 ? ~0 : 0 + */ + mask = lp_build_compare(builder, + mask_type, PIPE_FUNC_NOTEQUAL, + mask, + lp_build_const_int_vec(mask_type, 0)); -static LLVMValueRef -build_int32_vec_const(int value) -{ - struct lp_type i32_type; - - memset(&i32_type, 0, sizeof i32_type); - i32_type.floating = FALSE; /* values are integers */ - i32_type.sign = TRUE; /* values are signed */ - i32_type.norm = FALSE; /* values are not normalized */ - i32_type.width = 32; /* 32-bit int values */ - i32_type.length = 4; /* 4 elements per vector */ - return lp_build_const_int_vec(i32_type, value); + return mask; } @@ -348,7 +232,7 @@ build_int32_vec_const(int value) /** * Generate the fragment shader, depth/stencil test, and alpha tests. * \param i which quad in the tile, in range [0,3] - * \param do_tri_test if 1, do triangle edge in/out testing + * \param partial_mask if 1, do mask_input testing */ static void generate_fs(struct llvmpipe_context *lp, @@ -364,13 +248,8 @@ generate_fs(struct llvmpipe_context *lp, LLVMValueRef (*color)[4], LLVMValueRef depth_ptr, LLVMValueRef facing, - unsigned do_tri_test, - LLVMValueRef c0, - LLVMValueRef c1, - LLVMValueRef c2, - LLVMValueRef step0_ptr, - LLVMValueRef step1_ptr, - LLVMValueRef step2_ptr, + unsigned partial_mask, + LLVMValueRef mask_input, LLVMValueRef counter) { const struct tgsi_token *tokens = shader->base.tokens; @@ -411,23 +290,17 @@ generate_fs(struct llvmpipe_context *lp, lp_build_flow_scope_declare(flow, &z); /* do triangle edge testing */ - if (do_tri_test) { - generate_tri_edge_mask(builder, i, pmask, - c0, c1, c2, step0_ptr, step1_ptr, step2_ptr); + if (partial_mask) { + *pmask = generate_quad_mask(builder, type, + i, mask_input); } else { - *pmask = build_int32_vec_const(~0); + *pmask = lp_build_const_int_vec(type, ~0); } /* 'mask' will control execution based on quad's pixel alive/killed state */ lp_build_mask_begin(&mask, flow, type, *pmask); - if (key->scissor) { - LLVMValueRef smask = - generate_scissor_test(builder, context_ptr, interp, type); - lp_build_mask_update(&mask, smask); - } - early_depth_stencil_test = (key->depth.enabled || key->stencil[0].enabled) && !key->alpha.enabled && @@ -579,7 +452,7 @@ static void generate_fragment(struct llvmpipe_context *lp, struct lp_fragment_shader *shader, struct lp_fragment_shader_variant *variant, - unsigned do_tri_test) + unsigned partial_mask) { struct llvmpipe_screen *screen = llvmpipe_screen(lp->pipe.screen); const struct lp_fragment_shader_variant_key *key = &variant->key; @@ -589,9 +462,8 @@ generate_fragment(struct llvmpipe_context *lp, LLVMTypeRef fs_elem_type; LLVMTypeRef fs_int_vec_type; LLVMTypeRef blend_vec_type; - LLVMTypeRef arg_types[16]; + LLVMTypeRef arg_types[11]; LLVMTypeRef func_type; - LLVMTypeRef int32_vec4_type = lp_build_int32_vec4_type(); LLVMValueRef context_ptr; LLVMValueRef x; LLVMValueRef y; @@ -600,7 +472,8 @@ generate_fragment(struct llvmpipe_context *lp, LLVMValueRef dady_ptr; LLVMValueRef color_ptr_ptr; LLVMValueRef depth_ptr; - LLVMValueRef c0, c1, c2, step0_ptr, step1_ptr, step2_ptr, counter = NULL; + LLVMValueRef mask_input; + LLVMValueRef counter = NULL; LLVMBasicBlockRef block; LLVMBuilderRef builder; struct lp_build_sampler_soa *sampler; @@ -645,7 +518,7 @@ generate_fragment(struct llvmpipe_context *lp, blend_vec_type = lp_build_vec_type(blend_type); util_snprintf(func_name, sizeof(func_name), "fs%u_variant%u_%s", - shader->no, variant->no, do_tri_test ? "edge" : "whole"); + shader->no, variant->no, partial_mask ? "partial" : "whole"); arg_types[0] = screen->context_ptr_type; /* context */ arg_types[1] = LLVMInt32Type(); /* x */ @@ -656,23 +529,15 @@ generate_fragment(struct llvmpipe_context *lp, arg_types[6] = LLVMPointerType(fs_elem_type, 0); /* dady */ arg_types[7] = LLVMPointerType(LLVMPointerType(blend_vec_type, 0), 0); /* color */ arg_types[8] = LLVMPointerType(fs_int_vec_type, 0); /* depth */ - arg_types[9] = LLVMInt32Type(); /* c0 */ - arg_types[10] = LLVMInt32Type(); /* c1 */ - arg_types[11] = LLVMInt32Type(); /* c2 */ - /* Note: the step arrays are built as int32[16] but we interpret - * them here as int32_vec4[4]. - */ - arg_types[12] = LLVMPointerType(int32_vec4_type, 0);/* step0 */ - arg_types[13] = LLVMPointerType(int32_vec4_type, 0);/* step1 */ - arg_types[14] = LLVMPointerType(int32_vec4_type, 0);/* step2 */ - arg_types[15] = LLVMPointerType(LLVMInt32Type(), 0);/* counter */ + arg_types[9] = LLVMInt32Type(); /* mask_input */ + arg_types[10] = LLVMPointerType(LLVMInt32Type(), 0);/* counter */ func_type = LLVMFunctionType(LLVMVoidType(), arg_types, Elements(arg_types), 0); function = LLVMAddFunction(screen->module, func_name, func_type); LLVMSetFunctionCallConv(function, LLVMCCallConv); - variant->function[do_tri_test] = function; + variant->function[partial_mask] = function; /* XXX: need to propagate noalias down into color param now we are @@ -691,12 +556,7 @@ generate_fragment(struct llvmpipe_context *lp, dady_ptr = LLVMGetParam(function, 6); color_ptr_ptr = LLVMGetParam(function, 7); depth_ptr = LLVMGetParam(function, 8); - c0 = LLVMGetParam(function, 9); - c1 = LLVMGetParam(function, 10); - c2 = LLVMGetParam(function, 11); - step0_ptr = LLVMGetParam(function, 12); - step1_ptr = LLVMGetParam(function, 13); - step2_ptr = LLVMGetParam(function, 14); + mask_input = LLVMGetParam(function, 9); lp_build_name(context_ptr, "context"); lp_build_name(x, "x"); @@ -706,15 +566,10 @@ generate_fragment(struct llvmpipe_context *lp, lp_build_name(dady_ptr, "dady"); lp_build_name(color_ptr_ptr, "color_ptr_ptr"); lp_build_name(depth_ptr, "depth"); - lp_build_name(c0, "c0"); - lp_build_name(c1, "c1"); - lp_build_name(c2, "c2"); - lp_build_name(step0_ptr, "step0"); - lp_build_name(step1_ptr, "step1"); - lp_build_name(step2_ptr, "step2"); + lp_build_name(mask_input, "mask_input"); if (key->occlusion_count) { - counter = LLVMGetParam(function, 15); + counter = LLVMGetParam(function, 10); lp_build_name(counter, "counter"); } @@ -763,9 +618,9 @@ generate_fragment(struct llvmpipe_context *lp, out_color, depth_ptr_i, facing, - do_tri_test, - c0, c1, c2, - step0_ptr, step1_ptr, step2_ptr, counter); + partial_mask, + mask_input, + counter); for(cbuf = 0; cbuf < key->nr_cbufs; cbuf++) for(chan = 0; chan < NUM_CHANNELS; ++chan) @@ -792,9 +647,13 @@ generate_fragment(struct llvmpipe_context *lp, lp_build_name(blend_in_color[chan], "color%d.%c", cbuf, "rgba"[chan]); } - lp_build_conv_mask(builder, fs_type, blend_type, - fs_mask, num_fs, - &blend_mask, 1); + if (partial_mask || !variant->opaque) { + lp_build_conv_mask(builder, fs_type, blend_type, + fs_mask, num_fs, + &blend_mask, 1); + } else { + blend_mask = lp_build_const_int_vec(blend_type, ~0); + } color_ptr = LLVMBuildLoad(builder, LLVMBuildGEP(builder, color_ptr_ptr, &index, 1, ""), @@ -832,8 +691,7 @@ generate_fragment(struct llvmpipe_context *lp, #endif /* Apply optimizations to LLVM IR */ - if (1) - LLVMRunFunctionPassManager(screen->pass, function); + LLVMRunFunctionPassManager(screen->pass, function); if (gallivm_debug & GALLIVM_DEBUG_IR) { /* Print the LLVM IR to stderr */ @@ -847,7 +705,7 @@ generate_fragment(struct llvmpipe_context *lp, { void *f = LLVMGetPointerToGlobal(screen->engine, function); - variant->jit_function[do_tri_test] = (lp_jit_frag_func)pointer_to_func(f); + variant->jit_function[partial_mask] = (lp_jit_frag_func)pointer_to_func(f); if (gallivm_debug & GALLIVM_DEBUG_ASM) { lp_disassemble(f); @@ -963,7 +821,6 @@ generate_variant(struct llvmpipe_context *lp, !key->stencil[0].enabled && !key->alpha.enabled && !key->depth.enabled && - !key->scissor && !shader->info.uses_kill ? TRUE : FALSE; @@ -1182,7 +1039,6 @@ make_variant_key(struct llvmpipe_context *lp, /* alpha.ref_value is passed in jit_context */ key->flatshade = lp->rasterizer->flatshade; - key->scissor = lp->rasterizer->scissor; if (lp->active_query_count) { key->occlusion_count = TRUE; } diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.h b/src/gallium/drivers/llvmpipe/lp_state_fs.h index 593cd4de6b..37900fc544 100644 --- a/src/gallium/drivers/llvmpipe/lp_state_fs.h +++ b/src/gallium/drivers/llvmpipe/lp_state_fs.h @@ -54,7 +54,6 @@ struct lp_fragment_shader_variant_key enum pipe_format zsbuf_format; unsigned nr_cbufs:8; unsigned flatshade:1; - unsigned scissor:1; unsigned occlusion_count:1; struct { diff --git a/src/gallium/drivers/llvmpipe/lp_state_sampler.c b/src/gallium/drivers/llvmpipe/lp_state_sampler.c index e94065fb6a..715ce2f02e 100644 --- a/src/gallium/drivers/llvmpipe/lp_state_sampler.c +++ b/src/gallium/drivers/llvmpipe/lp_state_sampler.c @@ -35,10 +35,9 @@ #include "draw/draw_context.h" #include "lp_context.h" -#include "lp_context.h" +#include "lp_screen.h" #include "lp_state.h" -#include "draw/draw_context.h" - +#include "state_tracker/sw_winsys.h" static void * @@ -100,6 +99,10 @@ llvmpipe_bind_vertex_sampler_states(struct pipe_context *pipe, llvmpipe->num_vertex_samplers = num_samplers; + draw_set_samplers(llvmpipe->draw, + llvmpipe->vertex_samplers, + llvmpipe->num_vertex_samplers); + llvmpipe->dirty |= LP_NEW_SAMPLER; } @@ -166,6 +169,10 @@ llvmpipe_set_vertex_sampler_views(struct pipe_context *pipe, llvmpipe->num_vertex_sampler_views = num; + draw_set_sampler_views(llvmpipe->draw, + llvmpipe->vertex_sampler_views, + llvmpipe->num_vertex_sampler_views); + llvmpipe->dirty |= LP_NEW_SAMPLER_VIEW; } @@ -214,6 +221,77 @@ llvmpipe_delete_sampler_state(struct pipe_context *pipe, } +/** + * Called during state validation when LP_NEW_SAMPLER_VIEW is set. + */ +void +llvmpipe_prepare_vertex_sampling(struct llvmpipe_context *lp, + unsigned num, + struct pipe_sampler_view **views) +{ + unsigned i; + uint32_t row_stride[DRAW_MAX_TEXTURE_LEVELS]; + uint32_t img_stride[DRAW_MAX_TEXTURE_LEVELS]; + const void *data[DRAW_MAX_TEXTURE_LEVELS]; + + assert(num <= PIPE_MAX_VERTEX_SAMPLERS); + if (!num) + return; + + for (i = 0; i < PIPE_MAX_VERTEX_SAMPLERS; i++) { + struct pipe_sampler_view *view = i < num ? views[i] : NULL; + + if (view) { + struct pipe_resource *tex = view->texture; + struct llvmpipe_resource *lp_tex = llvmpipe_resource(tex); + + /* We're referencing the texture's internal data, so save a + * reference to it. + */ + pipe_resource_reference(&lp->mapped_vs_tex[i], tex); + + if (!lp_tex->dt) { + /* regular texture - setup array of mipmap level pointers */ + int j; + for (j = 0; j <= tex->last_level; j++) { + data[j] = + llvmpipe_get_texture_image_all(lp_tex, j, LP_TEX_USAGE_READ, + LP_TEX_LAYOUT_LINEAR); + row_stride[j] = lp_tex->row_stride[j]; + img_stride[j] = lp_tex->img_stride[j]; + } + } + else { + /* display target texture/surface */ + /* + * XXX: Where should this be unmapped? + */ + struct llvmpipe_screen *screen = llvmpipe_screen(tex->screen); + struct sw_winsys *winsys = screen->winsys; + data[0] = winsys->displaytarget_map(winsys, lp_tex->dt, + PIPE_TRANSFER_READ); + row_stride[0] = lp_tex->row_stride[0]; + img_stride[0] = lp_tex->img_stride[0]; + assert(data[0]); + } + draw_set_mapped_texture(lp->draw, + i, + tex->width0, tex->height0, tex->depth0, + tex->last_level, + row_stride, img_stride, data); + } + } +} + +void +llvmpipe_cleanup_vertex_sampling(struct llvmpipe_context *ctx) +{ + unsigned i; + for (i = 0; i < Elements(ctx->mapped_vs_tex); i++) { + pipe_resource_reference(&ctx->mapped_vs_tex[i], NULL); + } +} + void llvmpipe_init_sampler_funcs(struct llvmpipe_context *llvmpipe) { diff --git a/src/gallium/drivers/llvmpipe/lp_state_so.c b/src/gallium/drivers/llvmpipe/lp_state_so.c index 4c64a5b142..30b17c9881 100644 --- a/src/gallium/drivers/llvmpipe/lp_state_so.c +++ b/src/gallium/drivers/llvmpipe/lp_state_so.c @@ -29,7 +29,6 @@ #include "lp_state.h" #include "lp_texture.h" -#include "util/u_format.h" #include "util/u_memory.h" #include "draw/draw_context.h" diff --git a/src/gallium/drivers/llvmpipe/lp_surface.c b/src/gallium/drivers/llvmpipe/lp_surface.c index 76b3fce1fa..f761e82850 100644 --- a/src/gallium/drivers/llvmpipe/lp_surface.c +++ b/src/gallium/drivers/llvmpipe/lp_surface.c @@ -67,14 +67,14 @@ lp_resource_copy(struct pipe_context *pipe, dst, subdst.face, subdst.level, 0, /* flush_flags */ FALSE, /* read_only */ - FALSE, /* cpu_access */ + TRUE, /* cpu_access */ FALSE); /* do_not_block */ llvmpipe_flush_resource(pipe, src, subsrc.face, subsrc.level, 0, /* flush_flags */ TRUE, /* read_only */ - FALSE, /* cpu_access */ + TRUE, /* cpu_access */ FALSE); /* do_not_block */ /* @@ -106,19 +106,27 @@ lp_resource_copy(struct pipe_context *pipe, unsigned x, y; enum lp_texture_usage usage; - /* XXX for the tiles which are completely contained by the - * dest rectangle, we could set the usage mode to WRITE_ALL. - * Just test for the case of replacing the whole dest region for now. - */ - if (width == dst_tex->base.width0 && height == dst_tex->base.height0) - usage = LP_TEX_USAGE_WRITE_ALL; - else - usage = LP_TEX_USAGE_READ_WRITE; - adjust_to_tile_bounds(dstx, dsty, width, height, &tx, &ty, &tw, &th); for (y = 0; y < th; y += TILE_SIZE) { + boolean contained_y = ty + y >= dsty && + ty + y + TILE_SIZE <= dsty + height ? + TRUE : FALSE; + for (x = 0; x < tw; x += TILE_SIZE) { + boolean contained_x = tx + x >= dstx && + tx + x + TILE_SIZE <= dstx + width ? + TRUE : FALSE; + + /* + * Set the usage mode to WRITE_ALL for the tiles which are + * completely contained by the dest rectangle. + */ + if (contained_y && contained_x) + usage = LP_TEX_USAGE_WRITE_ALL; + else + usage = LP_TEX_USAGE_READ_WRITE; + (void) llvmpipe_get_texture_tile_linear(dst_tex, subdst.face, subdst.level, usage, @@ -138,13 +146,15 @@ lp_resource_copy(struct pipe_context *pipe, subdst.level, LP_TEX_LAYOUT_LINEAR); - util_copy_rect(dst_linear_ptr, format, - llvmpipe_resource_stride(&dst_tex->base, subdst.level), - dstx, dsty, - width, height, - src_linear_ptr, - llvmpipe_resource_stride(&src_tex->base, subsrc.level), - srcx, srcy); + if (dst_linear_ptr && src_linear_ptr) { + util_copy_rect(dst_linear_ptr, format, + llvmpipe_resource_stride(&dst_tex->base, subdst.level), + dstx, dsty, + width, height, + src_linear_ptr, + llvmpipe_resource_stride(&src_tex->base, subsrc.level), + srcx, srcy); + } } } diff --git a/src/gallium/drivers/llvmpipe/lp_test_conv.c b/src/gallium/drivers/llvmpipe/lp_test_conv.c index 9b02f436c5..cf41b40581 100644 --- a/src/gallium/drivers/llvmpipe/lp_test_conv.c +++ b/src/gallium/drivers/llvmpipe/lp_test_conv.c @@ -167,19 +167,26 @@ test_one(unsigned verbose, unsigned i, j; void *code; + if (src_type.width * src_type.length != dst_type.width * dst_type.length && + src_type.length != dst_type.length) { + return TRUE; + } + if(verbose >= 1) dump_conv_types(stdout, src_type, dst_type); - if(src_type.length > dst_type.length) { + if (src_type.length > dst_type.length) { num_srcs = 1; num_dsts = src_type.length/dst_type.length; } - else { + else if (src_type.length < dst_type.length) { num_dsts = 1; num_srcs = dst_type.length/src_type.length; } - - assert(src_type.width * src_type.length == dst_type.width * dst_type.length); + else { + num_dsts = 1; + num_srcs = 1; + } /* We must not loose or gain channels. Only precision */ assert(src_type.length * num_srcs == dst_type.length * num_dsts); @@ -381,6 +388,11 @@ const struct lp_type conv_types[] = { { FALSE, FALSE, TRUE, FALSE, 8, 16 }, { FALSE, FALSE, FALSE, TRUE, 8, 16 }, { FALSE, FALSE, FALSE, FALSE, 8, 16 }, + + { FALSE, FALSE, TRUE, TRUE, 8, 4 }, + { FALSE, FALSE, TRUE, FALSE, 8, 4 }, + { FALSE, FALSE, FALSE, TRUE, 8, 4 }, + { FALSE, FALSE, FALSE, FALSE, 8, 4 }, }; diff --git a/src/gallium/drivers/llvmpipe/lp_test_format.c b/src/gallium/drivers/llvmpipe/lp_test_format.c index 8b6dc1c7f5..2855d7cea4 100644 --- a/src/gallium/drivers/llvmpipe/lp_test_format.c +++ b/src/gallium/drivers/llvmpipe/lp_test_format.c @@ -31,6 +31,7 @@ #include <float.h> #include "gallivm/lp_bld.h" +#include "gallivm/lp_bld_debug.h" #include "gallivm/lp_bld_init.h" #include <llvm-c/Analysis.h> #include <llvm-c/Target.h> @@ -38,6 +39,7 @@ #include "util/u_memory.h" #include "util/u_pointer.h" +#include "util/u_string.h" #include "util/u_format.h" #include "util/u_format_tests.h" #include "util/u_format_s3tc.h" @@ -71,17 +73,20 @@ write_tsv_row(FILE *fp, typedef void -(*fetch_ptr_t)(float *, const void *packed, +(*fetch_ptr_t)(void *unpacked, const void *packed, unsigned i, unsigned j); static LLVMValueRef -add_fetch_rgba_test(LLVMModuleRef lp_build_module, - const struct util_format_description *desc) +add_fetch_rgba_test(unsigned verbose, + const struct util_format_description *desc, + struct lp_type type) { + char name[256]; LLVMTypeRef args[4]; LLVMValueRef func; LLVMValueRef packed_ptr; + LLVMValueRef offset = LLVMConstNull(LLVMInt32Type()); LLVMValueRef rgba_ptr; LLVMValueRef i; LLVMValueRef j; @@ -89,11 +94,15 @@ add_fetch_rgba_test(LLVMModuleRef lp_build_module, LLVMBuilderRef builder; LLVMValueRef rgba; - args[0] = LLVMPointerType(LLVMVectorType(LLVMFloatType(), 4), 0); + util_snprintf(name, sizeof name, "fetch_%s_%s", desc->short_name, + type.floating ? "float" : "unorm8"); + + args[0] = LLVMPointerType(lp_build_vec_type(type), 0); args[1] = LLVMPointerType(LLVMInt8Type(), 0); args[3] = args[2] = LLVMInt32Type(); - func = LLVMAddFunction(lp_build_module, "fetch", LLVMFunctionType(LLVMVoidType(), args, Elements(args), 0)); + func = LLVMAddFunction(lp_build_module, name, + LLVMFunctionType(LLVMVoidType(), args, Elements(args), 0)); LLVMSetFunctionCallConv(func, LLVMCCallConv); rgba_ptr = LLVMGetParam(func, 0); packed_ptr = LLVMGetParam(func, 1); @@ -104,91 +113,104 @@ add_fetch_rgba_test(LLVMModuleRef lp_build_module, builder = LLVMCreateBuilder(); LLVMPositionBuilderAtEnd(builder, block); - rgba = lp_build_fetch_rgba_aos(builder, desc, packed_ptr, i, j); + rgba = lp_build_fetch_rgba_aos(builder, desc, type, + packed_ptr, offset, i, j); LLVMBuildStore(builder, rgba, rgba_ptr); LLVMBuildRetVoid(builder); LLVMDisposeBuilder(builder); + + if (LLVMVerifyFunction(func, LLVMPrintMessageAction)) { + LLVMDumpValue(func); + abort(); + } + + LLVMRunFunctionPassManager(lp_build_pass, func); + + if (verbose >= 1) { + LLVMDumpValue(func); + } + return func; } PIPE_ALIGN_STACK static boolean -test_format(unsigned verbose, FILE *fp, - const struct util_format_description *desc, - const struct util_format_test_case *test) +test_format_float(unsigned verbose, FILE *fp, + const struct util_format_description *desc) { LLVMValueRef fetch = NULL; - LLVMPassManagerRef pass = NULL; fetch_ptr_t fetch_ptr; PIPE_ALIGN_VAR(16) float unpacked[4]; - boolean success; - unsigned i, j, k; + boolean first = TRUE; + boolean success = TRUE; + unsigned i, j, k, l; + void *f; - fetch = add_fetch_rgba_test(lp_build_module, desc); + fetch = add_fetch_rgba_test(verbose, desc, lp_float32_vec4_type()); - if (LLVMVerifyFunction(fetch, LLVMPrintMessageAction)) { - LLVMDumpValue(fetch); - abort(); + f = LLVMGetPointerToGlobal(lp_build_engine, fetch); + fetch_ptr = (fetch_ptr_t) pointer_to_func(f); + + if (verbose >= 2) { + lp_disassemble(f); } -#if 0 - pass = LLVMCreatePassManager(); - LLVMAddTargetData(LLVMGetExecutionEngineTargetData(lp_build_engine), pass); - /* These are the passes currently listed in llvm-c/Transforms/Scalar.h, - * but there are more on SVN. */ - LLVMAddConstantPropagationPass(pass); - LLVMAddInstructionCombiningPass(pass); - LLVMAddPromoteMemoryToRegisterPass(pass); - LLVMAddGVNPass(pass); - LLVMAddCFGSimplificationPass(pass); - LLVMRunPassManager(pass, lp_build_module); -#else - (void)pass; -#endif - - fetch_ptr = (fetch_ptr_t)pointer_to_func(LLVMGetPointerToGlobal(lp_build_engine, fetch)); - - for (i = 0; i < desc->block.height; ++i) { - for (j = 0; j < desc->block.width; ++j) { - - memset(unpacked, 0, sizeof unpacked); - - fetch_ptr(unpacked, test->packed, j, i); - - success = TRUE; - for(k = 0; k < 4; ++k) - if (fabs((float)test->unpacked[i][j][k] - unpacked[k]) > FLT_EPSILON) - success = FALSE; - - if (!success) { - printf("FAILED\n"); - printf(" Packed: %02x %02x %02x %02x\n", - test->packed[0], test->packed[1], test->packed[2], test->packed[3]); - printf(" Unpacked (%u,%u): %f %f %f %f obtained\n", - j, i, - unpacked[0], unpacked[1], unpacked[2], unpacked[3]); - printf(" %f %f %f %f expected\n", - test->unpacked[i][j][0], - test->unpacked[i][j][1], - test->unpacked[i][j][2], - test->unpacked[i][j][3]); + for (l = 0; l < util_format_nr_test_cases; ++l) { + const struct util_format_test_case *test = &util_format_test_cases[l]; + + if (test->format == desc->format) { + + if (first) { + printf("Testing %s (float) ...\n", + desc->name); + first = FALSE; + } + + for (i = 0; i < desc->block.height; ++i) { + for (j = 0; j < desc->block.width; ++j) { + boolean match; + + memset(unpacked, 0, sizeof unpacked); + + fetch_ptr(unpacked, test->packed, j, i); + + match = TRUE; + for(k = 0; k < 4; ++k) + if (fabs((float)test->unpacked[i][j][k] - unpacked[k]) > FLT_EPSILON) + match = FALSE; + + if (!match) { + printf("FAILED\n"); + printf(" Packed: %02x %02x %02x %02x\n", + test->packed[0], test->packed[1], test->packed[2], test->packed[3]); + printf(" Unpacked (%u,%u): %f %f %f %f obtained\n", + j, i, + unpacked[0], unpacked[1], unpacked[2], unpacked[3]); + printf(" %f %f %f %f expected\n", + test->unpacked[i][j][0], + test->unpacked[i][j][1], + test->unpacked[i][j][2], + test->unpacked[i][j][3]); + success = FALSE; + } + } } } } - if (!success) - LLVMDumpValue(fetch); + if (!success) { + if (verbose < 1) { + LLVMDumpValue(fetch); + } + } LLVMFreeMachineCodeForFunction(lp_build_engine, fetch); LLVMDeleteFunction(fetch); - if(pass) - LLVMDisposePassManager(pass); - if(fp) write_tsv_row(fp, desc, success); @@ -196,32 +218,104 @@ test_format(unsigned verbose, FILE *fp, } - +PIPE_ALIGN_STACK static boolean -test_one(unsigned verbose, FILE *fp, - const struct util_format_description *format_desc) +test_format_unorm8(unsigned verbose, FILE *fp, + const struct util_format_description *desc) { - unsigned i; + LLVMValueRef fetch = NULL; + fetch_ptr_t fetch_ptr; + uint8_t unpacked[4]; boolean first = TRUE; boolean success = TRUE; + unsigned i, j, k, l; + void *f; - for (i = 0; i < util_format_nr_test_cases; ++i) { - const struct util_format_test_case *test = &util_format_test_cases[i]; + fetch = add_fetch_rgba_test(verbose, desc, lp_unorm8_vec4_type()); - if (test->format == format_desc->format) { + f = LLVMGetPointerToGlobal(lp_build_engine, fetch); + fetch_ptr = (fetch_ptr_t) pointer_to_func(f); + + if (verbose >= 2) { + lp_disassemble(f); + } + + for (l = 0; l < util_format_nr_test_cases; ++l) { + const struct util_format_test_case *test = &util_format_test_cases[l]; + + if (test->format == desc->format) { if (first) { - printf("Testing %s ...\n", - format_desc->name); + printf("Testing %s (unorm8) ...\n", + desc->name); first = FALSE; } - if (!test_format(verbose, fp, format_desc, test)) { - success = FALSE; + for (i = 0; i < desc->block.height; ++i) { + for (j = 0; j < desc->block.width; ++j) { + boolean match; + + memset(unpacked, 0, sizeof unpacked); + + fetch_ptr(unpacked, test->packed, j, i); + + match = TRUE; + for(k = 0; k < 4; ++k) { + int error = float_to_ubyte(test->unpacked[i][j][k]) - unpacked[k]; + if (error < 0) + error = -error; + if (error > 1) + match = FALSE; + } + + if (!match) { + printf("FAILED\n"); + printf(" Packed: %02x %02x %02x %02x\n", + test->packed[0], test->packed[1], test->packed[2], test->packed[3]); + printf(" Unpacked (%u,%u): %02x %02x %02x %02x obtained\n", + j, i, + unpacked[0], unpacked[1], unpacked[2], unpacked[3]); + printf(" %02x %02x %02x %02x expected\n", + float_to_ubyte(test->unpacked[i][j][0]), + float_to_ubyte(test->unpacked[i][j][1]), + float_to_ubyte(test->unpacked[i][j][2]), + float_to_ubyte(test->unpacked[i][j][3])); + success = FALSE; + } + } } } } + if (!success) + LLVMDumpValue(fetch); + + LLVMFreeMachineCodeForFunction(lp_build_engine, fetch); + LLVMDeleteFunction(fetch); + + if(fp) + write_tsv_row(fp, desc, success); + + return success; +} + + + + +static boolean +test_one(unsigned verbose, FILE *fp, + const struct util_format_description *format_desc) +{ + boolean success = TRUE; + + if (!test_format_float(verbose, fp, format_desc)) { + success = FALSE; + } + + if (!test_format_unorm8(verbose, fp, format_desc)) { + success = FALSE; + } + return success; } diff --git a/src/gallium/drivers/llvmpipe/lp_test_round.c b/src/gallium/drivers/llvmpipe/lp_test_round.c new file mode 100644 index 0000000000..f571a81a4a --- /dev/null +++ b/src/gallium/drivers/llvmpipe/lp_test_round.c @@ -0,0 +1,277 @@ +/************************************************************************** + * + * Copyright 2010 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#include <stdlib.h> +#include <stdio.h> + +#include "util/u_pointer.h" +#include "gallivm/lp_bld.h" +#include "gallivm/lp_bld_printf.h" +#include "gallivm/lp_bld_arit.h" + +#include <llvm-c/Analysis.h> +#include <llvm-c/ExecutionEngine.h> +#include <llvm-c/Target.h> +#include <llvm-c/Transforms/Scalar.h> + +#include "lp_test.h" + + +void +write_tsv_header(FILE *fp) +{ + fprintf(fp, + "result\t" + "format\n"); + + fflush(fp); +} + + +#ifdef PIPE_ARCH_SSE + +#define USE_SSE2 +#include "sse_mathfun.h" + +typedef __m128 (*test_round_t)(__m128); + +typedef LLVMValueRef (*lp_func_t)(struct lp_build_context *, LLVMValueRef); + + +static LLVMValueRef +add_test(LLVMModuleRef module, const char *name, lp_func_t lp_func) +{ + LLVMTypeRef v4sf = LLVMVectorType(LLVMFloatType(), 4); + LLVMTypeRef args[1] = { v4sf }; + LLVMValueRef func = LLVMAddFunction(module, name, LLVMFunctionType(v4sf, args, 1, 0)); + LLVMValueRef arg1 = LLVMGetParam(func, 0); + LLVMBuilderRef builder = LLVMCreateBuilder(); + LLVMBasicBlockRef block = LLVMAppendBasicBlock(func, "entry"); + LLVMValueRef ret; + struct lp_build_context bld; + + bld.builder = builder; + bld.type.floating = 1; + bld.type.width = 32; + bld.type.length = 4; + + LLVMSetFunctionCallConv(func, LLVMCCallConv); + + LLVMPositionBuilderAtEnd(builder, block); + + ret = lp_func(&bld, arg1); + + LLVMBuildRet(builder, ret); + LLVMDisposeBuilder(builder); + return func; +} + +static void +printv(char* string, v4sf value) +{ + v4sf v = value; + float *f = (float *)&v; + printf("%s: %10f %10f %10f %10f\n", string, + f[0], f[1], f[2], f[3]); +} + +static void +compare(v4sf x, v4sf y) +{ + float *xp = (float *) &x; + float *yp = (float *) &y; + if (xp[0] != yp[0] || + xp[1] != yp[1] || + xp[2] != yp[2] || + xp[3] != yp[3]) { + printf(" Incorrect result! ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ \n"); + } +} + + + +PIPE_ALIGN_STACK +static boolean +test_round(unsigned verbose, FILE *fp) +{ + LLVMModuleRef module = NULL; + LLVMValueRef test_round = NULL, test_trunc, test_floor, test_ceil; + LLVMExecutionEngineRef engine = NULL; + LLVMModuleProviderRef provider = NULL; + LLVMPassManagerRef pass = NULL; + char *error = NULL; + test_round_t round_func, trunc_func, floor_func, ceil_func; + float unpacked[4]; + unsigned packed; + boolean success = TRUE; + int i; + + module = LLVMModuleCreateWithName("test"); + + test_round = add_test(module, "round", lp_build_round); + test_trunc = add_test(module, "trunc", lp_build_trunc); + test_floor = add_test(module, "floor", lp_build_floor); + test_ceil = add_test(module, "ceil", lp_build_ceil); + + if(LLVMVerifyModule(module, LLVMPrintMessageAction, &error)) { + printf("LLVMVerifyModule: %s\n", error); + LLVMDumpModule(module); + abort(); + } + LLVMDisposeMessage(error); + + provider = LLVMCreateModuleProviderForExistingModule(module); + if (LLVMCreateJITCompiler(&engine, provider, 1, &error)) { + fprintf(stderr, "%s\n", error); + LLVMDisposeMessage(error); + abort(); + } + +#if 0 + pass = LLVMCreatePassManager(); + LLVMAddTargetData(LLVMGetExecutionEngineTargetData(engine), pass); + /* These are the passes currently listed in llvm-c/Transforms/Scalar.h, + * but there are more on SVN. */ + LLVMAddConstantPropagationPass(pass); + LLVMAddInstructionCombiningPass(pass); + LLVMAddPromoteMemoryToRegisterPass(pass); + LLVMAddGVNPass(pass); + LLVMAddCFGSimplificationPass(pass); + LLVMRunPassManager(pass, module); +#else + (void)pass; +#endif + + round_func = (test_round_t) pointer_to_func(LLVMGetPointerToGlobal(engine, test_round)); + trunc_func = (test_round_t) pointer_to_func(LLVMGetPointerToGlobal(engine, test_trunc)); + floor_func = (test_round_t) pointer_to_func(LLVMGetPointerToGlobal(engine, test_floor)); + ceil_func = (test_round_t) pointer_to_func(LLVMGetPointerToGlobal(engine, test_ceil)); + + memset(unpacked, 0, sizeof unpacked); + packed = 0; + + if (0) + LLVMDumpModule(module); + + for (i = 0; i < 3; i++) { + v4sf xvals[3] = { + {-10.0, -1, 0, 12.0}, + {-1.5, -0.25, 1.25, 2.5}, + {-0.99, -0.01, 0.01, 0.99} + }; + v4sf x = xvals[i]; + v4sf y, ref; + float *xp = (float *) &x; + float *refp = (float *) &ref; + + printf("\n"); + printv("x ", x); + + refp[0] = round(xp[0]); + refp[1] = round(xp[1]); + refp[2] = round(xp[2]); + refp[3] = round(xp[3]); + y = round_func(x); + printv("C round(x) ", ref); + printv("LLVM round(x)", y); + compare(ref, y); + + refp[0] = trunc(xp[0]); + refp[1] = trunc(xp[1]); + refp[2] = trunc(xp[2]); + refp[3] = trunc(xp[3]); + y = trunc_func(x); + printv("C trunc(x) ", ref); + printv("LLVM trunc(x)", y); + compare(ref, y); + + refp[0] = floor(xp[0]); + refp[1] = floor(xp[1]); + refp[2] = floor(xp[2]); + refp[3] = floor(xp[3]); + y = floor_func(x); + printv("C floor(x) ", ref); + printv("LLVM floor(x)", y); + compare(ref, y); + + refp[0] = ceil(xp[0]); + refp[1] = ceil(xp[1]); + refp[2] = ceil(xp[2]); + refp[3] = ceil(xp[3]); + y = ceil_func(x); + printv("C ceil(x) ", ref); + printv("LLVM ceil(x) ", y); + compare(ref, y); + } + + LLVMFreeMachineCodeForFunction(engine, test_round); + LLVMFreeMachineCodeForFunction(engine, test_trunc); + LLVMFreeMachineCodeForFunction(engine, test_floor); + LLVMFreeMachineCodeForFunction(engine, test_ceil); + + LLVMDisposeExecutionEngine(engine); + if(pass) + LLVMDisposePassManager(pass); + + return success; +} + +#else /* !PIPE_ARCH_SSE */ + +static boolean +test_round(unsigned verbose, FILE *fp) +{ + return TRUE; +} + +#endif /* !PIPE_ARCH_SSE */ + + +boolean +test_all(unsigned verbose, FILE *fp) +{ + boolean success = TRUE; + + test_round(verbose, fp); + + return success; +} + + +boolean +test_some(unsigned verbose, FILE *fp, unsigned long n) +{ + return test_all(verbose, fp); +} + +boolean +test_single(unsigned verbose, FILE *fp) +{ + printf("no test_single()"); + return TRUE; +} diff --git a/src/gallium/drivers/llvmpipe/lp_test_sincos.c b/src/gallium/drivers/llvmpipe/lp_test_sincos.c index c7a903a025..1366ecddcb 100644 --- a/src/gallium/drivers/llvmpipe/lp_test_sincos.c +++ b/src/gallium/drivers/llvmpipe/lp_test_sincos.c @@ -108,7 +108,6 @@ test_sincos(unsigned verbose, FILE *fp) test_sincos_t sin_func; test_sincos_t cos_func; float unpacked[4]; - unsigned packed; boolean success = TRUE; module = LLVMModuleCreateWithName("test"); @@ -149,7 +148,6 @@ test_sincos(unsigned verbose, FILE *fp) cos_func = (test_sincos_t)LLVMGetPointerToGlobal(engine, test_cos); memset(unpacked, 0, sizeof unpacked); - packed = 0; // LLVMDumpModule(module); diff --git a/src/gallium/drivers/llvmpipe/lp_texture.c b/src/gallium/drivers/llvmpipe/lp_texture.c index 0d526ead89..25112c10a6 100644 --- a/src/gallium/drivers/llvmpipe/lp_texture.c +++ b/src/gallium/drivers/llvmpipe/lp_texture.c @@ -36,6 +36,7 @@ #include "pipe/p_defines.h" #include "util/u_inlines.h" +#include "util/u_cpu_detect.h" #include "util/u_format.h" #include "util/u_math.h" #include "util/u_memory.h" @@ -55,6 +56,7 @@ #ifdef DEBUG static struct llvmpipe_resource resource_list; #endif +static unsigned id_counter = 0; static INLINE boolean @@ -183,8 +185,8 @@ llvmpipe_displaytarget_layout(struct llvmpipe_screen *screen, */ const unsigned width = align(lpr->base.width0, TILE_SIZE); const unsigned height = align(lpr->base.height0, TILE_SIZE); - const unsigned width_t = align(width, TILE_SIZE) / TILE_SIZE; - const unsigned height_t = align(height, TILE_SIZE) / TILE_SIZE; + const unsigned width_t = width / TILE_SIZE; + const unsigned height_t = height / TILE_SIZE; lpr->tiles_per_row[0] = width_t; lpr->tiles_per_image[0] = width_t * height_t; @@ -209,7 +211,6 @@ static struct pipe_resource * llvmpipe_resource_create(struct pipe_screen *_screen, const struct pipe_resource *templat) { - static unsigned id_counter = 0; struct llvmpipe_screen *screen = llvmpipe_screen(_screen); struct llvmpipe_resource *lpr = CALLOC_STRUCT(llvmpipe_resource); if (!lpr) @@ -389,7 +390,6 @@ llvmpipe_resource_map(struct pipe_resource *resource, map = llvmpipe_get_texture_image(lpr, face + zslice, level, tex_usage, layout); - assert(map); return map; } else { @@ -446,6 +446,10 @@ llvmpipe_resource_from_handle(struct pipe_screen *screen, { struct sw_winsys *winsys = llvmpipe_screen(screen)->winsys; struct llvmpipe_resource *lpr = CALLOC_STRUCT(llvmpipe_resource); + unsigned width, height, width_t, height_t; + + /* XXX Seems like from_handled depth textures doesn't work that well */ + if (!lpr) return NULL; @@ -453,6 +457,25 @@ llvmpipe_resource_from_handle(struct pipe_screen *screen, pipe_reference_init(&lpr->base.reference, 1); lpr->base.screen = screen; + width = align(lpr->base.width0, TILE_SIZE); + height = align(lpr->base.height0, TILE_SIZE); + width_t = width / TILE_SIZE; + height_t = height / TILE_SIZE; + + /* + * Looks like unaligned displaytargets work just fine, + * at least sampler/render ones. + */ +#if 0 + assert(lpr->base.width0 == width); + assert(lpr->base.height0 == height); +#endif + + lpr->tiles_per_row[0] = width_t; + lpr->tiles_per_image[0] = width_t * height_t; + lpr->num_slices_faces[0] = 1; + lpr->img_stride[0] = 0; + lpr->dt = winsys->displaytarget_from_handle(winsys, template, whandle, @@ -460,6 +483,17 @@ llvmpipe_resource_from_handle(struct pipe_screen *screen, if (!lpr->dt) goto fail; + lpr->layout[0] = alloc_layout_array(1, lpr->base.width0, lpr->base.height0); + + assert(lpr->layout[0]); + assert(lpr->layout[0][0] == LP_TEX_LAYOUT_NONE); + + lpr->id = id_counter++; + +#ifdef DEBUG + insert_at_tail(&resource_list, lpr); +#endif + return &lpr->base; fail: @@ -899,13 +933,15 @@ static void alloc_image_data(struct llvmpipe_resource *lpr, unsigned level, enum lp_texture_layout layout) { + uint alignment = MAX2(16, util_cpu_caps.cacheline); + if (lpr->dt) assert(level == 0); if (layout == LP_TEX_LAYOUT_TILED) { /* tiled data is stored in regular memory */ uint buffer_size = tex_image_size(lpr, level, layout); - lpr->tiled[level].data = align_malloc(buffer_size, 16); + lpr->tiled[level].data = align_malloc(buffer_size, alignment); } else { assert(layout == LP_TEX_LAYOUT_LINEAR); @@ -921,7 +957,7 @@ alloc_image_data(struct llvmpipe_resource *lpr, unsigned level, else { /* not a display target - allocate regular memory */ uint buffer_size = tex_image_size(lpr, level, LP_TEX_LAYOUT_LINEAR); - lpr->linear[level].data = align_malloc(buffer_size, 16); + lpr->linear[level].data = align_malloc(buffer_size, alignment); } } } @@ -1035,7 +1071,7 @@ llvmpipe_get_texture_image(struct llvmpipe_resource *lpr, layout_logic(cur_layout, layout, usage, &new_layout, &convert); - if (convert) { + if (convert && other_data && target_data) { if (layout == LP_TEX_LAYOUT_TILED) { lp_linear_to_tiled(other_data, target_data, x * TILE_SIZE, y * TILE_SIZE, @@ -1067,8 +1103,6 @@ llvmpipe_get_texture_image(struct llvmpipe_resource *lpr, width_t, height_t, layout); } - assert(target_data); - return target_data; } @@ -1138,7 +1172,7 @@ llvmpipe_get_texture_tile_linear(struct llvmpipe_resource *lpr, layout_logic(cur_layout, LP_TEX_LAYOUT_LINEAR, usage, &new_layout, &convert); - if (convert) { + if (convert && tiled_image && linear_image) { lp_tiled_to_linear(tiled_image, linear_image, x, y, TILE_SIZE, TILE_SIZE, lpr->base.format, lpr->row_stride[level], @@ -1187,13 +1221,16 @@ llvmpipe_get_texture_tile(struct llvmpipe_resource *lpr, cur_layout = llvmpipe_get_texture_tile_layout(lpr, face_slice, level, tx, ty); layout_logic(cur_layout, LP_TEX_LAYOUT_TILED, usage, &new_layout, &convert); - if (convert) { + if (convert && linear_image && tiled_image) { lp_linear_to_tiled(linear_image, tiled_image, x, y, TILE_SIZE, TILE_SIZE, lpr->base.format, lpr->row_stride[level], lpr->tiles_per_row[level]); } + if (!tiled_image) + return NULL; + if (new_layout != cur_layout) llvmpipe_set_texture_tile_layout(lpr, face_slice, level, tx, ty, new_layout); @@ -1206,6 +1243,94 @@ llvmpipe_get_texture_tile(struct llvmpipe_resource *lpr, /** + * Get pointer to tiled data for rendering. + * \return pointer to the tiled data at the given tile position + */ +void +llvmpipe_unswizzle_cbuf_tile(struct llvmpipe_resource *lpr, + unsigned face_slice, unsigned level, + unsigned x, unsigned y, + uint8_t *tile) +{ + struct llvmpipe_texture_image *linear_img = &lpr->linear[level]; + const unsigned tx = x / TILE_SIZE, ty = y / TILE_SIZE; + uint8_t *linear_image; + + assert(x % TILE_SIZE == 0); + assert(y % TILE_SIZE == 0); + + if (!linear_img->data) { + /* allocate memory for the linear image now */ + alloc_image_data(lpr, level, LP_TEX_LAYOUT_LINEAR); + } + + /* compute address of the slice/face of the image that contains the tile */ + linear_image = llvmpipe_get_texture_image_address(lpr, face_slice, level, + LP_TEX_LAYOUT_LINEAR); + + { + uint ii = x, jj = y; + uint tile_offset = jj / TILE_SIZE + ii / TILE_SIZE; + uint byte_offset = tile_offset * TILE_SIZE * TILE_SIZE * 4; + + /* Note that lp_tiled_to_linear expects the tile parameter to + * point at the first tile in a whole-image sized array. In + * this code, we have only a single tile and have to do some + * pointer arithmetic to figure out where the "image" would have + * started. + */ + lp_tiled_to_linear(tile - byte_offset, linear_image, + x, y, TILE_SIZE, TILE_SIZE, + lpr->base.format, + lpr->row_stride[level], + 1); /* tiles per row */ + } + + llvmpipe_set_texture_tile_layout(lpr, face_slice, level, tx, ty, + LP_TEX_LAYOUT_LINEAR); +} + + +/** + * Get pointer to tiled data for rendering. + * \return pointer to the tiled data at the given tile position + */ +void +llvmpipe_swizzle_cbuf_tile(struct llvmpipe_resource *lpr, + unsigned face_slice, unsigned level, + unsigned x, unsigned y, + uint8_t *tile) +{ + uint8_t *linear_image; + + assert(x % TILE_SIZE == 0); + assert(y % TILE_SIZE == 0); + + /* compute address of the slice/face of the image that contains the tile */ + linear_image = llvmpipe_get_texture_image_address(lpr, face_slice, level, + LP_TEX_LAYOUT_LINEAR); + + if (linear_image) { + uint ii = x, jj = y; + uint tile_offset = jj / TILE_SIZE + ii / TILE_SIZE; + uint byte_offset = tile_offset * TILE_SIZE * TILE_SIZE * 4; + + /* Note that lp_linear_to_tiled expects the tile parameter to + * point at the first tile in a whole-image sized array. In + * this code, we have only a single tile and have to do some + * pointer arithmetic to figure out where the "image" would have + * started. + */ + lp_linear_to_tiled(linear_image, tile - byte_offset, + x, y, TILE_SIZE, TILE_SIZE, + lpr->base.format, + lpr->row_stride[level], + 1); /* tiles per row */ + } +} + + +/** * Return size of resource in bytes */ unsigned diff --git a/src/gallium/drivers/llvmpipe/lp_texture.h b/src/gallium/drivers/llvmpipe/lp_texture.h index 503b6a19a8..4e4a65dcb4 100644 --- a/src/gallium/drivers/llvmpipe/lp_texture.h +++ b/src/gallium/drivers/llvmpipe/lp_texture.h @@ -223,6 +223,17 @@ llvmpipe_get_texture_tile(struct llvmpipe_resource *lpr, unsigned x, unsigned y); +void +llvmpipe_unswizzle_cbuf_tile(struct llvmpipe_resource *lpr, + unsigned face_slice, unsigned level, + unsigned x, unsigned y, + uint8_t *tile); + +void +llvmpipe_swizzle_cbuf_tile(struct llvmpipe_resource *lpr, + unsigned face_slice, unsigned level, + unsigned x, unsigned y, + uint8_t *tile); extern void llvmpipe_print_resources(void); diff --git a/src/gallium/drivers/llvmpipe/lp_tile_image.c b/src/gallium/drivers/llvmpipe/lp_tile_image.c index 2b63992dd7..0938f7aea7 100644 --- a/src/gallium/drivers/llvmpipe/lp_tile_image.c +++ b/src/gallium/drivers/llvmpipe/lp_tile_image.c @@ -204,7 +204,7 @@ lp_tiled_to_linear(const void *src, void *dst, lp_tile_unswizzle_4ub(format, src_tile, dst, dst_stride, - ii, jj, tile_w, tile_h); + ii, jj); } } } @@ -293,7 +293,7 @@ lp_linear_to_tiled(const void *src, void *dst, lp_tile_swizzle_4ub(format, dst_tile, src, src_stride, - ii, jj, tile_w, tile_h); + ii, jj); } } } diff --git a/src/gallium/drivers/llvmpipe/lp_tile_shuffle_mask.py b/src/gallium/drivers/llvmpipe/lp_tile_shuffle_mask.py new file mode 100644 index 0000000000..ea2fc0f375 --- /dev/null +++ b/src/gallium/drivers/llvmpipe/lp_tile_shuffle_mask.py @@ -0,0 +1,32 @@ + +tile = [[0,1,4,5], + [2,3,6,7], + [8,9,12,13], + [10,11,14,15]] +shift = 0 +align = 1 +value = 0L +holder = [] + +import sys + +basemask = [0x +fd = sys.stdout +indent = " "*9 +for c in range(4): + fd.write(indent + "*pdst++ = \n"); + for l,line in enumerate(tile): + fd.write(indent + " %s_mm_shuffle_epi8(line%d, (__m128i){"%(l and '+' or ' ',l)) + for i,pos in enumerate(line): + mask = 0x00ffffffff & (~(0xffL << shift)) + value = mask | ((pos) << shift) + holder.append(value) + if holder and (i + 1) %2 == 0: + fd.write("0x%8.0x"%(holder[0] + (holder[1] << 32))) + holder = [] + if (i) %4 == 1: + fd.write( ',') + + fd.write("})%s\n"%((l == 3) and ';' or '')) + print + shift += 8 diff --git a/src/gallium/drivers/llvmpipe/lp_tile_soa.h b/src/gallium/drivers/llvmpipe/lp_tile_soa.h index 07f71b8411..12dac1da6c 100644 --- a/src/gallium/drivers/llvmpipe/lp_tile_soa.h +++ b/src/gallium/drivers/llvmpipe/lp_tile_soa.h @@ -79,14 +79,14 @@ void lp_tile_swizzle_4ub(enum pipe_format format, uint8_t *dst, const void *src, unsigned src_stride, - unsigned x, unsigned y, unsigned w, unsigned h); + unsigned x, unsigned y); void lp_tile_unswizzle_4ub(enum pipe_format format, const uint8_t *src, void *dst, unsigned dst_stride, - unsigned x, unsigned y, unsigned w, unsigned h); + unsigned x, unsigned y); diff --git a/src/gallium/drivers/llvmpipe/lp_tile_soa.py b/src/gallium/drivers/llvmpipe/lp_tile_soa.py index 5ab63cbac6..c71ec8066c 100644 --- a/src/gallium/drivers/llvmpipe/lp_tile_soa.py +++ b/src/gallium/drivers/llvmpipe/lp_tile_soa.py @@ -75,13 +75,13 @@ def generate_format_read(format, dst_channel, dst_native_type, dst_suffix): src_native_type = native_type(format) print 'static void' - print 'lp_tile_%s_swizzle_%s(%s *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)' % (name, dst_suffix, dst_native_type) + print 'lp_tile_%s_swizzle_%s(%s *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)' % (name, dst_suffix, dst_native_type) print '{' print ' unsigned x, y;' print ' const uint8_t *src_row = src + y0*src_stride;' - print ' for (y = 0; y < h; ++y) {' + print ' for (y = 0; y < TILE_SIZE; ++y) {' print ' const %s *src_pixel = (const %s *)(src_row + x0*%u);' % (src_native_type, src_native_type, format.stride()) - print ' for (x = 0; x < w; ++x) {' + print ' for (x = 0; x < TILE_SIZE; ++x) {' names = ['']*4 if format.colorspace in ('rgb', 'srgb'): @@ -202,9 +202,9 @@ def emit_unrolled_unswizzle_code(format, src_channel): print ' %s *dstpix = (%s *) dst;' % (dst_native_type, dst_native_type) print ' unsigned int qx, qy, i;' print - print ' for (qy = 0; qy < h; qy += TILE_VECTOR_HEIGHT) {' + print ' for (qy = 0; qy < TILE_SIZE; qy += TILE_VECTOR_HEIGHT) {' print ' const unsigned py = y0 + qy;' - print ' for (qx = 0; qx < w; qx += TILE_VECTOR_WIDTH) {' + print ' for (qx = 0; qx < TILE_SIZE; qx += TILE_VECTOR_WIDTH) {' print ' const unsigned px = x0 + qx;' print ' const uint8_t *r = src + 0 * TILE_C_STRIDE;' print ' const uint8_t *g = src + 1 * TILE_C_STRIDE;' @@ -231,9 +231,9 @@ def emit_tile_pixel_unswizzle_code(format, src_channel): print ' unsigned x, y;' print ' uint8_t *dst_row = dst + y0*dst_stride;' - print ' for (y = 0; y < h; ++y) {' + print ' for (y = 0; y < TILE_SIZE; ++y) {' print ' %s *dst_pixel = (%s *)(dst_row + x0*%u);' % (dst_native_type, dst_native_type, format.stride()) - print ' for (x = 0; x < w; ++x) {' + print ' for (x = 0; x < TILE_SIZE; ++x) {' if format.layout == PLAIN: if not format.is_array(): @@ -273,7 +273,7 @@ def generate_format_write(format, src_channel, src_native_type, src_suffix): name = format.short_name() print 'static void' - print 'lp_tile_%s_unswizzle_%s(const %s *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h)' % (name, src_suffix, src_native_type) + print 'lp_tile_%s_unswizzle_%s(const %s *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)' % (name, src_suffix, src_native_type) print '{' if format.layout == PLAIN \ and format.colorspace == 'rgb' \ @@ -289,6 +289,202 @@ def generate_format_write(format, src_channel, src_native_type, src_suffix): print +def generate_ssse3(): + print ''' +#if defined(PIPE_ARCH_SSE) + + +#if defined(PIPE_ARCH_SSSE3) + +#include <tmmintrin.h> + +#else + +#include <emmintrin.h> + +/** + * Describe _mm_shuffle_epi8() with gcc extended inline assembly, for cases + * where -mssse3 is not supported/enabled. + * + * MSVC will never get in here as its intrinsics support do not rely on + * compiler command line options. + */ +static __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shuffle_epi8(__m128i a, __m128i mask) +{ + __m128i result; + __asm__("pshufb %1, %0" + : "=x" (result) + : "xm" (mask), "0" (a)); + return result; +} + +#endif + + +static void +lp_tile_b8g8r8a8_unorm_swizzle_4ub_ssse3(uint8_t *dst, + const uint8_t *src, unsigned src_stride, + unsigned x0, unsigned y0) +{ + + unsigned x, y; + __m128i *pdst = (__m128i*) dst; + const uint8_t *ysrc0 = src + y0*src_stride + x0*sizeof(uint32_t); + unsigned int tile_stridex = src_stride*(TILE_VECTOR_HEIGHT - 1) - sizeof(uint32_t)*TILE_VECTOR_WIDTH; + unsigned int tile_stridey = src_stride*TILE_VECTOR_HEIGHT; + + const __m128i shuffle00 = _mm_setr_epi8(0x02,0x06,0xff,0xff,0x0a,0x0e,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff); + const __m128i shuffle01 = _mm_setr_epi8(0x01,0x05,0xff,0xff,0x09,0x0d,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff); + const __m128i shuffle02 = _mm_setr_epi8(0x00,0x04,0xff,0xff,0x08,0x0c,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff); + const __m128i shuffle03 = _mm_setr_epi8(0x03,0x07,0xff,0xff,0x0b,0x0f,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff); + + const __m128i shuffle10 = _mm_setr_epi8(0xff,0xff,0x02,0x06,0xff,0xff,0x0a,0x0e,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff); + const __m128i shuffle11 = _mm_setr_epi8(0xff,0xff,0x01,0x05,0xff,0xff,0x09,0x0d,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff); + const __m128i shuffle12 = _mm_setr_epi8(0xff,0xff,0x00,0x04,0xff,0xff,0x08,0x0c,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff); + const __m128i shuffle13 = _mm_setr_epi8(0xff,0xff,0x03,0x07,0xff,0xff,0x0b,0x0f,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff); + + const __m128i shuffle20 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x02,0x06,0xff,0xff,0x0a,0x0e,0xff,0xff); + const __m128i shuffle21 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x01,0x05,0xff,0xff,0x09,0x0d,0xff,0xff); + const __m128i shuffle22 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x04,0xff,0xff,0x08,0x0c,0xff,0xff); + const __m128i shuffle23 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x03,0x07,0xff,0xff,0x0b,0x0f,0xff,0xff); + + const __m128i shuffle30 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x02,0x06,0xff,0xff,0x0a,0x0e); + const __m128i shuffle31 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x01,0x05,0xff,0xff,0x09,0x0d); + const __m128i shuffle32 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x04,0xff,0xff,0x08,0x0c); + const __m128i shuffle33 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x03,0x07,0xff,0xff,0x0b,0x0f); + + for (y = 0; y < TILE_SIZE; y += TILE_VECTOR_HEIGHT) { + __m128i line0 = *(__m128i*)ysrc0; + const uint8_t *ysrc = ysrc0 + src_stride; + ysrc0 += tile_stridey; + + for (x = 0; x < TILE_SIZE; x += TILE_VECTOR_WIDTH) { + __m128i r, g, b, a, line1; + line1 = *(__m128i*)ysrc; + PIPE_READ_WRITE_BARRIER(); + ysrc += src_stride; + r = _mm_shuffle_epi8(line0, shuffle00); + g = _mm_shuffle_epi8(line0, shuffle01); + b = _mm_shuffle_epi8(line0, shuffle02); + a = _mm_shuffle_epi8(line0, shuffle03); + + line0 = *(__m128i*)ysrc; + PIPE_READ_WRITE_BARRIER(); + ysrc += src_stride; + r = _mm_or_si128(r, _mm_shuffle_epi8(line1, shuffle10)); + g = _mm_or_si128(g, _mm_shuffle_epi8(line1, shuffle11)); + b = _mm_or_si128(b, _mm_shuffle_epi8(line1, shuffle12)); + a = _mm_or_si128(a, _mm_shuffle_epi8(line1, shuffle13)); + + line1 = *(__m128i*)ysrc; + PIPE_READ_WRITE_BARRIER(); + ysrc -= tile_stridex; + r = _mm_or_si128(r, _mm_shuffle_epi8(line0, shuffle20)); + g = _mm_or_si128(g, _mm_shuffle_epi8(line0, shuffle21)); + b = _mm_or_si128(b, _mm_shuffle_epi8(line0, shuffle22)); + a = _mm_or_si128(a, _mm_shuffle_epi8(line0, shuffle23)); + + if (x + 1 < TILE_SIZE) { + line0 = *(__m128i*)ysrc; + ysrc += src_stride; + } + + PIPE_READ_WRITE_BARRIER(); + r = _mm_or_si128(r, _mm_shuffle_epi8(line1, shuffle30)); + g = _mm_or_si128(g, _mm_shuffle_epi8(line1, shuffle31)); + b = _mm_or_si128(b, _mm_shuffle_epi8(line1, shuffle32)); + a = _mm_or_si128(a, _mm_shuffle_epi8(line1, shuffle33)); + + *pdst++ = r; + *pdst++ = g; + *pdst++ = b; + *pdst++ = a; + } + } + +} + +static void +lp_tile_b8g8r8a8_unorm_unswizzle_4ub_ssse3(const uint8_t *src, + uint8_t *dst, unsigned dst_stride, + unsigned x0, unsigned y0) +{ + unsigned int x, y; + const __m128i *psrc = (__m128i*) src; + const __m128i *end = (__m128i*) (src + (y0 + TILE_SIZE - 1)*dst_stride + (x0 + TILE_SIZE - 1)*sizeof(uint32_t)); + uint8_t *pdst = dst + y0 * dst_stride + x0 * sizeof(uint32_t); + __m128i c0 = *psrc++; + __m128i c1; + + const __m128i shuffle00 = _mm_setr_epi8(0xff,0xff,0x00,0xff,0xff,0xff,0x01,0xff,0xff,0xff,0x04,0xff,0xff,0xff,0x05,0xff); + const __m128i shuffle01 = _mm_setr_epi8(0xff,0xff,0x02,0xff,0xff,0xff,0x03,0xff,0xff,0xff,0x06,0xff,0xff,0xff,0x07,0xff); + const __m128i shuffle02 = _mm_setr_epi8(0xff,0xff,0x08,0xff,0xff,0xff,0x09,0xff,0xff,0xff,0x0c,0xff,0xff,0xff,0x0d,0xff); + const __m128i shuffle03 = _mm_setr_epi8(0xff,0xff,0x0a,0xff,0xff,0xff,0x0b,0xff,0xff,0xff,0x0e,0xff,0xff,0xff,0x0f,0xff); + + const __m128i shuffle10 = _mm_setr_epi8(0xff,0x00,0xff,0xff,0xff,0x01,0xff,0xff,0xff,0x04,0xff,0xff,0xff,0x05,0xff,0xff); + const __m128i shuffle11 = _mm_setr_epi8(0xff,0x02,0xff,0xff,0xff,0x03,0xff,0xff,0xff,0x06,0xff,0xff,0xff,0x07,0xff,0xff); + const __m128i shuffle12 = _mm_setr_epi8(0xff,0x08,0xff,0xff,0xff,0x09,0xff,0xff,0xff,0x0c,0xff,0xff,0xff,0x0d,0xff,0xff); + const __m128i shuffle13 = _mm_setr_epi8(0xff,0x0a,0xff,0xff,0xff,0x0b,0xff,0xff,0xff,0x0e,0xff,0xff,0xff,0x0f,0xff,0xff); + + const __m128i shuffle20 = _mm_setr_epi8(0x00,0xff,0xff,0xff,0x01,0xff,0xff,0xff,0x04,0xff,0xff,0xff,0x05,0xff,0xff,0xff); + const __m128i shuffle21 = _mm_setr_epi8(0x02,0xff,0xff,0xff,0x03,0xff,0xff,0xff,0x06,0xff,0xff,0xff,0x07,0xff,0xff,0xff); + const __m128i shuffle22 = _mm_setr_epi8(0x08,0xff,0xff,0xff,0x09,0xff,0xff,0xff,0x0c,0xff,0xff,0xff,0x0d,0xff,0xff,0xff); + const __m128i shuffle23 = _mm_setr_epi8(0x0a,0xff,0xff,0xff,0x0b,0xff,0xff,0xff,0x0e,0xff,0xff,0xff,0x0f,0xff,0xff,0xff); + + const __m128i shuffle30 = _mm_setr_epi8(0xff,0xff,0xff,0x00,0xff,0xff,0xff,0x01,0xff,0xff,0xff,0x04,0xff,0xff,0xff,0x05); + const __m128i shuffle31 = _mm_setr_epi8(0xff,0xff,0xff,0x02,0xff,0xff,0xff,0x03,0xff,0xff,0xff,0x06,0xff,0xff,0xff,0x07); + const __m128i shuffle32 = _mm_setr_epi8(0xff,0xff,0xff,0x08,0xff,0xff,0xff,0x09,0xff,0xff,0xff,0x0c,0xff,0xff,0xff,0x0d); + const __m128i shuffle33 = _mm_setr_epi8(0xff,0xff,0xff,0x0a,0xff,0xff,0xff,0x0b,0xff,0xff,0xff,0x0e,0xff,0xff,0xff,0x0f); + + for (y = 0; y < TILE_SIZE; y += TILE_VECTOR_HEIGHT) { + __m128i *tile = (__m128i*) pdst; + pdst += dst_stride * TILE_VECTOR_HEIGHT; + for (x = 0; x < TILE_SIZE; x += TILE_VECTOR_WIDTH) { + uint8_t *linep = (uint8_t*) (tile++); + __m128i line0, line1, line2, line3; + + c1 = *psrc++; /* r */ + PIPE_READ_WRITE_BARRIER(); + line0 = _mm_shuffle_epi8(c0, shuffle00); + line1 = _mm_shuffle_epi8(c0, shuffle01); + line2 = _mm_shuffle_epi8(c0, shuffle02); + line3 = _mm_shuffle_epi8(c0, shuffle03); + + c0 = *psrc++; /* g */ + PIPE_READ_WRITE_BARRIER(); + line0 = _mm_or_si128(line0, _mm_shuffle_epi8(c1, shuffle10)); + line1 = _mm_or_si128(line1, _mm_shuffle_epi8(c1, shuffle11)); + line2 = _mm_or_si128(line2, _mm_shuffle_epi8(c1, shuffle12)); + line3 = _mm_or_si128(line3, _mm_shuffle_epi8(c1, shuffle13)); + + c1 = *psrc++; /* b */ + PIPE_READ_WRITE_BARRIER(); + line0 = _mm_or_si128(line0, _mm_shuffle_epi8(c0, shuffle20)); + line1 = _mm_or_si128(line1, _mm_shuffle_epi8(c0, shuffle21)); + line2 = _mm_or_si128(line2, _mm_shuffle_epi8(c0, shuffle22)); + line3 = _mm_or_si128(line3, _mm_shuffle_epi8(c0, shuffle23)); + + if (psrc != end) + c0 = *psrc++; /* a */ + PIPE_READ_WRITE_BARRIER(); + line0 = _mm_or_si128(line0, _mm_shuffle_epi8(c1, shuffle30)); + line1 = _mm_or_si128(line1, _mm_shuffle_epi8(c1, shuffle31)); + line2 = _mm_or_si128(line2, _mm_shuffle_epi8(c1, shuffle32)); + line3 = _mm_or_si128(line3, _mm_shuffle_epi8(c1, shuffle33)); + + *(__m128i*) (linep) = line0; + *(__m128i*) (((char*)linep) + dst_stride) = line1; + *(__m128i*) (((char*)linep) + 2 * dst_stride) = line2; + *(__m128i*) (((char*)linep) + 3 * dst_stride) = line3; + } + } +} + +#endif /* PIPE_ARCH_SSSE3 */ +''' + + def generate_swizzle(formats, dst_channel, dst_native_type, dst_suffix): '''Generate the dispatch function to read pixels from any format''' @@ -297,9 +493,9 @@ def generate_swizzle(formats, dst_channel, dst_native_type, dst_suffix): generate_format_read(format, dst_channel, dst_native_type, dst_suffix) print 'void' - print 'lp_tile_swizzle_%s(enum pipe_format format, %s *dst, const void *src, unsigned src_stride, unsigned x, unsigned y, unsigned w, unsigned h)' % (dst_suffix, dst_native_type) + print 'lp_tile_swizzle_%s(enum pipe_format format, %s *dst, const void *src, unsigned src_stride, unsigned x, unsigned y)' % (dst_suffix, dst_native_type) print '{' - print ' void (*func)(%s *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0, unsigned w, unsigned h);' % dst_native_type + print ' void (*func)(%s *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0);' % dst_native_type print '#ifdef DEBUG' print ' lp_tile_swizzle_count += 1;' print '#endif' @@ -307,13 +503,21 @@ def generate_swizzle(formats, dst_channel, dst_native_type, dst_suffix): for format in formats: if is_format_supported(format): print ' case %s:' % format.name - print ' func = &lp_tile_%s_swizzle_%s;' % (format.short_name(), dst_suffix) + func_name = 'lp_tile_%s_swizzle_%s' % (format.short_name(), dst_suffix) + if format.name == 'PIPE_FORMAT_B8G8R8A8_UNORM': + print '#ifdef PIPE_ARCH_SSE' + print ' func = util_cpu_caps.has_ssse3 ? %s_ssse3 : %s;' % (func_name, func_name) + print '#else' + print ' func = %s;' % (func_name,) + print '#endif' + else: + print ' func = %s;' % (func_name,) print ' break;' print ' default:' print ' debug_printf("%s: unsupported format %s\\n", __FUNCTION__, util_format_name(format));' print ' return;' print ' }' - print ' func(dst, (const uint8_t *)src, src_stride, x, y, w, h);' + print ' func(dst, (const uint8_t *)src, src_stride, x, y);' print '}' print @@ -326,10 +530,10 @@ def generate_unswizzle(formats, src_channel, src_native_type, src_suffix): generate_format_write(format, src_channel, src_native_type, src_suffix) print 'void' - print 'lp_tile_unswizzle_%s(enum pipe_format format, const %s *src, void *dst, unsigned dst_stride, unsigned x, unsigned y, unsigned w, unsigned h)' % (src_suffix, src_native_type) + print 'lp_tile_unswizzle_%s(enum pipe_format format, const %s *src, void *dst, unsigned dst_stride, unsigned x, unsigned y)' % (src_suffix, src_native_type) print '{' - print ' void (*func)(const %s *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0, unsigned w, unsigned h);' % src_native_type + print ' void (*func)(const %s *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0);' % src_native_type print '#ifdef DEBUG' print ' lp_tile_unswizzle_count += 1;' print '#endif' @@ -337,13 +541,21 @@ def generate_unswizzle(formats, src_channel, src_native_type, src_suffix): for format in formats: if is_format_supported(format): print ' case %s:' % format.name - print ' func = &lp_tile_%s_unswizzle_%s;' % (format.short_name(), src_suffix) + func_name = 'lp_tile_%s_unswizzle_%s' % (format.short_name(), src_suffix) + if format.name == 'PIPE_FORMAT_B8G8R8A8_UNORM': + print '#ifdef PIPE_ARCH_SSE' + print ' func = util_cpu_caps.has_ssse3 ? %s_ssse3 : %s;' % (func_name, func_name) + print '#else' + print ' func = %s;' % (func_name,) + print '#endif' + else: + print ' func = %s;' % (func_name,) print ' break;' print ' default:' print ' debug_printf("%s: unsupported format %s\\n", __FUNCTION__, util_format_name(format));' print ' return;' print ' }' - print ' func(src, (uint8_t *)dst, dst_stride, x, y, w, h);' + print ' func(src, (uint8_t *)dst, dst_stride, x, y);' print '}' print @@ -362,6 +574,7 @@ def main(): print '#include "util/u_format.h"' print '#include "util/u_math.h"' print '#include "util/u_half.h"' + print '#include "util/u_cpu_detect.h"' print '#include "lp_tile_soa.h"' print print '#ifdef DEBUG' @@ -391,6 +604,8 @@ def main(): print '};' print + generate_ssse3() + channel = Channel(UNSIGNED, True, 8) native_type = 'uint8_t' suffix = '4ub' |