diff options
author | Roland Scheidegger <sroland@vmware.com> | 2010-09-24 15:17:07 +0200 |
---|---|---|
committer | José Fonseca <jfonseca@vmware.com> | 2010-09-25 12:19:31 +0100 |
commit | 049a8cce76b1148ab00de1ae207171e519bfafca (patch) | |
tree | 2dc7640c23c22be517ee1e2d60714733ade41bc5 /src/gallium/auxiliary/gallivm | |
parent | 46d05d4ef99857e50d978247917f3e16574418f4 (diff) |
gallivm: optimize yuv decoding
this is more a proof to show vector shifts on x86 with per-element shift count
are evil. Since we can avoid the shift with a single compare/select, use that
instead. Replaces more than 20 instructions (and slow ones at that) with about 3,
and cuts compiled shader size with mesa's yuvsqure demo by over 10%
(no performance measurements done - but selection is blazing fast).
Might want to revisit that for future cpus - unfortunately AVX won't have vector
shifts neither, but AMD's XOP will, but even in that case using selection here
is probably not slower.
Diffstat (limited to 'src/gallium/auxiliary/gallivm')
-rw-r--r-- | src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c | 64 |
1 files changed, 55 insertions, 9 deletions
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c b/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c index 0a5038bc98..2bce289555 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c @@ -35,6 +35,7 @@ #include "util/u_format.h" +#include "util/u_cpu_detect.h" #include "lp_bld_arit.h" #include "lp_bld_type.h" @@ -42,7 +43,7 @@ #include "lp_bld_conv.h" #include "lp_bld_gather.h" #include "lp_bld_format.h" - +#include "lp_bld_logic.h" /** * Extract Y, U, V channels from packed UYVY. @@ -59,7 +60,7 @@ uyvy_to_yuv_soa(LLVMBuilderRef builder, LLVMValueRef *v) { struct lp_type type; - LLVMValueRef shift, mask; + LLVMValueRef mask; memset(&type, 0, sizeof type); type.width = 32; @@ -69,14 +70,37 @@ uyvy_to_yuv_soa(LLVMBuilderRef builder, assert(lp_check_value(type, i)); /* - * y = (uyvy >> 16*i) & 0xff + * y = (uyvy >> (16*i + 8)) & 0xff * u = (uyvy ) & 0xff * v = (uyvy >> 16 ) & 0xff */ - shift = LLVMBuildMul(builder, i, lp_build_const_int_vec(type, 16), ""); - shift = LLVMBuildAdd(builder, shift, lp_build_const_int_vec(type, 8), ""); - *y = LLVMBuildLShr(builder, packed, shift, ""); +#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) + /* + * Avoid shift with per-element count. + * No support on x86, gets translated to roughly 5 instructions + * per element. Didn't measure performance but cuts shader size + * by quite a bit (less difference if cpu has no sse4.1 support). + */ + if (util_cpu_caps.has_sse2 && n == 4) { + LLVMValueRef sel, tmp, tmp2; + struct lp_build_context bld32; + + lp_build_context_init(&bld32, builder, type); + + tmp = LLVMBuildLShr(builder, packed, lp_build_const_int_vec(type, 8), ""); + tmp2 = LLVMBuildLShr(builder, tmp, lp_build_const_int_vec(type, 16), ""); + sel = lp_build_compare(builder, type, PIPE_FUNC_EQUAL, i, lp_build_const_int_vec(type, 0)); + *y = lp_build_select(&bld32, sel, tmp, tmp2); + } else +#endif + { + LLVMValueRef shift; + shift = LLVMBuildMul(builder, i, lp_build_const_int_vec(type, 16), ""); + shift = LLVMBuildAdd(builder, shift, lp_build_const_int_vec(type, 8), ""); + *y = LLVMBuildLShr(builder, packed, shift, ""); + } + *u = packed; *v = LLVMBuildLShr(builder, packed, lp_build_const_int_vec(type, 16), ""); @@ -103,7 +127,7 @@ yuyv_to_yuv_soa(LLVMBuilderRef builder, LLVMValueRef *v) { struct lp_type type; - LLVMValueRef shift, mask; + LLVMValueRef mask; memset(&type, 0, sizeof type); type.width = 32; @@ -118,8 +142,30 @@ yuyv_to_yuv_soa(LLVMBuilderRef builder, * v = (yuyv >> 24 ) & 0xff */ - shift = LLVMBuildMul(builder, i, lp_build_const_int_vec(type, 16), ""); - *y = LLVMBuildLShr(builder, packed, shift, ""); +#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) + /* + * Avoid shift with per-element count. + * No support on x86, gets translated to roughly 5 instructions + * per element. Didn't measure performance but cuts shader size + * by quite a bit (less difference if cpu has no sse4.1 support). + */ + if (util_cpu_caps.has_sse2 && n == 4) { + LLVMValueRef sel, tmp; + struct lp_build_context bld32; + + lp_build_context_init(&bld32, builder, type); + + tmp = LLVMBuildLShr(builder, packed, lp_build_const_int_vec(type, 16), ""); + sel = lp_build_compare(builder, type, PIPE_FUNC_EQUAL, i, lp_build_const_int_vec(type, 0)); + *y = lp_build_select(&bld32, sel, packed, tmp); + } else +#endif + { + LLVMValueRef shift; + shift = LLVMBuildMul(builder, i, lp_build_const_int_vec(type, 16), ""); + *y = LLVMBuildLShr(builder, packed, shift, ""); + } + *u = LLVMBuildLShr(builder, packed, lp_build_const_int_vec(type, 8), ""); *v = LLVMBuildLShr(builder, packed, lp_build_const_int_vec(type, 24), ""); |