From 607e3c542cedd645da91c96abfe6698623acf503 Mon Sep 17 00:00:00 2001 From: Keith Whitwell Date: Mon, 4 Oct 2010 15:00:34 +0100 Subject: gallivm: special case conversion 4x4f to 1x16ub Nice reduction in the number of operations required for final color output in many shaders. --- src/gallium/auxiliary/gallivm/lp_bld_conv.c | 84 +++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) (limited to 'src/gallium/auxiliary/gallivm') diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c b/src/gallium/auxiliary/gallivm/lp_bld_conv.c index 8b477313d4..605eb043c7 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c @@ -69,6 +69,7 @@ #include "lp_bld_arit.h" #include "lp_bld_pack.h" #include "lp_bld_conv.h" +#include "lp_bld_intr.h" /** @@ -241,6 +242,89 @@ lp_build_conv(LLVMBuilderRef builder, } num_tmps = num_srcs; + + /* Special case 4x4f --> 1x16ub + */ + if (src_type.floating == 1 && + src_type.fixed == 0 && + src_type.sign == 1 && + src_type.norm == 0 && + src_type.width == 32 && + src_type.length == 4 && + + dst_type.floating == 0 && + dst_type.fixed == 0 && + dst_type.sign == 0 && + dst_type.norm == 1 && + dst_type.width == 8 && + dst_type.length == 16) + { + int i; + + for (i = 0; i < num_dsts; i++, src += 4) { + struct lp_type int16_type = dst_type; + struct lp_type int32_type = dst_type; + LLVMValueRef lo, hi; + LLVMValueRef src_int0; + LLVMValueRef src_int1; + LLVMValueRef src_int2; + LLVMValueRef src_int3; + LLVMTypeRef int16_vec_type; + LLVMTypeRef int32_vec_type; + LLVMTypeRef src_vec_type; + LLVMTypeRef dst_vec_type; + LLVMValueRef const_255f; + + int16_type.width *= 2; + int16_type.length /= 2; + int16_type.sign = 1; + + int32_type.width *= 4; + int32_type.length /= 4; + int32_type.sign = 1; + + src_vec_type = lp_build_vec_type(src_type); + dst_vec_type = lp_build_vec_type(dst_type); + int16_vec_type = lp_build_vec_type(int16_type); + int32_vec_type = lp_build_vec_type(int32_type); + + const_255f = lp_build_const_vec(src_type, 255.0); + + src_int0 = LLVMBuildFPToSI(builder, + LLVMBuildFMul(builder, src[0], const_255f, ""), + int32_vec_type, ""); + + src_int1 = LLVMBuildFPToSI(builder, + LLVMBuildFMul(builder, src[1], const_255f, ""), + int32_vec_type, ""); + + src_int2 = LLVMBuildFPToSI(builder, + LLVMBuildFMul(builder, src[2], const_255f, ""), + int32_vec_type, ""); + + src_int3 = LLVMBuildFPToSI(builder, + LLVMBuildFMul(builder, src[3], const_255f, ""), + int32_vec_type, ""); + +#if HAVE_LLVM >= 0x0207 + lo = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packssdw.128", + int16_vec_type, src_int0, src_int1); + hi = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packssdw.128", + int16_vec_type, src_int2, src_int3); + dst[i] = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packuswb.128", + dst_vec_type, lo, hi); +#else + lo = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packssdw.128", + int32_vec_type, src_int0, src_int1); + hi = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packssdw.128", + int32_vec_type, src_int2, src_int3); + dst[i] = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packuswb.128", + int16_vec_type, lo, hi); +#endif + } + return; + } + /* * Clamp if necessary */ -- cgit v1.2.3