From e3ccfd4e03e6e2bf3f5a18be80f61819220b4c16 Mon Sep 17 00:00:00 2001 From: José Fonseca Date: Wed, 29 Sep 2010 22:27:38 +0100 Subject: gallivm: Use SSE4.1's ROUNDSS/ROUNDSD for scalar rounding. --- src/gallium/auxiliary/gallivm/lp_bld_arit.c | 92 ++++++++++++++++++++++------- 1 file changed, 71 insertions(+), 21 deletions(-) (limited to 'src/gallium/auxiliary') diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c index ff0c7f7ca8..e65c13e64b 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c @@ -989,28 +989,67 @@ lp_build_round_sse41(struct lp_build_context *bld, enum lp_build_round_sse41_mode mode) { const struct lp_type type = bld->type; - LLVMTypeRef vec_type = lp_build_vec_type(type); + LLVMTypeRef i32t = LLVMInt32Type(); const char *intrinsic; + LLVMValueRef res; assert(type.floating); - assert(type.width*type.length == 128); + assert(lp_check_value(type, a)); assert(util_cpu_caps.has_sse4_1); - switch(type.width) { - case 32: - intrinsic = "llvm.x86.sse41.round.ps"; - break; - case 64: - intrinsic = "llvm.x86.sse41.round.pd"; - break; - default: - assert(0); - return bld->undef; + if (type.length == 1) { + LLVMTypeRef vec_type; + LLVMValueRef undef; + LLVMValueRef args[3]; + LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0); + + switch(type.width) { + case 32: + intrinsic = "llvm.x86.sse41.round.ss"; + break; + case 64: + intrinsic = "llvm.x86.sse41.round.sd"; + break; + default: + assert(0); + return bld->undef; + } + + vec_type = LLVMVectorType(bld->elem_type, 4); + + undef = LLVMGetUndef(vec_type); + + args[0] = undef; + args[1] = LLVMBuildInsertElement(bld->builder, undef, a, index0, ""); + args[2] = LLVMConstInt(i32t, mode, 0); + + res = lp_build_intrinsic(bld->builder, intrinsic, + vec_type, args, Elements(args)); + + res = LLVMBuildExtractElement(bld->builder, res, index0, ""); + } + else { + assert(type.width*type.length == 128); + + switch(type.width) { + case 32: + intrinsic = "llvm.x86.sse41.round.ps"; + break; + case 64: + intrinsic = "llvm.x86.sse41.round.pd"; + break; + default: + assert(0); + return bld->undef; + } + + res = lp_build_intrinsic_binary(bld->builder, intrinsic, + bld->vec_type, a, + LLVMConstInt(i32t, mode, 0)); } - return lp_build_intrinsic_binary(bld->builder, intrinsic, vec_type, a, - LLVMConstInt(LLVMInt32Type(), mode, 0)); + return res; } @@ -1028,8 +1067,10 @@ lp_build_trunc(struct lp_build_context *bld, assert(type.floating); assert(lp_check_value(type, a)); - if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) + if (util_cpu_caps.has_sse4_1 && + (type.length == 1 || type.width*type.length == 128)) { return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE); + } else { LLVMTypeRef vec_type = lp_build_vec_type(type); LLVMTypeRef int_vec_type = lp_build_int_vec_type(type); @@ -1056,8 +1097,10 @@ lp_build_round(struct lp_build_context *bld, assert(type.floating); assert(lp_check_value(type, a)); - if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) + if (util_cpu_caps.has_sse4_1 && + (type.length == 1 || type.width*type.length == 128)) { return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST); + } else { LLVMTypeRef vec_type = lp_build_vec_type(type); LLVMValueRef res; @@ -1082,8 +1125,10 @@ lp_build_floor(struct lp_build_context *bld, assert(type.floating); assert(lp_check_value(type, a)); - if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) + if (util_cpu_caps.has_sse4_1 && + (type.length == 1 || type.width*type.length == 128)) { return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR); + } else { LLVMTypeRef vec_type = lp_build_vec_type(type); LLVMValueRef res; @@ -1108,8 +1153,10 @@ lp_build_ceil(struct lp_build_context *bld, assert(type.floating); assert(lp_check_value(type, a)); - if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) + if (util_cpu_caps.has_sse4_1 && + (type.length == 1 || type.width*type.length == 128)) { return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL); + } else { LLVMTypeRef vec_type = lp_build_vec_type(type); LLVMValueRef res; @@ -1170,7 +1217,8 @@ lp_build_iround(struct lp_build_context *bld, assert(lp_check_value(type, a)); - if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) { + if (util_cpu_caps.has_sse4_1 && + (type.length == 1 || type.width*type.length == 128)) { res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST); } else { @@ -1214,7 +1262,8 @@ lp_build_ifloor(struct lp_build_context *bld, assert(type.floating); assert(lp_check_value(type, a)); - if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) { + if (util_cpu_caps.has_sse4_1 && + (type.length == 1 || type.width*type.length == 128)) { res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR); } else { @@ -1264,7 +1313,8 @@ lp_build_iceil(struct lp_build_context *bld, assert(type.floating); assert(lp_check_value(type, a)); - if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) { + if (util_cpu_caps.has_sse4_1 && + (type.length == 1 || type.width*type.length == 128)) { res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL); } else { -- cgit v1.2.3