diff options
Diffstat (limited to 'src/gallium/auxiliary/gallivm/lp_bld_arit.c')
-rw-r--r-- | src/gallium/auxiliary/gallivm/lp_bld_arit.c | 511 |
1 files changed, 427 insertions, 84 deletions
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c index dce3c3745b..00f419a486 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c @@ -614,17 +614,15 @@ lp_build_div(struct lp_build_context *bld, /** - * Linear interpolation. - * - * This also works for integer values with a few caveats. + * Linear interpolation -- without any checks. * * @sa http://www.stereopsis.com/doubleblend.html */ -LLVMValueRef -lp_build_lerp(struct lp_build_context *bld, - LLVMValueRef x, - LLVMValueRef v0, - LLVMValueRef v1) +static INLINE LLVMValueRef +lp_build_lerp_simple(struct lp_build_context *bld, + LLVMValueRef x, + LLVMValueRef v0, + LLVMValueRef v1) { LLVMValueRef delta; LLVMValueRef res; @@ -639,12 +637,80 @@ lp_build_lerp(struct lp_build_context *bld, res = lp_build_add(bld, v0, res); - if(bld->type.fixed) + if (bld->type.fixed) { /* XXX: This step is necessary for lerping 8bit colors stored on 16bits, * but it will be wrong for other uses. Basically we need a more * powerful lp_type, capable of further distinguishing the values * interpretation from the value storage. */ res = LLVMBuildAnd(bld->builder, res, lp_build_const_int_vec(bld->type, (1 << bld->type.width/2) - 1), ""); + } + + return res; +} + + +/** + * Linear interpolation. + */ +LLVMValueRef +lp_build_lerp(struct lp_build_context *bld, + LLVMValueRef x, + LLVMValueRef v0, + LLVMValueRef v1) +{ + const struct lp_type type = bld->type; + LLVMValueRef res; + + assert(lp_check_value(type, x)); + assert(lp_check_value(type, v0)); + assert(lp_check_value(type, v1)); + + if (type.norm) { + struct lp_type wide_type; + struct lp_build_context wide_bld; + LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh; + LLVMValueRef shift; + + assert(type.length >= 2); + assert(!type.sign); + + /* + * Create a wider type, enough to hold the intermediate result of the + * multiplication. + */ + memset(&wide_type, 0, sizeof wide_type); + wide_type.fixed = TRUE; + wide_type.width = type.width*2; + wide_type.length = type.length/2; + + lp_build_context_init(&wide_bld, bld->builder, wide_type); + + lp_build_unpack2(bld->builder, type, wide_type, x, &xl, &xh); + lp_build_unpack2(bld->builder, type, wide_type, v0, &v0l, &v0h); + lp_build_unpack2(bld->builder, type, wide_type, v1, &v1l, &v1h); + + /* + * Scale x from [0, 255] to [0, 256] + */ + + shift = lp_build_const_int_vec(wide_type, type.width - 1); + + xl = lp_build_add(&wide_bld, xl, + LLVMBuildAShr(bld->builder, xl, shift, "")); + xh = lp_build_add(&wide_bld, xh, + LLVMBuildAShr(bld->builder, xh, shift, "")); + + /* + * Lerp both halves. + */ + + resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l); + resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h); + + res = lp_build_pack2(bld->builder, wide_type, type, resl, resh); + } else { + res = lp_build_lerp_simple(bld, x, v0, v1); + } return res; } @@ -923,35 +989,122 @@ lp_build_round_sse41(struct lp_build_context *bld, enum lp_build_round_sse41_mode mode) { const struct lp_type type = bld->type; - LLVMTypeRef vec_type = lp_build_vec_type(type); + LLVMTypeRef i32t = LLVMInt32Type(); const char *intrinsic; + LLVMValueRef res; assert(type.floating); - assert(type.width*type.length == 128); + assert(lp_check_value(type, a)); assert(util_cpu_caps.has_sse4_1); - switch(type.width) { - case 32: - intrinsic = "llvm.x86.sse41.round.ps"; - break; - case 64: - intrinsic = "llvm.x86.sse41.round.pd"; - break; - default: - assert(0); - return bld->undef; + if (type.length == 1) { + LLVMTypeRef vec_type; + LLVMValueRef undef; + LLVMValueRef args[3]; + LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0); + + switch(type.width) { + case 32: + intrinsic = "llvm.x86.sse41.round.ss"; + break; + case 64: + intrinsic = "llvm.x86.sse41.round.sd"; + break; + default: + assert(0); + return bld->undef; + } + + vec_type = LLVMVectorType(bld->elem_type, 4); + + undef = LLVMGetUndef(vec_type); + + args[0] = undef; + args[1] = LLVMBuildInsertElement(bld->builder, undef, a, index0, ""); + args[2] = LLVMConstInt(i32t, mode, 0); + + res = lp_build_intrinsic(bld->builder, intrinsic, + vec_type, args, Elements(args)); + + res = LLVMBuildExtractElement(bld->builder, res, index0, ""); + } + else { + assert(type.width*type.length == 128); + + switch(type.width) { + case 32: + intrinsic = "llvm.x86.sse41.round.ps"; + break; + case 64: + intrinsic = "llvm.x86.sse41.round.pd"; + break; + default: + assert(0); + return bld->undef; + } + + res = lp_build_intrinsic_binary(bld->builder, intrinsic, + bld->vec_type, a, + LLVMConstInt(i32t, mode, 0)); + } + + return res; +} + + +static INLINE LLVMValueRef +lp_build_iround_nearest_sse2(struct lp_build_context *bld, + LLVMValueRef a) +{ + const struct lp_type type = bld->type; + LLVMTypeRef i32t = LLVMInt32Type(); + LLVMTypeRef ret_type = lp_build_int_vec_type(type); + const char *intrinsic; + LLVMValueRef res; + + assert(type.floating); + /* using the double precision conversions is a bit more complicated */ + assert(type.width == 32); + + assert(lp_check_value(type, a)); + assert(util_cpu_caps.has_sse2); + + /* This is relying on MXCSR rounding mode, which should always be nearest. */ + if (type.length == 1) { + LLVMTypeRef vec_type; + LLVMValueRef undef; + LLVMValueRef arg; + LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0); + + vec_type = LLVMVectorType(bld->elem_type, 4); + + intrinsic = "llvm.x86.sse.cvtss2si"; + + undef = LLVMGetUndef(vec_type); + + arg = LLVMBuildInsertElement(bld->builder, undef, a, index0, ""); + + res = lp_build_intrinsic_unary(bld->builder, intrinsic, + ret_type, arg); + } + else { + assert(type.width*type.length == 128); + + intrinsic = "llvm.x86.sse2.cvtps2dq"; + + res = lp_build_intrinsic_unary(bld->builder, intrinsic, + ret_type, a); } - return lp_build_intrinsic_binary(bld->builder, intrinsic, vec_type, a, - LLVMConstInt(LLVMInt32Type(), mode, 0)); + return res; } /** - * Return the integer part of a float (vector) value. The returned value is - * a float (vector). - * Ex: trunc(-1.5) = 1.0 + * Return the integer part of a float (vector) value (== round toward zero). + * The returned value is a float (vector). + * Ex: trunc(-1.5) = -1.0 */ LLVMValueRef lp_build_trunc(struct lp_build_context *bld, @@ -962,8 +1115,10 @@ lp_build_trunc(struct lp_build_context *bld, assert(type.floating); assert(lp_check_value(type, a)); - if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) + if (util_cpu_caps.has_sse4_1 && + (type.length == 1 || type.width*type.length == 128)) { return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE); + } else { LLVMTypeRef vec_type = lp_build_vec_type(type); LLVMTypeRef int_vec_type = lp_build_int_vec_type(type); @@ -990,8 +1145,10 @@ lp_build_round(struct lp_build_context *bld, assert(type.floating); assert(lp_check_value(type, a)); - if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) + if (util_cpu_caps.has_sse4_1 && + (type.length == 1 || type.width*type.length == 128)) { return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST); + } else { LLVMTypeRef vec_type = lp_build_vec_type(type); LLVMValueRef res; @@ -1016,8 +1173,10 @@ lp_build_floor(struct lp_build_context *bld, assert(type.floating); assert(lp_check_value(type, a)); - if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) + if (util_cpu_caps.has_sse4_1 && + (type.length == 1 || type.width*type.length == 128)) { return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR); + } else { LLVMTypeRef vec_type = lp_build_vec_type(type); LLVMValueRef res; @@ -1042,8 +1201,10 @@ lp_build_ceil(struct lp_build_context *bld, assert(type.floating); assert(lp_check_value(type, a)); - if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) + if (util_cpu_caps.has_sse4_1 && + (type.length == 1 || type.width*type.length == 128)) { return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL); + } else { LLVMTypeRef vec_type = lp_build_vec_type(type); LLVMValueRef res; @@ -1068,9 +1229,9 @@ lp_build_fract(struct lp_build_context *bld, /** - * Return the integer part of a float (vector) value. The returned value is - * an integer (vector). - * Ex: itrunc(-1.5) = 1 + * Return the integer part of a float (vector) value (== round toward zero). + * The returned value is an integer (vector). + * Ex: itrunc(-1.5) = -1 */ LLVMValueRef lp_build_itrunc(struct lp_build_context *bld, @@ -1097,31 +1258,40 @@ lp_build_iround(struct lp_build_context *bld, LLVMValueRef a) { const struct lp_type type = bld->type; - LLVMTypeRef int_vec_type = lp_build_int_vec_type(type); + LLVMTypeRef int_vec_type = bld->int_vec_type; LLVMValueRef res; assert(type.floating); assert(lp_check_value(type, a)); - if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) { + if (util_cpu_caps.has_sse2 && + ((type.width == 32) && (type.length == 1 || type.length == 4))) { + return lp_build_iround_nearest_sse2(bld, a); + } + else if (util_cpu_caps.has_sse4_1 && + (type.length == 1 || type.width*type.length == 128)) { res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST); } else { - LLVMTypeRef vec_type = lp_build_vec_type(type); - LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1)); - LLVMValueRef sign; LLVMValueRef half; - /* get sign bit */ - sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, ""); - sign = LLVMBuildAnd(bld->builder, sign, mask, ""); - - /* sign * 0.5 */ half = lp_build_const_vec(type, 0.5); - half = LLVMBuildBitCast(bld->builder, half, int_vec_type, ""); - half = LLVMBuildOr(bld->builder, sign, half, ""); - half = LLVMBuildBitCast(bld->builder, half, vec_type, ""); + + if (type.sign) { + LLVMTypeRef vec_type = bld->vec_type; + LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1)); + LLVMValueRef sign; + + /* get sign bit */ + sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, ""); + sign = LLVMBuildAnd(bld->builder, sign, mask, ""); + + /* sign * 0.5 */ + half = LLVMBuildBitCast(bld->builder, half, int_vec_type, ""); + half = LLVMBuildOr(bld->builder, sign, half, ""); + half = LLVMBuildBitCast(bld->builder, half, vec_type, ""); + } res = LLVMBuildFAdd(bld->builder, a, half, ""); } @@ -1142,37 +1312,42 @@ lp_build_ifloor(struct lp_build_context *bld, LLVMValueRef a) { const struct lp_type type = bld->type; - LLVMTypeRef int_vec_type = lp_build_int_vec_type(type); + LLVMTypeRef int_vec_type = bld->int_vec_type; LLVMValueRef res; assert(type.floating); assert(lp_check_value(type, a)); - if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) { + if (util_cpu_caps.has_sse4_1 && + (type.length == 1 || type.width*type.length == 128)) { res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR); } else { - /* Take the sign bit and add it to 1 constant */ - LLVMTypeRef vec_type = lp_build_vec_type(type); - unsigned mantissa = lp_mantissa(type); - LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1)); - LLVMValueRef sign; - LLVMValueRef offset; - - /* sign = a < 0 ? ~0 : 0 */ - sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, ""); - sign = LLVMBuildAnd(bld->builder, sign, mask, ""); - sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "ifloor.sign"); - - /* offset = -0.99999(9)f */ - offset = lp_build_const_vec(type, -(double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa)); - offset = LLVMConstBitCast(offset, int_vec_type); - - /* offset = a < 0 ? offset : 0.0f */ - offset = LLVMBuildAnd(bld->builder, offset, sign, ""); - offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "ifloor.offset"); - - res = LLVMBuildFAdd(bld->builder, a, offset, "ifloor.res"); + res = a; + + if (type.sign) { + /* Take the sign bit and add it to 1 constant */ + LLVMTypeRef vec_type = bld->vec_type; + unsigned mantissa = lp_mantissa(type); + LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1)); + LLVMValueRef sign; + LLVMValueRef offset; + + /* sign = a < 0 ? ~0 : 0 */ + sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, ""); + sign = LLVMBuildAnd(bld->builder, sign, mask, ""); + sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "ifloor.sign"); + + /* offset = -0.99999(9)f */ + offset = lp_build_const_vec(type, -(double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa)); + offset = LLVMConstBitCast(offset, int_vec_type); + + /* offset = a < 0 ? offset : 0.0f */ + offset = LLVMBuildAnd(bld->builder, offset, sign, ""); + offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "ifloor.offset"); + + res = LLVMBuildFAdd(bld->builder, res, offset, "ifloor.res"); + } } /* round to nearest (toward zero) */ @@ -1192,35 +1367,39 @@ lp_build_iceil(struct lp_build_context *bld, LLVMValueRef a) { const struct lp_type type = bld->type; - LLVMTypeRef int_vec_type = lp_build_int_vec_type(type); + LLVMTypeRef int_vec_type = bld->int_vec_type; LLVMValueRef res; assert(type.floating); assert(lp_check_value(type, a)); - if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) { + if (util_cpu_caps.has_sse4_1 && + (type.length == 1 || type.width*type.length == 128)) { res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL); } else { - LLVMTypeRef vec_type = lp_build_vec_type(type); + LLVMTypeRef vec_type = bld->vec_type; unsigned mantissa = lp_mantissa(type); - LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1)); - LLVMValueRef sign; LLVMValueRef offset; - /* sign = a < 0 ? 0 : ~0 */ - sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, ""); - sign = LLVMBuildAnd(bld->builder, sign, mask, ""); - sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "iceil.sign"); - sign = LLVMBuildNot(bld->builder, sign, "iceil.not"); - /* offset = 0.99999(9)f */ offset = lp_build_const_vec(type, (double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa)); - offset = LLVMConstBitCast(offset, int_vec_type); - /* offset = a < 0 ? 0.0 : offset */ - offset = LLVMBuildAnd(bld->builder, offset, sign, ""); - offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "iceil.offset"); + if (type.sign) { + LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1)); + LLVMValueRef sign; + + /* sign = a < 0 ? 0 : ~0 */ + sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, ""); + sign = LLVMBuildAnd(bld->builder, sign, mask, ""); + sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "iceil.sign"); + sign = LLVMBuildNot(bld->builder, sign, "iceil.not"); + + /* offset = a < 0 ? 0.0 : offset */ + offset = LLVMConstBitCast(offset, int_vec_type); + offset = LLVMBuildAnd(bld->builder, offset, sign, ""); + offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "iceil.offset"); + } res = LLVMBuildFAdd(bld->builder, a, offset, "iceil.res"); } @@ -1232,6 +1411,46 @@ lp_build_iceil(struct lp_build_context *bld, } +/** + * Combined ifloor() & fract(). + * + * Preferred to calling the functions separately, as it will ensure that the + * stratergy (floor() vs ifloor()) that results in less redundant work is used. + */ +void +lp_build_ifloor_fract(struct lp_build_context *bld, + LLVMValueRef a, + LLVMValueRef *out_ipart, + LLVMValueRef *out_fpart) +{ + const struct lp_type type = bld->type; + LLVMValueRef ipart; + + assert(type.floating); + assert(lp_check_value(type, a)); + + if (util_cpu_caps.has_sse4_1 && + (type.length == 1 || type.width*type.length == 128)) { + /* + * floor() is easier. + */ + + ipart = lp_build_floor(bld, a); + *out_fpart = LLVMBuildFSub(bld->builder, a, ipart, "fpart"); + *out_ipart = LLVMBuildFPToSI(bld->builder, ipart, bld->int_vec_type, "ipart"); + } + else { + /* + * ifloor() is easier. + */ + + *out_ipart = lp_build_ifloor(bld, a); + ipart = LLVMBuildSIToFP(bld->builder, *out_ipart, bld->vec_type, "ipart"); + *out_fpart = LLVMBuildFSub(bld->builder, a, ipart, "fpart"); + } +} + + LLVMValueRef lp_build_sqrt(struct lp_build_context *bld, LLVMValueRef a) @@ -2041,6 +2260,71 @@ lp_build_exp2(struct lp_build_context *bld, /** + * Extract the exponent of a IEEE-754 floating point value. + * + * Optionally apply an integer bias. + * + * Result is an integer value with + * + * ifloor(log2(x)) + bias + */ +LLVMValueRef +lp_build_extract_exponent(struct lp_build_context *bld, + LLVMValueRef x, + int bias) +{ + const struct lp_type type = bld->type; + unsigned mantissa = lp_mantissa(type); + LLVMValueRef res; + + assert(type.floating); + + assert(lp_check_value(bld->type, x)); + + x = LLVMBuildBitCast(bld->builder, x, bld->int_vec_type, ""); + + res = LLVMBuildLShr(bld->builder, x, lp_build_const_int_vec(type, mantissa), ""); + res = LLVMBuildAnd(bld->builder, res, lp_build_const_int_vec(type, 255), ""); + res = LLVMBuildSub(bld->builder, res, lp_build_const_int_vec(type, 127 - bias), ""); + + return res; +} + + +/** + * Extract the mantissa of the a floating. + * + * Result is a floating point value with + * + * x / floor(log2(x)) + */ +LLVMValueRef +lp_build_extract_mantissa(struct lp_build_context *bld, + LLVMValueRef x) +{ + const struct lp_type type = bld->type; + unsigned mantissa = lp_mantissa(type); + LLVMValueRef mantmask = lp_build_const_int_vec(type, (1ULL << mantissa) - 1); + LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type); + LLVMValueRef res; + + assert(lp_check_value(bld->type, x)); + + assert(type.floating); + + x = LLVMBuildBitCast(bld->builder, x, bld->int_vec_type, ""); + + /* res = x / 2**ipart */ + res = LLVMBuildAnd(bld->builder, x, mantmask, ""); + res = LLVMBuildOr(bld->builder, res, one, ""); + res = LLVMBuildBitCast(bld->builder, res, bld->vec_type, ""); + + return res; +} + + + +/** * Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[ * These coefficients can be generate with * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html @@ -2159,3 +2443,62 @@ lp_build_log2(struct lp_build_context *bld, lp_build_log2_approx(bld, x, NULL, NULL, &res); return res; } + + +/** + * Faster (and less accurate) log2. + * + * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x)) + * + * Piece-wise linear approximation, with exact results when x is a + * power of two. + * + * See http://www.flipcode.com/archives/Fast_log_Function.shtml + */ +LLVMValueRef +lp_build_fast_log2(struct lp_build_context *bld, + LLVMValueRef x) +{ + LLVMValueRef ipart; + LLVMValueRef fpart; + + assert(lp_check_value(bld->type, x)); + + assert(bld->type.floating); + + /* ipart = floor(log2(x)) - 1 */ + ipart = lp_build_extract_exponent(bld, x, -1); + ipart = LLVMBuildSIToFP(bld->builder, ipart, bld->vec_type, ""); + + /* fpart = x / 2**ipart */ + fpart = lp_build_extract_mantissa(bld, x); + + /* ipart + fpart */ + return LLVMBuildFAdd(bld->builder, ipart, fpart, ""); +} + + +/** + * Fast implementation of iround(log2(x)). + * + * Not an approximation -- it should give accurate results all the time. + */ +LLVMValueRef +lp_build_ilog2(struct lp_build_context *bld, + LLVMValueRef x) +{ + LLVMValueRef sqrt2 = lp_build_const_vec(bld->type, M_SQRT2); + LLVMValueRef ipart; + + assert(bld->type.floating); + + assert(lp_check_value(bld->type, x)); + + /* x * 2^(0.5) i.e., add 0.5 to the log2(x) */ + x = LLVMBuildFMul(bld->builder, x, sqrt2, ""); + + /* ipart = floor(log2(x) + 0.5) */ + ipart = lp_build_extract_exponent(bld, x, 0); + + return ipart; +} |