diff options
Diffstat (limited to 'src/gallium/drivers/llvmpipe/lp_bld_conv.c')
-rw-r--r-- | src/gallium/drivers/llvmpipe/lp_bld_conv.c | 93 |
1 files changed, 71 insertions, 22 deletions
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_conv.c b/src/gallium/drivers/llvmpipe/lp_bld_conv.c index 54d2e13d34..3a54272cbd 100644 --- a/src/gallium/drivers/llvmpipe/lp_bld_conv.c +++ b/src/gallium/drivers/llvmpipe/lp_bld_conv.c @@ -28,18 +28,34 @@ /** * @file - * Helper + * Helper functions for type conversions. * - * LLVM IR doesn't support all basic arithmetic operations we care about (most - * notably min/max and saturated operations), and it is often necessary to - * resort machine-specific intrinsics directly. The functions here hide all - * these implementation details from the other modules. + * We want to use the fastest type for a given computation whenever feasible. + * The other side of this is that we need to be able to convert between several + * types accurately and efficiently. * - * We also do simple expressions simplification here. Reasons are: - * - it is very easy given we have all necessary information readily available - * - LLVM optimization passes fail to simplify several vector expressions - * - We often know value constraints which the optimization passes have no way - * of knowing, such as when source arguments are known to be in [0, 1] range. + * Conversion between types of different bit width is quite complex since a + * register of fixed size holds a different number of elements for each width. + * + * There are a few invariants to remember in type conversions: + * + * - register width must remain constant: + * + * src_type.width * src_type.length == dst_type.width * dst_type.length + * + * - total number of elements must remain constant: + * + * src_type.length * num_srcs == dst_type.length * num_dsts + * + * It is not always possible to do the conversion both accurately and + * efficiently, usually due to lack of adequate machine instructions. In these + * cases it is important not to take shortcuts here and sacrifice accuracy, as + * these functions can be used anywhere. 
In the future we might have a + * precision parameter which can gauge the accuracy vs efficiency compromise, + * but for now if the data conversion between two stages happens to be the + * bottleneck, then one should most likely just avoid converting at all and run + * both stages with the same type. + * + * Make sure to run lp_test_conv unit test after any change to this file. * * @author Jose Fonseca <jfonseca@vmware.com> */ @@ -55,6 +71,19 @@ #include "lp_bld_conv.h" +/** + * Special case for converting clamped IEEE-754 floats to unsigned norms. + * + * The mathematical voodoo below may seem excessive but it is actually + * paramount we do it this way for several reasons. First, there is no single + * precision FP to unsigned integer conversion Intel SSE instruction. Second, + * even if there were, since the FP's mantissa takes only a fraction + * of register bits the typical scale-and-cast approach would require double + * precision for accurate results, and therefore half the throughput. + * + * Although the result values can be scaled to an arbitrary bit width specified + * by dst_width, the actual result type will have the same width. + */ LLVMValueRef lp_build_clamped_float_to_unsigned_norm(LLVMBuilderRef builder, union lp_type src_type, @@ -118,7 +147,7 @@ lp_build_clamped_float_to_unsigned_norm(LLVMBuilderRef builder, /** - * Inverse of lp_build_clamped_float_to_unsigned_norm. + * Inverse of lp_build_clamped_float_to_unsigned_norm above. */ LLVMValueRef lp_build_unsigned_norm_to_float(LLVMBuilderRef builder, @@ -139,7 +168,6 @@ lp_build_unsigned_norm_to_float(LLVMBuilderRef builder, mantissa = lp_mantissa(dst_type); - /* We cannot carry more bits than the mantissa */ n = MIN2(mantissa, src_width); ubound = ((unsigned long long)1 << n); @@ -212,6 +240,12 @@ lp_build_const_pack_shuffle(unsigned n) } +/** + * Expand the bit width. + * + * This will only change the number of bits with which the values are + * represented, not the values themselves. 
+ */ static void lp_build_expand(LLVMBuilderRef builder, union lp_type src_type, @@ -270,9 +304,13 @@ lp_build_expand(LLVMBuilderRef builder, /** * Non-interleaved pack. * - * lo = __ l0 __ l1 __ l2 __.. __ ln - * hi = __ h0 __ h1 __ h2 __.. __ hn - * res = l0 l1 l2 .. ln h0 h1 h2 .. hn + * This will move values as + * + * lo = __ l0 __ l1 __ l2 __.. __ ln + * hi = __ h0 __ h1 __ h2 __.. __ hn + * res = l0 l1 l2 .. ln h0 h1 h2 .. hn + * + * TODO: handle saturation consistently. */ static LLVMValueRef lp_build_pack2(LLVMBuilderRef builder, @@ -347,6 +385,11 @@ lp_build_pack2(LLVMBuilderRef builder, } +/** + * Truncate the bit width. + * + * TODO: Handle saturation consistently. + */ static LLVMValueRef lp_build_trunc(LLVMBuilderRef builder, union lp_type src_type, @@ -392,13 +435,10 @@ lp_build_trunc(LLVMBuilderRef builder, /** - * Convert between two SIMD types. + * Generic type conversion. * - * Converting between SIMD types of different element width poses a problem: - * SIMD registers have a fixed number of bits, so different element widths - * imply different vector lengths. Therefore we must multiplex the multiple - * incoming sources into a single destination vector, or demux a single incoming - * vector into multiple vectors. + * TODO: Take a precision argument, or even better, add a new precision member + * to the lp_type union. */ void lp_build_conv(LLVMBuilderRef builder, @@ -605,7 +645,14 @@ lp_build_conv(LLVMBuilderRef builder, /** - * Convenience wrapper around lp_build_conv for bit masks. + * Bit mask conversion. + * + * This will convert the integer masks that match the given types. + * + * The mask values should be 0 or -1, i.e., all bits either set to zero or one. + * Any other value will likely cause unpredictable results. + * + * This is basically a very trimmed down version of lp_build_conv. 
*/ void lp_build_conv_mask(LLVMBuilderRef builder, @@ -621,6 +668,8 @@ lp_build_conv_mask(LLVMBuilderRef builder, assert(src_type.length * num_srcs == dst_type.length * num_dsts); /* + * Drop the accuracy concerns here, since + * + * We assume all values are 0 or -1 */ |