diff options
Diffstat (limited to 'src/gallium/auxiliary/gallivm')
22 files changed, 1876 insertions, 461 deletions
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c index d926b2de18..f5f2623e46 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c @@ -56,7 +56,6 @@ #include "lp_bld_intr.h" #include "lp_bld_logic.h" #include "lp_bld_pack.h" -#include "lp_bld_debug.h" #include "lp_bld_arit.h" @@ -847,6 +846,11 @@ lp_build_round_sse41(struct lp_build_context *bld, } +/** + * Return the integer part of a float (vector) value. The returned value is + * a float (vector). + * Ex: trunc(-1.5) = -1.0 + */ LLVMValueRef lp_build_trunc(struct lp_build_context *bld, LLVMValueRef a) @@ -869,6 +873,12 @@ lp_build_trunc(struct lp_build_context *bld, } +/** + * Return float (vector) rounded to nearest integer (vector). The returned + * value is a float (vector). + * Ex: round(0.9) = 1.0 + * Ex: round(-1.5) = -2.0 + */ LLVMValueRef lp_build_round(struct lp_build_context *bld, LLVMValueRef a) @@ -890,6 +900,11 @@ lp_build_round(struct lp_build_context *bld, } +/** + * Return floor of float (vector), result is a float (vector) + * Ex: floor(1.1) = 1.0 + * Ex: floor(-1.1) = -2.0 + */ LLVMValueRef lp_build_floor(struct lp_build_context *bld, LLVMValueRef a) @@ -911,6 +926,11 @@ lp_build_floor(struct lp_build_context *bld, } +/** + * Return ceiling of float (vector), returning float (vector). + * Ex: ceil( 1.1) = 2.0 + * Ex: ceil(-1.1) = -1.0 + */ LLVMValueRef lp_build_ceil(struct lp_build_context *bld, LLVMValueRef a) @@ -933,7 +953,7 @@ lp_build_ceil(struct lp_build_context *bld, /** - * Return fractional part of 'a' computed as a - floor(f) + * Return fractional part of 'a' computed as a - floor(a) * Typically used in texture coord arithmetic. */ LLVMValueRef @@ -946,8 +966,9 @@ lp_build_fract(struct lp_build_context *bld, /** - * Convert to integer, through whichever rounding method that's fastest, - * typically truncating toward zero. + * Return the integer part of a float (vector) value. 
The returned value is + * an integer (vector). + * Ex: itrunc(-1.5) = -1 */ LLVMValueRef lp_build_itrunc(struct lp_build_context *bld, @@ -964,7 +985,10 @@ lp_build_itrunc(struct lp_build_context *bld, /** - * Convert float[] to int[] with round(). + * Return float (vector) rounded to nearest integer (vector). The returned + * value is an integer (vector). + * Ex: iround(0.9) = 1 + * Ex: iround(-1.5) = -2 */ LLVMValueRef lp_build_iround(struct lp_build_context *bld, @@ -1007,7 +1031,9 @@ lp_build_iround(struct lp_build_context *bld, /** - * Convert float[] to int[] with floor(). + * Return floor of float (vector), result is an int (vector) + * Ex: ifloor(1.1) = 1 + * Ex: ifloor(-1.1) = -2 */ LLVMValueRef lp_build_ifloor(struct lp_build_context *bld, @@ -1034,29 +1060,31 @@ lp_build_ifloor(struct lp_build_context *bld, /* sign = a < 0 ? ~0 : 0 */ sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, ""); sign = LLVMBuildAnd(bld->builder, sign, mask, ""); - sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), ""); - lp_build_name(sign, "floor.sign"); + sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "ifloor.sign"); /* offset = -0.99999(9)f */ - offset = lp_build_const_vec(type, -(double)(((unsigned long long)1 << mantissa) - 1)/((unsigned long long)1 << mantissa)); + offset = lp_build_const_vec(type, -(double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa)); offset = LLVMConstBitCast(offset, int_vec_type); - /* offset = a < 0 ? -0.99999(9)f : 0.0f */ + /* offset = a < 0 ? 
offset : 0.0f */ offset = LLVMBuildAnd(bld->builder, offset, sign, ""); - offset = LLVMBuildBitCast(bld->builder, offset, vec_type, ""); - lp_build_name(offset, "floor.offset"); + offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "ifloor.offset"); - res = LLVMBuildAdd(bld->builder, a, offset, ""); - lp_build_name(res, "floor.res"); + res = LLVMBuildAdd(bld->builder, a, offset, "ifloor.res"); } - res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, ""); - lp_build_name(res, "floor"); + /* convert to integer, truncating toward zero */ + res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "ifloor.res"); return res; } +/** + * Return ceiling of float (vector), returning int (vector). + * Ex: iceil( 1.1) = 2 + * Ex: iceil(-1.1) = -1 + */ LLVMValueRef lp_build_iceil(struct lp_build_context *bld, LLVMValueRef a) @@ -1072,12 +1100,31 @@ lp_build_iceil(struct lp_build_context *bld, res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL); } else { - /* TODO: mimic lp_build_ifloor() here */ - assert(0); - res = bld->undef; + LLVMTypeRef vec_type = lp_build_vec_type(type); + unsigned mantissa = lp_mantissa(type); + LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1)); + LLVMValueRef sign; + LLVMValueRef offset; + + /* sign = a < 0 ? 0 : ~0 */ + sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, ""); + sign = LLVMBuildAnd(bld->builder, sign, mask, ""); + sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "iceil.sign"); + sign = LLVMBuildNot(bld->builder, sign, "iceil.not"); + + /* offset = 0.99999(9)f */ + offset = lp_build_const_vec(type, (double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa)); + offset = LLVMConstBitCast(offset, int_vec_type); + + /* offset = a < 0 ? 
0.0 : offset */ + offset = LLVMBuildAnd(bld->builder, offset, sign, ""); + offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "iceil.offset"); + + res = LLVMBuildAdd(bld->builder, a, offset, "iceil.res"); } - res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, ""); + /* convert to integer, truncating toward zero */ + res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "iceil.res"); return res; } diff --git a/src/gallium/auxiliary/gallivm/lp_bld_const.h b/src/gallium/auxiliary/gallivm/lp_bld_const.h index d46b9f882b..7ee8fff140 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_const.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_const.h @@ -107,4 +107,12 @@ lp_build_const_mask_aos(struct lp_type type, const boolean cond[4]); +static INLINE LLVMValueRef +lp_build_const_int32(int i) +{ + return LLVMConstInt(LLVMInt32Type(), i, 0); +} + + + #endif /* !LP_BLD_CONST_H */ diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c b/src/gallium/auxiliary/gallivm/lp_bld_conv.c index 3f7f2ebde9..77012f1fac 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c @@ -83,6 +83,9 @@ * * Although the result values can be scaled to an arbitrary bit width specified * by dst_width, the actual result type will have the same width. + * + * Ex: src = { float, float, float, float } + * return { i32, i32, i32, i32 } where each value is in [0, 2^dst_width-1]. */ LLVMValueRef lp_build_clamped_float_to_unsigned_norm(LLVMBuilderRef builder, @@ -152,6 +155,8 @@ lp_build_clamped_float_to_unsigned_norm(LLVMBuilderRef builder, /** * Inverse of lp_build_clamped_float_to_unsigned_norm above. + * Ex: src = { i32, i32, i32, i32 } with values in range [0, 2^src_width-1] + * return {float, float, float, float} with values in range [0, 1]. 
*/ LLVMValueRef lp_build_unsigned_norm_to_float(LLVMBuilderRef builder, @@ -219,18 +224,19 @@ lp_build_conv(LLVMBuilderRef builder, unsigned num_tmps; unsigned i; - /* Register width must remain constant */ - assert(src_type.width * src_type.length == dst_type.width * dst_type.length); - /* We must not loose or gain channels. Only precision */ assert(src_type.length * num_srcs == dst_type.length * num_dsts); assert(src_type.length <= LP_MAX_VECTOR_LENGTH); assert(dst_type.length <= LP_MAX_VECTOR_LENGTH); + assert(num_srcs <= LP_MAX_VECTOR_LENGTH); + assert(num_dsts <= LP_MAX_VECTOR_LENGTH); tmp_type = src_type; - for(i = 0; i < num_srcs; ++i) + for(i = 0; i < num_srcs; ++i) { + assert(lp_check_value(src_type, src[i])); tmp[i] = src[i]; + } num_tmps = num_srcs; /* @@ -326,30 +332,25 @@ lp_build_conv(LLVMBuilderRef builder, /* * Truncate or expand bit width + * + * No data conversion should happen here, although the sign bits are + * crucial to avoid bad clamping. */ - assert(!tmp_type.floating || tmp_type.width == dst_type.width); + { + struct lp_type new_type; - if(tmp_type.width > dst_type.width) { - assert(num_dsts == 1); - tmp[0] = lp_build_pack(builder, tmp_type, dst_type, TRUE, tmp, num_tmps); - tmp_type.width = dst_type.width; - tmp_type.length = dst_type.length; - num_tmps = 1; - } + new_type = tmp_type; + new_type.sign = dst_type.sign; + new_type.width = dst_type.width; + new_type.length = dst_type.length; + + lp_build_resize(builder, tmp_type, new_type, tmp, num_srcs, tmp, num_dsts); - if(tmp_type.width < dst_type.width) { - assert(num_tmps == 1); - lp_build_unpack(builder, tmp_type, dst_type, tmp[0], tmp, num_dsts); - tmp_type.width = dst_type.width; - tmp_type.length = dst_type.length; + tmp_type = new_type; num_tmps = num_dsts; } - assert(tmp_type.width == dst_type.width); - assert(tmp_type.length == dst_type.length); - assert(num_tmps == num_dsts); - /* * Scale to the widest range */ @@ -406,8 +407,10 @@ lp_build_conv(LLVMBuilderRef builder, } } - 
for(i = 0; i < num_dsts; ++i) + for(i = 0; i < num_dsts; ++i) { dst[i] = tmp[i]; + assert(lp_check_value(dst_type, dst[i])); + } } diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format.h b/src/gallium/auxiliary/gallivm/lp_bld_format.h index 5f5036e7bd..60e22d727a 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_format.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_format.h @@ -48,9 +48,9 @@ struct lp_build_context; */ LLVMValueRef -lp_build_unpack_rgba_aos(LLVMBuilderRef builder, - const struct util_format_description *desc, - LLVMValueRef packed); +lp_build_format_swizzle_aos(const struct util_format_description *desc, + struct lp_build_context *bld, + LLVMValueRef unswizzled); LLVMValueRef lp_build_pack_rgba_aos(LLVMBuilderRef builder, @@ -60,7 +60,9 @@ lp_build_pack_rgba_aos(LLVMBuilderRef builder, LLVMValueRef lp_build_fetch_rgba_aos(LLVMBuilderRef builder, const struct util_format_description *format_desc, - LLVMValueRef ptr, + struct lp_type type, + LLVMValueRef base_ptr, + LLVMValueRef offset, LLVMValueRef i, LLVMValueRef j); @@ -72,7 +74,7 @@ lp_build_fetch_rgba_aos(LLVMBuilderRef builder, void lp_build_format_swizzle_soa(const struct util_format_description *format_desc, struct lp_build_context *bld, - const LLVMValueRef *unswizzled, + const LLVMValueRef unswizzled[4], LLVMValueRef swizzled_out[4]); void @@ -82,6 +84,11 @@ lp_build_unpack_rgba_soa(LLVMBuilderRef builder, LLVMValueRef packed, LLVMValueRef rgba_out[4]); +void +lp_build_rgba8_to_f32_soa(LLVMBuilderRef builder, + struct lp_type dst_type, + LLVMValueRef packed, + LLVMValueRef *rgba); void lp_build_fetch_rgba_soa(LLVMBuilderRef builder, @@ -93,5 +100,18 @@ lp_build_fetch_rgba_soa(LLVMBuilderRef builder, LLVMValueRef j, LLVMValueRef rgba_out[4]); +/* + * YUV + */ + + +LLVMValueRef +lp_build_fetch_subsampled_rgba_aos(LLVMBuilderRef builder, + const struct util_format_description *format_desc, + unsigned n, + LLVMValueRef base_ptr, + LLVMValueRef offset, + LLVMValueRef i, + LLVMValueRef j); 
#endif /* !LP_BLD_FORMAT_H */ diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c index 87e3e72a6e..0f01fc1d75 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c @@ -38,33 +38,122 @@ #include "util/u_math.h" #include "util/u_string.h" +#include "lp_bld_arit.h" #include "lp_bld_init.h" #include "lp_bld_type.h" #include "lp_bld_flow.h" +#include "lp_bld_const.h" +#include "lp_bld_conv.h" +#include "lp_bld_swizzle.h" +#include "lp_bld_gather.h" #include "lp_bld_format.h" /** + * Basic swizzling. Rearrange the order of the unswizzled array elements + * according to the format description. PIPE_SWIZZLE_ZERO/ONE are supported + * too. + * Ex: if unswizzled[4] = {B, G, R, x}, then swizzled_out[4] = {R, G, B, 1}. + */ +LLVMValueRef +lp_build_format_swizzle_aos(const struct util_format_description *desc, + struct lp_build_context *bld, + LLVMValueRef unswizzled) +{ + unsigned char swizzles[4]; + unsigned chan; + + assert(bld->type.length % 4 == 0); + + for (chan = 0; chan < 4; ++chan) { + enum util_format_swizzle swizzle; + + if (desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) { + /* + * For ZS formats do RGBA = ZZZ1 + */ + if (chan == 3) { + swizzle = UTIL_FORMAT_SWIZZLE_1; + } else if (desc->swizzle[0] == UTIL_FORMAT_SWIZZLE_NONE) { + swizzle = UTIL_FORMAT_SWIZZLE_0; + } else { + swizzle = desc->swizzle[0]; + } + } else { + swizzle = desc->swizzle[chan]; + } + swizzles[chan] = swizzle; + } + + return lp_build_swizzle_aos(bld, unswizzled, swizzles); +} + + +/** + * Whether the format matches the vector type, apart of swizzles. 
+ */ +static INLINE boolean +format_matches_type(const struct util_format_description *desc, + struct lp_type type) +{ + enum util_format_type chan_type; + unsigned chan; + + assert(type.length % 4 == 0); + + if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN || + desc->colorspace != UTIL_FORMAT_COLORSPACE_RGB || + desc->block.width != 1 || + desc->block.height != 1) { + return FALSE; + } + + if (type.floating) { + chan_type = UTIL_FORMAT_TYPE_FLOAT; + } else if (type.fixed) { + chan_type = UTIL_FORMAT_TYPE_FIXED; + } else if (type.sign) { + chan_type = UTIL_FORMAT_TYPE_SIGNED; + } else { + chan_type = UTIL_FORMAT_TYPE_UNSIGNED; + } + + for (chan = 0; chan < desc->nr_channels; ++chan) { + if (desc->channel[chan].size != type.width) { + return FALSE; + } + + if (desc->channel[chan].type != UTIL_FORMAT_TYPE_VOID) { + if (desc->channel[chan].type != chan_type || + desc->channel[chan].normalized != type.norm) { + return FALSE; + } + } + } + + return TRUE; +} + + +/** * Unpack a single pixel into its RGBA components. * * @param desc the pixel format for the packed pixel value * @param packed integer pixel in a format such as PIPE_FORMAT_B8G8R8A8_UNORM * - * @return RGBA in a 4 floats vector. + * @return RGBA in a float[4] or ubyte[4] or ushort[4] vector. 
*/ -LLVMValueRef -lp_build_unpack_rgba_aos(LLVMBuilderRef builder, - const struct util_format_description *desc, - LLVMValueRef packed) +static INLINE LLVMValueRef +lp_build_unpack_arith_rgba_aos(LLVMBuilderRef builder, + const struct util_format_description *desc, + LLVMValueRef packed) { LLVMValueRef shifted, casted, scaled, masked; LLVMValueRef shifts[4]; LLVMValueRef masks[4]; LLVMValueRef scales[4]; - LLVMValueRef swizzles[4]; - LLVMValueRef aux[4]; + boolean normalized; - int empty_channel; boolean needs_uitofp; unsigned shift; unsigned i; @@ -77,8 +166,7 @@ lp_build_unpack_rgba_aos(LLVMBuilderRef builder, /* Do the intermediate integer computations with 32bit integers since it * matches floating point size */ - if (desc->block.bits < 32) - packed = LLVMBuildZExt(builder, packed, LLVMInt32Type(), ""); + assert (LLVMTypeOf(packed) == LLVMInt32Type()); /* Broadcast the packed value to all four channels * before: packed = BGRA @@ -98,7 +186,6 @@ lp_build_unpack_rgba_aos(LLVMBuilderRef builder, /* Initialize vector constants */ normalized = FALSE; needs_uitofp = FALSE; - empty_channel = -1; shift = 0; /* Loop over 4 color components */ @@ -109,7 +196,6 @@ lp_build_unpack_rgba_aos(LLVMBuilderRef builder, shifts[i] = LLVMGetUndef(LLVMInt32Type()); masks[i] = LLVMConstNull(LLVMInt32Type()); scales[i] = LLVMConstNull(LLVMFloatType()); - empty_channel = i; } else { unsigned long long mask = (1ULL << bits) - 1; @@ -158,52 +244,7 @@ lp_build_unpack_rgba_aos(LLVMBuilderRef builder, else scaled = casted; - for (i = 0; i < 4; ++i) - aux[i] = LLVMGetUndef(LLVMFloatType()); - - /* Build swizzles vector to put components into R,G,B,A order */ - for (i = 0; i < 4; ++i) { - enum util_format_swizzle swizzle; - - if (desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) { - /* - * For ZS formats do RGBA = ZZZ1 - */ - if (i == 3) { - swizzle = UTIL_FORMAT_SWIZZLE_1; - } else if (desc->swizzle[0] == UTIL_FORMAT_SWIZZLE_NONE) { - swizzle = UTIL_FORMAT_SWIZZLE_0; - } else { - swizzle = 
desc->swizzle[0]; - } - } else { - swizzle = desc->swizzle[i]; - } - - switch (swizzle) { - case UTIL_FORMAT_SWIZZLE_X: - case UTIL_FORMAT_SWIZZLE_Y: - case UTIL_FORMAT_SWIZZLE_Z: - case UTIL_FORMAT_SWIZZLE_W: - swizzles[i] = LLVMConstInt(LLVMInt32Type(), swizzle, 0); - break; - case UTIL_FORMAT_SWIZZLE_0: - assert(empty_channel >= 0); - swizzles[i] = LLVMConstInt(LLVMInt32Type(), empty_channel, 0); - break; - case UTIL_FORMAT_SWIZZLE_1: - swizzles[i] = LLVMConstInt(LLVMInt32Type(), 4, 0); - aux[0] = LLVMConstReal(LLVMFloatType(), 1.0); - break; - case UTIL_FORMAT_SWIZZLE_NONE: - swizzles[i] = LLVMGetUndef(LLVMFloatType()); - assert(0); - break; - } - } - - return LLVMBuildShuffleVector(builder, scaled, LLVMConstVector(aux, 4), - LLVMConstVector(swizzles, 4), ""); + return scaled; } @@ -310,22 +351,65 @@ lp_build_pack_rgba_aos(LLVMBuilderRef builder, } + + /** * Fetch a pixel into a 4 float AoS. * * \param format_desc describes format of the image we're fetching from * \param ptr address of the pixel block (or the texel if uncompressed) * \param i, j the sub-block pixel coordinates. For non-compressed formats - * these will always be (0,). - * \return valueRef with the float[4] RGBA pixel + * these will always be (0, 0). + * \return a 4 element vector with the pixel's RGBA values. */ LLVMValueRef lp_build_fetch_rgba_aos(LLVMBuilderRef builder, const struct util_format_description *format_desc, - LLVMValueRef ptr, + struct lp_type type, + LLVMValueRef base_ptr, + LLVMValueRef offset, LLVMValueRef i, LLVMValueRef j) { + unsigned num_pixels = type.length / 4; + struct lp_build_context bld; + + assert(type.length <= LP_MAX_VECTOR_LENGTH); + assert(type.length % 4 == 0); + + lp_build_context_init(&bld, builder, type); + + /* + * Trivial case + * + * The format matches the type (apart of a swizzle) so no need for + * scaling or converting. 
+ */ + + if (format_matches_type(format_desc, type) && + format_desc->block.bits <= type.width * 4 && + util_is_pot(format_desc->block.bits)) { + LLVMValueRef packed; + + /* + * The format matches the type (apart of a swizzle) so no need for + * scaling or converting. + */ + + packed = lp_build_gather(builder, type.length/4, + format_desc->block.bits, type.width*4, + base_ptr, offset); + + assert(format_desc->block.bits <= type.width * type.length); + + packed = LLVMBuildBitCast(builder, packed, lp_build_vec_type(type), ""); + + return lp_build_format_swizzle_aos(format_desc, &bld, packed); + } + + /* + * Bit arithmetic + */ if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN && (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB || @@ -337,21 +421,77 @@ lp_build_fetch_rgba_aos(LLVMBuilderRef builder, format_desc->is_bitmask && !format_desc->is_mixed && (format_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED || - format_desc->channel[1].type == UTIL_FORMAT_TYPE_UNSIGNED)) - { - LLVMValueRef packed; + format_desc->channel[1].type == UTIL_FORMAT_TYPE_UNSIGNED)) { + + LLVMValueRef tmps[LP_MAX_VECTOR_LENGTH/4]; + LLVMValueRef res; + unsigned k; + + /* + * Unpack a pixel at a time into a <4 x float> RGBA vector + */ + + for (k = 0; k < num_pixels; ++k) { + LLVMValueRef packed; + + packed = lp_build_gather_elem(builder, num_pixels, + format_desc->block.bits, 32, + base_ptr, offset, k); - ptr = LLVMBuildBitCast(builder, ptr, - LLVMPointerType(LLVMIntType(format_desc->block.bits), 0) , - ""); + tmps[k] = lp_build_unpack_arith_rgba_aos(builder, format_desc, + packed); + } + + /* + * Type conversion. + * + * TODO: We could avoid floating conversion for integer to + * integer conversions. 
+ */ - packed = LLVMBuildLoad(builder, ptr, "packed"); + lp_build_conv(builder, + lp_float32_vec4_type(), + type, + tmps, num_pixels, &res, 1); - return lp_build_unpack_rgba_aos(builder, format_desc, packed); + return lp_build_format_swizzle_aos(format_desc, &bld, res); } - else if (format_desc->fetch_rgba_float) { + + /* + * YUV / subsampled formats + */ + + if (format_desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED) { + struct lp_type tmp_type; + LLVMValueRef tmp; + + memset(&tmp_type, 0, sizeof tmp_type); + tmp_type.width = 8; + tmp_type.length = num_pixels * 4; + tmp_type.norm = TRUE; + + tmp = lp_build_fetch_subsampled_rgba_aos(builder, + format_desc, + num_pixels, + base_ptr, + offset, + i, j); + + lp_build_conv(builder, + tmp_type, type, + &tmp, 1, &tmp, 1); + + return tmp; + } + + /* + * Fallback to util_format_description::fetch_rgba_8unorm(). + */ + + if (format_desc->fetch_rgba_8unorm && + !type.floating && type.width == 8 && !type.sign && type.norm) { /* - * Fallback to calling util_format_description::fetch_rgba_float. + * Fallback to calling util_format_description::fetch_rgba_8unorm. * * This is definitely not the most efficient way of fetching pixels, as * we miss the opportunity to do vectorization, but this it is a @@ -361,9 +501,113 @@ lp_build_fetch_rgba_aos(LLVMBuilderRef builder, LLVMModuleRef module = LLVMGetGlobalParent(LLVMGetBasicBlockParent(LLVMGetInsertBlock(builder))); char name[256]; + LLVMTypeRef i8t = LLVMInt8Type(); + LLVMTypeRef pi8t = LLVMPointerType(i8t, 0); + LLVMTypeRef i32t = LLVMInt32Type(); LLVMValueRef function; + LLVMValueRef tmp_ptr; LLVMValueRef tmp; - LLVMValueRef args[4]; + LLVMValueRef res; + unsigned k; + + util_snprintf(name, sizeof name, "util_format_%s_fetch_rgba_8unorm", + format_desc->short_name); + + /* + * Declare and bind format_desc->fetch_rgba_8unorm(). 
+ */ + + function = LLVMGetNamedFunction(module, name); + if (!function) { + LLVMTypeRef ret_type; + LLVMTypeRef arg_types[4]; + LLVMTypeRef function_type; + + ret_type = LLVMVoidType(); + arg_types[0] = pi8t; + arg_types[1] = pi8t; + arg_types[3] = arg_types[2] = LLVMIntType(sizeof(unsigned) * 8); + function_type = LLVMFunctionType(ret_type, arg_types, Elements(arg_types), 0); + function = LLVMAddFunction(module, name, function_type); + + LLVMSetFunctionCallConv(function, LLVMCCallConv); + LLVMSetLinkage(function, LLVMExternalLinkage); + + assert(LLVMIsDeclaration(function)); + + LLVMAddGlobalMapping(lp_build_engine, function, + func_to_pointer((func_pointer)format_desc->fetch_rgba_8unorm)); + } + + tmp_ptr = lp_build_alloca(builder, i32t, ""); + + res = LLVMGetUndef(LLVMVectorType(i32t, num_pixels)); + + /* + * Invoke format_desc->fetch_rgba_8unorm() for each pixel and insert the result + * in the SoA vectors. + */ + + for (k = 0; k < num_pixels; ++k) { + LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), k, 0); + LLVMValueRef args[4]; + + args[0] = LLVMBuildBitCast(builder, tmp_ptr, pi8t, ""); + args[1] = lp_build_gather_elem_ptr(builder, num_pixels, + base_ptr, offset, k); + + if (num_pixels == 1) { + args[2] = i; + args[3] = j; + } + else { + args[2] = LLVMBuildExtractElement(builder, i, index, ""); + args[3] = LLVMBuildExtractElement(builder, j, index, ""); + } + + LLVMBuildCall(builder, function, args, Elements(args), ""); + + tmp = LLVMBuildLoad(builder, tmp_ptr, ""); + + if (num_pixels == 1) { + res = tmp; + } + else { + res = LLVMBuildInsertElement(builder, res, tmp, index, ""); + } + } + + /* Bitcast from <n x i32> to <4n x i8> */ + res = LLVMBuildBitCast(builder, res, bld.vec_type, ""); + + return res; + } + + + /* + * Fallback to util_format_description::fetch_rgba_float(). + */ + + if (format_desc->fetch_rgba_float) { + /* + * Fallback to calling util_format_description::fetch_rgba_float. 
+ * + * This is definitely not the most efficient way of fetching pixels, as + * we miss the opportunity to do vectorization, but this it is a + * convenient for formats or scenarios for which there was no opportunity + * or incentive to optimize. + */ + + LLVMModuleRef module = LLVMGetGlobalParent(LLVMGetBasicBlockParent(LLVMGetInsertBlock(builder))); + char name[256]; + LLVMTypeRef f32t = LLVMFloatType(); + LLVMTypeRef f32x4t = LLVMVectorType(f32t, 4); + LLVMTypeRef pf32t = LLVMPointerType(f32t, 0); + LLVMValueRef function; + LLVMValueRef tmp_ptr; + LLVMValueRef tmps[LP_MAX_VECTOR_LENGTH/4]; + LLVMValueRef res; + unsigned k; util_snprintf(name, sizeof name, "util_format_%s_fetch_rgba_float", format_desc->short_name); @@ -379,7 +623,7 @@ lp_build_fetch_rgba_aos(LLVMBuilderRef builder, LLVMTypeRef function_type; ret_type = LLVMVoidType(); - arg_types[0] = LLVMPointerType(LLVMFloatType(), 0); + arg_types[0] = pf32t; arg_types[1] = LLVMPointerType(LLVMInt8Type(), 0); arg_types[3] = arg_types[2] = LLVMIntType(sizeof(unsigned) * 8); function_type = LLVMFunctionType(ret_type, arg_types, Elements(arg_types), 0); @@ -394,25 +638,43 @@ lp_build_fetch_rgba_aos(LLVMBuilderRef builder, func_to_pointer((func_pointer)format_desc->fetch_rgba_float)); } - tmp = lp_build_alloca(builder, LLVMVectorType(LLVMFloatType(), 4), ""); + tmp_ptr = lp_build_alloca(builder, f32x4t, ""); /* * Invoke format_desc->fetch_rgba_float() for each pixel and insert the result * in the SoA vectors. 
*/ - args[0] = LLVMBuildBitCast(builder, tmp, - LLVMPointerType(LLVMFloatType(), 0), ""); - args[1] = ptr; - args[2] = i; - args[3] = j; + for (k = 0; k < num_pixels; ++k) { + LLVMValueRef args[4]; - LLVMBuildCall(builder, function, args, Elements(args), ""); + args[0] = LLVMBuildBitCast(builder, tmp_ptr, pf32t, ""); + args[1] = lp_build_gather_elem_ptr(builder, num_pixels, + base_ptr, offset, k); - return LLVMBuildLoad(builder, tmp, ""); - } - else { - assert(0); - return LLVMGetUndef(LLVMVectorType(LLVMFloatType(), 4)); + if (num_pixels == 1) { + args[2] = i; + args[3] = j; + } + else { + LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), k, 0); + args[2] = LLVMBuildExtractElement(builder, i, index, ""); + args[3] = LLVMBuildExtractElement(builder, j, index, ""); + } + + LLVMBuildCall(builder, function, args, Elements(args), ""); + + tmps[k] = LLVMBuildLoad(builder, tmp_ptr, ""); + } + + lp_build_conv(builder, + lp_float32_vec4_type(), + type, + tmps, num_pixels, &res, 1); + + return res; } + + assert(0); + return lp_build_undef(type); } diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c index e1b94adc85..9f405921b0 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c @@ -36,7 +36,7 @@ #include "lp_bld_const.h" #include "lp_bld_conv.h" #include "lp_bld_swizzle.h" -#include "lp_bld_sample.h" /* for lp_build_gather */ +#include "lp_bld_gather.h" #include "lp_bld_format.h" @@ -251,6 +251,41 @@ lp_build_unpack_rgba_soa(LLVMBuilderRef builder, } +void +lp_build_rgba8_to_f32_soa(LLVMBuilderRef builder, + struct lp_type dst_type, + LLVMValueRef packed, + LLVMValueRef *rgba) +{ + LLVMValueRef mask = lp_build_const_int_vec(dst_type, 0xff); + unsigned chan; + + packed = LLVMBuildBitCast(builder, packed, + lp_build_int_vec_type(dst_type), ""); + + /* Decode the input vector components */ + for (chan = 0; chan < 4; ++chan) { + unsigned start = 
chan*8; + unsigned stop = start + 8; + LLVMValueRef input; + + input = packed; + + if (start) + input = LLVMBuildLShr(builder, input, + lp_build_const_int_vec(dst_type, start), ""); + + if (stop < 32) + input = LLVMBuildAnd(builder, input, mask, ""); + + input = lp_build_unsigned_norm_to_float(builder, 8, dst_type, input); + + rgba[chan] = input; + } +} + + + /** * Fetch a texels from a texture, returning them in SoA layout. * @@ -311,20 +346,49 @@ lp_build_fetch_rgba_soa(LLVMBuilderRef builder, format_desc, type, packed, rgba_out); + return; } - else { - /* - * Fallback to calling lp_build_fetch_rgba_aos for each pixel. - * - * This is not the most efficient way of fetching pixels, as we - * miss some opportunities to do vectorization, but this is - * convenient for formats or scenarios for which there was no - * opportunity or incentive to optimize. - */ + /* + * Try calling lp_build_fetch_rgba_aos for all pixels. + */ + + if (util_format_fits_8unorm(format_desc) && + type.floating && type.width == 32 && type.length == 4) { + struct lp_type tmp_type; + LLVMValueRef tmp; + + memset(&tmp_type, 0, sizeof tmp_type); + tmp_type.width = 8; + tmp_type.length = type.length * 4; + tmp_type.norm = TRUE; + + tmp = lp_build_fetch_rgba_aos(builder, format_desc, tmp_type, + base_ptr, offset, i, j); + + lp_build_rgba8_to_f32_soa(builder, + type, + tmp, + rgba_out); + + return; + } + + /* + * Fallback to calling lp_build_fetch_rgba_aos for each pixel. + * + * This is not the most efficient way of fetching pixels, as we + * miss some opportunities to do vectorization, but this is + * convenient for formats or scenarios for which there was no + * opportunity or incentive to optimize. 
+ */ + + { unsigned k, chan; + struct lp_type tmp_type; - assert(type.floating); + tmp_type = type; + tmp_type.length = 4; for (chan = 0; chan < 4; ++chan) { rgba_out[chan] = lp_build_undef(type); @@ -334,18 +398,17 @@ lp_build_fetch_rgba_soa(LLVMBuilderRef builder, for(k = 0; k < type.length; ++k) { LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), k, 0); LLVMValueRef offset_elem; - LLVMValueRef ptr; LLVMValueRef i_elem, j_elem; LLVMValueRef tmp; offset_elem = LLVMBuildExtractElement(builder, offset, index, ""); - ptr = LLVMBuildGEP(builder, base_ptr, &offset_elem, 1, ""); i_elem = LLVMBuildExtractElement(builder, i, index, ""); j_elem = LLVMBuildExtractElement(builder, j, index, ""); /* Get a single float[4]={R,G,B,A} pixel */ - tmp = lp_build_fetch_rgba_aos(builder, format_desc, ptr, + tmp = lp_build_fetch_rgba_aos(builder, format_desc, tmp_type, + base_ptr, offset_elem, i_elem, j_elem); /* diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c b/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c new file mode 100644 index 0000000000..0a5038bc98 --- /dev/null +++ b/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c @@ -0,0 +1,399 @@ +/************************************************************************** + * + * Copyright 2010 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + **************************************************************************/ + + +/** + * @file + * YUV pixel format manipulation. + * + * @author Jose Fonseca <jfonseca@vmware.com> + */ + + +#include "util/u_format.h" + +#include "lp_bld_arit.h" +#include "lp_bld_type.h" +#include "lp_bld_const.h" +#include "lp_bld_conv.h" +#include "lp_bld_gather.h" +#include "lp_bld_format.h" + + +/** + * Extract Y, U, V channels from packed UYVY. + * @param packed is a <n x i32> vector with the packed UYVY blocks + * @param i is a <n x i32> vector with the x pixel coordinate (0 or 1) + */ +static void +uyvy_to_yuv_soa(LLVMBuilderRef builder, + unsigned n, + LLVMValueRef packed, + LLVMValueRef i, + LLVMValueRef *y, + LLVMValueRef *u, + LLVMValueRef *v) +{ + struct lp_type type; + LLVMValueRef shift, mask; + + memset(&type, 0, sizeof type); + type.width = 32; + type.length = n; + + assert(lp_check_value(type, packed)); + assert(lp_check_value(type, i)); + + /* + * y = (uyvy >> (16*i + 8)) & 0xff + * u = (uyvy ) & 0xff + * v = (uyvy >> 16 ) & 0xff + */ + + shift = LLVMBuildMul(builder, i, lp_build_const_int_vec(type, 16), ""); + shift = LLVMBuildAdd(builder, shift, lp_build_const_int_vec(type, 8), ""); + *y = LLVMBuildLShr(builder, packed, shift, ""); + *u = packed; + *v = LLVMBuildLShr(builder, packed, lp_build_const_int_vec(type, 16), ""); + + mask = lp_build_const_int_vec(type, 0xff); + + *y = LLVMBuildAnd(builder, *y, mask, "y"); + *u = LLVMBuildAnd(builder, *u, mask, "u"); + *v = LLVMBuildAnd(builder, *v, mask, "v"); 
+} + + +/** + * Extract Y, U, V channels from packed YUYV. + * @param packed is a <n x i32> vector with the packed YUYV blocks + * @param i is a <n x i32> vector with the x pixel coordinate (0 or 1) + */ +static void +yuyv_to_yuv_soa(LLVMBuilderRef builder, + unsigned n, + LLVMValueRef packed, + LLVMValueRef i, + LLVMValueRef *y, + LLVMValueRef *u, + LLVMValueRef *v) +{ + struct lp_type type; + LLVMValueRef shift, mask; + + memset(&type, 0, sizeof type); + type.width = 32; + type.length = n; + + assert(lp_check_value(type, packed)); + assert(lp_check_value(type, i)); + + /* + * y = (yuyv >> 16*i) & 0xff + * u = (yuyv >> 8 ) & 0xff + * v = (yuyv >> 24 ) & 0xff + */ + + shift = LLVMBuildMul(builder, i, lp_build_const_int_vec(type, 16), ""); + *y = LLVMBuildLShr(builder, packed, shift, ""); + *u = LLVMBuildLShr(builder, packed, lp_build_const_int_vec(type, 8), ""); + *v = LLVMBuildLShr(builder, packed, lp_build_const_int_vec(type, 24), ""); + + mask = lp_build_const_int_vec(type, 0xff); + + *y = LLVMBuildAnd(builder, *y, mask, "y"); + *u = LLVMBuildAnd(builder, *u, mask, "u"); + *v = LLVMBuildAnd(builder, *v, mask, "v"); +} + + +static INLINE void +yuv_to_rgb_soa(LLVMBuilderRef builder, + unsigned n, + LLVMValueRef y, LLVMValueRef u, LLVMValueRef v, + LLVMValueRef *r, LLVMValueRef *g, LLVMValueRef *b) +{ + struct lp_type type; + struct lp_build_context bld; + + LLVMValueRef c0; + LLVMValueRef c8; + LLVMValueRef c16; + LLVMValueRef c128; + LLVMValueRef c255; + + LLVMValueRef cy; + LLVMValueRef cug; + LLVMValueRef cub; + LLVMValueRef cvr; + LLVMValueRef cvg; + + memset(&type, 0, sizeof type); + type.sign = TRUE; + type.width = 32; + type.length = n; + + lp_build_context_init(&bld, builder, type); + + assert(lp_check_value(type, y)); + assert(lp_check_value(type, u)); + assert(lp_check_value(type, v)); + + /* + * Constants + */ + + c0 = lp_build_const_int_vec(type, 0); + c8 = lp_build_const_int_vec(type, 8); + c16 = lp_build_const_int_vec(type, 16); + c128 = 
lp_build_const_int_vec(type, 128); + c255 = lp_build_const_int_vec(type, 255); + + cy = lp_build_const_int_vec(type, 298); + cug = lp_build_const_int_vec(type, -100); + cub = lp_build_const_int_vec(type, 516); + cvr = lp_build_const_int_vec(type, 409); + cvg = lp_build_const_int_vec(type, -208); + + /* + * y -= 16; + * u -= 128; + * v -= 128; + */ + + y = LLVMBuildSub(builder, y, c16, ""); + u = LLVMBuildSub(builder, u, c128, ""); + v = LLVMBuildSub(builder, v, c128, ""); + + /* + * r = 298 * _y + 409 * _v + 128; + * g = 298 * _y - 100 * _u - 208 * _v + 128; + * b = 298 * _y + 516 * _u + 128; + */ + + y = LLVMBuildMul(builder, y, cy, ""); + y = LLVMBuildAdd(builder, y, c128, ""); + + *r = LLVMBuildMul(builder, v, cvr, ""); + *g = LLVMBuildAdd(builder, + LLVMBuildMul(builder, u, cug, ""), + LLVMBuildMul(builder, v, cvg, ""), + ""); + *b = LLVMBuildMul(builder, u, cub, ""); + + *r = LLVMBuildAdd(builder, *r, y, ""); + *g = LLVMBuildAdd(builder, *g, y, ""); + *b = LLVMBuildAdd(builder, *b, y, ""); + + /* + * r >>= 8; + * g >>= 8; + * b >>= 8; + */ + + *r = LLVMBuildAShr(builder, *r, c8, "r"); + *g = LLVMBuildAShr(builder, *g, c8, "g"); + *b = LLVMBuildAShr(builder, *b, c8, "b"); + + /* + * Clamp + */ + + *r = lp_build_clamp(&bld, *r, c0, c255); + *g = lp_build_clamp(&bld, *g, c0, c255); + *b = lp_build_clamp(&bld, *b, c0, c255); +} + + +static LLVMValueRef +rgb_to_rgba_aos(LLVMBuilderRef builder, + unsigned n, + LLVMValueRef r, LLVMValueRef g, LLVMValueRef b) +{ + struct lp_type type; + LLVMValueRef a; + LLVMValueRef rgba; + + memset(&type, 0, sizeof type); + type.sign = TRUE; + type.width = 32; + type.length = n; + + assert(lp_check_value(type, r)); + assert(lp_check_value(type, g)); + assert(lp_check_value(type, b)); + + /* + * Make a 4 x unorm8 vector + */ + + r = r; + g = LLVMBuildShl(builder, g, lp_build_const_int_vec(type, 8), ""); + b = LLVMBuildShl(builder, b, lp_build_const_int_vec(type, 16), ""); + a = lp_build_const_int_vec(type, 0xff000000); + + rgba = r; 
+ rgba = LLVMBuildOr(builder, rgba, g, ""); + rgba = LLVMBuildOr(builder, rgba, b, ""); + rgba = LLVMBuildOr(builder, rgba, a, ""); + + rgba = LLVMBuildBitCast(builder, rgba, + LLVMVectorType(LLVMInt8Type(), 4*n), ""); + + return rgba; +} + + +/** + * Convert from <n x i32> packed UYVY to <4n x i8> RGBA AoS + */ +static LLVMValueRef +uyvy_to_rgba_aos(LLVMBuilderRef builder, + unsigned n, + LLVMValueRef packed, + LLVMValueRef i) +{ + LLVMValueRef y, u, v; + LLVMValueRef r, g, b; + LLVMValueRef rgba; + + uyvy_to_yuv_soa(builder, n, packed, i, &y, &u, &v); + yuv_to_rgb_soa(builder, n, y, u, v, &r, &g, &b); + rgba = rgb_to_rgba_aos(builder, n, r, g, b); + + return rgba; +} + + +/** + * Convert from <n x i32> packed YUYV to <4n x i8> RGBA AoS + */ +static LLVMValueRef +yuyv_to_rgba_aos(LLVMBuilderRef builder, + unsigned n, + LLVMValueRef packed, + LLVMValueRef i) +{ + LLVMValueRef y, u, v; + LLVMValueRef r, g, b; + LLVMValueRef rgba; + + yuyv_to_yuv_soa(builder, n, packed, i, &y, &u, &v); + yuv_to_rgb_soa(builder, n, y, u, v, &r, &g, &b); + rgba = rgb_to_rgba_aos(builder, n, r, g, b); + + return rgba; +} + + +/** + * Convert from <n x i32> packed RG_BG to <4n x i8> RGBA AoS + */ +static LLVMValueRef +rgbg_to_rgba_aos(LLVMBuilderRef builder, + unsigned n, + LLVMValueRef packed, + LLVMValueRef i) +{ + LLVMValueRef r, g, b; + LLVMValueRef rgba; + + uyvy_to_yuv_soa(builder, n, packed, i, &g, &r, &b); + rgba = rgb_to_rgba_aos(builder, n, r, g, b); + + return rgba; +} + + +/** + * Convert from <n x i32> packed GR_GB to <4n x i8> RGBA AoS + */ +static LLVMValueRef +grgb_to_rgba_aos(LLVMBuilderRef builder, + unsigned n, + LLVMValueRef packed, + LLVMValueRef i) +{ + LLVMValueRef r, g, b; + LLVMValueRef rgba; + + yuyv_to_yuv_soa(builder, n, packed, i, &g, &r, &b); + rgba = rgb_to_rgba_aos(builder, n, r, g, b); + + return rgba; +} + + +/** + * @param n is the number of pixels processed + * @param packed is a <n x i32> vector with the packed YUYV blocks + * @param i is a <n x i32> 
vector with the x pixel coordinate (0 or 1) + * @return a <4*n x i8> vector with the pixel RGBA values in AoS + */ +LLVMValueRef +lp_build_fetch_subsampled_rgba_aos(LLVMBuilderRef builder, + const struct util_format_description *format_desc, + unsigned n, + LLVMValueRef base_ptr, + LLVMValueRef offset, + LLVMValueRef i, + LLVMValueRef j) +{ + LLVMValueRef packed; + LLVMValueRef rgba; + + assert(format_desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED); + assert(format_desc->block.bits == 32); + assert(format_desc->block.width == 2); + assert(format_desc->block.height == 1); + + packed = lp_build_gather(builder, n, 32, 32, base_ptr, offset); + + (void)j; + + switch (format_desc->format) { + case PIPE_FORMAT_UYVY: + rgba = uyvy_to_rgba_aos(builder, n, packed, i); + break; + case PIPE_FORMAT_YUYV: + rgba = yuyv_to_rgba_aos(builder, n, packed, i); + break; + case PIPE_FORMAT_R8G8_B8G8_UNORM: + rgba = rgbg_to_rgba_aos(builder, n, packed, i); + break; + case PIPE_FORMAT_G8R8_G8B8_UNORM: + rgba = grgb_to_rgba_aos(builder, n, packed, i); + break; + default: + assert(0); + rgba = LLVMGetUndef(LLVMVectorType(LLVMInt8Type(), 4*n)); + break; + } + + return rgba; +} + diff --git a/src/gallium/auxiliary/gallivm/lp_bld_gather.c b/src/gallium/auxiliary/gallivm/lp_bld_gather.c new file mode 100644 index 0000000000..d60472e065 --- /dev/null +++ b/src/gallium/auxiliary/gallivm/lp_bld_gather.c @@ -0,0 +1,148 @@ +/************************************************************************** + * + * Copyright 2010 VMware, Inc. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + **************************************************************************/ + + +#include "util/u_debug.h" +#include "lp_bld_debug.h" +#include "lp_bld_const.h" +#include "lp_bld_format.h" +#include "lp_bld_gather.h" + + +/** + * Get the pointer to one element from scatter positions in memory. 
+ * + * @sa lp_build_gather() + */ +LLVMValueRef +lp_build_gather_elem_ptr(LLVMBuilderRef builder, + unsigned length, + LLVMValueRef base_ptr, + LLVMValueRef offsets, + unsigned i) +{ + LLVMValueRef offset; + LLVMValueRef ptr; + + assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8Type(), 0)); + + if (length == 1) { + assert(i == 0); + offset = offsets; + } else { + LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0); + offset = LLVMBuildExtractElement(builder, offsets, index, ""); + } + + ptr = LLVMBuildGEP(builder, base_ptr, &offset, 1, ""); + + return ptr; +} + + +/** + * Gather one element from scatter positions in memory. + * + * Loads a src_width-bit integer from the address computed by + * lp_build_gather_elem_ptr() and zero-extends it to dst_width bits when + * src_width < dst_width. + * + * NOTE(review): src_width > dst_width trips the assert in the body, so the + * truncating branch that follows it is only reachable in builds where + * assert() is compiled out. + * + * @param length number of elements in offsets (1 means offsets is a scalar) + * @param src_width width in bits of the integer loaded from memory + * @param dst_width width in bits of the returned integer + * @param base_ptr base pointer, asserted to be an i8 pointer + * @param offsets scalar or vector with the offsets + * @param i index of the element of offsets to use + * + * @sa lp_build_gather() + */ +LLVMValueRef +lp_build_gather_elem(LLVMBuilderRef builder, + unsigned length, + unsigned src_width, + unsigned dst_width, + LLVMValueRef base_ptr, + LLVMValueRef offsets, + unsigned i) +{ + LLVMTypeRef src_type = LLVMIntType(src_width); + LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0); + LLVMTypeRef dst_elem_type = LLVMIntType(dst_width); + LLVMValueRef ptr; + LLVMValueRef res; + + assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8Type(), 0)); + + ptr = lp_build_gather_elem_ptr(builder, length, base_ptr, offsets, i); + ptr = LLVMBuildBitCast(builder, ptr, src_ptr_type, ""); + res = LLVMBuildLoad(builder, ptr, ""); + + assert(src_width <= dst_width); + if (src_width > dst_width) + res = LLVMBuildTrunc(builder, res, dst_elem_type, ""); + if (src_width < dst_width) + res = LLVMBuildZExt(builder, res, dst_elem_type, ""); + + return res; +} + + +/** + * Gather elements from scatter positions in memory into a single vector. + * Use for fetching texels from a texture. + * For SSE, typical values are length=4, src_width=32, dst_width=32. + * + * @param length length of the offsets + * @param src_width src element width in bits + * @param dst_width result element width in bits (src will be expanded to fit) + * @param base_ptr base pointer, should be a i8 pointer type. 
+ * @param offsets vector with offsets + */ +LLVMValueRef +lp_build_gather(LLVMBuilderRef builder, + unsigned length, + unsigned src_width, + unsigned dst_width, + LLVMValueRef base_ptr, + LLVMValueRef offsets) +{ + LLVMValueRef res; + + if (length == 1) { + /* Scalar */ + return lp_build_gather_elem(builder, length, + src_width, dst_width, + base_ptr, offsets, 0); + } else { + /* Vector */ + + LLVMTypeRef dst_elem_type = LLVMIntType(dst_width); + LLVMTypeRef dst_vec_type = LLVMVectorType(dst_elem_type, length); + unsigned i; + + res = LLVMGetUndef(dst_vec_type); + for (i = 0; i < length; ++i) { + LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0); + LLVMValueRef elem; + elem = lp_build_gather_elem(builder, length, + src_width, dst_width, + base_ptr, offsets, i); + res = LLVMBuildInsertElement(builder, res, elem, index, ""); + } + } + + return res; +} diff --git a/src/gallium/auxiliary/gallivm/lp_bld_gather.h b/src/gallium/auxiliary/gallivm/lp_bld_gather.h new file mode 100644 index 0000000000..131af8ea07 --- /dev/null +++ b/src/gallium/auxiliary/gallivm/lp_bld_gather.h @@ -0,0 +1,61 @@ +/************************************************************************** + * + * Copyright 2010 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + **************************************************************************/ + + +#ifndef LP_BLD_GATHER_H_ +#define LP_BLD_GATHER_H_ + + +#include "gallivm/lp_bld.h" + + +LLVMValueRef +lp_build_gather_elem_ptr(LLVMBuilderRef builder, + unsigned length, + LLVMValueRef base_ptr, + LLVMValueRef offsets, + unsigned i); + +LLVMValueRef +lp_build_gather_elem(LLVMBuilderRef builder, + unsigned length, + unsigned src_width, + unsigned dst_width, + LLVMValueRef base_ptr, + LLVMValueRef offsets, + unsigned i); + +LLVMValueRef +lp_build_gather(LLVMBuilderRef builder, + unsigned length, + unsigned src_width, + unsigned dst_width, + LLVMValueRef base_ptr, + LLVMValueRef offsets); + + +#endif /* LP_BLD_GATHER_H_ */ diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.c b/src/gallium/auxiliary/gallivm/lp_bld_init.c index 44cfdc4d3f..69353dea09 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_init.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_init.c @@ -32,6 +32,8 @@ #include "lp_bld_debug.h" #include "lp_bld_init.h" +#include <llvm-c/Transforms/Scalar.h> + #ifdef DEBUG unsigned gallivm_debug = 0; @@ -50,6 +52,7 @@ LLVMModuleRef lp_build_module = NULL; LLVMExecutionEngineRef lp_build_engine = NULL; LLVMModuleProviderRef lp_build_provider = NULL; LLVMTargetDataRef lp_build_target = NULL; +LLVMPassManagerRef lp_build_pass = NULL; /* @@ -127,6 +130,33 @@ lp_build_init(void) if (!lp_build_target) lp_build_target = LLVMGetExecutionEngineTargetData(lp_build_engine); + if (!lp_build_pass) { + lp_build_pass = 
LLVMCreateFunctionPassManager(lp_build_provider); + LLVMAddTargetData(lp_build_target, lp_build_pass); + + if ((gallivm_debug & GALLIVM_DEBUG_NO_OPT) == 0) { + /* These are the passes currently listed in llvm-c/Transforms/Scalar.h, + * but there are more on SVN. */ + /* TODO: Add more passes */ + LLVMAddCFGSimplificationPass(lp_build_pass); + LLVMAddPromoteMemoryToRegisterPass(lp_build_pass); + LLVMAddConstantPropagationPass(lp_build_pass); + if(util_cpu_caps.has_sse4_1) { + /* FIXME: There is a bug in this pass, whereby the combination of fptosi + * and sitofp (necessary for trunc/floor/ceil/round implementation) + * somehow becomes invalid code. + * + * NOTE(review): util_cpu_caps is read here, but the util_cpu_detect() + * call only appears further down in this function. Confirm the CPU + * caps are already populated at this point; if they are not, this + * branch is never taken on the first call. + */ + LLVMAddInstructionCombiningPass(lp_build_pass); + } + LLVMAddGVNPass(lp_build_pass); + } else { + /* We need at least this pass to prevent the backends from failing in + * unexpected ways. + */ + LLVMAddPromoteMemoryToRegisterPass(lp_build_pass); + } + } + util_cpu_detect(); #if 0 diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.h b/src/gallium/auxiliary/gallivm/lp_bld_init.h index 0ec2afcd1b..a32ced9b4c 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_init.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_init.h @@ -38,6 +38,7 @@ extern LLVMModuleRef lp_build_module; extern LLVMExecutionEngineRef lp_build_engine; extern LLVMModuleProviderRef lp_build_provider; extern LLVMTargetDataRef lp_build_target; +extern LLVMPassManagerRef lp_build_pass; void diff --git a/src/gallium/auxiliary/gallivm/lp_bld_logic.c b/src/gallium/auxiliary/gallivm/lp_bld_logic.c index d13fa1a5d0..39854e43b1 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_logic.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_logic.c @@ -34,6 +34,7 @@ #include "util/u_cpu_detect.h" +#include "util/u_memory.h" #include "util/u_debug.h" #include "lp_bld_type.h" @@ -187,12 +188,10 @@ lp_build_compare(LLVMBuilderRef builder, return lp_build_undef(type); } - /* There are no signed byte and unsigned word/dword comparison - * instructions. 
So flip the sign bit so that the results match. + /* There are no unsigned comparison instructions. So flip the sign bit + * so that the results match. */ - if(table[func].gt && - ((type.width == 8 && type.sign) || - (type.width != 8 && !type.sign))) { + if (table[func].gt && !type.sign) { LLVMValueRef msb = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1)); a = LLVMBuildXor(builder, a, msb, ""); b = LLVMBuildXor(builder, b, msb, ""); @@ -384,6 +383,46 @@ lp_build_select(struct lp_build_context *bld, mask = LLVMBuildTrunc(bld->builder, mask, LLVMInt1Type(), ""); res = LLVMBuildSelect(bld->builder, mask, a, b, ""); } + else if (util_cpu_caps.has_sse4_1 && + type.width * type.length == 128 && + !LLVMIsConstant(a) && + !LLVMIsConstant(b) && + !LLVMIsConstant(mask)) { + const char *intrinsic; + LLVMTypeRef arg_type; + LLVMValueRef args[3]; + + if (type.width == 64) { + intrinsic = "llvm.x86.sse41.blendvpd"; + arg_type = LLVMVectorType(LLVMDoubleType(), 2); + } else if (type.width == 32) { + intrinsic = "llvm.x86.sse41.blendvps"; + arg_type = LLVMVectorType(LLVMFloatType(), 4); + } else { + intrinsic = "llvm.x86.sse41.pblendvb"; + arg_type = LLVMVectorType(LLVMInt8Type(), 16); + } + + if (arg_type != bld->int_vec_type) { + mask = LLVMBuildBitCast(bld->builder, mask, arg_type, ""); + } + + if (arg_type != bld->vec_type) { + a = LLVMBuildBitCast(bld->builder, a, arg_type, ""); + b = LLVMBuildBitCast(bld->builder, b, arg_type, ""); + } + + args[0] = b; + args[1] = a; + args[2] = mask; + + res = lp_build_intrinsic(bld->builder, intrinsic, + arg_type, args, Elements(args)); + + if (arg_type != bld->vec_type) { + res = LLVMBuildBitCast(bld->builder, res, bld->vec_type, ""); + } + } else { if(type.floating) { LLVMTypeRef int_vec_type = lp_build_int_vec_type(type); diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.c b/src/gallium/auxiliary/gallivm/lp_bld_pack.c index 186f8849b8..7748f8f099 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_pack.c +++ 
b/src/gallium/auxiliary/gallivm/lp_bld_pack.c @@ -427,3 +427,123 @@ lp_build_pack(LLVMBuilderRef builder, return tmp[0]; } + + +/** + * Truncate or expand the bitwidth. + * + * NOTE: Getting the right sign flags is crucial here, as we employ some + * intrinsics that do saturation. + * + * @param src array of num_srcs vectors of type src_type + * @param dst array that receives num_dsts vectors of type dst_type + */ +void +lp_build_resize(LLVMBuilderRef builder, + struct lp_type src_type, + struct lp_type dst_type, + const LLVMValueRef *src, unsigned num_srcs, + LLVMValueRef *dst, unsigned num_dsts) +{ + LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH]; + unsigned i; + + /* + * We don't support float <-> int conversion here. That must be done + * before/after calling this function. + */ + assert(src_type.floating == dst_type.floating); + + /* + * We don't support double <-> float conversion yet, although it could be + * added with little effort. + */ + assert((!src_type.floating && !dst_type.floating) || + src_type.width == dst_type.width); + + /* We must not lose or gain channels. Only precision */ + assert(src_type.length * num_srcs == dst_type.length * num_dsts); + + /* We don't support M:N conversion, only 1:N, M:1, or 1:1 */ + assert(num_srcs == 1 || num_dsts == 1); + + assert(src_type.length <= LP_MAX_VECTOR_LENGTH); + assert(dst_type.length <= LP_MAX_VECTOR_LENGTH); + assert(num_srcs <= LP_MAX_VECTOR_LENGTH); + assert(num_dsts <= LP_MAX_VECTOR_LENGTH); + + if (src_type.width > dst_type.width) { + /* + * Truncate bit width. + */ + + assert(num_dsts == 1); + + if (src_type.width * src_type.length == dst_type.width * dst_type.length) { + /* + * Register width remains constant -- use vector packing intrinsics + */ + + tmp[0] = lp_build_pack(builder, src_type, dst_type, TRUE, src, num_srcs); + } + else { + /* + * Do it element-wise. 
+ */ + + assert(src_type.length == dst_type.length); + tmp[0] = lp_build_undef(dst_type); + for (i = 0; i < dst_type.length; ++i) { + LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0); + LLVMValueRef val = LLVMBuildExtractElement(builder, src[0], index, ""); + val = LLVMBuildTrunc(builder, val, lp_build_elem_type(dst_type), ""); + tmp[0] = LLVMBuildInsertElement(builder, tmp[0], val, index, ""); + } + } + } + else if (src_type.width < dst_type.width) { + /* + * Expand bit width. + */ + + assert(num_srcs == 1); + + if (src_type.width * src_type.length == dst_type.width * dst_type.length) { + /* + * Register width remains constant -- use vector unpack intrinsics + */ + lp_build_unpack(builder, src_type, dst_type, src[0], tmp, num_dsts); + } + else { + /* + * Do it element-wise. + */ + + assert(src_type.length == dst_type.length); + tmp[0] = lp_build_undef(dst_type); + for (i = 0; i < dst_type.length; ++i) { + LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0); + LLVMValueRef val = LLVMBuildExtractElement(builder, src[0], index, ""); + + if (src_type.sign && dst_type.sign) { + val = LLVMBuildSExt(builder, val, lp_build_elem_type(dst_type), ""); + } else { + val = LLVMBuildZExt(builder, val, lp_build_elem_type(dst_type), ""); + } + tmp[0] = LLVMBuildInsertElement(builder, tmp[0], val, index, ""); + } + } + } + else { + /* + * No-op + */ + + assert(num_srcs == 1); + assert(num_dsts == 1); + + tmp[0] = src[0]; + } + + for(i = 0; i < num_dsts; ++i) + dst[i] = tmp[i]; +} + + diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.h b/src/gallium/auxiliary/gallivm/lp_bld_pack.h index 41adeed220..e470082b97 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_pack.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.h @@ -92,4 +92,12 @@ lp_build_pack(LLVMBuilderRef builder, const LLVMValueRef *src, unsigned num_srcs); +void +lp_build_resize(LLVMBuilderRef builder, + struct lp_type src_type, + struct lp_type dst_type, + const LLVMValueRef *src, unsigned num_srcs, + 
LLVMValueRef *dst, unsigned num_dsts); + + #endif /* !LP_BLD_PACK_H */ diff --git a/src/gallium/auxiliary/gallivm/lp_bld_quad.c b/src/gallium/auxiliary/gallivm/lp_bld_quad.c index 38fd5a39ef..ca36046d22 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_quad.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_quad.c @@ -61,8 +61,8 @@ LLVMValueRef lp_build_ddx(struct lp_build_context *bld, LLVMValueRef a) { - LLVMValueRef a_left = lp_build_swizzle1_aos(bld, a, swizzle_left); - LLVMValueRef a_right = lp_build_swizzle1_aos(bld, a, swizzle_right); + LLVMValueRef a_left = lp_build_swizzle_aos(bld, a, swizzle_left); + LLVMValueRef a_right = lp_build_swizzle_aos(bld, a, swizzle_right); return lp_build_sub(bld, a_right, a_left); } @@ -71,8 +71,8 @@ LLVMValueRef lp_build_ddy(struct lp_build_context *bld, LLVMValueRef a) { - LLVMValueRef a_top = lp_build_swizzle1_aos(bld, a, swizzle_top); - LLVMValueRef a_bottom = lp_build_swizzle1_aos(bld, a, swizzle_bottom); + LLVMValueRef a_top = lp_build_swizzle_aos(bld, a, swizzle_top); + LLVMValueRef a_bottom = lp_build_swizzle_aos(bld, a, swizzle_bottom); return lp_build_sub(bld, a_bottom, a_top); } diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c index 946c23e317..0fd014ab9b 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c @@ -40,7 +40,6 @@ #include "lp_bld_const.h" #include "lp_bld_arit.h" #include "lp_bld_type.h" -#include "lp_bld_format.h" #include "lp_bld_sample.h" @@ -125,73 +124,53 @@ lp_sampler_static_state(struct lp_sampler_static_state *state, /** - * Gather elements from scatter positions in memory into a single vector. - * Use for fetching texels from a texture. - * For SSE, typical values are length=4, src_width=32, dst_width=32. 
- * - * @param length length of the offsets - * @param src_width src element width in bits - * @param dst_width result element width in bits (src will be expanded to fit) - * @param base_ptr base pointer, should be a i8 pointer type. - * @param offsets vector with offsets - */ -LLVMValueRef -lp_build_gather(LLVMBuilderRef builder, - unsigned length, - unsigned src_width, - unsigned dst_width, - LLVMValueRef base_ptr, - LLVMValueRef offsets) -{ - LLVMTypeRef src_type = LLVMIntType(src_width); - LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0); - LLVMTypeRef dst_elem_type = LLVMIntType(dst_width); - LLVMTypeRef dst_vec_type = LLVMVectorType(dst_elem_type, length); - LLVMValueRef res; - unsigned i; - - res = LLVMGetUndef(dst_vec_type); - for(i = 0; i < length; ++i) { - LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0); - LLVMValueRef elem_offset; - LLVMValueRef elem_ptr; - LLVMValueRef elem; - - elem_offset = LLVMBuildExtractElement(builder, offsets, index, ""); - elem_ptr = LLVMBuildGEP(builder, base_ptr, &elem_offset, 1, ""); - elem_ptr = LLVMBuildBitCast(builder, elem_ptr, src_ptr_type, ""); - elem = LLVMBuildLoad(builder, elem_ptr, ""); - - assert(src_width <= dst_width); - if(src_width > dst_width) - elem = LLVMBuildTrunc(builder, elem, dst_elem_type, ""); - if(src_width < dst_width) - elem = LLVMBuildZExt(builder, elem, dst_elem_type, ""); - - res = LLVMBuildInsertElement(builder, res, elem, index, ""); - } - - return res; -} - - -/** * Compute the offset of a pixel block. * - * x, y, z, y_stride, z_stride are vectors, and they refer to pixel blocks, as - * per format description, and not individual pixels. + * x, y, z, y_stride, z_stride are vectors, and they refer to pixels. 
+ * + * Returns the relative offset and i,j sub-block coordinates */ -LLVMValueRef +void lp_build_sample_offset(struct lp_build_context *bld, const struct util_format_description *format_desc, LLVMValueRef x, LLVMValueRef y, LLVMValueRef z, LLVMValueRef y_stride, - LLVMValueRef z_stride) + LLVMValueRef z_stride, + LLVMValueRef *out_offset, + LLVMValueRef *out_i, + LLVMValueRef *out_j) { LLVMValueRef x_stride; LLVMValueRef offset; + LLVMValueRef i; + LLVMValueRef j; + + /* + * Describe the coordinates in terms of pixel blocks. + * + * TODO: pixel blocks are power of two. LLVM should convert rem/div to + * bit arithmetic. Verify this. + */ + + if (format_desc->block.width == 1) { + i = bld->zero; + } + else { + LLVMValueRef block_width = lp_build_const_int_vec(bld->type, format_desc->block.width); + i = LLVMBuildURem(bld->builder, x, block_width, ""); + x = LLVMBuildUDiv(bld->builder, x, block_width, ""); + } + + if (format_desc->block.height == 1) { + j = bld->zero; + } + else { + LLVMValueRef block_height = lp_build_const_int_vec(bld->type, format_desc->block.height); + j = LLVMBuildURem(bld->builder, y, block_height, ""); + y = LLVMBuildUDiv(bld->builder, y, block_height, ""); + } x_stride = lp_build_const_vec(bld->type, format_desc->block.bits/8); offset = lp_build_mul(bld, x, x_stride); @@ -206,5 +185,7 @@ lp_build_sample_offset(struct lp_build_context *bld, offset = lp_build_add(bld, offset, z_offset); } - return offset; + *out_offset = offset; + *out_i = i; + *out_j = j; } diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.h b/src/gallium/auxiliary/gallivm/lp_bld_sample.h index 51e98ab2f9..5b8f478094 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.h @@ -146,23 +146,17 @@ lp_sampler_static_state(struct lp_sampler_static_state *state, const struct pipe_sampler_state *sampler); -LLVMValueRef -lp_build_gather(LLVMBuilderRef builder, - unsigned length, - unsigned src_width, - unsigned dst_width, - 
LLVMValueRef base_ptr, - LLVMValueRef offsets); - - -LLVMValueRef +void lp_build_sample_offset(struct lp_build_context *bld, const struct util_format_description *format_desc, LLVMValueRef x, LLVMValueRef y, LLVMValueRef z, LLVMValueRef y_stride, - LLVMValueRef z_stride); + LLVMValueRef z_stride, + LLVMValueRef *out_offset, + LLVMValueRef *out_i, + LLVMValueRef *out_j); void diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c index 84c04fe272..1a20d74cac 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c @@ -50,8 +50,10 @@ #include "lp_bld_swizzle.h" #include "lp_bld_pack.h" #include "lp_bld_flow.h" +#include "lp_bld_gather.h" #include "lp_bld_format.h" #include "lp_bld_sample.h" +#include "lp_bld_quad.h" /** @@ -264,35 +266,11 @@ lp_build_sample_texel_soa(struct lp_build_sample_context *bld, } } - /* - * Describe the coordinates in terms of pixel blocks. - * - * TODO: pixel blocks are power of two. LLVM should convert rem/div to - * bit arithmetic. Verify this. 
- */ - - if (bld->format_desc->block.width == 1) { - i = bld->uint_coord_bld.zero; - } - else { - LLVMValueRef block_width = lp_build_const_int_vec(bld->uint_coord_bld.type, bld->format_desc->block.width); - i = LLVMBuildURem(bld->builder, x, block_width, ""); - x = LLVMBuildUDiv(bld->builder, x, block_width, ""); - } - - if (bld->format_desc->block.height == 1) { - j = bld->uint_coord_bld.zero; - } - else { - LLVMValueRef block_height = lp_build_const_int_vec(bld->uint_coord_bld.type, bld->format_desc->block.height); - j = LLVMBuildURem(bld->builder, y, block_height, ""); - y = LLVMBuildUDiv(bld->builder, y, block_height, ""); - } - /* convert x,y,z coords to linear offset from start of texture, in bytes */ - offset = lp_build_sample_offset(&bld->uint_coord_bld, - bld->format_desc, - x, y, z, y_stride, z_stride); + lp_build_sample_offset(&bld->uint_coord_bld, + bld->format_desc, + x, y, z, y_stride, z_stride, + &offset, &i, &j); if (use_border) { /* If we can sample the border color, it means that texcoords may @@ -344,6 +322,9 @@ lp_build_sample_texel_soa(struct lp_build_sample_context *bld, } +/** + * Fetch the texels as <4n x i8> in AoS form. 
+ */ static LLVMValueRef lp_build_sample_packed(struct lp_build_sample_context *bld, LLVMValueRef x, @@ -351,25 +332,46 @@ lp_build_sample_packed(struct lp_build_sample_context *bld, LLVMValueRef y_stride, LLVMValueRef data_array) { - LLVMValueRef offset; + LLVMValueRef offset, i, j; LLVMValueRef data_ptr; + LLVMValueRef res; - offset = lp_build_sample_offset(&bld->uint_coord_bld, - bld->format_desc, - x, y, NULL, y_stride, NULL); - - assert(bld->format_desc->block.width == 1); - assert(bld->format_desc->block.height == 1); - assert(bld->format_desc->block.bits <= bld->texel_type.width); + /* convert x,y,z coords to linear offset from start of texture, in bytes */ + lp_build_sample_offset(&bld->uint_coord_bld, + bld->format_desc, + x, y, NULL, y_stride, NULL, + &offset, &i, &j); /* get pointer to mipmap level 0 data */ data_ptr = lp_build_get_const_mipmap_level(bld, data_array, 0); - return lp_build_gather(bld->builder, - bld->texel_type.length, - bld->format_desc->block.bits, - bld->texel_type.width, - data_ptr, offset); + if (util_format_is_rgba8_variant(bld->format_desc)) { + /* Just fetch the data directly without swizzling */ + assert(bld->format_desc->block.width == 1); + assert(bld->format_desc->block.height == 1); + assert(bld->format_desc->block.bits <= bld->texel_type.width); + + res = lp_build_gather(bld->builder, + bld->texel_type.length, + bld->format_desc->block.bits, + bld->texel_type.width, + data_ptr, offset); + } + else { + struct lp_type type; + + assert(bld->texel_type.width == 32); + + memset(&type, 0, sizeof type); + type.width = 8; + type.length = bld->texel_type.length*4; + type.norm = TRUE; + + res = lp_build_fetch_rgba_aos(bld->builder, bld->format_desc, type, + data_ptr, offset, i, j); + } + + return res; } @@ -817,9 +819,8 @@ lp_build_minify(struct lp_build_sample_context *bld, /** * Generate code to compute texture level of detail (lambda). 
- * \param s vector of texcoord s values - * \param t vector of texcoord t values - * \param r vector of texcoord r values + * \param ddx partial derivatives of (s, t, r, q) with respect to X + * \param ddy partial derivatives of (s, t, r, q) with respect to Y * \param lod_bias optional float vector with the shader lod bias * \param explicit_lod optional float vector with the explicit lod * \param width scalar int texture width @@ -831,11 +832,8 @@ lp_build_minify(struct lp_build_sample_context *bld, */ static LLVMValueRef lp_build_lod_selector(struct lp_build_sample_context *bld, - LLVMValueRef s, - LLVMValueRef t, - LLVMValueRef r, - const LLVMValueRef *ddx, - const LLVMValueRef *ddy, + const LLVMValueRef ddx[4], + const LLVMValueRef ddy[4], LLVMValueRef lod_bias, /* optional */ LLVMValueRef explicit_lod, /* optional */ LLVMValueRef width, @@ -870,14 +868,6 @@ lp_build_lod_selector(struct lp_build_sample_context *bld, LLVMValueRef dtdx = NULL, dtdy = NULL, drdx = NULL, drdy = NULL; LLVMValueRef rho; - /* - * dsdx = abs(s[1] - s[0]); - * dsdy = abs(s[2] - s[0]); - * dtdx = abs(t[1] - t[0]); - * dtdy = abs(t[2] - t[0]); - * drdx = abs(r[1] - r[0]); - * drdy = abs(r[2] - r[0]); - */ dsdx = LLVMBuildExtractElement(bld->builder, ddx[0], index0, "dsdx"); dsdx = lp_build_abs(float_bld, dsdx); dsdy = LLVMBuildExtractElement(bld->builder, ddy[0], index0, "dsdy"); @@ -1287,7 +1277,7 @@ lp_build_cube_face(struct lp_build_sample_context *bld, /** - * Generate code to do cube face selection and per-face texcoords. + * Generate code to do cube face selection and compute per-face texcoords. 
*/ static void lp_build_cube_lookup(struct lp_build_sample_context *bld, @@ -1411,7 +1401,6 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld, lp_build_endif(&if_ctx2); lp_build_flow_scope_end(flow_ctx2); lp_build_flow_destroy(flow_ctx2); - *face_s = face_s2; *face_t = face_t2; *face = face2; @@ -1457,13 +1446,14 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld, int chan; if (img_filter == PIPE_TEX_FILTER_NEAREST) { + /* sample the first mipmap level */ lp_build_sample_image_nearest(bld, width0_vec, height0_vec, depth0_vec, row_stride0_vec, img_stride0_vec, data_ptr0, s, t, r, colors0); if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) { - /* sample the second mipmap level, and interp */ + /* sample the second mipmap level */ lp_build_sample_image_nearest(bld, width1_vec, height1_vec, depth1_vec, row_stride1_vec, img_stride1_vec, @@ -1473,13 +1463,14 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld, else { assert(img_filter == PIPE_TEX_FILTER_LINEAR); + /* sample the first mipmap level */ lp_build_sample_image_linear(bld, width0_vec, height0_vec, depth0_vec, row_stride0_vec, img_stride0_vec, data_ptr0, s, t, r, colors0); if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) { - /* sample the second mipmap level, and interp */ + /* sample the second mipmap level */ lp_build_sample_image_linear(bld, width1_vec, height1_vec, depth1_vec, row_stride1_vec, img_stride1_vec, @@ -1542,6 +1533,7 @@ lp_build_sample_general(struct lp_build_sample_context *bld, LLVMValueRef row_stride0_vec = NULL, row_stride1_vec = NULL; LLVMValueRef img_stride0_vec = NULL, img_stride1_vec = NULL; LLVMValueRef data_ptr0, data_ptr1 = NULL; + LLVMValueRef face_ddx[4], face_ddy[4]; /* printf("%s mip %d min %d mag %d\n", __FUNCTION__, @@ -1549,6 +1541,30 @@ lp_build_sample_general(struct lp_build_sample_context *bld, */ /* + * Choose cube face, recompute texcoords and derivatives for the chosen face. 
+ */ + if (bld->static_state->target == PIPE_TEXTURE_CUBE) { + LLVMValueRef face, face_s, face_t; + lp_build_cube_lookup(bld, s, t, r, &face, &face_s, &face_t); + s = face_s; /* vec */ + t = face_t; /* vec */ + /* use 'r' to indicate cube face */ + r = lp_build_broadcast_scalar(&bld->int_coord_bld, face); /* vec */ + + /* recompute ddx, ddy using the new (s,t) face texcoords */ + face_ddx[0] = lp_build_ddx(&bld->coord_bld, s); + face_ddx[1] = lp_build_ddx(&bld->coord_bld, t); + face_ddx[2] = NULL; + face_ddx[3] = NULL; + face_ddy[0] = lp_build_ddy(&bld->coord_bld, s); + face_ddy[1] = lp_build_ddy(&bld->coord_bld, t); + face_ddy[2] = NULL; + face_ddy[3] = NULL; + ddx = face_ddx; + ddy = face_ddy; + } + + /* * Compute the level of detail (float). */ if (min_filter != mag_filter || @@ -1556,7 +1572,7 @@ lp_build_sample_general(struct lp_build_sample_context *bld, /* Need to compute lod either to choose mipmap levels or to * distinguish between minification/magnification with one mipmap level. */ - lod = lp_build_lod_selector(bld, s, t, r, ddx, ddy, + lod = lp_build_lod_selector(bld, ddx, ddy, lod_bias, explicit_lod, width, height, depth); } @@ -1566,9 +1582,20 @@ lp_build_sample_general(struct lp_build_sample_context *bld, */ if (mip_filter == PIPE_TEX_MIPFILTER_NONE) { /* always use mip level 0 */ - ilevel0 = LLVMConstInt(LLVMInt32Type(), 0, 0); + if (bld->static_state->target == PIPE_TEXTURE_CUBE) { + /* XXX this is a work-around for an apparent bug in LLVM 2.7. + * We should be able to set ilevel0 = const(0) but that causes + * bad x86 code to be emitted. 
+ */ + lod = lp_build_const_elem(bld->coord_bld.type, 0.0); + lp_build_nearest_mip_level(bld, unit, lod, &ilevel0); + } + else { + ilevel0 = LLVMConstInt(LLVMInt32Type(), 0, 0); + } } else { + assert(lod); if (mip_filter == PIPE_TEX_MIPFILTER_NEAREST) { lp_build_nearest_mip_level(bld, unit, lod, &ilevel0); } @@ -1623,18 +1650,6 @@ lp_build_sample_general(struct lp_build_sample_context *bld, } /* - * Choose cube face, recompute per-face texcoords. - */ - if (bld->static_state->target == PIPE_TEXTURE_CUBE) { - LLVMValueRef face, face_s, face_t; - lp_build_cube_lookup(bld, s, t, r, &face, &face_s, &face_t); - s = face_s; /* vec */ - t = face_t; /* vec */ - /* use 'r' to indicate cube face */ - r = lp_build_broadcast_scalar(&bld->int_coord_bld, face); /* vec */ - } - - /* * Get pointer(s) to image data for mipmap level(s). */ data_ptr0 = lp_build_get_mipmap_level(bld, data_array, ilevel0); @@ -1712,36 +1727,6 @@ lp_build_sample_general(struct lp_build_sample_context *bld, static void -lp_build_rgba8_to_f32_soa(LLVMBuilderRef builder, - struct lp_type dst_type, - LLVMValueRef packed, - LLVMValueRef *rgba) -{ - LLVMValueRef mask = lp_build_const_int_vec(dst_type, 0xff); - unsigned chan; - - /* Decode the input vector components */ - for (chan = 0; chan < 4; ++chan) { - unsigned start = chan*8; - unsigned stop = start + 8; - LLVMValueRef input; - - input = packed; - - if(start) - input = LLVMBuildLShr(builder, input, lp_build_const_int_vec(dst_type, start), ""); - - if(stop < 32) - input = LLVMBuildAnd(builder, input, mask, ""); - - input = lp_build_unsigned_norm_to_float(builder, 8, dst_type, input); - - rgba[chan] = input; - } -} - - -static void lp_build_sample_2d_linear_aos(struct lp_build_sample_context *bld, LLVMValueRef s, LLVMValueRef t, @@ -1935,15 +1920,20 @@ lp_build_sample_2d_linear_aos(struct lp_build_sample_context *bld, * Convert to SoA and swizzle. 
*/ - packed = LLVMBuildBitCast(builder, packed, i32_vec_type, ""); - lp_build_rgba8_to_f32_soa(bld->builder, bld->texel_type, packed, unswizzled); - lp_build_format_swizzle_soa(bld->format_desc, - &bld->texel_bld, - unswizzled, texel_out); + if (util_format_is_rgba8_variant(bld->format_desc)) { + lp_build_format_swizzle_soa(bld->format_desc, + &bld->texel_bld, + unswizzled, texel_out); + } else { + texel_out[0] = unswizzled[0]; + texel_out[1] = unswizzled[1]; + texel_out[2] = unswizzled[2]; + texel_out[3] = unswizzled[3]; + } apply_sampler_swizzle(bld, texel_out); } @@ -2007,6 +1997,8 @@ lp_build_sample_nop(struct lp_build_sample_context *bld, * 'texel' will return a vector of four LLVMValueRefs corresponding to * R, G, B, A. * \param type vector float type to use for coords, etc. + * \param ddx partial derivatives of (s,t,r,q) with respect to x + * \param ddy partial derivatives of (s,t,r,q) with respect to y */ void lp_build_sample_soa(LLVMBuilderRef builder, @@ -2016,8 +2008,8 @@ lp_build_sample_soa(LLVMBuilderRef builder, unsigned unit, unsigned num_coords, const LLVMValueRef *coords, - const LLVMValueRef *ddx, - const LLVMValueRef *ddy, + const LLVMValueRef ddx[4], + const LLVMValueRef ddy[4], LLVMValueRef lod_bias, /* optional */ LLVMValueRef explicit_lod, /* optional */ LLVMValueRef texel_out[4]) @@ -2079,7 +2071,8 @@ lp_build_sample_soa(LLVMBuilderRef builder, /* For debug: no-op texture sampling */ lp_build_sample_nop(&bld, texel_out); } - else if (util_format_is_rgba8_variant(bld.format_desc) && + else if (util_format_fits_8unorm(bld.format_desc) && + bld.format_desc->nr_channels > 1 && static_state->target == PIPE_TEXTURE_2D && static_state->min_img_filter == PIPE_TEX_FILTER_LINEAR && static_state->mag_img_filter == PIPE_TEX_FILTER_LINEAR && diff --git a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c index 3c8a7bc09e..20cf96ca66 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c +++ 
b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c @@ -110,7 +110,7 @@ lp_build_broadcast_aos(struct lp_build_context *bld, /* XXX: SSE3 has PSHUFB which should be better than bitmasks, but forcing * using shuffles here actually causes worst results. More investigation is * needed. */ - if (n <= 4) { + if (type.width >= 16) { /* * Shuffle. */ @@ -132,7 +132,7 @@ lp_build_broadcast_aos(struct lp_build_context *bld, * YY00 YY00 .... YY00 * YYYY YYYY .... YYYY <= output */ - struct lp_type type4 = type; + struct lp_type type4; const char shifts[4][2] = { { 1, 2}, {-1, 2}, @@ -147,6 +147,13 @@ lp_build_broadcast_aos(struct lp_build_context *bld, a = LLVMBuildAnd(bld->builder, a, lp_build_const_mask_aos(type, cond), ""); + /* + * Build a type where each element is an integer that cover the four + * channels. + */ + + type4 = type; + type4.floating = FALSE; type4.width *= 4; type4.length /= 4; @@ -176,80 +183,170 @@ lp_build_broadcast_aos(struct lp_build_context *bld, LLVMValueRef -lp_build_swizzle1_aos(struct lp_build_context *bld, - LLVMValueRef a, - const unsigned char swizzle[4]) +lp_build_swizzle_aos(struct lp_build_context *bld, + LLVMValueRef a, + const unsigned char swizzles[4]) { - const unsigned n = bld->type.length; + const struct lp_type type = bld->type; + const unsigned n = type.length; unsigned i, j; - if(a == bld->undef || a == bld->zero || a == bld->one) + if (swizzles[0] == PIPE_SWIZZLE_RED && + swizzles[1] == PIPE_SWIZZLE_GREEN && + swizzles[2] == PIPE_SWIZZLE_BLUE && + swizzles[3] == PIPE_SWIZZLE_ALPHA) { return a; + } - if(swizzle[0] == swizzle[1] && swizzle[1] == swizzle[2] && swizzle[2] == swizzle[3]) - return lp_build_broadcast_aos(bld, a, swizzle[0]); + if (swizzles[0] == swizzles[1] && + swizzles[1] == swizzles[2] && + swizzles[2] == swizzles[3]) { + switch (swizzles[0]) { + case PIPE_SWIZZLE_RED: + case PIPE_SWIZZLE_GREEN: + case PIPE_SWIZZLE_BLUE: + case PIPE_SWIZZLE_ALPHA: + return lp_build_broadcast_aos(bld, a, swizzles[0]); + case 
PIPE_SWIZZLE_ZERO: + return bld->zero; + case PIPE_SWIZZLE_ONE: + return bld->one; + default: + assert(0); + return bld->undef; + } + } - { + if (type.width >= 16) { /* * Shuffle. */ - LLVMTypeRef elem_type = LLVMInt32Type(); + LLVMValueRef undef = LLVMGetUndef(lp_build_elem_type(type)); + LLVMTypeRef i32t = LLVMInt32Type(); LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH]; + LLVMValueRef aux[LP_MAX_VECTOR_LENGTH]; + + memset(aux, 0, sizeof aux); + + for(j = 0; j < n; j += 4) { + for(i = 0; i < 4; ++i) { + unsigned shuffle; + switch (swizzles[i]) { + default: + assert(0); + /* fall through */ + case PIPE_SWIZZLE_RED: + case PIPE_SWIZZLE_GREEN: + case PIPE_SWIZZLE_BLUE: + case PIPE_SWIZZLE_ALPHA: + shuffle = j + swizzles[i]; + break; + case PIPE_SWIZZLE_ZERO: + shuffle = type.length + 0; + if (!aux[0]) { + aux[0] = lp_build_const_elem(type, 0.0); + } + break; + case PIPE_SWIZZLE_ONE: + shuffle = type.length + 1; + if (!aux[1]) { + aux[1] = lp_build_const_elem(type, 1.0); + } + break; + } + shuffles[j + i] = LLVMConstInt(i32t, shuffle, 0); + } + } - for(j = 0; j < n; j += 4) - for(i = 0; i < 4; ++i) - shuffles[j + i] = LLVMConstInt(elem_type, j + swizzle[i], 0); - - return LLVMBuildShuffleVector(bld->builder, a, bld->undef, LLVMConstVector(shuffles, n), ""); - } -} - + for (i = 0; i < n; ++i) { + if (!aux[i]) { + aux[i] = undef; + } + } -LLVMValueRef -lp_build_swizzle2_aos(struct lp_build_context *bld, - LLVMValueRef a, - LLVMValueRef b, - const unsigned char swizzle[4]) -{ - const unsigned n = bld->type.length; - unsigned i, j; + return LLVMBuildShuffleVector(bld->builder, a, + LLVMConstVector(aux, n), + LLVMConstVector(shuffles, n), ""); + } else { + /* + * Bit mask and shifts. 
+ * + * For example, this will convert BGRA to RGBA by doing + * + * rgba = (bgra & 0x00ff0000) >> 16 + * | (bgra & 0xff00ff00) + * | (bgra & 0x000000ff) << 16 + * + * This is necessary not only for faster cause, but because X86 backend + * will refuse shuffles of <4 x i8> vectors + */ + LLVMValueRef res; + struct lp_type type4; + boolean cond[4]; + unsigned chan; + int shift; - if(swizzle[0] < 4 && swizzle[1] < 4 && swizzle[2] < 4 && swizzle[3] < 4) - return lp_build_swizzle1_aos(bld, a, swizzle); + /* + * Start with a mixture of 1 and 0. + */ + for (chan = 0; chan < 4; ++chan) { + cond[chan] = swizzles[chan] == PIPE_SWIZZLE_ONE ? TRUE : FALSE; + } + res = lp_build_select_aos(bld, bld->one, bld->zero, cond); - if(a == b) { - unsigned char swizzle1[4]; - swizzle1[0] = swizzle[0] % 4; - swizzle1[1] = swizzle[1] % 4; - swizzle1[2] = swizzle[2] % 4; - swizzle1[3] = swizzle[3] % 4; - return lp_build_swizzle1_aos(bld, a, swizzle1); - } + /* + * Build a type where each element is an integer that cover the four + * channels. + */ + type4 = type; + type4.floating = FALSE; + type4.width *= 4; + type4.length /= 4; - if(swizzle[0] % 4 == 0 && - swizzle[1] % 4 == 1 && - swizzle[2] % 4 == 2 && - swizzle[3] % 4 == 3) { - boolean cond[4]; - cond[0] = swizzle[0] / 4; - cond[1] = swizzle[1] / 4; - cond[2] = swizzle[2] / 4; - cond[3] = swizzle[3] / 4; - return lp_build_select_aos(bld, a, b, cond); - } + a = LLVMBuildBitCast(bld->builder, a, lp_build_vec_type(type4), ""); + res = LLVMBuildBitCast(bld->builder, res, lp_build_vec_type(type4), ""); - { /* - * Shuffle. 
+ * Mask and shift the channels, trying to group as many channels in the + * same shift as possible */ - LLVMTypeRef elem_type = LLVMInt32Type(); - LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH]; - - for(j = 0; j < n; j += 4) - for(i = 0; i < 4; ++i) - shuffles[j + i] = LLVMConstInt(elem_type, j + (swizzle[i] % 4) + (swizzle[i] / 4 * n), 0); + for (shift = -3; shift <= 3; ++shift) { + unsigned long long mask = 0; + + assert(type4.width <= sizeof(mask)*8); + + for (chan = 0; chan < 4; ++chan) { + /* FIXME: big endian */ + if (swizzles[chan] < 4 && + chan - swizzles[chan] == shift) { + mask |= ((1ULL << type.width) - 1) << (swizzles[chan] * type.width); + } + } + + if (mask) { + LLVMValueRef masked; + LLVMValueRef shifted; + + if (0) + debug_printf("shift = %i, mask = 0x%08llx\n", shift, mask); + + masked = LLVMBuildAnd(bld->builder, a, + lp_build_const_int_vec(type4, mask), ""); + if (shift > 0) { + shifted = LLVMBuildShl(bld->builder, masked, + lp_build_const_int_vec(type4, shift*type.width), ""); + } else if (shift < 0) { + shifted = LLVMBuildLShr(bld->builder, masked, + lp_build_const_int_vec(type4, -shift*type.width), ""); + } else { + shifted = masked; + } + + res = LLVMBuildOr(bld->builder, res, shifted, ""); + } + } - return LLVMBuildShuffleVector(bld->builder, a, b, LLVMConstVector(shuffles, n), ""); + return LLVMBuildBitCast(bld->builder, res, lp_build_vec_type(type), ""); } } diff --git a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h index 4f4fa777c9..315e1bcb54 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h @@ -68,24 +68,12 @@ lp_build_broadcast_aos(struct lp_build_context *bld, /** * Swizzle a vector consisting of an array of XYZW structs. * - * @param swizzle is the in [0,4[ range. + * @param swizzles is the in [0,4[ range. 
*/ LLVMValueRef -lp_build_swizzle1_aos(struct lp_build_context *bld, - LLVMValueRef a, - const unsigned char swizzle[4]); - - -/** - * Swizzle two vector consisting of an array of XYZW structs. - * - * @param swizzle is the in [0,8[ range. Values in [4,8[ range refer to b. - */ -LLVMValueRef -lp_build_swizzle2_aos(struct lp_build_context *bld, - LLVMValueRef a, - LLVMValueRef b, - const unsigned char swizzle[4]); +lp_build_swizzle_aos(struct lp_build_context *bld, + LLVMValueRef a, + const unsigned char swizzles[4]); LLVMValueRef diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c index dec7556138..21236839fb 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c @@ -49,6 +49,7 @@ #include "lp_bld_type.h" #include "lp_bld_const.h" #include "lp_bld_arit.h" +#include "lp_bld_gather.h" #include "lp_bld_logic.h" #include "lp_bld_swizzle.h" #include "lp_bld_flow.h" @@ -132,10 +133,14 @@ struct lp_build_tgsi_soa_context LLVMValueRef addr[LP_MAX_TGSI_ADDRS][NUM_CHANNELS]; LLVMValueRef preds[LP_MAX_TGSI_PREDS][NUM_CHANNELS]; - /* we allocate an array of temps if we have indirect - * addressing and then the temps above is unused */ + /* We allocate/use this array of temps if (1 << TGSI_FILE_TEMPORARY) is + * set in the indirect_files field. + * The temps[] array above is unused then. + */ LLVMValueRef temps_array; - boolean has_indirect_addressing; + + /** bitmask indicating which register files are accessed indirectly */ + unsigned indirect_files; struct lp_build_mask_context *mask; struct lp_exec_mask exec_mask; @@ -404,25 +409,92 @@ static void lp_exec_mask_endsub(struct lp_exec_mask *mask, int *pc) lp_exec_mask_update(mask); } + +/** + * Return pointer to a temporary register channel (src or dest). + * Note that indirect addressing cannot be handled here. + * \param index which temporary register + * \param chan which channel of the temp register. 
+ */ static LLVMValueRef get_temp_ptr(struct lp_build_tgsi_soa_context *bld, unsigned index, - unsigned chan, - boolean is_indirect, - LLVMValueRef addr) + unsigned chan) { assert(chan < 4); - if (!bld->has_indirect_addressing) { - return bld->temps[index][chan]; - } else { - LLVMValueRef lindex = - LLVMConstInt(LLVMInt32Type(), index * 4 + chan, 0); - if (is_indirect) - lindex = lp_build_add(&bld->base, lindex, addr); + if (bld->indirect_files & (1 << TGSI_FILE_TEMPORARY)) { + LLVMValueRef lindex = lp_build_const_int32(index * 4 + chan); return LLVMBuildGEP(bld->base.builder, bld->temps_array, &lindex, 1, ""); } + else { + return bld->temps[index][chan]; + } } + +/** + * Gather vector. + * XXX the lp_build_gather() function should be capable of doing this + * with a little work. + */ +static LLVMValueRef +build_gather(struct lp_build_tgsi_soa_context *bld, + LLVMValueRef base_ptr, + LLVMValueRef indexes) +{ + LLVMValueRef res = bld->base.undef; + unsigned i; + + /* + * Loop over elements of index_vec, load scalar value, insert it into 'res'. + */ + for (i = 0; i < bld->base.type.length; i++) { + LLVMValueRef ii = LLVMConstInt(LLVMInt32Type(), i, 0); + LLVMValueRef index = LLVMBuildExtractElement(bld->base.builder, + indexes, ii, ""); + LLVMValueRef scalar_ptr = LLVMBuildGEP(bld->base.builder, base_ptr, + &index, 1, ""); + LLVMValueRef scalar = LLVMBuildLoad(bld->base.builder, scalar_ptr, ""); + + res = LLVMBuildInsertElement(bld->base.builder, res, scalar, ii, ""); + } + + return res; +} + + +/** + * Read the current value of the ADDR register, convert the floats to + * ints, multiply by four and return the vector of offsets. + * The offsets will be used to index into the constant buffer or + * temporary register file. 
+ */ +static LLVMValueRef +get_indirect_offsets(struct lp_build_tgsi_soa_context *bld, + const struct tgsi_src_register *indirect_reg) +{ + /* always use X component of address register */ + const int x = indirect_reg->SwizzleX; + LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->base.type); + uint swizzle = tgsi_util_get_src_register_swizzle(indirect_reg, x); + LLVMValueRef vec4 = lp_build_const_int_vec(bld->int_bld.type, 4); + LLVMValueRef addr_vec; + + addr_vec = LLVMBuildLoad(bld->base.builder, + bld->addr[indirect_reg->Index][swizzle], + "load addr reg"); + + /* for indexing we want integers */ + addr_vec = LLVMBuildFPToSI(bld->base.builder, addr_vec, + int_vec_type, ""); + + /* addr_vec = addr_vec * 4 */ + addr_vec = lp_build_mul(&bld->base, addr_vec, vec4); + + return addr_vec; +} + + /** * Register fetch. */ @@ -430,14 +502,14 @@ static LLVMValueRef emit_fetch( struct lp_build_tgsi_soa_context *bld, const struct tgsi_full_instruction *inst, - unsigned index, + unsigned src_op, const unsigned chan_index ) { - const struct tgsi_full_src_register *reg = &inst->Src[index]; + const struct tgsi_full_src_register *reg = &inst->Src[src_op]; const unsigned swizzle = tgsi_util_get_full_src_register_swizzle(reg, chan_index); LLVMValueRef res; - LLVMValueRef addr = NULL; + LLVMValueRef addr_vec = NULL; if (swizzle > 3) { assert(0 && "invalid swizzle in emit_fetch()"); @@ -445,32 +517,33 @@ emit_fetch( } if (reg->Register.Indirect) { - LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->base.type); - unsigned swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, chan_index ); - addr = LLVMBuildLoad(bld->base.builder, - bld->addr[reg->Indirect.Index][swizzle], - ""); - /* for indexing we want integers */ - addr = LLVMBuildFPToSI(bld->base.builder, addr, - int_vec_type, ""); - addr = LLVMBuildExtractElement(bld->base.builder, - addr, LLVMConstInt(LLVMInt32Type(), 0, 0), - ""); - addr = lp_build_mul(&bld->base, addr, LLVMConstInt(LLVMInt32Type(), 4, 0)); + 
assert(bld->indirect_files); + addr_vec = get_indirect_offsets(bld, &reg->Indirect); } switch (reg->Register.File) { case TGSI_FILE_CONSTANT: - { - LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), - reg->Register.Index*4 + swizzle, 0); + if (reg->Register.Indirect) { + LLVMValueRef index_vec; /* index into the const buffer */ + + assert(bld->indirect_files & (1 << TGSI_FILE_CONSTANT)); + + /* index_vec = broadcast(reg->Register.Index * 4 + swizzle) */ + index_vec = lp_build_const_int_vec(bld->int_bld.type, + reg->Register.Index * 4 + swizzle); + + /* index_vec = index_vec + addr_vec */ + index_vec = lp_build_add(&bld->base, index_vec, addr_vec); + + /* Gather values from the constant buffer */ + res = build_gather(bld, bld->consts_ptr, index_vec); + } + else { + LLVMValueRef index; /* index into the const buffer */ LLVMValueRef scalar, scalar_ptr; - if (reg->Register.Indirect) { - /*lp_build_printf(bld->base.builder, - "\taddr = %d\n", addr);*/ - index = lp_build_add(&bld->base, index, addr); - } + index = lp_build_const_int32(reg->Register.Index*4 + swizzle); + scalar_ptr = LLVMBuildGEP(bld->base.builder, bld->consts_ptr, &index, 1, ""); scalar = LLVMBuildLoad(bld->base.builder, scalar_ptr, ""); @@ -490,13 +563,38 @@ emit_fetch( break; case TGSI_FILE_TEMPORARY: - { - LLVMValueRef temp_ptr = get_temp_ptr(bld, reg->Register.Index, - swizzle, - reg->Register.Indirect, - addr); + if (reg->Register.Indirect) { + LLVMValueRef vec_len = + lp_build_const_int_vec(bld->int_bld.type, bld->base.type.length); + LLVMValueRef index_vec; /* index into the const buffer */ + LLVMValueRef temps_array; + LLVMTypeRef float4_ptr_type; + + assert(bld->indirect_files & (1 << TGSI_FILE_TEMPORARY)); + + /* index_vec = broadcast(reg->Register.Index * 4 + swizzle) */ + index_vec = lp_build_const_int_vec(bld->int_bld.type, + reg->Register.Index * 4 + swizzle); + + /* index_vec += addr_vec */ + index_vec = lp_build_add(&bld->int_bld, index_vec, addr_vec); + + /* index_vec *= vector_length */ + 
index_vec = lp_build_mul(&bld->int_bld, index_vec, vec_len); + + /* cast temps_array pointer to float* */ + float4_ptr_type = LLVMPointerType(LLVMFloatType(), 0); + temps_array = LLVMBuildBitCast(bld->int_bld.builder, bld->temps_array, + float4_ptr_type, ""); + + /* Gather values from the temporary register array */ + res = build_gather(bld, temps_array, index_vec); + } + else { + LLVMValueRef temp_ptr; + temp_ptr = get_temp_ptr(bld, reg->Register.Index, swizzle); res = LLVMBuildLoad(bld->base.builder, temp_ptr, ""); - if(!res) + if (!res) return bld->base.undef; } break; @@ -660,8 +758,12 @@ emit_store( } if (reg->Register.Indirect) { + /* XXX use get_indirect_offsets() here eventually */ LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->base.type); unsigned swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, chan_index ); + + assert(bld->indirect_files); + addr = LLVMBuildLoad(bld->base.builder, bld->addr[reg->Indirect.Index][swizzle], ""); @@ -680,14 +782,18 @@ emit_store( bld->outputs[reg->Register.Index][chan_index]); break; - case TGSI_FILE_TEMPORARY: { - LLVMValueRef temp_ptr = get_temp_ptr(bld, reg->Register.Index, - chan_index, - reg->Register.Indirect, - addr); - lp_exec_mask_store(&bld->exec_mask, pred, value, temp_ptr); + case TGSI_FILE_TEMPORARY: + if (reg->Register.Indirect) { + /* XXX not done yet */ + debug_printf("WARNING: LLVM scatter store of temp regs" + " not implemented\n"); + } + else { + LLVMValueRef temp_ptr = get_temp_ptr(bld, reg->Register.Index, + chan_index); + lp_exec_mask_store(&bld->exec_mask, pred, value, temp_ptr); + } break; - } case TGSI_FILE_ADDRESS: lp_exec_mask_store(&bld->exec_mask, pred, value, @@ -905,7 +1011,7 @@ emit_declaration( switch (decl->Declaration.File) { case TGSI_FILE_TEMPORARY: assert(idx < LP_MAX_TGSI_TEMPS); - if (bld->has_indirect_addressing) { + if (bld->indirect_files & (1 << TGSI_FILE_TEMPORARY)) { LLVMValueRef array_size = LLVMConstInt(LLVMInt32Type(), last*4 + 4, 0); bld->temps_array = 
lp_build_array_alloca(bld->base.builder, @@ -1929,8 +2035,7 @@ lp_build_tgsi_soa(LLVMBuilderRef builder, bld.outputs = outputs; bld.consts_ptr = consts_ptr; bld.sampler = sampler; - bld.has_indirect_addressing = info->opcode_count[TGSI_OPCODE_ARR] > 0 || - info->opcode_count[TGSI_OPCODE_ARL] > 0; + bld.indirect_files = info->indirect_files; bld.instructions = (struct tgsi_full_instruction *) MALLOC( LP_MAX_INSTRUCTIONS * sizeof(struct tgsi_full_instruction) ); bld.max_instructions = LP_MAX_INSTRUCTIONS; diff --git a/src/gallium/auxiliary/gallivm/lp_bld_type.h b/src/gallium/auxiliary/gallivm/lp_bld_type.h index df77ef2155..3ffe916f8e 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_type.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_type.h @@ -316,6 +316,54 @@ LLVMTypeRef lp_build_int32_vec4_type(void); +static INLINE struct lp_type +lp_float32_vec4_type(void) +{ + struct lp_type type; + + memset(&type, 0, sizeof(type)); + type.floating = TRUE; + type.sign = TRUE; + type.norm = FALSE; + type.width = 32; + type.length = 4; + + return type; +} + + +static INLINE struct lp_type +lp_int32_vec4_type(void) +{ + struct lp_type type; + + memset(&type, 0, sizeof(type)); + type.floating = FALSE; + type.sign = TRUE; + type.norm = FALSE; + type.width = 32; + type.length = 4; + + return type; +} + + +static INLINE struct lp_type +lp_unorm8_vec4_type(void) +{ + struct lp_type type; + + memset(&type, 0, sizeof(type)); + type.floating = FALSE; + type.sign = FALSE; + type.norm = TRUE; + type.width = 8; + type.length = 4; + + return type; +} + + struct lp_type lp_uint_type(struct lp_type type); |