summaryrefslogtreecommitdiff
path: root/src/gallium/drivers/llvmpipe/lp_bld_arit.c
diff options
context:
space:
mode:
authorJosé Fonseca <jfonseca@vmware.com>2009-08-03 00:01:27 +0100
committerJosé Fonseca <jfonseca@vmware.com>2009-08-29 09:21:22 +0100
commitede73258a7604109b257bddb029b5f4dad5eb09d (patch)
treee0c4569483e42411014239b03fcf3d674b4827e4 /src/gallium/drivers/llvmpipe/lp_bld_arit.c
parenta6622e6c544d3530a463d6a274a15bfae58f7ccc (diff)
llvmpipe: Get blending of normalized 8bit unsigned integers working.
Diffstat (limited to 'src/gallium/drivers/llvmpipe/lp_bld_arit.c')
-rw-r--r--src/gallium/drivers/llvmpipe/lp_bld_arit.c169
1 files changed, 163 insertions, 6 deletions
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_arit.c b/src/gallium/drivers/llvmpipe/lp_bld_arit.c
index f45b7d82f1..ba272df296 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_arit.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_arit.c
@@ -184,7 +184,7 @@ lp_build_one(union lp_type type)
LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
unsigned i;
- assert(type.length < LP_MAX_VECTOR_LENGTH);
+ assert(type.length <= LP_MAX_VECTOR_LENGTH);
elem_type = lp_build_elem_type(type);
@@ -224,7 +224,7 @@ lp_build_const_aos(union lp_type type,
unsigned i;
assert(type.length % 4 == 0);
- assert(type.length < LP_MAX_VECTOR_LENGTH);
+ assert(type.length <= LP_MAX_VECTOR_LENGTH);
elem_type = lp_build_elem_type(type);
@@ -421,9 +421,9 @@ lp_build_add(struct lp_build_context *bld,
if(type.width * type.length == 128 &&
!type.floating && !type.fixed) {
if(type.width == 8)
- intrinsic = type.sign ? "llvm.x86.sse2.adds.b" : "llvm.x86.sse2.addus.b";
+ intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
if(type.width == 16)
- intrinsic = type.sign ? "llvm.x86.sse2.adds.w" : "llvm.x86.sse2.addus.w";
+ intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
}
#endif
@@ -468,9 +468,9 @@ lp_build_sub(struct lp_build_context *bld,
if(type.width * type.length == 128 &&
!type.floating && !type.fixed) {
if(type.width == 8)
- intrinsic = type.sign ? "llvm.x86.sse2.subs.b" : "llvm.x86.sse2.subus.b";
+ intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
if(type.width == 16)
- intrinsic = type.sign ? "llvm.x86.sse2.subs.w" : "llvm.x86.sse2.subus.w";
+ intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
}
#endif
@@ -490,11 +490,124 @@ lp_build_sub(struct lp_build_context *bld,
}
+/**
+ * Build shuffle vectors that match PUNPCKLxx and PUNPCKHxx instructions.
+ */
+static LLVMValueRef
+lp_build_unpack_shuffle(unsigned n, unsigned lo_hi)
+{
+ LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
+ unsigned i, j;
+
+ assert(n <= LP_MAX_VECTOR_LENGTH);
+ assert(lo_hi < 2);
+
+ for(i = 0, j = lo_hi*n/2; i < n; i += 2, ++j) {
+ elems[i + 0] = LLVMConstInt(LLVMInt32Type(), 0 + j, 0);
+ elems[i + 1] = LLVMConstInt(LLVMInt32Type(), n + j, 0);
+ }
+
+ return LLVMConstVector(elems, n);
+}
+
+
+static LLVMValueRef
+lp_build_const_vec(LLVMTypeRef type, unsigned n, long long c)
+{
+ LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
+ unsigned i;
+
+ assert(n <= LP_MAX_VECTOR_LENGTH);
+
+ for(i = 0; i < n; ++i)
+ elems[i] = LLVMConstInt(type, c, 0);
+
+ return LLVMConstVector(elems, n);
+}
+
+
+/**
+ * Normalized 8bit multiplication.
+ *
+ * - alpha plus one
+ *
+ * makes the following approximation to the division (Sree)
+ *
+ * a*b/255 ~= (a*(b + 1)) >> 256
+ *
+ * which is the fastest method that satisfies the following OpenGL criteria
+ *
+ * 0*0 = 0 and 255*255 = 255
+ *
+ * - geometric series
+ *
+ * takes the geometric series approximation to the division
+ *
+ * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
+ *
+ * in this case just the first two terms to fit in 16bit arithmetic
+ *
+ * t/255 ~= (t + (t >> 8)) >> 8
+ *
+ * note that just by itself it doesn't satisfies the OpenGL criteria, as
+ * 255*255 = 254, so the special case b = 255 must be accounted or roundoff
+ * must be used
+ *
+ * - geometric series plus rounding
+ *
+ * when using a geometric series division instead of truncating the result
+ * use roundoff in the approximation (Jim Blinn)
+ *
+ * t/255 ~= (t + (t >> 8) + 0x80) >> 8
+ *
+ * achieving the exact results
+ *
+ * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
+ * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
+ * @sa Michael Herf, The "double blend trick", May 2000,
+ * http://www.stereopsis.com/doubleblend.html
+ */
+static LLVMValueRef
+lp_build_mul_u8n(LLVMBuilderRef builder,
+ LLVMValueRef a, LLVMValueRef b)
+{
+ static LLVMValueRef c01 = NULL;
+ static LLVMValueRef c08 = NULL;
+ static LLVMValueRef c80 = NULL;
+ LLVMValueRef ab;
+
+ if(!c01) c01 = lp_build_const_vec(LLVMInt16Type(), 8, 0x01);
+ if(!c08) c08 = lp_build_const_vec(LLVMInt16Type(), 8, 0x08);
+ if(!c80) c80 = lp_build_const_vec(LLVMInt16Type(), 8, 0x80);
+
+#if 0
+
+ /* a*b/255 ~= (a*(b + 1)) >> 256 */
+ b = LLVMBuildAdd(builder, b, c01, "");
+ ab = LLVMBuildMul(builder, a, b, "");
+
+#else
+
+ /* t/255 ~= (t + (t >> 8) + 0x80) >> 8 */
+ ab = LLVMBuildMul(builder, a, b, "");
+ ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, c08, ""), "");
+ ab = LLVMBuildAdd(builder, ab, c80, "");
+
+#endif
+
+ ab = LLVMBuildLShr(builder, ab, c08, "");
+
+ return ab;
+}
+
+
LLVMValueRef
lp_build_mul(struct lp_build_context *bld,
LLVMValueRef a,
LLVMValueRef b)
{
+ const union lp_type type = bld->type;
+
if(a == bld->zero)
return bld->zero;
if(a == bld->one)
@@ -506,6 +619,50 @@ lp_build_mul(struct lp_build_context *bld,
if(a == bld->undef || b == bld->undef)
return bld->undef;
+ if(!type.floating && !type.fixed && type.norm) {
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
+ if(type.width == 8 && type.length == 16) {
+ LLVMTypeRef i16x8 = LLVMVectorType(LLVMInt16Type(), 8);
+ LLVMTypeRef i8x16 = LLVMVectorType(LLVMInt8Type(), 16);
+ static LLVMValueRef ml = NULL;
+ static LLVMValueRef mh = NULL;
+ LLVMValueRef al, ah, bl, bh;
+ LLVMValueRef abl, abh;
+ LLVMValueRef ab;
+
+ if(!ml) ml = lp_build_unpack_shuffle(16, 0);
+ if(!mh) mh = lp_build_unpack_shuffle(16, 1);
+
+ /* PUNPCKLBW, PUNPCKHBW */
+ al = LLVMBuildShuffleVector(bld->builder, a, bld->zero, ml, "");
+ bl = LLVMBuildShuffleVector(bld->builder, b, bld->zero, ml, "");
+ ah = LLVMBuildShuffleVector(bld->builder, a, bld->zero, mh, "");
+ bh = LLVMBuildShuffleVector(bld->builder, b, bld->zero, mh, "");
+
+ /* NOP */
+ al = LLVMBuildBitCast(bld->builder, al, i16x8, "");
+ bl = LLVMBuildBitCast(bld->builder, bl, i16x8, "");
+ ah = LLVMBuildBitCast(bld->builder, ah, i16x8, "");
+ bh = LLVMBuildBitCast(bld->builder, bh, i16x8, "");
+
+ /* PMULLW, PSRLW, PADDW */
+ abl = lp_build_mul_u8n(bld->builder, al, bl);
+ abh = lp_build_mul_u8n(bld->builder, ah, bh);
+
+ /* PACKUSWB */
+ ab = lp_build_intrinsic_binary(bld->builder, "llvm.x86.sse2.packuswb.128" , abl, abh);
+
+ /* NOP */
+ ab = LLVMBuildBitCast(bld->builder, ab, i8x16, "");
+
+ return ab;
+ }
+#endif
+
+ /* FIXME */
+ assert(0);
+ }
+
if(LLVMIsConstant(a) && LLVMIsConstant(b))
return LLVMConstMul(a, b);