1 files changed, 427 insertions, 84 deletions
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
index dce3c3745b..00f419a486 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -614,17 +614,15 @@ lp_build_div(struct lp_build_context *bld,
 
 
 /**
- * Linear interpolation.
- *
- * This also works for integer values with a few caveats.
+ * Linear interpolation -- without any checks.
  *
  * @sa http://www.stereopsis.com/doubleblend.html
  */
-LLVMValueRef
-lp_build_lerp(struct lp_build_context *bld,
-              LLVMValueRef x,
-              LLVMValueRef v0,
-              LLVMValueRef v1)
+static INLINE LLVMValueRef
+lp_build_lerp_simple(struct lp_build_context *bld,
+                     LLVMValueRef x,
+                     LLVMValueRef v0,
+                     LLVMValueRef v1)
 {
    LLVMValueRef delta;
    LLVMValueRef res;
@@ -639,12 +637,80 @@ lp_build_lerp(struct lp_build_context *bld,
 
    res = lp_build_add(bld, v0, res);
 
-   if(bld->type.fixed)
+   if (bld->type.fixed) {
       /* XXX: This step is necessary for lerping 8bit colors stored on 16bits,
        * but it will be wrong for other uses. Basically we need a more
        * powerful lp_type, capable of further distinguishing the values
        * interpretation from the value storage. */
       res = LLVMBuildAnd(bld->builder, res, lp_build_const_int_vec(bld->type, (1 << bld->type.width/2) - 1), "");
+   }
+
+   return res;
+}
+
+
+/**
+ * Linear interpolation.
+ */
+LLVMValueRef
+lp_build_lerp(struct lp_build_context *bld,
+              LLVMValueRef x,
+              LLVMValueRef v0,
+              LLVMValueRef v1)
+{
+   const struct lp_type type = bld->type;
+   LLVMValueRef res;
+
+   assert(lp_check_value(type, x));
+   assert(lp_check_value(type, v0));
+   assert(lp_check_value(type, v1));
+
+   if (type.norm) {
+      struct lp_type wide_type;
+      struct lp_build_context wide_bld;
+      LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
+      LLVMValueRef shift;
+
+      assert(type.length >= 2);
+      assert(!type.sign);
+
+      /*
+       * Create a wider type, enough to hold the intermediate result of the
+       * multiplication.
+       */
+      memset(&wide_type, 0, sizeof wide_type);
+      wide_type.fixed  = TRUE;
+      wide_type.width  = type.width*2;
+      wide_type.length = type.length/2;
+
+      lp_build_context_init(&wide_bld, bld->builder, wide_type);
+
+      lp_build_unpack2(bld->builder, type, wide_type, x,  &xl,  &xh);
+      lp_build_unpack2(bld->builder, type, wide_type, v0, &v0l, &v0h);
+      lp_build_unpack2(bld->builder, type, wide_type, v1, &v1l, &v1h);
+
+      /*
+       * Scale x from [0, 255] to [0, 256]
+       */
+
+      shift = lp_build_const_int_vec(wide_type, type.width - 1);
+
+      xl = lp_build_add(&wide_bld, xl,
+                        LLVMBuildAShr(bld->builder, xl, shift, ""));
+      xh = lp_build_add(&wide_bld, xh,
+                        LLVMBuildAShr(bld->builder, xh, shift, ""));
+
+      /*
+       * Lerp both halves.
+       */
+
+      resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l);
+      resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h);
+
+      res = lp_build_pack2(bld->builder, wide_type, type, resl, resh);
+   } else {
+      res = lp_build_lerp_simple(bld, x, v0, v1);
+   }
 
    return res;
 }
@@ -923,35 +989,122 @@ lp_build_round_sse41(struct lp_build_context *bld,
                      enum lp_build_round_sse41_mode mode)
 {
    const struct lp_type type = bld->type;
-   LLVMTypeRef vec_type = lp_build_vec_type(type);
+   LLVMTypeRef i32t = LLVMInt32Type();
    const char *intrinsic;
+   LLVMValueRef res;
 
    assert(type.floating);
-   assert(type.width*type.length == 128);
+
    assert(lp_check_value(type, a));
    assert(util_cpu_caps.has_sse4_1);
 
-   switch(type.width) {
-   case 32:
-      intrinsic = "llvm.x86.sse41.round.ps";
-      break;
-   case 64:
-      intrinsic = "llvm.x86.sse41.round.pd";
-      break;
-   default:
-      assert(0);
-      return bld->undef;
+   if (type.length == 1) {
+      LLVMTypeRef vec_type;
+      LLVMValueRef undef;
+      LLVMValueRef args[3];
+      LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
+
+      switch(type.width) {
+      case 32:
+         intrinsic = "llvm.x86.sse41.round.ss";
+         break;
+      case 64:
+         intrinsic = "llvm.x86.sse41.round.sd";
+         break;
+      default:
+         assert(0);
+         return bld->undef;
+      }
+
+      vec_type = LLVMVectorType(bld->elem_type, 4);
+
+      undef = LLVMGetUndef(vec_type);
+
+      args[0] = undef;
+      args[1] = LLVMBuildInsertElement(bld->builder, undef, a, index0, "");
+      args[2] = LLVMConstInt(i32t, mode, 0);
+
+      res = lp_build_intrinsic(bld->builder, intrinsic,
+                               vec_type, args, Elements(args));
+
+      res = LLVMBuildExtractElement(bld->builder, res, index0, "");
+   }
+   else {
+      assert(type.width*type.length == 128);
+
+      switch(type.width) {
+      case 32:
+         intrinsic = "llvm.x86.sse41.round.ps";
+         break;
+      case 64:
+         intrinsic = "llvm.x86.sse41.round.pd";
+         break;
+      default:
+         assert(0);
+         return bld->undef;
+      }
+
+      res = lp_build_intrinsic_binary(bld->builder, intrinsic,
+                                      bld->vec_type, a,
+                                      LLVMConstInt(i32t, mode, 0));
+   }
+
+   return res;
+}
+
+
+static INLINE LLVMValueRef
+lp_build_iround_nearest_sse2(struct lp_build_context *bld,
+                             LLVMValueRef a)
+{
+   const struct lp_type type = bld->type;
+   LLVMTypeRef i32t = LLVMInt32Type();
+   LLVMTypeRef ret_type = lp_build_int_vec_type(type);
+   const char *intrinsic;
+   LLVMValueRef res;
+
+   assert(type.floating);
+   /* using the double precision conversions is a bit more complicated */
+   assert(type.width == 32);
+
+   assert(lp_check_value(type, a));
+   assert(util_cpu_caps.has_sse2);
+
+   /* This is relying on MXCSR rounding mode, which should always be nearest. */
+   if (type.length == 1) {
+      LLVMTypeRef vec_type;
+      LLVMValueRef undef;
+      LLVMValueRef arg;
+      LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
+
+      vec_type = LLVMVectorType(bld->elem_type, 4);
+
+      intrinsic = "llvm.x86.sse.cvtss2si";
+
+      undef = LLVMGetUndef(vec_type);
+
+      arg = LLVMBuildInsertElement(bld->builder, undef, a, index0, "");
+
+      res = lp_build_intrinsic_unary(bld->builder, intrinsic,
+                                     ret_type, arg);
+   }
+   else {
+      assert(type.width*type.length == 128);
+
+      intrinsic = "llvm.x86.sse2.cvtps2dq";
+
+      res = lp_build_intrinsic_unary(bld->builder, intrinsic,
+                                     ret_type, a);
    }
 
-   return lp_build_intrinsic_binary(bld->builder, intrinsic, vec_type, a,
-                                    LLVMConstInt(LLVMInt32Type(), mode, 0));
+   return res;
 }
 
 
 /**
- * Return the integer part of a float (vector) value.  The returned value is
- * a float (vector).
- * Ex: trunc(-1.5) = 1.0
+ * Return the integer part of a float (vector) value (== round toward zero).
+ * The returned value is a float (vector).
+ * Ex: trunc(-1.5) = -1.0
  */
 LLVMValueRef
 lp_build_trunc(struct lp_build_context *bld,
@@ -962,8 +1115,10 @@ lp_build_trunc(struct lp_build_context *bld,
    assert(type.floating);
    assert(lp_check_value(type, a));
 
-   if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
+   if (util_cpu_caps.has_sse4_1 &&
+       (type.length == 1 || type.width*type.length == 128)) {
       return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);
+   }
    else {
       LLVMTypeRef vec_type = lp_build_vec_type(type);
       LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
@@ -990,8 +1145,10 @@ lp_build_round(struct lp_build_context *bld,
    assert(type.floating);
    assert(lp_check_value(type, a));
 
-   if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
+   if (util_cpu_caps.has_sse4_1 &&
+       (type.length == 1 || type.width*type.length == 128)) {
       return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
+   }
    else {
       LLVMTypeRef vec_type = lp_build_vec_type(type);
       LLVMValueRef res;
@@ -1016,8 +1173,10 @@ lp_build_floor(struct lp_build_context *bld,
    assert(type.floating);
    assert(lp_check_value(type, a));
 
-   if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
+   if (util_cpu_caps.has_sse4_1 &&
+       (type.length == 1 || type.width*type.length == 128)) {
       return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
+   }
    else {
       LLVMTypeRef vec_type = lp_build_vec_type(type);
       LLVMValueRef res;
@@ -1042,8 +1201,10 @@ lp_build_ceil(struct lp_build_context *bld,
    assert(type.floating);
    assert(lp_check_value(type, a));
 
-   if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
+   if (util_cpu_caps.has_sse4_1 &&
+       (type.length == 1 || type.width*type.length == 128)) {
       return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
+   }
    else {
       LLVMTypeRef vec_type = lp_build_vec_type(type);
       LLVMValueRef res;
@@ -1068,9 +1229,9 @@ lp_build_fract(struct lp_build_context *bld,
 
 
 /**
- * Return the integer part of a float (vector) value.  The returned value is
- * an integer (vector).
- * Ex: itrunc(-1.5) = 1
+ * Return the integer part of a float (vector) value (== round toward zero).
+ * The returned value is an integer (vector).
+ * Ex: itrunc(-1.5) = -1
  */
 LLVMValueRef
 lp_build_itrunc(struct lp_build_context *bld,
@@ -1097,31 +1258,40 @@ lp_build_iround(struct lp_build_context *bld,
                 LLVMValueRef a)
 {
    const struct lp_type type = bld->type;
-   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
+   LLVMTypeRef int_vec_type = bld->int_vec_type;
    LLVMValueRef res;
 
    assert(type.floating);
 
    assert(lp_check_value(type, a));
 
-   if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) {
+   if (util_cpu_caps.has_sse2 &&
+       ((type.width == 32) && (type.length == 1 || type.length == 4))) {
+      return lp_build_iround_nearest_sse2(bld, a);
+   }
+   else if (util_cpu_caps.has_sse4_1 &&
+       (type.length == 1 || type.width*type.length == 128)) {
       res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
    }
    else {
-      LLVMTypeRef vec_type = lp_build_vec_type(type);
-      LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
-      LLVMValueRef sign;
       LLVMValueRef half;
 
-      /* get sign bit */
-      sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
-      sign = LLVMBuildAnd(bld->builder, sign, mask, "");
-
-      /* sign * 0.5 */
       half = lp_build_const_vec(type, 0.5);
-      half = LLVMBuildBitCast(bld->builder, half, int_vec_type, "");
-      half = LLVMBuildOr(bld->builder, sign, half, "");
-      half = LLVMBuildBitCast(bld->builder, half, vec_type, "");
+
+      if (type.sign) {
+         LLVMTypeRef vec_type = bld->vec_type;
+         LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
+         LLVMValueRef sign;
+
+         /* get sign bit */
+         sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
+         sign = LLVMBuildAnd(bld->builder, sign, mask, "");
+
+         /* sign * 0.5 */
+         half = LLVMBuildBitCast(bld->builder, half, int_vec_type, "");
+         half = LLVMBuildOr(bld->builder, sign, half, "");
+         half = LLVMBuildBitCast(bld->builder, half, vec_type, "");
+      }
 
       res = LLVMBuildFAdd(bld->builder, a, half, "");
    }
@@ -1142,37 +1312,42 @@ lp_build_ifloor(struct lp_build_context *bld,
                 LLVMValueRef a)
 {
    const struct lp_type type = bld->type;
-   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
+   LLVMTypeRef int_vec_type = bld->int_vec_type;
    LLVMValueRef res;
 
    assert(type.floating);
    assert(lp_check_value(type, a));
 
-   if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) {
+   if (util_cpu_caps.has_sse4_1 &&
+       (type.length == 1 || type.width*type.length == 128)) {
       res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
    }
    else {
-      /* Take the sign bit and add it to 1 constant */
-      LLVMTypeRef vec_type = lp_build_vec_type(type);
-      unsigned mantissa = lp_mantissa(type);
-      LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
-      LLVMValueRef sign;
-      LLVMValueRef offset;
-
-      /* sign = a < 0 ? ~0 : 0 */
-      sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
-      sign = LLVMBuildAnd(bld->builder, sign, mask, "");
-      sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "ifloor.sign");
-
-      /* offset = -0.99999(9)f */
-      offset = lp_build_const_vec(type, -(double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
-      offset = LLVMConstBitCast(offset, int_vec_type);
-
-      /* offset = a < 0 ? offset : 0.0f */
-      offset = LLVMBuildAnd(bld->builder, offset, sign, "");
-      offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "ifloor.offset");
-
-      res = LLVMBuildFAdd(bld->builder, a, offset, "ifloor.res");
+      res = a;
+
+      if (type.sign) {
+         /* Take the sign bit and add it to 1 constant */
+         LLVMTypeRef vec_type = bld->vec_type;
+         unsigned mantissa = lp_mantissa(type);
+         LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
+         LLVMValueRef sign;
+         LLVMValueRef offset;
+
+         /* sign = a < 0 ? ~0 : 0 */
+         sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
+         sign = LLVMBuildAnd(bld->builder, sign, mask, "");
+         sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "ifloor.sign");
+
+         /* offset = -0.99999(9)f */
+         offset = lp_build_const_vec(type, -(double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
+         offset = LLVMConstBitCast(offset, int_vec_type);
+
+         /* offset = a < 0 ? offset : 0.0f */
+         offset = LLVMBuildAnd(bld->builder, offset, sign, "");
+         offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "ifloor.offset");
+
+         res = LLVMBuildFAdd(bld->builder, res, offset, "ifloor.res");
+      }
    }
 
    /* round to nearest (toward zero) */
@@ -1192,35 +1367,39 @@ lp_build_iceil(struct lp_build_context *bld,
                LLVMValueRef a)
 {
    const struct lp_type type = bld->type;
-   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
+   LLVMTypeRef int_vec_type = bld->int_vec_type;
    LLVMValueRef res;
 
    assert(type.floating);
    assert(lp_check_value(type, a));
 
-   if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) {
+   if (util_cpu_caps.has_sse4_1 &&
+       (type.length == 1 || type.width*type.length == 128)) {
       res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
    }
    else {
-      LLVMTypeRef vec_type = lp_build_vec_type(type);
+      LLVMTypeRef vec_type = bld->vec_type;
       unsigned mantissa = lp_mantissa(type);
-      LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
-      LLVMValueRef sign;
       LLVMValueRef offset;
 
-      /* sign = a < 0 ? 0 : ~0 */
-      sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
-      sign = LLVMBuildAnd(bld->builder, sign, mask, "");
-      sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "iceil.sign");
-      sign = LLVMBuildNot(bld->builder, sign, "iceil.not");
-
       /* offset = 0.99999(9)f */
       offset = lp_build_const_vec(type, (double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
-      offset = LLVMConstBitCast(offset, int_vec_type);
 
-      /* offset = a < 0 ? 0.0 : offset */
-      offset = LLVMBuildAnd(bld->builder, offset, sign, "");
-      offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "iceil.offset");
+      if (type.sign) {
+         LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
+         LLVMValueRef sign;
+
+         /* sign = a < 0 ? 0 : ~0 */
+         sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
+         sign = LLVMBuildAnd(bld->builder, sign, mask, "");
+         sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "iceil.sign");
+         sign = LLVMBuildNot(bld->builder, sign, "iceil.not");
+
+         /* offset = a < 0 ? 0.0 : offset */
+         offset = LLVMConstBitCast(offset, int_vec_type);
+         offset = LLVMBuildAnd(bld->builder, offset, sign, "");
+         offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "iceil.offset");
+      }
 
       res = LLVMBuildFAdd(bld->builder, a, offset, "iceil.res");
    }
@@ -1232,6 +1411,46 @@ lp_build_iceil(struct lp_build_context *bld,
 }
 
 
+/**
+ * Combined ifloor() & fract().
+ *
+ * Preferred to calling the functions separately, as it will ensure that the
+ * stratergy (floor() vs ifloor()) that results in less redundant work is used.
+ */
+void
+lp_build_ifloor_fract(struct lp_build_context *bld,
+                      LLVMValueRef a,
+                      LLVMValueRef *out_ipart,
+                      LLVMValueRef *out_fpart)
+{
+   const struct lp_type type = bld->type;
+   LLVMValueRef ipart;
+
+   assert(type.floating);
+   assert(lp_check_value(type, a));
+
+   if (util_cpu_caps.has_sse4_1 &&
+       (type.length == 1 || type.width*type.length == 128)) {
+      /*
+       * floor() is easier.
+       */
+
+      ipart = lp_build_floor(bld, a);
+      *out_fpart = LLVMBuildFSub(bld->builder, a, ipart, "fpart");
+      *out_ipart = LLVMBuildFPToSI(bld->builder, ipart, bld->int_vec_type, "ipart");
+   }
+   else {
+      /*
+       * ifloor() is easier.
+       */
+
+      *out_ipart = lp_build_ifloor(bld, a);
+      ipart = LLVMBuildSIToFP(bld->builder, *out_ipart, bld->vec_type, "ipart");
+      *out_fpart = LLVMBuildFSub(bld->builder, a, ipart, "fpart");
+   }
+}
+
+
 LLVMValueRef
 lp_build_sqrt(struct lp_build_context *bld,
               LLVMValueRef a)
@@ -2041,6 +2260,71 @@ lp_build_exp2(struct lp_build_context *bld,
 
 
 /**
+ * Extract the exponent of a IEEE-754 floating point value.
+ *
+ * Optionally apply an integer bias.
+ *
+ * Result is an integer value with
+ *
+ *   ifloor(log2(x)) + bias
+ */
+LLVMValueRef
+lp_build_extract_exponent(struct lp_build_context *bld,
+                          LLVMValueRef x,
+                          int bias)
+{
+   const struct lp_type type = bld->type;
+   unsigned mantissa = lp_mantissa(type);
+   LLVMValueRef res;
+
+   assert(type.floating);
+
+   assert(lp_check_value(bld->type, x));
+
+   x = LLVMBuildBitCast(bld->builder, x, bld->int_vec_type, "");
+
+   res = LLVMBuildLShr(bld->builder, x, lp_build_const_int_vec(type, mantissa), "");
+   res = LLVMBuildAnd(bld->builder, res, lp_build_const_int_vec(type, 255), "");
+   res = LLVMBuildSub(bld->builder, res, lp_build_const_int_vec(type, 127 - bias), "");
+
+   return res;
+}
+
+
+/**
+ * Extract the mantissa of the a floating.
+ *
+ * Result is a floating point value with
+ *
+ *   x / floor(log2(x))
+ */
+LLVMValueRef
+lp_build_extract_mantissa(struct lp_build_context *bld,
+                          LLVMValueRef x)
+{
+   const struct lp_type type = bld->type;
+   unsigned mantissa = lp_mantissa(type);
+   LLVMValueRef mantmask = lp_build_const_int_vec(type, (1ULL << mantissa) - 1);
+   LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
+   LLVMValueRef res;
+
+   assert(lp_check_value(bld->type, x));
+
+   assert(type.floating);
+
+   x = LLVMBuildBitCast(bld->builder, x, bld->int_vec_type, "");
+
+   /* res = x / 2**ipart */
+   res = LLVMBuildAnd(bld->builder, x, mantmask, "");
+   res = LLVMBuildOr(bld->builder, res, one, "");
+   res = LLVMBuildBitCast(bld->builder, res, bld->vec_type, "");
+
+   return res;
+}
+
+
+
+/**
  * Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
  * These coefficients can be generate with
  * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
@@ -2159,3 +2443,62 @@ lp_build_log2(struct lp_build_context *bld,
    lp_build_log2_approx(bld, x, NULL, NULL, &res);
    return res;
 }
+
+
+/**
+ * Faster (and less accurate) log2.
+ *
+ *    log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
+ *
+ * Piece-wise linear approximation, with exact results when x is a
+ * power of two.
+ *
+ * See http://www.flipcode.com/archives/Fast_log_Function.shtml
+ */
+LLVMValueRef
+lp_build_fast_log2(struct lp_build_context *bld,
+                   LLVMValueRef x)
+{
+   LLVMValueRef ipart;
+   LLVMValueRef fpart;
+
+   assert(lp_check_value(bld->type, x));
+
+   assert(bld->type.floating);
+
+   /* ipart = floor(log2(x)) - 1 */
+   ipart = lp_build_extract_exponent(bld, x, -1);
+   ipart = LLVMBuildSIToFP(bld->builder, ipart, bld->vec_type, "");
+
+   /* fpart = x / 2**ipart */
+   fpart = lp_build_extract_mantissa(bld, x);
+
+   /* ipart + fpart */
+   return LLVMBuildFAdd(bld->builder, ipart, fpart, "");
+}
+
+
+/**
+ * Fast implementation of iround(log2(x)).
+ *
+ * Not an approximation -- it should give accurate results all the time.
+ */
+LLVMValueRef
+lp_build_ilog2(struct lp_build_context *bld,
+               LLVMValueRef x)
+{
+   LLVMValueRef sqrt2 = lp_build_const_vec(bld->type, M_SQRT2);
+   LLVMValueRef ipart;
+
+   assert(bld->type.floating);
+
+   assert(lp_check_value(bld->type, x));
+
+   /* x * 2^(0.5)   i.e., add 0.5 to the log2(x) */
+   x = LLVMBuildFMul(bld->builder, x, sqrt2, "");
+
+   /* ipart = floor(log2(x) + 0.5)  */
+   ipart = lp_build_extract_exponent(bld, x, 0);
+
+   return ipart;
+}