diff options
| -rw-r--r-- | src/gallium/auxiliary/gallivm/lp_bld_arit.c | 111 | 
1 files changed, 83 insertions, 28 deletions
| diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c index 816ee70119..7b35dd4bb4 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c @@ -1,6 +1,6 @@  /**************************************************************************   * - * Copyright 2009 VMware, Inc. + * Copyright 2009-2010 VMware, Inc.   * All Rights Reserved.   *   * Permission is hereby granted, free of charge, to any person obtaining a @@ -59,6 +59,19 @@  #include "lp_bld_arit.h" +/* + * XXX: Increasing the number of Newton steps eliminates some artifacts, but adds others, most + * noticeably corruption in the Earth halo in Google Earth. + */ +#define RCP_NEWTON_STEPS 0 + +#define RSQRT_NEWTON_STEPS 0 + +#define EXP_POLY_DEGREE 3 + +#define LOG_POLY_DEGREE 5 + +  /**   * Generate min(a, b)   * No checks for special case values of a or b = 1 or 0 are done. @@ -1248,6 +1261,31 @@ lp_build_sqrt(struct lp_build_context *bld,  } +/** + * Do one Newton-Raphson step to improve reciprocal precision: + * + *   x_{i+1} = x_i * (2 - a * x_i) + * + * See also: + * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division + * - http://softwarecommunity.intel.com/articles/eng/1818.htm + */ +static INLINE LLVMValueRef +lp_build_rcp_refine(struct lp_build_context *bld, +                    LLVMValueRef a, +                    LLVMValueRef rcp_a) +{ +   LLVMValueRef two = lp_build_const_vec(bld->type, 2.0); +   LLVMValueRef res; + +   res = LLVMBuildFMul(bld->builder, a, rcp_a, ""); +   res = LLVMBuildFSub(bld->builder, two, res, ""); +   res = LLVMBuildFMul(bld->builder, rcp_a, res, ""); + +   return res; +} + +  LLVMValueRef  lp_build_rcp(struct lp_build_context *bld,               LLVMValueRef a) @@ -1269,32 +1307,16 @@ lp_build_rcp(struct lp_build_context *bld,        return LLVMConstFDiv(bld->one, a);     if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4) { -      /* -       * XXX: Added 
precision is not always necessary, so only enable this -       * when we have a better system in place to track minimum precision. -       */ - -#if 1 -      /* -       * Do one Newton-Raphson step to improve precision: -       * -       *   x1 = (2 - a * rcp(a)) * rcp(a) -       */ - -      LLVMValueRef two = lp_build_const_vec(bld->type, 2.0); -      LLVMValueRef rcp_a;        LLVMValueRef res; +      unsigned i; -      rcp_a = lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", lp_build_vec_type(type), a); +      res = lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", bld->vec_type, a); -      res = LLVMBuildFMul(bld->builder, a, rcp_a, ""); -      res = LLVMBuildFSub(bld->builder, two, res, ""); -      res = LLVMBuildFMul(bld->builder, res, rcp_a, ""); +      for (i = 0; i < RCP_NEWTON_STEPS; ++i) { +         res = lp_build_rcp_refine(bld, a, res); +      }        return res; -#else -      return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", lp_build_vec_type(type), a); -#endif     }     return LLVMBuildFDiv(bld->builder, bld->one, a, ""); @@ -1302,6 +1324,33 @@ lp_build_rcp(struct lp_build_context *bld,  /** + * Do one Newton-Raphson step to improve rsqrt precision: + * + *   x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i) + * + * See also: + * - http://softwarecommunity.intel.com/articles/eng/1818.htm + */ +static INLINE LLVMValueRef +lp_build_rsqrt_refine(struct lp_build_context *bld, +                      LLVMValueRef a, +                      LLVMValueRef rsqrt_a) +{ +   LLVMValueRef half = lp_build_const_vec(bld->type, 0.5); +   LLVMValueRef three = lp_build_const_vec(bld->type, 3.0); +   LLVMValueRef res; + +   res = LLVMBuildFMul(bld->builder, rsqrt_a, rsqrt_a, ""); +   res = LLVMBuildFMul(bld->builder, a, res, ""); +   res = LLVMBuildFSub(bld->builder, three, res, ""); +   res = LLVMBuildFMul(bld->builder, rsqrt_a, res, ""); +   res = LLVMBuildFMul(bld->builder, half, res, ""); + +   return res; +} + + +/**   * 
Generate 1/sqrt(a)   */  LLVMValueRef @@ -1314,8 +1363,18 @@ lp_build_rsqrt(struct lp_build_context *bld,     assert(type.floating); -   if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4) -      return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rsqrt.ps", lp_build_vec_type(type), a); +   if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4) { +      LLVMValueRef res; +      unsigned i; + +      res = lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rsqrt.ps", bld->vec_type, a); + +      for (i = 0; i < RSQRT_NEWTON_STEPS; ++i) { +         res = lp_build_rsqrt_refine(bld, a, res); +      } + +      return res; +   }     return lp_build_rcp(bld, lp_build_sqrt(bld, a));  } @@ -1821,10 +1880,6 @@ lp_build_log(struct lp_build_context *bld,  } -#define EXP_POLY_DEGREE 3 -#define LOG_POLY_DEGREE 5 - -  /**   * Generate polynomial.   * Ex:  coeffs[0] + x * coeffs[1] + x^2 * coeffs[2]. | 
