diff options
| -rw-r--r-- | src/gallium/auxiliary/gallivm/lp_bld_arit.c | 36 | 
1 files changed, 24 insertions, 12 deletions
| diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c index 7b35dd4bb4..bb30e6e9df 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c @@ -59,14 +59,6 @@  #include "lp_bld_arit.h" -/* - * XXX: Increasing eliminates some artifacts, but adds others, most - * noticeably corruption in the Earth halo in Google Earth. - */ -#define RCP_NEWTON_STEPS 0 - -#define RSQRT_NEWTON_STEPS 0 -  #define EXP_POLY_DEGREE 3  #define LOG_POLY_DEGREE 5 @@ -1266,6 +1258,11 @@ lp_build_sqrt(struct lp_build_context *bld,   *   *   x_{i+1} = x_i * (2 - a * x_i)   * + * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or + * +/-Inf, giving NaN instead.  Certain applications rely on this behavior, + * such as Google Earth, which does RCP(RSQRT(0.0) when drawing the Earth's + * halo. It would be necessary to clamp the argument to prevent this. + *   * See also:   * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division   * - http://softwarecommunity.intel.com/articles/eng/1818.htm @@ -1306,13 +1303,27 @@ lp_build_rcp(struct lp_build_context *bld,     if(LLVMIsConstant(a))        return LLVMConstFDiv(bld->one, a); -   if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4) { +   /* +    * We don't use RCPPS because: +    * - it only has 10bits of precision +    * - it doesn't even get the reciprocate of 1.0 exactly +    * - doing Newton-Rapshon steps yields wrong (NaN) values for 0.0 or Inf +    * - for recent processors the benefit over DIVPS is marginal, a case +    *   depedent +    * +    * We could still use it on certain processors if benchmarks show that the +    * RCPPS plus necessary workarounds are still preferrable to DIVPS; or for +    * particular uses that require less workarounds. +    */ + +   if (FALSE && util_cpu_caps.has_sse && type.width == 32 && type.length == 4) { +      const unsigned num_iterations = 0;        LLVMValueRef res;        unsigned i;        res = lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", bld->vec_type, a); -      for (i = 0; i < RCP_NEWTON_STEPS; ++i) { +      for (i = 0; i < num_iterations; ++i) {           res = lp_build_rcp_refine(bld, a, res);        } @@ -1363,13 +1374,14 @@ lp_build_rsqrt(struct lp_build_context *bld,     assert(type.floating); -   if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4) { +   if (util_cpu_caps.has_sse && type.width == 32 && type.length == 4) { +      const unsigned num_iterations = 0;        LLVMValueRef res;        unsigned i;        res = lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rsqrt.ps", bld->vec_type, a); -      for (i = 0; i < RSQRT_NEWTON_STEPS; ++i) { +      for (i = 0; i < num_iterations; ++i) {           res = lp_build_rsqrt_refine(bld, a, res);        } | 
