summaryrefslogtreecommitdiff
path: root/src/gallium/auxiliary/gallivm/lp_bld_arit.c
diff options
context:
space:
mode:
authorJosé Fonseca <jfonseca@vmware.com>2010-08-21 21:58:22 +0100
committerJosé Fonseca <jfonseca@vmware.com>2010-08-21 21:58:22 +0100
commit0d96cbe4a5e0a39c17db007f3815868c6a766382 (patch)
treec554638d8c0444d71beb2485e3d2a9f09f3ec3a8 /src/gallium/auxiliary/gallivm/lp_bld_arit.c
parent42210f44645c154983705a2caea6af8fbdafbc12 (diff)
gallivm: Emit DIVPS instead of RCPPS.
See comments for detailed rationale. Thanks to Michal Krol and Zack Rusin for detecting and investigating this in detail.
Diffstat (limited to 'src/gallium/auxiliary/gallivm/lp_bld_arit.c')
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_arit.c36
1 files changed, 24 insertions, 12 deletions
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
index 7b35dd4bb4..bb30e6e9df 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -59,14 +59,6 @@
#include "lp_bld_arit.h"
-/*
- * XXX: Increasing eliminates some artifacts, but adds others, most
- * noticeably corruption in the Earth halo in Google Earth.
- */
-#define RCP_NEWTON_STEPS 0
-
-#define RSQRT_NEWTON_STEPS 0
-
#define EXP_POLY_DEGREE 3
#define LOG_POLY_DEGREE 5
@@ -1266,6 +1258,11 @@ lp_build_sqrt(struct lp_build_context *bld,
*
* x_{i+1} = x_i * (2 - a * x_i)
*
+ * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
+ * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
+ * such as Google Earth, which does RCP(RSQRT(0.0) when drawing the Earth's
+ * halo. It would be necessary to clamp the argument to prevent this.
+ *
* See also:
* - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
* - http://softwarecommunity.intel.com/articles/eng/1818.htm
@@ -1306,13 +1303,27 @@ lp_build_rcp(struct lp_build_context *bld,
if(LLVMIsConstant(a))
return LLVMConstFDiv(bld->one, a);
- if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
+ /*
+ * We don't use RCPPS because:
+ * - it only has 10bits of precision
+ * - it doesn't even get the reciprocate of 1.0 exactly
+ * - doing Newton-Rapshon steps yields wrong (NaN) values for 0.0 or Inf
+ * - for recent processors the benefit over DIVPS is marginal, a case
+ * depedent
+ *
+ * We could still use it on certain processors if benchmarks show that the
+ * RCPPS plus necessary workarounds are still preferrable to DIVPS; or for
+ * particular uses that require less workarounds.
+ */
+
+ if (FALSE && util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
+ const unsigned num_iterations = 0;
LLVMValueRef res;
unsigned i;
res = lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", bld->vec_type, a);
- for (i = 0; i < RCP_NEWTON_STEPS; ++i) {
+ for (i = 0; i < num_iterations; ++i) {
res = lp_build_rcp_refine(bld, a, res);
}
@@ -1363,13 +1374,14 @@ lp_build_rsqrt(struct lp_build_context *bld,
assert(type.floating);
- if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
+ if (util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
+ const unsigned num_iterations = 0;
LLVMValueRef res;
unsigned i;
res = lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rsqrt.ps", bld->vec_type, a);
- for (i = 0; i < RSQRT_NEWTON_STEPS; ++i) {
+ for (i = 0; i < num_iterations; ++i) {
res = lp_build_rsqrt_refine(bld, a, res);
}