summaryrefslogtreecommitdiff
path: root/src/mesa/main
diff options
context:
space:
mode:
authorBrian Paul <brian.paul@tungstengraphics.com>2003-03-04 16:33:53 +0000
committerBrian Paul <brian.paul@tungstengraphics.com>2003-03-04 16:33:53 +0000
commitf9b1e5241facc8cf255c258082d5cb5b04783e93 (patch)
treeea3bb1859dba42e746b0f14dce5193b81c922ac5 /src/mesa/main
parent386578c5bcc5c6701a6b9692cdc04cfe4064ca06 (diff)
added _mesa_inv_sqrtf() and INV_SQRTF() (Josh Vanderhoof)
Diffstat (limited to 'src/mesa/main')
-rw-r--r--src/mesa/main/imports.c112
-rw-r--r--src/mesa/main/imports.h15
-rw-r--r--src/mesa/main/macros.h4
-rw-r--r--src/mesa/main/nvvertexec.c4
4 files changed, 129 insertions, 6 deletions
diff --git a/src/mesa/main/imports.c b/src/mesa/main/imports.c
index e9d579e764..8474ed4abc 100644
--- a/src/mesa/main/imports.c
+++ b/src/mesa/main/imports.c
@@ -1,4 +1,4 @@
-/* $Id: imports.c,v 1.32 2003/03/01 01:50:21 brianp Exp $ */
+/* $Id: imports.c,v 1.33 2003/03/04 16:33:53 brianp Exp $ */
/*
* Mesa 3-D graphics library
@@ -346,6 +346,7 @@ _mesa_sqrtf( float x )
* then reconstruct the result back into a float
*/
num.i = ((sqrttab[num.i >> 16]) << 16) | ((e + 127) << 23);
+
return num.f;
#else
return (float) _mesa_sqrtd((double) x);
@@ -353,6 +354,115 @@ _mesa_sqrtf( float x )
}
+/**
+ inv_sqrt - A single precision 1/sqrt routine for IEEE format floats.
+ written by Josh Vanderhoof, based on newsgroup posts by James Van Buskirk
+ and Vesa Karvonen.
+*/
+float
+_mesa_inv_sqrtf(float n)
+{
+#if defined(USE_IEEE) && !defined(DEBUG)
+ float r0, x0, y0;
+ float r1, x1, y1;
+ float r2, x2, y2;
+#if 0 /* not used, see below -BP */
+ float r3, x3, y3;
+#endif
+ union { float f; unsigned int i; } u;
+ unsigned int magic;
+
+ /*
+ Exponent part of the magic number -
+
+ We want to:
+ 1. subtract the bias from the exponent,
+ 2. negate it
+ 3. divide by two (rounding towards -inf)
+ 4. add the bias back
+
+ Which is the same as subtracting the exponent from 381 and dividing
+ by 2.
+
+ floor(-(x - 127) / 2) + 127 = floor((381 - x) / 2)
+ */
+
+ magic = 381 << 23;
+
+ /*
+ Significand part of magic number -
+
+ With the current magic number, "(magic - u.i) >> 1" will give you:
+
+ for 1 <= u.f <= 2: 1.25 - u.f / 4
+ for 2 <= u.f <= 4: 1.00 - u.f / 8
+
+ This isn't a bad approximation of 1/sqrt. The maximum difference from
+ 1/sqrt will be around .06. After three Newton-Raphson iterations, the
+ maximum difference is less than 4.5e-8. (Which is actually close
+ enough to make the following bias academic...)
+
+ To get a better approximation you can add a bias to the magic
+ number. For example, if you subtract 1/2 of the maximum difference in
+ the first approximation (.03), you will get the following function:
+
+ for 1 <= u.f <= 2: 1.22 - u.f / 4
+ for 2 <= u.f <= 3.76: 0.97 - u.f / 8
+ for 3.76 <= u.f <= 4: 0.72 - u.f / 16
+ (The 3.76 to 4 range is where the result is < .5.)
+
+ This is the closest possible initial approximation, but with a maximum
+ error of 8e-11 after three NR iterations, it is still not perfect. If
+ you subtract 0.0332281 instead of .03, the maximum error will be
+ 2.5e-11 after three NR iterations, which should be about as close as
+ is possible.
+
+ for 1 <= u.f <= 2: 1.2167719 - u.f / 4
+ for 2 <= u.f <= 3.73: 0.9667719 - u.f / 8
+ for 3.73 <= u.f <= 4: 0.7167719 - u.f / 16
+
+ */
+
+ magic -= (int)(0.0332281 * (1 << 25));
+
+ u.f = n;
+ u.i = (magic - u.i) >> 1;
+
+ /*
+ Instead of Newton-Raphson, we use Goldschmidt's algorithm, which
+ allows more parallelism. From what I understand, the parallelism
+ comes at the cost of less precision, because it lets error
+ accumulate across iterations.
+ */
+ x0 = 1.0f;
+ y0 = 0.5f * n;
+ r0 = u.f;
+
+ x1 = x0 * r0;
+ y1 = y0 * r0 * r0;
+ r1 = 1.5f - y1;
+
+ x2 = x1 * r1;
+ y2 = y1 * r1 * r1;
+ r2 = 1.5f - y2;
+
+#if 1
+ return x2 * r2; /* we can stop here, and be conformant -BP */
+#else
+ x3 = x2 * r2;
+ y3 = y2 * r2 * r2;
+ r3 = 1.5f - y3;
+
+ return x3 * r3;
+#endif
+#elif defined(XFree86LOADER) && defined(IN_MODULE)
+ return 1.0F / xf86sqrt(n);
+#else
+ return 1.0F / sqrt(n);
+#endif
+}
+
+
double
_mesa_pow(double x, double y)
{
diff --git a/src/mesa/main/imports.h b/src/mesa/main/imports.h
index 354819f43f..73c3b6eb42 100644
--- a/src/mesa/main/imports.h
+++ b/src/mesa/main/imports.h
@@ -1,4 +1,4 @@
-/* $Id: imports.h,v 1.16 2003/03/03 21:44:39 brianp Exp $ */
+/* $Id: imports.h,v 1.17 2003/03/04 16:33:53 brianp Exp $ */
/*
* Mesa 3-D graphics library
@@ -169,6 +169,16 @@ float asm_sqrt (float x);
/***
+ *** INV_SQRTF: single-precision inverse square root
+ ***/
+#if 0
+#define INV_SQRTF(X) _mesa_inv_sqrt(X)
+#else
+#define INV_SQRTF(X) (1.0F / SQRTF(X)) /* this is faster on a P4 */
+#endif
+
+
+/***
*** LOG2: Log base 2 of float
***/
#ifdef USE_IEEE
@@ -588,6 +598,9 @@ _mesa_sqrtd(double x);
extern float
_mesa_sqrtf(float x);
+extern float
+_mesa_inv_sqrtf(float x);
+
extern double
_mesa_pow(double x, double y);
diff --git a/src/mesa/main/macros.h b/src/mesa/main/macros.h
index 0be9510995..63bd02bdff 100644
--- a/src/mesa/main/macros.h
+++ b/src/mesa/main/macros.h
@@ -1,4 +1,4 @@
-/* $Id: macros.h,v 1.31 2003/03/01 01:50:21 brianp Exp $ */
+/* $Id: macros.h,v 1.32 2003/03/04 16:33:54 brianp Exp $ */
/*
* Mesa 3-D graphics library
@@ -572,7 +572,7 @@ do { \
do { \
GLfloat len = (GLfloat) LEN_SQUARED_3FV(V); \
if (len) { \
- len = (GLfloat) (1.0 / SQRTF(len)); \
+ len = INV_SQRTF(len); \
(V)[0] = (GLfloat) ((V)[0] * len); \
(V)[1] = (GLfloat) ((V)[1] * len); \
(V)[2] = (GLfloat) ((V)[2] * len); \
diff --git a/src/mesa/main/nvvertexec.c b/src/mesa/main/nvvertexec.c
index 72ebcffca9..0702927354 100644
--- a/src/mesa/main/nvvertexec.c
+++ b/src/mesa/main/nvvertexec.c
@@ -1,4 +1,4 @@
-/* $Id: nvvertexec.c,v 1.2 2003/03/01 01:50:22 brianp Exp $ */
+/* $Id: nvvertexec.c,v 1.3 2003/03/04 16:33:55 brianp Exp $ */
/*
* Mesa 3-D graphics library
@@ -380,7 +380,7 @@ _mesa_exec_vertex_program(GLcontext *ctx, const struct vertex_program *program)
{
GLfloat t[4];
fetch_vector1( &inst->SrcReg[0], machine, t );
- t[0] = (float) (1.0 / sqrt(fabs(t[0])));
+ t[0] = INV_SQRTF(FABSF(t[0]));
t[1] = t[2] = t[3] = t[0];
store_vector4( &inst->DstReg, machine, t );
}