gallium/util: add fast half float conversion functions

This adds a fast half float conversion facility to Gallium. Mesa already contains such a facility, but it uses a much worse algorithm. This one is an implementation of www.fox-toolkit.org/ftp/fasthalffloatconversion.pdf and uses a branch-less algorithm with some lookup tables small enough to fit in the L1 cache. Ideally, Mesa should start using these functions too, but I'm not sure how to arrange that with the current build system. A new "u_gctors.cpp" is added that defines a global C++ constructor, which initializes the conversion lookup tables at library init.
author: Luca Barbieri <luca@luca-barbieri.com> 2010-03-24 18:12:45 +0100
committer: Michal Krol <michal@vmware.com> 2010-04-01 13:33:07 +0200
commit: 3ff175d6de89ad92d167362355501f99d06f0f97 (patch)
tree: 6332294693fc3581785927a9df6013aecf9aca22
parent: 110e039d0df08ae1642adf4bd20f07992b9ffe9c (diff)
4 files changed, 199 insertions, 0 deletions
diff --git a/src/gallium/auxiliary/Makefile b/src/gallium/auxiliary/Makefile
index 4c629924b9..14c0fb1840 100644
--- a/src/gallium/auxiliary/Makefile
+++ b/src/gallium/auxiliary/Makefile
@@ -110,6 +110,7 @@ C_SOURCES = \
 	util/u_format_table.c \
 	util/u_format_tests.c \
 	util/u_gen_mipmap.c \
+	util/u_half.c \
 	util/u_handle_table.c \
 	util/u_hash_table.c \
 	util/u_hash.c \
@@ -138,6 +139,9 @@ C_SOURCES = \
 	#vl/vl_csc.c \
 	#vl/vl_shader_build.c \
 
+CPP_SOURCES = \
+	util/u_gctors.cpp
+
 GALLIVM_SOURCES = \
         gallivm/lp_bld_alpha.c \
         gallivm/lp_bld_arit.c \
diff --git a/src/gallium/auxiliary/util/u_gctors.cpp b/src/gallium/auxiliary/util/u_gctors.cpp
new file mode 100644
index 0000000000..9ea9819d73
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_gctors.cpp
@@ -0,0 +1,17 @@
+/* This file uses the C++ global constructor mechanism to automatically
+   initialize global data.
+
+   __attribute__((constructor)) would allow doing this in C, but is GCC-only.
+*/
+
+extern "C" void util_half_init_tables(void); /* defined in u_half.c; declared here instead of in a shared header */
+
+struct util_gctor_t /* exists only so that its constructor runs the table initialization */
+{
+	util_gctor_t()
+	{
+		util_half_init_tables(); /* fill the half<->float conversion lookup tables */
+	}
+};
+
+static struct util_gctor_t util_gctor; /* file-scope instance whose construction calls the init above; NOTE(review): presumably this object can be dropped when linked from a static archive since nothing references it, leaving the tables zeroed - confirm the link setup */
diff --git a/src/gallium/auxiliary/util/u_half.c b/src/gallium/auxiliary/util/u_half.c
new file mode 100644
index 0000000000..8865acb76b
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_half.c
@@ -0,0 +1,123 @@
+#include "util/u_half.h"
+
+/* see www.fox-toolkit.org/ftp/fasthalffloatconversion.pdf
+ * "Fast Half Float Conversions" by Jeroen van der Zijp, Nov 2008
+ */
+
+/* Note that using a 64K * 4 table is a terrible idea since it will not fit
+ * in the L1 cache and will massively pollute the L2 cache as well
+ *
+ * These should instead fit in the L1 cache.
+ *
+ * TODO: we could use a denormal bias table instead of the mantissa/offset
+ * tables: this would reduce the L1 cache usage from 8704 to 2304 bytes
+ * but would involve more computation
+ *
+ * Note however that if denormals are never encountered, the L1 cache usage
+ * is only about 4608 bytes anyway.
+ */
+uint32_t util_half_to_float_mantissa_table[2048]; /* [0]: zero, [1..1023]: renormalized denormals, [1024..2047]: mantissa bits shifted left by 13 */
+uint32_t util_half_to_float_exponent_table[64]; /* float sign/exponent bits, indexed by (h >> 10) */
+uint32_t util_half_to_float_offset_table[64]; /* 0 or 1024, indexed by (h >> 10): picks the lower or upper half of the mantissa table */
+uint16_t util_float_to_half_base_table[512]; /* half sign/exponent bits, indexed by (v >> 23) */
+uint8_t util_float_to_half_shift_table[512]; /* right shift applied to the float's 23 mantissa bits, indexed by (v >> 23) */
+
+/* fills every lookup table defined above; called by u_gctors.cpp, which declares the prototype itself */
+void util_half_init_tables(void);
+
+void util_half_init_tables(void)
+{
+	int i;
+
+	/* zero */
+	util_half_to_float_mantissa_table[0] = 0;
+
+	/* denormals: renormalize each 10-bit mantissa into float exponent + mantissa bits */
+	for(i = 1; i < 1024; ++i) {
+		unsigned int m = i << 13; /* align the 10-bit half mantissa with the 23-bit float mantissa */
+		unsigned int e = 0;
+
+		/* Normalize number: shift left until bit 23 (the implicit leading one) is set */
+		while(!(m & 0x00800000)) {
+			e -= 0x00800000; /* one exponent step down per shift; the unsigned wrap-around is undone by the bias added below */
+			m<<=1;
+		}
+		m &= ~0x00800000; /* drop the leading one, which is implicit in a normalized float */
+		e+= 0x38800000; /* bias: 0x38800000 is float exponent field 113, i.e. 2^-14 */
+		util_half_to_float_mantissa_table[i] = m | e;
+	}
+
+	/* normals: mantissa bits only, the exponent table supplies the rest */
+	for(i = 1024; i < 2048; ++i)
+		util_half_to_float_mantissa_table[i] = ((i-1024)<<13);
+
+	/* positive zero or denormals: nothing to add, the mantissa table entry is already complete */
+	util_half_to_float_exponent_table[0] = 0;
+
+	/* positive numbers: rebias the exponent from 15 to 127 (0x38000000 == 112 << 23) */
+	for(i = 1; i <= 30; ++i)
+		util_half_to_float_exponent_table[i] = 0x38000000 + (i << 23);
+
+	/* positive infinity/NaN */
+	util_half_to_float_exponent_table[31] = 0x7f800000;
+
+	/* negative zero or denormals: only the float sign bit */
+	util_half_to_float_exponent_table[32] = 0x80000000;
+
+	/* negative numbers: as for the positive ones, with the sign bit set */
+	for(i = 33; i <= 62; ++i)
+		util_half_to_float_exponent_table[i] = 0xb8000000 + ((i - 32) << 23);
+
+	/* negative infinity/NaN */
+	util_half_to_float_exponent_table[63] = 0xff800000;
+
+	/* positive zero or denormals: use mantissa table entries 0..1023 */
+	util_half_to_float_offset_table[0] = 0;
+
+	/* positive normals, infinity and NaN (index 31): use mantissa table entries 1024..2047 */
+	for(i = 1; i < 32; ++i)
+		util_half_to_float_offset_table[i] = 1024;
+
+	/* negative zero or denormals: use mantissa table entries 0..1023 */
+	util_half_to_float_offset_table[32] = 0;
+
+	/* negative normals, infinity and NaN (index 63): use mantissa table entries 1024..2047 */
+	for(i = 33; i < 64; ++i)
+		util_half_to_float_offset_table[i] = 1024;
+
+
+
+	/* very small numbers mapping to zero (from here on i is the unbiased float exponent) */
+	for(i = -127; i < -24; ++i) {
+		util_float_to_half_base_table[127 + i] = 0;
+		util_float_to_half_shift_table[127 + i] = 24; /* a 23-bit mantissa shifted right by 24 is always 0 */
+	}
+
+	/* small numbers mapping to denormals */
+	for(i = -24; i < -14; ++i) {
+		util_float_to_half_base_table[127 + i] = 0x0400 >> (-14 - i); /* the float's implicit leading one, moved into the half mantissa */
+		util_float_to_half_shift_table[127 + i] = -i - 1; /* shift grows from 14 to 23 as the exponent gets smaller */
+	}
+
+	/* normal numbers: rebias the exponent and keep the top 10 mantissa bits (no rounding) */
+	for(i = -14; i < 16; ++i) {
+		util_float_to_half_base_table[127 + i] = (i + 15) << 10;
+		util_float_to_half_shift_table[127 + i] = 13;
+	}
+
+	/* large numbers mapping to infinity */
+	for(i = 16; i < 128; ++i) {
+		util_float_to_half_base_table[127 + i] = 0x7c00;
+		util_float_to_half_shift_table[127 + i] = 24; /* discard the mantissa so the result is exactly infinity */
+	}
+
+	/* infinity and NaNs: the top 10 mantissa bits are kept; a NaN with only its low 13 bits set becomes infinity */
+	util_float_to_half_base_table[255] = 0x7c00;
+	util_float_to_half_shift_table[255] = 13;
+
+	/* negative numbers: mirror entries 0..255 with the half sign bit (0x8000) set */
+	for(i = 0; i < 256; ++i) {
+		util_float_to_half_base_table[256 + i] = util_float_to_half_base_table[i] | 0x8000;
+		util_float_to_half_shift_table[256 + i] = util_float_to_half_shift_table[i];
+	}
+}
diff --git a/src/gallium/auxiliary/util/u_half.h b/src/gallium/auxiliary/util/u_half.h
new file mode 100644
index 0000000000..464d43df8a
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_half.h
@@ -0,0 +1,55 @@
+#ifndef U_HALF_H
+#define U_HALF_H
+
+#include "pipe/p_compiler.h"
+
+extern uint32_t util_half_to_float_mantissa_table[2048];
+extern uint32_t util_half_to_float_exponent_table[64];
+extern uint32_t util_half_to_float_offset_table[64];
+extern uint16_t util_float_to_half_base_table[512];
+extern uint8_t util_float_to_half_shift_table[512];
+
+/*
+ * Note that if the half float is a signaling NaN, the x87 FPU will turn
+ * it into a quiet NaN immediately upon loading into a float.
+ *
+ * Additionally, denormals may be flushed to zero.
+ *
+ * To avoid this, use the floatui functions instead of the float ones
+ * when just doing conversion rather than computation on the resulting
+ * floats.
+ */
+
+static INLINE uint32_t /* half -> 32-bit float bit pattern, using table lookups only */
+util_half_to_floatui(half h)
+{
+	unsigned exp = h >> 10; /* sign and exponent bits; assumes half is a 16-bit unsigned type so this stays within 0..63 - TODO confirm */
+	return util_half_to_float_mantissa_table[util_half_to_float_offset_table[exp] + (h & 0x3ff)]
+		+ util_half_to_float_exponent_table[exp]; /* reads the global tables, which hold meaningful values only once filled in */
+}
+
+static INLINE float /* half -> float value; wraps util_half_to_floatui() */
+util_half_to_float(half h)
+{
+	union {float f; uint32_t v;} r; /* used to reinterpret the 32-bit pattern as a float */
+	r.v = util_half_to_floatui(h);
+	return r.f;
+}
+
+static INLINE half /* 32-bit float bit pattern -> half; excess mantissa bits are shifted out, not rounded */
+util_floatui_to_half(uint32_t v)
+{
+	unsigned signexp = v >> 23; /* sign bit plus 8 exponent bits: 0..511, matching the 512-entry tables */
+	return util_float_to_half_base_table[signexp]
+		+ ((v & 0x007fffff) >> util_float_to_half_shift_table[signexp]); /* NOTE(review): when the selected shift is 13, mantissa bits 0..12 are lost, so a NaN carrying only those bits would come out as the base value alone - verify this is acceptable */
+}
+
+static INLINE half /* float value -> half; wraps util_floatui_to_half() */
+util_float_to_half(float f)
+{
+	union {float f; uint32_t v;} i; /* used to read the float's 32-bit pattern */
+	i.f = f;
+	return util_floatui_to_half(i.v);
+}
+
+#endif /* U_HALF_H */
author	Luca Barbieri <luca@luca-barbieri.com>	2010-03-24 18:12:45 +0100
committer	Michal Krol <michal@vmware.com>	2010-04-01 13:33:07 +0200
commit	3ff175d6de89ad92d167362355501f99d06f0f97 (patch)
tree	6332294693fc3581785927a9df6013aecf9aca22
parent	110e039d0df08ae1642adf4bd20f07992b9ffe9c (diff)