diff options
-rw-r--r-- | src/gallium/auxiliary/Makefile | 5 | ||||
-rw-r--r-- | src/gallium/auxiliary/SConscript | 8 | ||||
-rw-r--r-- | src/gallium/auxiliary/util/u_format.c | 1 | ||||
-rw-r--r-- | src/gallium/auxiliary/util/u_half.c | 165 | ||||
-rw-r--r-- | src/gallium/auxiliary/util/u_half.h | 3 | ||||
-rw-r--r-- | src/gallium/auxiliary/util/u_half.py | 179 |
6 files changed, 191 insertions, 170 deletions
diff --git a/src/gallium/auxiliary/Makefile b/src/gallium/auxiliary/Makefile index 1db4aaa439..843778d810 100644 --- a/src/gallium/auxiliary/Makefile +++ b/src/gallium/auxiliary/Makefile @@ -169,7 +169,8 @@ GALLIVM_CPP_SOURCES = \ GENERATED_SOURCES = \ indices/u_indices_gen.c \ indices/u_unfilled_gen.c \ - util/u_format_table.c + util/u_format_table.c \ + util/u_half.c ifeq ($(MESA_LLVM),1) @@ -198,3 +199,5 @@ util/u_format_table.c: util/u_format_table.py util/u_format_pack.py util/u_forma util/u_format_access.c: util/u_format_access.py util/u_format_parse.py util/u_format.csv python util/u_format_access.py util/u_format.csv > $@ +util/u_half.c: util/u_half.py + python util/u_half.py > $@ diff --git a/src/gallium/auxiliary/SConscript b/src/gallium/auxiliary/SConscript index d0443db3f7..73d4150448 100644 --- a/src/gallium/auxiliary/SConscript +++ b/src/gallium/auxiliary/SConscript @@ -29,6 +29,14 @@ env.CodeGenerate( source = ['util/u_format.csv'], command = 'python $SCRIPT $SOURCE > $TARGET' ) + +env.CodeGenerate( + target = 'util/u_half.c', + script = 'util/u_half.py', + source = [], + command = 'python $SCRIPT > $TARGET' +) + env.Depends('util/u_format_table.c', [ 'util/u_format_parse.py', 'util/u_format_pack.py', diff --git a/src/gallium/auxiliary/util/u_format.c b/src/gallium/auxiliary/util/u_format.c index d3ee1f0339..fae0a462dc 100644 --- a/src/gallium/auxiliary/util/u_format.c +++ b/src/gallium/auxiliary/util/u_format.c @@ -124,5 +124,4 @@ void util_format_do_init(void) { util_format_s3tc_init(); - util_half_init(); } diff --git a/src/gallium/auxiliary/util/u_half.c b/src/gallium/auxiliary/util/u_half.c deleted file mode 100644 index 4c8f8a51c5..0000000000 --- a/src/gallium/auxiliary/util/u_half.c +++ /dev/null @@ -1,165 +0,0 @@ - -/* - * Copyright 2010 Luca Barbieri - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial - * portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE - * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION - * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION - * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ - -/* The code is a reimplementation of the algorithm in - * www.fox-toolkit.org/ftp/fasthalffloatconversion.pdf - * "Fast Half Float Conversions" by Jeroen van der Zijp, Nov 2008 - * - * The table contents have been slightly changed so that the exponent - * bias is now in the exponent table instead of the mantissa table (mostly - * for cosmetic reasons, and because it theoretically allows a variant - * that flushes denormal to zero but uses a mantissa table with 24-bit - * entries). - * - * The tables are also constructed slightly differently. - */ - -/* Note that using a 64K * 4 table is a terrible idea since it will not fit - * in the L1 cache and will massively pollute the L2 cache as well - * - * These should instead fit in the L1 cache. - * - * TODO: we could use a denormal bias table instead of the mantissa/offset - * tables: this would reduce the L1 cache usage from 8704 to 2304 bytes - * but would involve more computation - * - * Note however that if denormals are never encountered, the L1 cache usage - * is only about 4608 bytes anyway. - */ - -#include "util/u_half.h" -#include "util/u_init.h" - -uint32_t util_half_to_float_mantissa_table[2048]; -uint32_t util_half_to_float_exponent_table[64]; -uint32_t util_half_to_float_offset_table[64]; -uint16_t util_float_to_half_base_table[512]; -uint8_t util_float_to_half_shift_table[512]; - -boolean util_half_inited; - -void -util_half_do_init(void) -{ - int i; - - /* zero */ - util_half_to_float_mantissa_table[0] = 0; - - /* denormals */ - for(i = 1; i < 1024; ++i) - { - unsigned int m = i << 13; - unsigned int e = 0; - - /* Normalize number */ - while(!(m & 0x00800000)) - { - e -= 0x00800000; - m <<= 1; - } - m &= ~0x00800000; - e += 0x38800000; - util_half_to_float_mantissa_table[i] = m | e; - } - - /* normals */ - for(i = 1024; i < 2048; ++i) - util_half_to_float_mantissa_table[i] = ((i - 1024) << 13); - - /* positive zero or denormals */ - util_half_to_float_exponent_table[0] = 0; - - /* positive numbers */ - for(i = 1; i <= 30; ++i) - util_half_to_float_exponent_table[i] = 0x38000000 + (i << 23); - - /* positive infinity/NaN */ - util_half_to_float_exponent_table[31] = 0x7f800000; - - /* negative zero or denormals */ - util_half_to_float_exponent_table[32] = 0x80000000; - - /* negative numbers */ - for(i = 33; i <= 62; ++i) - util_half_to_float_exponent_table[i] = 0xb8000000 + ((i - 32) << 23); - - /* negative infinity/NaN */ - util_half_to_float_exponent_table[63] = 0xff800000; - - /* positive zero or denormals */ - util_half_to_float_offset_table[0] = 0; - - /* positive normals */ - for(i = 1; i < 32; ++i) - util_half_to_float_offset_table[i] = 1024; - - /* negative zero or denormals */ - util_half_to_float_offset_table[32] = 0; - - /* negative normals */ - for(i = 33; i < 64; ++i) - util_half_to_float_offset_table[i] = 1024; - - /* very small numbers mapping to zero */ - for(i = -127; i < -24; ++i) - { - util_float_to_half_base_table[127 + i] = 0; - util_float_to_half_shift_table[127 + i] = 24; - } - - /* small numbers mapping to denormals */ - for(i = -24; i < -14; ++i) - { - util_float_to_half_base_table[127 + i] = 0x0400 >> (-14 - i); - util_float_to_half_shift_table[127 + i] = -i - 1; - } - - /* normal numbers */ - for(i = -14; i < 16; ++i) - { - util_float_to_half_base_table[127 + i] = (i + 15) << 10; - util_float_to_half_shift_table[127 + i] = 13; - } - - /* large numbers mapping to infinity */ - for(i = 16; i < 128; ++i) - { - util_float_to_half_base_table[127 + i] = 0x7c00; - util_float_to_half_shift_table[127 + i] = 24; - } - - /* infinity and NaNs */ - util_float_to_half_base_table[255] = 0x7c00; - util_float_to_half_shift_table[255] = 13; - - /* negative numbers */ - for(i = 0; i < 256; ++i) - { - util_float_to_half_base_table[256 + i] = util_float_to_half_base_table[i] | 0x8000; - util_float_to_half_shift_table[256 + i] = util_float_to_half_shift_table[i]; - } -} diff --git a/src/gallium/auxiliary/util/u_half.h b/src/gallium/auxiliary/util/u_half.h index 02f0f24193..a28b1fd1d9 100644 --- a/src/gallium/auxiliary/util/u_half.h +++ b/src/gallium/auxiliary/util/u_half.h @@ -3,7 +3,6 @@ #include "pipe/p_compiler.h" #include "util/u_math.h" -#include "util/u_inline_init.h" #ifdef __cplusplus extern "C" { @@ -56,8 +55,6 @@ util_float_to_half(float f) return util_floatui_to_half(i.ui); } -UTIL_INLINE_INIT(util_half); - #ifdef __cplusplus } #endif diff --git a/src/gallium/auxiliary/util/u_half.py b/src/gallium/auxiliary/util/u_half.py new file mode 100644 index 0000000000..a92f758750 --- /dev/null +++ b/src/gallium/auxiliary/util/u_half.py @@ -0,0 +1,179 @@ +# Copyright 2010 Luca Barbieri +# +# Permission is hereby granted, free of charge, to any person obtaining +# a copy of this software and associated documentation files (the +# "Software"), to deal in the Software without restriction, including +# without limitation the rights to use, copy, modify, merge, publish, +# distribute, sublicense, and/or sell copies of the Software, and to +# permit persons to whom the Software is furnished to do so, subject to +# the following conditions: +# +# The above copyright notice and this permission notice (including the +# next paragraph) shall be included in all copies or substantial +# portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE +# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# ************************************************************************* + +# The code is a reimplementation of the algorithm in +# www.fox-toolkit.org/ftp/fasthalffloatconversion.pdf +# "Fast Half Float Conversions" by Jeroen van der Zijp, Nov 2008 +# +# The table contents have been slightly changed so that the exponent +# bias is now in the exponent table instead of the mantissa table (mostly +# for cosmetic reasons, and because it theoretically allows a variant +# that flushes denormal to zero but uses a mantissa table with 24-bit +# entries). +# +# The tables are also constructed slightly differently. +# + +# Note that using a 64K * 4 table is a terrible idea since it will not fit +# in the L1 cache and will massively pollute the L2 cache as well +# +# These should instead fit in the L1 cache. +# +# TODO: we could use a denormal bias table instead of the mantissa/offset +# tables: this would reduce the L1 cache usage from 8704 to 2304 bytes +# but would involve more computation +# +# Note however that if denormals are never encountered, the L1 cache usage +# is only about 4608 bytes anyway. + +table_index = None +table_length = None + +def begin(t, n, l): + global table_length + global table_index + table_index = 0 + table_length = l + print + print t + " " + n + "[" + str(l) + "] = {" + +def value(v): + global table_index + table_index += 1 + print "\t" + hex(v) + "," + +def end(): + global table_length + global table_index + print "};" + assert table_index == table_length + +print "/* This file is autogenerated by u_half.py. Do not edit directly. */" +print "#include \"util/u_half.h\"" + +begin("uint32_t", "util_half_to_float_mantissa_table", 2048) +# zero +value(0) + +# denormals +for i in xrange(1, 1024): + m = i << 13 + e = 0 + + # normalize number + while (m & 0x00800000) == 0: + e -= 0x00800000; + m <<= 1; + + m &= ~0x00800000; + e += 0x38800000; + value(m | e) + +# normals +for i in xrange(1024, 2048): + value((i - 1024) << 13) +end() + +begin("uint32_t", "util_half_to_float_exponent_table", 64) +# positive zero or denormals +value(0) + +# positive numbers +for i in xrange(1, 31): + value(0x38000000 + (i << 23)) + +# positive infinity/NaN +value(0x7f800000) + +# negative zero or denormals +value(0x80000000) + +# negative numbers +for i in range(33, 63): + value(0xb8000000 + ((i - 32) << 23)) + +# negative infinity/NaN +value(0xff800000) +end() + +begin("uint32_t", "util_half_to_float_offset_table", 64) +# positive zero or denormals +value(0) + +# positive normals +for i in range(1, 32): + value(1024) + +# negative zero or denormals +value(0) + +# negative normals +for i in xrange(33, 64): + value(1024) +end() + +begin("uint16_t", "util_float_to_half_base_table", 512) +for sign in (0, 0x8000): + # very small numbers mapping to zero + for i in xrange(-127, -24): + value(sign | 0) + + # small numbers mapping to denormals + for i in xrange(-24, -14): + value(sign | (0x400 >> (-14 -i))) + + # normal numbers + for i in xrange(-14, 16): + value(sign | ((i + 15) << 10)) + + # large numbers mapping to infinity + for i in xrange(16, 128): + value(sign | 0x7c00) + + # infinity and NaNs + value(sign | 0x7c00) +end() + +begin("uint8_t", "util_float_to_half_shift_table", 512) +for sign in (0, 0x8000): + # very small numbers mapping to zero + for i in xrange(-127, -24): + value(24) + + # small numbers mapping to denormals + for i in xrange(-24, -14): + value(-1 - i) + + # normal numbers + for i in xrange(-14, 16): + value(13) + + # large numbers mapping to infinity + for i in xrange(16, 128): + value(24) + + # infinity and NaNs + value(13) +end() + |