From 17058e07469f2dc5b47b4f820bd5a31b7ed9177c Mon Sep 17 00:00:00 2001 From: Keith Whitwell Date: Fri, 2 May 2008 16:02:18 +0200 Subject: tgsi: Implement fast rsqrtf. Not tested, inactive. --- src/gallium/auxiliary/tgsi/exec/tgsi_exec.c | 6 ++++ src/gallium/auxiliary/tgsi/exec/tgsi_exec.h | 10 +++++-- src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c | 43 +++++++++++++++++------------ 3 files changed, 40 insertions(+), 19 deletions(-) (limited to 'src/gallium/auxiliary') diff --git a/src/gallium/auxiliary/tgsi/exec/tgsi_exec.c b/src/gallium/auxiliary/tgsi/exec/tgsi_exec.c index 5d5125f7cb..826b432f09 100644 --- a/src/gallium/auxiliary/tgsi/exec/tgsi_exec.c +++ b/src/gallium/auxiliary/tgsi/exec/tgsi_exec.c @@ -88,6 +88,10 @@ #define TEMP_OUTPUT_C TGSI_EXEC_TEMP_OUTPUT_C #define TEMP_PRIMITIVE_I TGSI_EXEC_TEMP_PRIMITIVE_I #define TEMP_PRIMITIVE_C TGSI_EXEC_TEMP_PRIMITIVE_C +#define TEMP_3_I TGSI_EXEC_TEMP_THREE_I +#define TEMP_3_C TGSI_EXEC_TEMP_THREE_C +#define TEMP_HALF_I TGSI_EXEC_TEMP_HALF_I +#define TEMP_HALF_C TGSI_EXEC_TEMP_HALF_C #define TEMP_R0 TGSI_EXEC_TEMP_R0 #define FOR_EACH_CHANNEL(CHAN)\ @@ -262,6 +266,8 @@ tgsi_exec_machine_init( mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].f[i] = 2.0f; mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].f[i] = 128.0f; mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].f[i] = -128.0f; + mach->Temps[TEMP_3_I].xyzw[TEMP_3_C].f[i] = 3.0f; + mach->Temps[TEMP_HALF_I].xyzw[TEMP_HALF_C].f[i] = 0.5f; } } diff --git a/src/gallium/auxiliary/tgsi/exec/tgsi_exec.h b/src/gallium/auxiliary/tgsi/exec/tgsi_exec.h index 92e2e5e985..19bd78df3d 100644 --- a/src/gallium/auxiliary/tgsi/exec/tgsi_exec.h +++ b/src/gallium/auxiliary/tgsi/exec/tgsi_exec.h @@ -133,9 +133,15 @@ struct tgsi_exec_labels #define TGSI_EXEC_TEMP_PRIMITIVE_I 34 #define TGSI_EXEC_TEMP_PRIMITIVE_C 2 -#define TGSI_EXEC_TEMP_R0 35 +#define TGSI_EXEC_TEMP_THREE_I 34 +#define TGSI_EXEC_TEMP_THREE_C 3 -#define TGSI_EXEC_NUM_TEMPS (32 + 4) +#define TGSI_EXEC_TEMP_HALF_I 35 +#define TGSI_EXEC_TEMP_HALF_C 0 + +#define TGSI_EXEC_TEMP_R0 36 + +#define TGSI_EXEC_NUM_TEMPS (32 + 5) #define TGSI_EXEC_NUM_ADDRS 1 #define TGSI_EXEC_NUM_IMMEDIATES 256 diff --git a/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c index 2fd76a3072..dbf002130b 100755 --- a/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c +++ b/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c @@ -36,7 +36,11 @@ #ifdef PIPE_ARCH_X86 -#define HIGH_PRECISION 1 /* for 1/sqrt() */ +/* for 1/sqrt() + * + * This costs about 100fps (close to 10%) in gears: + */ +#define HIGH_PRECISION 1 #define FOR_EACH_CHANNEL( CHAN )\ @@ -794,20 +798,25 @@ emit_rsqrt( * * See: http://softwarecommunity.intel.com/articles/eng/1818.htm */ - /* This is some code that woudl do the above for a scalar 'a'. We - * obviously are interested in a vector version: - * - * movss xmm3, a; - * movss xmm1, half; - * movss xmm2, three; - * rsqrtss xmm0, xmm3; - * mulss xmm3, xmm0; - * mulss xmm1, xmm0; - * mulss xmm3, xmm0; - * subss xmm2, xmm3; - * mulss xmm1, xmm2; - * movss x, xmm1; - */ + { + struct x86_reg dst = make_xmm( xmm_dst ); + struct x86_reg src = make_xmm( xmm_src ); + struct x86_reg tmp0 = make_xmm( 2 ); + struct x86_reg tmp1 = make_xmm( 3 ); + + assert( xmm_dst != xmm_src ); + assert( xmm_dst != 2 && xmm_dst != 3 ); + assert( xmm_src != 2 && xmm_src != 3 ); + + sse_movaps( func, dst, get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) ); + sse_movaps( func, tmp0, get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) ); + sse_rsqrtps( func, tmp1, src ); + sse_mulps( func, src, tmp1 ); + sse_mulps( func, dst, tmp1 ); + sse_mulps( func, src, tmp1 ); + sse_subps( func, tmp0, src ); + sse_mulps( func, dst, tmp0 ); + } #endif #else /* On Intel CPUs at least, this is only accurate to 12 bits -- not @@ -1295,9 +1304,9 @@ emit_instruction( case TGSI_OPCODE_RSQ: /* TGSI_OPCODE_RECIPSQRT */ FETCH( func, *inst, 0, 0, CHAN_X ); - emit_rsqrt( func, 0, 0 ); + emit_rsqrt( func, 1, 0 ); FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - STORE( func, *inst, 0, 0, chan_index ); + STORE( func, *inst, 1, 0, chan_index ); } break; -- cgit v1.2.3 From 6c15a70b75b1625b69790f98f2f44e9ae4435f6a Mon Sep 17 00:00:00 2001 From: Michal Krol Date: Fri, 2 May 2008 16:12:55 +0200 Subject: tgsi: Enable fast high precision rsqrt. --- src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c | 22 ---------------------- 1 file changed, 22 deletions(-) (limited to 'src/gallium/auxiliary') diff --git a/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c index dbf002130b..b6b05944be 100755 --- a/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c +++ b/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c @@ -761,20 +761,6 @@ emit_rcp ( make_xmm( xmm_src ) ); } -#if HIGH_PRECISION -static void XSTDCALL -rsqrt4f( - float *store ) -{ - const unsigned X = 0; - - store[X + 0] = 1.0F / sqrtf( store[X + 0] ); - store[X + 1] = 1.0F / sqrtf( store[X + 1] ); - store[X + 2] = 1.0F / sqrtf( store[X + 2] ); - store[X + 3] = 1.0F / sqrtf( store[X + 3] ); -} -#endif - static void emit_rsqrt( struct x86_function *func, @@ -782,13 +768,6 @@ emit_rsqrt( unsigned xmm_src ) { #if HIGH_PRECISION -#if 1 - emit_func_call_dst_src( - func, - xmm_dst, - xmm_src, - rsqrt4f ); -#else /* Although rsqrtps() and rcpps() are low precision on some/all SSE * implementations, it is possible to improve its precision at * fairly low cost, using a newton/raphson step, as below: @@ -817,7 +796,6 @@ emit_rsqrt( sse_subps( func, tmp0, src ); sse_mulps( func, dst, tmp0 ); } -#endif #else /* On Intel CPUs at least, this is only accurate to 12 bits -- not * good enough. -- cgit v1.2.3 From cff8d3bdcbf78b57b52a2f60c54e5a3cae286137 Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Fri, 2 May 2008 08:22:25 -0600 Subject: gallium: remove ^M (CR) chars --- src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c | 46 ++++++++++++++--------------- 1 file changed, 23 insertions(+), 23 deletions(-) (limited to 'src/gallium/auxiliary') diff --git a/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c index b6b05944be..8018bd7fa4 100755 --- a/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c +++ b/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c @@ -36,11 +36,11 @@ #ifdef PIPE_ARCH_X86 -/* for 1/sqrt() - * - * This costs about 100fps (close to 10%) in gears: - */ -#define HIGH_PRECISION 1 +/* for 1/sqrt() + * + * This costs about 100fps (close to 10%) in gears: + */ +#define HIGH_PRECISION 1 #define FOR_EACH_CHANNEL( CHAN )\ @@ -777,24 +777,24 @@ emit_rsqrt( * * See: http://softwarecommunity.intel.com/articles/eng/1818.htm */ - { - struct x86_reg dst = make_xmm( xmm_dst ); - struct x86_reg src = make_xmm( xmm_src ); - struct x86_reg tmp0 = make_xmm( 2 ); - struct x86_reg tmp1 = make_xmm( 3 ); - - assert( xmm_dst != xmm_src ); - assert( xmm_dst != 2 && xmm_dst != 3 ); - assert( xmm_src != 2 && xmm_src != 3 ); - - sse_movaps( func, dst, get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) ); - sse_movaps( func, tmp0, get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) ); - sse_rsqrtps( func, tmp1, src ); - sse_mulps( func, src, tmp1 ); - sse_mulps( func, dst, tmp1 ); - sse_mulps( func, src, tmp1 ); - sse_subps( func, tmp0, src ); - sse_mulps( func, dst, tmp0 ); + { + struct x86_reg dst = make_xmm( xmm_dst ); + struct x86_reg src = make_xmm( xmm_src ); + struct x86_reg tmp0 = make_xmm( 2 ); + struct x86_reg tmp1 = make_xmm( 3 ); + + assert( xmm_dst != xmm_src ); + assert( xmm_dst != 2 && xmm_dst != 3 ); + assert( xmm_src != 2 && xmm_src != 3 ); + + sse_movaps( func, dst, get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) ); + sse_movaps( func, tmp0, get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) ); + sse_rsqrtps( func, tmp1, src ); + sse_mulps( func, src, tmp1 ); + sse_mulps( func, dst, tmp1 ); + sse_mulps( func, src, tmp1 ); + sse_subps( func, tmp0, src ); + sse_mulps( func, dst, tmp0 ); } #else /* On Intel CPUs at least, this is only accurate to 12 bits -- not -- cgit v1.2.3 From 6e004e973bcc5b789ee3f70b70f5d728c8b8ea71 Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Fri, 2 May 2008 14:00:35 -0600 Subject: gallium: remove 0.5 vertex biases in set_vertex_data() These should not be needed and were causing garbage to appear along the edges of the mipmap images. --- src/gallium/auxiliary/util/u_gen_mipmap.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'src/gallium/auxiliary') diff --git a/src/gallium/auxiliary/util/u_gen_mipmap.c b/src/gallium/auxiliary/util/u_gen_mipmap.c index 2e8417ee13..c53c512268 100644 --- a/src/gallium/auxiliary/util/u_gen_mipmap.c +++ b/src/gallium/auxiliary/util/u_gen_mipmap.c @@ -778,23 +778,23 @@ set_vertex_data(struct gen_mipmap_state *ctx, float width, float height) { void *buf; - ctx->vertices[0][0][0] = -0.5f; /*x*/ - ctx->vertices[0][0][1] = -0.5f; /*y*/ + ctx->vertices[0][0][0] = 0.0f; /*x*/ + ctx->vertices[0][0][1] = 0.0f; /*y*/ ctx->vertices[0][1][0] = 0.0f; /*s*/ ctx->vertices[0][1][1] = 0.0f; /*t*/ - ctx->vertices[1][0][0] = width - 0.5f; /*x*/ - ctx->vertices[1][0][1] = -0.5f; /*y*/ - ctx->vertices[1][1][0] = 1.0f; /*s*/ - ctx->vertices[1][1][1] = 0.0f; /*t*/ + ctx->vertices[1][0][0] = width; + ctx->vertices[1][0][1] = 0.0f; + ctx->vertices[1][1][0] = 1.0f; + ctx->vertices[1][1][1] = 0.0f; - ctx->vertices[2][0][0] = width - 0.5f; - ctx->vertices[2][0][1] = height - 0.5f; + ctx->vertices[2][0][0] = width; + ctx->vertices[2][0][1] = height; ctx->vertices[2][1][0] = 1.0f; ctx->vertices[2][1][1] = 1.0f; - ctx->vertices[3][0][0] = -0.5f; - ctx->vertices[3][0][1] = height - 0.5f; + ctx->vertices[3][0][0] = 0.0f; + ctx->vertices[3][0][1] = height; ctx->vertices[3][1][0] = 0.0f; ctx->vertices[3][1][1] = 1.0f; -- cgit v1.2.3