summaryrefslogtreecommitdiff
path: root/src/gallium
diff options
context:
space:
mode:
author: Keith Whitwell <keith@tungstengraphics.com> 2008-05-02 16:02:18 +0200
committer: Michal Krol <michal@tungstengraphics.com> 2008-05-02 16:02:18 +0200
commit: 17058e07469f2dc5b47b4f820bd5a31b7ed9177c (patch)
tree: 7772bd9f4a3fce87ff7c31599d442784ead33bbe /src/gallium
parent: 5e49037caa4cf9062efd0bbebf67b467684b633b (diff)
tgsi: Implement fast rsqrtf. Not tested, inactive.
Diffstat (limited to 'src/gallium')
-rw-r--r-- src/gallium/auxiliary/tgsi/exec/tgsi_exec.c 6
-rw-r--r-- src/gallium/auxiliary/tgsi/exec/tgsi_exec.h 10
-rwxr-xr-x src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c 43
3 files changed, 40 insertions, 19 deletions
diff --git a/src/gallium/auxiliary/tgsi/exec/tgsi_exec.c b/src/gallium/auxiliary/tgsi/exec/tgsi_exec.c
index 5d5125f7cb..826b432f09 100644
--- a/src/gallium/auxiliary/tgsi/exec/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/exec/tgsi_exec.c
@@ -88,6 +88,10 @@
#define TEMP_OUTPUT_C TGSI_EXEC_TEMP_OUTPUT_C
#define TEMP_PRIMITIVE_I TGSI_EXEC_TEMP_PRIMITIVE_I
#define TEMP_PRIMITIVE_C TGSI_EXEC_TEMP_PRIMITIVE_C
+#define TEMP_3_I TGSI_EXEC_TEMP_THREE_I
+#define TEMP_3_C TGSI_EXEC_TEMP_THREE_C
+#define TEMP_HALF_I TGSI_EXEC_TEMP_HALF_I
+#define TEMP_HALF_C TGSI_EXEC_TEMP_HALF_C
#define TEMP_R0 TGSI_EXEC_TEMP_R0
#define FOR_EACH_CHANNEL(CHAN)\
@@ -262,6 +266,8 @@ tgsi_exec_machine_init(
mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].f[i] = 2.0f;
mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].f[i] = 128.0f;
mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].f[i] = -128.0f;
+ mach->Temps[TEMP_3_I].xyzw[TEMP_3_C].f[i] = 3.0f;
+ mach->Temps[TEMP_HALF_I].xyzw[TEMP_HALF_C].f[i] = 0.5f;
}
}
diff --git a/src/gallium/auxiliary/tgsi/exec/tgsi_exec.h b/src/gallium/auxiliary/tgsi/exec/tgsi_exec.h
index 92e2e5e985..19bd78df3d 100644
--- a/src/gallium/auxiliary/tgsi/exec/tgsi_exec.h
+++ b/src/gallium/auxiliary/tgsi/exec/tgsi_exec.h
@@ -133,9 +133,15 @@ struct tgsi_exec_labels
#define TGSI_EXEC_TEMP_PRIMITIVE_I 34
#define TGSI_EXEC_TEMP_PRIMITIVE_C 2
-#define TGSI_EXEC_TEMP_R0 35
+#define TGSI_EXEC_TEMP_THREE_I 34
+#define TGSI_EXEC_TEMP_THREE_C 3
-#define TGSI_EXEC_NUM_TEMPS (32 + 4)
+#define TGSI_EXEC_TEMP_HALF_I 35
+#define TGSI_EXEC_TEMP_HALF_C 0
+
+#define TGSI_EXEC_TEMP_R0 36
+
+#define TGSI_EXEC_NUM_TEMPS (32 + 5)
#define TGSI_EXEC_NUM_ADDRS 1
#define TGSI_EXEC_NUM_IMMEDIATES 256
diff --git a/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c
index 2fd76a3072..dbf002130b 100755
--- a/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c
+++ b/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c
@@ -36,7 +36,11 @@
#ifdef PIPE_ARCH_X86
-#define HIGH_PRECISION 1 /* for 1/sqrt() */
+/* for 1/sqrt()
+ *
+ * This costs about 100fps (close to 10%) in gears:
+ */
+#define HIGH_PRECISION 1
#define FOR_EACH_CHANNEL( CHAN )\
@@ -794,20 +798,25 @@ emit_rsqrt(
*
* See: http://softwarecommunity.intel.com/articles/eng/1818.htm
*/
- /* This is some code that woudl do the above for a scalar 'a'. We
- * obviously are interested in a vector version:
- *
- * movss xmm3, a;
- * movss xmm1, half;
- * movss xmm2, three;
- * rsqrtss xmm0, xmm3;
- * mulss xmm3, xmm0;
- * mulss xmm1, xmm0;
- * mulss xmm3, xmm0;
- * subss xmm2, xmm3;
- * mulss xmm1, xmm2;
- * movss x, xmm1;
- */
+ {
+ struct x86_reg dst = make_xmm( xmm_dst );
+ struct x86_reg src = make_xmm( xmm_src );
+ struct x86_reg tmp0 = make_xmm( 2 );
+ struct x86_reg tmp1 = make_xmm( 3 );
+
+ assert( xmm_dst != xmm_src );
+ assert( xmm_dst != 2 && xmm_dst != 3 );
+ assert( xmm_src != 2 && xmm_src != 3 );
+
+ sse_movaps( func, dst, get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) );
+ sse_movaps( func, tmp0, get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) );
+ sse_rsqrtps( func, tmp1, src );
+ sse_mulps( func, src, tmp1 );
+ sse_mulps( func, dst, tmp1 );
+ sse_mulps( func, src, tmp1 );
+ sse_subps( func, tmp0, src );
+ sse_mulps( func, dst, tmp0 );
+ }
#endif
#else
/* On Intel CPUs at least, this is only accurate to 12 bits -- not
@@ -1295,9 +1304,9 @@ emit_instruction(
case TGSI_OPCODE_RSQ:
/* TGSI_OPCODE_RECIPSQRT */
FETCH( func, *inst, 0, 0, CHAN_X );
- emit_rsqrt( func, 0, 0 );
+ emit_rsqrt( func, 1, 0 );
FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
- STORE( func, *inst, 0, 0, chan_index );
+ STORE( func, *inst, 1, 0, chan_index );
}
break;