From 17058e07469f2dc5b47b4f820bd5a31b7ed9177c Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Fri, 2 May 2008 16:02:18 +0200
Subject: tgsi: Implement fast rsqrtf. Not tested, inactive.

---
 src/gallium/auxiliary/tgsi/exec/tgsi_exec.c |  6 ++++
 src/gallium/auxiliary/tgsi/exec/tgsi_exec.h | 10 +++++--
 src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c | 43 +++++++++++++++++------------
 3 files changed, 40 insertions(+), 19 deletions(-)

(limited to 'src/gallium/auxiliary')

diff --git a/src/gallium/auxiliary/tgsi/exec/tgsi_exec.c b/src/gallium/auxiliary/tgsi/exec/tgsi_exec.c
index 5d5125f7cb..826b432f09 100644
--- a/src/gallium/auxiliary/tgsi/exec/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/exec/tgsi_exec.c
@@ -88,6 +88,10 @@
 #define TEMP_OUTPUT_C      TGSI_EXEC_TEMP_OUTPUT_C
 #define TEMP_PRIMITIVE_I   TGSI_EXEC_TEMP_PRIMITIVE_I
 #define TEMP_PRIMITIVE_C   TGSI_EXEC_TEMP_PRIMITIVE_C
+#define TEMP_3_I           TGSI_EXEC_TEMP_THREE_I
+#define TEMP_3_C           TGSI_EXEC_TEMP_THREE_C
+#define TEMP_HALF_I        TGSI_EXEC_TEMP_HALF_I
+#define TEMP_HALF_C        TGSI_EXEC_TEMP_HALF_C
 #define TEMP_R0            TGSI_EXEC_TEMP_R0
 
 #define FOR_EACH_CHANNEL(CHAN)\
@@ -262,6 +266,8 @@ tgsi_exec_machine_init(
       mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].f[i] = 2.0f;
       mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].f[i] = 128.0f;
       mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].f[i] = -128.0f;
+      mach->Temps[TEMP_3_I].xyzw[TEMP_3_C].f[i] = 3.0f;
+      mach->Temps[TEMP_HALF_I].xyzw[TEMP_HALF_C].f[i] = 0.5f;
    }
 }
 
diff --git a/src/gallium/auxiliary/tgsi/exec/tgsi_exec.h b/src/gallium/auxiliary/tgsi/exec/tgsi_exec.h
index 92e2e5e985..19bd78df3d 100644
--- a/src/gallium/auxiliary/tgsi/exec/tgsi_exec.h
+++ b/src/gallium/auxiliary/tgsi/exec/tgsi_exec.h
@@ -133,9 +133,15 @@ struct tgsi_exec_labels
 #define TGSI_EXEC_TEMP_PRIMITIVE_I  34
 #define TGSI_EXEC_TEMP_PRIMITIVE_C  2
 
-#define TGSI_EXEC_TEMP_R0           35
+#define TGSI_EXEC_TEMP_THREE_I      34
+#define TGSI_EXEC_TEMP_THREE_C      3
 
-#define TGSI_EXEC_NUM_TEMPS   (32 + 4)
+#define TGSI_EXEC_TEMP_HALF_I       35
+#define TGSI_EXEC_TEMP_HALF_C       0
+
+#define TGSI_EXEC_TEMP_R0           36
+
+#define TGSI_EXEC_NUM_TEMPS   (32 + 5)
 #define TGSI_EXEC_NUM_ADDRS   1
 #define TGSI_EXEC_NUM_IMMEDIATES  256
 
diff --git a/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c
index 2fd76a3072..dbf002130b 100755
--- a/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c
+++ b/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c
@@ -36,7 +36,11 @@
 
 #ifdef PIPE_ARCH_X86
 
-#define HIGH_PRECISION 1  /* for 1/sqrt() */
+/* for 1/sqrt() 
+ * 
+ * This costs about 100fps (close to 10%) in gears:
+ */
+#define HIGH_PRECISION 1  
 
 
 #define FOR_EACH_CHANNEL( CHAN )\
@@ -794,20 +798,25 @@ emit_rsqrt(
     *
     * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
     */
-   /* This is some code that woudl do the above for a scalar 'a'.  We
-    * obviously are interested in a vector version:
-    *
-    * movss   xmm3, a;
-    * movss   xmm1, half;
-    * movss   xmm2, three;
-    * rsqrtss xmm0, xmm3;
-    * mulss   xmm3, xmm0;
-    * mulss   xmm1, xmm0;
-    * mulss   xmm3, xmm0;
-    * subss   xmm2, xmm3;
-    * mulss   xmm1, xmm2;
-    * movss   x,    xmm1;
-    */
+   {
+      struct x86_reg dst = make_xmm( xmm_dst );
+      struct x86_reg src = make_xmm( xmm_src );
+      struct x86_reg tmp0 = make_xmm( 2 );
+      struct x86_reg tmp1 = make_xmm( 3 );
+
+      assert( xmm_dst != xmm_src );
+      assert( xmm_dst != 2 && xmm_dst != 3 );
+      assert( xmm_src != 2 && xmm_src != 3 );
+
+      sse_movaps(  func, dst,  get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) );
+      sse_movaps(  func, tmp0, get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) );
+      sse_rsqrtps( func, tmp1, src  );
+      sse_mulps(   func, src,  tmp1 );
+      sse_mulps(   func, dst,  tmp1 );
+      sse_mulps(   func, src,  tmp1 );
+      sse_subps(   func, tmp0, src  );
+      sse_mulps(   func, dst,  tmp0 );
+   }
 #endif
 #else
    /* On Intel CPUs at least, this is only accurate to 12 bits -- not
@@ -1295,9 +1304,9 @@ emit_instruction(
    case TGSI_OPCODE_RSQ:
    /* TGSI_OPCODE_RECIPSQRT */
       FETCH( func, *inst, 0, 0, CHAN_X );
-      emit_rsqrt( func, 0, 0 );
+      emit_rsqrt( func, 1, 0 );
       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 0, 0, chan_index );
+         STORE( func, *inst, 1, 0, chan_index );
       }
       break;
 
-- 
cgit v1.2.3


From 6c15a70b75b1625b69790f98f2f44e9ae4435f6a Mon Sep 17 00:00:00 2001
From: Michal Krol <michal@ubuntu-vbox.(none)>
Date: Fri, 2 May 2008 16:12:55 +0200
Subject: tgsi: Enable fast high precision rsqrt.

---
 src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c | 22 ----------------------
 1 file changed, 22 deletions(-)

(limited to 'src/gallium/auxiliary')

diff --git a/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c
index dbf002130b..b6b05944be 100755
--- a/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c
+++ b/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c
@@ -761,20 +761,6 @@ emit_rcp (
       make_xmm( xmm_src ) );
 }
 
-#if HIGH_PRECISION
-static void XSTDCALL
-rsqrt4f(
-   float *store )
-{
-   const unsigned X = 0;
-
-   store[X + 0] = 1.0F / sqrtf( store[X + 0] );
-   store[X + 1] = 1.0F / sqrtf( store[X + 1] );
-   store[X + 2] = 1.0F / sqrtf( store[X + 2] );
-   store[X + 3] = 1.0F / sqrtf( store[X + 3] );
-}
-#endif
-
 static void
 emit_rsqrt(
    struct x86_function *func,
@@ -782,13 +768,6 @@ emit_rsqrt(
    unsigned xmm_src )
 {
 #if HIGH_PRECISION
-#if 1
-   emit_func_call_dst_src(
-      func,
-      xmm_dst,
-      xmm_src,
-      rsqrt4f );
-#else
    /* Although rsqrtps() and rcpps() are low precision on some/all SSE
     * implementations, it is possible to improve its precision at
     * fairly low cost, using a newton/raphson step, as below:
@@ -817,7 +796,6 @@ emit_rsqrt(
       sse_subps(   func, tmp0, src  );
       sse_mulps(   func, dst,  tmp0 );
    }
-#endif
 #else
    /* On Intel CPUs at least, this is only accurate to 12 bits -- not
     * good enough.
-- 
cgit v1.2.3


From cff8d3bdcbf78b57b52a2f60c54e5a3cae286137 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 2 May 2008 08:22:25 -0600
Subject: gallium: remove ^M (CR) chars

---
 src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c | 46 ++++++++++++++---------------
 1 file changed, 23 insertions(+), 23 deletions(-)

(limited to 'src/gallium/auxiliary')

diff --git a/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c
index b6b05944be..8018bd7fa4 100755
--- a/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c
+++ b/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c
@@ -36,11 +36,11 @@
 
 #ifdef PIPE_ARCH_X86
 
-/* for 1/sqrt() 
- * 
- * This costs about 100fps (close to 10%) in gears:
- */
-#define HIGH_PRECISION 1  
+/* for 1/sqrt()
+ *
+ * This costs about 100fps (close to 10%) in gears:
+ */
+#define HIGH_PRECISION 1
 
 
 #define FOR_EACH_CHANNEL( CHAN )\
@@ -777,24 +777,24 @@ emit_rsqrt(
     *
     * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
     */
-   {
-      struct x86_reg dst = make_xmm( xmm_dst );
-      struct x86_reg src = make_xmm( xmm_src );
-      struct x86_reg tmp0 = make_xmm( 2 );
-      struct x86_reg tmp1 = make_xmm( 3 );
-
-      assert( xmm_dst != xmm_src );
-      assert( xmm_dst != 2 && xmm_dst != 3 );
-      assert( xmm_src != 2 && xmm_src != 3 );
-
-      sse_movaps(  func, dst,  get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) );
-      sse_movaps(  func, tmp0, get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) );
-      sse_rsqrtps( func, tmp1, src  );
-      sse_mulps(   func, src,  tmp1 );
-      sse_mulps(   func, dst,  tmp1 );
-      sse_mulps(   func, src,  tmp1 );
-      sse_subps(   func, tmp0, src  );
-      sse_mulps(   func, dst,  tmp0 );
+   {
+      struct x86_reg dst = make_xmm( xmm_dst );
+      struct x86_reg src = make_xmm( xmm_src );
+      struct x86_reg tmp0 = make_xmm( 2 );
+      struct x86_reg tmp1 = make_xmm( 3 );
+
+      assert( xmm_dst != xmm_src );
+      assert( xmm_dst != 2 && xmm_dst != 3 );
+      assert( xmm_src != 2 && xmm_src != 3 );
+
+      sse_movaps(  func, dst,  get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) );
+      sse_movaps(  func, tmp0, get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) );
+      sse_rsqrtps( func, tmp1, src  );
+      sse_mulps(   func, src,  tmp1 );
+      sse_mulps(   func, dst,  tmp1 );
+      sse_mulps(   func, src,  tmp1 );
+      sse_subps(   func, tmp0, src  );
+      sse_mulps(   func, dst,  tmp0 );
    }
 #else
    /* On Intel CPUs at least, this is only accurate to 12 bits -- not
-- 
cgit v1.2.3


From 6e004e973bcc5b789ee3f70b70f5d728c8b8ea71 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 2 May 2008 14:00:35 -0600
Subject: gallium: remove 0.5 vertex biases in set_vertex_data()

These should not be needed and were causing garbage to appear along the
edges of the mipmap images.
---
 src/gallium/auxiliary/util/u_gen_mipmap.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'src/gallium/auxiliary')

diff --git a/src/gallium/auxiliary/util/u_gen_mipmap.c b/src/gallium/auxiliary/util/u_gen_mipmap.c
index 2e8417ee13..c53c512268 100644
--- a/src/gallium/auxiliary/util/u_gen_mipmap.c
+++ b/src/gallium/auxiliary/util/u_gen_mipmap.c
@@ -778,23 +778,23 @@ set_vertex_data(struct gen_mipmap_state *ctx, float width, float height)
 {
    void *buf;
 
-   ctx->vertices[0][0][0] = -0.5f; /*x*/
-   ctx->vertices[0][0][1] = -0.5f; /*y*/
+   ctx->vertices[0][0][0] = 0.0f; /*x*/
+   ctx->vertices[0][0][1] = 0.0f; /*y*/
    ctx->vertices[0][1][0] = 0.0f; /*s*/
    ctx->vertices[0][1][1] = 0.0f; /*t*/
 
-   ctx->vertices[1][0][0] = width - 0.5f; /*x*/
-   ctx->vertices[1][0][1] = -0.5f;  /*y*/
-   ctx->vertices[1][1][0] = 1.0f; /*s*/
-   ctx->vertices[1][1][1] = 0.0f; /*t*/
+   ctx->vertices[1][0][0] = width;
+   ctx->vertices[1][0][1] = 0.0f;
+   ctx->vertices[1][1][0] = 1.0f;
+   ctx->vertices[1][1][1] = 0.0f;
 
-   ctx->vertices[2][0][0] = width - 0.5f;
-   ctx->vertices[2][0][1] = height - 0.5f;
+   ctx->vertices[2][0][0] = width;
+   ctx->vertices[2][0][1] = height;
    ctx->vertices[2][1][0] = 1.0f;
    ctx->vertices[2][1][1] = 1.0f;
 
-   ctx->vertices[3][0][0] = -0.5f;
-   ctx->vertices[3][0][1] = height - 0.5f;
+   ctx->vertices[3][0][0] = 0.0f;
+   ctx->vertices[3][0][1] = height;
    ctx->vertices[3][1][0] = 0.0f;
    ctx->vertices[3][1][1] = 1.0f;
 
-- 
cgit v1.2.3