From f0d742962377948a9688f4fa3b92c2f8bbca03e9 Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Tue, 1 Feb 2011 20:52:49 +0100
Subject: nvc0: detect no-op MIN/MAX, do CSE earlier to succeed more often

---
 src/gallium/drivers/nvc0/nvc0_pc_optimize.c | 127 +++++++++++++++++-----------
 1 file changed, 79 insertions(+), 48 deletions(-)

diff --git a/src/gallium/drivers/nvc0/nvc0_pc_optimize.c b/src/gallium/drivers/nvc0/nvc0_pc_optimize.c
index e0d4e2daf9..b6d99724a1 100644
--- a/src/gallium/drivers/nvc0/nvc0_pc_optimize.c
+++ b/src/gallium/drivers/nvc0/nvc0_pc_optimize.c
@@ -607,25 +607,83 @@ constant_operand(struct nv_pc *pc,
    }
 }
 
+static void
+handle_min_max(struct nv_pass *ctx, struct nv_instruction *nvi)
+{
+   struct nv_value *src0 = nvi->src[0]->value;
+   struct nv_value *src1 = nvi->src[1]->value;
+
+   if (src0 != src1 || (nvi->src[0]->mod | nvi->src[1]->mod))
+      return;
+   if (src0->reg.file != NV_FILE_GPR)
+      return;
+   nvc0_pc_replace_value(ctx->pc, nvi->def[0], src0);
+   nvc0_insn_delete(nvi);
+}
+
+/* check if we can MUL + ADD -> MAD/FMA */
+static void
+handle_add_mul(struct nv_pass *ctx, struct nv_instruction *nvi)
+{
+   struct nv_value *src0 = nvi->src[0]->value;
+   struct nv_value *src1 = nvi->src[1]->value;
+   struct nv_value *src;
+   int s;
+   uint8_t mod[4];
+
+   if (SRC_IS_MUL(src0) && src0->refc == 1) s = 0;
+   else
+   if (SRC_IS_MUL(src1) && src1->refc == 1) s = 1;
+   else
+      return;
+
+   if ((src0->insn && src0->insn->bb != nvi->bb) ||
+       (src1->insn && src1->insn->bb != nvi->bb))
+      return;
+
+   /* check for immediates from prior constant folding */
+   if (src0->reg.file != NV_FILE_GPR || src1->reg.file != NV_FILE_GPR)
+      return;
+   src = nvi->src[s]->value;
+
+   mod[0] = nvi->src[0]->mod;
+   mod[1] = nvi->src[1]->mod;
+   mod[2] = src->insn->src[0]->mod;
+   mod[3] = src->insn->src[1]->mod;
+
+   if ((mod[0] | mod[1] | mod[2] | mod[3]) & ~NV_MOD_NEG)
+      return;
+
+   nvi->opcode = NV_OP_MAD_F32;
+
+   nv_reference(ctx->pc, nvi, s, NULL);
+   nvi->src[2] = nvi->src[!s];
+   nvi->src[!s] = NULL;
+
+   nv_reference(ctx->pc, nvi, 0, src->insn->src[0]->value);
+   nvi->src[0]->mod = mod[2] ^ mod[s];
+   nv_reference(ctx->pc, nvi, 1, src->insn->src[1]->value);
+   nvi->src[1]->mod = mod[3];
+}
+
 static int
-nv_pass_lower_arith(struct nv_pass *ctx, struct nv_basic_block *b)
+nv_pass_algebraic_opt(struct nv_pass *ctx, struct nv_basic_block *b)
 {
    struct nv_instruction *nvi, *next;
    int j;
 
    for (nvi = b->entry; nvi; nvi = next) {
-      struct nv_value *src0, *src1, *src;
-      int s;
-      uint8_t mod[4];
+      struct nv_value *src0, *src1;
+      uint baseop = NV_BASEOP(nvi->opcode);
 
       next = nvi->next;
 
       src0 = nvc0_pc_find_immediate(nvi->src[0]);
       src1 = nvc0_pc_find_immediate(nvi->src[1]);
 
-      if (src0 && src1)
+      if (src0 && src1) {
          constant_expression(ctx->pc, nvi, src0, src1);
-      else {
+      } else {
          if (src0)
             constant_operand(ctx->pc, nvi, src0, 0);
          else
@@ -633,44 +691,13 @@ nv_pass_lower_arith(struct nv_pass *ctx, struct nv_basic_block *b)
             constant_operand(ctx->pc, nvi, src1, 1);
       }
 
-      /* check if we can MUL + ADD -> MAD/FMA */
-      if (nvi->opcode != NV_OP_ADD)
-         continue;
-
-      src0 = nvi->src[0]->value;
-      src1 = nvi->src[1]->value;
-
-      if (SRC_IS_MUL(src0) && src0->refc == 1)
-         src = src0;
-      else
-      if (SRC_IS_MUL(src1) && src1->refc == 1)
-         src = src1;
+      if (baseop == NV_OP_MIN || baseop == NV_OP_MAX)
+         handle_min_max(ctx, nvi);
       else
-         continue;
-
-      /* could have an immediate from above constant_*  */
-      if (src0->reg.file != NV_FILE_GPR || src1->reg.file != NV_FILE_GPR)
-         continue;
-      s = (src == src0) ? 0 : 1;
-
-      mod[0] = nvi->src[0]->mod;
-      mod[1] = nvi->src[1]->mod;
-      mod[2] = src->insn->src[0]->mod;
-      mod[3] = src->insn->src[0]->mod;
-
-      if ((mod[0] | mod[1] | mod[2] | mod[3]) & ~NV_MOD_NEG)
-         continue;
-
-      nvi->opcode = NV_OP_MAD;
-      nv_reference(ctx->pc, nvi, s, NULL);
-      nvi->src[2] = nvi->src[!s];
-
-      nvi->src[0] = new_ref(ctx->pc, src->insn->src[0]->value);
-      nvi->src[1] = new_ref(ctx->pc, src->insn->src[1]->value);
-      nvi->src[0]->mod = mod[2] ^ mod[s];
-      nvi->src[1]->mod = mod[3];
+      if (nvi->opcode == NV_OP_ADD_F32)
+         handle_add_mul(ctx, nvi);
    }
-   DESCEND_ARBITRARY(j, nv_pass_lower_arith);
+   DESCEND_ARBITRARY(j, nv_pass_algebraic_opt);
 
    return 0;
 }
@@ -1158,11 +1185,17 @@ nv_pc_pass0(struct nv_pc *pc, struct nv_basic_block *root)
    pass.n = 0;
    pass.pc = pc;
 
+   /* Do CSE so we can just compare values by pointer in subsequent passes. */
+   pc->pass_seq++;
+   ret = nv_pass_cse(&pass, root);
+   if (ret)
+      return ret;
+
    /* Do this first, so we don't have to pay attention
     * to whether sources are supported memory loads.
     */
    pc->pass_seq++;
-   ret = nv_pass_lower_arith(&pass, root);
+   ret = nv_pass_algebraic_opt(&pass, root);
    if (ret)
       return ret;
 
@@ -1190,11 +1223,9 @@ nv_pc_pass0(struct nv_pc *pc, struct nv_basic_block *root)
       reldelim->pc = pc;
    }
 
-   pc->pass_seq++;
-   ret = nv_pass_cse(&pass, root);
-   if (ret)
-      return ret;
-
+   /* May run DCE before load-combining since that pass will clean up
+    * after itself.
+    */
    dce.pc = pc;
    do {
       dce.removed = 0;
-- 
cgit v1.2.3