nv50: attempt at making more complicated loops work

Nested loops, and loops with multiple exits (BREAK, CONT).
author: Christoph Bumiller <e0425955@student.tuwien.ac.at> 2010-09-01 17:54:56 +0200
committer: Christoph Bumiller <e0425955@student.tuwien.ac.at> 2010-09-01 18:02:50 +0200
commit: 0a8292e096bc37eeb225bf7d3854b6b6edc4bceb (patch)
tree: 80bbb0365721665a8ff932e7b6ccb7a7c44a3e5c /src/gallium/drivers/nv50
parent: d90502b2b468732e2a42985580bbbe9d9fdfd14e (diff)
5 files changed, 138 insertions, 57 deletions
diff --git a/src/gallium/drivers/nv50/nv50_pc.c b/src/gallium/drivers/nv50/nv50_pc.c
index 1c12fe1b9e..b03f5b27f6 100644
--- a/src/gallium/drivers/nv50/nv50_pc.c
+++ b/src/gallium/drivers/nv50/nv50_pc.c
@@ -220,6 +220,7 @@ edge_name(ubyte type)
    case CFG_EDGE_BACK: return "back";
    case CFG_EDGE_LOOP_ENTER: return "loop";
    case CFG_EDGE_LOOP_LEAVE: return "break";
+   case CFG_EDGE_FAKE: return "fake";
    default:
       return "?";
    }
@@ -247,6 +248,7 @@ nv_pc_pass_in_order(struct nv_basic_block *root, nv_pc_pass_func f, void *priv)
          case CFG_EDGE_BACK:
             continue;
          case CFG_EDGE_FORWARD:
+         case CFG_EDGE_FAKE:
             if (++b->out[j]->priv == b->out[j]->num_in)
                bb[p++] = b->out[j];
             break;
@@ -264,9 +266,11 @@ nv_pc_pass_in_order(struct nv_basic_block *root, nv_pc_pass_func f, void *priv)
 
       f(priv, b);
 
-      if (!p)
-         while (pp > 0)
-            bb[p++] = bbb[--pp];
+      if (!p) {
+         p = pp;
+         for (; pp > 0; --pp)
+            bb[pp - 1] = bbb[pp - 1];
+      }
    }
 }
 
@@ -366,11 +370,17 @@ nv50_generate_code(struct nv50_translation_info *ti)
    ret = nv_pc_exec_pass0(pc);
    if (ret)
       goto out;
+#ifdef NV50PC_DEBUG
+   nv_print_program(pc->root);
+#endif
 
    /* register allocation */
    ret = nv_pc_exec_pass1(pc);
    if (ret)
       goto out;
+#ifdef NV50PC_DEBUG
+   nv_print_program(pc->root);
+#endif
 
    /* prepare for emission */
    ret = nv_pc_exec_pass2(pc);
@@ -580,10 +590,10 @@ nvbb_reachable_by(struct nv_basic_block *bf, struct nv_basic_block *bp,
    if (bp == bt)
       return FALSE;
 
-   if (bp->out[0] && bp->out_kind[0] != CFG_EDGE_BACK &&
+   if (bp->out[0] && !IS_WALL_EDGE(bp->out_kind[0]) &&
        nvbb_reachable_by(bf, bp->out[0], bt))
       return TRUE;
-   if (bp->out[1] && bp->out_kind[1] != CFG_EDGE_BACK &&
+   if (bp->out[1] && !IS_WALL_EDGE(bp->out_kind[1]) &&
        nvbb_reachable_by(bf, bp->out[1], bt))
       return TRUE;
    return FALSE;
diff --git a/src/gallium/drivers/nv50/nv50_pc.h b/src/gallium/drivers/nv50/nv50_pc.h
index 48918f46d5..2bb3ea4374 100644
--- a/src/gallium/drivers/nv50/nv50_pc.h
+++ b/src/gallium/drivers/nv50/nv50_pc.h
@@ -257,6 +257,12 @@ struct nv_instruction {
 #define CFG_EDGE_BACK        1
 #define CFG_EDGE_LOOP_ENTER  2
 #define CFG_EDGE_LOOP_LEAVE  4
+#define CFG_EDGE_FAKE        8
+
+/* 'WALL' edge means where reachability check doesn't follow */
+/* 'LOOP' edge means just having to do with loops */
+#define IS_LOOP_EDGE(k) ((k) & 7)
+#define IS_WALL_EDGE(k) ((k) & 9)
 
 struct nv_basic_block {
    struct nv_instruction *entry; /* first non-phi instruction */
diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c
index 4b1cd56fc1..1d2710a8ac 100644
--- a/src/gallium/drivers/nv50/nv50_pc_optimize.c
+++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c
@@ -362,6 +362,9 @@ nv_pass_fold_loads(struct nv_pass *ctx, struct nv_basic_block *b)
          nv_reference(ctx->pc, &nvi->src[j], ld->src[0]->value);
          if (ld->src[4])
             nv_reference(ctx->pc, &nvi->src[4], ld->src[4]->value);
+
+         if (!nv_nvi_refcount(ld))
+            nv_nvi_delete(ld);
       }
    }
    DESCEND_ARBITRARY(j, nv_pass_fold_loads);
@@ -504,7 +507,7 @@ constant_expression(struct nv_pc *pc, struct nv_instruction *nvi,
    u1.u32 = src1->reg.imm.u32;
 
    modifiers_apply(&u0.u32, type, nvi->src[0]->mod);
-   modifiers_apply(&u0.u32, type, nvi->src[1]->mod);
+   modifiers_apply(&u1.u32, type, nvi->src[1]->mod);
 
    switch (nvi->opcode) {
    case NV_OP_MAD:
@@ -951,7 +954,9 @@ nv_pass_flatten(struct nv_pass *ctx, struct nv_basic_block *b)
          if (b->exit && b->exit->opcode == NV_OP_JOINAT)
             nv_nvi_delete(b->exit);
 
-         if ((nvi = b->out[0]->out[0]->entry)) {
+         i = (b->out[0]->out_kind[0] == CFG_EDGE_LOOP_LEAVE) ? 1 : 0;
+
+         if ((nvi = b->out[0]->out[i]->entry)) {
             nvi->is_join = 0;
             if (nvi->opcode == NV_OP_JOIN)
                nv_nvi_delete(nvi);
@@ -980,7 +985,8 @@ nv_pass_cse(struct nv_pass *ctx, struct nv_basic_block *b)
             if (ir->opcode != ik->opcode)
                continue;
 
-            if (ik->opcode == NV_OP_LDA ||
+            if (!ir->def[0] || !ik->def[0] ||
+                ik->opcode == NV_OP_LDA ||
                 ik->opcode == NV_OP_STA ||
                 ik->opcode == NV_OP_MOV ||
                 nv_is_vector_op(ik->opcode))
@@ -993,8 +999,6 @@ nv_pass_cse(struct nv_pass *ctx, struct nv_basic_block *b)
                 ik->flags_def || ir->flags_def)
                continue; /* and also not with flags, for now */
 
-            assert(ik->def[0] && ir->def[0]);
-
             if (ik->def[0]->reg.file == NV_FILE_OUT ||
                 ir->def[0]->reg.file == NV_FILE_OUT ||
                 !values_equal(ik->def[0], ir->def[0]))
diff --git a/src/gallium/drivers/nv50/nv50_pc_regalloc.c b/src/gallium/drivers/nv50/nv50_pc_regalloc.c
index 59462cc11e..81decf8d4a 100644
--- a/src/gallium/drivers/nv50/nv50_pc_regalloc.c
+++ b/src/gallium/drivers/nv50/nv50_pc_regalloc.c
@@ -22,6 +22,10 @@
 
 /* #define NV50PC_DEBUG */
 
+/* #define NV50_RA_DEBUG_LIVEI */
+/* #define NV50_RA_DEBUG_LIVE_SETS */
+/* #define NV50_RA_DEBUG_JOIN */
+
 #include "nv50_context.h"
 #include "nv50_pc.h"
 
@@ -119,7 +123,7 @@ add_range(struct nv_value *val, struct nv_basic_block *b, int end)
    add_range_ex(val, bgn, end, NULL);
 }
 
-#ifdef NV50_RA_DEBUG_JOIN
+#if defined(NV50_RA_DEBUG_JOIN) || defined(NV50_RA_DEBUG_LIVEI)
 static void
 livei_print(struct nv_value *a)
 {
@@ -359,16 +363,37 @@ need_new_else_block(struct nv_basic_block *b, struct nv_basic_block *p)
    int i = 0, n = 0;
 
    for (; i < 2; ++i)
-      if (p->out[i] && p->out_kind[i] != CFG_EDGE_LOOP_LEAVE)
+      if (p->out[i] && !IS_LOOP_EDGE(p->out_kind[i]))
          ++n;
 
    return (b->num_in > 1) && (n == 2);
 }
 
+static int
+phi_opnd_for_bb(struct nv_instruction *phi, struct nv_basic_block *b,
+                struct nv_basic_block *tb)
+{
+   int i, j;
+
+   for (j = -1, i = 0; i < 4 && phi->src[i]; ++i) {
+      if (!nvbb_reachable_by(b, phi->src[i]->value->insn->bb, tb))
+         continue;
+      /* NOTE: back-edges are ignored by the reachable-by check */
+      if (j < 0 || !nvbb_reachable_by(phi->src[j]->value->insn->bb,
+                                      phi->src[i]->value->insn->bb, tb))
+         j = i;
+   }
+   return j;
+}
+
 /* For each operand of each PHI in b, generate a new value by inserting a MOV
  * at the end of the block it is coming from and replace the operand with its
  * result. This eliminates liveness conflicts and enables us to let values be
  * copied to the right register if such a conflict exists nonetheless.
+ *
+ * These MOVs are also crucial in making sure the live intervals of phi srces
+ * are extended until the end of the loop, since they are not included in the
+ * live-in sets.
  */
 static int
 pass_generate_phi_movs(struct nv_pc_pass *ctx, struct nv_basic_block *b)
@@ -404,14 +429,17 @@ pass_generate_phi_movs(struct nv_pc_pass *ctx, struct nv_basic_block *b)
       ctx->pc->current_block = pn;
 
       for (i = b->phi; i && i->opcode == NV_OP_PHI; i = i->next) {
-         for (j = 0; j < 4 && i->src[j]; ++j) {
-            if (nvbb_reachable_by(p, i->src[j]->value->insn->bb, b))
-               break;
-         }
-         if (j >= 4 || !i->src[j])
+         if ((j = phi_opnd_for_bb(i, p, b)) < 0)
             continue;
          val = i->src[j]->value;
 
+         if (i->src[j]->flags) {
+            val = val->insn->src[0]->value;
+            while (j < 4 && i->src[j])
+               ++j;
+            assert(j < 4);
+         }
+
          ni = new_instruction(ctx->pc, NV_OP_MOV);
 
          /* TODO: insert instruction at correct position in the first place */
@@ -423,6 +451,8 @@ pass_generate_phi_movs(struct nv_pc_pass *ctx, struct nv_basic_block *b)
          ni->src[0] = new_ref(ctx->pc, val);
 
          nv_reference(ctx->pc, &i->src[j], ni->def[0]);
+
+         i->src[j]->flags = 1;
       }
 
       if (pn != p && pn->exit) {
@@ -452,8 +482,8 @@ pass_join_values(struct nv_pc_pass *ctx, int iter)
       case NV_OP_PHI:
          if (!iter)
             continue;
-         try_join_values(ctx, i->src[0]->value, i->src[1]->value);
-         try_join_values(ctx, i->def[0], i->src[0]->value);
+         for (c = 0; c < 4 && i->src[c]; ++c)
+            try_join_values(ctx, i->def[0], i->src[c]->value);
          break;
       case NV_OP_MOV:
          if (iter && i->src[0]->value->insn &&
@@ -576,22 +606,6 @@ pass_build_live_sets(struct nv_pc_pass *ctx, struct nv_basic_block *b)
          for (j = 0; j < (ctx->pc->num_values + 31) / 32; ++j)
             b->live_set[j] |= b->out[n]->live_set[j];
       }
-
-      /* Kick values out of our live set that are created in incoming
-       * blocks of our successors that are not us.
-       */
-      for (i = b->out[n]->phi; i && i->opcode == NV_OP_PHI; i = i->next) {
-         for (j = 0; j < 4; ++j) {
-            if (!i->src[j])
-               break;
-            assert(i->src[j]->value->insn);
-
-            if (nvbb_reachable_by(b, i->src[j]->value->insn->bb, b->out[n]))
-               live_set_add(b, i->src[j]->value);
-            else
-               live_set_rem(b, i->src[j]->value);
-         }
-      }
    }
 
    if (!b->entry)
@@ -599,7 +613,7 @@ pass_build_live_sets(struct nv_pc_pass *ctx, struct nv_basic_block *b)
 
    bb_live_set_print(ctx->pc, b);
 
-   for (i = b->exit; i; i = i->prev) {
+   for (i = b->exit; i != b->entry->prev; i = i->prev) {
       for (j = 0; j < 4; j++) {
          if (!i->def[j])
             break;
@@ -617,6 +631,9 @@ pass_build_live_sets(struct nv_pc_pass *ctx, struct nv_basic_block *b)
       if (i->flags_src)
          live_set_add(b, i->flags_src->value);
    }
+   for (i = b->phi; i && i->opcode == NV_OP_PHI; i = i->next)
+      live_set_rem(b, i->def[0]);
+
    bb_live_set_print(ctx->pc, b);
 
    return 0;
@@ -680,10 +697,12 @@ pass_build_intervals(struct nv_pc_pass *ctx, struct nv_basic_block *b)
       for (j = 0; j < ctx->pc->num_values; ++j) {
          if (!(b->live_set[j / 32] & (1 << (j % 32))))
             continue;
+         add_range(&ctx->pc->values[j], b, b->exit->serial + 1);
 #ifdef NV50_RA_DEBUG_LIVEI
-         debug_printf("adding range for live value %i\n", j);
+         debug_printf("adding range for live value %i: ", j);
+         livei_print(&ctx->pc->values[j]);
 #endif
-         add_range(&ctx->pc->values[j], b, b->exit->serial + 1);
+
       }
    }
 
@@ -702,20 +721,22 @@ pass_build_intervals(struct nv_pc_pass *ctx, struct nv_basic_block *b)
       for (j = 0; j < 5; ++j) {
          if (i->src[j] && !live_set_test(b, i->src[j])) {
             live_set_add(b, i->src[j]->value);
+            add_range(i->src[j]->value, b, i->serial);
 #ifdef NV50_RA_DEBUG_LIVEI
-            debug_printf("adding range for source that ends living: %i\n",
+            debug_printf("adding range for source %i (ends living): ",
                          i->src[j]->value->n);
+            livei_print(i->src[j]->value);
 #endif
-            add_range(i->src[j]->value, b, i->serial);
          }
       }
       if (i->flags_src && !live_set_test(b, i->flags_src)) {
          live_set_add(b, i->flags_src->value);
+         add_range(i->flags_src->value, b, i->serial);
 #ifdef NV50_RA_DEBUG_LIVEI
-         debug_printf("adding range for source that ends living: %i\n",
+         debug_printf("adding range for source %i (ends living): ",
                       i->flags_src->value->n);
+         livei_print(i->flags_src->value);
 #endif
-         add_range(i->flags_src->value, b, i->serial);
       }
    }
 
diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
index 115b5df939..8b18a9c025 100644
--- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
+++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
@@ -120,6 +120,8 @@ struct bld_context {
    struct nv_basic_block *brkt_bb[BLD_MAX_LOOP_NESTING];
    int loop_lvl;
 
+   ubyte out_kind; /* CFG_EDGE_FORWARD, or FAKE in case of BREAK/CONT */
+
    struct bld_value_stack tvs[BLD_MAX_TEMPS][4]; /* TGSI_FILE_TEMPORARY */
    struct bld_value_stack avs[BLD_MAX_ADDRS][4]; /* TGSI_FILE_ADDRESS */
    struct bld_value_stack pvs[BLD_MAX_PREDS][4]; /* TGSI_FILE_PREDICATE */
@@ -268,7 +270,7 @@ fetch_by_bb(struct bld_value_stack *stack,
       return;
    }
    for (i = 0; i < b->num_in; ++i)
-      if (b->in_kind[i] != CFG_EDGE_BACK)
+      if (!IS_WALL_EDGE(b->in_kind[i]))
          fetch_by_bb(stack, vals, n, b->in[i]);
 }
 
@@ -362,18 +364,31 @@ bld_phi(struct bld_context *bld, struct nv_basic_block *b,
    return phi->def[0];
 }
 
+/* Insert a phi function in the loop header.
+ * For nested loops, we need to insert phi functions in all the outer
+ * loop headers if they don't have one yet.
+ *
+ * @def: redefinition from inside loop, or NULL if to be replaced later
+ */
 static struct nv_value *
 bld_loop_phi(struct bld_context *bld, struct bld_value_stack *stack,
              struct nv_value *def)
 {
-   struct nv_basic_block *bb = bld->pc->current_block;
    struct nv_instruction *phi;
-   struct nv_value *val;
+   struct nv_basic_block *bb = bld->pc->current_block;
+   struct nv_value *val = NULL;
 
-   val = bld_phi(bld, bld->pc->current_block, stack);
+   if (bld->loop_lvl > 1) {
+      --bld->loop_lvl;
+      if (!((stack->loop_def | stack->loop_use) & (1 << bld->loop_lvl)))
+         val = bld_loop_phi(bld, stack, NULL);
+      ++bld->loop_lvl;
+   }
+
+   if (!val)
+      val = bld_phi(bld, bld->pc->current_block, stack); /* old definition */
    if (!val) {
       bld->pc->current_block = bld->loop_bb[bld->loop_lvl - 1]->in[0];
-
       val = bld_undef(bld, bld_stack_file(bld, stack));
    }
 
@@ -449,10 +464,11 @@ bld_replace_value(struct nv_pc *, struct nv_basic_block *, struct nv_value *,
 static void
 bld_loop_end(struct bld_context *bld, struct nv_basic_block *bb)
 {
+   struct nv_basic_block *save = bld->pc->current_block;
    struct nv_instruction *phi, *next;
    struct nv_value *val;
    struct bld_value_stack *stk;
-   int s;
+   int i, s, n;
 
    for (phi = bb->phi; phi && phi->opcode == NV_OP_PHI; phi = next) {
       next = phi->next;
@@ -460,19 +476,33 @@ bld_loop_end(struct bld_context *bld, struct nv_basic_block *bb)
       stk = (struct bld_value_stack *)phi->target;
       phi->target = NULL;
 
-      val = bld_fetch_global(bld, stk);
+      for (s = 1, n = 0; n < bb->num_in; ++n) {
+         if (bb->in_kind[n] != CFG_EDGE_BACK)
+            continue;
 
-      nv_reference(bld->pc, &phi->src[1], val);
+         assert(s < 4);
+         bld->pc->current_block = bb->in[n];
+         val = bld_fetch_global(bld, stk);
+
+         for (i = 0; i < 4; ++i)
+            if (phi->src[i] && phi->src[i]->value == val)
+               break;
+         if (i == 4)
+            nv_reference(bld->pc, &phi->src[s++], val);
+      }
+      bld->pc->current_block = save;
 
-      s = -1;
       if (phi->src[0]->value == phi->def[0] ||
           phi->src[0]->value == phi->src[1]->value)
          s = 1;
       else
       if (phi->src[1]->value == phi->def[0])
          s = 0;
+      else
+         continue;
 
       if (s >= 0) {
+         /* eliminate the phi */
          bld_vals_del_val(stk, phi->def[0]);
 
          ++bld->pc->pass_seq;
@@ -915,6 +945,8 @@ bld_new_block(struct bld_context *bld, struct nv_basic_block *b)
 
    for (i = 0; i < 128; ++i)
       bld->saved_inputs[i] = NULL;
+
+   bld->out_kind = CFG_EDGE_FORWARD;
 }
 
 static struct nv_value *
@@ -1366,7 +1398,7 @@ bld_instruction(struct bld_context *bld,
       struct nv_basic_block *b = new_basic_block(bld->pc);
 
       --bld->cond_lvl;
-      nvbb_attach_block(bld->pc->current_block, b, CFG_EDGE_FORWARD);
+      nvbb_attach_block(bld->pc->current_block, b, bld->out_kind);
       nvbb_attach_block(bld->cond_bb[bld->cond_lvl], b, CFG_EDGE_FORWARD);
 
       bld->cond_bb[bld->cond_lvl]->exit->target = b;
@@ -1407,8 +1439,10 @@ bld_instruction(struct bld_context *bld,
 
       bld_flow(bld, NV_OP_BREAK, NV_CC_TR, NULL, bb, FALSE);
 
-      /* XXX: don't do this for redundant BRKs */
-      nvbb_attach_block(bld->pc->current_block, bb, CFG_EDGE_LOOP_LEAVE);
+      if (bld->out_kind == CFG_EDGE_FORWARD) /* else we already had BRK/CONT */
+         nvbb_attach_block(bld->pc->current_block, bb, CFG_EDGE_LOOP_LEAVE);
+
+      bld->out_kind = CFG_EDGE_FAKE;
    }
       break;
    case TGSI_OPCODE_CONT:
@@ -1418,11 +1452,17 @@ bld_instruction(struct bld_context *bld,
       bld_flow(bld, NV_OP_BRA, NV_CC_TR, NULL, bb, FALSE);
 
       nvbb_attach_block(bld->pc->current_block, bb, CFG_EDGE_BACK);
+
+      if ((bb = bld->join_bb[bld->cond_lvl - 1])) {
+         bld->join_bb[bld->cond_lvl - 1] = NULL;
+         nv_nvi_delete(bb->exit->prev);
+      }
+      bld->out_kind = CFG_EDGE_FAKE;
    }
       break;
    case TGSI_OPCODE_ENDLOOP:
    {
-      struct nv_basic_block *bb = bld->loop_bb[--bld->loop_lvl];
+      struct nv_basic_block *bb = bld->loop_bb[bld->loop_lvl - 1];
 
       bld_flow(bld, NV_OP_BRA, NV_CC_TR, NULL, bb, FALSE);
 
@@ -1430,7 +1470,7 @@ bld_instruction(struct bld_context *bld,
 
       bld_loop_end(bld, bb); /* replace loop-side operand of the phis */
 
-      bld_new_block(bld, bld->brkt_bb[bld->loop_lvl]);
+      bld_new_block(bld, bld->brkt_bb[--bld->loop_lvl]);
    }
       break;
    case TGSI_OPCODE_ABS:
@@ -1651,7 +1691,7 @@ bld_replace_value(struct nv_pc *pc, struct nv_basic_block *b,
 {
    struct nv_instruction *nvi;
 
-   for (nvi = b->entry; nvi; nvi = nvi->next) {
+   for (nvi = b->phi ? b->phi : b->entry; nvi; nvi = nvi->next) {
       int s;
       for (s = 0; s < 5; ++s) {
          if (!nvi->src[s])
author	Christoph Bumiller <e0425955@student.tuwien.ac.at>	2010-09-01 17:54:56 +0200
committer	Christoph Bumiller <e0425955@student.tuwien.ac.at>	2010-09-01 18:02:50 +0200
commit	0a8292e096bc37eeb225bf7d3854b6b6edc4bceb (patch)
tree	80bbb0365721665a8ff932e7b6ccb7a7c44a3e5c /src/gallium/drivers/nv50
parent	d90502b2b468732e2a42985580bbbe9d9fdfd14e (diff)