1 files changed, 717 insertions, 0 deletions
diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c
new file mode 100644
index 0000000000..0811420e42
--- /dev/null
+++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c
@@ -0,0 +1,717 @@
+
+#include "nv50_pc.h"
+
+#define DESCEND_ARBITRARY(j, f)                                 \
+do {                                                            \
+   b->pass_seq = ctx->pc->pass_seq;                             \
+                                                                \
+   for (j = 0; j < 2; ++j)                                      \
+      if (b->out[j] && b->out[j]->pass_seq < ctx->pc->pass_seq) \
+         f(ctx, b->out[j]);	                                  \
+} while (0)
+
+extern unsigned nv50_inst_min_size(struct nv_instruction *);
+
+struct nv_pc_pass {
+   struct nv_pc *pc;
+};
+
+static INLINE boolean
+values_equal(struct nv_value *a, struct nv_value *b)
+{
+   /* XXX: sizes */
+   return (a->reg.file == b->reg.file && a->join->reg.id == b->join->reg.id);
+}
+
+static INLINE boolean
+inst_commutation_check(struct nv_instruction *a,
+                       struct nv_instruction *b)
+{
+   int si, di;
+
+   for (di = 0; di < 4; ++di) {
+      if (!a->def[di])
+         break;
+      for (si = 0; si < 5; ++si) {
+         if (!b->src[si])
+            continue;
+         if (values_equal(a->def[di], b->src[si]->value))
+            return FALSE;
+      }
+   }
+
+   if (b->flags_src && b->flags_src->value == a->flags_def)
+      return FALSE;
+
+   return TRUE;
+}
+
+/* Check whether we can swap the order of the instructions,
+ * where a & b may be either the earlier or the later one.
+ */
+static boolean
+inst_commutation_legal(struct nv_instruction *a,
+		       struct nv_instruction *b)
+{
+   return inst_commutation_check(a, b) && inst_commutation_check(b, a);
+}
+
+static INLINE boolean
+inst_cullable(struct nv_instruction *nvi)
+{
+   return (!(nvi->is_terminator ||
+             nvi->target ||
+             nvi->fixed ||
+             nv_nvi_refcount(nvi)));
+}
+
+static INLINE boolean
+nvi_isnop(struct nv_instruction *nvi)
+{
+   if (nvi->opcode == NV_OP_EXPORT)
+      return TRUE;
+
+   if (nvi->fixed ||
+       nvi->is_terminator ||
+       nvi->flags_src ||
+       nvi->flags_def)
+      return FALSE;
+
+   if (nvi->def[0]->join->reg.id < 0)
+      return TRUE;
+
+   if (nvi->opcode != NV_OP_MOV && nvi->opcode != NV_OP_SELECT)
+      return FALSE;
+
+   if (nvi->def[0]->reg.file != nvi->src[0]->value->reg.file)
+      return FALSE;
+
+   if (nvi->src[0]->value->join->reg.id < 0) {
+      debug_printf("nvi_isnop: orphaned value detected\n");
+      return TRUE;
+   }
+
+   if (nvi->opcode == NV_OP_SELECT)
+      if (!values_equal(nvi->def[0], nvi->src[1]->value))
+         return FALSE;
+
+   return values_equal(nvi->def[0], nvi->src[0]->value);
+}
+
+static void
+nv_pc_pass_pre_emission(struct nv_pc *pc, struct nv_basic_block *b)
+{
+   struct nv_instruction *nvi, *next;
+   int j;
+   uint size, n32 = 0;
+
+   b->priv = 0;
+
+   if (pc->num_blocks)
+      b->bin_pos = pc->bb_list[pc->num_blocks - 1]->bin_pos +
+                   pc->bb_list[pc->num_blocks - 1]->bin_size;
+
+   pc->bb_list[pc->num_blocks++] = b;
+
+   /* visit node */
+
+   for (nvi = b->entry; nvi; nvi = next) {
+      next = nvi->next;
+      if (nvi_isnop(nvi))
+         nv_nvi_delete(nvi);
+   }
+
+   for (nvi = b->entry; nvi; nvi = next) {
+      next = nvi->next;
+
+      size = nv50_inst_min_size(nvi);
+      if (nvi->next && size < 8)
+         ++n32;
+      else
+      if ((n32 & 1) && nvi->next &&
+          nv50_inst_min_size(nvi->next) == 4 &&
+          inst_commutation_legal(nvi, nvi->next)) {
+         ++n32;
+         debug_printf("permuting: ");
+         nv_print_instruction(nvi);
+         nv_print_instruction(nvi->next);
+         nv_nvi_permute(nvi, nvi->next);
+         next = nvi;
+      } else {
+         nvi->is_long = 1;
+
+         b->bin_size += n32 & 1;
+         if (n32 & 1)
+            nvi->prev->is_long = 1;
+         n32 = 0;
+      }
+      b->bin_size += 1 + nvi->is_long;
+   }
+
+   if (!b->entry) {
+      debug_printf("block %p is now empty\n", b);
+   } else
+   if (!b->exit->is_long) {
+      assert(n32);
+      b->exit->is_long = 1;
+      b->bin_size += 1;
+
+      /* might have del'd a hole tail of instructions */
+      if (!b->exit->prev->is_long && !(n32 & 1)) {
+         b->bin_size += 1;
+         b->exit->prev->is_long = 1;
+      }
+   }
+   assert(!b->exit || b->exit->is_long);
+
+   pc->bin_size += b->bin_size *= 4;
+
+   /* descend CFG */
+
+   if (!b->out[0])
+      return;
+   if (!b->out[1] && ++(b->out[0]->priv) != b->out[0]->num_in)
+      return;
+
+#if 0
+   /* delete ELSE branch */
+   if (b->entry &&
+       b->entry->opcode == NV_OP_BRA && b->entry->target == b->out[0]) {
+      nv_nvi_delete(b->entry);
+      b->bin_size -= 2;
+      pc->bin_size -= 8;
+   }
+#endif
+   for (j = 0; j < 2; ++j)
+      if (b->out[j] && b->out[j] != b)
+         nv_pc_pass_pre_emission(pc, b->out[j]);
+}
+
+int
+nv_pc_exec_pass2(struct nv_pc *pc)
+{
+   debug_printf("preparing %u blocks for emission\n", pc->num_blocks);
+
+   pc->bb_list = CALLOC(pc->num_blocks, sizeof(struct nv_basic_block *));
+  
+   pc->num_blocks = 0;
+   nv_pc_pass_pre_emission(pc, pc->root);
+
+   return 0;
+}
+
+static INLINE boolean
+is_cmem_load(struct nv_instruction *nvi)
+{
+   return (nvi->opcode == NV_OP_LDA &&
+	   nvi->src[0]->value->reg.file >= NV_FILE_MEM_C(0) &&
+	   nvi->src[0]->value->reg.file <= NV_FILE_MEM_C(15));
+}
+
+static INLINE boolean
+is_smem_load(struct nv_instruction *nvi)
+{
+   return (nvi->opcode == NV_OP_LDA &&
+	   (nvi->src[0]->value->reg.file == NV_FILE_MEM_S ||
+	    nvi->src[0]->value->reg.file <= NV_FILE_MEM_P));
+}
+
+static INLINE boolean
+is_immd_move(struct nv_instruction *nvi)
+{
+   return (nvi->opcode == NV_OP_MOV &&
+	   nvi->src[0]->value->reg.file == NV_FILE_IMM);
+}
+
+static INLINE void
+check_swap_src_0_1(struct nv_instruction *nvi)
+{
+   static const ubyte cc_swapped[8] = { 0, 4, 2, 6, 1, 5, 3, 7 };
+
+   struct nv_ref *src0 = nvi->src[0], *src1 = nvi->src[1];
+
+   if (!nv_op_commutative(nvi->opcode))
+      return;
+   assert(src0 && src1);
+
+   if (is_cmem_load(src0->value->insn)) {
+      if (!is_cmem_load(src1->value->insn)) {
+         nvi->src[0] = src1;
+	 nvi->src[1] = src0;
+	 /* debug_printf("swapping cmem load to 1\n"); */
+      }
+   } else
+   if (is_smem_load(src1->value->insn)) {
+      if (!is_smem_load(src0->value->insn)) {
+         nvi->src[0] = src1;
+	 nvi->src[1] = src0;
+	 /* debug_printf("swapping smem load to 0\n"); */
+      }
+   }
+
+   if (nvi->opcode == NV_OP_SET && nvi->src[0] != src0)
+      nvi->set_cond = cc_swapped[nvi->set_cond];
+}
+
+struct nv_pass {
+   struct nv_pc *pc;
+   int n;
+   void *priv;
+};
+
+static int
+nv_pass_fold_stores(struct nv_pass *ctx, struct nv_basic_block *b)
+{
+   struct nv_instruction *nvi, *sti;
+   int j;
+
+   for (sti = b->entry; sti; sti = sti->next) {
+      if (!sti->def[0])
+         continue;
+
+      if (sti->def[0]->reg.file != NV_FILE_OUT)
+         continue;
+      if (sti->opcode != NV_OP_MOV && sti->opcode != NV_OP_STA)
+         continue;
+
+      nvi = sti->src[0]->value->insn;
+      if (!nvi || nvi->opcode == NV_OP_PHI)
+         continue;
+      assert(nvi->def[0] == sti->src[0]->value);
+
+      if (nvi->def[0]->refc > 1)
+         continue;
+
+      nvi->def[0] = sti->def[0];
+      nvi->fixed = 1;
+      sti->fixed = 0;
+   }
+   DESCEND_ARBITRARY(j, nv_pass_fold_stores);
+
+   return 0;
+}
+
+static int
+nv_pass_fold_loads(struct nv_pass *ctx, struct nv_basic_block *b)
+{
+   struct nv_instruction *nvi, *ld;
+   int j;
+
+   for (nvi = b->entry; nvi; nvi = nvi->next) {
+      check_swap_src_0_1(nvi);
+
+      for (j = 0; j < 3; ++j) {
+         if (!nvi->src[j])
+            break;
+         ld = nvi->src[j]->value->insn;
+         if (!ld)
+            continue;
+
+         if (is_immd_move(ld) && nv50_nvi_can_use_imm(nvi, j)) {
+            nv_reference(ctx->pc, &nvi->src[j], ld->src[0]->value);
+            debug_printf("folded immediate %i\n", ld->def[0]->n);
+            continue;
+         }
+
+         if (ld->opcode != NV_OP_LDA)
+            continue;
+         if (!nv50_nvi_can_load(nvi, j, ld->src[0]->value))
+            continue;
+
+         if (j == 0 && ld->src[4]) /* can't load shared mem */
+            continue;
+
+         /* fold it ! */ /* XXX: ref->insn */
+         nv_reference(ctx->pc, &nvi->src[j], ld->src[0]->value);
+         if (ld->src[4])
+            nv_reference(ctx->pc, &nvi->src[4], ld->src[4]->value);
+      }
+   }
+   DESCEND_ARBITRARY(j, nv_pass_fold_loads);
+
+   return 0;
+}
+
+static int
+nv_pass_lower_mods(struct nv_pass *ctx, struct nv_basic_block *b)
+{
+   int j;
+   struct nv_instruction *nvi, *mi, *next;
+   ubyte mod;
+
+   for (nvi = b->entry; nvi; nvi = next) {
+      next = nvi->next;
+      if (nvi->opcode == NV_OP_SUB) {
+         nvi->opcode = NV_OP_ADD;
+         nvi->src[1]->mod ^= NV_MOD_NEG;
+      }
+
+      /* should not put any modifiers on NEG and ABS */
+      assert(nvi->opcode != NV_MOD_NEG || !nvi->src[0]->mod);
+      assert(nvi->opcode != NV_MOD_ABS || !nvi->src[0]->mod);
+
+      for (j = 0; j < 4; ++j) {
+         if (!nvi->src[j])
+            break;
+
+         mi = nvi->src[j]->value->insn;
+         if (!mi)
+            continue;
+         if (mi->def[0]->refc > 1)
+            continue;
+
+         if (mi->opcode == NV_OP_NEG) mod = NV_MOD_NEG;
+         else
+         if (mi->opcode == NV_OP_ABS) mod = NV_MOD_ABS;
+         else
+            continue;
+
+         if (nvi->opcode == NV_OP_ABS)
+            mod &= ~(NV_MOD_NEG | NV_MOD_ABS);
+         else
+         if (nvi->opcode == NV_OP_NEG && mod == NV_MOD_NEG) {
+            nvi->opcode = NV_OP_MOV;
+            mod = 0;
+         }
+
+         if (!(nv50_supported_src_mods(nvi->opcode, j) & mod))
+            continue;
+
+         nv_reference(ctx->pc, &nvi->src[j], mi->src[0]->value);
+
+         nvi->src[j]->mod ^= mod;
+      }
+
+      if (nvi->opcode == NV_OP_SAT) {
+         mi = nvi->src[0]->value->insn;
+
+         if ((mi->opcode == NV_OP_MAD) && !mi->flags_def) {
+            mi->saturate = 1;
+            mi->def[0] = nvi->def[0];
+            nv_nvi_delete(nvi);
+         }
+      }
+   }
+   DESCEND_ARBITRARY(j, nv_pass_lower_mods);
+
+   return 0;
+}
+
+#define SRC_IS_MUL(s) ((s)->insn && (s)->insn->opcode == NV_OP_MUL)
+
+static struct nv_value *
+find_immediate(struct nv_ref *ref)
+{
+   struct nv_value *src;
+
+   if (!ref)
+      return NULL;
+
+   src = ref->value;
+   while (src->insn && src->insn->opcode == NV_OP_MOV) {
+      assert(!src->insn->src[0]->mod);
+      src = src->insn->src[0]->value;
+   }
+   return (src->reg.file == NV_FILE_IMM) ? src : NULL;
+}
+
+static void
+constant_operand(struct nv_pc *pc,
+                 struct nv_instruction *nvi, struct nv_value *val, int s)
+{
+   int t = s ? 0 : 1;
+   ubyte type;
+
+   if (!nvi->def[0])
+      return;
+   type = nvi->def[0]->reg.type;
+
+   switch (nvi->opcode) {
+   case NV_OP_MUL:
+      if ((type == NV_TYPE_F32 && val->reg.imm.f32 == 1.0f) ||
+          (NV_TYPE_ISINT(type) && val->reg.imm.u32 == 1)) {
+         nvi->opcode = NV_OP_MOV;
+         nv_reference(pc, &nvi->src[s], NULL);
+         if (!s) {
+            nvi->src[0] = nvi->src[1];
+            nvi->src[1] = NULL;
+         }
+      } else
+      if ((type == NV_TYPE_F32 && val->reg.imm.f32 == 2.0f) ||
+          (NV_TYPE_ISINT(type) && val->reg.imm.u32 == 2)) {
+         nvi->opcode = NV_OP_ADD;
+         nv_reference(pc, &nvi->src[s], NULL);
+         if (!s) {
+            nvi->src[0] = nvi->src[1];
+            nvi->src[1] = NULL;
+         }
+      } else
+      if (type == NV_TYPE_F32 && val->reg.imm.f32 == -1.0f) {
+         nvi->opcode = NV_OP_NEG;
+         nv_reference(pc, &nvi->src[s], NULL);
+         nvi->src[0] = nvi->src[t];
+         nvi->src[1] = NULL;
+      } else
+      if (type == NV_TYPE_F32 && val->reg.imm.f32 == -2.0f) {
+         nvi->opcode = NV_OP_ADD;
+         assert(!nvi->src[s]->mod);
+         nv_reference(pc, &nvi->src[s], nvi->src[t]->value);
+         nvi->src[t]->mod ^= NV_MOD_NEG;
+         nvi->src[s]->mod |= NV_MOD_NEG;
+      } else
+      if (val->reg.imm.u32 == 0) {
+         nvi->opcode = NV_OP_MOV;
+         nv_reference(pc, &nvi->src[t], NULL);
+         if (s) {
+            nvi->src[0] = nvi->src[1];
+            nvi->src[1] = NULL;
+         }
+      }
+      break;
+   case NV_OP_ADD:
+      if (val->reg.imm.u32 == 0) {
+         nvi->opcode = NV_OP_MOV;
+         nv_reference(pc, &nvi->src[s], NULL);
+         nvi->src[0] = nvi->src[t];
+         nvi->src[1] = NULL;
+      }
+      break;
+   default:
+      break;
+   }
+}
+
+static int
+nv_pass_lower_arith(struct nv_pass *ctx, struct nv_basic_block *b)
+{
+   struct nv_instruction *nvi, *next;
+   int j;
+
+   for (nvi = b->entry; nvi; nvi = next) {
+      struct nv_value *src0, *src1, *src;
+      int mod;
+
+      next = nvi->next;
+
+      if ((src = find_immediate(nvi->src[0])) != NULL)
+         constant_operand(ctx->pc, nvi, src, 0);
+      else
+      if ((src = find_immediate(nvi->src[1])) != NULL)
+         constant_operand(ctx->pc, nvi, src, 1);
+
+      /* try to combine MUL, ADD into MAD */
+      if (nvi->opcode != NV_OP_ADD)
+         continue;
+
+      src0 = nvi->src[0]->value;
+      src1 = nvi->src[1]->value;
+
+      if (SRC_IS_MUL(src0) && src0->refc == 1)
+         src = src0;
+      else
+      if (SRC_IS_MUL(src1) && src1->refc == 1)
+         src = src1;
+      else
+         continue;
+
+      nvi->opcode = NV_OP_MAD;
+      mod = nvi->src[(src == src0) ? 0 : 1]->mod;
+      nv_reference(ctx->pc, &nvi->src[(src == src0) ? 0 : 1], NULL);
+      nvi->src[2] = nvi->src[(src == src0) ? 1 : 0];
+
+      assert(!(mod & ~NV_MOD_NEG));
+      nvi->src[0] = new_ref(ctx->pc, src->insn->src[0]->value);
+      nvi->src[1] = new_ref(ctx->pc, src->insn->src[1]->value);
+      nvi->src[0]->mod = src->insn->src[0]->mod ^ mod;
+      nvi->src[1]->mod = src->insn->src[1]->mod;
+   }
+   DESCEND_ARBITRARY(j, nv_pass_lower_arith);
+
+   return 0;
+}
+
+/*
+set $r2 g f32 $r2 $r3
+cvt abs rn f32 $r2 s32 $r2
+cvt f32 $c0 # f32 $r2
+e $c0 bra 0x80
+*/
+#if 0
+static int
+nv_pass_lower_cond(struct nv_pass *ctx, struct nv_basic_block *b)
+{
+   /* XXX: easier in IR builder for now */
+   return 0;
+}
+#endif
+
+/* TODO: reload elimination, redundant store elimination */
+
+struct nv_pass_reldelim {
+   struct nv_pc *pc;
+};
+
+static int
+nv_pass_reload_elim(struct nv_pass_reldelim *ctx, struct nv_basic_block *b)
+{
+   int j;
+   struct nv_instruction *ld, *next;
+
+   for (ld = b->entry; ld; ld = next) {
+      next = ld->next;
+
+      if (ld->opcode == NV_OP_LINTERP || ld->opcode == NV_OP_PINTERP) {
+
+      } else
+      if (ld->opcode == NV_OP_LDA) {
+         
+      } else
+      if (ld->opcode == NV_OP_MOV) {
+         
+      }
+   }
+   DESCEND_ARBITRARY(j, nv_pass_reload_elim);
+
+   return 0;
+}
+
+static int
+nv_pass_tex_mask(struct nv_pass *ctx, struct nv_basic_block *b)
+{
+   int i, c, j;
+
+   for (i = 0; i < ctx->pc->num_instructions; ++i) {
+      struct nv_instruction *nvi = &ctx->pc->instructions[i];
+      struct nv_value *def[4];
+
+      if (!nv_is_vector_op(nvi->opcode))
+         continue;
+      nvi->tex_mask = 0;
+
+      for (c = 0; c < 4; ++c) {
+         if (nvi->def[c]->refc)
+            nvi->tex_mask |= 1 << c;
+         def[c] = nvi->def[c];
+      }
+
+      j = 0;
+      for (c = 0; c < 4; ++c)
+         if (nvi->tex_mask & (1 << c))
+            nvi->def[j++] = def[c];
+      for (c = 0; c < 4; ++c)
+         if (!(nvi->tex_mask & (1 << c)))
+           nvi->def[j++] = def[c];
+      assert(j == 4);
+   }
+   return 0;
+}
+
+struct nv_pass_dce {
+   struct nv_pc *pc;
+   uint removed;
+};
+
+static int
+nv_pass_dce(struct nv_pass_dce *ctx, struct nv_basic_block *b)
+{
+   int j;
+   struct nv_instruction *nvi, *next;
+
+   for (nvi = b->entry; nvi; nvi = next) {
+      next = nvi->next;
+
+      if (inst_cullable(nvi)) {
+         nv_nvi_delete(nvi);
+
+         ++ctx->removed;
+      }
+   }
+   DESCEND_ARBITRARY(j, nv_pass_dce);
+
+   return 0;
+}
+
+static INLINE boolean
+bb_simple_if_endif(struct nv_basic_block *bb)
+{
+   return (bb->out[0] && bb->out[1] &&
+           bb->out[0]->out[0] == bb->out[1] &&
+           !bb->out[0]->out[1]);
+}
+
+static int
+nv_pass_flatten(struct nv_pass *ctx, struct nv_basic_block *b)
+{
+   int j;
+
+   if (bb_simple_if_endif(b)) {
+      ++ctx->n;
+      debug_printf("nv_pass_flatten: total IF/ENDIF constructs: %i\n", ctx->n);
+   }
+   DESCEND_ARBITRARY(j, nv_pass_flatten);
+
+   return 0;
+}
+
+int
+nv_pc_exec_pass0(struct nv_pc *pc)
+{
+   struct nv_pass_reldelim *reldelim;
+   struct nv_pass pass;
+   struct nv_pass_dce dce;
+   int ret;
+
+   reldelim = CALLOC_STRUCT(nv_pass_reldelim);
+   reldelim->pc = pc;
+
+   ret = nv_pass_reload_elim(reldelim, pc->root);
+
+   FREE(reldelim);
+   if (ret)
+      return ret;
+
+   pass.pc = pc;
+
+   pc->pass_seq++;
+   ret = nv_pass_flatten(&pass, pc->root);
+   if (ret)
+      return ret;
+
+   /* Do this first, so we don't have to pay attention
+    * to whether sources are supported memory loads.
+    */
+   pc->pass_seq++;
+   ret = nv_pass_lower_arith(&pass, pc->root);
+   if (ret)
+      return ret;
+
+   pc->pass_seq++;
+   ret = nv_pass_fold_loads(&pass, pc->root);
+   if (ret)
+      return ret;
+
+   pc->pass_seq++;
+   ret = nv_pass_fold_stores(&pass, pc->root);
+   if (ret)
+      return ret;
+
+   pc->pass_seq++;
+   ret = nv_pass_lower_mods(&pass, pc->root);
+   if (ret)
+      return ret;
+
+   dce.pc = pc;
+   do {
+      dce.removed = 0;
+      pc->pass_seq++;
+      ret = nv_pass_dce(&dce, pc->root);
+      if (ret)
+         return ret;
+   } while (dce.removed);
+
+   ret = nv_pass_tex_mask(&pass, pc->root);
+   if (ret)
+      return ret;
+
+   return ret;
+}