1 files changed, 1174 insertions, 0 deletions
diff --git a/src/gallium/drivers/nvc0/nvc0_pc_optimize.c b/src/gallium/drivers/nvc0/nvc0_pc_optimize.c
new file mode 100644
index 0000000000..e9e387632b
--- /dev/null
+++ b/src/gallium/drivers/nvc0/nvc0_pc_optimize.c
@@ -0,0 +1,1174 @@
+/*
+ * Copyright 2010 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+ * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "nvc0_pc.h"
+#include "nvc0_program.h"
+
+#define DESCEND_ARBITRARY(j, f)                                 \
+do { /* uses the caller's locals `b` and `ctx` implicitly */    \
+   b->pass_seq = ctx->pc->pass_seq; /* mark b as visited */     \
+   /* call f on each successor not yet seen in this pass */     \
+   for (j = 0; j < 2; ++j)                                      \
+      if (b->out[j] && b->out[j]->pass_seq < ctx->pc->pass_seq) \
+         f(ctx, b->out[j]);	                                  \
+} while (0)
+
+/* TRUE if a and b live in the same non-memory file and their ranges touch. */
+static INLINE boolean
+registers_interfere(struct nv_value *a, struct nv_value *b)
+{
+   struct nv_value *lo, *hi;
+
+   /* different files, or memory operands: never reported as interfering */
+   if (a->reg.file != b->reg.file || NV_IS_MEMORY_FILE(a->reg.file) ||
+       NV_IS_MEMORY_FILE(b->reg.file))
+      return FALSE;
+   assert(a->join->reg.id >= 0 && b->join->reg.id >= 0);
+   if (a->join->reg.id == b->join->reg.id)
+      return FALSE; /* NOTE(review): equal ids are not flagged; intended? */
+
+   /* order the pair by id; they interfere if lo reaches up to hi */
+   lo = (a->join->reg.id < b->join->reg.id) ? a : b;
+   hi = (lo == a) ? b : a;
+   return (lo->join->reg.id + lo->reg.size >= hi->join->reg.id);
+}
+
+/* TRUE if a and b share file, size and address (memory) or joined id. */
+static INLINE boolean
+values_equal(struct nv_value *a, struct nv_value *b)
+{
+   if (a->reg.file == b->reg.file && a->reg.size == b->reg.size)
+      return NV_IS_MEMORY_FILE(a->reg.file) ?
+         (a->reg.address == b->reg.address) :
+         (a->join->reg.id == b->join->reg.id);
+   return FALSE;
+}
+
+#if 0
+static INLINE boolean
+inst_commutation_check(struct nv_instruction *a, struct nv_instruction *b)
+{
+   int si, di;
+   /* FALSE as soon as any definition of a interferes with a source of b */
+   for (di = 0; di < 4 && a->def[di]; ++di)
+      for (si = 0; si < 5 && b->src[si]; ++si)
+         if (registers_interfere(a->def[di], b->src[si]->value))
+            return FALSE;
+   /* no def of a touches a source of b (this code sits inside #if 0) */
+   return TRUE;
+}
+
+/* Check whether we can swap the order of the instructions, where a & b may
+ * be either the earlier or the later one; the check is applied both ways.
+ * Returns TRUE if the swap is legal. Currently compiled out (#if 0). */
+static boolean
+inst_commutation_legal(struct nv_instruction *a, struct nv_instruction *b)
+{
+   return inst_commutation_check(a, b) && inst_commutation_check(b, a);
+}
+#endif
+
+/* TRUE if nvi may be deleted: not a store, none of terminator / join /
+ * target / fixed set, and nvc0_insn_refcount() reports no references. */
+static INLINE boolean
+inst_removable(struct nv_instruction *nvi)
+{
+   if (nvi->opcode == NV_OP_ST)
+      return FALSE;
+   if (nvi->terminator || nvi->join || nvi->target || nvi->fixed)
+      return FALSE;
+   return !nvc0_insn_refcount(nvi);
+}
+
+static INLINE boolean /* TRUE for instructions that need not be emitted */
+inst_is_noop(struct nv_instruction *nvi)
+{
+   if (nvi->opcode == NV_OP_UNDEF || nvi->opcode == NV_OP_BIND)
+      return TRUE; /* these two opcodes always count as no-ops */
+   if (nvi->terminator || nvi->join)
+      return FALSE;
+   if (nvi->def[0] && nvi->def[0]->join->reg.id < 0)
+      return TRUE; /* result id is negative (presumably never assigned) */
+   if (nvi->opcode != NV_OP_MOV && nvi->opcode != NV_OP_SELECT)
+      return FALSE;
+   if (nvi->def[0]->reg.file != nvi->src[0]->value->reg.file)
+      return FALSE; /* def and source 0 are in different files */
+   /* from here on: MOV or SELECT with def and source 0 in the same file */
+   if (nvi->src[0]->value->join->reg.id < 0) {
+      NOUVEAU_DBG("inst_is_noop: orphaned value detected\n");
+      return TRUE;
+   }
+   /* no-op only if every source already equals the destination */
+   if (nvi->opcode == NV_OP_SELECT)
+      if (!values_equal(nvi->def[0], nvi->src[1]->value))
+         return FALSE;
+   return values_equal(nvi->def[0], nvi->src[0]->value);
+}
+
+struct nv_pass { /* context handed to the per-basic-block pass functions */
+   struct nv_pc *pc; /* program being processed; supplies pass_seq */
+   int n; /* set to 0 by nv_pc_pass0; not read in this file */
+   void *priv; /* not used in this file */
+};
+
+static int /* forward declaration: called by nv_pc_pass2, defined later */
+nv_pass_flatten(struct nv_pass *ctx, struct nv_basic_block *b);
+
+static void /* in-order visitor: places block b and strips its no-ops */
+nv_pc_pass_pre_emission(void *priv, struct nv_basic_block *b)
+{
+   struct nv_pc *pc = (struct nv_pc *)priv;
+   struct nv_basic_block *in;
+   struct nv_instruction *nvi, *next;
+   int j;
+   /* find the last block already in bb_list with a non-zero emit_size */
+   for (j = pc->num_blocks - 1; j >= 0 && !pc->bb_list[j]->emit_size; --j);
+
+   if (j >= 0) {
+      in = pc->bb_list[j];
+
+      /* check for no-op branches (BRA $PC+8) */
+      if (in->exit && in->exit->opcode == NV_OP_BRA && in->exit->target == b) {
+         in->emit_size -= 8;
+         pc->emit_size -= 8;
+         /* blocks listed after `in` have emit_size 0; move them back too */
+         for (++j; j < pc->num_blocks; ++j)
+            pc->bb_list[j]->emit_pos -= 8;
+
+         nvc0_insn_delete(in->exit);
+      }
+      b->emit_pos = in->emit_pos + in->emit_size; /* b follows directly */
+   }
+
+   pc->bb_list[pc->num_blocks++] = b; /* append b to the list */
+
+   /* visit node */
+   /* delete no-ops (and EXPORT in fragment programs); others add 8 each */
+   for (nvi = b->entry; nvi; nvi = next) {
+      next = nvi->next;
+      if (inst_is_noop(nvi) ||
+          (pc->is_fragprog && nvi->opcode == NV_OP_EXPORT)) {
+         nvc0_insn_delete(nvi);
+      } else
+         b->emit_size += 8;
+   }
+   pc->emit_size += b->emit_size;
+
+#ifdef NOUVEAU_DEBUG
+   if (!b->entry)
+      debug_printf("BB:%i is now empty\n", b->id);
+   else
+      debug_printf("BB:%i size = %u\n", b->id, b->emit_size);
+#endif
+}
+
+/* Runs the flatten pass, then the in-order pre-emission walk on @root. */
+static int
+nv_pc_pass2(struct nv_pc *pc, struct nv_basic_block *root)
+{
+   struct nv_pass pass;
+
+   pass.pc = pc;
+   pass.n = 0;       /* never hand indeterminate fields to a pass */
+   pass.priv = NULL;
+   pc->pass_seq++; /* new sequence number for the flatten traversal */
+   nv_pass_flatten(&pass, root);
+   nvc0_pc_pass_in_order(root, nv_pc_pass_pre_emission, pc);
+   return 0;
+}
+
+/* Runs nv_pc_pass2 on every non-NULL root; returns first non-zero result. */
+int
+nvc0_pc_exec_pass2(struct nv_pc *pc)
+{
+   int r, status = 0;
+
+   NOUVEAU_DBG("preparing %u blocks for emission\n", pc->num_blocks);
+   pc->num_blocks = 0; /* bb_list is refilled by the pre-emission walk */
+
+   for (r = 0; !status && r < pc->num_subroutines + 1; ++r)
+      if (pc->root[r])
+         status = nv_pc_pass2(pc, pc->root[r]);
+   return status;
+}
+
+static INLINE boolean /* TRUE if nvi is an LD from one of c0[] .. c15[] */
+is_cspace_load(struct nv_instruction *nvi)
+{
+   if (!nvi)
+      return FALSE;
+   assert(nvi->indirect != 0);
+   return (nvi->opcode != NV_OP_LD) ? FALSE :
+      (nvi->src[0]->value->reg.file >= NV_FILE_MEM_C(0) &&
+       nvi->src[0]->value->reg.file <= NV_FILE_MEM_C(15));
+}
+
+/* TRUE if nvi is a (non-NULL) MOV whose source is an IMM value of size 4. */
+static INLINE boolean
+is_immd32_load(struct nv_instruction *nvi)
+{
+   if (!nvi || nvi->opcode != NV_OP_MOV)
+      return FALSE;
+   return (nvi->src[0]->value->reg.file == NV_FILE_IMM &&
+           nvi->src[0]->value->reg.size == 4);
+}
+
+static INLINE void /* prefer a c[] load in source 1 rather than source 0 */
+check_swap_src_0_1(struct nv_instruction *nvi)
+{
+   static const uint8_t cc_swapped[8] = { 0, 4, 2, 6, 1, 5, 3, 7 };
+   /* cc_swapped[cc]: cc with bits 0 and 2 exchanged, for swapped operands */
+   struct nv_ref *src0 = nvi->src[0];
+   struct nv_ref *src1 = nvi->src[1];
+
+   if (!nv_op_commutative(nvi->opcode))
+      return;
+   assert(src0 && src1 && src0->value && src1->value);
+   /* exchange the sources if only source 0 comes from a c[] load */
+   if (is_cspace_load(src0->value->insn)) {
+      if (!is_cspace_load(src1->value->insn)) {
+         nvi->src[0] = src1;
+         nvi->src[1] = src0;
+      }
+   }
+   /* sources were exchanged: a SET has to use the mirrored condition */
+   if (nvi->src[0] != src0 && nvi->opcode == NV_OP_SET)
+      nvi->set_cond = cc_swapped[nvi->set_cond]; /* assumes set_cond < 8 */
+}
+
+static void /* make val the indirect source of nvi, in its first free slot */
+nvi_set_indirect_load(struct nv_pc *pc,
+                      struct nv_instruction *nvi, struct nv_value *val)
+{
+   for (nvi->indirect = 0; nvi->indirect < 6 && nvi->src[nvi->indirect];
+        ++nvi->indirect); /* empty body: scan for the first NULL src slot */
+   assert(nvi->indirect < 6); /* fails if slots 0..5 are all occupied */
+   nv_reference(pc, nvi, nvi->indirect, val); /* bind val to that slot */
+}
+
+static int /* let instructions read memory directly instead of via an LD */
+nvc0_pass_fold_loads(struct nv_pass *ctx, struct nv_basic_block *b)
+{
+   struct nv_instruction *nvi, *ld;
+   int s;
+
+   for (nvi = b->entry; nvi; nvi = nvi->next) {
+      check_swap_src_0_1(nvi);
+      /* only the first three sources are considered */
+      for (s = 0; s < 3 && nvi->src[s]; ++s) {
+         ld = nvi->src[s]->value->insn;
+         if (!ld || ld->opcode != NV_OP_LD)
+            continue;
+         if (!nvc0_insn_can_load(nvi, s, ld))
+            continue;
+
+         /* fold it ! */
+         nv_reference(ctx->pc, nvi, s, ld->src[0]->value);
+         if (ld->indirect >= 0)
+            nvi_set_indirect_load(ctx->pc, nvi, ld->src[ld->indirect]->value);
+         /* the load itself goes away once nothing references it */
+         if (!nvc0_insn_refcount(ld))
+            nvc0_insn_delete(ld);
+      }
+   }
+   DESCEND_ARBITRARY(s, nvc0_pass_fold_loads);
+
+   return 0;
+}
+
+/* Maps a lone source modifier to its opcode; NOP if mod is anything else. */
+static INLINE uint
+modifiers_opcode(uint8_t mod)
+{
+   if (mod == 0)
+      return NV_OP_MOV;
+   if (mod == NV_MOD_NEG)
+      return NV_OP_NEG;
+   if (mod == NV_MOD_ABS)
+      return NV_OP_ABS;
+   return NV_OP_NOP;
+}
+
+/* Fold single-use NEG/ABS producers into source modifiers, turn SUB into ADD
+ * with a negated source, and merge SAT into its ADD/MUL/MAD producer.
+ * NOTE: Assumes loads have not yet been folded. Always returns 0. */
+static int
+nv_pass_lower_mods(struct nv_pass *ctx, struct nv_basic_block *b)
+{
+   struct nv_instruction *nvi, *mi, *next;
+   int j;
+   uint8_t mod;
+
+   for (nvi = b->entry; nvi; nvi = next) {
+      next = nvi->next; /* nvi may be deleted by the SAT merge below */
+      if (nvi->opcode == NV_OP_SUB) {
+         nvi->src[1]->mod ^= NV_MOD_NEG;
+         nvi->opcode = NV_OP_ADD;
+      }
+
+      for (j = 0; j < 3 && nvi->src[j]; ++j) {
+         mi = nvi->src[j]->value->insn;
+         if (!mi)
+            continue;
+         if (mi->def[0]->refc > 1 || mi->predicate >= 0)
+            continue;
+
+         if (NV_BASEOP(mi->opcode) == NV_OP_NEG) mod = NV_MOD_NEG;
+         else
+         if (NV_BASEOP(mi->opcode) == NV_OP_ABS) mod = NV_MOD_ABS;
+         else
+            continue;
+         assert(!(mod & mi->src[0]->mod & NV_MOD_NEG));
+         mod |= mi->src[0]->mod;
+
+         if ((nvi->opcode == NV_OP_ABS) || (nvi->src[j]->mod & NV_MOD_ABS)) {
+            /* abs neg [abs] = abs */
+            mod &= ~(NV_MOD_NEG | NV_MOD_ABS);
+         } else
+         if ((nvi->opcode == NV_OP_NEG) && (mod & NV_MOD_NEG)) {
+            /* neg as opcode and modifier on same insn cannot occur */
+            /* neg neg abs = abs, neg neg = identity */
+            assert(j == 0);
+            if (mod & NV_MOD_ABS)
+               nvi->opcode = NV_OP_ABS;
+            else
+               nvi->opcode = NV_OP_MOV;
+            mod = 0;
+         }
+
+         if ((nv_op_supported_src_mods(nvi->opcode) & mod) != mod)
+            continue;
+
+         nv_reference(ctx->pc, nvi, j, mi->src[0]->value);
+         nvi->src[j]->mod ^= mod;
+      }
+
+      if (nvi->opcode == NV_OP_SAT) {
+         mi = nvi->src[0]->value->insn;
+         /* the source may have no defining instruction: nothing to merge */
+         if (!mi || mi->def[0]->refc > 1 ||
+             (mi->opcode != NV_OP_ADD &&
+              mi->opcode != NV_OP_MUL &&
+              mi->opcode != NV_OP_MAD))
+            continue;
+         mi->saturate = 1;
+         mi->def[0] = nvi->def[0];
+         mi->def[0]->insn = mi;
+         nvc0_insn_delete(nvi);
+      }
+   }
+   DESCEND_ARBITRARY(j, nv_pass_lower_mods);
+
+   return 0;
+}
+
+#define SRC_IS_MUL(s) ((s)->insn && (s)->insn->opcode == NV_OP_MUL) /* s: value */
+
+/*
+static void
+modifiers_apply(uint32_t *val, ubyte type, ubyte mod)
+{
+   if (mod & NV_MOD_ABS) {
+      if (type == NV_TYPE_F32)
+         *val &= 0x7fffffff;
+      else
+      if ((*val) & (1 << 31))
+         *val = ~(*val) + 1;
+   }
+   if (mod & NV_MOD_NEG) {
+      if (type == NV_TYPE_F32)
+         *val ^= 0x80000000;
+      else
+         *val = ~(*val) + 1;
+   }
+}
+*/
+
+#if 0
+static void /* disabled (#if 0): fold an op on two immediates into one */
+constant_expression(struct nv_pc *pc, struct nv_instruction *nvi,
+                    struct nv_value *src0, struct nv_value *src1)
+{
+   struct nv_value *val;
+   union {
+      float f32;
+      uint32_t u32;
+      int32_t s32;
+   } u0, u1, u;
+   ubyte type;
+
+   if (!nvi->def[0])
+      return;
+   type = NV_OPTYPE(nvi->opcode);
+
+   u.u32 = 0;
+   u0.u32 = src0->reg.imm.u32;
+   u1.u32 = src1->reg.imm.u32;
+   /* apply the source modifiers to the raw immediate bits first */
+   modifiers_apply(&u0.u32, type, nvi->src[0]->mod);
+   modifiers_apply(&u1.u32, type, nvi->src[1]->mod);
+
+   switch (nvi->opcode) {
+   case NV_OP_MAD:
+      if (nvi->src[2]->value->reg.file != NV_FILE_GPR)
+         return;
+      /* fall through */
+   case NV_OP_MUL:
+      switch (type) {
+      case NV_TYPE_F32: u.f32 = u0.f32 * u1.f32; break;
+      case NV_TYPE_U32: u.u32 = u0.u32 * u1.u32; break;
+      case NV_TYPE_S32: u.s32 = u0.s32 * u1.s32; break;
+      default:
+         assert(0);
+         break;
+      }
+      break;
+   case NV_OP_ADD:
+      switch (type) {
+      case NV_TYPE_F32: u.f32 = u0.f32 + u1.f32; break;
+      case NV_TYPE_U32: u.u32 = u0.u32 + u1.u32; break;
+      case NV_TYPE_S32: u.s32 = u0.s32 + u1.s32; break;
+      default:
+         assert(0);
+         break;
+      }
+      break;
+   case NV_OP_SUB:
+      switch (type) {
+      case NV_TYPE_F32: u.f32 = u0.f32 - u1.f32; break;
+      case NV_TYPE_U32: u.u32 = u0.u32 - u1.u32; break;
+      case NV_TYPE_S32: u.s32 = u0.s32 - u1.s32; break;
+      default:
+         assert(0);
+         break;
+      }
+      break;
+   default:
+      return;
+   }
+   /* foldable: nvi becomes a MOV of the computed immediate */
+   nvi->opcode = NV_OP_MOV;
+
+   val = new_value(pc, NV_FILE_IMM, type);
+
+   val->reg.imm.u32 = u.u32;
+
+   nv_reference(pc, nvi, 1, NULL);
+   nv_reference(pc, nvi, 0, val);
+   /* MAD: the product was folded, what remains is src2 + immediate */
+   if (nvi->src[2]) { /* from MAD */
+      nvi->src[1] = nvi->src[0];
+      nvi->src[0] = nvi->src[2];
+      nvi->src[2] = NULL;
+      nvi->opcode = NV_OP_ADD;
+
+      if (val->reg.imm.u32 == 0) {
+         nvi->src[1] = NULL;
+         nvi->opcode = NV_OP_MOV;
+      }
+   }
+}
+
+static void /* disabled (#if 0): simplify nvi given immediate source s */
+constant_operand(struct nv_pc *pc,
+                 struct nv_instruction *nvi, struct nv_value *val, int s)
+{
+   union {
+      float f32;
+      uint32_t u32;
+      int32_t s32;
+   } u;
+   int t = s ? 0 : 1; /* t: index of the other source (0 <-> 1) */
+   uint op;
+   ubyte type;
+
+   if (!nvi->def[0])
+      return;
+   type = NV_OPTYPE(nvi->opcode);
+
+   u.u32 = val->reg.imm.u32;
+   modifiers_apply(&u.u32, type, nvi->src[s]->mod);
+
+   switch (NV_BASEOP(nvi->opcode)) {
+   case NV_OP_MUL:
+      if ((type == NV_TYPE_F32 && u.f32 == 1.0f) ||
+          (NV_TYPE_ISINT(type) && u.u32 == 1)) { /* x * 1 */
+         if ((op = modifiers_opcode(nvi->src[t]->mod)) == NV_OP_NOP)
+            break;
+         nvi->opcode = op;
+         nv_reference(pc, nvi, s, NULL);
+         nvi->src[0] = nvi->src[t];
+         nvi->src[1] = NULL;
+      } else
+      if ((type == NV_TYPE_F32 && u.f32 == 2.0f) ||
+          (NV_TYPE_ISINT(type) && u.u32 == 2)) { /* x * 2 -> x + x */
+         nvi->opcode = NV_OP_ADD;
+         nv_reference(pc, nvi, s, nvi->src[t]->value);
+         nvi->src[s]->mod = nvi->src[t]->mod;
+      } else
+      if (type == NV_TYPE_F32 && u.f32 == -1.0f) { /* x * -1 */
+         if (nvi->src[t]->mod & NV_MOD_NEG)
+            nvi->opcode = NV_OP_MOV;
+         else
+            nvi->opcode = NV_OP_NEG;
+         nv_reference(pc, nvi, s, NULL);
+         nvi->src[0] = nvi->src[t];
+         nvi->src[1] = NULL;
+      } else
+      if (type == NV_TYPE_F32 && u.f32 == -2.0f) { /* x * -2 -> -x + -x */
+         nvi->opcode = NV_OP_ADD;
+         nv_reference(pc, nvi, s, nvi->src[t]->value);
+         nvi->src[s]->mod = (nvi->src[t]->mod ^= NV_MOD_NEG);
+      } else
+      if (u.u32 == 0) { /* x * 0: keep only the immediate */
+         nvi->opcode = NV_OP_MOV;
+         nv_reference(pc, nvi, t, NULL);
+         if (s) {
+            nvi->src[0] = nvi->src[1];
+            nvi->src[1] = NULL;
+         }
+      }
+      break;
+   case NV_OP_ADD:
+      if (u.u32 == 0) { /* x + 0 */
+         if ((op = modifiers_opcode(nvi->src[t]->mod)) == NV_OP_NOP)
+            break;
+         nvi->opcode = op;
+         nv_reference(pc, nvi, s, NULL);
+         nvi->src[0] = nvi->src[t];
+         nvi->src[1] = NULL;
+      }
+      break;
+   case NV_OP_RCP:
+      u.f32 = 1.0f / u.f32;
+      (val = new_value(pc, NV_FILE_IMM, NV_TYPE_F32))->reg.imm.f32 = u.f32;
+      nvi->opcode = NV_OP_MOV;
+      assert(s == 0);
+      nv_reference(pc, nvi, 0, val);
+      break;
+   case NV_OP_RSQ:
+      u.f32 = 1.0f / sqrtf(u.f32);
+      (val = new_value(pc, NV_FILE_IMM, NV_TYPE_F32))->reg.imm.f32 = u.f32;
+      nvi->opcode = NV_OP_MOV;
+      assert(s == 0);
+      nv_reference(pc, nvi, 0, val);
+      break;
+   default:
+      break;
+   }
+}
+#endif
+
+static int /* currently does nothing: the whole body is compiled out */
+nv_pass_lower_arith(struct nv_pass *ctx, struct nv_basic_block *b)
+{
+#if 0
+   struct nv_instruction *nvi, *next;
+   int j;
+
+   for (nvi = b->entry; nvi; nvi = next) {
+      struct nv_value *src0, *src1, *src;
+      int mod;
+
+      next = nvi->next;
+
+      src0 = nvcg_find_immediate(nvi->src[0]);
+      src1 = nvcg_find_immediate(nvi->src[1]);
+
+      if (src0 && src1)
+         constant_expression(ctx->pc, nvi, src0, src1);
+      else {
+         if (src0)
+            constant_operand(ctx->pc, nvi, src0, 0);
+         else
+         if (src1)
+            constant_operand(ctx->pc, nvi, src1, 1);
+      }
+
+      /* try to combine MUL, ADD into MAD */
+      if (nvi->opcode != NV_OP_ADD)
+         continue;
+
+      src0 = nvi->src[0]->value;
+      src1 = nvi->src[1]->value;
+
+      if (SRC_IS_MUL(src0) && src0->refc == 1)
+         src = src0;
+      else
+      if (SRC_IS_MUL(src1) && src1->refc == 1)
+         src = src1;
+      else
+         continue;
+
+      /* could have an immediate from above constant_*  */
+      if (src0->reg.file != NV_FILE_GPR || src1->reg.file != NV_FILE_GPR)
+         continue;
+
+      nvi->opcode = NV_OP_MAD;
+      mod = nvi->src[(src == src0) ? 0 : 1]->mod;
+      nv_reference(ctx->pc, &nvi->src[(src == src0) ? 0 : 1], NULL);
+      nvi->src[2] = nvi->src[(src == src0) ? 1 : 0];
+
+      assert(!(mod & ~NV_MOD_NEG));
+      nvi->src[0] = new_ref(ctx->pc, src->insn->src[0]->value);
+      nvi->src[1] = new_ref(ctx->pc, src->insn->src[1]->value);
+      nvi->src[0]->mod = src->insn->src[0]->mod ^ mod;
+      nvi->src[1]->mod = src->insn->src[1]->mod;
+   }
+   DESCEND_ARBITRARY(j, nv_pass_lower_arith);
+#endif
+   return 0; /* the only statement left while the body is disabled */
+}
+
+/* TODO: redundant store elimination */
+
+struct mem_record { /* one remembered memory access, kept in a linked list */
+   struct mem_record *next; /* next record of the same list */
+   struct nv_instruction *insn; /* instruction performing the access */
+   uint32_t ofst; /* reg.address of the accessed memory value */
+   uint32_t base; /* value->n of the indirect source, 0 if there is none */
+   uint32_t size; /* reg.size of the access; grows when loads are merged */
+};
+
+#define MEM_RECORD_POOL_SIZE 1024 /* number of entries in pass_reld_elim.pool */
+
+struct pass_reld_elim { /* state shared by the memory access passes */
+   struct nv_pc *pc; /* read by DESCEND_ARBITRARY for pass_seq */
+
+   struct mem_record *imm; /* only referenced by compiled-out code */
+   struct mem_record *mem_v; /* EXPORTs to NV_FILE_MEM_V */
+   struct mem_record *mem_a; /* VFETCH instructions */
+   struct mem_record *mem_c[16]; /* LDs from c0[] .. c15[], one list each */
+   struct mem_record *mem_l; /* ST instructions (pass_store_elim) */
+
+   struct mem_record pool[MEM_RECORD_POOL_SIZE]; /* backing store of lists */
+   int alloc; /* number of pool entries handed out so far */
+};
+
+/* Merge load @ld into the adjacent load recorded in @rec: @ld's results join
+ * the recorded instruction's defs in address order and @ld is deleted.
+ * Does nothing if the merged access would start at a misaligned offset. */
+static void
+combine_load(struct mem_record *rec, struct nv_instruction *ld)
+{
+   struct nv_instruction *fv = rec->insn;
+   struct nv_value *mem = ld->src[0]->value;
+   uint32_t size = rec->size + mem->reg.size;
+   int j, n = mem->reg.size / 4; /* n: defs contributed by ld */
+   int d = rec->size / 4;        /* d: defs already produced by fv */
+   assert(rec->size < 16);
+   if (rec->ofst > mem->reg.address) {
+      if ((size == 8 && mem->reg.address & 3) ||
+          (size > 8 && mem->reg.address & 7))
+         return;
+      /* ld comes first: the merged access has to start at its address */
+      fv->src[0]->value->reg.address = rec->ofst = mem->reg.address;
+      for (j = d - 1; j >= 0; --j) /* top-down so no def is overwritten */
+         fv->def[n + j] = fv->def[j];
+      d = 0;
+   } else if ((size == 8 && rec->ofst & 3) ||
+              (size > 8 && rec->ofst & 7)) {
+      return;
+   }
+   for (j = 0; j < n; ++j) {
+      fv->def[d] = ld->def[j];
+      fv->def[d++]->insn = fv;
+   }
+   /* NOTE(review): edits the source value in place; confirm it is unshared */
+   fv->src[0]->value->reg.size = rec->size = size;
+   nvc0_insn_delete(ld);
+}
+
+static void
+combine_export(struct mem_record *rec, struct nv_instruction *ex)
+{
+   /* empty stub: matching EXPORTs are currently left untouched */
+}
+
+static INLINE void /* push a pool-allocated record for nvi onto list *rec */
+add_mem_record(struct pass_reld_elim *ctx, struct mem_record **rec,
+               uint32_t base, uint32_t ofst, struct nv_instruction *nvi)
+{
+   struct mem_record *entry = ctx->pool + ctx->alloc; /* next unused slot */
+   ctx->alloc++; /* no bound check here: callers must guard the pool */
+   entry->insn = nvi;
+   entry->size = nvi->src[0]->value->reg.size;
+   entry->base = base;
+   entry->ofst = ofst;
+   entry->next = *rec; /* link in at the head of the list */
+   *rec = entry;
+}
+
+/* Vectorize loads: merge each c[] LD / VFETCH into an adjacent access seen
+ * earlier in the same block, else record it. Records are dropped per block:
+ * a load must not be merged into an instruction of an unrelated block. */
+static int
+nv_pass_mem_opt(struct pass_reld_elim *ctx, struct nv_basic_block *b)
+{
+   struct mem_record **rec, *it;
+   struct nv_instruction *ld, *next;
+   struct nv_value *mem;
+   uint32_t base, ofst;
+   int s;
+
+   for (ld = b->entry; ld; ld = next) {
+      next = ld->next; /* ld may be deleted by combine_load */
+      if (is_cspace_load(ld)) {
+         mem = ld->src[0]->value;
+         rec = &ctx->mem_c[mem->reg.file - NV_FILE_MEM_C(0)];
+      } else
+      if (ld->opcode == NV_OP_VFETCH) {
+         mem = ld->src[0]->value;
+         rec = &ctx->mem_a;
+      } else
+      if (ld->opcode == NV_OP_EXPORT) {
+         mem = ld->src[0]->value;
+         if (mem->reg.file != NV_FILE_MEM_V)
+            continue;
+         rec = &ctx->mem_v;
+      } else {
+         continue;
+      }
+      if (ld->def[0] && ld->def[0]->refc == 0)
+         continue; /* result is unreferenced: skip it */
+      ofst = mem->reg.address;
+      base = (ld->indirect >= 0) ? ld->src[ld->indirect]->value->n : 0;
+
+      /* look for a record directly below or above, in the same 16-byte row */
+      for (it = *rec; it; it = it->next) {
+         if (it->base == base &&
+             ((it->ofst >> 4) == (ofst >> 4)) &&
+             ((it->ofst + it->size == ofst) ||
+              (it->ofst - mem->reg.size == ofst))) {
+            if (ld->opcode == NV_OP_LD && it->size + mem->reg.size == 12)
+               continue;
+            /* the merged access must not start at offset 4 of its row */
+            if (((it->ofst < ofst ? it->ofst : ofst) & 0xf) == 4)
+               continue;
+            break;
+         }
+      }
+      if (!it) {
+         if (ctx->alloc < MEM_RECORD_POOL_SIZE)
+            add_mem_record(ctx, rec, base, ofst, ld);
+      } else
+      if (ld->opcode == NV_OP_EXPORT)
+         combine_export(it, ld);
+      else
+         combine_load(it, ld);
+   }
+   /* forget everything before moving on to other blocks */
+   ctx->alloc = 0;
+   ctx->imm = ctx->mem_v = ctx->mem_a = ctx->mem_l = NULL;
+   for (s = 0; s < 16; ++s)
+      ctx->mem_c[s] = NULL;
+   DESCEND_ARBITRARY(s, nv_pass_mem_opt);
+   return 0;
+}
+
+static void
+eliminate_store(struct mem_record *rec, struct nv_instruction *st)
+{ /* empty stub: store elimination is not implemented */
+}
+
+/* Elimination of redundant stores: every ST, or EXPORT to v[], that hits an
+ * already recorded range goes to eliminate_store(); others are recorded.
+ * Recurses into the successors of @b; always returns 0. */
+static int
+pass_store_elim(struct pass_reld_elim *ctx, struct nv_basic_block *b)
+{
+   struct mem_record **rec, *it;
+   struct nv_instruction *st, *next;
+   struct nv_value *mem;
+   uint32_t base, ofst, size;
+   int s;
+
+   for (st = b->entry; st; st = next) {
+      next = st->next;
+      if (st->opcode == NV_OP_ST) {
+         mem = st->src[0]->value;
+         rec = &ctx->mem_l;
+      } else
+      if (st->opcode == NV_OP_EXPORT) {
+         mem = st->src[0]->value;
+         if (mem->reg.file != NV_FILE_MEM_V)
+            continue;
+         rec = &ctx->mem_v;
+      } else {
+         /* TODO: purge */
+         continue; /* not a store: mem and rec must not be used unset */
+      }
+      ofst = mem->reg.address;
+      base = (st->indirect >= 0) ? st->src[st->indirect]->value->n : 0;
+      size = mem->reg.size;
+      for (it = *rec; it; it = it->next) {
+         if (it->base == base &&
+             (it->ofst <= ofst && (it->ofst + size) > ofst))
+            break;
+      }
+      if (it)
+         eliminate_store(it, st);
+      else if (ctx->alloc < MEM_RECORD_POOL_SIZE) /* pool has a fixed size */
+         add_mem_record(ctx, rec, base, ofst, st);
+   }
+
+   DESCEND_ARBITRARY(s, pass_store_elim); /* recurse with this very pass */
+   return 0;
+}
+
+/* TODO: properly handle loads from l[] memory in the presence of stores */
+static int /* currently does nothing: the whole body is compiled out */
+nv_pass_reload_elim(struct pass_reld_elim *ctx, struct nv_basic_block *b)
+{
+#if 0
+   struct load_record **rec, *it;
+   struct nv_instruction *ld, *next;
+   uint64_t data[2];
+   struct nv_value *val;
+   int j;
+
+   for (ld = b->entry; ld; ld = next) {
+      next = ld->next;
+      if (!ld->src[0])
+         continue;
+      val = ld->src[0]->value;
+      rec = NULL;
+
+      if (ld->opcode == NV_OP_LINTERP || ld->opcode == NV_OP_PINTERP) {
+         data[0] = val->reg.id;
+         data[1] = 0;
+         rec = &ctx->mem_v;
+      } else
+      if (ld->opcode == NV_OP_LDA) {
+         data[0] = val->reg.id;
+         data[1] = ld->src[4] ? ld->src[4]->value->n : ~0ULL;
+         if (val->reg.file >= NV_FILE_MEM_C(0) &&
+             val->reg.file <= NV_FILE_MEM_C(15))
+            rec = &ctx->mem_c[val->reg.file - NV_FILE_MEM_C(0)];
+         else
+         if (val->reg.file == NV_FILE_MEM_S)
+            rec = &ctx->mem_s;
+         else
+         if (val->reg.file == NV_FILE_MEM_L)
+            rec = &ctx->mem_l;
+      } else
+      if ((ld->opcode == NV_OP_MOV) && (val->reg.file == NV_FILE_IMM)) {
+         data[0] = val->reg.imm.u32;
+         data[1] = 0;
+         rec = &ctx->imm;
+      }
+
+      if (!rec || !ld->def[0]->refc)
+         continue;
+
+      for (it = *rec; it; it = it->next)
+         if (it->data[0] == data[0] && it->data[1] == data[1])
+            break;
+
+      if (it) {
+         if (ld->def[0]->reg.id >= 0)
+            it->value = ld->def[0];
+         else
+         if (!ld->fixed)
+            nvc0_pc_replace_value(ctx->pc, ld->def[0], it->value);
+      } else {
+         if (ctx->alloc == LOAD_RECORD_POOL_SIZE)
+            continue;
+         it = &ctx->pool[ctx->alloc++];
+         it->next = *rec;
+         it->data[0] = data[0];
+         it->data[1] = data[1];
+         it->value = ld->def[0];
+         *rec = it;
+      }
+   }
+   /* all records are dropped before other blocks are visited */
+   ctx->imm = NULL;
+   ctx->mem_s = NULL;
+   ctx->mem_v = NULL;
+   for (j = 0; j < 16; ++j)
+      ctx->mem_c[j] = NULL;
+   ctx->mem_l = NULL;
+   ctx->alloc = 0;
+
+   DESCEND_ARBITRARY(j, nv_pass_reload_elim);
+#endif
+   return 0;
+}
+
+/* For each texture op of the whole program: tex_mask := referenced results,
+ * and the referenced defs are moved to the front. @b is unused. Returns 0. */
+static int
+nv_pass_tex_mask(struct nv_pass *ctx, struct nv_basic_block *b)
+{
+   struct nv_instruction *tex;
+   struct nv_value *old[4];
+   int i, c, n;
+
+   for (i = 0; i < ctx->pc->num_instructions; ++i) {
+      tex = &ctx->pc->instructions[i];
+      if (!nv_is_texture_op(tex->opcode))
+         continue;
+      tex->tex_mask = 0;
+      for (c = 0; c < 4; ++c) {
+         old[c] = tex->def[c];
+         if (old[c]->refc)
+            tex->tex_mask |= 1 << c;
+      }
+      n = 0;
+      for (c = 0; c < 4; ++c) /* referenced results first ... */
+         if (tex->tex_mask & (1 << c))
+            tex->def[n++] = old[c];
+      for (c = 0; c < 4; ++c) /* ... then the unreferenced ones */
+         if (!(tex->tex_mask & (1 << c)))
+            tex->def[n++] = old[c];
+      assert(n == 4);
+   }
+   return 0;
+}
+
+struct nv_pass_dce { /* context of the dead code elimination pass */
+   struct nv_pc *pc; /* read by DESCEND_ARBITRARY for pass_seq */
+   uint removed; /* instructions deleted during the current sweep */
+};
+
+/* Deletes the removable instructions of @b and its successors; returns 0. */
+static int
+nv_pass_dce(struct nv_pass_dce *ctx, struct nv_basic_block *b)
+{
+   struct nv_instruction *cur = b->phi ? b->phi : b->entry;
+   struct nv_instruction *following;
+   int k;
+   while (cur) {
+      following = cur->next; /* fetch before cur is possibly deleted */
+      if (inst_removable(cur)) {
+         nvc0_insn_delete(cur);
+         ctx->removed++;
+      }
+      cur = following;
+   }
+   DESCEND_ARBITRARY(k, nv_pass_dce);
+   return 0;
+}
+
+#if 0
+/* Register allocation inserted ELSE blocks for all IF/ENDIF without ELSE.
+ * Returns TRUE if @bb initiates an IF/ELSE/ENDIF clause, or is an IF with
+ * BREAK and dummy ELSE block. Currently compiled out (#if 0).
+ */
+static INLINE boolean
+bb_is_if_else_endif(struct nv_basic_block *bb)
+{
+   if (!bb->out[0] || !bb->out[1])
+      return FALSE; /* fewer than two successors */
+
+   if (bb->out[0]->out_kind[0] == CFG_EDGE_LOOP_LEAVE) {
+      return (bb->out[0]->out[1] == bb->out[1]->out[0] &&
+              !bb->out[1]->out[1]);
+   } else {
+      return (bb->out[0]->out[0] == bb->out[1]->out[0] &&
+              !bb->out[0]->out[1] &&
+              !bb->out[1]->out[1]);
+   }
+}
+
+/* predicate instructions and remove branch at the end */
+static void
+predicate_instructions(struct nv_pc *pc, struct nv_basic_block *b,
+                       struct nv_value *p, ubyte cc)
+{
+   /* empty stub inside #if 0: nothing is predicated yet */
+}
+#endif
+
+/* NOTE: Run this after register allocation, we can just cut out the cflow
+ * instructions and hook the predicates to the conditional OPs if they are
+ * not using immediates; better than inserting SELECT to join definitions.
+ * NOTE: Should adapt prior optimization to make this possible more often.
+ *
+ * Currently a stub: performs no flattening and always returns 0. */
+static int
+nv_pass_flatten(struct nv_pass *ctx, struct nv_basic_block *b)
+{
+   return 0;
+}
+
+/* local common subexpression elimination, stupid O(n^2) implementation */
+static int /* currently does nothing: the whole body is compiled out */
+nv_pass_cse(struct nv_pass *ctx, struct nv_basic_block *b)
+{
+#if 0
+   struct nv_instruction *ir, *ik, *next;
+   struct nv_instruction *entry = b->phi ? b->phi : b->entry;
+   int s;
+   unsigned int reps;
+
+   do {
+      reps = 0;
+      for (ir = entry; ir; ir = next) {
+         next = ir->next;
+         for (ik = entry; ik != ir; ik = ik->next) {
+            if (ir->opcode != ik->opcode || ir->fixed)
+               continue;
+
+            if (!ir->def[0] || !ik->def[0] ||
+                ik->opcode == NV_OP_LDA ||
+                ik->opcode == NV_OP_STA ||
+                ik->opcode == NV_OP_MOV ||
+                nv_is_vector_op(ik->opcode))
+               continue; /* ignore loads, stores & moves */
+
+            if (ik->src[4] || ir->src[4])
+               continue; /* don't mess with address registers */
+
+            if (ik->flags_src || ir->flags_src ||
+                ik->flags_def || ir->flags_def)
+               continue; /* and also not with flags, for now */
+
+            if (ik->def[0]->reg.file == NV_FILE_OUT ||
+                ir->def[0]->reg.file == NV_FILE_OUT ||
+                !values_equal(ik->def[0], ir->def[0]))
+               continue;
+
+            for (s = 0; s < 3; ++s) {
+               struct nv_value *a, *b;
+
+               if (!ik->src[s]) {
+                  if (ir->src[s])
+                     break;
+                  continue;
+               }
+               if (ik->src[s]->mod != ir->src[s]->mod)
+                  break;
+               a = ik->src[s]->value;
+               b = ir->src[s]->value;
+               if (a == b)
+                  continue;
+               if (a->reg.file != b->reg.file ||
+                   a->reg.id < 0 ||
+                   a->reg.id != b->reg.id)
+                  break;
+            }
+            if (s == 3) {
+               nvc0_insn_delete(ir); /* NOTE(review): ir is still used below */
+               ++reps;
+               nvcg_replace_value(ctx->pc, ir->def[0], ik->def[0]);
+               break;
+            }
+         }
+      }
+   } while(reps);
+
+   DESCEND_ARBITRARY(s, nv_pass_cse);
+#endif
+   return 0;
+}
+
+/* Optimization passes for the routine rooted at @root, in order: lower_arith,
+ * lower_mods, fold_loads, [reload_elim], cse, dce until stable, [mem_opt x2],
+ * tex_mask ([..] only if pc->opt_reload_elim). Returns 0 or an error code. */
+static int
+nv_pc_pass0(struct nv_pc *pc, struct nv_basic_block *root)
+{
+   struct pass_reld_elim *reldelim = NULL;
+   struct nv_pass pass;
+   struct nv_pass_dce dce;
+   int ret;
+
+   pass.n = 0;
+   pass.pc = pc;
+   pass.priv = NULL;
+   /* Do this first, so we don't have to pay attention
+    * to whether sources are supported memory loads.
+    */
+   pc->pass_seq++;
+   ret = nv_pass_lower_arith(&pass, root);
+   if (ret)
+      return ret;
+
+   pc->pass_seq++;
+   ret = nv_pass_lower_mods(&pass, root);
+   if (ret)
+      return ret;
+
+   pc->pass_seq++;
+   ret = nvc0_pass_fold_loads(&pass, root);
+   if (ret)
+      return ret;
+
+   if (pc->opt_reload_elim) {
+      reldelim = CALLOC_STRUCT(pass_reld_elim);
+      if (!reldelim)
+         return -1; /* allocation failed */
+      reldelim->pc = pc;
+      pc->pass_seq++;
+      ret = nv_pass_reload_elim(reldelim, root);
+      if (ret)
+         goto out;
+      memset(reldelim, 0, sizeof(struct pass_reld_elim));
+      reldelim->pc = pc;
+   }
+
+   pc->pass_seq++;
+   ret = nv_pass_cse(&pass, root);
+   if (ret)
+      goto out; /* must not return directly: reldelim may be allocated */
+
+   dce.pc = pc;
+   do {
+      dce.removed = 0;
+      pc->pass_seq++;
+      ret = nv_pass_dce(&dce, root);
+      if (ret)
+         goto out;
+   } while (dce.removed);
+
+   if (reldelim) {
+      pc->pass_seq++;
+      ret = nv_pass_mem_opt(reldelim, root);
+      if (!ret) {
+         memset(reldelim, 0, sizeof(struct pass_reld_elim));
+         reldelim->pc = pc;
+         pc->pass_seq++;
+         ret = nv_pass_mem_opt(reldelim, root);
+      }
+      if (ret)
+         goto out;
+   }
+
+   ret = nv_pass_tex_mask(&pass, root);
+out:
+   if (reldelim)
+      FREE(reldelim);
+   return ret;
+}
+
+/* Runs nv_pc_pass0 on every non-NULL root; returns first non-zero result. */
+int
+nvc0_pc_exec_pass0(struct nv_pc *pc)
+{
+   int r, status = 0;
+   for (r = 0; !status && r < pc->num_subroutines + 1; ++r)
+      if (pc->root[r])
+         status = nv_pc_pass0(pc, pc->root[r]);
+   return status;
+}