From 633f5ac6124b1b57152c09becba92d176e905ae9 Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Fri, 23 Jul 2010 21:21:25 +0200
Subject: nv50: import new compiler

---
 src/gallium/drivers/nv50/Makefile              |   11 +-
 src/gallium/drivers/nv50/nv50_pc.c             |  433 ++
 src/gallium/drivers/nv50/nv50_pc.h             |  431 ++
 src/gallium/drivers/nv50/nv50_pc_emit.c        | 1139 ++++++
 src/gallium/drivers/nv50/nv50_pc_optimize.c    |  717 ++++
 src/gallium/drivers/nv50/nv50_pc_print.c       |  287 ++
 src/gallium/drivers/nv50/nv50_pc_regalloc.c    |  973 +++++
 src/gallium/drivers/nv50/nv50_program.c        | 5117 +++---------------------
 src/gallium/drivers/nv50/nv50_program.h        |  169 +-
 src/gallium/drivers/nv50/nv50_push.c           |    2 +-
 src/gallium/drivers/nv50/nv50_shader_state.c   |  619 +++
 src/gallium/drivers/nv50/nv50_state.c          |    3 -
 src/gallium/drivers/nv50/nv50_state_validate.c |    9 +
 src/gallium/drivers/nv50/nv50_tgsi_to_nc.c     | 1266 ++++++
 src/gallium/drivers/nv50/nv50_vbo.c            |    4 +-
 15 files changed, 6489 insertions(+), 4691 deletions(-)
 create mode 100644 src/gallium/drivers/nv50/nv50_pc.c
 create mode 100644 src/gallium/drivers/nv50/nv50_pc.h
 create mode 100644 src/gallium/drivers/nv50/nv50_pc_emit.c
 create mode 100644 src/gallium/drivers/nv50/nv50_pc_optimize.c
 create mode 100644 src/gallium/drivers/nv50/nv50_pc_print.c
 create mode 100644 src/gallium/drivers/nv50/nv50_pc_regalloc.c
 create mode 100644 src/gallium/drivers/nv50/nv50_shader_state.c
 create mode 100644 src/gallium/drivers/nv50/nv50_tgsi_to_nc.c

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/Makefile b/src/gallium/drivers/nv50/Makefile
index e31e6f8662..3943a9e257 100644
--- a/src/gallium/drivers/nv50/Makefile
+++ b/src/gallium/drivers/nv50/Makefile
@@ -10,7 +10,6 @@ C_SOURCES = \
 	nv50_draw.c \
 	nv50_miptree.c \
 	nv50_query.c \
-	nv50_program.c \
 	nv50_resource.c \
 	nv50_screen.c \
 	nv50_state.c \
@@ -19,6 +18,14 @@ C_SOURCES = \
 	nv50_tex.c \
 	nv50_transfer.c \
 	nv50_vbo.c \
-	nv50_push.c
+	nv50_push.c \
+	nv50_program.c \
+	nv50_shader_state.c \
+	nv50_pc.c \
+	nv50_pc_print.c \
+	nv50_pc_emit.c \
+	nv50_tgsi_to_nc.c \
+	nv50_pc_optimize.c \
+	nv50_pc_regalloc.c
 
 include ../../Makefile.template
diff --git a/src/gallium/drivers/nv50/nv50_pc.c b/src/gallium/drivers/nv50/nv50_pc.c
new file mode 100644
index 0000000000..8aba0a32b7
--- /dev/null
+++ b/src/gallium/drivers/nv50/nv50_pc.c
@@ -0,0 +1,433 @@
+
+#include "nv50_pc.h"
+#include "nv50_program.h"
+
+#include <stdio.h>
+
+/* Report whether source operands 0 and 1 of the opcode may be exchanged. */
+boolean
+nv_op_commutative(uint opcode)
+{
+   /* only the first two operands are covered by this answer */
+   const boolean swappable =
+      opcode == NV_OP_ADD ||
+      opcode == NV_OP_MUL ||
+      opcode == NV_OP_MAD ||
+      opcode == NV_OP_AND ||
+      opcode == NV_OP_OR  ||
+      opcode == NV_OP_XOR ||
+      opcode == NV_OP_MIN ||
+      opcode == NV_OP_MAX ||
+      opcode == NV_OP_SAD;
+
+   /* any opcode not listed above is reported as non-commutative */
+   return swappable ? TRUE : FALSE;
+}
+
+/* Index of the source operand addressed through src[4], or -1 if src[4] is unset. */
+int
+nv50_indirect_opnd(struct nv_instruction *i)
+{
+   int opnd = -1;
+
+   if (i->src[4]) {
+      /* MOV and LDA index their first source, everything else the second */
+      if (i->opcode == NV_OP_MOV || i->opcode == NV_OP_LDA)
+         opnd = 0;
+      else
+         opnd = 1;
+   }
+   return opnd;
+}
+
+/* Can source slot s of nvi be encoded as an immediate? */
+boolean
+nv50_nvi_can_use_imm(struct nv_instruction *nvi, int s)
+{
+   const uint op = nvi->opcode;
+
+   /* instructions that read or write flags are always rejected */
+   if (nvi->flags_src || nvi->flags_def)
+      return FALSE;
+
+   if (op == NV_OP_MOV) {
+      assert(s == 0);
+      return (nvi->def[0]->reg.file == NV_FILE_GPR);
+   }
+   if (op != NV_OP_ADD && op != NV_OP_MUL && op != NV_OP_AND &&
+       op != NV_OP_OR && op != NV_OP_XOR &&
+       op != NV_OP_SHL && op != NV_OP_SHR)
+      return FALSE;
+
+   /* remaining ops: only slot 1, and only when the result goes to a GPR */
+   return (s == 1) && (nvi->def[0]->reg.file == NV_FILE_GPR);
+}
+
+/* Can source slot s of nvi take 'value' straight from its register file? */
+boolean
+nv50_nvi_can_load(struct nv_instruction *nvi, int s, struct nv_value *value)
+{
+   boolean ok = FALSE;
+
+   switch (nvi->opcode) {
+   case NV_OP_MOV:
+      /* MOV: only slot 0 is expected, and any register file is accepted */
+      assert(s == 0);
+      ok = TRUE;
+      break;
+   case NV_OP_ABS:   case NV_OP_ADD:
+   case NV_OP_CEIL:  case NV_OP_FLOOR:
+   case NV_OP_TRUNC: case NV_OP_CVT:
+   case NV_OP_MAD:   case NV_OP_MUL:
+   case NV_OP_SAT:   case NV_OP_SUB:
+   case NV_OP_MAX:   case NV_OP_MIN:
+      /* slot 0: MEM_S / MEM_P, slot 1: MEM_C(0..15), slot 2: if src 1 is a GPR */
+      if (s == 0)
+         ok = (value->reg.file == NV_FILE_MEM_S ||
+               value->reg.file == NV_FILE_MEM_P);
+      else if (s == 1)
+         ok = (value->reg.file >= NV_FILE_MEM_C(0) &&
+               value->reg.file <= NV_FILE_MEM_C(15));
+      else if (s == 2)
+         ok = (nvi->src[1]->value->reg.file == NV_FILE_GPR);
+      break;
+   default:
+      break;
+   }
+   return ok;
+}
+
+/* Bitmask of NV_MOD_* source modifiers reported as usable with the opcode. */
+ubyte
+nv50_supported_src_mods(uint opcode, int s)
+{
+   ubyte mods = 0;
+
+   switch (opcode) {
+   case NV_OP_ADD: case NV_OP_MUL: case NV_OP_MAD:
+      mods = NV_MOD_NEG;
+      break;
+   case NV_OP_DFDX: case NV_OP_DFDY:
+      assert(s == 0);
+      mods = NV_MOD_NEG;
+      break;
+   case NV_OP_MAX: case NV_OP_MIN:
+      mods = NV_MOD_ABS;
+      break;
+   case NV_OP_ABS:
+   case NV_OP_CVT: case NV_OP_LG2: case NV_OP_NEG:
+   case NV_OP_PREEX2: case NV_OP_PRESIN:
+   case NV_OP_RCP: case NV_OP_RSQ:
+      mods = NV_MOD_ABS | NV_MOD_NEG;
+      break;
+   /* every other opcode gets an empty mask */
+   default:
+      break;
+   }
+   return mods;
+}
+
+/* Total refc over flags_def and the leading non-NULL entries of def[]. */
+int
+nv_nvi_refcount(struct nv_instruction *nvi)
+{
+   int total = 0;
+   int d;
+
+   if (nvi->flags_def)
+      total = nvi->flags_def->refc;
+
+   for (d = 0; d < 4 && nvi->def[d]; ++d)
+      total += nvi->def[d]->refc;
+   return total;
+}
+
+static void nv_pc_free_refs(struct nv_pc *pc)
+{
+   int i;
+   for (i = 0; i < pc->num_refs; i += 64)
+      FREE(pc->refs[i]); /* base of one 64-entry chunk from new_ref() */
+   FREE(pc->refs);       /* the REALLOC'ed pointer table itself */
+}
+
+void
+nv_print_program(struct nv_basic_block *b)
+{
+   struct nv_instruction *i = b->phi;
+   /* Dump block b via debug_printf, then recurse into its successors. */
+   b->priv = 0;
+
+   debug_printf("=== BB %i ", b->id);
+   if (b->out[0])
+      debug_printf("(--0> %i) ", b->out[0]->id);
+   if (b->out[1])
+      debug_printf("(--1> %i) ", b->out[1]->id);
+   debug_printf("===\n");
+   /* start at the first PHI if there is one, else at b->entry */
+   if (!i)
+      i = b->entry;
+   for (; i; i = i->next)
+      nv_print_instruction(i);
+   /* no successor: print the END marker and stop descending */
+   if (!b->out[0]) {
+      debug_printf("END\n\n");
+      return;
+   }
+   if (!b->out[1] && ++(b->out[0]->priv) != b->out[0]->num_in)
+      return; /* single successor: only its num_in-th visitor descends into it */
+
+   if (b->out[0] != b)
+      nv_print_program(b->out[0]);
+   /* an edge from b back to b itself is never followed */
+   if (b->out[1] && b->out[1] != b)
+      nv_print_program(b->out[1]);
+}
+
+/* Print all pc->bin_size / 4 words of pc->emit in hex on one debug line. */
+static INLINE void
+nvcg_show_bincode(struct nv_pc *pc)
+{
+   unsigned w, num_words = pc->bin_size / 4;
+   for (w = 0; w < num_words; ++w)
+      debug_printf("0x%08x ", pc->emit[w]);
+   debug_printf("\n");
+}
+
+static int
+nv50_emit_program(struct nv_pc *pc)
+{
+   uint32_t *code = pc->emit;
+   int n;
+   /* Encode all blocks of pc->bb_list into pc->emit; always returns 0. */
+   debug_printf("emitting program: size = %u\n", pc->bin_size);
+
+   for (n = 0; n < pc->num_blocks; ++n) {
+      struct nv_instruction *i;
+      struct nv_basic_block *b = pc->bb_list[n];
+      /* iteration starts at b->entry, so anything linked before it is skipped */
+      for (i = b->entry; i; i = i->next) {
+         nv50_emit_instruction(pc, i);
+         /* an encoding takes one word, or two when bit 0 of its first word is set */
+         pc->bin_pos += 1 + (pc->emit[0] & 1);
+         pc->emit += 1 + (pc->emit[0] & 1);
+      }
+   }
+   assert(pc->emit == &code[pc->bin_size / 4]);
+
+   /* XXX: we can do better than this ... (two extra words appended after the code) */
+   if ((pc->emit[-1] & 3) == 3) {
+      pc->emit[0] = 0xf0000001;
+      pc->emit[1] = 0xe0000000;
+      pc->bin_size += 8;
+   }
+   /* rewind to the start of the buffer and set bit 0 of the final word */
+   pc->emit = code;
+   code[pc->bin_size / 4 - 1] |= 1;
+
+   nvcg_show_bincode(pc);
+
+   return 0;
+}
+
+int
+nv50_generate_code(struct nv50_translation_info *ti)
+{
+   struct nv_pc *pc;
+   int ret;
+   /* returns 0 on success, 1 or 3 if an allocation fails, else a stage's code */
+   pc = CALLOC_STRUCT(nv_pc);
+   if (!pc)
+      return 1;
+
+   ret = nv50_tgsi_to_nc(pc, ti);
+   if (ret)
+      goto out;
+
+   /* optimization */
+   ret = nv_pc_exec_pass0(pc);
+   if (ret)
+      goto out;
+
+   /* register allocation */
+   ret = nv_pc_exec_pass1(pc);
+   if (ret)
+      goto out;
+
+   /* prepare for emission */
+   ret = nv_pc_exec_pass2(pc);
+   if (ret)
+      goto out;
+   /* two spare words: nv50_emit_program() may append 8 bytes after the code */
+   pc->emit = CALLOC(pc->bin_size / 4 + 2, 4);
+   if (!pc->emit) {
+      ret = 3;
+      goto out;
+   }
+   ret = nv50_emit_program(pc);
+   if (ret)
+      goto out;
+   /* success: code, immediates and fixups are handed over to ti->p */
+   ti->p->code_size = pc->bin_size;
+   ti->p->code = pc->emit;
+
+   ti->p->immd_size = pc->immd_count * 4;
+   ti->p->immd = pc->immd_buf;
+
+   ti->p->max_gpr = (pc->max_reg[NV_FILE_GPR] + 1) >> 1;
+   ti->p->max_gpr++;
+
+   ti->p->fixups = pc->fixups;
+   ti->p->num_fixups = pc->num_fixups;
+
+   debug_printf("SHADER TRANSLATION - %s\n", ret ? "failure" : "success");
+
+out:
+   nv_pc_free_refs(pc);
+   if (ret) {
+      if (pc->emit)
+         FREE(pc->emit); /* FREE pairs with CALLOC/REALLOC; plain free() does not */
+      if (pc->immd_buf)
+         FREE(pc->immd_buf);
+      if (pc->fixups)
+         FREE(pc->fixups);
+   }
+   FREE(pc);
+
+   return ret;
+}
+
+static void
+nvbb_insert_phi(struct nv_basic_block *b, struct nv_instruction *i)
+{ /* link PHI i into b, keeping PHIs ahead of the non-PHI instructions */
+   if (!b->phi) { /* first PHI of the block: it becomes the list head */
+      i->prev = NULL;
+      b->phi = i;
+      i->next = b->entry;
+      if (b->entry) {
+         assert(!b->entry->prev && b->exit);
+         b->entry->prev = i;
+      } else { /* empty block: the PHI is entry and exit as well */
+         b->entry = i;
+	 b->exit = i;
+      }
+   } else {
+      assert(b->entry);
+      if (b->entry->opcode == NV_OP_PHI) { /* insert after entry */
+	 assert(b->entry == b->exit);
+         b->entry->next = i;
+         i->prev = b->entry;
+         b->entry = i;
+	 b->exit = i; /* NOTE(review): i->next is not written on this path */
+      } else { /* insert before entry */
+         assert(b->entry->prev && b->exit);
+         i->next = b->entry;
+         i->prev = b->entry->prev;
+         b->entry->prev = i;
+         i->prev->next = i;
+      }
+   }
+}
+
+/* Append i to block b; PHIs are routed to nvbb_insert_phi() instead. */
+void
+nvbb_insert_tail(struct nv_basic_block *b, struct nv_instruction *i)
+{
+   struct nv_instruction *const old_exit = b->exit;
+
+   if (i->opcode == NV_OP_PHI) {
+      nvbb_insert_phi(b, i);
+   } else {
+      i->prev = old_exit;
+      if (old_exit)
+         old_exit->next = i;
+      b->exit = i;
+      /* i becomes the entry if there is none yet or the old tail was a PHI */
+      if (!b->entry || (old_exit && old_exit->opcode == NV_OP_PHI))
+         b->entry = i;
+   }
+   i->bb = b;
+   b->num_instructions++;
+}
+
+void
+nv_nvi_delete(struct nv_instruction *nvi)
+{
+   struct nv_basic_block *b = nvi->bb;
+   int j;
+   /* Unlink nvi from its block and drop the references held by its sources. */
+   debug_printf("REM: "); nv_print_instruction(nvi);
+   /* only src[0..3] up to the first NULL are released; src[4] and flags_src stay */
+   for (j = 0; j < 4; ++j) {
+      if (!nvi->src[j])
+         break;
+      --(nvi->src[j]->value->refc);
+      nvi->src[j] = NULL;
+   }
+   /* detach from the doubly linked list; a missing next means nvi was the exit */
+   if (nvi->next)
+      nvi->next->prev = nvi->prev;
+   else {
+      assert(nvi == b->exit);
+      b->exit = nvi->prev;
+   }
+
+   if (nvi->prev)
+      nvi->prev->next = nvi->next;
+   /* entry moves back if nvi was last or a PHI, otherwise forward */
+   if (nvi == b->entry) {
+      assert(nvi->opcode != NV_OP_PHI || !nvi->next);
+
+      if (!nvi->next || (nvi->opcode == NV_OP_PHI))
+         b->entry = nvi->prev;
+      else
+         b->entry = nvi->next;
+   }
+   /* NOTE(review): b->num_instructions is not decremented anywhere in here */
+   if (nvi == b->phi) {
+      assert(!nvi->prev);
+      if (nvi->opcode != NV_OP_PHI)
+         debug_printf("WARN: b->phi points to non-PHI instruction\n");
+
+      if (!nvi->next || nvi->next->opcode != NV_OP_PHI)
+         b->phi = NULL;
+      else
+         b->phi = nvi->next;
+   }
+}
+
+void
+nv_nvi_permute(struct nv_instruction *i1, struct nv_instruction *i2)
+{
+   struct nv_basic_block *b = i1->bb;
+   /* Swap two adjacent non-PHI instructions; i2 must directly follow i1. */
+   assert(i1->opcode != NV_OP_PHI &&
+          i2->opcode != NV_OP_PHI);
+   assert(i1->next == i2);
+   /* fix the block's end markers before the links are rewritten */
+   if (b->exit == i2)
+      b->exit = i1;
+
+   if (b->entry == i1)
+      b->entry = i2;
+   /* relink the pair so that the order becomes ... i2, i1 ... */
+   i2->prev = i1->prev;
+   i1->next = i2->next;
+   i2->next = i1;
+   i1->prev = i2;
+   /* finally point the outer neighbours (if any) at the swapped pair */
+   if (i2->prev)
+      i2->prev->next = i2;
+   if (i1->next)
+      i1->next->prev = i1;
+}
+
+/* Make b a successor of parent and record parent as a predecessor of b. */
+void nvbb_attach_block(struct nv_basic_block *parent, struct nv_basic_block *b)
+{
+   /* out[0] is filled first; out[1] must still be free when it is taken */
+   const int slot = parent->out[0] ? 1 : 0;
+   if (slot)
+      assert(!parent->out[1]);
+   parent->out[slot] = b;
+   b->in[b->num_in++] = parent;
+}
diff --git a/src/gallium/drivers/nv50/nv50_pc.h b/src/gallium/drivers/nv50/nv50_pc.h
new file mode 100644
index 0000000000..3ab48d0afd
--- /dev/null
+++ b/src/gallium/drivers/nv50/nv50_pc.h
@@ -0,0 +1,431 @@
+/*************************************************************************/
+/* Copyright (C) 2010 I                                                  */
+/*                                                                       */
+/* This program is free software: you can redistribute it and/or modify  */
+/* it under the terms of the GNU General Public License as published by  */
+/* the Free Software Foundation, either version 3 of the License, or     */
+/* (at your option) any later version.                                   */
+/*                                                                       */
+/* This program is distributed in the hope that it will be useful,       */
+/* but WITHOUT ANY WARRANTY; without even the implied warranty of        */
+/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         */
+/* GNU General Public License for more details.                          */
+/*                                                                       */
+/* You should have received a copy of the GNU General Public License     */
+/* along with this program.  If not, see <http://www.gnu.org/licenses/>. */
+/*************************************************************************/
+
+#ifndef __NV50_COMPILER_H__
+#define __NV50_COMPILER_H__
+
+#include "pipe/p_defines.h"
+#include "util/u_inlines.h"
+#include "util/u_memory.h"
+
+#define NV_OP_PHI       0
+#define NV_OP_EXTRACT   1
+#define NV_OP_COMBINE   2
+#define NV_OP_LDA       3
+#define NV_OP_STA       4
+#define NV_OP_MOV       5
+#define NV_OP_ADD       6
+#define NV_OP_SUB       7
+#define NV_OP_NEG       8
+#define NV_OP_MUL       9
+#define NV_OP_MAD       10
+#define NV_OP_CVT       11
+#define NV_OP_SAT       12
+#define NV_OP_NOT       13
+#define NV_OP_AND       14
+#define NV_OP_OR        15
+#define NV_OP_XOR       16
+#define NV_OP_SHL       17
+#define NV_OP_SHR       18
+#define NV_OP_RCP       19
+/* gap: opcode value 20 is not assigned */
+#define NV_OP_RSQ       21
+#define NV_OP_LG2       22
+#define NV_OP_SIN       23
+#define NV_OP_COS       24
+#define NV_OP_EX2       25
+#define NV_OP_PRESIN    26
+#define NV_OP_PREEX2    27
+#define NV_OP_MIN       28
+#define NV_OP_MAX       29
+#define NV_OP_SET       30
+#define NV_OP_SAD       31
+#define NV_OP_KIL       32
+#define NV_OP_BRA       33
+#define NV_OP_CALL      34
+#define NV_OP_RET       35
+#define NV_OP_BREAK     36
+#define NV_OP_BREAKADDR 37
+#define NV_OP_JOINAT    38
+#define NV_OP_TEX       39
+#define NV_OP_TXB       40
+#define NV_OP_TXL       41
+#define NV_OP_TXF       42
+#define NV_OP_TXQ       43
+#define NV_OP_DFDX      44
+#define NV_OP_DFDY      45
+#define NV_OP_QUADOP    46
+#define NV_OP_LINTERP   47
+#define NV_OP_PINTERP   48
+#define NV_OP_ABS       49
+#define NV_OP_CEIL      50
+#define NV_OP_FLOOR     51
+#define NV_OP_TRUNC     52
+#define NV_OP_NOP       53
+#define NV_OP_SELECT    54
+#define NV_OP_EXPORT    55
+#define NV_OP_COUNT     56
+
+#define NV_FILE_GPR      0
+#define NV_FILE_OUT      1
+#define NV_FILE_ADDR     2
+#define NV_FILE_FLAGS    3
+#define NV_FILE_IMM      16
+#define NV_FILE_MEM_S    32
+#define NV_FILE_MEM_P    33
+#define NV_FILE_MEM_V    34
+#define NV_FILE_MEM_L    48
+#define NV_FILE_MEM_G(i) (64 + (i)) /* argument parenthesized: safe for expressions */
+#define NV_FILE_MEM_C(i) (80 + (i)) /* e.g. NV_FILE_MEM_C(a ? 1 : 2) now works */
+
+#define NV_MOD_NEG 1
+#define NV_MOD_ABS 2
+#define NV_MOD_NOT 4
+#define NV_MOD_SAT 8
+
+#define NV_TYPE_U8  0x00
+#define NV_TYPE_S8  0x01
+#define NV_TYPE_U16 0x02
+#define NV_TYPE_S16 0x03
+#define NV_TYPE_U32 0x04
+#define NV_TYPE_S32 0x05
+#define NV_TYPE_P32 0x07
+#define NV_TYPE_F32 0x09
+#define NV_TYPE_F64 0x0b
+#define NV_TYPE_VEC(x, n) (NV_TYPE_##x | ((n) << 4)) /* count in the high nibble */
+#define NV_TYPE_LO  0x00
+#define NV_TYPE_HI  0x80
+#define NV_TYPE_ANY 0xff
+
+#define NV_TYPE_ISINT(t) ((t) <= 5)
+#define NV_TYPE_ISFLT(t) ((t) & 0x08)
+
+#define NV_CC_FL 0x0
+#define NV_CC_LT 0x1
+#define NV_CC_EQ 0x2
+#define NV_CC_LE 0x3
+#define NV_CC_GT 0x4
+#define NV_CC_NE 0x5
+#define NV_CC_GE 0x6
+#define NV_CC_U  0x8
+#define NV_CC_TR 0xf
+
+#define NV_PC_MAX_INSTRUCTIONS 2048
+#define NV_PC_MAX_VALUES (NV_PC_MAX_INSTRUCTIONS * 4)
+
+static INLINE boolean
+nv_is_vector_op(uint opcode)
+{ /* TRUE for the contiguous opcode range NV_OP_TEX .. NV_OP_TXQ */
+   return !(opcode < NV_OP_TEX || opcode > NV_OP_TXQ);
+}
+
+/* log2 of the byte size of the base type held in the low nibble of 'type'. */
+static INLINE uint
+nv_type_order(ubyte type)
+{
+   switch (type & 0xf) {
+   case NV_TYPE_U8: case NV_TYPE_S8:
+      return 0;
+   case NV_TYPE_U16: case NV_TYPE_S16:
+      return 1;
+   case NV_TYPE_U32: case NV_TYPE_S32:
+   case NV_TYPE_F32: case NV_TYPE_P32:
+      return 2;
+   case NV_TYPE_F64:
+      return 3;
+   default:
+      break;
+   }
+   assert(0); /* unknown base type */
+   return 0;  /* defined result even when the assert is compiled out */
+}
+
+static INLINE uint
+nv_type_sizeof(ubyte type)
+{
+   const uint count = type >> 4; /* high nibble of the type code */
+   const uint bytes = 1 << nv_type_order(type);
+   return count ? bytes * count : bytes;
+}
+
+static INLINE uint
+nv_type_sizeof_base(ubyte type)
+{ /* size of the base type alone; the high nibble of 'type' is not used */
+   return 1u << nv_type_order(type);
+}
+
+struct nv_reg {
+   int id; /* new_value() starts this at -1; set_dst() treats < 0 as unused */
+   ubyte file; /* one of the NV_FILE_* values */
+   ubyte type; /* type of generating instruction's result */
+   union {
+      float f32;
+      double f64;
+      int32_t s32;
+      uint32_t u32;
+   } imm; /* get_immd_u32() reads imm.u32 after asserting file == NV_FILE_IMM */
+};
+
+struct nv_range { /* singly linked list node, referenced by nv_value.livei */
+   struct nv_range *next;
+   int bgn; /* presumably the start of a live interval - TODO confirm */
+   int end; /* presumably the end of that interval - TODO confirm */
+};
+
+struct nv_value {
+   struct nv_reg reg; /* own register; SREG()/DREG() read join->reg instead */
+   struct nv_instruction *insn; /* presumably the defining instruction - TODO confirm */
+   struct nv_value *join; /* new_value() points this at the value itself */
+   int n; /* index of this value in nv_pc.values */
+   struct nv_range *livei;
+   int refc; /* incremented by new_ref(), adjusted by nv_reference() */
+
+   struct nv_value *next;
+   struct nv_value *prev;
+};
+
+struct nv_ref {
+   struct nv_value *value; /* the referenced value; its refc counts this ref */
+   struct nv_instruction *insn; /* presumably the using instruction - TODO confirm */
+   ubyte mod; /* NV_MOD_* bits applied to the source */
+   ubyte typecast; /* new_ref() initialises this to value->reg.type */
+   ubyte flags; /* not used yet */
+};
+
+struct nv_basic_block;
+
+struct nv_instruction {
+   struct nv_instruction *next; /* doubly linked list inside the owning block */
+   struct nv_instruction *prev;
+   uint opcode; /* one of NV_OP_* */
+   int serial;
+   struct nv_value *def[4]; /* results; nv_nvi_refcount() stops at the first NULL */
+   struct nv_value *flags_def;
+   struct nv_ref *src[5]; /* src[4] is the address register operand (set_addr) */
+   struct nv_ref *flags_src;
+   struct nv_basic_block *bb; /* set by nvbb_insert_tail() */
+   struct nv_basic_block *target; /* target block of control flow insn */
+   ubyte cc; /* NV_CC_*; new_instruction() starts it at NV_CC_TR */
+   ubyte set_cond      : 4;
+   ubyte fixed         : 1; /* don't optimize away */
+   ubyte is_terminator : 1;
+   ubyte is_join       : 1;
+   ubyte is_long       : 1; /* for emission */
+   /* */
+   ubyte saturate : 1;
+   ubyte centroid : 1;
+   ubyte flat     : 1;
+   ubyte padding  : 4;
+   ubyte tex_live : 1;
+   /* */
+   ubyte tex_t; /* TIC binding */
+   ubyte tex_s; /* TSC binding */
+   ubyte tex_argc : 3;
+   ubyte tex_cube : 1;
+   ubyte tex_mask : 4;
+   /* */
+   ubyte quadop;
+};
+
+struct nv_basic_block {
+   struct nv_instruction *entry; /* first non-phi instruction */
+   struct nv_instruction *exit; /* last instruction, updated by nvbb_insert_tail() */
+   struct nv_instruction *phi; /* very first instruction */
+   int num_instructions;
+
+   struct nv_basic_block *out[2]; /* no indirect branches -> 2 */
+   struct nv_basic_block **in; /* predecessors; new_basic_block() allocates 4 slots */
+   uint num_in; /* number of entries used in 'in' (nvbb_attach_block) */
+
+   int id; /* taken from nv_pc.num_blocks at creation */
+   struct nv_basic_block *last_visitor;
+   uint priv; /* scratch counter, e.g. used by nv_print_program() */
+   uint pass_seq;
+
+   uint32_t bin_pos; /* position, size in emitted code */
+   uint32_t bin_size;
+
+   uint32_t live_set[NV_PC_MAX_VALUES / 32]; /* one bit per possible value */
+};
+
+#define NV_FIXUP_CFLOW_RELOC 0
+#define NV_FIXUP_PARAM_RELOC 1
+
+struct nv_fixup {
+   ubyte type;      /* NV_FIXUP_CFLOW_RELOC or NV_FIXUP_PARAM_RELOC */
+   int8_t shift;    /* >= 0: shift data left, < 0: shift right by -shift */
+   uint32_t mask;   /* bits of the code word the fixup may change */
+   uint32_t data;   /* base value added to the shifted data */
+   uint32_t offset; /* used as bin[offset / 4] when applied */
+};
+
+static INLINE void
+nv_fixup_apply(uint32_t *bin, struct nv_fixup *fixup, uint32_t data)
+{
+   uint32_t val;
+   /* shift is signed now, so the right-shift branch is reachable and valid */
+   val = bin[fixup->offset / 4] & ~fixup->mask;
+   data = (fixup->shift < 0) ? (data >> -fixup->shift) : (data << fixup->shift);
+   val |= (fixup->data + data) & fixup->mask;
+   bin[fixup->offset / 4] = val;
+}
+
+struct nv_pc {
+   struct nv50_translation_info *ti;
+
+   struct nv_basic_block *root;
+   struct nv_basic_block *current_block; /* new_instruction() appends here */
+   struct nv_basic_block *parent_block;
+
+   int loop_nesting_bound;
+   uint pass_seq;
+
+   struct nv_value values[NV_PC_MAX_VALUES]; /* pool used by new_value() */
+   struct nv_instruction instructions[NV_PC_MAX_INSTRUCTIONS]; /* new_instruction() */
+   struct nv_ref **refs; /* pointers into 64-entry chunks, grown by new_ref() */
+   struct nv_basic_block **bb_list; /* walked by nv50_emit_program() */
+   int num_values;
+   int num_instructions;
+   int num_refs;
+   int num_blocks;
+
+   int max_reg[4]; /* indexed by register file, e.g. max_reg[NV_FILE_GPR] */
+
+   uint32_t *immd_buf; /* populated on emit */
+   unsigned immd_count; /* number of 32-bit entries in immd_buf */
+
+   uint32_t *emit; /* write cursor into the code buffer during emission */
+   unsigned bin_size; /* code size in bytes (the buffer holds bin_size / 4 words) */
+   unsigned bin_pos; /* advanced per emitted word by nv50_emit_program() */
+
+   struct nv_fixup *fixups;
+   int num_fixups;
+};
+
+void nvbb_insert_tail(struct nv_basic_block *, struct nv_instruction *);
+
+/* Take the next slot of pc->instructions and append it to pc->current_block. */
+static INLINE struct nv_instruction *
+new_instruction(struct nv_pc *pc, uint opcode)
+{
+   const int slot = pc->num_instructions;
+   struct nv_instruction *const insn = &pc->instructions[slot];
+
+   pc->num_instructions = slot + 1;
+   assert(pc->num_instructions < NV_PC_MAX_INSTRUCTIONS);
+   insn->opcode = opcode;
+   insn->cc = NV_CC_TR;
+   nvbb_insert_tail(pc->current_block, insn);
+   return insn;
+}
+
+/* Hand out the next entry of pc->values; it joins itself and has no id yet. */
+static INLINE struct nv_value *
+new_value(struct nv_pc *pc, ubyte file, ubyte type)
+{
+   struct nv_value *value = &pc->values[pc->num_values];
+
+   assert(pc->num_values < NV_PC_MAX_VALUES - 1);
+   value->reg.type = type;
+   value->reg.file = file;
+   value->reg.id = -1;
+   value->join = value;
+   value->n = pc->num_values++;
+   return value;
+}
+
+/* Return a fresh nv_ref to val; refs come from chunks of 64 CALLOC'ed at once. */
+static INLINE struct nv_ref *
+new_ref(struct nv_pc *pc, struct nv_value *val)
+{
+   struct nv_ref *ref;
+   int i;
+
+   if ((pc->num_refs % 64) == 0) {
+      /* every slot is used (or none exists yet): add 64 pointers and back them */
+      const unsigned old_size = pc->num_refs * sizeof(struct nv_ref *);
+      const unsigned new_size = old_size + 64 * sizeof(struct nv_ref *);
+
+      pc->refs = REALLOC(pc->refs, old_size, new_size);
+      ref = CALLOC(64, sizeof(struct nv_ref));
+      for (i = 0; i < 64; ++i)
+         pc->refs[pc->num_refs + i] = &ref[i];
+   }
+
+   ref = pc->refs[pc->num_refs++];
+   ref->typecast = val->reg.type;
+   ref->value = val;
+   val->refc++;
+   return ref;
+}
+
+/* Allocate a zeroed block with room for 4 predecessors and the next free id. */
+static INLINE struct nv_basic_block *
+new_basic_block(struct nv_pc *pc)
+{
+   struct nv_basic_block *bb = CALLOC_STRUCT(nv_basic_block);
+   bb->id = pc->num_blocks++;
+   bb->in = CALLOC(4, sizeof(struct nv_basic_block *));
+   return bb;
+}
+
+/* Point *d at s and keep refc of old and new value in step; s == NULL clears. */
+static INLINE void
+nv_reference(struct nv_pc *pc, struct nv_ref **d, struct nv_value *s)
+{
+   struct nv_ref *const old = *d;
+
+   if (old)
+      --old->value->refc;
+   if (!s) {
+      assert(*d);
+      *d = NULL;
+   } else if (old) {
+      old->value = s; /* the nv_ref is reused; its typecast is not updated */
+      ++(s->refc);
+   } else {
+      *d = new_ref(pc, s);
+   }
+}
+
+/* nv50_pc_emit.c */
+void nv50_emit_instruction(struct nv_pc *, struct nv_instruction *);
+
+/* nv50_pc_print.c */
+const char *nv_opcode_name(uint opcode);
+void nv_print_instruction(struct nv_instruction *);
+
+/* nv50_pc.c */
+void nv_print_program(struct nv_basic_block *b);
+
+boolean nv_op_commutative(uint opcode);
+int nv50_indirect_opnd(struct nv_instruction *);
+boolean nv50_nvi_can_use_imm(struct nv_instruction *, int s);
+boolean nv50_nvi_can_load(struct nv_instruction *, int s, struct nv_value *);
+ubyte nv50_supported_src_mods(uint opcode, int s);
+int nv_nvi_refcount(struct nv_instruction *);
+void nv_nvi_delete(struct nv_instruction *);
+void nv_nvi_permute(struct nv_instruction *, struct nv_instruction *);
+void nvbb_attach_block(struct nv_basic_block *parent, struct nv_basic_block *);
+
+int nv_pc_exec_pass0(struct nv_pc *pc);
+int nv_pc_exec_pass1(struct nv_pc *pc);
+int nv_pc_exec_pass2(struct nv_pc *pc);
+
+int nv50_tgsi_to_nc(struct nv_pc *, struct nv50_translation_info *);
+
+#endif /* __NV50_COMPILER_H__ */
diff --git a/src/gallium/drivers/nv50/nv50_pc_emit.c b/src/gallium/drivers/nv50/nv50_pc_emit.c
new file mode 100644
index 0000000000..b917d23232
--- /dev/null
+++ b/src/gallium/drivers/nv50/nv50_pc_emit.c
@@ -0,0 +1,1139 @@
+/*************************************************************************/
+/* Copyright (C) 2009                                                    */
+/*                                                                       */
+/* This program is free software: you can redistribute it and/or modify  */
+/* it under the terms of the GNU General Public License as published by  */
+/* the Free Software Foundation, either version 3 of the License, or     */
+/* (at your option) any later version.                                   */
+/*                                                                       */
+/* This program is distributed in the hope that it will be useful,       */
+/* but WITHOUT ANY WARRANTY; without even the implied warranty of        */
+/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         */
+/* GNU General Public License for more details.                          */
+/*                                                                       */
+/* You should have received a copy of the GNU General Public License     */
+/* along with this program.  If not, see <http://www.gnu.org/licenses/>. */
+/*************************************************************************/
+
+#include "nv50_context.h"
+#include "nv50_pc.h"
+
+// Definitions
+
+#define FLAGS_CC_SHIFT    7
+#define FLAGS_ID_SHIFT    12
+#define FLAGS_WR_ID_SHIFT 4
+#define FLAGS_CC_MASK     (0x1f << FLAGS_CC_SHIFT)
+#define FLAGS_ID_MASK     (0x03 << FLAGS_ID_SHIFT)
+#define FLAGS_WR_EN       (1 << 6)
+#define FLAGS_WR_ID_MASK  (0x3 << FLAGS_WR_ID_SHIFT)
+
+const ubyte nv50_inst_min_size_tab[NV_OP_COUNT] =
+{ /* indexed by NV_OP_*; entries above 4 make nv50_inst_min_size() return 8 */
+   0, 0, 0, 8, 8, 4, 4, 4, 8, 4, 4, 8, 8, 8, 8, 8, /* 15 */
+   8, 8, 8, 4, 0, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, /* 31 */
+   8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, /* 47 */
+   4, 8, 8, 8, 8, 8, 0, 0 /* 55 */
+};
+
+/* XXX: silence, you ! */
+unsigned
+nv50_inst_min_size(struct nv_instruction *i);
+
+unsigned
+nv50_inst_min_size(struct nv_instruction *i)
+{
+   int n;
+   /* returns 8 unless every condition for the table's smaller size holds */
+   if (nv50_inst_min_size_tab[i->opcode] > 4)
+      return 8;
+   /* def[0] may be NULL, so every access to it is guarded */
+   if (i->def[0] && i->def[0]->reg.file != NV_FILE_GPR)
+      return 8;
+   if (i->def[0] && i->def[0]->join->reg.id > 63)
+      return 8;
+   /* sources: GPR or NV_FILE_MEM_V only, register ids up to 63 */
+   for (n = 0; n < 3; ++n) {
+      if (!i->src[n])
+         break;
+      if (i->src[n]->value->reg.file != NV_FILE_GPR &&
+          i->src[n]->value->reg.file != NV_FILE_MEM_V)
+         return 8;
+      if (i->src[n]->value->reg.id > 63)
+         return 8;
+   }
+   /* flags access or an address register operand forces the large size */
+   if (i->flags_def || i->flags_src || i->src[4])
+      return 8;
+   /* three sources: no saturate or ABS, and src 2 must share the dst register */
+   if (i->src[2]) {
+      if (i->saturate || i->src[2]->mod)
+         return 8;
+      if (i->src[0]->mod ^ i->src[1]->mod)
+         return 8;
+      if ((i->src[0]->mod | i->src[1]->mod) & NV_MOD_ABS)
+         return 8;
+      if (!i->def[0] || i->def[0]->join->reg.id < 0 ||
+          i->def[0]->join->reg.id != i->src[2]->value->join->reg.id)
+         return 8;
+   }
+   /* all checks passed: report the per-opcode table value */
+   return nv50_inst_min_size_tab[i->opcode];
+}
+
+static INLINE ubyte
+STYPE(struct nv_instruction *nvi, int s)
+{ /* typecast recorded on source reference s (src[s] must be non-NULL) */
+   return nvi->src[s]->typecast;
+}
+
+static INLINE ubyte
+DTYPE(struct nv_instruction *nvi, int d)
+{ /* reg.type of destination value d (def[d] must be non-NULL) */
+   return nvi->def[d]->reg.type;
+}
+
+static INLINE struct nv_reg *
+SREG(struct nv_ref *ref)
+{ /* register of the referenced value, looked up through its join pointer */
+   return &ref->value->join->reg;
+}
+
+static INLINE struct nv_reg *
+DREG(struct nv_value *val)
+{ /* register of val, looked up through its join pointer */
+   return &val->join->reg;
+}
+
+static INLINE ubyte
+SFILE(struct nv_instruction *nvi, int s)
+{ /* register file of source s, read from the value itself (not its join) */
+   return nvi->src[s]->value->reg.file;
+}
+
+static INLINE ubyte
+DFILE(struct nv_instruction *nvi, int d)
+{ /* register file of destination d, read from the value itself (not its join) */
+   return nvi->def[d]->reg.file;
+}
+
+static INLINE void
+SID(struct nv_pc *pc, struct nv_ref *ref, int pos)
+{ /* OR the source register id in at bit 'pos', counted across emit[] words */
+   pc->emit[pos / 32] |= SREG(ref)->id << (pos % 32);
+}
+
+static INLINE void
+DID(struct nv_pc *pc, struct nv_value *val, int pos)
+{ /* OR the destination register id in at bit 'pos', counted across emit[] words */
+   pc->emit[pos / 32] |= DREG(val)->id << (pos % 32);
+}
+
+static INLINE uint32_t
+get_immd_u32(struct nv_ref *ref)
+{ /* raw 32-bit immediate of the referenced value, which must be NV_FILE_IMM */
+   assert(ref->value->reg.file == NV_FILE_IMM);
+   return ref->value->reg.imm.u32;
+}
+
+static INLINE void
+set_immd_u32(struct nv_pc *pc, uint32_t u32)
+{
+   uint32_t *const code = pc->emit;
+   code[0] |= (u32 & 0x3f) << 16;     /* low 6 bits of the value go to word 0 */
+   code[1] |= ((u32 >> 6) << 2) | 3;  /* upper 26 bits, plus bits 0-1 of word 1 */
+}
+
+static INLINE void
+set_immd(struct nv_pc *pc, struct nv_ref *ref)
+{ /* encode the immediate behind ref into the instruction at pc->emit */
+   assert(ref->value->reg.file == NV_FILE_IMM);
+   set_immd_u32(pc, get_immd_u32(ref));
+}
+
+static void
+new_fixup(struct nv_pc *pc, unsigned type, uint32_t data, uint32_t m, int s)
+{
+   const unsigned size = sizeof(struct nv_fixup);
+   const unsigned n = pc->num_fixups;
+   return; /* NOTE(review): unconditional return, nothing below is executed */
+   /* the fixup array grows in steps of 8 entries */
+   if (!(n % 8))
+      pc->fixups = REALLOC(pc->fixups, n * size, (n + 8) * size);
+   /* s is a bit position: s / 32 selects the word, s % 32 the bit inside it */
+   pc->fixups[n].offset = pc->bin_pos + (s / 32);
+   pc->fixups[n].type = type;
+   pc->fixups[n].data = data;
+   pc->fixups[n].mask = m << (s % 32);
+   pc->fixups[n].shift = s % 32;
+
+   ++pc->num_fixups;
+   /* the shifted data must fit inside the mask */
+   assert(((data << (s % 32)) & pc->fixups[n].mask) == (data << (s % 32)));
+}
+
+/* Give ref's immediate a slot in pc->immd_buf, sharing slots of equal values. */
+static void
+nv_pc_alloc_immd(struct nv_pc *pc, struct nv_ref *ref)
+{
+   const uint32_t val = get_immd_u32(ref);
+   uint32_t slot = 0;
+
+   while (slot < pc->immd_count && pc->immd_buf[slot] != val)
+      ++slot;
+
+   if (slot == pc->immd_count) {
+      const unsigned used = pc->immd_count * 4;
+      if (!(pc->immd_count % 8)) /* buffer grows 8 words at a time */
+         pc->immd_buf = REALLOC(pc->immd_buf, used, used + 8 * 4);
+      pc->immd_buf[pc->immd_count++] = val;
+   }
+   SREG(ref)->id = slot;
+}
+
+static INLINE void
+set_pred(struct nv_pc *pc, struct nv_instruction *i)
+{
+   uint32_t bits = i->cc << FLAGS_CC_SHIFT;
+   assert(!(pc->emit[1] & 0x00003f80));
+   if (i->flags_src)
+      bits |= SREG(i->flags_src)->id << FLAGS_ID_SHIFT;
+   pc->emit[1] |= bits;
+}
+
+static INLINE void
+set_pred_wr(struct nv_pc *pc, struct nv_instruction *i)
+{
+   assert(!(pc->emit[1] & 0x00000070));
+   if (!i->flags_def)
+      return;
+   pc->emit[1] |= FLAGS_WR_EN | (DREG(i->flags_def)->id << FLAGS_WR_ID_SHIFT);
+}
+
+static INLINE void
+set_a16_bits(struct nv_pc *pc, uint id)
+{
+   const uint areg = id + 1; /* $a0 is always 0 */
+   pc->emit[1] |= areg & 4;          /* bit 2 of the register number */
+   pc->emit[0] |= (areg & 3) << 26;  /* bits 0-1 of the register number */
+}
+
+static INLINE void
+set_addr(struct nv_pc *pc, struct nv_instruction *i)
+{ /* encode the address register operand, which lives in src[4], if present */
+   if (i->src[4])
+      set_a16_bits(pc, SREG(i->src[4])->id);
+}
+
+/* Encode the destination register of 'value' into the current instruction. */
+static void
+set_dst(struct nv_pc *pc, struct nv_value *value)
+{
+   struct nv_reg *reg = DREG(value);
+
+   if (reg->id < 0) {
+      /* no register assigned: id 127 together with the 0x8 bit of word 1 */
+      debug_printf("WARNING: unused dst, hope we can bucket it !\n");
+      pc->emit[0] |= 127 << 2;
+      pc->emit[1] |= 0x8;
+      return;
+   }
+   if (reg->file == NV_FILE_OUT)
+      pc->emit[1] |= 0x8;
+   else if (reg->file == NV_FILE_ADDR)
+      assert(0);
+
+   pc->emit[0] |= reg->id << 2;
+}
+
+/* Encode source 0: register id at bit 9 of word 0, plus file specific bits. */
+static void
+set_src_0(struct nv_pc *pc, struct nv_ref *ref)
+{
+   struct nv_reg *reg = SREG(ref);
+   const ubyte file = reg->file;
+
+   if (file == NV_FILE_MEM_S) {
+      pc->emit[1] |= 0x00200000;
+   } else if (file == NV_FILE_MEM_P) {
+      pc->emit[0] |= 0x01800000;
+   } else if (file != NV_FILE_GPR) {
+      NOUVEAU_ERR("invalid src0 register file: %d\n", reg->file);
+   }
+   assert(reg->id < 128);
+   pc->emit[0] |= reg->id << 9;
+}
+
+/* Encode source 1: register id at bit 16 of word 0; MEM_C files add a selector. */
+static void
+set_src_1(struct nv_pc *pc, struct nv_ref *ref)
+{
+   struct nv_reg *reg = SREG(ref);
+   const int cbuf = reg->file - NV_FILE_MEM_C(0);
+
+   if (cbuf >= 0 && cbuf <= 15) {
+      assert(!(pc->emit[1] & 0x01800000));
+      pc->emit[0] |= 0x00800000;
+      pc->emit[1] |= cbuf << 22;
+   } else if (reg->file != NV_FILE_GPR) {
+      /* reported only; the id is still encoded below */
+      NOUVEAU_ERR("invalid src1 register file: %d\n", reg->file);
+   }
+   assert(reg->id < 128);
+   pc->emit[0] |= reg->id << 16;
+}
+
+/* Encode source 2: register id at bit 14 of word 1; MEM_C files add a selector. */
+static void
+set_src_2(struct nv_pc *pc, struct nv_ref *ref)
+{
+   struct nv_reg *reg = SREG(ref);
+   const int cbuf = reg->file - NV_FILE_MEM_C(0);
+
+   if (cbuf >= 0 && cbuf <= 15) {
+      assert(!(pc->emit[1] & 0x01800000));
+      pc->emit[0] |= 0x01000000;
+      pc->emit[1] |= cbuf << 22;
+   } else if (reg->file != NV_FILE_GPR) {
+      /* reported only; the id is still encoded below */
+      NOUVEAU_ERR("invalid src2 register file: %d\n", reg->file);
+   }
+   assert(reg->id < 128);
+   pc->emit[1] |= reg->id << 14;
+}
+
+/* Default encoding:
+ * - long instruction (bit 0 of word 0 is set)
+ * - up to three sources, placed in slots 0, 1 and 2
+ * - predicate, flags write and address register are encoded as well
+ */
+static void
+emit_form_MAD(struct nv_pc *pc, struct nv_instruction *i)
+{
+   struct nv_value *const dst = i->def[0];
+
+   pc->emit[0] |= 1;
+   set_pred(pc, i);
+   set_pred_wr(pc, i);
+
+   if (!dst) {
+      /* no result: same bits set_dst() uses for an unassigned register */
+      pc->emit[0] |= 0x01fc;
+      pc->emit[1] |= 0x0008;
+   } else {
+      set_dst(pc, dst);
+   }
+
+   if (i->src[0])
+      set_src_0(pc, i->src[0]);
+   if (i->src[1])
+      set_src_1(pc, i->src[1]);
+   if (i->src[2])
+      set_src_2(pc, i->src[2]);
+
+   set_addr(pc, i);
+}
+
+/* Same as the default form, except that the 2nd source uses slot 2 (no 3rd). */
+static void
+emit_form_ADD(struct nv_pc *pc, struct nv_instruction *i)
+{
+   struct nv_value *const dst = i->def[0];
+
+   pc->emit[0] |= 1;
+   if (!dst) {
+      /* no result: same bits set_dst() uses for an unassigned register */
+      pc->emit[0] |= 0x01fc;
+      pc->emit[1] |= 0x0008;
+   } else {
+      set_dst(pc, dst);
+   }
+
+   set_pred(pc, i);
+   set_pred_wr(pc, i);
+
+   if (i->src[0])
+      set_src_0(pc, i->src[0]);
+   if (i->src[1])
+      set_src_2(pc, i->src[1]); /* second operand takes the slot 2 encoding */
+   set_addr(pc, i);
+}
+
+/* Short (single word) form: a mandatory destination plus up to two sources
+ * in slots 0 and 1.  The instruction must not be marked long and bit 0 of
+ * emit[0] must still be clear.
+ */
+static void
+emit_form_MUL(struct nv_pc *pc, struct nv_instruction *i)
+{
+   assert(!i->is_long && !(pc->emit[0] & 1));
+
+   assert(i->def[0]);
+   set_dst(pc, i->def[0]);
+
+   if (i->src[0]) {
+      set_src_0(pc, i->src[0]);
+   }
+   if (i->src[1]) {
+      set_src_1(pc, i->src[1]);
+   }
+}
+
+/* Default immediate form:
+ * - 1 to 3 sources, the last of which is the immediate
+ * - no address or predicate possible (asserted: no src[4], no flags)
+ *
+ * With three sources, src[1] is encoded in slot 0 and src[0] in slot 1.
+ * mod_mask must currently be 0.
+ */
+static void
+emit_form_IMM(struct nv_pc *pc, struct nv_instruction *i, ubyte mod_mask)
+{
+   pc->emit[0] |= 1;
+
+   assert(i->def[0]);
+   assert(i->src[0]);
+   set_dst(pc, i->def[0]);
+
+   assert(!i->src[4] && !i->flags_src && !i->flags_def);
+
+   if (i->src[2]) {
+      set_immd(pc, i->src[2]);
+      set_src_0(pc, i->src[1]);
+      set_src_1(pc, i->src[0]);
+   } else if (i->src[1]) {
+      set_immd(pc, i->src[1]);
+      set_src_0(pc, i->src[0]);
+   } else {
+      set_immd(pc, i->src[0]);
+   }
+
+   assert(!mod_mask);
+}
+
+/* OR the size selector that corresponds to the given value type into
+ * emit[1].  Types that are not listed leave emit[1] unchanged.
+ */
+static void
+set_ld_st_size(struct nv_pc *pc, ubyte type)
+{
+   unsigned size_bits = 0;
+
+   switch (type) {
+   case NV_TYPE_F64:
+      size_bits = 0x8000;
+      break;
+   case NV_TYPE_F32:
+   case NV_TYPE_S32:
+   case NV_TYPE_U32:
+      size_bits = 0xc000;
+      break;
+   case NV_TYPE_S16:
+      size_bits = 0x6000;
+      break;
+   case NV_TYPE_U16:
+      size_bits = 0x4000;
+      break;
+   case NV_TYPE_S8:
+      size_bits = 0x2000;
+      break;
+   default:
+      break;
+   }
+
+   pc->emit[1] |= size_bits;
+}
+
+/* Emit a load from the memory file of src[0] into def[0].
+ *
+ * An immediate source is first given a slot via nv_pc_alloc_immd and then
+ * treated as a load from NV_FILE_MEM_C(0), with a NV_FIXUP_PARAM_RELOC
+ * fixup recorded for it.  Loads from the NV_FILE_MEM_G(n) files take their
+ * register operand from src[4]; all other files encode the id of src[0] at
+ * bit 9 and call set_addr.  An unsupported file aborts.
+ */
+static void
+emit_ld(struct nv_pc *pc, struct nv_instruction *i)
+{
+   ubyte file = SFILE(i, 0);
+   boolean from_gmem;
+
+   if (file == NV_FILE_IMM) {
+      file = NV_FILE_MEM_C(0);
+      nv_pc_alloc_immd(pc, i->src[0]);
+
+      new_fixup(pc, NV_FIXUP_PARAM_RELOC, SREG(i->src[0])->id, 0xffff, 9);
+   }
+
+   from_gmem = (file >= NV_FILE_MEM_G(0) && file <= NV_FILE_MEM_G(15));
+
+   if (file == NV_FILE_MEM_S || file == NV_FILE_MEM_P) {
+      pc->emit[0] = 0x10000001;
+      pc->emit[1] = 0x04200000 | (0x3c << 12);
+      if (file == NV_FILE_MEM_P) {
+         pc->emit[0] |= 0x01800000;
+      }
+   } else if (file >= NV_FILE_MEM_C(0) && file <= NV_FILE_MEM_C(15)) {
+      pc->emit[0] = 0x10000001;
+      pc->emit[1] = 0x24000000;
+      pc->emit[1] |= (file - NV_FILE_MEM_C(0)) << 22;
+   } else if (from_gmem) {
+      pc->emit[0] = 0xd0000001 | ((file - NV_FILE_MEM_G(0)) << 16);
+      pc->emit[1] = 0xa0000000;
+
+      assert(i->src[4] && SREG(i->src[4])->file == NV_FILE_GPR);
+      SID(pc, i->src[4], 9);
+   } else if (file == NV_FILE_MEM_L) {
+      pc->emit[0] = 0xd0000001;
+      pc->emit[1] = 0x40000000;
+   } else {
+      NOUVEAU_ERR("invalid ld source file\n");
+      abort();
+   }
+
+   set_ld_st_size(pc, STYPE(i, 0));
+
+   set_dst(pc, i->def[0]);
+   set_pred_wr(pc, i);
+
+   set_pred(pc, i);
+
+   if (!from_gmem) {
+      SID(pc, i->src[0], 9);
+      set_addr(pc, i);
+   }
+}
+
+/* Emitter for NV_OP_STA.  The body is empty: nothing is written to
+ * pc->emit here, so stores are not encoded by this function yet.
+ */
+static void
+emit_st(struct nv_pc *pc, struct nv_instruction *i)
+{
+
+}
+
+/* Check the source / destination file combination of a move.
+ *
+ * Returns 0 if the combination is accepted, otherwise a non-zero code:
+ *   1 - destination is not GPR, OUT, FLAGS or ADDR
+ *   2 - source is FLAGS and destination is not a GPR
+ *   3 - source is ADDR and destination is not a GPR
+ *   4 - source is an immediate and destination is neither GPR nor OUT
+ */
+static int
+verify_mov(struct nv_instruction *i)
+{
+   const ubyte src_file = SFILE(i, 0);
+   const ubyte dst_file = DFILE(i, 0);
+
+   if (dst_file == NV_FILE_GPR)
+      return 0;
+
+   if (!(dst_file == NV_FILE_OUT ||
+         dst_file == NV_FILE_FLAGS ||
+         dst_file == NV_FILE_ADDR))
+      return 1;
+
+   switch (src_file) {
+   case NV_FILE_FLAGS:
+      return 2;
+   case NV_FILE_ADDR:
+      return 3;
+   case NV_FILE_IMM:
+      return (dst_file != NV_FILE_OUT) ? 4 : 0;
+   default:
+      return 0;
+   }
+}
+
+/* Emit a move (also used for NV_OP_LDA).
+ *
+ * The branches are tried in this order:
+ *  - source file >= NV_FILE_MEM_S: handled as a load by emit_ld
+ *  - source is FLAGS or ADDR: dedicated encodings with the destination id
+ *    at bit 2 of emit[0]
+ *  - destination is FLAGS: source id at bit 9, destination id in emit[1]
+ *  - source is an immediate: emit_ld for NV_OP_LDA, otherwise the immediate
+ *    form
+ *  - otherwise a plain register move, short or long depending on is_long
+ *
+ * A destination in NV_FILE_OUT additionally sets bit 3 of emit[1].
+ * The operand combination is validated with verify_mov first.
+ */
+static void
+emit_mov(struct nv_pc *pc, struct nv_instruction *i)
+{
+   assert(!verify_mov(i));
+
+   if (SFILE(i, 0) >= NV_FILE_MEM_S)
+      emit_ld(pc, i);
+   else
+   if (SFILE(i, 0) == NV_FILE_FLAGS) {
+      pc->emit[0] = 0x00000001 | (DREG(i->def[0])->id << 2);
+      pc->emit[1] = 0x20000780 | (SREG(i->src[0])->id << 12);
+   } else
+   if (SFILE(i, 0) == NV_FILE_ADDR) {
+      pc->emit[0] = 0x00000001 | (DREG(i->def[0])->id << 2);
+      pc->emit[1] = 0x40000780;
+      set_a16_bits(pc, SREG(i->src[0])->id);
+   } else
+   if (DFILE(i, 0) == NV_FILE_FLAGS) {
+      pc->emit[0] = 0x000001fd;
+      pc->emit[1] = 0xa0000788 | (1 << 6);
+      pc->emit[0] |= SREG(i->src[0])->id << 9;
+      pc->emit[1] |= DREG(i->def[0])->id << 4;
+   } else
+   if (SFILE(i, 0) == NV_FILE_IMM) {
+      if (i->opcode == NV_OP_LDA)
+         emit_ld(pc, i);
+      else {
+         pc->emit[0] = 0x10008001;
+         pc->emit[1] = 0x00000003;
+
+	 emit_form_IMM(pc, i, 0);
+      }
+   } else {
+      /* register to register move */
+      pc->emit[0] = 0x10000000;
+      pc->emit[0] |= DREG(i->def[0])->id << 2;
+      pc->emit[0] |= SREG(i->src[0])->id << 9;
+
+      if (!i->is_long)
+         pc->emit[0] |= 0x8000;
+      else {
+         pc->emit[0] |= 0x00000001;
+         pc->emit[1] = 0x0403c000;
+
+	 set_pred(pc, i);
+      }
+   }
+
+   /* NOTE(review): this also writes emit[1] for a short move - confirm that
+    * moves to NV_FILE_OUT are always long.
+    */
+   if (DFILE(i, 0) == NV_FILE_OUT)
+      pc->emit[1] |= 0x8;
+}
+
+/* Emit an interpolation instruction (NV_OP_LINTERP / NV_OP_PINTERP).
+ *
+ * The destination must be a GPR and the source must be in NV_FILE_MEM_V.
+ * In the short encoding the mode bits live in emit[0]: bit 8 for flat,
+ * bit 24 for centroid and bit 25 for NV_OP_PINTERP (whose second source id
+ * goes to bit 9).  For a long instruction these bits are relocated to
+ * emit[1] (24..25 -> 16..17, 8 -> 18) and cleared in emit[0].
+ */
+static void
+emit_interp(struct nv_pc *pc, struct nv_instruction *i)
+{
+   pc->emit[0] = 0x80000000;
+
+   assert(DFILE(i, 0) == NV_FILE_GPR);
+   assert(SFILE(i, 0) == NV_FILE_MEM_V);
+
+   DID(pc, i->def[0], 2);
+   SID(pc, i->src[0], 16);
+
+   if (i->flat)
+      pc->emit[0] |= 1 << 8;
+   else
+   if (i->opcode == NV_OP_PINTERP) {
+      pc->emit[0] |= 1 << 25;
+      pc->emit[0] |= SREG(i->src[1])->id << 9;
+   }
+
+   if (i->centroid)
+      pc->emit[0] |= 1 << 24;
+
+   if (i->is_long) {
+      /* Bit 8 has to move up to bit 18, i.e. be shifted left.  Shifting it
+       * right by 10 always yielded 0 and silently dropped the flat flag.
+       */
+      pc->emit[1] |= 0x0780 |
+         (pc->emit[0] & (3 << 24)) >> (24 - 16) |
+         (pc->emit[0] & (1 <<  8)) << (18 -  8);
+
+      pc->emit[0] |= 1;
+      pc->emit[0] &= ~0x03000100;
+   }
+}
+
+/* Emit NV_OP_MIN / NV_OP_MAX in the default long form.  The destination
+ * type (F32, S32 or U32) selects the type bits; NV_MOD_ABS on src[0] and
+ * src[1] is encoded in emit[1].
+ */
+static void
+emit_minmax(struct nv_pc *pc, struct nv_instruction *i)
+{
+   pc->emit[0] = 0x30000000;
+
+   if (i->opcode == NV_OP_MIN)
+      pc->emit[1] = 2 << 28;
+   else
+      pc->emit[1] = 0;
+
+   switch (DTYPE(i, 0)) {
+   case NV_TYPE_F32:
+      pc->emit[0] |= 0x80000000;
+      pc->emit[1] |= 0x80000000;
+      break;
+   case NV_TYPE_S32:
+      pc->emit[1] |= 0x8c000000;
+      break;
+   case NV_TYPE_U32:
+      pc->emit[1] |= 0x84000000;
+      break;
+   default:
+      break;
+   }
+
+   emit_form_MAD(pc, i);
+
+   if (i->src[0]->mod & NV_MOD_ABS) {
+      pc->emit[1] |= 0x00100000;
+   }
+   if (i->src[1]->mod & NV_MOD_ABS) {
+      pc->emit[1] |= 0x00080000;
+   }
+}
+
+/* Emit a 32 bit float add.  Three encodings are used: the immediate form
+ * when src[1] is an immediate, the long ADD form, or the short form.
+ * The negate bits sit in emit[1] for the long form and in emit[0] for the
+ * immediate and short forms.
+ */
+static void
+emit_add_f32(struct nv_pc *pc, struct nv_instruction *i)
+{
+   const boolean has_imm = (SFILE(i, 1) == NV_FILE_IMM);
+
+   pc->emit[0] = 0xb0000000;
+
+   if (!has_imm && i->is_long) {
+      emit_form_ADD(pc, i);
+
+      if (i->src[0]->mod & NV_MOD_NEG)
+         pc->emit[1] |= 1 << 26;
+      if (i->src[1]->mod & NV_MOD_NEG)
+         pc->emit[1] |= 1 << 27;
+      return;
+   }
+
+   if (has_imm)
+      emit_form_IMM(pc, i, 0);
+   else
+      emit_form_MUL(pc, i);
+
+   /* immediate and short form share the position of the negate bits */
+   if (i->src[0]->mod & NV_MOD_NEG)
+      pc->emit[0] |= 0x8000;
+   if (i->src[1]->mod & NV_MOD_NEG)
+      pc->emit[0] |= 1 << 22;
+}
+
+/* Emit a 32 bit integer add, using the immediate form when src[1] is an
+ * immediate, the long ADD form when the instruction is long, and the short
+ * form otherwise.  NV_MOD_NEG on either source is encoded in emit[0].
+ */
+static void
+emit_add_b32(struct nv_pc *pc, struct nv_instruction *i)
+{
+   if (SFILE(i, 1) == NV_FILE_IMM) {
+      pc->emit[0] = 0x20008000;
+      emit_form_IMM(pc, i, 0);
+   } else if (i->is_long) {
+      pc->emit[0] = 0x20000000;
+      pc->emit[1] = 0x04000000;
+      emit_form_ADD(pc, i);
+   } else {
+      pc->emit[0] = 0x20008000;
+      emit_form_MUL(pc, i);
+   }
+
+   if (i->src[0]->mod & NV_MOD_NEG) {
+      pc->emit[0] |= 1 << 28;
+   }
+   if (i->src[1]->mod & NV_MOD_NEG) {
+      pc->emit[0] |= 1 << 22;
+   }
+}
+
+/* Emit an add / move whose destination is in the address register file.
+ * The immediate value of src[0] is placed at bit 9 of emit[0], the
+ * destination id plus one at bit 2, and an optional src[1] is encoded
+ * through set_a16_bits.
+ */
+static void
+emit_add_a16(struct nv_pc *pc, struct nv_instruction *i)
+{
+   const int dst_id = DREG(i->def[0])->id + 1;
+
+   pc->emit[0] = 0xd0000001 | (get_immd_u32(i->src[0]) << 9);
+   pc->emit[1] = 0x20000000;
+
+   pc->emit[0] |= dst_id << 2;
+
+   set_pred(pc, i);
+
+   if (i->src[1]) {
+      set_a16_bits(pc, SREG(i->src[1])->id);
+   }
+}
+
+/* Emit a control flow instruction.  flow_op is placed in the top 4 bits of
+ * emit[0].  If the instruction has a target block, its bin_pos / 4 is
+ * encoded at bit 11 and a NV_FIXUP_CFLOW_RELOC fixup is recorded for it.
+ */
+static void
+emit_flow(struct nv_pc *pc, struct nv_instruction *i, ubyte flow_op)
+{
+   struct nv_basic_block *dest;
+
+   pc->emit[0] = 0x00000003 | (flow_op << 28);
+   pc->emit[1] = 0x00000000;
+
+   set_pred(pc, i);
+
+   dest = i->target;
+   if (!dest)
+      return;
+
+   new_fixup(pc, NV_FIXUP_CFLOW_RELOC, dest->bin_pos, 0x7ff800, 11);
+   pc->emit[0] |= (dest->bin_pos / 4) << 11;
+}
+
+/* Dispatch an add to the emitter that matches its destination: address
+ * register destinations go to emit_add_a16, otherwise the destination type
+ * selects the float or the integer variant.  Other types emit nothing.
+ */
+static INLINE void
+emit_add(struct nv_pc *pc, struct nv_instruction *i)
+{
+   if (DFILE(i, 0) == NV_FILE_ADDR) {
+      emit_add_a16(pc, i);
+      return;
+   }
+
+   switch (DTYPE(i, 0)) {
+   case NV_TYPE_F32:
+      emit_add_f32(pc, i);
+      break;
+   case NV_TYPE_U32:
+   case NV_TYPE_S32:
+      emit_add_b32(pc, i);
+      break;
+   }
+}
+
+/* Emit a two operand bit operation (NV_OP_AND, NV_OP_OR, NV_OP_XOR).
+ *
+ * Uses the immediate form when the second source is an immediate and the
+ * default long form otherwise.  The opcode selects additional bits in
+ * emit[0] (immediate form) or emit[1] (long form); AND needs none.
+ */
+static void
+emit_bitop2(struct nv_pc *pc, struct nv_instruction *i)
+{
+   pc->emit[0] = 0xd0000000;
+
+   /* emit_form_IMM takes the LAST source as the immediate, so the check
+    * has to look at src[1] like the other emitters do.  Testing src[0] sent
+    * a register second operand to set_immd and an immediate second operand
+    * to set_src_1.
+    */
+   if (SFILE(i, 1) == NV_FILE_IMM) {
+      emit_form_IMM(pc, i, 0);
+
+      if (i->opcode == NV_OP_OR)
+         pc->emit[0] |= 0x0100;
+      else
+      if (i->opcode == NV_OP_XOR)
+         pc->emit[0] |= 0x8000;
+   } else {
+      emit_form_MAD(pc, i);
+
+      pc->emit[1] |= 0x04000000;
+
+      if (i->opcode == NV_OP_OR)
+         pc->emit[1] |= 0x4000;
+      else
+      if (i->opcode == NV_OP_XOR)
+         pc->emit[1] |= 0x8000;
+   }
+}
+
+/* Emit NV_OP_SHL / NV_OP_SHR.  An immediate shift count is masked to 7
+ * bits and stored at bit 16 of emit[0]; otherwise the default long form is
+ * used.  A signed source type sets bit 27 of emit[1].
+ *
+ * NOTE(review): the immediate path only calls set_pred; it does not encode
+ * a destination or src[0] here - confirm that this is intended.
+ */
+static void
+emit_shift(struct nv_pc *pc, struct nv_instruction *i)
+{
+   const boolean imm_count = (SFILE(i, 1) == NV_FILE_IMM);
+
+   pc->emit[0] = 0x30000001;
+   pc->emit[1] = 0xc4000000;
+
+   if (i->opcode == NV_OP_SHR) {
+      pc->emit[1] |= 1 << 29;
+   }
+
+   if (!imm_count) {
+      emit_form_MAD(pc, i);
+   } else {
+      pc->emit[1] |= 1 << 20;
+      pc->emit[0] |= (get_immd_u32(i->src[1]) & 0x7f) << 16;
+
+      set_pred(pc, i);
+   }
+
+   if (STYPE(i, 0) == NV_TYPE_S32) {
+      pc->emit[1] |= 1 << 27;
+   }
+}
+
+/* Emit one of the single source float operations starting at NV_OP_RCP.
+ * The source must be an F32 value in a GPR.  The short form is only valid
+ * for NV_OP_RCP without source modifiers; the long form stores the opcode
+ * offset from NV_OP_RCP at bit 29 of emit[1] and supports NEG and ABS.
+ */
+static void
+emit_flop(struct nv_pc *pc, struct nv_instruction *i)
+{
+   struct nv_ref *src0 = i->src[0];
+
+   pc->emit[0] = 0x90000000;
+
+   assert(SREG(src0)->type == NV_TYPE_F32);
+   assert(SREG(src0)->file == NV_FILE_GPR);
+
+   if (i->is_long) {
+      pc->emit[1] = (i->opcode - NV_OP_RCP) << 29;
+
+      emit_form_MAD(pc, i);
+
+      if (src0->mod & NV_MOD_NEG)
+         pc->emit[1] |= 0x04000000;
+      if (src0->mod & NV_MOD_ABS)
+         pc->emit[1] |= 0x00100000;
+   } else {
+      emit_form_MUL(pc, i);
+      assert(i->opcode == NV_OP_RCP && !src0->mod);
+   }
+}
+
+/* Emit a 32 bit float multiply-add.  The product is negated when exactly
+ * one of src[0] / src[1] carries NV_MOD_NEG; the addend is negated when
+ * src[2] does.  The short form supports neither negation; the long form
+ * also encodes the saturate flag.
+ */
+static void
+emit_mad_f32(struct nv_pc *pc, struct nv_instruction *i)
+{
+   const boolean neg_mul = (i->src[0]->mod ^ i->src[1]->mod) & NV_MOD_NEG;
+   const boolean neg_add = (i->src[2]->mod & NV_MOD_NEG);
+
+   pc->emit[0] = 0xe0000000;
+
+   if (i->is_long) {
+      emit_form_MAD(pc, i);
+
+      if (neg_mul) {
+         pc->emit[1] |= 0x04000000;
+      }
+      if (neg_add) {
+         pc->emit[1] |= 0x08000000;
+      }
+      if (i->saturate) {
+         pc->emit[1] |= 0x20000000;
+      }
+   } else {
+      emit_form_MUL(pc, i);
+      assert(!neg_mul && !neg_add);
+   }
+}
+
+/* Emit NV_OP_MAD; only the 32 bit float variant exists, so this simply
+ * forwards to emit_mad_f32.
+ */
+static INLINE void
+emit_mad(struct nv_pc *pc, struct nv_instruction *i)
+{
+   emit_mad_f32(pc, i);
+}
+
+/* Emit a 32 bit float multiply.  The result is negated when exactly one of
+ * the two sources carries NV_MOD_NEG.  The negate bit lives in emit[1] for
+ * the long form and in emit[0] for the immediate and short forms.
+ */
+static void
+emit_mul_f32(struct nv_pc *pc, struct nv_instruction *i)
+{
+   boolean neg = (i->src[0]->mod ^ i->src[1]->mod) & NV_MOD_NEG;
+   const boolean has_imm = (SFILE(i, 1) == NV_FILE_IMM);
+
+   pc->emit[0] = 0xc0000000;
+
+   if (!has_imm && i->is_long) {
+      emit_form_MAD(pc, i);
+
+      if (neg)
+         pc->emit[1] |= 0x08000000;
+      return;
+   }
+
+   if (has_imm)
+      emit_form_IMM(pc, i, 0);
+   else
+      emit_form_MUL(pc, i);
+
+   if (neg)
+      pc->emit[0] |= 0x8000;
+}
+
+/* Emit NV_OP_SET (long form only).  The condition code is stored at bit 14
+ * of emit[1]; the type of src[0] selects the comparison type bits.  Any
+ * type other than U32, S32 or F32 trips an assertion.
+ */
+static void
+emit_set(struct nv_pc *pc, struct nv_instruction *nvi)
+{
+   assert(nvi->is_long);
+
+   pc->emit[0] = 0x30000000;
+   pc->emit[1] = 0x60000000;
+
+   pc->emit[1] |= nvi->set_cond << 14;
+
+   switch (STYPE(nvi, 0)) {
+   case NV_TYPE_U32:
+      pc->emit[1] |= 0x04000000;
+      break;
+   case NV_TYPE_S32:
+      pc->emit[1] |= 0x0c000000;
+      break;
+   case NV_TYPE_F32:
+      pc->emit[0] |= 0x80000000;
+      break;
+   default:
+      assert(0);
+      break;
+   }
+
+   emit_form_MAD(pc, nvi);
+}
+
+/* Bits ORed into emit[1] by emit_cvt: rounding / modifier flags ... */
+#define CVT_RN    (0x00 << 16)
+#define CVT_FLOOR (0x02 << 16)
+#define CVT_CEIL  (0x04 << 16)
+#define CVT_TRUNC (0x06 << 16)
+#define CVT_SAT   (0x08 << 16)
+#define CVT_ABS   (0x10 << 16)
+
+/* ... and the base value of emit[1] for each CVT_<dst>_<src> type pair.
+ * The top byte is written with an unsigned constant: shifting 0x80 or more
+ * left by 24 as a plain int overflows, which is undefined behaviour in C.
+ */
+#define CVT_X32_X32 0x04004000
+#define CVT_X32_S32 0x04014000
+#define CVT_F32_F32 ((0xc0u << 24) | CVT_X32_X32)
+#define CVT_S32_F32 ((0x88u << 24) | CVT_X32_X32)
+#define CVT_U32_F32 ((0x80u << 24) | CVT_X32_X32)
+#define CVT_F32_S32 ((0x40u << 24) | CVT_X32_S32)
+#define CVT_F32_U32 ((0x40u << 24) | CVT_X32_X32)
+#define CVT_S32_S32 ((0x08u << 24) | CVT_X32_S32)
+#define CVT_S32_U32 ((0x08u << 24) | CVT_X32_X32)
+#define CVT_U32_S32 ((0x00u << 24) | CVT_X32_S32)
+#define CVT_U32_U32 ((0x00u << 24) | CVT_X32_X32)
+
+#define CVT_NEG 0x20000000
+#define CVT_RI  0x08000000
+
+/* Emit a conversion style instruction (NV_OP_CVT, ABS, NEG, SAT, CEIL,
+ * FLOOR, TRUNC).
+ *
+ * emit[1] starts from the CVT_<dst>_<src> constant for the type pair; the
+ * destination type falls back to the source type when there is no def[0].
+ * The opcode and the modifiers of src[0] then add rounding, ABS, SAT and
+ * NEG bits.  A NEG modifier toggles CVT_NEG, so it cancels NV_OP_NEG.
+ */
+static void
+emit_cvt(struct nv_pc *pc, struct nv_instruction *nvi)
+{
+   const ubyte src_type = STYPE(nvi, 0);
+   const ubyte dst_type = nvi->def[0] ? DTYPE(nvi, 0) : src_type;
+   const boolean rounds = nvi->opcode == NV_OP_CEIL ||
+                          nvi->opcode == NV_OP_FLOOR ||
+                          nvi->opcode == NV_OP_TRUNC;
+
+   pc->emit[0] = 0xa0000000;
+
+   switch (dst_type) {
+   case NV_TYPE_F32:
+      if (src_type == NV_TYPE_F32)
+         pc->emit[1] = CVT_F32_F32;
+      else if (src_type == NV_TYPE_S32)
+         pc->emit[1] = CVT_F32_S32;
+      else if (src_type == NV_TYPE_U32)
+         pc->emit[1] = CVT_F32_U32;
+      break;
+   case NV_TYPE_S32:
+      if (src_type == NV_TYPE_F32)
+         pc->emit[1] = CVT_S32_F32;
+      else if (src_type == NV_TYPE_S32)
+         pc->emit[1] = CVT_S32_S32;
+      else if (src_type == NV_TYPE_U32)
+         pc->emit[1] = CVT_S32_U32;
+      break;
+   case NV_TYPE_U32:
+      if (src_type == NV_TYPE_F32)
+         pc->emit[1] = CVT_U32_F32;
+      else if (src_type == NV_TYPE_S32)
+         pc->emit[1] = CVT_U32_S32;
+      else if (src_type == NV_TYPE_U32)
+         pc->emit[1] = CVT_U32_U32;
+      break;
+   }
+
+   /* float to float rounding additionally sets CVT_RI */
+   if (pc->emit[1] == CVT_F32_F32 && rounds)
+      pc->emit[1] |= CVT_RI;
+
+   switch (nvi->opcode) {
+   case NV_OP_CEIL:
+      pc->emit[1] |= CVT_CEIL;
+      break;
+   case NV_OP_FLOOR:
+      pc->emit[1] |= CVT_FLOOR;
+      break;
+   case NV_OP_TRUNC:
+      pc->emit[1] |= CVT_TRUNC;
+      break;
+   case NV_OP_ABS:
+      pc->emit[1] |= CVT_ABS;
+      break;
+   case NV_OP_SAT:
+      pc->emit[1] |= CVT_SAT;
+      break;
+   case NV_OP_NEG:
+      pc->emit[1] |= CVT_NEG;
+      break;
+   default:
+      assert(nvi->opcode == NV_OP_CVT);
+      break;
+   }
+   assert(nvi->opcode != NV_OP_ABS || !(nvi->src[0]->mod & NV_MOD_NEG));
+
+   if (nvi->src[0]->mod & NV_MOD_NEG) {
+      pc->emit[1] ^= CVT_NEG;
+   }
+   if (nvi->src[0]->mod & NV_MOD_ABS) {
+      pc->emit[1] |= CVT_ABS;
+   }
+
+   emit_form_MAD(pc, nvi);
+}
+
+/* Emit a texture instruction (NV_OP_TEX, NV_OP_TXB, NV_OP_TXL).
+ *
+ * tex_t, tex_s and tex_argc are stored in emit[0]; the low two bits of
+ * tex_mask go to emit[0] and the upper two to emit[1].  TXB and TXL set a
+ * mode bit in emit[1]; for any other opcode the field holding tex_argc
+ * is decremented by one.
+ */
+static void
+emit_tex(struct nv_pc *pc, struct nv_instruction *i)
+{
+   pc->emit[0] = 0xf0000001;
+   pc->emit[1] = 0x00000000;
+
+   DID(pc, i->def[0], 2);
+
+   set_pred(pc, i);
+
+   pc->emit[0] |= (i->tex_t << 9) | (i->tex_s << 17) | (i->tex_argc << 22);
+
+   pc->emit[0] |= (i->tex_mask & 0x3) << 25;
+   pc->emit[1] |= (i->tex_mask & 0xc) << 12;
+
+   if (i->tex_live) {
+      pc->emit[1] |= 4;
+   }
+   if (i->tex_cube) {
+      pc->emit[0] |= 0x08000000;
+   }
+
+   switch (i->opcode) {
+   case NV_OP_TXB:
+      pc->emit[1] |= 0x20000000;
+      break;
+   case NV_OP_TXL:
+      pc->emit[1] |= 0x40000000;
+      break;
+   default:
+      pc->emit[0] -= 1 << 22;
+      break;
+   }
+}
+
+/* Emit NV_OP_PRESIN / NV_OP_PREEX2 in the default long form.  PREEX2 sets
+ * an extra bit in emit[1]; NEG and ABS modifiers of src[0] are encoded in
+ * emit[1] as well.
+ */
+static void
+emit_cvt2fixed(struct nv_pc *pc, struct nv_instruction *i)
+{
+   const ubyte src_mod = i->src[0]->mod;
+
+   pc->emit[0] = 0xb0000000;
+   pc->emit[1] = 0xc0000000;
+
+   if (i->opcode == NV_OP_PREEX2) {
+      pc->emit[1] |= 0x4000;
+   }
+
+   emit_form_MAD(pc, i);
+
+   if (src_mod & NV_MOD_NEG) {
+      pc->emit[1] |= 0x04000000;
+   }
+   if (src_mod & NV_MOD_ABS) {
+      pc->emit[1] |= 0x00100000;
+   }
+}
+
+/* Emit NV_OP_DFDX.  Requires a long instruction with a GPR source.  The
+ * base opcode words depend on whether src[0] is negated; the source id is
+ * encoded twice, at bit 9 and at bit 32 + 14.
+ */
+static void
+emit_ddx(struct nv_pc *pc, struct nv_instruction *i)
+{
+   boolean negated;
+
+   assert(i->is_long && SFILE(i, 0) == NV_FILE_GPR);
+
+   negated = (i->src[0]->mod & NV_MOD_NEG) != 0;
+
+   if (negated) {
+      pc->emit[0] = 0xc0240001;
+      pc->emit[1] = 0x86400000;
+   } else {
+      pc->emit[0] = 0xc0140001;
+      pc->emit[1] = 0x89800000;
+   }
+
+   DID(pc, i->def[0], 2);
+   SID(pc, i->src[0], 9);
+   SID(pc, i->src[0], 32 + 14);
+
+   set_pred(pc, i);
+   set_pred_wr(pc, i);
+}
+
+/* Emit NV_OP_DFDY.  Requires a long instruction with a GPR source.  The
+ * base opcode words depend on whether src[0] is negated; the source id is
+ * encoded twice, at bit 9 and at bit 32 + 14.
+ */
+static void
+emit_ddy(struct nv_pc *pc, struct nv_instruction *i)
+{
+   boolean negated;
+
+   assert(i->is_long && SFILE(i, 0) == NV_FILE_GPR);
+
+   negated = (i->src[0]->mod & NV_MOD_NEG) != 0;
+
+   if (negated) {
+      pc->emit[0] = 0xc0250001;
+      pc->emit[1] = 0x85800000;
+   } else {
+      pc->emit[0] = 0xc0150001;
+      pc->emit[1] = 0x8a400000;
+   }
+
+   DID(pc, i->def[0], 2);
+   SID(pc, i->src[0], 9);
+   SID(pc, i->src[0], 32 + 14);
+
+   set_pred(pc, i);
+   set_pred_wr(pc, i);
+}
+
+/* Encode one instruction into pc->emit by dispatching on its opcode.
+ *
+ * - NV_OP_MOV with an address register destination is emitted through
+ *   emit_add_a16, every other MOV and NV_OP_LDA through emit_mov.
+ * - NV_OP_PHI and NV_OP_SUB are expected to be gone by now and only log an
+ *   error; an unknown opcode logs an error and aborts.
+ *
+ * After dispatch, bit 0 of emit[0] must agree with i->is_long.
+ */
+void
+nv50_emit_instruction(struct nv_pc *pc, struct nv_instruction *i)
+{
+   // nv_print_instruction(i);
+
+   switch (i->opcode) {
+   case NV_OP_MOV:
+      if (DFILE(i, 0) == NV_FILE_ADDR)
+         emit_add_a16(pc, i);
+      else
+         emit_mov(pc, i);
+      break;
+   case NV_OP_LDA:
+      emit_mov(pc, i);
+      break;
+   case NV_OP_STA:
+      emit_st(pc, i);
+      break;
+   case NV_OP_LINTERP:
+   case NV_OP_PINTERP:
+      emit_interp(pc, i);
+      break;
+   case NV_OP_ADD:
+      emit_add(pc, i);
+      break;
+   case NV_OP_AND:
+   case NV_OP_OR:
+   case NV_OP_XOR:
+      emit_bitop2(pc, i);
+      break;
+   case NV_OP_CVT:
+   case NV_OP_ABS:
+   case NV_OP_NEG:
+   case NV_OP_SAT:
+   case NV_OP_CEIL:
+   case NV_OP_FLOOR:
+   case NV_OP_TRUNC:
+      emit_cvt(pc, i);
+      break;
+   case NV_OP_DFDX:
+      emit_ddx(pc, i);
+      break;
+   case NV_OP_DFDY:
+      emit_ddy(pc, i);
+      break;
+   case NV_OP_RCP:
+   case NV_OP_RSQ:
+   case NV_OP_LG2:
+   case NV_OP_SIN:
+   case NV_OP_COS:
+   case NV_OP_EX2:
+      emit_flop(pc, i);
+      break;
+   case NV_OP_PRESIN:
+   case NV_OP_PREEX2:
+      emit_cvt2fixed(pc, i);
+      break;
+   case NV_OP_MAD:
+      emit_mad(pc, i);
+      break;
+   case NV_OP_MAX:
+   case NV_OP_MIN:
+      emit_minmax(pc, i);
+      break;
+   case NV_OP_MUL:
+      emit_mul_f32(pc, i);
+      break;
+   case NV_OP_SET:
+      emit_set(pc, i);
+      break;
+   case NV_OP_SHL:
+   case NV_OP_SHR:
+      emit_shift(pc, i);
+      break;
+   case NV_OP_TEX:
+   case NV_OP_TXB:
+   case NV_OP_TXL:
+      emit_tex(pc, i);
+      break;
+   /* control flow: the third argument is the flow opcode for emit_flow */
+   case NV_OP_KIL:
+      emit_flow(pc, i, 0x0);
+      break;
+   case NV_OP_BRA:
+      emit_flow(pc, i, 0x1);
+      break;
+   case NV_OP_CALL:
+      emit_flow(pc, i, 0x2);
+      break;
+   case NV_OP_RET:
+      emit_flow(pc, i, 0x3);
+      break;
+   case NV_OP_BREAKADDR:
+      emit_flow(pc, i, 0x4);
+      break;
+   case NV_OP_BREAK:
+      emit_flow(pc, i, 0x5);
+      break;
+   case NV_OP_JOINAT:
+      emit_flow(pc, i, 0xa);
+      break;
+   case NV_OP_NOP:
+      pc->emit[0] = 0xf0000001;
+      pc->emit[1] = 0xe0000000;
+      break;
+   case NV_OP_PHI:
+   case NV_OP_SUB:
+      NOUVEAU_ERR("operation \"%s\" should have been eliminated\n",
+		  nv_opcode_name(i->opcode));
+      break;
+   default:
+      NOUVEAU_ERR("unhandled NV_OP: %d\n", i->opcode);
+      abort();
+      break;
+   }
+
+   /* the chosen encoding must match the size decided before emission */
+   assert((pc->emit[0] & 1) == i->is_long);
+}
diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c
new file mode 100644
index 0000000000..0811420e42
--- /dev/null
+++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c
@@ -0,0 +1,717 @@
+
+#include "nv50_pc.h"
+
+/* Mark the current block as visited for this pass and run f on each of its
+ * (up to two) successors that has not been visited yet.
+ *
+ * The expansion uses the names `b` (current block) and `ctx` (pass context
+ * with a `pc` member) from the calling function; j is a loop variable
+ * supplied by the caller.
+ */
+#define DESCEND_ARBITRARY(j, f)                                 \
+do {                                                            \
+   b->pass_seq = ctx->pc->pass_seq;                             \
+                                                                \
+   for (j = 0; j < 2; ++j)                                      \
+      if (b->out[j] && b->out[j]->pass_seq < ctx->pc->pass_seq) \
+         f(ctx, b->out[j]);	                                  \
+} while (0)
+
+extern unsigned nv50_inst_min_size(struct nv_instruction *);
+
+/* Pass context that carries nothing but the program being processed. */
+struct nv_pc_pass {
+   struct nv_pc *pc;
+};
+
+/* Two values are considered equal when they are in the same register file
+ * and their join representatives have the same register id.
+ * XXX: sizes are not taken into account.
+ */
+static INLINE boolean
+values_equal(struct nv_value *a, struct nv_value *b)
+{
+   if (a->reg.file != b->reg.file)
+      return FALSE;
+
+   return a->join->reg.id == b->join->reg.id;
+}
+
+/* One direction of the commutation test: returns FALSE if b reads any
+ * value that a defines (register results or the flags result), TRUE
+ * otherwise.
+ */
+static INLINE boolean
+inst_commutation_check(struct nv_instruction *a,
+                       struct nv_instruction *b)
+{
+   int d, s;
+
+   /* definitions are packed: stop at the first empty def slot */
+   for (d = 0; d < 4 && a->def[d]; ++d) {
+      for (s = 0; s < 5; ++s) {
+         if (b->src[s] && values_equal(a->def[d], b->src[s]->value))
+            return FALSE;
+      }
+   }
+
+   if (b->flags_src && b->flags_src->value == a->flags_def)
+      return FALSE;
+
+   return TRUE;
+}
+
+/* Decide whether two instructions may trade places.  Neither one may read
+ * a result of the other, regardless of which of the two comes first.
+ */
+static boolean
+inst_commutation_legal(struct nv_instruction *a,
+                       struct nv_instruction *b)
+{
+   if (!inst_commutation_check(a, b))
+      return FALSE;
+
+   return inst_commutation_check(b, a);
+}
+
+/* An instruction can be culled if it is not a terminator, has no branch
+ * target, is not marked fixed and nv_nvi_refcount reports no references.
+ */
+static INLINE boolean
+inst_cullable(struct nv_instruction *nvi)
+{
+   return !nvi->is_terminator &&
+          !nvi->target &&
+          !nvi->fixed &&
+          !nv_nvi_refcount(nvi);
+}
+
+/* Decide whether an instruction has no effect and can be removed before
+ * emission.
+ *
+ * NV_OP_EXPORT always counts as a nop.  Fixed instructions, terminators
+ * and anything reading or writing flags never do.  Otherwise an
+ * instruction is a nop if its result has no register assigned (id < 0),
+ * or if it is a MOV / SELECT whose sources ended up in the same register
+ * as its destination.
+ */
+static INLINE boolean
+nvi_isnop(struct nv_instruction *nvi)
+{
+   const boolean is_select = (nvi->opcode == NV_OP_SELECT);
+
+   if (nvi->opcode == NV_OP_EXPORT)
+      return TRUE;
+
+   if (nvi->fixed || nvi->is_terminator)
+      return FALSE;
+   if (nvi->flags_src || nvi->flags_def)
+      return FALSE;
+
+   if (nvi->def[0]->join->reg.id < 0)
+      return TRUE;
+
+   if (!is_select && nvi->opcode != NV_OP_MOV)
+      return FALSE;
+
+   if (nvi->def[0]->reg.file != nvi->src[0]->value->reg.file)
+      return FALSE;
+
+   if (nvi->src[0]->value->join->reg.id < 0) {
+      debug_printf("nvi_isnop: orphaned value detected\n");
+      return TRUE;
+   }
+
+   /* a SELECT needs its second source to match the destination as well */
+   if (is_select && !values_equal(nvi->def[0], nvi->src[1]->value))
+      return FALSE;
+
+   return values_equal(nvi->def[0], nvi->src[0]->value);
+}
+
+/* Prepare block b for emission and recurse into its successors.
+ *
+ * Per block this
+ *  - appends b to pc->bb_list and sets its bin_pos right behind the
+ *    previously listed block,
+ *  - deletes instructions for which nvi_isnop is true,
+ *  - decides for every instruction whether it is emitted long, counting
+ *    the block size in 32 bit words while doing so, and
+ *  - converts the size to bytes and adds it to pc->bin_size.
+ *
+ * n32 counts the short instructions seen since the last long one.  When a
+ * long instruction would follow an odd number of short ones, it is swapped
+ * with a following short instruction if that is legal; otherwise the
+ * preceding short instruction is made long too.  The last instruction of
+ * a block is always long.
+ */
+static void
+nv_pc_pass_pre_emission(struct nv_pc *pc, struct nv_basic_block *b)
+{
+   struct nv_instruction *nvi, *next;
+   int j;
+   uint size, n32 = 0;
+
+   b->priv = 0;
+
+   if (pc->num_blocks)
+      b->bin_pos = pc->bb_list[pc->num_blocks - 1]->bin_pos +
+                   pc->bb_list[pc->num_blocks - 1]->bin_size;
+
+   pc->bb_list[pc->num_blocks++] = b;
+
+   /* visit node */
+
+   /* first sweep: drop instructions that turned out to be no-ops */
+   for (nvi = b->entry; nvi; nvi = next) {
+      next = nvi->next;
+      if (nvi_isnop(nvi))
+         nv_nvi_delete(nvi);
+   }
+
+   /* second sweep: choose short / long encodings and sum up the size */
+   for (nvi = b->entry; nvi; nvi = next) {
+      next = nvi->next;
+
+      size = nv50_inst_min_size(nvi);
+      if (nvi->next && size < 8)
+         ++n32;
+      else
+      if ((n32 & 1) && nvi->next &&
+          nv50_inst_min_size(nvi->next) == 4 &&
+          inst_commutation_legal(nvi, nvi->next)) {
+         ++n32;
+         debug_printf("permuting: ");
+         nv_print_instruction(nvi);
+         nv_print_instruction(nvi->next);
+         nv_nvi_permute(nvi, nvi->next);
+         /* process nvi again at its new position */
+         next = nvi;
+      } else {
+         nvi->is_long = 1;
+
+         /* an odd run of shorts: widen the previous instruction */
+         b->bin_size += n32 & 1;
+         if (n32 & 1)
+            nvi->prev->is_long = 1;
+         n32 = 0;
+      }
+      b->bin_size += 1 + nvi->is_long;
+   }
+
+   if (!b->entry) {
+      debug_printf("block %p is now empty\n", b);
+   } else
+   if (!b->exit->is_long) {
+      assert(n32);
+      b->exit->is_long = 1;
+      b->bin_size += 1;
+
+      /* might have del'd a whole tail of instructions */
+      if (!b->exit->prev->is_long && !(n32 & 1)) {
+         b->bin_size += 1;
+         b->exit->prev->is_long = 1;
+      }
+   }
+   assert(!b->exit || b->exit->is_long);
+
+   /* bin_size was counted in 32 bit words so far; convert to bytes */
+   pc->bin_size += b->bin_size *= 4;
+
+   /* descend CFG */
+
+   if (!b->out[0])
+      return;
+   /* with a single successor, continue only once the number of arrivals
+    * counted in its priv field equals its num_in
+    */
+   if (!b->out[1] && ++(b->out[0]->priv) != b->out[0]->num_in)
+      return;
+
+#if 0
+   /* delete ELSE branch */
+   if (b->entry &&
+       b->entry->opcode == NV_OP_BRA && b->entry->target == b->out[0]) {
+      nv_nvi_delete(b->entry);
+      b->bin_size -= 2;
+      pc->bin_size -= 8;
+   }
+#endif
+   for (j = 0; j < 2; ++j)
+      if (b->out[j] && b->out[j] != b)
+         nv_pc_pass_pre_emission(pc, b->out[j]);
+}
+
+/* Run the pre-emission pass over the whole program.
+ *
+ * Allocates pc->bb_list with room for pc->num_blocks entries, then resets
+ * num_blocks so that nv_pc_pass_pre_emission can use it as the fill index
+ * while it lists the blocks in emission order.
+ *
+ * Returns 0 on success and non-zero if the block list cannot be allocated.
+ */
+int
+nv_pc_exec_pass2(struct nv_pc *pc)
+{
+   debug_printf("preparing %u blocks for emission\n", pc->num_blocks);
+
+   pc->bb_list = CALLOC(pc->num_blocks, sizeof(struct nv_basic_block *));
+   if (!pc->bb_list) {
+      /* the pass below writes through bb_list unconditionally */
+      return 1;
+   }
+
+   pc->num_blocks = 0;
+   nv_pc_pass_pre_emission(pc, pc->root);
+
+   return 0;
+}
+
+/* TRUE if nvi is an NV_OP_LDA whose source is in one of the files
+ * NV_FILE_MEM_C(0..15).
+ *
+ * nvi may be NULL: callers pass value->insn directly, which is not set for
+ * every value, so that case simply yields FALSE.
+ */
+static INLINE boolean
+is_cmem_load(struct nv_instruction *nvi)
+{
+   return (nvi &&
+           nvi->opcode == NV_OP_LDA &&
+           nvi->src[0]->value->reg.file >= NV_FILE_MEM_C(0) &&
+           nvi->src[0]->value->reg.file <= NV_FILE_MEM_C(15));
+}
+
+/* TRUE if nvi is an NV_OP_LDA whose source is in NV_FILE_MEM_S or
+ * NV_FILE_MEM_P.
+ *
+ * The second test has to be an equality: with `<=` every file numbered
+ * below NV_FILE_MEM_P was accepted as well.  nvi may be NULL because
+ * callers pass value->insn directly; that case yields FALSE.
+ */
+static INLINE boolean
+is_smem_load(struct nv_instruction *nvi)
+{
+   return (nvi &&
+           nvi->opcode == NV_OP_LDA &&
+           (nvi->src[0]->value->reg.file == NV_FILE_MEM_S ||
+            nvi->src[0]->value->reg.file == NV_FILE_MEM_P));
+}
+
+/* TRUE if nvi is an NV_OP_MOV whose source is an immediate. */
+static INLINE boolean
+is_immd_move(struct nv_instruction *nvi)
+{
+   if (nvi->opcode != NV_OP_MOV)
+      return FALSE;
+
+   return nvi->src[0]->value->reg.file == NV_FILE_IMM;
+}
+
+/* For commutative operations, reorder the first two sources so that loads
+ * can be folded into the slot that accepts them: a constant memory load is
+ * moved to slot 1, a load recognised by is_smem_load to slot 0.  A swap is
+ * skipped when the other operand is a load of the same kind.
+ *
+ * For NV_OP_SET the condition code is remapped through cc_swapped when the
+ * operands were exchanged.
+ */
+static INLINE void
+check_swap_src_0_1(struct nv_instruction *nvi)
+{
+   static const ubyte cc_swapped[8] = { 0, 4, 2, 6, 1, 5, 3, 7 };
+
+   struct nv_ref *src0 = nvi->src[0], *src1 = nvi->src[1];
+   boolean swap = FALSE;
+
+   if (!nv_op_commutative(nvi->opcode))
+      return;
+   assert(src0 && src1);
+
+   if (is_cmem_load(src0->value->insn))
+      swap = !is_cmem_load(src1->value->insn);
+   else if (is_smem_load(src1->value->insn))
+      swap = !is_smem_load(src0->value->insn);
+
+   if (!swap)
+      return;
+
+   nvi->src[0] = src1;
+   nvi->src[1] = src0;
+
+   if (nvi->opcode == NV_OP_SET)
+      nvi->set_cond = cc_swapped[nvi->set_cond];
+}
+
+/* Context handed to the nv_pass_* functions below.  Those in view only
+ * read pc; n and priv are not used by them.
+ */
+struct nv_pass {
+   struct nv_pc *pc;
+   int n;
+   void *priv;
+};
+
+/* Fold MOV / STA instructions that write to NV_FILE_OUT into the
+ * instruction that computes the stored value: the producer takes over the
+ * output destination and becomes fixed, while the store loses its fixed
+ * flag.  PHI results and values with more than one reference are left
+ * alone.  Always returns 0.
+ */
+static int
+nv_pass_fold_stores(struct nv_pass *ctx, struct nv_basic_block *b)
+{
+   struct nv_instruction *store;
+   int j;
+
+   for (store = b->entry; store; store = store->next) {
+      struct nv_instruction *producer;
+
+      if (!store->def[0] || store->def[0]->reg.file != NV_FILE_OUT)
+         continue;
+      if (!(store->opcode == NV_OP_MOV || store->opcode == NV_OP_STA))
+         continue;
+
+      producer = store->src[0]->value->insn;
+      if (!producer || producer->opcode == NV_OP_PHI)
+         continue;
+      assert(producer->def[0] == store->src[0]->value);
+
+      if (producer->def[0]->refc > 1)
+         continue;
+
+      producer->def[0] = store->def[0];
+      producer->fixed = 1;
+      store->fixed = 0;
+   }
+   DESCEND_ARBITRARY(j, nv_pass_fold_stores);
+
+   return 0;
+}
+
+/* Fold immediate moves and NV_OP_LDA loads directly into the instructions
+ * that use their results, where nv50_nvi_can_use_imm / nv50_nvi_can_load
+ * allow it.  Only the first three sources are considered.  When a folded
+ * load has a src[4], it is carried over to the user.  Always returns 0.
+ */
+static int
+nv_pass_fold_loads(struct nv_pass *ctx, struct nv_basic_block *b)
+{
+   struct nv_instruction *user;
+   int j;
+
+   for (user = b->entry; user; user = user->next) {
+      int s;
+
+      check_swap_src_0_1(user);
+
+      for (s = 0; s < 3 && user->src[s]; ++s) {
+         struct nv_instruction *ld = user->src[s]->value->insn;
+
+         if (!ld)
+            continue;
+
+         if (is_immd_move(ld) && nv50_nvi_can_use_imm(user, s)) {
+            nv_reference(ctx->pc, &user->src[s], ld->src[0]->value);
+            debug_printf("folded immediate %i\n", ld->def[0]->n);
+            continue;
+         }
+
+         if (ld->opcode != NV_OP_LDA ||
+             !nv50_nvi_can_load(user, s, ld->src[0]->value))
+            continue;
+
+         /* can't load shared mem */
+         if (s == 0 && ld->src[4])
+            continue;
+
+         /* fold it ! */ /* XXX: ref->insn */
+         nv_reference(ctx->pc, &user->src[s], ld->src[0]->value);
+         if (ld->src[4]) {
+            nv_reference(ctx->pc, &user->src[4], ld->src[4]->value);
+         }
+      }
+   }
+   DESCEND_ARBITRARY(j, nv_pass_fold_loads);
+
+   return 0;
+}
+
+/* Lower SUB to ADD and fold NEG / ABS / SAT instructions into the source
+ * modifiers or saturate flag of the instructions involved.
+ *
+ * - SUB a, b becomes ADD a, -b.
+ * - A source produced by a single-use NEG or ABS is replaced by that
+ *   instruction's own source, with the modifier applied to the reference,
+ *   provided the user supports the modifier on that slot.
+ * - NEG(NEG x) becomes MOV x; taking ABS of a NEG / ABS result drops the
+ *   inner operation.
+ * - SAT of a single-use MAD result is folded into the MAD's saturate flag.
+ *
+ * Always returns 0.
+ */
+static int
+nv_pass_lower_mods(struct nv_pass *ctx, struct nv_basic_block *b)
+{
+   int j;
+   struct nv_instruction *nvi, *mi, *next;
+   ubyte mod;
+
+   for (nvi = b->entry; nvi; nvi = next) {
+      next = nvi->next;
+      if (nvi->opcode == NV_OP_SUB) {
+         nvi->opcode = NV_OP_ADD;
+         nvi->src[1]->mod ^= NV_MOD_NEG;
+      }
+
+      /* should not put any modifiers on NEG and ABS
+       * (compare against the opcodes, not the modifier bits)
+       */
+      assert(nvi->opcode != NV_OP_NEG || !nvi->src[0]->mod);
+      assert(nvi->opcode != NV_OP_ABS || !nvi->src[0]->mod);
+
+      for (j = 0; j < 4; ++j) {
+         if (!nvi->src[j])
+            break;
+
+         mi = nvi->src[j]->value->insn;
+         if (!mi)
+            continue;
+         if (mi->def[0]->refc > 1)
+            continue;
+
+         if (mi->opcode == NV_OP_NEG) mod = NV_MOD_NEG;
+         else
+         if (mi->opcode == NV_OP_ABS) mod = NV_MOD_ABS;
+         else
+            continue;
+
+         /* The folded operation may carry a modifier of its own from an
+          * earlier fold; bypassing it would silently drop that modifier.
+          */
+         if (mi->src[0]->mod)
+            continue;
+
+         if (nvi->opcode == NV_OP_ABS ||
+             (nvi->src[j]->mod & NV_MOD_ABS)) {
+            /* abs(neg x) == abs(abs x) == abs(x): nothing to add */
+            mod &= ~(NV_MOD_NEG | NV_MOD_ABS);
+         } else
+         if (nvi->opcode == NV_OP_NEG && mod == NV_MOD_NEG) {
+            /* neg(neg x) == x */
+            nvi->opcode = NV_OP_MOV;
+            mod = 0;
+         }
+
+         /* All requested modifier bits must be supported.  An empty set
+          * must pass, otherwise the MOV created above would keep reading
+          * the NEG result and yield -x.
+          */
+         if ((nv50_supported_src_mods(nvi->opcode, j) & mod) != mod)
+            continue;
+
+         nv_reference(ctx->pc, &nvi->src[j], mi->src[0]->value);
+
+         nvi->src[j]->mod ^= mod;
+      }
+
+      if (nvi->opcode == NV_OP_SAT) {
+         mi = nvi->src[0]->value->insn;
+
+         /* Only fold when the MAD result feeds nothing but this SAT and
+          * the SAT reads it unmodified; mi is NULL for values without a
+          * defining instruction.
+          */
+         if (mi && (mi->opcode == NV_OP_MAD) && !mi->flags_def &&
+             !nvi->src[0]->mod && mi->def[0]->refc <= 1) {
+            mi->saturate = 1;
+            mi->def[0] = nvi->def[0];
+            nv_nvi_delete(nvi);
+         }
+      }
+   }
+   DESCEND_ARBITRARY(j, nv_pass_lower_mods);
+
+   return 0;
+}
+
+/* True if value s is defined by an NV_OP_MUL instruction. */
+#define SRC_IS_MUL(s) ((s)->insn && (s)->insn->opcode == NV_OP_MUL)
+
+/* Follow a reference back through any chain of NV_OP_MOV instructions and
+ * return the value at its origin if that value is an immediate.
+ * Returns NULL for a NULL reference or a non-immediate origin.
+ */
+static struct nv_value *
+find_immediate(struct nv_ref *ref)
+{
+   struct nv_value *val;
+
+   if (!ref)
+      return NULL;
+
+   for (val = ref->value;
+        val->insn && val->insn->opcode == NV_OP_MOV;
+        val = val->insn->src[0]->value) {
+      /* moves on the way are expected to be free of modifiers */
+      assert(!val->insn->src[0]->mod);
+   }
+
+   if (val->reg.file != NV_FILE_IMM)
+      return NULL;
+   return val;
+}
+
+/* Simplify nvi given that its source s (0 or 1) is the immediate val.
+ *
+ * MUL:  x * 1  -> MOV x          x * 2  -> ADD x, x
+ *       x * -1 -> NEG x          x * -2 -> ADD -x, -x
+ *       x * 0  -> MOV 0 (the immediate)
+ * ADD:  x + 0  -> MOV x
+ *
+ * The float cases apply to F32 results only; the 1 and 2 cases also apply
+ * to integer types.  Instructions without a def[0] are left untouched.
+ */
+static void
+constant_operand(struct nv_pc *pc,
+                 struct nv_instruction *nvi, struct nv_value *val, int s)
+{
+   int t = s ? 0 : 1; /* index of the other operand */
+   ubyte type;
+
+   if (!nvi->def[0])
+      return;
+   type = nvi->def[0]->reg.type;
+
+   switch (nvi->opcode) {
+   case NV_OP_MUL:
+      if ((type == NV_TYPE_F32 && val->reg.imm.f32 == 1.0f) ||
+          (NV_TYPE_ISINT(type) && val->reg.imm.u32 == 1)) {
+         nvi->opcode = NV_OP_MOV;
+         nv_reference(pc, &nvi->src[s], NULL);
+         if (!s) {
+            nvi->src[0] = nvi->src[1];
+            nvi->src[1] = NULL;
+         }
+      } else
+      if ((type == NV_TYPE_F32 && val->reg.imm.f32 == 2.0f) ||
+          (NV_TYPE_ISINT(type) && val->reg.imm.u32 == 2)) {
+         /* x * 2 -> x + x.  Both operands must stay populated: dropping
+          * the immediate left an ADD with a single source, whose missing
+          * src[1] is dereferenced by nv_pass_lower_arith.
+          */
+         nvi->opcode = NV_OP_ADD;
+         assert(!nvi->src[s]->mod);
+         nv_reference(pc, &nvi->src[s], nvi->src[t]->value);
+         nvi->src[s]->mod = nvi->src[t]->mod;
+      } else
+      if (type == NV_TYPE_F32 && val->reg.imm.f32 == -1.0f) {
+         nvi->opcode = NV_OP_NEG;
+         nv_reference(pc, &nvi->src[s], NULL);
+         nvi->src[0] = nvi->src[t];
+         nvi->src[1] = NULL;
+      } else
+      if (type == NV_TYPE_F32 && val->reg.imm.f32 == -2.0f) {
+         /* x * -2 -> (-x) + (-x).  Both operands need the same modifier
+          * set: flip NEG on the existing one and copy the result, so a
+          * NEG or ABS already present on x is honoured on both sides.
+          */
+         nvi->opcode = NV_OP_ADD;
+         assert(!nvi->src[s]->mod);
+         nv_reference(pc, &nvi->src[s], nvi->src[t]->value);
+         nvi->src[t]->mod ^= NV_MOD_NEG;
+         nvi->src[s]->mod = nvi->src[t]->mod;
+      } else
+      if (val->reg.imm.u32 == 0) {
+         /* keep the zero immediate as the MOV source */
+         nvi->opcode = NV_OP_MOV;
+         nv_reference(pc, &nvi->src[t], NULL);
+         if (s) {
+            nvi->src[0] = nvi->src[1];
+            nvi->src[1] = NULL;
+         }
+      }
+      break;
+   case NV_OP_ADD:
+      if (val->reg.imm.u32 == 0) {
+         nvi->opcode = NV_OP_MOV;
+         nv_reference(pc, &nvi->src[s], NULL);
+         nvi->src[0] = nvi->src[t];
+         nvi->src[1] = NULL;
+      }
+      break;
+   default:
+      break;
+   }
+}
+
+/* Arithmetic simplification pass.
+ *
+ * For every instruction, an immediate in source 0 (or else source 1) is
+ * handed to constant_operand.  Afterwards an ADD with a single-use MUL
+ * result as one operand is turned into a MAD: the MUL's sources become
+ * sources 0 and 1, the other ADD operand becomes source 2.  A NEG modifier
+ * on the reference to the MUL result is moved onto the first factor.
+ * Always returns 0.
+ */
+static int
+nv_pass_lower_arith(struct nv_pass *ctx, struct nv_basic_block *b)
+{
+   struct nv_instruction *nvi, *next;
+   int j;
+
+   for (nvi = b->entry; nvi; nvi = next) {
+      struct nv_value *src0, *src1, *imm, *product;
+      int mul_slot, add_slot;
+      int mod;
+
+      next = nvi->next;
+
+      imm = find_immediate(nvi->src[0]);
+      if (imm != NULL) {
+         constant_operand(ctx->pc, nvi, imm, 0);
+      } else {
+         imm = find_immediate(nvi->src[1]);
+         if (imm != NULL)
+            constant_operand(ctx->pc, nvi, imm, 1);
+      }
+
+      /* try to combine MUL, ADD into MAD */
+      if (nvi->opcode != NV_OP_ADD)
+         continue;
+
+      src0 = nvi->src[0]->value;
+      src1 = nvi->src[1]->value;
+
+      if (SRC_IS_MUL(src0) && src0->refc == 1) {
+         product = src0;
+         mul_slot = 0;
+      } else if (SRC_IS_MUL(src1) && src1->refc == 1) {
+         product = src1;
+         mul_slot = 1;
+      } else {
+         continue;
+      }
+      add_slot = 1 - mul_slot;
+
+      nvi->opcode = NV_OP_MAD;
+      mod = nvi->src[mul_slot]->mod;
+      nv_reference(ctx->pc, &nvi->src[mul_slot], NULL);
+      nvi->src[2] = nvi->src[add_slot];
+
+      assert(!(mod & ~NV_MOD_NEG));
+      nvi->src[0] = new_ref(ctx->pc, product->insn->src[0]->value);
+      nvi->src[1] = new_ref(ctx->pc, product->insn->src[1]->value);
+      nvi->src[0]->mod = product->insn->src[0]->mod ^ mod;
+      nvi->src[1]->mod = product->insn->src[1]->mod;
+   }
+   DESCEND_ARBITRARY(j, nv_pass_lower_arith);
+
+   return 0;
+}
+
+/*
+set $r2 g f32 $r2 $r3
+cvt abs rn f32 $r2 s32 $r2
+cvt f32 $c0 # f32 $r2
+e $c0 bra 0x80
+*/
+#if 0
+static int
+nv_pass_lower_cond(struct nv_pass *ctx, struct nv_basic_block *b)
+{
+   /* XXX: compiled out and does nothing; easier in IR builder for now */
+   return 0;
+}
+#endif
+
+/* TODO: reload elimination, redundant store elimination */
+
+struct nv_pass_reldelim {
+   struct nv_pc *pc; /* program being processed; set by nv_pc_exec_pass0 */
+};
+
+static int
+nv_pass_reload_elim(struct nv_pass_reldelim *ctx, struct nv_basic_block *b)
+{
+   int j;
+   struct nv_instruction *ld, *next;
+   /* Placeholder: classifies the instructions of b but performs no elimination yet. */
+   for (ld = b->entry; ld; ld = next) {
+      next = ld->next;
+
+      if (ld->opcode == NV_OP_LINTERP || ld->opcode == NV_OP_PINTERP) {
+         /* TODO: interpolation handling not implemented */
+      } else
+      if (ld->opcode == NV_OP_LDA) {
+         /* TODO: LDA handling not implemented */
+      } else
+      if (ld->opcode == NV_OP_MOV) {
+         /* TODO: MOV handling not implemented */
+      }
+   }
+   DESCEND_ARBITRARY(j, nv_pass_reload_elim);
+
+   return 0;
+}
+
+static int
+nv_pass_tex_mask(struct nv_pass *ctx, struct nv_basic_block *b)
+{
+   int i, c, j;
+   /* Walks every instruction of the program (b is not consulted); always returns 0. */
+   for (i = 0; i < ctx->pc->num_instructions; ++i) {
+      struct nv_instruction *nvi = &ctx->pc->instructions[i];
+      struct nv_value *def[4];
+      /* only vector ops get their tex_mask rebuilt */
+      if (!nv_is_vector_op(nvi->opcode))
+         continue;
+      nvi->tex_mask = 0;
+      /* bit c is set when def[c] has at least one reference */
+      for (c = 0; c < 4; ++c) {
+         if (nvi->def[c]->refc)
+            nvi->tex_mask |= 1 << c;
+         def[c] = nvi->def[c];
+      }
+      /* reorder defs: referenced ones first, unreferenced ones after, order kept */
+      j = 0;
+      for (c = 0; c < 4; ++c)
+         if (nvi->tex_mask & (1 << c))
+            nvi->def[j++] = def[c];
+      for (c = 0; c < 4; ++c)
+         if (!(nvi->tex_mask & (1 << c)))
+           nvi->def[j++] = def[c];
+      assert(j == 4);
+   }
+   return 0;
+}
+
+struct nv_pass_dce {
+   struct nv_pc *pc; /* program being processed */
+   uint removed;     /* instructions deleted by nv_pass_dce since the caller last reset it */
+};
+
+static int
+nv_pass_dce(struct nv_pass_dce *ctx, struct nv_basic_block *b)
+{
+   int j;
+   struct nv_instruction *nvi, *next;
+   /* Deletes every instruction of b that inst_cullable() accepts and counts it. */
+   for (nvi = b->entry; nvi; nvi = next) {
+      next = nvi->next;
+
+      if (inst_cullable(nvi)) {
+         nv_nvi_delete(nvi);
+         /* next was saved above, so the walk continues past the deleted node */
+         ++ctx->removed;
+      }
+   }
+   DESCEND_ARBITRARY(j, nv_pass_dce);
+
+   return 0;
+}
+
+static INLINE boolean
+bb_simple_if_endif(struct nv_basic_block *bb)
+{
+   /* two successors, where out[0] has out[1] as its only successor */
+   struct nv_basic_block *body = bb->out[0], *join = bb->out[1];
+   return (body && join && body->out[0] == join && !body->out[1]);
+}
+
+static int
+nv_pass_flatten(struct nv_pass *ctx, struct nv_basic_block *b)
+{
+   int j;
+   /* Only counts simple IF/ENDIF constructs in ctx->n and prints the running total; nothing is rewritten. */
+   if (bb_simple_if_endif(b)) {
+      ++ctx->n;
+      debug_printf("nv_pass_flatten: total IF/ENDIF constructs: %i\n", ctx->n);
+   }
+   DESCEND_ARBITRARY(j, nv_pass_flatten);
+
+   return 0;
+}
+
+int
+nv_pc_exec_pass0(struct nv_pc *pc)
+{
+   struct nv_pass_reldelim reldelim;
+   struct nv_pass pass;
+   struct nv_pass_dce dce;
+   int ret;
+
+   /* Runs the passes below in order on the CFG rooted at pc->root and
+    * returns the first non-zero pass result, else 0. The pass contexts
+    * live on the stack, so there is no allocation that could fail.
+    */
+   reldelim.pc = pc;
+   ret = nv_pass_reload_elim(&reldelim, pc->root);
+   if (ret)
+      return ret;
+
+   pass.pc = pc;
+   pass.n = 0; /* counter read and incremented by nv_pass_flatten */
+   pc->pass_seq++;
+   ret = nv_pass_flatten(&pass, pc->root);
+   if (ret)
+      return ret;
+
+   /* Do this first, so we don't have to pay attention
+    * to whether sources are supported memory loads.
+    */
+   pc->pass_seq++;
+   ret = nv_pass_lower_arith(&pass, pc->root);
+   if (ret)
+      return ret;
+
+   pc->pass_seq++;
+   ret = nv_pass_fold_loads(&pass, pc->root);
+   if (ret)
+      return ret;
+
+   pc->pass_seq++;
+   ret = nv_pass_fold_stores(&pass, pc->root);
+   if (ret)
+      return ret;
+
+   pc->pass_seq++;
+   ret = nv_pass_lower_mods(&pass, pc->root);
+   if (ret)
+      return ret;
+
+   dce.pc = pc;
+   do {
+      dce.removed = 0;
+      pc->pass_seq++;
+      ret = nv_pass_dce(&dce, pc->root);
+      if (ret)
+         return ret;
+   } while (dce.removed);
+
+   ret = nv_pass_tex_mask(&pass, pc->root);
+   if (ret)
+      return ret;
+
+   return ret;
+}
diff --git a/src/gallium/drivers/nv50/nv50_pc_print.c b/src/gallium/drivers/nv50/nv50_pc_print.c
new file mode 100644
index 0000000000..09512ffb88
--- /dev/null
+++ b/src/gallium/drivers/nv50/nv50_pc_print.c
@@ -0,0 +1,287 @@
+
+#include "nv50_context.h"
+#include "nv50_pc.h"
+
+#define NVXX_DEBUG 0
+/* all printer output goes through debug_printf */
+#define PRINT(args...) debug_printf(args)
+
+#ifndef ARRAY_SIZE
+#define ARRAY_SIZE(a) (sizeof(a) / sizeof(a[0]))
+#endif
+/* ANSI SGR escape sequences used to colour the printed output */
+static const char *norm = "\x1b[00m"; /* reset */
+static const char *gree = "\x1b[32m"; /* green */
+static const char *blue = "\x1b[34m"; /* blue */
+static const char *cyan = "\x1b[36m"; /* cyan */
+static const char *orng = "\x1b[33m"; /* SGR 33 (yellow) */
+static const char *mgta = "\x1b[35m"; /* magenta */
+
+static const char *nv_opcode_names[NV_OP_COUNT + 1] = { /* indexed by opcode; nv_opcode_name clamps to the last slot */
+   "phi",
+   "extract",
+   "combine",
+   "lda",
+   "sta",
+   "mov",
+   "add",
+   "sub",
+   "neg",
+   "mul",
+   "mad",
+   "cvt",
+   "sat",
+   "not",
+   "and",
+   "or",
+   "xor",
+   "shl",
+   "shr",
+   "rcp",
+   "(undefined)",
+   "rsqrt",
+   "lg2",
+   "sin",
+   "cos",
+   "ex2",
+   "presin",
+   "preex2",
+   "min",
+   "max",
+   "set",
+   "sad",
+   "kil",
+   "bra",
+   "call",
+   "ret",
+   "break",
+   "breakaddr",
+   "joinat",
+   "tex",
+   "texbias",
+   "texlod",
+   "texfetch",
+   "texsize",
+   "dfdx",
+   "dfdy",
+   "quadop",
+   "linterp",
+   "pinterp",
+   "abs",
+   "ceil",
+   "floor",
+   "trunc",
+   "nop",
+   "select",
+   "export",
+   "BAD_OP"
+};
+
+static const char *nv_cond_names[] = /* 16 entries, indexed by nv_cond_name with the code clamped to 15 */
+{
+   "never", "lt" , "eq" , "le" , "gt" , "ne" , "ge" , "",
+   "never", "ltu", "equ", "leu", "gtu", "neu", "geu", ""
+};
+
+static const char *nv_modifier_strings[] = /* indexed by modifier value; slot 9 is the out-of-range fallback */
+{
+   "",
+   "neg",
+   "abs",
+   "neg abs",
+   "not",
+   "not neg",
+   "not abs",
+   "not neg abs",
+   "sat",
+   "BAD_MOD"
+};
+
+const char *
+nv_opcode_name(uint opcode)
+{ /* returns the table name; opcodes past the table map to its final entry */
+   return nv_opcode_names[MIN2(opcode, ARRAY_SIZE(nv_opcode_names) - 1)];
+}
+
+static INLINE const char *
+nv_type_name(ubyte type)
+{ /* short printable name of an NV_TYPE_* code; "BAD_TYPE" for anything not listed */
+   switch (type) {
+   case NV_TYPE_U16: return "u16";
+   case NV_TYPE_S16: return "s16";
+   case NV_TYPE_F32: return "f32";
+   case NV_TYPE_U32: return "u32";
+   case NV_TYPE_S32: return "s32";
+   case NV_TYPE_P32: return "p32";
+   case NV_TYPE_F64: return "f64";
+   default:
+      return "BAD_TYPE";
+   }
+}
+
+static INLINE const char *
+nv_cond_name(ubyte cc)
+{ /* name of condition code cc; values above 15 use entry 15 of nv_cond_names */
+   return nv_cond_names[MIN2(cc, 15)];
+}
+
+static INLINE const char *
+nv_modifier_string(ubyte mod)
+{ /* clamp to the table's real size (as nv_opcode_name does) rather than a hard-coded 9 */
+   return nv_modifier_strings[MIN2(mod, ARRAY_SIZE(nv_modifier_strings) - 1)];
+}
+
+static INLINE int
+nv_value_id(struct nv_value *value)
+{
+   /* prefer the id held by the join representative, else fall back to value->n */
+   const int joined_id = value->join->reg.id;
+   return (joined_id >= 0) ? joined_id : value->n;
+}
+
+static INLINE boolean
+nv_value_allocated(struct nv_value *value)
+{ /* a negative reg.id is treated as "no register assigned" */
+   return (value->reg.id < 0) ? FALSE : TRUE;
+}
+
+static INLINE void
+nv_print_address(const char c, int buf, struct nv_value *a, int offset)
+{ /* prints "c[buf][" + optional "$a<id>+" + hex offset + "]"; buf is omitted when negative, the $a part when a is NULL */
+   if (buf >= 0)
+      PRINT(" %s%c%i[", cyan, c, buf);
+   else
+      PRINT(" %s%c[", cyan, c);
+   if (a)
+      PRINT("%s$a%i%s+", mgta, nv_value_id(a), cyan);
+   PRINT("%s0x%x%s]", orng, offset, cyan);
+}
+
+static INLINE void
+nv_print_cond(struct nv_instruction *nvi)
+{ /* prints "<cond>$c<id> "; dereferences nvi->flags_src, so the caller must check it is set */
+   PRINT("%s%s%s$c%i ",
+         gree, nv_cond_name(nvi->cc),
+         mgta, nv_value_id(nvi->flags_src->value));
+}
+
+static INLINE void
+nv_print_value(struct nv_value *value, struct nv_value *ind, ubyte type)
+{
+   char reg_pfx = '$';
+   /* NV_TYPE_ANY selects the value's own register type */
+   if (type == NV_TYPE_ANY)
+      type = value->reg.type;
+   /* flags registers are printed without a type name */
+   if (value->reg.file != NV_FILE_FLAGS)
+      PRINT(" %s%s", gree, nv_type_name(type));
+   /* '%' prefixes a value without an assigned register id, '$' one with */
+   if (!nv_value_allocated(value))
+      reg_pfx = '%';
+
+   switch (value->reg.file) {
+   case NV_FILE_GPR:
+      PRINT(" %s%cr%i", blue, reg_pfx, nv_value_id(value));
+      break;
+   case NV_FILE_OUT:
+      PRINT(" %s%co%i", mgta, reg_pfx, nv_value_id(value));
+      break;
+   case NV_FILE_ADDR:
+      PRINT(" %s%ca%i", mgta, reg_pfx, nv_value_id(value));
+      break;
+   case NV_FILE_FLAGS:
+      PRINT(" %s%cc%i", mgta, reg_pfx, nv_value_id(value));
+      break;
+   case NV_FILE_MEM_S:
+      nv_print_address('s', -1, ind, 4 * nv_value_id(value));
+      break;
+   case NV_FILE_MEM_P:
+      nv_print_address('p', -1, ind, 4 * nv_value_id(value));
+      break;
+   case NV_FILE_MEM_V:
+      nv_print_address('v', -1, ind, 4 * nv_value_id(value));
+      break;
+   case NV_FILE_IMM:
+      switch (type) { /* types not listed here print nothing */
+      case NV_TYPE_U16:
+      case NV_TYPE_S16:
+         PRINT(" %s0x%04x", orng, value->reg.imm.u32);
+         break;
+      case NV_TYPE_F32:
+         PRINT(" %s%f", orng, value->reg.imm.f32);
+         break;
+      case NV_TYPE_F64:
+         PRINT(" %s%f", orng, value->reg.imm.f64);
+         break;
+      case NV_TYPE_U32:
+      case NV_TYPE_S32:
+      case NV_TYPE_P32:
+         PRINT(" %s0x%08x", orng, value->reg.imm.u32);
+         break;
+      }
+      break;
+   default: /* g[0..15] and c[0..15] memory files carry their buffer index in the file number */
+      if (value->reg.file >= NV_FILE_MEM_G(0) &&
+          value->reg.file <= NV_FILE_MEM_G(15))
+         nv_print_address('g', value->reg.file - NV_FILE_MEM_G(0), ind,
+                          nv_value_id(value) * 4);
+      else
+      if (value->reg.file >= NV_FILE_MEM_C(0) &&
+          value->reg.file <= NV_FILE_MEM_C(15))
+         nv_print_address('c', value->reg.file - NV_FILE_MEM_C(0), ind,
+                          nv_value_id(value) * 4);
+      else
+         NOUVEAU_ERR(" BAD_FILE[%i]", nv_value_id(value));
+      break;
+   }
+}
+
+static INLINE void
+nv_print_ref(struct nv_ref *ref, struct nv_value *ind)
+{ /* prints the referenced value using the reference's typecast as the type */
+   nv_print_value(ref->value, ind, ref->typecast);
+}
+
+void
+nv_print_instruction(struct nv_instruction *i)
+{
+   int j;
+   /* Prints one line: optional condition, opcode, flags def, def[0] or '#', then sources 0..3. */
+   if (i->flags_src)
+      nv_print_cond(i);
+
+   PRINT("%s", gree);
+   if (i->opcode == NV_OP_SET)
+      PRINT("set %s", nv_cond_name(i->set_cond));
+   else
+   if (i->saturate)
+      PRINT("sat %s", nv_opcode_name(i->opcode));
+   else
+      PRINT("%s", nv_opcode_name(i->opcode));
+
+   if (i->flags_def)
+      nv_print_value(i->flags_def, NULL, NV_TYPE_ANY);
+
+   /* Only STORE & STA can write to MEM, and they do not def
+    * anything, so the address is thus part of the source.
+    */
+   if (i->def[0])
+      nv_print_value(i->def[0], NULL, NV_TYPE_ANY);
+   else
+      PRINT(" #");
+   /* src[4] is passed as the indirect address for the operand nv50_indirect_opnd() selects */
+   for (j = 0; j < 4; ++j) {
+      if (!i->src[j])
+         continue;
+
+      if (i->src[j]->mod)
+         PRINT(" %s", nv_modifier_string(i->src[j]->mod));
+
+      nv_print_ref(i->src[j],
+                   (j == nv50_indirect_opnd(i)) ?
+                   i->src[4]->value : NULL);
+   }
+   if (!i->is_long)
+      PRINT(" %ss", norm);
+   PRINT("\n");
+}
diff --git a/src/gallium/drivers/nv50/nv50_pc_regalloc.c b/src/gallium/drivers/nv50/nv50_pc_regalloc.c
new file mode 100644
index 0000000000..eb446d641a
--- /dev/null
+++ b/src/gallium/drivers/nv50/nv50_pc_regalloc.c
@@ -0,0 +1,973 @@
+/*
+ * XXX: phi function live intervals start at first ordinary instruction,
+ *      add_range should be taking care of that already ...
+ *
+ * XXX: TEX must choose TEX's def as representative
+ *
+ * XXX: Aieee! Must materialize MOVs if source is in other basic block!
+ *       -- absolutely, or we cannot execute the MOV conditionally at all
+ * XXX: Aieee! Must include PHIs in LVA so we pull through liveness if
+ *      PHI source is e.g. in dominator block.
+ *       -- seems we lose liveness somehow, track that
+ */
+
+#include "nv50_context.h"
+#include "nv50_pc.h"
+
+#include "util/u_simple_list.h"
+
+#define NUM_REGISTER_FILES 4
+
+struct register_set {
+   struct nv_pc *pc; /* program whose max_reg[] is updated on assignment */
+   /* per file: last[] bounds reg_assign's search, bits[] has one bit per occupied unit */
+   uint32_t last[NUM_REGISTER_FILES];
+   uint32_t bits[NUM_REGISTER_FILES][8];
+};
+
+struct nv_pc_pass {
+   struct nv_pc *pc;
+   /* instructions in serial order, filled by pass_order_instructions */
+   struct nv_instruction **insns;
+   int num_insns;
+
+   uint pass_seq;
+};
+
+static void
+ranges_coalesce(struct nv_range *range)
+{
+   struct nv_range *succ; /* absorb successors that touch or overlap range */
+   for (succ = range->next; succ && succ->bgn <= range->end; succ = range->next) {
+      assert(range->bgn <= succ->bgn);
+      range->end = MAX2(range->end, succ->end);
+      range->next = succ->next;
+      FREE(succ);
+   }
+}
+
+static boolean
+add_range_ex(struct nv_value *val, int bgn, int end, struct nv_range *new_range)
+{
+   struct nv_range *range, **nextp = &val->livei;
+   /* Adds [bgn, end) to val's range list. TRUE: merged into an existing range (new_range unused). FALSE: a node was linked in. */
+   for (range = val->livei; range; range = range->next) {
+      if (end < range->bgn)
+         break; /* insert before */
+
+      if (bgn > range->end) {
+         nextp = &range->next;
+         continue; /* insert after */
+      }
+
+      /* overlap */
+      if (bgn < range->bgn) {
+         range->bgn = bgn;
+         if (end > range->end)
+            range->end = end;
+         ranges_coalesce(range);
+         return TRUE;
+      }
+      if (end > range->end) {
+         range->end = end;
+         ranges_coalesce(range);
+         return TRUE;
+      }
+      assert(bgn >= range->bgn);
+      assert(end <= range->end);
+      return TRUE;
+   }
+   /* no overlap: link a node in (allocated here when the caller passed none; result unchecked) */
+   if (!new_range)
+      new_range = CALLOC_STRUCT(nv_range);
+
+   new_range->bgn = bgn;
+   new_range->end = end;
+   new_range->next = range;
+   *(nextp) = new_range;
+   return FALSE;
+}
+
+static void
+add_range(struct nv_value *val, struct nv_basic_block *b, int end)
+{
+   int bgn;
+   /* Marks val live inside block b up to (not including) serial end. */
+   if (!val->insn) /* ignore non-def values */
+      return;
+   assert(b->entry->serial <= b->exit->serial);
+   assert(b->phi->serial <= end);
+   assert(b->exit->serial + 1 >= end);
+   /* start at the defining instruction if it lies in b, else at b's entry */
+   bgn = val->insn->serial;
+   if (bgn < b->entry->serial || bgn > b->exit->serial)
+      bgn = b->entry->serial;
+   // debug_printf("add_range(value %i): [%i, %i)\n", val->n, bgn, end);
+
+   if (bgn > end) {
+      debug_printf("Aieee! BLOCK [%i, %i], RANGE [%i, %i)\n",
+                   b->entry->serial, b->exit->serial, bgn, end);
+   }
+   assert(bgn <= end);
+   /* a range starting before the definition is only reported, not rejected */
+   if (bgn < val->insn->serial)
+      debug_printf("WARNING: leaking value %i ?\n", val->n);
+
+   add_range_ex(val, bgn, end, NULL);
+}
+
+#ifdef NV50_RA_DEBUG_JOIN
+static void
+livei_print(struct nv_value *a)
+{
+   struct nv_range *r = a->livei;
+   /* debug helper: prints every [bgn, end) range of a on one line */
+   debug_printf("livei %i: ", a->n);
+   while (r) {
+      debug_printf("[%i, %i) ", r->bgn, r->end);
+      r = r->next;
+   }
+   debug_printf("\n");
+}
+#endif
+
+static void
+livei_unify(struct nv_value *dst, struct nv_value *src)
+{
+   struct nv_range *range, *next;
+   /* Moves all of src's ranges into dst: nodes that merged are freed, the rest are relinked into dst. */
+   for (range = src->livei; range; range = next) {
+      next = range->next;
+      if (add_range_ex(dst, range->bgn, range->end, range))
+         FREE(range);
+   }
+   src->livei = NULL;
+}
+
+static void
+livei_release(struct nv_value *val)
+{
+   struct nv_range *range;
+   /* pop and free every range; val->livei ends up NULL instead of dangling */
+   while ((range = val->livei) != NULL) {
+      val->livei = range->next;
+      FREE(range);
+   }
+}
+
+static boolean
+livei_have_overlap(struct nv_value *a, struct nv_value *b)
+{
+   struct nv_range *ra, *rb;
+   /* [bgn, end) ranges intersect unless one ends at or before the other begins */
+   for (ra = a->livei; ra; ra = ra->next) {
+      for (rb = b->livei; rb; rb = rb->next) {
+         if (rb->end <= ra->bgn || rb->bgn >= ra->end)
+            continue;
+         return TRUE;
+      }
+   }
+   return FALSE;
+}
+
+static int
+livei_end(struct nv_value *a)
+{
+   struct nv_range *last;
+   /* end of the final range in the list; a must own at least one range */
+   assert(a->livei);
+   for (last = a->livei; last->next; last = last->next)
+      ;
+   return last->end;
+}
+
+static boolean
+livei_contains(struct nv_value *a, int pos)
+{
+   struct nv_range *cur = a->livei;
+   /* the walk stops at the first range that begins after pos */
+   for (; cur && cur->bgn <= pos; cur = cur->next)
+      if (pos < cur->end)
+         return TRUE;
+   return FALSE;
+}
+
+static boolean
+reg_assign(struct register_set *set, struct nv_value **def, int n)
+{
+   int i, id, s;
+   uint m;
+   int f = def[0]->reg.file;
+   /* s: bitmap units needed for n values of def[0]'s type; m: mask of s set bits */
+   s = n << (nv_type_order(def[0]->reg.type) - 1);
+   m = (1 << s) - 1;
+
+   id = set->last[f];
+   /* scan the 32-bit bitmap words for s free bits at a multiple of s */
+   for (i = 0; i * 32 < set->last[f]; ++i) {
+      if (set->bits[f][i] == 0xffffffff)
+         continue;
+
+      for (id = 0; id < 32; id += s)
+         if (!(set->bits[f][i] & (m << id)))
+            break;
+      if (id < 32)
+         break;
+   }
+   if (i * 32 + id > set->last[f])
+      return FALSE;
+   /* claim the units, then convert the unit index back into a register id */
+   set->bits[f][i] |= m << id;
+
+   id += i * 32;
+
+   set->pc->max_reg[f] = MAX2(set->pc->max_reg[f], id + s - 1);
+
+   id >>= nv_type_order(def[0]->reg.type) - 1;
+   /* only defs that have a live interval receive (and consume) an id */
+   for (i = 0; i < n; ++i)
+      if (def[i]->livei)
+         def[i]->reg.id = id++;
+
+   return TRUE;
+}
+
+static INLINE void
+reg_occupy(struct register_set *set, struct nv_value *val)
+{
+   int s, id = val->reg.id, f = val->reg.file;
+   uint m;
+   /* Marks val's units as used in set; values with a negative id are ignored. */
+   if (id < 0)
+      return;
+   s = nv_type_order(val->reg.type) - 1;
+   id <<= s;
+   m = (1 << (1 << s)) - 1;
+   /* id is now a unit index: 1 << s units starting at that bit */
+   set->bits[f][id / 32] |= m << (id % 32);
+
+   if (set->pc->max_reg[f] < id)
+      set->pc->max_reg[f] = id;
+}
+
+static INLINE void
+reg_release(struct register_set *set, struct nv_value *val)
+{
+   int s, id = val->reg.id, f = val->reg.file;
+   uint m;
+   /* Clears the bits reg_occupy() sets for val; values with a negative id are ignored. */
+   if (id < 0)
+      return;
+
+   s = nv_type_order(val->reg.type) - 1;
+   id <<= s;
+   m = (1 << (1 << s)) - 1;
+
+   set->bits[f][id / 32] &= ~(m << (id % 32));
+}
+
+static INLINE boolean
+join_allowed(struct nv_pc_pass *ctx, struct nv_value *a, struct nv_value *b)
+{
+   int i;
+   struct nv_value *val;
+   /* Values in different files or of different type sizes can never be joined. */
+   if (a->reg.file != b->reg.file ||
+       nv_type_sizeof(a->reg.type) != nv_type_sizeof(b->reg.type))
+      return FALSE;
+   /* representatives that already carry the same reg.id are always compatible */
+   if (a->join->reg.id == b->join->reg.id)
+      return TRUE;
+
+#if 1
+   /* either a or b or both have been assigned */
+
+   if (a->join->reg.id >= 0 && b->join->reg.id >= 0)
+      return FALSE;
+   else
+   if (b->join->reg.id >= 0) {
+      if (a->join->reg.id >= 0) /* cannot be true here: excluded by the test above */
+         return FALSE;
+      val = a;
+      a = b;
+      b = val;
+   }
+   /* refuse if any other group holding a's id has a live range overlapping b's */
+   for (i = 0; i < ctx->pc->num_values; ++i) {
+      val = &ctx->pc->values[i];
+
+      if (val->join->reg.id != a->join->reg.id)
+         continue;
+      if (val->join != a->join && livei_have_overlap(val->join, b->join))
+         return FALSE;
+   }
+   return TRUE;
+#endif
+   return FALSE;
+}
+
+static INLINE void
+do_join_values(struct nv_pc_pass *ctx, struct nv_value *a, struct nv_value *b)
+{
+   int j;
+   struct nv_value *bjoin = b->join;
+   /* Merges b's group into a's without any checks; an id already assigned to b's group is inherited. */
+   if (b->join->reg.id >= 0)
+      a->join->reg.id = b->join->reg.id;
+   /* moves b->join's ranges into a->join and leaves b->join->livei NULL */
+   livei_unify(a->join, b->join);
+
+#ifdef NV50_RA_DEBUG_JOIN
+   debug_printf("joining %i to %i\n", b->n, a->n);
+#endif
+   
+   /* make a->join the new representative */
+   for (j = 0; j < ctx->pc->num_values; ++j) 
+      if (ctx->pc->values[j].join == bjoin)
+         ctx->pc->values[j].join = a->join;
+
+   assert(b->join == a->join);
+}
+
+static INLINE void
+try_join_values(struct nv_pc_pass *ctx, struct nv_value *a, struct nv_value *b)
+{ /* joins b's group into a's unless join_allowed() refuses or the representatives' live ranges overlap */
+   if (!join_allowed(ctx, a, b)) {
+#ifdef NV50_RA_DEBUG_JOIN
+      debug_printf("cannot join %i to %i: not allowed\n", b->n, a->n);
+#endif
+      return;
+   }
+   if (livei_have_overlap(a->join, b->join)) {
+#ifdef NV50_RA_DEBUG_JOIN
+      debug_printf("cannot join %i to %i: livei overlap\n", b->n, a->n);
+      livei_print(a);
+      livei_print(b);
+#endif
+      return;
+   }
+
+   do_join_values(ctx, a, b);
+}
+
+/* For each operand of each phi in b, generate a new value by inserting a MOV
+ * at the end of the block it is coming from and replace the operand with it.
+ * This eliminates liveness conflicts.
+ */
+static int
+pass_generate_phi_movs(struct nv_pc_pass *ctx, struct nv_basic_block *b)
+{
+   struct nv_instruction *i, *i2;
+   struct nv_basic_block *p, *pn;
+   struct nv_value *val;
+   int n, j;
+   /* mark b as visited for this pass so the recursion below does not revisit it */
+   b->pass_seq = ctx->pc->pass_seq;
+
+   for (n = 0; n < b->num_in; ++n) {
+      p = b->in[n];
+      assert(p);
+      /* a predecessor with two successors gets a new block pn spliced in between it and b */
+      if (b->num_in > 1 && p->out[0] && p->out[1]) { /* if without else */
+         pn = new_basic_block(ctx->pc);
+
+         if (p->out[0] == b)
+            p->out[0] = pn;
+         else
+            p->out[1] = pn;
+
+         if (p->exit->target == b) /* target to new else-block */
+            p->exit->target = pn;
+
+         for (j = 0; j < b->num_in; ++j) {
+            if (b->in[j] == p) {
+               b->in[j] = pn;
+               break;
+            }
+         }
+
+         pn->out[0] = b;
+         pn->in[0] = p;
+         pn->num_in = 1;
+      } else
+         pn = p;
+      /* presumably new_instruction() below emits into current_block - confirm */
+      ctx->pc->current_block = pn;
+
+      /* every block with PHIs will also have other operations */
+      for (i = b->phi; i && i->opcode == NV_OP_PHI; i = i->next) {
+         for (j = 0; j < 4; ++j) {
+            if (!i->src[j])
+               j = 3; /* forces j == 4 after the increment: no source comes from p */
+            else
+            if (i->src[j]->value->insn->bb == p)
+               break;
+         }
+         if (j >= 4)
+            continue;
+         assert(i->src[j]);
+         val = i->src[j]->value;
+
+         /* XXX: should probably not insert this after terminator */
+         i2 = new_instruction(ctx->pc, NV_OP_MOV);
+         /* the MOV defines a fresh value of the same file and type, which replaces the phi operand */
+         i2->def[0] = new_value(ctx->pc, val->reg.file, val->reg.type);
+         i2->src[0] = new_ref  (ctx->pc, val);
+         i2->def[0]->insn = i2;
+
+         nv_reference(ctx->pc, &i->src[j], i2->def[0]);
+      }
+      if (pn != p && pn->exit) {
+         /* XXX: this branch should probably be eliminated */
+         ctx->pc->current_block = b->in[n ? 0 : 1];
+         i2 = new_instruction(ctx->pc, NV_OP_BRA);
+         i2->target = b;
+         i2->is_terminator = 1;
+      }
+   }
+   /* recurse into successors not yet visited in this pass */
+   if (b->out[0] && b->out[0]->pass_seq < ctx->pc->pass_seq) {
+      pass_generate_phi_movs(ctx, b->out[0]);
+   }
+
+   if (b->out[1] && b->out[1]->pass_seq < ctx->pc->pass_seq) {
+      pass_generate_phi_movs(ctx, b->out[1]);
+   }
+
+   return 0;
+}
+
+static int
+pass_join_values(struct nv_pc_pass *ctx, int iter)
+{
+   int c, n;
+   /* iter == 0 joins only texture-op defs with their sources; iter != 0 handles PHI, MOV and SELECT. */
+   for (n = 0; n < ctx->num_insns; ++n) {
+      struct nv_instruction *i = ctx->insns[n];
+
+      switch (i->opcode) {
+      case NV_OP_PHI:
+         if (!iter)
+            continue;
+         try_join_values(ctx, i->src[0]->value, i->src[1]->value);
+         try_join_values(ctx, i->def[0], i->src[0]->value);
+         break;
+      case NV_OP_MOV: /* skipped when the source has no defining insn or its representative is defined by a vector op */
+         if (iter && i->src[0]->value->insn &&
+             !nv_is_vector_op(i->src[0]->value->join->insn->opcode))
+            try_join_values(ctx, i->def[0], i->src[0]->value);
+         break;
+      case NV_OP_SELECT: /* joined unconditionally; admissibility is only asserted */
+         if (!iter)
+            break;
+         assert(join_allowed(ctx, i->def[0], i->src[0]->value));
+         assert(join_allowed(ctx, i->def[0], i->src[1]->value));
+         do_join_values(ctx, i->def[0], i->src[0]->value);
+         do_join_values(ctx, i->def[0], i->src[1]->value);
+         break;
+      case NV_OP_TEX:
+      case NV_OP_TXB:
+      case NV_OP_TXL:
+      case NV_OP_TXQ:
+         if (iter)
+            break;
+         for (c = 0; c < 4; ++c) { /* def[c] is joined with src[c] for each source present */
+            if (!i->src[c])
+               break;
+            do_join_values(ctx, i->def[c], i->src[c]->value);
+         }
+         break;
+      default:
+         break;
+      }
+   }
+   return 0;
+}
+
+static int
+pass_order_instructions(struct nv_pc_pass *ctx, struct nv_basic_block *b)
+{
+   struct nv_instruction *i;
+   /* Appends b's instructions (from b->phi on) to ctx->insns and numbers them via ->serial. */
+   b->priv = 0;
+
+   assert(!b->exit || !b->exit->next);
+   for (i = b->phi; i; i = i->next) {
+      i->serial = ctx->num_insns;
+      ctx->insns[ctx->num_insns++] = i;
+   }
+
+   b->pass_seq = ctx->pc->pass_seq;
+
+   if (!b->out[0])
+      return 0;
+   if (!b->out[1] && ++(b->out[0]->priv) != b->out[0]->num_in)
+      return 0; /* single successor: descend only on the visit that completes its num_in count */
+
+   if (b->out[0] != b)
+      pass_order_instructions(ctx, b->out[0]);
+   if (b->out[1] && b->out[1] != b)
+      pass_order_instructions(ctx, b->out[1]);
+
+   return 0;
+}
+
+static void
+bb_live_set_print(struct nv_pc *pc, struct nv_basic_block *b)
+{
+#ifdef NV50_RA_DEBUG_LIVE_SETS
+   int j;
+   struct nv_value *val;
+   /* lists the numbers of all def'd values whose bit is set in b->live_set */
+   debug_printf("live_set of %p: ", b);
+
+   for (j = 0; j < pc->num_values; ++j) {
+      if (!(b->live_set[j / 32] & (1 << (j % 32))))
+         continue;
+      val = &pc->values[j];
+      if (!val->insn)
+         continue;
+      debug_printf("%i ", val->n);
+   }
+   debug_printf("\n");
+#endif
+}
+
+static INLINE void
+live_set_add(struct nv_basic_block *b, struct nv_value *val)
+{
+   if (!val->insn) /* don't add non-def values */
+      return;
+   /* debug_printf("live[%p] <- %i\n", b, val->n); */
+   /* unsigned 1: shifting a signed 1 into bit 31 is undefined behaviour */
+   b->live_set[val->n / 32] |= 1u << (val->n % 32);
+}
+
+static INLINE void
+live_set_rem(struct nv_basic_block *b, struct nv_value *val)
+{
+   /* Clears val's bit in b's live set. Unsigned 1: shifting a signed 1
+    * into bit 31 is undefined behaviour. */
+   b->live_set[val->n / 32] &= ~(1u << (val->n % 32));
+}
+
+static INLINE boolean
+live_set_test(struct nv_basic_block *b, struct nv_ref *ref)
+{ /* TRUE iff the bit of ref's value is set; "!= 0" keeps a narrow boolean type from truncating high bits */
+   const int n = ref->value->n;
+   return (b->live_set[n / 32] & (1u << (n % 32))) != 0;
+}
+
+/* check if bf (future) can be reached from bp (past) */
+static boolean
+bb_reachable_by(struct nv_basic_block *bf, struct nv_basic_block *bp,
+		struct nv_basic_block *bt)
+{
+   if (bf == bp)
+      return TRUE;
+   if (bp == bt) /* bt terminates the search: paths are not followed through it */
+      return FALSE;
+   /* depth-first over both successors; only direct self-loops are skipped */
+   if (bp->out[0] && bp->out[0] != bp &&
+       bb_reachable_by(bf, bp->out[0], bt))
+      return TRUE;
+   if (bp->out[1] && bp->out[1] != bp &&
+       bb_reachable_by(bf, bp->out[1], bt))
+      return TRUE;
+   return FALSE;
+}
+
+/* The live set of a block contains those values that are live immediately
+ * before the beginning of the block.
+ */
+static int
+pass_build_live_sets(struct nv_pc_pass *ctx, struct nv_basic_block *b)
+{
+   struct nv_instruction *i;
+   int j, n, ret = 0;
+
+   /* slight hack for undecidedness: set phi = entry if it's undefined */
+   if (!b->phi)
+      b->phi = b->entry;
+   /* successors are processed first; b's set starts as the union of their sets */
+   for (n = 0; n < 2; ++n) {
+      if (!b->out[n] || b->out[n] == b)
+         continue;
+      ret = pass_build_live_sets(ctx, b->out[n]);
+      if (ret)
+         return ret;
+
+      if (n == 0) {
+         for (j = 0; j < (ctx->pc->num_values + 31) / 32; ++j)
+            b->live_set[j] = b->out[n]->live_set[j];
+      } else {
+         for (j = 0; j < (ctx->pc->num_values + 31) / 32; ++j)
+            b->live_set[j] |= b->out[n]->live_set[j];
+      }
+
+      /* Kick values out of our live set that are created in incoming
+       * blocks of our successors that are not us.
+       */
+      for (i = b->out[n]->phi; i && i->opcode == NV_OP_PHI; i = i->next) {
+         for (j = 0; j < 4; ++j) {
+            if (!i->src[j])
+               break;
+            assert(i->src[j]->value->insn);
+
+            if (bb_reachable_by(b, i->src[j]->value->insn->bb, b->out[n])) {
+               live_set_add(b, i->src[j]->value);
+               debug_printf("%p: live set + %i\n", b, i->src[j]->value->n);
+            } else {
+               live_set_rem(b, i->src[j]->value);
+               debug_printf("%p: live set - %i\n", b, i->src[j]->value->n);
+            }
+         }
+      }
+   }
+   /* the backward walk over b's own instructions runs once per pass_seq */
+   if (b->pass_seq >= ctx->pc->pass_seq)
+      return 0;
+   b->pass_seq = ctx->pc->pass_seq;
+
+   debug_printf("%s: visiting block %p\n", __FUNCTION__, b);
+
+   if (!b->entry)
+      return 0;
+   bb_live_set_print(ctx->pc, b);
+   /* backwards from the exit: definitions leave the set, uses enter it */
+   for (i = b->exit; i; i = i->prev) {
+      for (j = 0; j < 4; j++) {
+         if (!i->def[j])
+            break;
+         live_set_rem(b, i->def[j]);
+      }
+      for (j = 0; j < 4; j++) {
+         if (!i->src[j])
+            break;
+         live_set_add(b, i->src[j]->value);
+      }
+      if (i->src[4])
+         live_set_add(b, i->src[4]->value);
+      if (i->flags_def)
+         live_set_rem(b, i->flags_def);
+      if (i->flags_src)
+         live_set_add(b, i->flags_src->value);
+   }
+   bb_live_set_print(ctx->pc, b);
+
+   return 0;
+}
+
+static void collect_live_values(struct nv_basic_block *b, const int n)
+{
+   int i;
+   /* b's live set := union of its successors' live sets (all zero without successors); n is the word count */
+   if (b->out[0]) {
+      if (b->out[1]) { /* what to do about back-edges ? */
+         for (i = 0; i < n; ++i)
+            b->live_set[i] = b->out[0]->live_set[i] | b->out[1]->live_set[i];
+      } else {
+         memcpy(b->live_set, b->out[0]->live_set, n * sizeof(uint32_t));
+      }
+   } else
+   if (b->out[1]) {
+      memcpy(b->live_set, b->out[1]->live_set, n * sizeof(uint32_t));
+   } else {
+      memset(b->live_set, 0, n * sizeof(uint32_t));
+   }
+}
+
+/* NOTE: the live intervals of phi functions start at the first non-phi instruction */
+static int
+pass_build_intervals(struct nv_pc_pass *ctx, struct nv_basic_block *b)
+{
+   struct nv_instruction *i, *i_stop;
+   int j, s;
+   const int n = (ctx->pc->num_values + 31) / 32;
+
+   debug_printf("building intervals for BB %i\n", b->id);
+
+   /* verify that first block does not have live-in values */
+   if (b->num_in == 0)
+      for (j = 0; j < n; ++j)
+         assert(b->live_set[j] == 0);
+   /* start from the live-out set: the union of the successors' live sets */
+   collect_live_values(b, n);
+
+   /* remove live-outs def'd in a parallel block, hopefully they're all phi'd */
+   for (j = 0; j < 2; ++j) {
+      if (!b->out[j] || !b->out[j]->phi)
+         continue;
+      for (i = b->out[j]->phi; i && i->opcode == NV_OP_PHI; i = i->next) {
+         live_set_rem(b, i->def[0]);
+         /* "i &&" above: stop cleanly if the list ends on a PHI, as the other PHI walks do */
+         for (s = 0; s < 4; ++s) {
+            if (!i->src[s])
+               break;
+            assert(i->src[s]->value->insn);
+            if (bb_reachable_by(b, i->src[s]->value->insn->bb, b->out[j]))
+               live_set_add(b, i->src[s]->value);
+            else
+               live_set_rem(b, i->src[s]->value);
+         }
+      }
+   }
+
+   /* remaining live-outs are live until the end */
+   for (j = 0; j < ctx->pc->num_values; ++j) {
+      if (!(b->live_set[j / 32] & (1 << (j % 32))))
+         continue;
+#ifdef NV50_RA_DEBUG_LIVEI
+      debug_printf("adding range for live value %i\n", j);
+#endif
+      add_range(&ctx->pc->values[j], b, b->exit->serial + 1);
+   }
+   debug_printf("%s: looping through instructions now\n", __func__);
+
+   i_stop = b->entry ? b->entry->prev : NULL;
+
+   /* don't have to include phi functions here (will have 0 live range) */
+   for (i = b->exit; i != i_stop; i = i->prev) {
+      assert(i->serial >= b->phi->serial && i->serial <= b->exit->serial);
+      for (j = 0; j < 4; ++j) {
+         if (i->def[j])
+            live_set_rem(b, i->def[j]);
+      }
+      if (i->flags_def)
+         live_set_rem(b, i->flags_def);
+      /* a source not yet live is used here last (walking backwards): its range ends at i */
+      for (j = 0; j < 5; ++j) {
+         if (i->src[j] && !live_set_test(b, i->src[j])) {
+            live_set_add(b, i->src[j]->value);
+#ifdef NV50_RA_DEBUG_LIVEI
+            debug_printf("adding range for source that ends living: %i\n",
+                         i->src[j]->value->n);
+#endif
+            add_range(i->src[j]->value, b, i->serial);
+         }
+      }
+      if (i->flags_src && !live_set_test(b, i->flags_src)) {
+         live_set_add(b, i->flags_src->value);
+#ifdef NV50_RA_DEBUG_LIVEI
+         debug_printf("adding range for source that ends living: %i\n",
+                      i->flags_src->value->n);
+#endif
+         add_range(i->flags_src->value, b, i->serial);
+      }
+   }
+
+   b->pass_seq = ctx->pc->pass_seq;
+   /* recurse into successors not yet visited in this pass */
+   if (b->out[0] && b->out[0]->pass_seq < ctx->pc->pass_seq)
+      pass_build_intervals(ctx, b->out[0]);
+
+   if (b->out[1] && b->out[1]->pass_seq < ctx->pc->pass_seq)
+      pass_build_intervals(ctx, b->out[1]);
+
+   debug_printf("built intervals for block %p\n", b);
+
+   return 0;
+}
+
+static INLINE void
+nv50_ctor_register_set(struct nv_pc *pc, struct register_set *set)
+{
+   /* Reset *set, bind it to pc, then fill in the 'last' entry per file. */
+   memset(set, 0, sizeof *set);
+   set->pc = pc;
+
+   set->last[NV_FILE_ADDR] = 4;
+   set->last[NV_FILE_FLAGS] = 4;
+   set->last[NV_FILE_OUT] = 127;
+   set->last[NV_FILE_GPR] = 255;
+}
+
+static void
+insert_ordered_tail(struct nv_value *list, struct nv_value *nval)
+{
+   struct nv_value *pos = list->prev;
+
+   /* Walk backwards from the tail, skipping every element whose live
+    * interval begins after nval's; stop when the list head is reached.
+    */
+   while (pos != list && pos->livei->bgn > nval->livei->bgn)
+      pos = pos->prev;
+
+   /* pos begins before or at the same time as nval: link nval after it */
+   nval->prev = pos;
+   nval->next = pos->next;
+   pos->next->prev = nval;
+   pos->next = nval;
+}
+
+static int
+pass_linear_scan(struct nv_pc_pass *ctx, int iter)
+{
+   struct nv_instruction *i;
+   struct register_set f, free;
+   int k, n;
+   struct nv_value *cur, *val, *tmp[2];
+   struct nv_value active, inactive, handled, unhandled;
+   /* Linear scan over values def'd in ctx->insns; returns 0 or abort()s. */
+   make_empty_list(&active);
+   make_empty_list(&inactive);
+   make_empty_list(&handled);
+   make_empty_list(&unhandled);
+
+   nv50_ctor_register_set(ctx->pc, &free);
+
+   /* Queue values ordered by interval start. Joined values should have
+    * range = NULL and thus not be added; also, fixed memory values won't
+    * be added because they're not def'd, just used
+    */
+   for (n = 0; n < ctx->num_insns; ++n) {
+      i = ctx->insns[n];
+
+      for (k = 0; k < 4; ++k) {
+         if (i->def[k] && i->def[k]->livei)
+            insert_ordered_tail(&unhandled, i->def[k]);
+         else
+         if (0 && i->def[k])
+            debug_printf("skipping def'd value %i: no livei\n", i->def[k]->n);
+      }
+      if (i->flags_def && i->flags_def->livei)
+         insert_ordered_tail(&unhandled, i->flags_def);
+   }
+   /* sanity: each value is its own join and the list is sorted by start */
+   for (val = unhandled.next; val != unhandled.prev; val = val->next) {
+      assert(val->join == val);
+      assert(val->livei->bgn <= val->next->livei->bgn);
+   }
+   /* take the queued values one by one, earliest interval start first */
+   foreach_s(cur, tmp[0], &unhandled) {
+      remove_from_list(cur);
+
+      /* active: values that ended before cur starts become handled, those
+       * not covering cur's start become inactive; both release their reg */
+      foreach_s(val, tmp[1], &active) {
+         if (livei_end(val) <= cur->livei->bgn) {
+            reg_release(&free, val);
+            move_to_head(&handled, val);
+         } else
+         if (!livei_contains(val, cur->livei->bgn)) {
+            reg_release(&free, val);
+            move_to_head(&inactive, val);
+         }
+      }
+      /* inactive: ended values become handled, values live again active */
+      foreach_s(val, tmp[1], &inactive) {
+         if (livei_end(val) <= cur->livei->bgn)
+            move_to_head(&handled, val);
+         else
+         if (livei_contains(val, cur->livei->bgn)) {
+            reg_occupy(&free, val);
+            move_to_head(&active, val);
+         }
+      }
+      /* f: copy of free that also blocks values whose range overlaps cur */
+      f = free;
+
+      foreach(val, &inactive)
+         if (livei_have_overlap(val, cur))
+            reg_occupy(&f, val);
+      /* same for unhandled values that already have a register id */
+      foreach(val, &unhandled)
+         if (val->reg.id >= 0 && livei_have_overlap(val, cur))
+            reg_occupy(&f, val);
+      /* only values without a register id (reg.id < 0) get one assigned */
+      if (cur->reg.id < 0) {
+         boolean mem = FALSE;
+         /* vector ops pass all 4 defs at once; others only if iter != 0 */
+         if (nv_is_vector_op(cur->insn->opcode))
+            mem = !reg_assign(&f, &cur->insn->def[0], 4);
+         else
+         if (iter)
+            mem = !reg_assign(&f, &cur, 1);
+
+         if (mem) {
+            NOUVEAU_ERR("out of registers\n");
+            abort();
+         }
+      }
+      insert_at_head(&active, cur);
+      reg_occupy(&free, cur); /* NOTE(review): id may still be < 0 here */
+   }
+
+   return 0;
+}
+
+static int
+pass_eliminate_moves(struct nv_pc_pass *ctx)
+{
+   return 0; /* stub: ctx is unused and no moves are eliminated yet */
+}
+
+int
+nv_pc_exec_pass1(struct nv_pc *pc)
+{
+   struct nv_pc_pass *ctx;
+   int i, ret = -1;
+
+   /* Runs the register allocation passes on pc; returns 0 on success. */
+   debug_printf("REGISTER ALLOCATION - entering\n");
+   ctx = CALLOC_STRUCT(nv_pc_pass);
+   if (!ctx)
+      return -1;
+   ctx->pc = pc;
+   nv_print_program(ctx->pc->root);
+
+   ctx->insns = CALLOC(pc->num_instructions, sizeof(struct nv_instruction *));
+   if (!ctx->insns && pc->num_instructions)
+      goto out; /* allocation failed, ret is still -1 */
+
+   pc->pass_seq++;
+   ret = pass_generate_phi_movs(ctx, pc->root);
+   assert(!ret);
+
+   nv_print_program(ctx->pc->root);
+
+   for (i = 0; i < pc->loop_nesting_bound; ++i) {
+      pc->pass_seq++;
+      ret = pass_build_live_sets(ctx, pc->root);
+      assert(!ret && "live sets");
+      if (ret) {
+         NOUVEAU_ERR("failed to build live sets (iteration %d)\n", i);
+         goto out;
+      }
+   }
+
+   pc->pass_seq++;
+   ret = pass_order_instructions(ctx, pc->root);
+   assert(!ret && "order instructions");
+   if (ret)
+      goto out;
+
+   pc->pass_seq++;
+   ret = pass_build_intervals(ctx, pc->root);
+   assert(!ret && "build intervals");
+   if (ret) {
+      NOUVEAU_ERR("failed to build live intervals\n");
+      goto out;
+   }
+   /* two rounds, each joining values and then running the linear scan */
+   for (i = 0; i < 2; ++i) {
+      ret = pass_join_values(ctx, i);
+      if (ret)
+         goto out;
+      ret = pass_linear_scan(ctx, i);
+      if (ret)
+         goto out;
+   }
+   ret = pass_eliminate_moves(ctx);
+
+   for (i = 0; i < pc->num_values; ++i)
+      livei_release(&pc->values[i]);
+
+   debug_printf("REGISTER ALLOCATION - leaving\n");
+   nv_print_program(ctx->pc->root);
+
+out:
+   FREE(ctx->insns); /* ctx owns the array; free it before ctx itself */
+   FREE(ctx);
+   return ret;
+}
diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c
index 8cb1639013..26d1be8db8 100644
--- a/src/gallium/drivers/nv50/nv50_program.c
+++ b/src/gallium/drivers/nv50/nv50_program.c
@@ -1,5 +1,5 @@
 /*
- * Copyright 2008 Ben Skeggs
+ * Copyright 2010 Christoph Bumiller
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -20,4674 +20,553 @@
  * SOFTWARE.
  */
 
-#include "pipe/p_context.h"
-#include "pipe/p_defines.h"
-#include "pipe/p_state.h"
-#include "util/u_inlines.h"
+#include "nv50_program.h"
+#include "nv50_pc.h"
+#include "nv50_context.h"
 
 #include "pipe/p_shader_tokens.h"
 #include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_util.h"
 
-#include "nv50_context.h"
-#include "nv50_transfer.h"
-
-#define NV50_SU_MAX_TEMP 127
-#define NV50_SU_MAX_ADDR 4
-//#define NV50_PROGRAM_DUMP
-
-/* $a5 and $a6 always seem to be 0, and using $a7 gives you noise */
-
-/* ARL - gallium craps itself on progs/vp/arl.txt
- *
- * MSB - Like MAD, but MUL+SUB
- * 	- Fuck it off, introduce a way to negate args for ops that
- * 	  support it.
- *
- * Look into inlining IMMD for ops other than MOV (make it general?)
- * 	- Maybe even relax restrictions a bit, can't do P_RESULT + P_IMMD,
- * 	  but can emit to P_TEMP first - then MOV later. NVIDIA does this
- *
- * In ops such as ADD it's possible to construct a bad opcode in the !is_long()
- * case, if the emit_src() causes the inst to suddenly become long.
- *
- * Verify half-insns work where expected - and force disable them where they
- * don't work - MUL has it forcibly disabled atm as it fixes POW..
- *
- * FUCK! watch dst==src vectors, can overwrite components that are needed.
- * 	ie. SUB R0, R0.yzxw, R0
- *
- * Things to check with renouveau:
- * 	FP attr/result assignment - how?
- * 		attrib
- * 			- 0x16bc maps vp output onto fp hpos
- * 			- 0x16c0 maps vp output onto fp col0
- * 		result
- * 			- colr always 0-3
- * 			- depr always 4
- * 0x16bc->0x16e8 --> some binding between vp/fp regs
- * 0x16b8 --> VP output count
- *
- * 0x1298 --> "MOV rcol.x, fcol.y" "MOV depr, fcol.y" = 0x00000005
- * 	      "MOV rcol.x, fcol.y" = 0x00000004
- * 0x19a8 --> as above but 0x00000100 and 0x00000000
- * 	- 0x00100000 used when KIL used
- * 0x196c --> as above but 0x00000011 and 0x00000000
- *
- * 0x1988 --> 0xXXNNNNNN
- * 	- XX == FP high something
- */
-struct nv50_reg {
-	enum {
-		P_TEMP,
-		P_ATTR,
-		P_RESULT,
-		P_CONST,
-		P_IMMD,
-		P_ADDR
-	} type;
-	int index;
-
-	int hw;
-	int mod;
-
-	int rhw; /* result hw for FP outputs, or interpolant index */
-	int acc; /* instruction where this reg is last read (first insn == 1) */
-
-	int vtx; /* vertex index, for GP inputs (TGSI Dimension.Index) */
-	int indirect[2]; /* index into pc->addr, or -1 */
-
-	ubyte buf_index; /* c{0 .. 15}[] or g{0 .. 15}[] */
-};
-
-#define NV50_MOD_NEG 1
-#define NV50_MOD_ABS 2
-#define NV50_MOD_NEG_ABS (NV50_MOD_NEG | NV50_MOD_ABS)
-#define NV50_MOD_SAT 4
-#define NV50_MOD_I32 8
-
-/* NV50_MOD_I32 is used to indicate integer mode for neg/abs */
-
-/* STACK: Conditionals and loops have to use the (per warp) stack.
- * Stack entries consist of an entry type (divergent path, join at),
- * a mask indicating the active threads of the warp, and an address.
- * MPs can store 12 stack entries internally, if we need more (and
- * we probably do), we have to create a stack buffer in VRAM.
- */
-/* impose low limits for now */
-#define NV50_MAX_COND_NESTING 4
-#define NV50_MAX_LOOP_NESTING 3
-
-#define JOIN_ON(e) e; pc->p->exec_tail->inst[1] |= 2
-
-struct nv50_pc {
-	struct nv50_program *p;
-
-	/* hw resources */
-	struct nv50_reg *r_temp[NV50_SU_MAX_TEMP];
-	struct nv50_reg r_addr[NV50_SU_MAX_ADDR];
-
-	/* tgsi resources */
-	struct nv50_reg *temp;
-	int temp_nr;
-	struct nv50_reg *attr;
-	int attr_nr;
-	struct nv50_reg *result;
-	int result_nr;
-	struct nv50_reg *param;
-	int param_nr;
-	struct nv50_reg *immd;
-	uint32_t *immd_buf;
-	int immd_nr;
-	struct nv50_reg **addr;
-	int addr_nr;
-	struct nv50_reg *sysval;
-	int sysval_nr;
-
-	struct nv50_reg *temp_temp[16];
-	struct nv50_program_exec *temp_temp_exec[16];
-	unsigned temp_temp_nr;
-
-	/* broadcast and destination replacement regs */
-	struct nv50_reg *r_brdc;
-	struct nv50_reg *r_dst[4];
-
-	struct nv50_reg reg_instances[16];
-	unsigned reg_instance_nr;
-
-	unsigned interp_mode[32];
-	/* perspective interpolation registers */
-	struct nv50_reg *iv_p;
-	struct nv50_reg *iv_c;
-
-	struct nv50_program_exec *if_insn[NV50_MAX_COND_NESTING];
-	struct nv50_program_exec *if_join[NV50_MAX_COND_NESTING];
-	struct nv50_program_exec *loop_brka[NV50_MAX_LOOP_NESTING];
-	int if_lvl, loop_lvl;
-	unsigned loop_pos[NV50_MAX_LOOP_NESTING];
-
-	unsigned *insn_pos; /* actual program offset of each TGSI insn */
-	boolean in_subroutine;
-
-	/* current instruction and total number of insns */
-	unsigned insn_cur;
-	unsigned insn_nr;
-
-	boolean allow32;
-
-	uint8_t edgeflag_out;
-};
-
-static struct nv50_reg *get_address_reg(struct nv50_pc *, struct nv50_reg *);
-
-static INLINE void
-ctor_reg(struct nv50_reg *reg, unsigned type, int index, int hw)
-{
-	reg->type = type;
-	reg->index = index;
-	reg->hw = hw;
-	reg->mod = 0;
-	reg->rhw = -1;
-	reg->vtx = -1;
-	reg->acc = 0;
-	reg->indirect[0] = reg->indirect[1] = -1;
-	reg->buf_index = (type == P_CONST) ? 1 : 0;
-}
-
 static INLINE unsigned
-popcnt4(uint32_t val)
-{
-	static const unsigned cnt[16]
-	= { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 };
-	return cnt[val & 0xf];
-}
-
-static void
-terminate_mbb(struct nv50_pc *pc)
-{
-	int i;
-
-	/* remove records of temporary address register values */
-	for (i = 0; i < NV50_SU_MAX_ADDR; ++i)
-		if (pc->r_addr[i].index < 0)
-			pc->r_addr[i].acc = 0;
-}
-
-static void
-alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg)
-{
-	int i = 0;
-
-	if (reg->type == P_RESULT) {
-		if (pc->p->cfg.high_result < (reg->hw + 1))
-			pc->p->cfg.high_result = reg->hw + 1;
-	}
-
-	if (reg->type != P_TEMP)
-		return;
-
-	if (reg->hw >= 0) {
-		/*XXX: do this here too to catch FP temp-as-attr usage..
-		 *     not clean, but works */
-		if (pc->p->cfg.high_temp < (reg->hw + 1))
-			pc->p->cfg.high_temp = reg->hw + 1;
-		return;
-	}
-
-	if (reg->rhw != -1) {
-		/* try to allocate temporary with index rhw first */
-		if (!(pc->r_temp[reg->rhw])) {
-			pc->r_temp[reg->rhw] = reg;
-			reg->hw = reg->rhw;
-			if (pc->p->cfg.high_temp < (reg->rhw + 1))
-				pc->p->cfg.high_temp = reg->rhw + 1;
-			return;
-		}
-		/* make sure we don't get things like $r0 needs to go
-		 * in $r1 and $r1 in $r0
-		 */
-		i = pc->result_nr * 4;
-	}
-
-	for (; i < NV50_SU_MAX_TEMP; i++) {
-		if (!(pc->r_temp[i])) {
-			pc->r_temp[i] = reg;
-			reg->hw = i;
-			if (pc->p->cfg.high_temp < (i + 1))
-				pc->p->cfg.high_temp = i + 1;
-			return;
-		}
-	}
-
-	NOUVEAU_ERR("out of registers\n");
-	abort();
-}
-
-static INLINE struct nv50_reg *
-reg_instance(struct nv50_pc *pc, struct nv50_reg *reg)
+bitcount4(const uint32_t val)
 {
-	struct nv50_reg *ri;
-
-	assert(pc->reg_instance_nr < 16);
-	ri = &pc->reg_instances[pc->reg_instance_nr++];
-	if (reg) {
-		alloc_reg(pc, reg);
-		*ri = *reg;
-		reg->indirect[0] = reg->indirect[1] = -1;
-		reg->mod = 0;
-	}
-	return ri;
+   static const unsigned cnt[16]
+   = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 };
+   return cnt[val & 0xf];
 }
 
-/* XXX: For shaders that aren't executed linearly (e.g. shaders that
- * contain loops), we need to assign all hw regs to TGSI TEMPs early,
- * lest we risk temp_temps overwriting regs alloc'd "later".
- */
-static struct nv50_reg *
-alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst)
-{
-	struct nv50_reg *r;
-	int i;
+static unsigned
+nv50_tgsi_src_mask(const struct tgsi_full_instruction *inst, int c)
+{
+   unsigned mask = inst->Dst[0].Register.WriteMask;
+   /* Component mask (bit 0 = x .. bit 3 = w) for source operand c of inst. */
+   switch (inst->Instruction.Opcode) {
+   case TGSI_OPCODE_COS:
+   case TGSI_OPCODE_SIN: /* x if any of xyz is written, w if w is */
+      return (mask & 0x8) | ((mask & 0x7) ? 0x1 : 0x0);
+   case TGSI_OPCODE_DP3:
+      return 0x7; /* x, y, z */
+   case TGSI_OPCODE_DP4:
+   case TGSI_OPCODE_DPH:
+   case TGSI_OPCODE_KIL: /* WriteMask ignored */
+      return 0xf;
+   case TGSI_OPCODE_DST:
+      return mask & (c ? 0xa : 0x6); /* src 0: y,z  other: y,w */
+   case TGSI_OPCODE_EX2:
+   case TGSI_OPCODE_EXP:
+   case TGSI_OPCODE_LG2:
+   case TGSI_OPCODE_LOG:
+   case TGSI_OPCODE_POW:
+   case TGSI_OPCODE_RCP:
+   case TGSI_OPCODE_RSQ:
+   case TGSI_OPCODE_SCS:
+      return 0x1; /* x only */
+   case TGSI_OPCODE_IF:
+      return 0x1; /* x only */
+   case TGSI_OPCODE_LIT:
+      return 0xb; /* x, y and w */
+   case TGSI_OPCODE_TEX:
+   case TGSI_OPCODE_TXB:
+   case TGSI_OPCODE_TXL:
+   case TGSI_OPCODE_TXP:
+   {
+      const struct tgsi_instruction_texture *tex;
+
+      assert(inst->Instruction.Texture);
+      tex = &inst->Texture;
+      /* NOTE(review): TXD has no case label here, the != TXD test is moot */
+      mask = 0x7; /* x, y, z; narrowed per texture target below */
+      if (inst->Instruction.Opcode != TGSI_OPCODE_TEX &&
+          inst->Instruction.Opcode != TGSI_OPCODE_TXD)
+         mask |= 0x8; /* bias, lod or proj */
+
+      switch (tex->Texture) {
+      case TGSI_TEXTURE_1D:
+         mask &= 0x9; /* x, plus w if set above */
+         break;
+      case TGSI_TEXTURE_SHADOW1D:
+         mask &= 0x5; /* x and z */
+         break;
+      case TGSI_TEXTURE_2D:
+         mask &= 0xb; /* x and y, plus w if set above */
+         break;
+      default:
+         break;
+      }
+   }
+  	   return mask;
+   case TGSI_OPCODE_XPD:
+   {
+      unsigned x = 0;
+      if (mask & 1) x |= 0x6; /* dst.x written: y and z */
+      if (mask & 2) x |= 0x5; /* dst.y written: x and z */
+      if (mask & 4) x |= 0x3; /* dst.z written: x and y */
+      return x;
+   }
+   default:
+      break;
+   }
+   /* every other opcode: the destination write mask is returned as is */
+   return mask;
+}
+
+static void
+nv50_indirect_inputs(struct nv50_translation_info *ti, int id)
+{
+   /* Flag indirect input use and store id for every input component. */
+   int slot, comp;
+
+   ti->indirect_inputs = TRUE;
+   for (slot = 0; slot < PIPE_MAX_SHADER_INPUTS; ++slot)
+      for (comp = 0; comp < 4; ++comp)
+         ti->input_access[slot][comp] = id;
+}
+
+static void
+nv50_indirect_outputs(struct nv50_translation_info *ti, int id)
+{
+   /* Flag indirect output use and store id for every output component. */
+   int slot, comp;
+
+   ti->indirect_outputs = TRUE;
+   for (slot = 0; slot < PIPE_MAX_SHADER_OUTPUTS; ++slot)
+      for (comp = 0; comp < 4; ++comp)
+         ti->output_access[slot][comp] = id;
+}
+
+static void
+prog_inst(struct nv50_translation_info *ti,
+          const struct tgsi_full_instruction *inst, int id)
+{
+   const struct tgsi_dst_register *dst;
+   const struct tgsi_src_register *src;
+   int s, c, k;
+   unsigned mask;
+
+   if (inst->Dst[0].Register.File == TGSI_FILE_OUTPUT) {
+      for (c = 0; c < 4; ++c) {
+         dst = &inst->Dst[0].Register;
+         if (inst->Dst[0].Register.Indirect)
+            nv50_indirect_outputs(ti, id);
+         if (!(dst->WriteMask & (1 << c)))
+            continue;
+         ti->output_access[dst->Index][c] = id;
+      }
+
+      if (inst->Instruction.Opcode == TGSI_OPCODE_MOV &&
+          inst->Src[0].Register.File == TGSI_FILE_INPUT &&
+          dst->Index == ti->edgeflag_out)
+         ti->p->vp.edgeflag = inst->Src[0].Register.Index;
+   }
 
-	if (dst && dst->type == P_TEMP && dst->hw == -1)
-		return dst;
+   for (s = 0; s < inst->Instruction.NumSrcRegs; ++s) {
+      src = &inst->Src[s].Register;
+      if (src->File != TGSI_FILE_INPUT)
+         continue;
+      mask = nv50_tgsi_src_mask(inst, s);
 
-	for (i = 0; i < NV50_SU_MAX_TEMP; i++) {
-		if (!pc->r_temp[i]) {
-			r = MALLOC_STRUCT(nv50_reg);
-			ctor_reg(r, P_TEMP, -1, i);
-			pc->r_temp[i] = r;
-			return r;
-		}
-	}
+      if (inst->Src[s].Register.Indirect)
+         nv50_indirect_inputs(ti, id);
 
-	NOUVEAU_ERR("out of registers\n");
-	abort();
-	return NULL;
+      for (c = 0; c < 4; ++c) {
+         if (!(mask & (1 << c)))
+            continue;
+         k = tgsi_util_get_full_src_register_swizzle(&inst->Src[s], c);
+         if (k <= TGSI_SWIZZLE_W)
+            ti->input_access[src->Index][k] = id;
+      }
+   }
 }
 
-/* release the hardware resource held by r */
 static void
-release_hw(struct nv50_pc *pc, struct nv50_reg *r)
+prog_immediate(struct nv50_translation_info *ti,
+               const struct tgsi_full_immediate *imm)
 {
-	assert(r->type == P_TEMP);
-	if (r->hw == -1)
-		return;
+   int c; /* appends the 4 dwords of imm to ti->immd32 */
+   unsigned n = ++ti->immd32_nr; /* count including this immediate */
 
-	assert(pc->r_temp[r->hw] == r);
-	pc->r_temp[r->hw] = NULL;
+   if (n == (1 << (ffs(n) - 1))) /* pow2: double; old size = live bytes */
+      ti->immd32 = REALLOC(ti->immd32, (n - 1) * 16, (n * 2) * 16);
 
-	r->acc = 0;
-	if (r->index == -1)
-		FREE(r);
+   for (c = 0; c < 4; ++c) /* fill slot n - 1 */
+      ti->immd32[(n - 1) * 4 + c] = imm->u[c].Uint;
 }
 
-static void
-free_temp(struct nv50_pc *pc, struct nv50_reg *r)
-{
-	if (r->index == -1) {
-		unsigned hw = r->hw;
-
-		FREE(pc->r_temp[hw]);
-		pc->r_temp[hw] = NULL;
-	}
+static INLINE unsigned
+translate_interpolate(const struct tgsi_full_declaration *decl)
+{
+   /* Map the TGSI interpolation of decl to NV50_INTERP_* bits. */
+   unsigned bits = decl->Declaration.Centroid ? NV50_INTERP_CENTROID : 0;
+
+   switch (decl->Declaration.Interpolate) {
+   case TGSI_INTERPOLATE_CONSTANT:
+      return bits | NV50_INTERP_FLAT;
+   case TGSI_INTERPOLATE_PERSPECTIVE:
+      return bits; /* perspective adds no mode bit of its own */
+   default:
+      break;
+   }
+
+   /* every other interpolation value is treated as linear */
+   return bits | NV50_INTERP_LINEAR;
+}
+
+static void
+prog_decl(struct nv50_translation_info *ti,
+          const struct tgsi_full_declaration *decl)
+{
+   unsigned i, first, last, sn = 0, si = 0;
+   /* Records per-file information from one TGSI declaration in ti / ti->p. */
+   first = decl->Range.First;
+   last = decl->Range.Last;
+   /* sn/si stay 0 when the declaration carries no semantic */
+   if (decl->Declaration.Semantic) {
+      sn = decl->Semantic.Name;
+      si = decl->Semantic.Index;
+   }
+   tgsi_dump_declaration(decl); /* NOTE(review): dumps unconditionally */
+
+   switch (decl->Declaration.File) {
+   case TGSI_FILE_INPUT:
+      for (i = first; i <= last; ++i)
+         ti->interp_mode[i] = translate_interpolate(decl);
+
+      if (!decl->Declaration.Semantic)
+         break;
+      /* tag every input of the range with the semantic name and index */
+      for (i = first; i <= last; ++i) {
+         ti->p->in[i].sn = sn;
+         ti->p->in[i].si = si;
+      }
+
+      switch (sn) {
+      case TGSI_SEMANTIC_FACE:
+         break;
+      case TGSI_SEMANTIC_COLOR:
+         if (ti->p->type == PIPE_SHADER_FRAGMENT)
+            ti->p->vp.bfc[si] = first; /* first input index of color si */
+         break;
+      }
+      break;
+   case TGSI_FILE_OUTPUT:
+      if (!decl->Declaration.Semantic)
+         break;
+      /* tag every output of the range with the semantic name and index */
+      for (i = first; i <= last; ++i) {
+         ti->p->out[i].sn = sn;
+         ti->p->out[i].si = si;
+      }
+      /* remember the first index of outputs with a special semantic */
+      switch (sn) {
+      case TGSI_SEMANTIC_BCOLOR:
+         ti->p->vp.bfc[si] = first;
+         break;
+      case TGSI_SEMANTIC_PSIZE:
+         ti->p->vp.psiz = first;
+         break;
+      case TGSI_SEMANTIC_EDGEFLAG:
+         ti->edgeflag_out = first;
+         break;
+      default:
+         break;
+      }
+      break;
+   case TGSI_FILE_SYSTEM_VALUE: /* nothing is recorded for system values */
+      switch (decl->Semantic.Name) {
+      case TGSI_SEMANTIC_FACE:
+         break;
+      case TGSI_SEMANTIC_INSTANCEID:
+         break;
+      case TGSI_SEMANTIC_PRIMID:
+         break;
+         /*
+      case TGSI_SEMANTIC_PRIMIDIN:
+         break;
+      case TGSI_SEMANTIC_VERTEXID:
+         break;
+         */
+      default:
+         break;
+      }
+      break;
+   case TGSI_FILE_CONSTANT: /* grow parm_size to cover index 'last' */
+      ti->p->parm_size = MAX2(ti->p->parm_size, (last + 1) * 16);
+      break;
+   case TGSI_FILE_ADDRESS:
+   case TGSI_FILE_SAMPLER:
+   case TGSI_FILE_TEMPORARY:
+      break; /* nothing to record for these files */
+   default:
+      assert(0); /* any other register file is unexpected here */
+      break;
+   }
 }
 
 static int
-alloc_temp4(struct nv50_pc *pc, struct nv50_reg *dst[4], int idx)
+nv50_vertprog_prepare(struct nv50_translation_info *ti)
 {
-	int i;
-
-	if ((idx + 4) >= NV50_SU_MAX_TEMP)
-		return 1;
-
-	if (pc->r_temp[idx] || pc->r_temp[idx + 1] ||
-	    pc->r_temp[idx + 2] || pc->r_temp[idx + 3])
-		return alloc_temp4(pc, dst, idx + 4);
+   struct nv50_program *p = ti->p;
+   int i, c;
+   unsigned num_inputs = 0;
 
-	for (i = 0; i < 4; i++) {
-		dst[i] = MALLOC_STRUCT(nv50_reg);
-		ctor_reg(dst[i], P_TEMP, -1, idx + i);
-		pc->r_temp[idx + i] = dst[i];
-	}
+   ti->input_file = NV_FILE_MEM_S;
+   ti->output_file = NV_FILE_OUT;
 
-	return 0;
-}
-
-static void
-free_temp4(struct nv50_pc *pc, struct nv50_reg *reg[4])
-{
-	int i;
+   for (i = 0; i <= ti->scan.file_max[TGSI_FILE_INPUT]; ++i) {
+      p->in[i].id = i;
+      p->in[i].hw = num_inputs;
 
-	for (i = 0; i < 4; i++)
-		free_temp(pc, reg[i]);
-}
+      for (c = 0; c < 4; ++c) {
+         if (!ti->input_access[i][c])
+            continue;
+         ti->input_map[i][c] = num_inputs++;
+         p->vp.attrs[(4 * i + c) / 32] |= 1 << ((i * 4 + c) % 32);
+      }
+   }
 
-static struct nv50_reg *
-temp_temp(struct nv50_pc *pc, struct nv50_program_exec *e)
-{
-	if (pc->temp_temp_nr >= 16)
-		assert(0);
+   for (i = 0; i <= ti->scan.file_max[TGSI_FILE_OUTPUT]; ++i) {
+      p->out[i].id = i;
+      p->out[i].hw = p->max_out;
 
-	pc->temp_temp[pc->temp_temp_nr] = alloc_temp(pc, NULL);
-	pc->temp_temp_exec[pc->temp_temp_nr] = e;
-	return pc->temp_temp[pc->temp_temp_nr++];
-}
+      for (c = 0; c < 4; ++c) {
+         if (!ti->output_access[i][c])
+            continue;
+         ti->output_map[i][c] = p->max_out++;
+         p->out[i].mask |= 1 << c;
+      }
+   }
 
-/* This *must* be called for all nv50_program_exec that have been
- * given as argument to temp_temp, or the temps will be leaked !
- */
-static void
-kill_temp_temp(struct nv50_pc *pc, struct nv50_program_exec *e)
-{
-	int i;
+   if (p->vp.psiz < 0x40)
+      p->vp.psiz = p->out[p->vp.psiz].hw;
 
-	for (i = 0; i < pc->temp_temp_nr; i++)
-		if (pc->temp_temp_exec[i] == e)
-			free_temp(pc, pc->temp_temp[i]);
-	if (!e)
-		pc->temp_temp_nr = 0;
+   return 0;
 }
 
 static int
-ctor_immd_4u32(struct nv50_pc *pc,
-	       uint32_t x, uint32_t y, uint32_t z, uint32_t w)
-{
-	unsigned size = pc->immd_nr * 4 * sizeof(uint32_t);
-
-	pc->immd_buf = REALLOC(pc->immd_buf, size, size + 4 * sizeof(uint32_t));
-
-	pc->immd_buf[(pc->immd_nr * 4) + 0] = x;
-	pc->immd_buf[(pc->immd_nr * 4) + 1] = y;
-	pc->immd_buf[(pc->immd_nr * 4) + 2] = z;
-	pc->immd_buf[(pc->immd_nr * 4) + 3] = w;
-
-	return pc->immd_nr++;
-}
-
-static INLINE int
-ctor_immd_4f32(struct nv50_pc *pc, float x, float y, float z, float w)
-{
-	return ctor_immd_4u32(pc, fui(x), fui(y), fui(z), fui(w));
-}
-
-static struct nv50_reg *
-alloc_immd(struct nv50_pc *pc, float f)
-{
-	struct nv50_reg *r = MALLOC_STRUCT(nv50_reg);
-	unsigned hw;
-
-	for (hw = 0; hw < pc->immd_nr * 4; hw++)
-		if (pc->immd_buf[hw] == fui(f))
-			break;
-
-	if (hw == pc->immd_nr * 4)
-		hw = ctor_immd_4f32(pc, f, -f, 0.5 * f, 0) * 4;
-
-	ctor_reg(r, P_IMMD, -1, hw);
-	return r;
-}
-
-static struct nv50_program_exec *
-exec(struct nv50_pc *pc)
-{
-	struct nv50_program_exec *e = CALLOC_STRUCT(nv50_program_exec);
-
-	e->param.index = -1;
-	return e;
-}
-
-static void
-emit(struct nv50_pc *pc, struct nv50_program_exec *e)
-{
-	struct nv50_program *p = pc->p;
-
-	if (p->exec_tail)
-		p->exec_tail->next = e;
-	if (!p->exec_head)
-		p->exec_head = e;
-	p->exec_tail = e;
-	p->exec_size += (e->inst[0] & 1) ? 2 : 1;
-
-	kill_temp_temp(pc, e);
-}
-
-static INLINE void set_long(struct nv50_pc *, struct nv50_program_exec *);
-
-static boolean
-is_long(struct nv50_program_exec *e)
-{
-	if (e->inst[0] & 1)
-		return TRUE;
-	return FALSE;
-}
-
-static boolean
-is_immd(struct nv50_program_exec *e)
-{
-	if (is_long(e) && (e->inst[1] & 3) == 3)
-		return TRUE;
-	return FALSE;
-}
-
-static boolean
-is_join(struct nv50_program_exec *e)
-{
-	if (is_long(e) && (e->inst[1] & 3) == 2)
-		return TRUE;
-	return FALSE;
-}
-
-static INLINE boolean
-is_control_flow(struct nv50_program_exec *e)
-{
-	return (e->inst[0] & 2);
-}
-
-static INLINE void
-set_pred(struct nv50_pc *pc, unsigned pred, unsigned idx,
-	 struct nv50_program_exec *e)
-{
-	assert(!is_immd(e));
-	set_long(pc, e);
-	e->inst[1] &= ~((0x1f << 7) | (0x3 << 12));
-	e->inst[1] |= (pred << 7) | (idx << 12);
-}
-
-static INLINE void
-set_pred_wr(struct nv50_pc *pc, unsigned on, unsigned idx,
-	    struct nv50_program_exec *e)
-{
-	set_long(pc, e);
-	e->inst[1] &= ~((0x3 << 4) | (1 << 6));
-	e->inst[1] |= (idx << 4) | (on << 6);
-}
-
-static INLINE void
-set_long(struct nv50_pc *pc, struct nv50_program_exec *e)
-{
-	if (is_long(e))
-		return;
-
-	e->inst[0] |= 1;
-	set_pred(pc, 0xf, 0, e);
-	set_pred_wr(pc, 0, 0, e);
-}
-
-static INLINE void
-set_dst(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_program_exec *e)
-{
-	if (dst->type == P_RESULT) {
-		set_long(pc, e);
-		e->inst[1] |= 0x00000008;
-	}
-
-	alloc_reg(pc, dst);
-	if (dst->hw > 63)
-		set_long(pc, e);
-	e->inst[0] |= (dst->hw << 2);
+nv50_fragprog_prepare(struct nv50_translation_info *ti)
+{
+   struct nv50_program *p = ti->p;
+   int i, j, c;
+   unsigned nvary, nintp, depr;
+   unsigned n = 0, m = 0, skip = 0;
+   ubyte sn[16], si[16];
+
+   /* FP flags */
+
+   if (ti->scan.writes_z) {
+      p->fp.flags[1] = 0x11;
+      p->fp.flags[0] |= NV50TCL_FP_CONTROL_EXPORTS_Z;
+   }
+
+   if (ti->scan.uses_kill)
+      p->fp.flags[0] |= NV50TCL_FP_CONTROL_USES_KIL;
+
+   /* FP inputs */
+
+   ti->input_file = NV_FILE_MEM_V;
+   ti->output_file = NV_FILE_GPR;
+
+   /* count non-flat inputs, save semantic info */
+   for (i = 0; i < p->in_nr; ++i) {
+      m += (ti->interp_mode[i] & NV50_INTERP_FLAT) ? 0 : 1;
+      sn[i] = p->in[i].sn;
+      si[i] = p->in[i].si;
+   }
+
+   /* reorder p->in[] so that non-flat inputs are first and
+    * kick out special inputs that don't use VP/GP_RESULT_MAP
+    */
+   nintp = 0;
+   for (i = 0; i < p->in_nr; ++i) {
+      if (sn[i] == TGSI_SEMANTIC_POSITION) {
+         for (c = 0; c < 4; ++c) {
+            ti->input_map[i][c] = nintp;
+            if (ti->input_access[i][c]) {
+               p->fp.interp |= 1 << (24 + c);
+               ++nintp;
+            }
+         }
+         skip++;
+         continue;
+      } else
+      if (sn[i] == TGSI_SEMANTIC_FACE) {
+         ti->input_map[i][0] = 255;
+         skip++;
+         continue;
+      }
+
+      j = (ti->interp_mode[i] & NV50_INTERP_FLAT) ? m++ : n++;
+
+      if (sn[i] == TGSI_SEMANTIC_COLOR)
+         p->vp.bfc[si[i]] = j;
+	   
+      p->in[j].linear = (ti->interp_mode[i] & NV50_INTERP_LINEAR) ? 1 : 0;
+      p->in[j].id = i;
+      p->in[j].sn = sn[i];
+      p->in[j].si = si[i];
+   }
+   assert(n <= m);
+   p->in_nr -= skip;
+
+   if (!(p->fp.interp & (8 << 24))) {
+      p->fp.interp |= (8 << 24);
+      ++nintp;
+   }
+
+   p->fp.colors = (1 << 24) | 4; /* CLAMP, FFC0_ID = 4 */
+
+   for (i = 0; i < p->in_nr; ++i) {
+      int j = p->in[i].id;
+      p->in[i].hw = nintp;
+
+      for (c = 0; c < 4; ++c) {
+         if (!ti->input_access[j][c])
+            continue;
+         p->in[i].mask |= 1 << c;
+         ti->input_map[j][c] = nintp++;
+      }
+      /* count color inputs */
+      if (i == p->vp.bfc[0] || i == p->vp.bfc[1])
+         p->fp.colors += bitcount4(p->in[i].mask) << 16;
+   }
+   nintp -= bitcount4(p->fp.interp >> 24); /* subtract position inputs */
+   nvary = nintp;
+   if (n < m)
+      nvary -= p->in[n].hw;
+
+   p->fp.interp |= nvary << NV50TCL_FP_INTERPOLANT_CTRL_COUNT_NONFLAT_SHIFT;
+   p->fp.interp |= nintp << NV50TCL_FP_INTERPOLANT_CTRL_COUNT_SHIFT;
+
+   /* FP outputs */
+
+   if (p->out_nr > (1 + (ti->scan.writes_z ? 1 : 0)))
+      p->fp.flags[0] |= NV50TCL_FP_CONTROL_MULTIPLE_RESULTS;
+
+   depr = p->out_nr;
+   for (i = 0; i < p->out_nr; ++i) {
+      p->out[i].id = i;
+      if (p->out[i].sn == TGSI_SEMANTIC_POSITION) {
+         depr = i;
+         continue;
+      }
+      p->out[i].hw = p->max_out;
+      p->out[i].mask = 0xf;
+
+      for (c = 0; c < 4; ++c)
+         ti->output_map[i][c] = p->max_out++;
+   }
+   if (depr < p->out_nr) {
+      p->out[depr].mask = 0x4;
+      p->out[depr].hw = p->max_out++;
+   }
+
+   return 0;
 }
 
-static INLINE void
-set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e)
-{
-	set_long(pc, e);
-	/* XXX: can't be predicated - bits overlap; cases where both
-	 * are required should be avoided by using pc->allow32 */
-	set_pred(pc, 0, 0, e);
-	set_pred_wr(pc, 0, 0, e);
-
-	e->inst[1] |= 0x00000002 | 0x00000001;
-	e->inst[0] |= (pc->immd_buf[imm->hw] & 0x3f) << 16;
-	e->inst[1] |= (pc->immd_buf[imm->hw] >> 6) << 2;
-}
-
-static INLINE void
-set_addr(struct nv50_program_exec *e, struct nv50_reg *a)
-{
-	assert(a->type == P_ADDR);
-
-	assert(!(e->inst[0] & 0x0c000000));
-	assert(!(e->inst[1] & 0x00000004));
-
-	e->inst[0] |= (a->hw & 3) << 26;
-	e->inst[1] |= a->hw & 4;
-}
-
-static void
-emit_arl(struct nv50_pc *, struct nv50_reg *, struct nv50_reg *, uint8_t);
-
-static void
-emit_shl_imm(struct nv50_pc *, struct nv50_reg *, struct nv50_reg *, int);
-
-static void
-emit_mov_from_addr(struct nv50_pc *pc, struct nv50_reg *dst,
-		   struct nv50_reg *src)
-{
-	struct nv50_program_exec *e = exec(pc);
-
-	e->inst[1] = 0x40000000;
-	set_long(pc, e);
-	set_dst(pc, dst, e);
-	set_addr(e, src);
-
-	emit(pc, e);
-}
-
-static void
-emit_add_addr_imm(struct nv50_pc *pc, struct nv50_reg *dst,
-		  struct nv50_reg *src0, uint16_t src1_val)
-{
-	struct nv50_program_exec *e = exec(pc);
-
-	e->inst[0] = 0xd0000000 | (src1_val << 9);
-	e->inst[1] = 0x20000000;
-	set_long(pc, e);
-	e->inst[0] |= dst->hw << 2;
-	if (src0) /* otherwise will add to $a0, which is always 0 */
-		set_addr(e, src0);
-
-	emit(pc, e);
-}
-
-#define INTERP_LINEAR		0
-#define INTERP_FLAT		1
-#define INTERP_PERSPECTIVE	2
-#define INTERP_CENTROID		4
-
-/* interpolant index has been stored in dst->rhw */
-static void
-emit_interp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *iv,
-		unsigned mode)
-{
-	struct nv50_program_exec *e = exec(pc);
-	assert(dst->rhw != -1);
-
-	e->inst[0] |= 0x80000000;
-	set_dst(pc, dst, e);
-	e->inst[0] |= (dst->rhw << 16);
-
-	if (mode & INTERP_FLAT) {
-		e->inst[0] |= (1 << 8);
-	} else {
-		if (mode & INTERP_PERSPECTIVE) {
-			e->inst[0] |= (1 << 25);
-			alloc_reg(pc, iv);
-			e->inst[0] |= (iv->hw << 9);
-		}
-
-		if (mode & INTERP_CENTROID)
-			e->inst[0] |= (1 << 24);
-	}
-
-	emit(pc, e);
-}
-
-static void
-set_data(struct nv50_pc *pc, struct nv50_reg *src, unsigned m, unsigned s,
-	 struct nv50_program_exec *e)
-{
-	set_long(pc, e);
-
-	e->param.index = src->hw & 127;
-	e->param.shift = s;
-	e->param.mask = m << (s % 32);
-
-	if (src->hw < 0 || src->hw > 127) /* need (additional) address reg */
-		set_addr(e, get_address_reg(pc, src));
-	else
-	if (src->acc < 0) {
-		assert(src->type == P_CONST);
-		set_addr(e, pc->addr[src->indirect[0]]);
-	}
-
-	e->inst[1] |= (src->buf_index << 22);
-}
-
-/* Never apply nv50_reg::mod in emit_mov, or carefully check the code !!! */
-static void
-emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
-{
-	struct nv50_program_exec *e = exec(pc);
-
-	e->inst[0] = 0x10000000;
-	if (!pc->allow32)
-		set_long(pc, e);
-
-	set_dst(pc, dst, e);
-
-	if (!is_long(e) && src->type == P_IMMD) {
-		set_immd(pc, src, e);
-		/*XXX: 32-bit, but steals part of "half" reg space - need to
-		 *     catch and handle this case if/when we do half-regs
-		 */
-	} else
-	if (src->type == P_IMMD || src->type == P_CONST) {
-		set_long(pc, e);
-		set_data(pc, src, 0x7f, 9, e);
-		e->inst[1] |= 0x20000000; /* mov from c[] */
-	} else {
-		if (src->type == P_ATTR) {
-			set_long(pc, e);
-			e->inst[1] |= 0x00200000;
-
-			if (src->vtx >= 0) {
-				/* indirect (vertex base + c) load from p[] */
-				e->inst[0] |= 0x01800000;
-				set_addr(e, get_address_reg(pc, src));
-			}
-		}
-
-		alloc_reg(pc, src);
-		if (src->hw > 63)
-			set_long(pc, e);
-		e->inst[0] |= (src->hw << 9);
-	}
-
-	if (is_long(e) && !is_immd(e)) {
-		e->inst[1] |= 0x04000000; /* 32-bit */
-		e->inst[1] |= 0x0000c000; /* 32-bit c[] load / lane mask 0:1 */
-		if (!(e->inst[1] & 0x20000000))
-			e->inst[1] |= 0x00030000; /* lane mask 2:3 */
-	} else
-		e->inst[0] |= 0x00008000;
-
-	emit(pc, e);
-}
-
-static INLINE void
-emit_mov_immdval(struct nv50_pc *pc, struct nv50_reg *dst, float f)
-{
-	struct nv50_reg *imm = alloc_immd(pc, f);
-	emit_mov(pc, dst, imm);
-	FREE(imm);
-}
-
-/* Assign the hw of the discarded temporary register src
- * to the tgsi register dst and free src.
- */
-static void
-assimilate_temp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
-{
-	assert(src->index == -1 && src->hw != -1);
-
-	if (pc->if_lvl || pc->loop_lvl ||
-	    (dst->type != P_TEMP) ||
-	    (src->hw < pc->result_nr * 4 &&
-	     pc->p->type == PIPE_SHADER_FRAGMENT) ||
-	    pc->p->info.opcode_count[TGSI_OPCODE_CAL] ||
-	    pc->p->info.opcode_count[TGSI_OPCODE_BRA]) {
-
-		emit_mov(pc, dst, src);
-		free_temp(pc, src);
-		return;
-	}
-
-	if (dst->hw != -1)
-		pc->r_temp[dst->hw] = NULL;
-	pc->r_temp[src->hw] = dst;
-	dst->hw = src->hw;
-
-	FREE(src);
-}
-
-static void
-emit_nop(struct nv50_pc *pc)
-{
-	struct nv50_program_exec *e = exec(pc);
-
-	e->inst[0] = 0xf0000000;
-	set_long(pc, e);
-	e->inst[1] = 0xe0000000;
-	emit(pc, e);
-}
-
-static boolean
-check_swap_src_0_1(struct nv50_pc *pc,
-		   struct nv50_reg **s0, struct nv50_reg **s1)
-{
-	struct nv50_reg *src0 = *s0, *src1 = *s1;
-
-	if (src0->type == P_CONST) {
-		if (src1->type != P_CONST) {
-			*s0 = src1;
-			*s1 = src0;
-			return TRUE;
-		}
-	} else
-	if (src1->type == P_ATTR) {
-		if (src0->type != P_ATTR) {
-			*s0 = src1;
-			*s1 = src0;
-			return TRUE;
-		}
-	}
-
-	return FALSE;
-}
-
-static void
-set_src_0_restricted(struct nv50_pc *pc, struct nv50_reg *src,
-		     struct nv50_program_exec *e)
-{
-	struct nv50_reg *temp;
-
-	if (src->type != P_TEMP) {
-		temp = temp_temp(pc, e);
-		emit_mov(pc, temp, src);
-		src = temp;
-	}
-
-	alloc_reg(pc, src);
-	if (src->hw > 63)
-		set_long(pc, e);
-	e->inst[0] |= (src->hw << 9);
-}
-
-static void
-set_src_0(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
-{
-	if (src->type == P_ATTR) {
-		set_long(pc, e);
-		e->inst[1] |= 0x00200000;
-
-		if (src->vtx >= 0) {
-			e->inst[0] |= 0x01800000; /* src from p[] */
-			set_addr(e, get_address_reg(pc, src));
-		}
-	} else
-	if (src->type == P_CONST || src->type == P_IMMD) {
-		struct nv50_reg *temp = temp_temp(pc, e);
-
-		emit_mov(pc, temp, src);
-		src = temp;
-	}
-
-	alloc_reg(pc, src);
-	if (src->hw > 63)
-		set_long(pc, e);
-	e->inst[0] |= (src->hw << 9);
-}
-
-static void
-set_src_1(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
-{
-	if (src->type == P_ATTR) {
-		struct nv50_reg *temp = temp_temp(pc, e);
-
-		emit_mov(pc, temp, src);
-		src = temp;
-	} else
-	if (src->type == P_CONST || src->type == P_IMMD) {
-		if (e->inst[0] & 0x01800000) {
-			struct nv50_reg *temp = temp_temp(pc, e);
-
-			emit_mov(pc, temp, src);
-			src = temp;
-		} else {
-			assert(!(e->inst[0] & 0x00800000));
-			set_data(pc, src, 0x7f, 16, e);
-			e->inst[0] |= 0x00800000;
-		}
-	}
-
-	alloc_reg(pc, src);
-	if (src->hw > 63)
-		set_long(pc, e);
-	e->inst[0] |= ((src->hw & 127) << 16);
-}
-
-static void
-set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
-{
-	set_long(pc, e);
-
-	if (src->type == P_ATTR) {
-		struct nv50_reg *temp = temp_temp(pc, e);
-
-		emit_mov(pc, temp, src);
-		src = temp;
-	} else
-	if (src->type == P_CONST || src->type == P_IMMD) {
-		if (e->inst[0] & 0x01800000) {
-			struct nv50_reg *temp = temp_temp(pc, e);
-
-			emit_mov(pc, temp, src);
-			src = temp;
-		} else {
-			assert(!(e->inst[0] & 0x01000000));
-			set_data(pc, src, 0x7f, 32+14, e);
-			e->inst[0] |= 0x01000000;
-		}
-	}
-
-	alloc_reg(pc, src);
-	e->inst[1] |= ((src->hw & 127) << 14);
-}
-
-static void
-set_half_src(struct nv50_pc *pc, struct nv50_reg *src, int lh,
-	     struct nv50_program_exec *e, int pos)
-{
-	struct nv50_reg *r = src;
-
-	alloc_reg(pc, r);
-	if (r->type != P_TEMP) {
-		r = temp_temp(pc, e);
-		emit_mov(pc, r, src);
-	}
-
-	if (r->hw > (NV50_SU_MAX_TEMP / 2)) {
-		NOUVEAU_ERR("out of low GPRs\n");
-		abort();
-	}
-
-	e->inst[pos / 32] |= ((src->hw * 2) + lh) << (pos % 32);
-}
-
-static void
-emit_mov_from_pred(struct nv50_pc *pc, struct nv50_reg *dst, int pred)
+/*
+ * Per-type preparation step for geometry programs.
+ *
+ * Selects NV_FILE_MEM_S as the input file and NV_FILE_OUT as the output
+ * file, then trips an assertion and returns a non-zero value.
+ * NOTE(review): this looks like a placeholder for geometry shader support
+ * that has not been written yet -- confirm.
+ */
+static int
+nv50_geomprog_prepare(struct nv50_translation_info *ti)
 {
-	struct nv50_program_exec *e = exec(pc);
-
-	assert(dst->type == P_TEMP);
-	e->inst[1] = 0x20000000 | (pred << 12);
-	set_long(pc, e);
-	set_dst(pc, dst, e);
+   ti->input_file = NV_FILE_MEM_S;
+   ti->output_file = NV_FILE_OUT;
 
-	emit(pc, e);
+   /* with assertions compiled out, the non-zero return is what is reported */
+   assert(0);
+   return 1;
 }
 
-static void
-emit_mov_to_pred(struct nv50_pc *pc, int pred, struct nv50_reg *src)
+/*
+ * Scan the TGSI token stream of ti->p and collect what translation needs.
+ *
+ * Sets the vp.psiz, vp.bfc[] and gp.primid fields of the program to fixed
+ * initial values, fills ti->scan via tgsi_scan_shader(), then walks every
+ * token and forwards immediates, declarations and instructions to
+ * prog_immediate(), prog_decl() and prog_inst(). Afterwards the input and
+ * output counts are derived from the scan info and the prepare routine
+ * matching the program type is run.
+ *
+ * Returns 0 on success; non-zero if the token stream cannot be parsed, the
+ * program type is unsupported, or the prepare routine reports failure.
+ */
+static int
+nv50_prog_scan(struct nv50_translation_info *ti)
+{
+   struct nv50_program *p = ti->p;
+   struct tgsi_parse_context parse;
+   int ret;
+
+   p->vp.psiz = 0x40;
+   p->vp.bfc[0] = 0x40;
+   p->vp.bfc[1] = 0x40;
+   p->gp.primid = 0x80;
+
+   tgsi_scan_shader(p->pipe.tokens, &ti->scan);
+
+   /* a failed init leaves the parse context unusable, so do not iterate */
+   if (tgsi_parse_init(&parse, p->pipe.tokens) != TGSI_PARSE_OK)
+      return -1;
+
+   while (!tgsi_parse_end_of_tokens(&parse)) {
+      tgsi_parse_token(&parse);
+
+      switch (parse.FullToken.Token.Type) {
+      case TGSI_TOKEN_TYPE_IMMEDIATE:
+         prog_immediate(ti, &parse.FullToken.FullImmediate);
+         break;
+      case TGSI_TOKEN_TYPE_DECLARATION:
+         prog_decl(ti, &parse.FullToken.FullDeclaration);
+         break;
+      case TGSI_TOKEN_TYPE_INSTRUCTION:
+         /* the instruction counter is bumped before it is handed on */
+         prog_inst(ti, &parse.FullToken.FullInstruction, ++ti->inst_nr);
+         break;
+      default:
+         /* other token types are not of interest here */
+         break;
+      }
+   }
+   /* pair every tgsi_parse_init() with a tgsi_parse_free() */
+   tgsi_parse_free(&parse);
+
+   p->in_nr = ti->scan.file_max[TGSI_FILE_INPUT] + 1;
+   p->out_nr = ti->scan.file_max[TGSI_FILE_OUTPUT] + 1;
+
+   switch (p->type) {
+   case PIPE_SHADER_VERTEX:
+      ret = nv50_vertprog_prepare(ti);
+      break;
+   case PIPE_SHADER_FRAGMENT:
+      ret = nv50_fragprog_prepare(ti);
+      break;
+   case PIPE_SHADER_GEOMETRY:
+      ret = nv50_geomprog_prepare(ti);
+      break;
+   default:
+      assert(!"unsupported program type");
+      ret = -1;
+      break;
+   }
+
+   assert(!ret);
+   return ret;
+}
+
+/*
+ * Translate the TGSI program attached to p.
+ *
+ * Allocates a temporary nv50_translation_info, scans the program with
+ * nv50_prog_scan() and then generates code with nv50_generate_code().
+ * The translation info (and its immd32 array, if one was allocated) is
+ * released on every path before returning.
+ *
+ * Returns TRUE on success, FALSE if the allocation fails or if scanning or
+ * code generation reports an error.
+ */
+boolean
+nv50_program_tx(struct nv50_program *p)
 {
-	struct nv50_program_exec *e = exec(pc);
-
-	e->inst[0] = 0x000001fc;
-	e->inst[1] = 0xa0000008;
-	set_long(pc, e);
-	set_pred_wr(pc, 1, pred, e);
-	set_src_0_restricted(pc, src, e);
+   struct nv50_translation_info *ti;
+   int ret;
 
-	emit(pc, e);
-}
-
-static void
-emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
-	 struct nv50_reg *src1)
-{
-	struct nv50_program_exec *e = exec(pc);
+   ti = CALLOC_STRUCT(nv50_translation_info);
+   /* out of memory: nothing to clean up yet, and ti must not be touched */
+   if (!ti)
+      return FALSE;
+   ti->p = p;
 
-	e->inst[0] |= 0xc0000000;
+   ti->edgeflag_out = PIPE_MAX_SHADER_OUTPUTS;
 
-	if (!pc->allow32)
-		set_long(pc, e);
+   ret = nv50_prog_scan(ti);
+   if (ret) {
+      NOUVEAU_ERR("unsupported shader program\n");
+      goto out;
+   }
 
-	check_swap_src_0_1(pc, &src0, &src1);
-	set_dst(pc, dst, e);
-	set_src_0(pc, src0, e);
-	if (src1->type == P_IMMD && !is_long(e)) {
-		if (src0->mod ^ src1->mod)
-			e->inst[0] |= 0x00008000;
-		set_immd(pc, src1, e);
-	} else {
-		set_src_1(pc, src1, e);
-		if ((src0->mod ^ src1->mod) & NV50_MOD_NEG) {
-			if (is_long(e))
-				e->inst[1] |= 0x08000000;
-			else
-				e->inst[0] |= 0x00008000;
-		}
-	}
+   ret = nv50_generate_code(ti);
+   if (ret) {
+      NOUVEAU_ERR("error during shader translation\n");
+      goto out;
+   }
 
-	emit(pc, e);
+out:
+   /* single exit point: drop the scratch translation state */
+   if (ti->immd32)
+      FREE(ti->immd32);
+   FREE(ti);
+   return ret ? FALSE : TRUE;
 }
 
-static void
-emit_add(struct nv50_pc *pc, struct nv50_reg *dst,
-	 struct nv50_reg *src0, struct nv50_reg *src1)
+void
+nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
 {
-	struct nv50_program_exec *e = exec(pc);
-
-	e->inst[0] = 0xb0000000;
-
-	alloc_reg(pc, src1);
-	check_swap_src_0_1(pc, &src0, &src1);
-
-	if (!pc->allow32 || (src0->mod | src1->mod) || src1->hw > 63) {
-		set_long(pc, e);
-		e->inst[1] |= ((src0->mod & NV50_MOD_NEG) << 26) |
-			      ((src1->mod & NV50_MOD_NEG) << 27);
-	}
-
-	set_dst(pc, dst, e);
-	set_src_0(pc, src0, e);
-	if (src1->type == P_CONST || src1->type == P_ATTR || is_long(e))
-		set_src_2(pc, src1, e);
-	else
-	if (src1->type == P_IMMD)
-		set_immd(pc, src1, e);
-	else
-		set_src_1(pc, src1, e);
-
-	emit(pc, e);
-}
-
-static void
-emit_arl(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src,
-	 uint8_t s)
-{
-	struct nv50_program_exec *e = exec(pc);
-
-	set_long(pc, e);
-	e->inst[1] |= 0xc0000000;
-
-	e->inst[0] |= dst->hw << 2;
-	e->inst[0] |= s << 16; /* shift left */
-	set_src_0(pc, src, e);
-
-	emit(pc, e);
-}
-
-static boolean
-address_reg_suitable(struct nv50_reg *a, struct nv50_reg *r)
-{
-	if (!r)
-		return FALSE;
-
-	if (r->vtx != a->vtx)
-		return FALSE;
-	if (r->vtx >= 0)
-		return (r->indirect[1] == a->indirect[1]);
-
-	if (r->hw < a->rhw || (r->hw - a->rhw) >= 128)
-		return FALSE;
-
-	if (a->index >= 0)
-		return (a->index == r->indirect[0]);
-	return (a->indirect[0] == r->indirect[0]);
-}
-
-static void
-load_vertex_base(struct nv50_pc *pc, struct nv50_reg *dst,
-		 struct nv50_reg *a, int shift)
-{
-	struct nv50_reg mem, *temp;
-
-	ctor_reg(&mem, P_ATTR, -1, dst->vtx);
-
-	assert(dst->type == P_ADDR);
-	if (!a) {
-		emit_arl(pc, dst, &mem, 0);
-		return;
-	}
-	temp = alloc_temp(pc, NULL);
-
-	if (shift) {
-		emit_mov_from_addr(pc, temp, a);
-		if (shift < 0)
-			emit_shl_imm(pc, temp, temp, shift);
-		emit_arl(pc, dst, temp, MAX2(shift, 0));
-	}
-	emit_mov(pc, temp, &mem);
-	set_addr(pc->p->exec_tail, dst);
-
-	emit_arl(pc, dst, temp, 0);
-	free_temp(pc, temp);
-}
-
-/* case (ref == NULL): allocate address register for TGSI_FILE_ADDRESS
- * case (vtx >= 0, acc >= 0): load vertex base from a[vtx * 4] to $aX
- * case (vtx >= 0, acc < 0): load vertex base from s[$aY + vtx * 4] to $aX
- * case (vtx < 0, acc >= 0): memory address too high to encode
- * case (vtx < 0, acc < 0): get source register for TGSI_FILE_ADDRESS
- */
-static struct nv50_reg *
-get_address_reg(struct nv50_pc *pc, struct nv50_reg *ref)
-{
-	int i;
-	struct nv50_reg *a_ref, *a = NULL;
-
-	for (i = 0; i < NV50_SU_MAX_ADDR; ++i) {
-		if (pc->r_addr[i].acc == 0)
-			a = &pc->r_addr[i]; /* an unused address reg */
-		else
-		if (address_reg_suitable(&pc->r_addr[i], ref)) {
-			pc->r_addr[i].acc = pc->insn_cur;
-			return &pc->r_addr[i];
-		} else
-		if (!a && pc->r_addr[i].index < 0 &&
-		    pc->r_addr[i].acc < pc->insn_cur)
-			a = &pc->r_addr[i];
-	}
-	if (!a) {
-		/* We'll be able to spill address regs when this
-		 * mess is replaced with a proper compiler ...
-		 */
-		NOUVEAU_ERR("out of address regs\n");
-		abort();
-		return NULL;
-	}
-
-	/* initialize and reserve for this TGSI instruction */
-	a->rhw = 0;
-	a->index = a->indirect[0] = a->indirect[1] = -1;
-	a->acc = pc->insn_cur;
-
-	if (!ref) {
-		a->vtx = -1;
-		return a;
-	}
-	a->vtx = ref->vtx;
-
-	/* now put in the correct value ... */
-
-	if (ref->vtx >= 0) {
-		a->indirect[1] = ref->indirect[1];
-
-		/* For an indirect vertex index, we need to shift address right
-		 * by 2, the address register will contain vtx * 16, we need to
-		 * load from a[vtx * 4].
-		 */
-		load_vertex_base(pc, a, (ref->acc < 0) ?
-				 pc->addr[ref->indirect[1]] : NULL, -2);
-	} else {
-		assert(ref->acc < 0 || ref->indirect[0] < 0);
-
-		a->rhw = ref->hw & ~0x7f;
-		a->indirect[0] = ref->indirect[0];
-		a_ref = (ref->acc < 0) ? pc->addr[ref->indirect[0]] : NULL;
-
-		emit_add_addr_imm(pc, a, a_ref, a->rhw * 4);
-	}
-	return a;
-}
-
-#define NV50_MAX_F32 0x880
-#define NV50_MAX_S32 0x08c
-#define NV50_MAX_U32 0x084
-#define NV50_MIN_F32 0x8a0
-#define NV50_MIN_S32 0x0ac
-#define NV50_MIN_U32 0x0a4
-
-static void
-emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst,
-	    struct nv50_reg *src0, struct nv50_reg *src1)
-{
-	struct nv50_program_exec *e = exec(pc);
-
-	set_long(pc, e);
-	e->inst[0] |= 0x30000000 | ((sub & 0x800) << 20);
-	e->inst[1] |= (sub << 24);
-
-	check_swap_src_0_1(pc, &src0, &src1);
-	set_dst(pc, dst, e);
-	set_src_0(pc, src0, e);
-	set_src_1(pc, src1, e);
-
-	if (src0->mod & NV50_MOD_ABS)
-		e->inst[1] |= 0x00100000;
-	if (src1->mod & NV50_MOD_ABS)
-		e->inst[1] |= 0x00080000;
-
-	emit(pc, e);
-}
-
-static INLINE void
-emit_sub(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
-	 struct nv50_reg *src1)
-{
-	src1->mod ^= NV50_MOD_NEG;
-	emit_add(pc, dst, src0, src1);
-	src1->mod ^= NV50_MOD_NEG;
-}
-
-static void
-emit_bitop2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
-	    struct nv50_reg *src1, unsigned op)
-{
-	struct nv50_program_exec *e = exec(pc);
-
-	e->inst[0] = 0xd0000000;
-	set_long(pc, e);
-
-	check_swap_src_0_1(pc, &src0, &src1);
-	set_dst(pc, dst, e);
-	set_src_0(pc, src0, e);
-
-	if (op != TGSI_OPCODE_AND && op != TGSI_OPCODE_OR &&
-	    op != TGSI_OPCODE_XOR)
-		assert(!"invalid bit op");
-
-	assert(!(src0->mod | src1->mod));
-
-	if (src1->type == P_IMMD && src0->type == P_TEMP && pc->allow32) {
-		set_immd(pc, src1, e);
-		if (op == TGSI_OPCODE_OR)
-			e->inst[0] |= 0x0100;
-		else
-		if (op == TGSI_OPCODE_XOR)
-			e->inst[0] |= 0x8000;
-	} else {
-		set_src_1(pc, src1, e);
-		e->inst[1] |= 0x04000000; /* 32 bit */
-		if (op == TGSI_OPCODE_OR)
-			e->inst[1] |= 0x4000;
-		else
-		if (op == TGSI_OPCODE_XOR)
-			e->inst[1] |= 0x8000;
-	}
-
-	emit(pc, e);
-}
-
-static void
-emit_not(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
-{
-	struct nv50_program_exec *e = exec(pc);
-
-	e->inst[0] = 0xd0000000;
-	e->inst[1] = 0x0402c000;
-	set_long(pc, e);
-	set_dst(pc, dst, e);
-	set_src_1(pc, src, e);
-
-	emit(pc, e);
-}
-
-static void
-emit_shift(struct nv50_pc *pc, struct nv50_reg *dst,
-	   struct nv50_reg *src0, struct nv50_reg *src1, unsigned dir)
-{
-	struct nv50_program_exec *e = exec(pc);
-
-	e->inst[0] = 0x30000000;
-	e->inst[1] = 0xc4000000;
-
-	set_long(pc, e);
-	set_dst(pc, dst, e);
-	set_src_0(pc, src0, e);
-
-	if (src1->type == P_IMMD) {
-		e->inst[1] |= (1 << 20);
-		e->inst[0] |= (pc->immd_buf[src1->hw] & 0x7f) << 16;
-	} else
-		set_src_1(pc, src1, e);
-
-	if (dir != TGSI_OPCODE_SHL)
-		e->inst[1] |= (1 << 29);
-
-	if (dir == TGSI_OPCODE_ISHR)
-		e->inst[1] |= (1 << 27);
-
-	emit(pc, e);
-}
-
-static void
-emit_shl_imm(struct nv50_pc *pc, struct nv50_reg *dst,
-	     struct nv50_reg *src, int s)
-{
-	struct nv50_program_exec *e = exec(pc);
-
-	e->inst[0] = 0x30000000;
-	e->inst[1] = 0xc4100000;
-	if (s < 0) {
-		e->inst[1] |= 1 << 29;
-		s = -s;
-	}
-	e->inst[1] |= ((s & 0x7f) << 16);
-
-	set_long(pc, e);
-	set_dst(pc, dst, e);
-	set_src_0(pc, src, e);
-
-	emit(pc, e);
-}
-
-static void
-emit_mad(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
-	 struct nv50_reg *src1, struct nv50_reg *src2)
-{
-	struct nv50_program_exec *e = exec(pc);
-
-	e->inst[0] |= 0xe0000000;
-
-	check_swap_src_0_1(pc, &src0, &src1);
-	set_dst(pc, dst, e);
-	set_src_0(pc, src0, e);
-	set_src_1(pc, src1, e);
-	set_src_2(pc, src2, e);
-
-	if ((src0->mod ^ src1->mod) & NV50_MOD_NEG)
-		e->inst[1] |= 0x04000000;
-	if (src2->mod & NV50_MOD_NEG)
-		e->inst[1] |= 0x08000000;
-
-	emit(pc, e);
-}
-
-static INLINE void
-emit_msb(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
-	 struct nv50_reg *src1, struct nv50_reg *src2)
-{
-	src2->mod ^= NV50_MOD_NEG;
-	emit_mad(pc, dst, src0, src1, src2);
-	src2->mod ^= NV50_MOD_NEG;
-}
-
-#define NV50_FLOP_RCP 0
-#define NV50_FLOP_RSQ 2
-#define NV50_FLOP_LG2 3
-#define NV50_FLOP_SIN 4
-#define NV50_FLOP_COS 5
-#define NV50_FLOP_EX2 6
-
-/* rcp, rsqrt, lg2 support neg and abs */
-static void
-emit_flop(struct nv50_pc *pc, unsigned sub,
-	  struct nv50_reg *dst, struct nv50_reg *src)
-{
-	struct nv50_program_exec *e = exec(pc);
-
-	e->inst[0] |= 0x90000000;
-	if (sub || src->mod) {
-		set_long(pc, e);
-		e->inst[1] |= (sub << 29);
-	}
-
-	set_dst(pc, dst, e);
-	set_src_0_restricted(pc, src, e);
-
-	assert(!src->mod || sub < 4);
-
-	if (src->mod & NV50_MOD_NEG)
-		e->inst[1] |= 0x04000000;
-	if (src->mod & NV50_MOD_ABS)
-		e->inst[1] |= 0x00100000;
-
-	emit(pc, e);
-}
-
-static void
-emit_preex2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
-{
-	struct nv50_program_exec *e = exec(pc);
-
-	e->inst[0] |= 0xb0000000;
-
-	set_dst(pc, dst, e);
-	set_src_0(pc, src, e);
-	set_long(pc, e);
-	e->inst[1] |= (6 << 29) | 0x00004000;
-
-	if (src->mod & NV50_MOD_NEG)
-		e->inst[1] |= 0x04000000;
-	if (src->mod & NV50_MOD_ABS)
-		e->inst[1] |= 0x00100000;
-
-	emit(pc, e);
-}
-
-static void
-emit_precossin(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
-{
-	struct nv50_program_exec *e = exec(pc);
-
-	e->inst[0] |= 0xb0000000;
-
-	set_dst(pc, dst, e);
-	set_src_0(pc, src, e);
-	set_long(pc, e);
-	e->inst[1] |= (6 << 29);
-
-	if (src->mod & NV50_MOD_NEG)
-		e->inst[1] |= 0x04000000;
-	if (src->mod & NV50_MOD_ABS)
-		e->inst[1] |= 0x00100000;
-
-	emit(pc, e);
-}
-
-#define CVT_RN    (0x00 << 16)
-#define CVT_FLOOR (0x02 << 16)
-#define CVT_CEIL  (0x04 << 16)
-#define CVT_TRUNC (0x06 << 16)
-#define CVT_SAT   (0x08 << 16)
-#define CVT_ABS   (0x10 << 16)
-
-#define CVT_X32_X32 0x04004000
-#define CVT_X32_S32 0x04014000
-#define CVT_F32_F32 ((0xc0 << 24) | CVT_X32_X32)
-#define CVT_S32_F32 ((0x88 << 24) | CVT_X32_X32)
-#define CVT_U32_F32 ((0x80 << 24) | CVT_X32_X32)
-#define CVT_F32_S32 ((0x40 << 24) | CVT_X32_S32)
-#define CVT_F32_U32 ((0x40 << 24) | CVT_X32_X32)
-#define CVT_S32_S32 ((0x08 << 24) | CVT_X32_S32)
-#define CVT_S32_U32 ((0x08 << 24) | CVT_X32_X32)
-#define CVT_U32_S32 ((0x00 << 24) | CVT_X32_S32)
-
-#define CVT_NEG 0x20000000
-#define CVT_RI  0x08000000
-
-static void
-emit_cvt(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src,
-	 int wp, uint32_t cvn)
-{
-	struct nv50_program_exec *e;
-
-	e = exec(pc);
-
-	if (src->mod & NV50_MOD_NEG) cvn |= CVT_NEG;
-	if (src->mod & NV50_MOD_ABS) cvn |= CVT_ABS;
-
-	e->inst[0] = 0xa0000000;
-	e->inst[1] = cvn;
-	set_long(pc, e);
-	set_src_0(pc, src, e);
-
-	if (wp >= 0)
-		set_pred_wr(pc, 1, wp, e);
-
-	if (dst)
-		set_dst(pc, dst, e);
-	else {
-		e->inst[0] |= 0x000001fc;
-		e->inst[1] |= 0x00000008;
-	}
-
-	emit(pc, e);
-}
-
-/* nv50 Condition codes:
- *  0x1 = LT
- *  0x2 = EQ
- *  0x3 = LE
- *  0x4 = GT
- *  0x5 = NE
- *  0x6 = GE
- *  0x7 = set condition code ? (used before bra.lt/le/gt/ge)
- *  0x8 = unordered bit (allows NaN)
- *
- *  mode = 0x04 (u32), 0x0c (s32), 0x80 (f32)
- */
-static void
-emit_set(struct nv50_pc *pc, unsigned ccode, struct nv50_reg *dst, int wp,
-	 struct nv50_reg *src0, struct nv50_reg *src1, uint8_t mode)
-{
-	static const unsigned cc_swapped[8] = { 0, 4, 2, 6, 1, 5, 3, 7 };
-
-	struct nv50_program_exec *e = exec(pc);
-	struct nv50_reg *rdst;
-
-	assert(ccode < 16);
-	if (check_swap_src_0_1(pc, &src0, &src1))
-		ccode = cc_swapped[ccode & 7] | (ccode & 8);
-
-	rdst = dst;
-	if (dst && dst->type != P_TEMP)
-		dst = alloc_temp(pc, NULL);
-
-	set_long(pc, e);
-	e->inst[0] |= 0x30000000 | (mode << 24);
-	e->inst[1] |= 0x60000000 | (ccode << 14);
-
-	if (wp >= 0)
-		set_pred_wr(pc, 1, wp, e);
-	if (dst)
-		set_dst(pc, dst, e);
-	else {
-		e->inst[0] |= 0x000001fc;
-		e->inst[1] |= 0x00000008;
-	}
-
-	set_src_0(pc, src0, e);
-	set_src_1(pc, src1, e);
-
-	emit(pc, e);
-
-	if (rdst && mode == 0x80) /* convert to float ? */
-		emit_cvt(pc, rdst, dst, -1, CVT_ABS | CVT_F32_S32);
-	if (rdst && rdst != dst)
-		free_temp(pc, dst);
-}
-
-static INLINE void
-map_tgsi_setop_hw(unsigned op, uint8_t *cc, uint8_t *ty)
-{
-	switch (op) {
-	case TGSI_OPCODE_SLT: *cc = 0x1; *ty = 0x80; break;
-	case TGSI_OPCODE_SGE: *cc = 0x6; *ty = 0x80; break;
-	case TGSI_OPCODE_SEQ: *cc = 0x2; *ty = 0x80; break;
-	case TGSI_OPCODE_SGT: *cc = 0x4; *ty = 0x80; break;
-	case TGSI_OPCODE_SLE: *cc = 0x3; *ty = 0x80; break;
-	case TGSI_OPCODE_SNE: *cc = 0xd; *ty = 0x80; break;
-
-	case TGSI_OPCODE_ISLT: *cc = 0x1; *ty = 0x0c; break;
-	case TGSI_OPCODE_ISGE: *cc = 0x6; *ty = 0x0c; break;
-	case TGSI_OPCODE_USEQ: *cc = 0x2; *ty = 0x04; break;
-	case TGSI_OPCODE_USGE: *cc = 0x6; *ty = 0x04; break;
-	case TGSI_OPCODE_USLT: *cc = 0x1; *ty = 0x04; break;
-	case TGSI_OPCODE_USNE: *cc = 0x5; *ty = 0x04; break;
-	default:
-		assert(0);
-		return;
-	}
-}
-
-static void
-emit_add_b32(struct nv50_pc *pc, struct nv50_reg *dst,
-	     struct nv50_reg *src0, struct nv50_reg *rsrc1)
-{
-	struct nv50_program_exec *e = exec(pc);
-	struct nv50_reg *src1;
-
-	e->inst[0] = 0x20000000;
-
-	alloc_reg(pc, rsrc1);
-	check_swap_src_0_1(pc, &src0, &rsrc1);
-
-	src1 = rsrc1;
-	if (src0->mod & rsrc1->mod & NV50_MOD_NEG) {
-		src1 = temp_temp(pc, e);
-		emit_cvt(pc, src1, rsrc1, -1, CVT_S32_S32);
-	}
-
-	if (!pc->allow32 || src1->hw > 63 ||
-	    (src1->type != P_TEMP && src1->type != P_IMMD))
-		set_long(pc, e);
-
-	set_dst(pc, dst, e);
-	set_src_0(pc, src0, e);
-
-	if (is_long(e)) {
-		e->inst[1] |= 1 << 26;
-		set_src_2(pc, src1, e);
-	} else {
-		e->inst[0] |= 0x8000;
-		if (src1->type == P_IMMD)
-			set_immd(pc, src1, e);
-		else
-			set_src_1(pc, src1, e);
-	}
-
-	if (src0->mod & NV50_MOD_NEG)
-		e->inst[0] |= 1 << 28;
-	else
-	if (src1->mod & NV50_MOD_NEG)
-		e->inst[0] |= 1 << 22;
-
-	emit(pc, e);
-}
-
-static void
-emit_mad_u16(struct nv50_pc *pc, struct nv50_reg *dst,
-	     struct nv50_reg *src0, int lh_0, struct nv50_reg *src1, int lh_1,
-	     struct nv50_reg *src2)
-{
-	struct nv50_program_exec *e = exec(pc);
-
-	e->inst[0] = 0x60000000;
-	if (!pc->allow32)
-		set_long(pc, e);
-	set_dst(pc, dst, e);
-
-	set_half_src(pc, src0, lh_0, e, 9);
-	set_half_src(pc, src1, lh_1, e, 16);
-	alloc_reg(pc, src2);
-	if (is_long(e) || (src2->type != P_TEMP) || (src2->hw != dst->hw))
-		set_src_2(pc, src2, e);
-
-	emit(pc, e);
-}
-
-static void
-emit_mul_u16(struct nv50_pc *pc, struct nv50_reg *dst,
-	     struct nv50_reg *src0, int lh_0, struct nv50_reg *src1, int lh_1)
-{
-	struct nv50_program_exec *e = exec(pc);
-
-	e->inst[0] = 0x40000000;
-	set_long(pc, e);
-	set_dst(pc, dst, e);
-
-	set_half_src(pc, src0, lh_0, e, 9);
-	set_half_src(pc, src1, lh_1, e, 16);
-
-	emit(pc, e);
-}
-
-static void
-emit_sad(struct nv50_pc *pc, struct nv50_reg *dst,
-	 struct nv50_reg *src0, struct nv50_reg *src1, struct nv50_reg *src2)
-{
-	struct nv50_program_exec *e = exec(pc);
-
-	e->inst[0] = 0x50000000;
-	if (!pc->allow32)
-		set_long(pc, e);
-	check_swap_src_0_1(pc, &src0, &src1);
-	set_dst(pc, dst, e);
-	set_src_0(pc, src0, e);
-	set_src_1(pc, src1, e);
-	alloc_reg(pc, src2);
-	if (is_long(e) || (src2->type != dst->type) || (src2->hw != dst->hw))
-		set_src_2(pc, src2, e);
-
-	if (is_long(e))
-		e->inst[1] |= 0x0c << 24;
-	else
-		e->inst[0] |= 0x81 << 8;
-
-	emit(pc, e);
-}
-
-static INLINE void
-emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
-{
-	emit_cvt(pc, dst, src, -1, CVT_FLOOR | CVT_F32_F32 | CVT_RI);
-}
-
-static void
-emit_pow(struct nv50_pc *pc, struct nv50_reg *dst,
-	 struct nv50_reg *v, struct nv50_reg *e)
-{
-	struct nv50_reg *temp = alloc_temp(pc, NULL);
-
-	emit_flop(pc, NV50_FLOP_LG2, temp, v);
-	emit_mul(pc, temp, temp, e);
-	emit_preex2(pc, temp, temp);
-	emit_flop(pc, NV50_FLOP_EX2, dst, temp);
-
-	free_temp(pc, temp);
-}
-
-static INLINE void
-emit_sat(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
-{
-	emit_cvt(pc, dst, src, -1, CVT_SAT | CVT_F32_F32);
-}
-
-static void
-emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
-	 struct nv50_reg **src)
-{
-	struct nv50_reg *one = alloc_immd(pc, 1.0);
-	struct nv50_reg *zero = alloc_immd(pc, 0.0);
-	struct nv50_reg *neg128 = alloc_immd(pc, -127.999999);
-	struct nv50_reg *pos128 = alloc_immd(pc,  127.999999);
-	struct nv50_reg *tmp[4] = { 0 };
-	boolean allow32 = pc->allow32;
-
-	pc->allow32 = FALSE;
-
-	if (mask & (3 << 1)) {
-		tmp[0] = alloc_temp(pc, NULL);
-		emit_minmax(pc, NV50_MAX_F32, tmp[0], src[0], zero);
-	}
-
-	if (mask & (1 << 2)) {
-		set_pred_wr(pc, 1, 0, pc->p->exec_tail);
-
-		tmp[1] = temp_temp(pc, NULL);
-		emit_minmax(pc, NV50_MAX_F32, tmp[1], src[1], zero);
-
-		tmp[3] = temp_temp(pc, NULL);
-		emit_minmax(pc, NV50_MAX_F32, tmp[3], src[3], neg128);
-		emit_minmax(pc, NV50_MIN_F32, tmp[3], tmp[3], pos128);
-
-		emit_pow(pc, dst[2], tmp[1], tmp[3]);
-		emit_mov(pc, dst[2], zero);
-		set_pred(pc, 3, 0, pc->p->exec_tail);
-	}
-
-	if (mask & (1 << 1))
-		assimilate_temp(pc, dst[1], tmp[0]);
-	else
-	if (mask & (1 << 2))
-		free_temp(pc, tmp[0]);
-
-	pc->allow32 = allow32;
-
-	/* do this last, in case src[i,j] == dst[0,3] */
-	if (mask & (1 << 0))
-		emit_mov(pc, dst[0], one);
-
-	if (mask & (1 << 3))
-		emit_mov(pc, dst[3], one);
-
-	FREE(pos128);
-	FREE(neg128);
-	FREE(zero);
-	FREE(one);
-}
-
-static void
-emit_kil(struct nv50_pc *pc, struct nv50_reg *src)
-{
-	struct nv50_program_exec *e;
-	const int r_pred = 1;
-
-	e = exec(pc);
-	e->inst[0] = 0x00000002; /* discard */
-	set_long(pc, e); /* sets cond code to ALWAYS */
-
-	if (src) {
-		set_pred(pc, 0x1 /* cc = LT */, r_pred, e);
-		/* write to predicate reg */
-		emit_cvt(pc, NULL, src, r_pred, CVT_F32_F32);
-	}
-
-	emit(pc, e);
-}
-
-static struct nv50_program_exec *
-emit_control_flow(struct nv50_pc *pc, unsigned op, int pred, unsigned cc)
-{
-	struct nv50_program_exec *e = exec(pc);
-
-	e->inst[0] = (op << 28) | 2;
-	set_long(pc, e);
-	if (pred >= 0)
-		set_pred(pc, cc, pred, e);
-
-	emit(pc, e);
-	return e;
-}
-
-static INLINE struct nv50_program_exec *
-emit_breakaddr(struct nv50_pc *pc)
-{
-	return emit_control_flow(pc, 0x4, -1, 0);
-}
-
-static INLINE void
-emit_break(struct nv50_pc *pc, int pred, unsigned cc)
-{
-	emit_control_flow(pc, 0x5, pred, cc);
-}
-
-static INLINE struct nv50_program_exec *
-emit_joinat(struct nv50_pc *pc)
-{
-	return emit_control_flow(pc, 0xa, -1, 0);
-}
-
-static INLINE struct nv50_program_exec *
-emit_branch(struct nv50_pc *pc, int pred, unsigned cc)
-{
-	return emit_control_flow(pc, 0x1, pred, cc);
-}
-
-static INLINE struct nv50_program_exec *
-emit_call(struct nv50_pc *pc, int pred, unsigned cc)
-{
-	return emit_control_flow(pc, 0x2, pred, cc);
-}
-
-static INLINE void
-emit_ret(struct nv50_pc *pc, int pred, unsigned cc)
-{
-	emit_control_flow(pc, 0x3, pred, cc);
-}
-
-static void
-emit_prim_cmd(struct nv50_pc *pc, unsigned cmd)
-{
-	struct nv50_program_exec *e = exec(pc);
-
-	e->inst[0] = 0xf0000000 | (cmd << 9);
-	e->inst[1] = 0xc0000000;
-	set_long(pc, e);
-
-	emit(pc, e);
-}
-
-#define QOP_ADD 0
-#define QOP_SUBR 1
-#define QOP_SUB 2
-#define QOP_MOV_SRC1 3
-
-/* For a quad of threads / top left, top right, bottom left, bottom right
- * pixels, do a different operation, and take src0 from a specific thread.
- */
-static void
-emit_quadop(struct nv50_pc *pc, struct nv50_reg *dst, int wp, int lane_src0,
-	    struct nv50_reg *src0, struct nv50_reg *src1, ubyte qop)
-{
-       struct nv50_program_exec *e = exec(pc);
-
-       e->inst[0] = 0xc0000000;
-       e->inst[1] = 0x80000000;
-       set_long(pc, e);
-       e->inst[0] |= lane_src0 << 16;
-       set_src_0(pc, src0, e);
-       set_src_2(pc, src1, e);
-
-       if (wp >= 0)
-	       set_pred_wr(pc, 1, wp, e);
-
-       if (dst)
-	       set_dst(pc, dst, e);
-       else {
-	       e->inst[0] |= 0x000001fc;
-	       e->inst[1] |= 0x00000008;
-       }
-
-       e->inst[0] |= (qop & 3) << 20;
-       e->inst[1] |= (qop >> 2) << 22;
-
-       emit(pc, e);
-}
-
-static void
-load_cube_tex_coords(struct nv50_pc *pc, struct nv50_reg *t[4],
-		     struct nv50_reg **src, unsigned arg, boolean proj)
-{
-	int mod[3] = { src[0]->mod, src[1]->mod, src[2]->mod };
-
-	src[0]->mod |= NV50_MOD_ABS;
-	src[1]->mod |= NV50_MOD_ABS;
-	src[2]->mod |= NV50_MOD_ABS;
-
-	emit_minmax(pc, NV50_MAX_F32, t[2], src[0], src[1]);
-	emit_minmax(pc, NV50_MAX_F32, t[2], src[2], t[2]);
-
-	src[0]->mod = mod[0];
-	src[1]->mod = mod[1];
-	src[2]->mod = mod[2];
-
-	if (proj && 0 /* looks more correct without this */)
-		emit_mul(pc, t[2], t[2], src[3]);
-	else
-	if (arg == 4) /* there is no textureProj(samplerCubeShadow) */
-		emit_mov(pc, t[3], src[3]);
-
-	emit_flop(pc, NV50_FLOP_RCP, t[2], t[2]);
-
-	emit_mul(pc, t[0], src[0], t[2]);
-	emit_mul(pc, t[1], src[1], t[2]);
-	emit_mul(pc, t[2], src[2], t[2]);
-}
-
-static void
-load_proj_tex_coords(struct nv50_pc *pc, struct nv50_reg *t[4],
-		     struct nv50_reg **src, unsigned dim, unsigned arg)
-{
-	unsigned c, mode;
-
-	if (src[0]->type == P_TEMP && src[0]->rhw != -1) {
-		mode = pc->interp_mode[src[0]->index] | INTERP_PERSPECTIVE;
-
-		t[3]->rhw = src[3]->rhw;
-		emit_interp(pc, t[3], NULL, (mode & INTERP_CENTROID));
-		emit_flop(pc, NV50_FLOP_RCP, t[3], t[3]);
-
-		for (c = 0; c < dim; ++c) {
-			t[c]->rhw = src[c]->rhw;
-			emit_interp(pc, t[c], t[3], mode);
-		}
-		if (arg != dim) { /* depth reference value */
-			t[dim]->rhw = src[2]->rhw;
-			emit_interp(pc, t[dim], t[3], mode);
-		}
-	} else {
-		/* XXX: for some reason the blob sometimes uses MAD
-		 * (mad f32 $rX $rY $rZ neg $r63)
-		 */
-		emit_flop(pc, NV50_FLOP_RCP, t[3], src[3]);
-		for (c = 0; c < dim; ++c)
-			emit_mul(pc, t[c], src[c], t[3]);
-		if (arg != dim) /* depth reference value */
-			emit_mul(pc, t[dim], src[2], t[3]);
-	}
-}
-
-static INLINE void
-get_tex_dim(unsigned type, unsigned *dim, unsigned *arg)
-{
-	switch (type) {
-	case TGSI_TEXTURE_1D:
-		*arg = *dim = 1;
-		break;
-	case TGSI_TEXTURE_SHADOW1D:
-		*dim = 1;
-		*arg = 2;
-		break;
-	case TGSI_TEXTURE_UNKNOWN:
-	case TGSI_TEXTURE_2D:
-	case TGSI_TEXTURE_RECT:
-		*arg = *dim = 2;
-		break;
-	case TGSI_TEXTURE_SHADOW2D:
-	case TGSI_TEXTURE_SHADOWRECT:
-		*dim = 2;
-		*arg = 3;
-		break;
-	case TGSI_TEXTURE_3D:
-	case TGSI_TEXTURE_CUBE:
-		*dim = *arg = 3;
-		break;
-	default:
-		assert(0);
-		break;
-	}
-}
-
-/* We shouldn't execute TEXLOD if any of the pixels in a quad have
- * different LOD values, so branch off groups of equal LOD.
- */
-static void
-emit_texlod_sequence(struct nv50_pc *pc, struct nv50_reg *tlod,
-		     struct nv50_reg *src, struct nv50_program_exec *tex)
-{
-	struct nv50_program_exec *join_at;
-	unsigned i, target = pc->p->exec_size + 9 * 2;
-
-	if (pc->p->type != PIPE_SHADER_FRAGMENT) {
-		emit(pc, tex);
-		return;
-	}
-	pc->allow32 = FALSE;
-
-	/* Subtract lod of each pixel from lod of top left pixel, jump
-	 * texlod insn if result is 0, then repeat for 2 other pixels.
-	 */
-	join_at = emit_joinat(pc);
-	emit_quadop(pc, NULL, 0, 0, tlod, tlod, 0x55);
-	emit_branch(pc, 0, 2)->param.index = target;
-
-	for (i = 1; i < 4; ++i) {
-		emit_quadop(pc, NULL, 0, i, tlod, tlod, 0x55);
-		emit_branch(pc, 0, 2)->param.index = target;
-	}
-
-	emit_mov(pc, tlod, src); /* target */
-	emit(pc, tex); /* texlod */
-
-	join_at->param.index = target + 2 * 2;
-	JOIN_ON(emit_nop(pc)); /* join _after_ tex */
-}
-
-static void
-emit_texbias_sequence(struct nv50_pc *pc, struct nv50_reg *t[4], unsigned arg,
-		      struct nv50_program_exec *tex)
-{
-	struct nv50_program_exec *e;
-	struct nv50_reg imm_1248, *t123[4][4], *r_bits = alloc_temp(pc, NULL);
-	int r_pred = 0;
-	unsigned n, c, i, cc[4] = { 0x0a, 0x13, 0x11, 0x10 };
-
-	pc->allow32 = FALSE;
-	ctor_reg(&imm_1248, P_IMMD, -1, ctor_immd_4u32(pc, 1, 2, 4, 8) * 4);
-
-	/* Subtract bias value of thread i from bias values of each thread,
-	 * store result in r_pred, and set bit i in r_bits if result was 0.
-	 */
-	assert(arg < 4);
-	for (i = 0; i < 4; ++i, ++imm_1248.hw) {
-		emit_quadop(pc, NULL, r_pred, i, t[arg], t[arg], 0x55);
-		emit_mov(pc, r_bits, &imm_1248);
-		set_pred(pc, 2, r_pred, pc->p->exec_tail);
-	}
-	emit_mov_to_pred(pc, r_pred, r_bits);
-
-	/* The lanes of a quad are now grouped by the bit in r_pred they have
-	 * set. Put the input values for TEX into a new register set for each
-	 * group and execute TEX only for a specific group.
-	 * We cannot use the same register set for each group because we need
-	 * the derivatives, which are implicitly calculated, to be correct.
-	 */
-	for (i = 1; i < 4; ++i) {
-		alloc_temp4(pc, t123[i], 0);
-
-		for (c = 0; c <= arg; ++c)
-			emit_mov(pc, t123[i][c], t[c]);
-
-		*(e = exec(pc)) = *(tex);
-		e->inst[0] &= ~0x01fc;
-		set_dst(pc, t123[i][0], e);
-		set_pred(pc, cc[i], r_pred, e);
-		emit(pc, e);
-	}
-	/* finally TEX on the original regs (where we kept the input) */
-	set_pred(pc, cc[0], r_pred, tex);
-	emit(pc, tex);
-
-	/* put the 3 * n other results into regs for lane 0 */
-	n = popcnt4(((e->inst[0] >> 25) & 0x3) | ((e->inst[1] >> 12) & 0xc));
-	for (i = 1; i < 4; ++i) {
-		for (c = 0; c < n; ++c) {
-			emit_mov(pc, t[c], t123[i][c]);
-			set_pred(pc, cc[i], r_pred, pc->p->exec_tail);
-		}
-		free_temp4(pc, t123[i]);
-	}
-
-	emit_nop(pc);
-	free_temp(pc, r_bits);
-}
-
-static void
-emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
-	 struct nv50_reg **src, unsigned unit, unsigned type,
-	 boolean proj, int bias_lod)
-{
-	struct nv50_reg *t[4];
-	struct nv50_program_exec *e;
-	unsigned c, dim, arg;
-
-	/* t[i] must be within a single 128 bit super-reg */
-	alloc_temp4(pc, t, 0);
-
-	e = exec(pc);
-	e->inst[0] = 0xf0000000;
-	set_long(pc, e);
-	set_dst(pc, t[0], e);
-
-	/* TIC and TSC binding indices (TSC is ignored as TSC_LINKED = TRUE): */
-	e->inst[0] |= (unit << 9) /* | (unit << 17) */;
-
-	/* live flag (don't set if TEX results affect input to another TEX): */
-	/* e->inst[0] |= 0x00000004; */
-
-	get_tex_dim(type, &dim, &arg);
-
-	if (type == TGSI_TEXTURE_CUBE) {
-		e->inst[0] |= 0x08000000;
-		load_cube_tex_coords(pc, t, src, arg, proj);
-	} else
-	if (proj)
-		load_proj_tex_coords(pc, t, src, dim, arg);
-	else {
-		for (c = 0; c < dim; c++)
-			emit_mov(pc, t[c], src[c]);
-		if (arg != dim) /* depth reference value (always src.z here) */
-			emit_mov(pc, t[dim], src[2]);
-	}
-
-	e->inst[0] |= (mask & 0x3) << 25;
-	e->inst[1] |= (mask & 0xc) << 12;
-
-	if (!bias_lod) {
-		e->inst[0] |= (arg - 1) << 22;
-		emit(pc, e);
-	} else
-	if (bias_lod < 0) {
-		assert(pc->p->type == PIPE_SHADER_FRAGMENT);
-		e->inst[0] |= arg << 22;
-		e->inst[1] |= 0x20000000; /* texbias */
-		emit_mov(pc, t[arg], src[3]);
-		emit_texbias_sequence(pc, t, arg, e);
-	} else {
-		e->inst[0] |= arg << 22;
-		e->inst[1] |= 0x40000000; /* texlod */
-		emit_mov(pc, t[arg], src[3]);
-		emit_texlod_sequence(pc, t[arg], src[3], e);
-	}
-
-#if 1
-	c = 0;
-	if (mask & 1) emit_mov(pc, dst[0], t[c++]);
-	if (mask & 2) emit_mov(pc, dst[1], t[c++]);
-	if (mask & 4) emit_mov(pc, dst[2], t[c++]);
-	if (mask & 8) emit_mov(pc, dst[3], t[c]);
-
-	free_temp4(pc, t);
-#else
-	/* XXX: if p.e. MUL is used directly after TEX, it would still use
-	 * the texture coordinates, not the fetched values: latency ? */
-
-	for (c = 0; c < 4; c++) {
-		if (mask & (1 << c))
-			assimilate_temp(pc, dst[c], t[c]);
-		else
-			free_temp(pc, t[c]);
-	}
-#endif
-}
-
-static void
-emit_ddx(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
-{
-	struct nv50_program_exec *e = exec(pc);
-
-	assert(src->type == P_TEMP);
-
-	e->inst[0] = (src->mod & NV50_MOD_NEG) ? 0xc0240000 : 0xc0140000;
-	e->inst[1] = (src->mod & NV50_MOD_NEG) ? 0x86400000 : 0x89800000;
-	set_long(pc, e);
-	set_dst(pc, dst, e);
-	set_src_0(pc, src, e);
-	set_src_2(pc, src, e);
-
-	emit(pc, e);
-}
-
-static void
-emit_ddy(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
-{
-	struct nv50_program_exec *e = exec(pc);
-
-	assert(src->type == P_TEMP);
-
-	e->inst[0] = (src->mod & NV50_MOD_NEG) ? 0xc0250000 : 0xc0150000;
-	e->inst[1] = (src->mod & NV50_MOD_NEG) ? 0x85800000 : 0x8a400000;
-	set_long(pc, e);
-	set_dst(pc, dst, e);
-	set_src_0(pc, src, e);
-	set_src_2(pc, src, e);
-
-	emit(pc, e);
-}
-
-static void
-convert_to_long(struct nv50_pc *pc, struct nv50_program_exec *e)
-{
-	unsigned q = 0, m = ~0;
-
-	assert(!is_long(e));
-
-	switch (e->inst[0] >> 28) {
-	case 0x1:
-		/* MOV */
-		q = 0x0403c000;
-		m = 0xffff7fff;
-		break;
-	case 0x2:
-	case 0x3:
-		/* ADD, SUB, SUBR b32 */
-		m = ~(0x8000 | (127 << 16));
-		q = ((e->inst[0] & (~m)) >> 2) | (1 << 26);
-		break;
-	case 0x5:
-		/* SAD */
-		m = ~(0x81 << 8);
-		q = (0x0c << 24) | ((e->inst[0] & (0x7f << 2)) << 12);
-		break;
-	case 0x6:
-		/* MAD u16 */
-		q = (e->inst[0] & (0x7f << 2)) << 12;
-		break;
-	case 0x8:
-		/* INTERP (move centroid, perspective and flat bits) */
-		m = ~0x03000100;
-		q = (e->inst[0] & (3 << 24)) >> (24 - 16);
-		q |= (e->inst[0] & (1 << 8)) << (18 - 8);
-		break;
-	case 0x9:
-		/* RCP */
-		break;
-	case 0xB:
-		/* ADD */
-		m = ~(127 << 16);
-		q = ((e->inst[0] & (~m)) >> 2);
-		break;
-	case 0xC:
-		/* MUL */
-		m = ~0x00008000;
-		q = ((e->inst[0] & (~m)) << 12);
-		break;
-	case 0xE:
-		/* MAD (if src2 == dst) */
-		q = ((e->inst[0] & 0x1fc) << 12);
-		break;
-	default:
-		assert(0);
-		break;
-	}
-
-	set_long(pc, e);
-	pc->p->exec_size++;
-
-	e->inst[0] &= m;
-	e->inst[1] |= q;
-}
-
-/* Some operations support an optional negation flag. */
-static int
-get_supported_mods(const struct tgsi_full_instruction *insn, int i)
-{
-	switch (insn->Instruction.Opcode) {
-	case TGSI_OPCODE_ADD:
-	case TGSI_OPCODE_COS:
-	case TGSI_OPCODE_DDX:
-	case TGSI_OPCODE_DDY:
-	case TGSI_OPCODE_DP3:
-	case TGSI_OPCODE_DP4:
-	case TGSI_OPCODE_EX2:
-	case TGSI_OPCODE_KIL:
-	case TGSI_OPCODE_LG2:
-	case TGSI_OPCODE_MAD:
-	case TGSI_OPCODE_MUL:
-	case TGSI_OPCODE_POW:
-	case TGSI_OPCODE_RCP:
-	case TGSI_OPCODE_RSQ: /* ignored, RSQ = rsqrt(abs(src.x)) */
-	case TGSI_OPCODE_SCS:
-	case TGSI_OPCODE_SIN:
-	case TGSI_OPCODE_SUB:
-		return NV50_MOD_NEG;
-	case TGSI_OPCODE_MAX:
-	case TGSI_OPCODE_MIN:
-	case TGSI_OPCODE_INEG: /* tgsi src sign toggle/set would be stupid */
-		return NV50_MOD_ABS;
-	case TGSI_OPCODE_CEIL:
-	case TGSI_OPCODE_FLR:
-	case TGSI_OPCODE_TRUNC:
-		return NV50_MOD_NEG | NV50_MOD_ABS;
-	case TGSI_OPCODE_F2I:
-	case TGSI_OPCODE_F2U:
-	case TGSI_OPCODE_I2F:
-	case TGSI_OPCODE_U2F:
-		return NV50_MOD_NEG | NV50_MOD_ABS | NV50_MOD_I32;
-	case TGSI_OPCODE_UADD:
-		return NV50_MOD_NEG | NV50_MOD_I32;
-	case TGSI_OPCODE_SAD:
-	case TGSI_OPCODE_SHL:
-	case TGSI_OPCODE_IMAX:
-	case TGSI_OPCODE_IMIN:
-	case TGSI_OPCODE_ISHR:
-	case TGSI_OPCODE_NOT:
-	case TGSI_OPCODE_UMAD:
-	case TGSI_OPCODE_UMAX:
-	case TGSI_OPCODE_UMIN:
-	case TGSI_OPCODE_UMUL:
-	case TGSI_OPCODE_USHR:
-		return NV50_MOD_I32;
-	default:
-		return 0;
-	}
-}
-
-/* Return a read mask for source registers deduced from opcode & write mask. */
-static unsigned
-nv50_tgsi_src_mask(const struct tgsi_full_instruction *insn, int c)
-{
-	unsigned x, mask = insn->Dst[0].Register.WriteMask;
-
-	switch (insn->Instruction.Opcode) {
-	case TGSI_OPCODE_COS:
-	case TGSI_OPCODE_SIN:
-		return (mask & 0x8) | ((mask & 0x7) ? 0x1 : 0x0);
-	case TGSI_OPCODE_DP3:
-		return 0x7;
-	case TGSI_OPCODE_DP4:
-	case TGSI_OPCODE_DPH:
-	case TGSI_OPCODE_KIL: /* WriteMask ignored */
-		return 0xf;
-	case TGSI_OPCODE_DST:
-		return mask & (c ? 0xa : 0x6);
-	case TGSI_OPCODE_EX2:
-	case TGSI_OPCODE_EXP:
-	case TGSI_OPCODE_LG2:
-	case TGSI_OPCODE_LOG:
-	case TGSI_OPCODE_POW:
-	case TGSI_OPCODE_RCP:
-	case TGSI_OPCODE_RSQ:
-	case TGSI_OPCODE_SCS:
-		return 0x1;
-	case TGSI_OPCODE_IF:
-		return 0x1;
-	case TGSI_OPCODE_LIT:
-		return 0xb;
-	case TGSI_OPCODE_TEX:
-	case TGSI_OPCODE_TXB:
-	case TGSI_OPCODE_TXL:
-	case TGSI_OPCODE_TXP:
-	{
-		const struct tgsi_instruction_texture *tex;
-
-		assert(insn->Instruction.Texture);
-		tex = &insn->Texture;
-
-		mask = 0x7;
-		if (insn->Instruction.Opcode != TGSI_OPCODE_TEX &&
-		    insn->Instruction.Opcode != TGSI_OPCODE_TXD)
-			mask |= 0x8; /* bias, lod or proj */
-
-		switch (tex->Texture) {
-		case TGSI_TEXTURE_1D:
-			mask &= 0x9;
-			break;
-		case TGSI_TEXTURE_SHADOW1D:
-			mask &= 0x5;
-			break;
-		case TGSI_TEXTURE_2D:
-			mask &= 0xb;
-			break;
-		default:
-			break;
-		}
-	}
-		return mask;
-	case TGSI_OPCODE_XPD:
-		x = 0;
-		if (mask & 1) x |= 0x6;
-		if (mask & 2) x |= 0x5;
-		if (mask & 4) x |= 0x3;
-		return x;
-	default:
-		break;
-	}
-
-	return mask;
-}
-
-static struct nv50_reg *
-tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst)
-{
-	switch (dst->Register.File) {
-	case TGSI_FILE_TEMPORARY:
-		return &pc->temp[dst->Register.Index * 4 + c];
-	case TGSI_FILE_OUTPUT:
-		return &pc->result[dst->Register.Index * 4 + c];
-	case TGSI_FILE_ADDRESS:
-	{
-		struct nv50_reg *r = pc->addr[dst->Register.Index * 4 + c];
-		if (!r) {
-			r = get_address_reg(pc, NULL);
-			r->index = dst->Register.Index * 4 + c;
-			pc->addr[r->index] = r;
-		}
-		assert(r);
-		return r;
-	}
-	case TGSI_FILE_NULL:
-		return NULL;
-	case TGSI_FILE_SYSTEM_VALUE:
-		assert(pc->sysval[dst->Register.Index].type == P_RESULT);
-		assert(c == 0);
-		return &pc->sysval[dst->Register.Index];
-	default:
-		break;
-	}
-
-	return NULL;
-}
-
-static struct nv50_reg *
-tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src,
-	 int mod)
-{
-	struct nv50_reg *r = NULL;
-	struct nv50_reg *temp = NULL;
-	unsigned sgn, c, swz, cvn;
-
-	if (src->Register.File != TGSI_FILE_CONSTANT)
-		assert(!src->Register.Indirect);
-
-	sgn = tgsi_util_get_full_src_register_sign_mode(src, chan);
-
-	c = tgsi_util_get_full_src_register_swizzle(src, chan);
-	switch (c) {
-	case TGSI_SWIZZLE_X:
-	case TGSI_SWIZZLE_Y:
-	case TGSI_SWIZZLE_Z:
-	case TGSI_SWIZZLE_W:
-		switch (src->Register.File) {
-		case TGSI_FILE_INPUT:
-			r = &pc->attr[src->Register.Index * 4 + c];
-
-			if (!src->Dimension.Dimension)
-				break;
-			r = reg_instance(pc, r);
-			r->vtx = src->Dimension.Index;
-
-			if (!src->Dimension.Indirect)
-				break;
-			swz = tgsi_util_get_src_register_swizzle(
-				&src->DimIndirect, 0);
-			r->acc = -1;
-			r->indirect[1] = src->DimIndirect.Index * 4 + swz;
-			break;
-		case TGSI_FILE_TEMPORARY:
-			r = &pc->temp[src->Register.Index * 4 + c];
-			break;
-		case TGSI_FILE_CONSTANT:
-			if (!src->Register.Indirect) {
-				r = &pc->param[src->Register.Index * 4 + c];
-				break;
-			}
-			/* Indicate indirection by setting r->acc < 0 and
-			 * use the index field to select the address reg.
-			 */
-			r = reg_instance(pc, NULL);
-			ctor_reg(r, P_CONST, -1, src->Register.Index * 4 + c);
-
-			swz = tgsi_util_get_src_register_swizzle(
-				&src->Indirect, 0);
-			r->acc = -1;
-			r->indirect[0] = src->Indirect.Index * 4 + swz;
-			break;
-		case TGSI_FILE_IMMEDIATE:
-			r = &pc->immd[src->Register.Index * 4 + c];
-			break;
-		case TGSI_FILE_SAMPLER:
-			return NULL;
-		case TGSI_FILE_ADDRESS:
-			r = pc->addr[src->Register.Index * 4 + c];
-			assert(r);
-			break;
-		case TGSI_FILE_SYSTEM_VALUE:
-			assert(c == 0);
-			r = &pc->sysval[src->Register.Index];
-			break;
-		default:
-			assert(0);
-			break;
-		}
-		break;
-	default:
-		assert(0);
-		break;
-	}
-
-	cvn = (mod & NV50_MOD_I32) ? CVT_S32_S32 : CVT_F32_F32;
-
-	switch (sgn) {
-	case TGSI_UTIL_SIGN_CLEAR:
-		r->mod = NV50_MOD_ABS;
-		break;
-	case TGSI_UTIL_SIGN_SET:
-		r->mod = NV50_MOD_NEG_ABS;
-		break;
-	case TGSI_UTIL_SIGN_TOGGLE:
-		r->mod = NV50_MOD_NEG;
-		break;
-	default:
-		assert(!r->mod && sgn == TGSI_UTIL_SIGN_KEEP);
-		break;
-	}
-
-	if ((r->mod & mod) != r->mod) {
-		temp = temp_temp(pc, NULL);
-		emit_cvt(pc, temp, r, -1, cvn);
-		r->mod = 0;
-		r = temp;
-	} else
-		r->mod |= mod & NV50_MOD_I32;
-
-	assert(r);
-	if (r->acc >= 0 && r->vtx < 0 && r != temp)
-		return reg_instance(pc, r); /* will clear r->mod */
-	return r;
-}
-
-/* return TRUE for ops that produce only a single result */
-static boolean
-is_scalar_op(unsigned op)
-{
-	switch (op) {
-	case TGSI_OPCODE_COS:
-	case TGSI_OPCODE_DP2:
-	case TGSI_OPCODE_DP3:
-	case TGSI_OPCODE_DP4:
-	case TGSI_OPCODE_DPH:
-	case TGSI_OPCODE_EX2:
-	case TGSI_OPCODE_LG2:
-	case TGSI_OPCODE_POW:
-	case TGSI_OPCODE_RCP:
-	case TGSI_OPCODE_RSQ:
-	case TGSI_OPCODE_SIN:
-		/*
-	case TGSI_OPCODE_KIL:
-	case TGSI_OPCODE_LIT:
-	case TGSI_OPCODE_SCS:
-		*/
-		return TRUE;
-	default:
-		return FALSE;
-	}
-}
-
-/* Returns a bitmask indicating which dst components depend
- * on source s, component c (reverse of nv50_tgsi_src_mask).
- */
-static unsigned
-nv50_tgsi_dst_revdep(unsigned op, int s, int c)
-{
-	if (is_scalar_op(op))
-		return 0x1;
-
-	switch (op) {
-	case TGSI_OPCODE_DST:
-		return (1 << c) & (s ? 0xa : 0x6);
-	case TGSI_OPCODE_XPD:
-		switch (c) {
-		case 0: return 0x6;
-		case 1: return 0x5;
-		case 2: return 0x3;
-		case 3: return 0x0;
-		default:
-			assert(0);
-			return 0x0;
-		}
-	case TGSI_OPCODE_EXP:
-	case TGSI_OPCODE_LOG:
-	case TGSI_OPCODE_LIT:
-	case TGSI_OPCODE_SCS:
-	case TGSI_OPCODE_TEX:
-	case TGSI_OPCODE_TXB:
-	case TGSI_OPCODE_TXL:
-	case TGSI_OPCODE_TXP:
-		/* these take care of dangerous swizzles themselves */
-		return 0x0;
-	case TGSI_OPCODE_IF:
-	case TGSI_OPCODE_KIL:
-		/* don't call this function for these ops */
-		assert(0);
-		return 0;
-	default:
-		/* linear vector instruction */
-		return (1 << c);
-	}
-}
-
-static INLINE boolean
-has_pred(struct nv50_program_exec *e, unsigned cc)
-{
-	if (!is_long(e) || is_immd(e))
-		return FALSE;
-	return ((e->inst[1] & 0x780) == (cc << 7));
-}
-
-/* on ENDIF see if we can do "@p0.neu single_op" instead of:
- *        join_at ENDIF
- *        @p0.eq bra ENDIF
- *        single_op
- * ENDIF: nop.join
- */
-static boolean
-nv50_kill_branch(struct nv50_pc *pc)
-{
-	int lvl = pc->if_lvl;
-
-	if (pc->if_insn[lvl]->next != pc->p->exec_tail)
-		return FALSE;
-	if (is_immd(pc->p->exec_tail))
-		return FALSE;
-
-	/* if ccode == 'true', the BRA is from an ELSE and the predicate
-	 * reg may no longer be valid, since we currently always use $p0
-	 */
-	if (has_pred(pc->if_insn[lvl], 0xf))
-		return FALSE;
-	assert(pc->if_insn[lvl] && pc->if_join[lvl]);
-
-	/* We'll use the exec allocated for JOIN_AT (we can't easily
-	 * access nv50_program_exec's prev).
-	 */
-	pc->p->exec_size -= 4; /* remove JOIN_AT and BRA */
-
-	*pc->if_join[lvl] = *pc->p->exec_tail;
-
-	FREE(pc->if_insn[lvl]);
-	FREE(pc->p->exec_tail);
-
-	pc->p->exec_tail = pc->if_join[lvl];
-	pc->p->exec_tail->next = NULL;
-	set_pred(pc, 0xd, 0, pc->p->exec_tail);
-
-	return TRUE;
-}
-
-static void
-nv50_fp_move_results(struct nv50_pc *pc)
-{
-	struct nv50_reg reg;
-	unsigned i;
-
-	ctor_reg(&reg, P_TEMP, -1, -1);
-
-	for (i = 0; i < pc->result_nr * 4; ++i) {
-		if (pc->result[i].rhw < 0 || pc->result[i].hw < 0)
-			continue;
-		if (pc->result[i].rhw != pc->result[i].hw) {
-			reg.hw = pc->result[i].rhw;
-			emit_mov(pc, &reg, &pc->result[i]);
-		}
-	}
-}
-
-static boolean
-nv50_program_tx_insn(struct nv50_pc *pc,
-		     const struct tgsi_full_instruction *inst)
-{
-	struct nv50_reg *rdst[4], *dst[4], *brdc, *src[3][4], *temp;
-	unsigned mask, sat, unit = 0;
-	int i, c;
-
-	mask = inst->Dst[0].Register.WriteMask;
-	sat = inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE;
-
-	memset(src, 0, sizeof(src));
-
-	for (c = 0; c < 4; c++) {
-		if ((mask & (1 << c)) && !pc->r_dst[c])
-			dst[c] = tgsi_dst(pc, c, &inst->Dst[0]);
-		else
-			dst[c] = pc->r_dst[c];
-		rdst[c] = dst[c];
-	}
-
-	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
-		const struct tgsi_full_src_register *fs = &inst->Src[i];
-		unsigned src_mask;
-		int mod_supp;
-
-		src_mask = nv50_tgsi_src_mask(inst, i);
-		mod_supp = get_supported_mods(inst, i);
-
-		if (fs->Register.File == TGSI_FILE_SAMPLER)
-			unit = fs->Register.Index;
-
-		for (c = 0; c < 4; c++)
-			if (src_mask & (1 << c))
-				src[i][c] = tgsi_src(pc, c, fs, mod_supp);
-	}
-
-	brdc = temp = pc->r_brdc;
-	if (brdc && brdc->type != P_TEMP) {
-		temp = temp_temp(pc, NULL);
-		if (sat)
-			brdc = temp;
-	} else
-	if (sat) {
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)) || dst[c]->type == P_TEMP)
-				continue;
-			/* rdst[c] = dst[c]; */ /* done above */
-			dst[c] = temp_temp(pc, NULL);
-		}
-	}
-
-	assert(brdc || !is_scalar_op(inst->Instruction.Opcode));
-
-	switch (inst->Instruction.Opcode) {
-	case TGSI_OPCODE_ABS:
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_cvt(pc, dst[c], src[0][c], -1,
-				 CVT_ABS | CVT_F32_F32);
-		}
-		break;
-	case TGSI_OPCODE_ADD:
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_add(pc, dst[c], src[0][c], src[1][c]);
-		}
-		break;
-	case TGSI_OPCODE_AND:
-	case TGSI_OPCODE_XOR:
-	case TGSI_OPCODE_OR:
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_bitop2(pc, dst[c], src[0][c], src[1][c],
-				    inst->Instruction.Opcode);
-		}
-		break;
-	case TGSI_OPCODE_ARL:
-		temp = temp_temp(pc, NULL);
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_cvt(pc, temp, src[0][c], -1,
-				 CVT_FLOOR | CVT_S32_F32);
-			emit_arl(pc, dst[c], temp, 4);
-		}
-		break;
-	case TGSI_OPCODE_BGNLOOP:
-		pc->loop_brka[pc->loop_lvl] = emit_breakaddr(pc);
-		pc->loop_pos[pc->loop_lvl++] = pc->p->exec_size;
-		terminate_mbb(pc);
-		break;
-	case TGSI_OPCODE_BGNSUB:
-		assert(!pc->in_subroutine);
-		pc->in_subroutine = TRUE;
-		/* probably not necessary, but align to 8 byte boundary */
-		if (!is_long(pc->p->exec_tail))
-			convert_to_long(pc, pc->p->exec_tail);
-		break;
-	case TGSI_OPCODE_BRK:
-		assert(pc->loop_lvl > 0);
-		emit_break(pc, -1, 0);
-		break;
-	case TGSI_OPCODE_CAL:
-		assert(inst->Label.Label < pc->insn_nr);
-		emit_call(pc, -1, 0)->param.index = inst->Label.Label;
-		/* replaced by actual offset in nv50_program_fixup_insns */
-		break;
-	case TGSI_OPCODE_CEIL:
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_cvt(pc, dst[c], src[0][c], -1,
-				 CVT_CEIL | CVT_F32_F32 | CVT_RI);
-		}
-		break;
-	case TGSI_OPCODE_CMP:
-		pc->allow32 = FALSE;
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_cvt(pc, NULL, src[0][c], 1, CVT_F32_F32);
-			emit_mov(pc, dst[c], src[1][c]);
-			set_pred(pc, 0x1, 1, pc->p->exec_tail); /* @SF */
-			emit_mov(pc, dst[c], src[2][c]);
-			set_pred(pc, 0x6, 1, pc->p->exec_tail); /* @NSF */
-		}
-		break;
-	case TGSI_OPCODE_CONT:
-		assert(pc->loop_lvl > 0);
-		emit_branch(pc, -1, 0)->param.index =
-			pc->loop_pos[pc->loop_lvl - 1];
-		break;
-	case TGSI_OPCODE_COS:
-		if (mask & 8) {
-			emit_precossin(pc, temp, src[0][3]);
-			emit_flop(pc, NV50_FLOP_COS, dst[3], temp);
-			if (!(mask &= 7))
-				break;
-			if (temp == dst[3])
-				temp = brdc = temp_temp(pc, NULL);
-		}
-		emit_precossin(pc, temp, src[0][0]);
-		emit_flop(pc, NV50_FLOP_COS, brdc, temp);
-		break;
-	case TGSI_OPCODE_DDX:
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_ddx(pc, dst[c], src[0][c]);
-		}
-		break;
-	case TGSI_OPCODE_DDY:
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_ddy(pc, dst[c], src[0][c]);
-		}
-		break;
-	case TGSI_OPCODE_DP3:
-		emit_mul(pc, temp, src[0][0], src[1][0]);
-		emit_mad(pc, temp, src[0][1], src[1][1], temp);
-		emit_mad(pc, brdc, src[0][2], src[1][2], temp);
-		break;
-	case TGSI_OPCODE_DP4:
-		emit_mul(pc, temp, src[0][0], src[1][0]);
-		emit_mad(pc, temp, src[0][1], src[1][1], temp);
-		emit_mad(pc, temp, src[0][2], src[1][2], temp);
-		emit_mad(pc, brdc, src[0][3], src[1][3], temp);
-		break;
-	case TGSI_OPCODE_DPH:
-		emit_mul(pc, temp, src[0][0], src[1][0]);
-		emit_mad(pc, temp, src[0][1], src[1][1], temp);
-		emit_mad(pc, temp, src[0][2], src[1][2], temp);
-		emit_add(pc, brdc, src[1][3], temp);
-		break;
-	case TGSI_OPCODE_DST:
-		if (mask & (1 << 1))
-			emit_mul(pc, dst[1], src[0][1], src[1][1]);
-		if (mask & (1 << 2))
-			emit_mov(pc, dst[2], src[0][2]);
-		if (mask & (1 << 3))
-			emit_mov(pc, dst[3], src[1][3]);
-		if (mask & (1 << 0))
-			emit_mov_immdval(pc, dst[0], 1.0f);
-		break;
-	case TGSI_OPCODE_ELSE:
-		emit_branch(pc, -1, 0);
-		pc->if_insn[--pc->if_lvl]->param.index = pc->p->exec_size;
-		pc->if_insn[pc->if_lvl++] = pc->p->exec_tail;
-		terminate_mbb(pc);
-		break;
-	case TGSI_OPCODE_EMIT:
-		emit_prim_cmd(pc, 1);
-		break;
-	case TGSI_OPCODE_ENDIF:
-		pc->if_insn[--pc->if_lvl]->param.index = pc->p->exec_size;
-
-		/* try to replace branch over 1 insn with a predicated insn */
-		if (nv50_kill_branch(pc) == TRUE)
-			break;
-
-		if (pc->if_join[pc->if_lvl]) {
-			pc->if_join[pc->if_lvl]->param.index = pc->p->exec_size;
-			pc->if_join[pc->if_lvl] = NULL;
-		}
-		terminate_mbb(pc);
-		/* emit a NOP as join point, we could set it on the next
-		 * one, but would have to make sure it is long and !immd
-		 */
-		JOIN_ON(emit_nop(pc));
-		break;
-	case TGSI_OPCODE_ENDLOOP:
-		emit_branch(pc, -1, 0)->param.index =
-			pc->loop_pos[--pc->loop_lvl];
-		pc->loop_brka[pc->loop_lvl]->param.index = pc->p->exec_size;
-		terminate_mbb(pc);
-		break;
-	case TGSI_OPCODE_ENDPRIM:
-		emit_prim_cmd(pc, 2);
-		break;
-	case TGSI_OPCODE_ENDSUB:
-		assert(pc->in_subroutine);
-		terminate_mbb(pc);
-		pc->in_subroutine = FALSE;
-		break;
-	case TGSI_OPCODE_EX2:
-		emit_preex2(pc, temp, src[0][0]);
-		emit_flop(pc, NV50_FLOP_EX2, brdc, temp);
-		break;
-	case TGSI_OPCODE_EXP:
-	{
-		struct nv50_reg *t[2];
-
-		assert(!temp);
-		t[0] = temp_temp(pc, NULL);
-		t[1] = temp_temp(pc, NULL);
-
-		if (mask & 0x6)
-			emit_mov(pc, t[0], src[0][0]);
-		if (mask & 0x3)
-			emit_flr(pc, t[1], src[0][0]);
-
-		if (mask & (1 << 1))
-			emit_sub(pc, dst[1], t[0], t[1]);
-		if (mask & (1 << 0)) {
-			emit_preex2(pc, t[1], t[1]);
-			emit_flop(pc, NV50_FLOP_EX2, dst[0], t[1]);
-		}
-		if (mask & (1 << 2)) {
-			emit_preex2(pc, t[0], t[0]);
-			emit_flop(pc, NV50_FLOP_EX2, dst[2], t[0]);
-		}
-		if (mask & (1 << 3))
-			emit_mov_immdval(pc, dst[3], 1.0f);
-	}
-		break;
-	case TGSI_OPCODE_F2I:
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_cvt(pc, dst[c], src[0][c], -1,
-				 CVT_TRUNC | CVT_S32_F32);
-		}
-		break;
-	case TGSI_OPCODE_F2U:
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_cvt(pc, dst[c], src[0][c], -1,
-				 CVT_TRUNC | CVT_U32_F32);
-		}
-		break;
-	case TGSI_OPCODE_FLR:
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_flr(pc, dst[c], src[0][c]);
-		}
-		break;
-	case TGSI_OPCODE_FRC:
-		temp = temp_temp(pc, NULL);
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_flr(pc, temp, src[0][c]);
-			emit_sub(pc, dst[c], src[0][c], temp);
-		}
-		break;
-	case TGSI_OPCODE_I2F:
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_cvt(pc, dst[c], src[0][c], -1, CVT_F32_S32);
-		}
-		break;
-	case TGSI_OPCODE_IF:
-		assert(pc->if_lvl < NV50_MAX_COND_NESTING);
-		emit_cvt(pc, NULL, src[0][0], 0, CVT_ABS | CVT_F32_F32);
-		pc->if_join[pc->if_lvl] = emit_joinat(pc);
-		pc->if_insn[pc->if_lvl++] = emit_branch(pc, 0, 2);;
-		terminate_mbb(pc);
-		break;
-	case TGSI_OPCODE_IMAX:
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_minmax(pc, 0x08c, dst[c], src[0][c], src[1][c]);
-		}
-		break;
-	case TGSI_OPCODE_IMIN:
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_minmax(pc, 0x0ac, dst[c], src[0][c], src[1][c]);
-		}
-		break;
-	case TGSI_OPCODE_INEG:
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_cvt(pc, dst[c], src[0][c], -1,
-				 CVT_S32_S32 | CVT_NEG);
-		}
-		break;
-	case TGSI_OPCODE_KIL:
-		assert(src[0][0] && src[0][1] && src[0][2] && src[0][3]);
-		emit_kil(pc, src[0][0]);
-		emit_kil(pc, src[0][1]);
-		emit_kil(pc, src[0][2]);
-		emit_kil(pc, src[0][3]);
-		break;
-	case TGSI_OPCODE_KILP:
-		emit_kil(pc, NULL);
-		break;
-	case TGSI_OPCODE_LIT:
-		emit_lit(pc, &dst[0], mask, &src[0][0]);
-		break;
-	case TGSI_OPCODE_LG2:
-		emit_flop(pc, NV50_FLOP_LG2, brdc, src[0][0]);
-		break;
-	case TGSI_OPCODE_LOG:
-	{
-		struct nv50_reg *t[2];
-
-		t[0] = temp_temp(pc, NULL);
-		if (mask & (1 << 1))
-			t[1] = temp_temp(pc, NULL);
-		else
-			t[1] = t[0];
-
-		emit_cvt(pc, t[0], src[0][0], -1, CVT_ABS | CVT_F32_F32);
-		emit_flop(pc, NV50_FLOP_LG2, t[1], t[0]);
-		if (mask & (1 << 2))
-			emit_mov(pc, dst[2], t[1]);
-		emit_flr(pc, t[1], t[1]);
-		if (mask & (1 << 0))
-			emit_mov(pc, dst[0], t[1]);
-		if (mask & (1 << 1)) {
-			t[1]->mod = NV50_MOD_NEG;
-			emit_preex2(pc, t[1], t[1]);
-			t[1]->mod = 0;
-			emit_flop(pc, NV50_FLOP_EX2, t[1], t[1]);
-			emit_mul(pc, dst[1], t[0], t[1]);
-		}
-		if (mask & (1 << 3))
-			emit_mov_immdval(pc, dst[3], 1.0f);
-	}
-		break;
-	case TGSI_OPCODE_LRP:
-		temp = temp_temp(pc, NULL);
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_sub(pc, temp, src[1][c], src[2][c]);
-			emit_mad(pc, dst[c], temp, src[0][c], src[2][c]);
-		}
-		break;
-	case TGSI_OPCODE_MAD:
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_mad(pc, dst[c], src[0][c], src[1][c], src[2][c]);
-		}
-		break;
-	case TGSI_OPCODE_MAX:
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_minmax(pc, 0x880, dst[c], src[0][c], src[1][c]);
-		}
-		break;
-	case TGSI_OPCODE_MIN:
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_minmax(pc, 0x8a0, dst[c], src[0][c], src[1][c]);
-		}
-		break;
-	case TGSI_OPCODE_MOV:
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_mov(pc, dst[c], src[0][c]);
-		}
-		break;
-	case TGSI_OPCODE_MUL:
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_mul(pc, dst[c], src[0][c], src[1][c]);
-		}
-		break;
-	case TGSI_OPCODE_NOT:
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_not(pc, dst[c], src[0][c]);
-		}
-		break;
-	case TGSI_OPCODE_POW:
-		emit_pow(pc, brdc, src[0][0], src[1][0]);
-		break;
-	case TGSI_OPCODE_RCP:
-		if (!sat && popcnt4(mask) == 1)
-			brdc = dst[ffs(mask) - 1];
-		emit_flop(pc, NV50_FLOP_RCP, brdc, src[0][0]);
-		break;
-	case TGSI_OPCODE_RET:
-		if (pc->p->type == PIPE_SHADER_FRAGMENT && !pc->in_subroutine)
-			nv50_fp_move_results(pc);
-		emit_ret(pc, -1, 0);
-		break;
-	case TGSI_OPCODE_RSQ:
-		if (!sat && popcnt4(mask) == 1)
-			brdc = dst[ffs(mask) - 1];
-		src[0][0]->mod |= NV50_MOD_ABS;
-		emit_flop(pc, NV50_FLOP_RSQ, brdc, src[0][0]);
-		break;
-	case TGSI_OPCODE_SAD:
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_sad(pc, dst[c], src[0][c], src[1][c], src[2][c]);
-		}
-		break;
-	case TGSI_OPCODE_SCS:
-		temp = temp_temp(pc, NULL);
-		if (mask & 3)
-			emit_precossin(pc, temp, src[0][0]);
-		if (mask & (1 << 0))
-			emit_flop(pc, NV50_FLOP_COS, dst[0], temp);
-		if (mask & (1 << 1))
-			emit_flop(pc, NV50_FLOP_SIN, dst[1], temp);
-		if (mask & (1 << 2))
-			emit_mov_immdval(pc, dst[2], 0.0);
-		if (mask & (1 << 3))
-			emit_mov_immdval(pc, dst[3], 1.0);
-		break;
-	case TGSI_OPCODE_SHL:
-	case TGSI_OPCODE_ISHR:
-	case TGSI_OPCODE_USHR:
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_shift(pc, dst[c], src[0][c], src[1][c],
-				   inst->Instruction.Opcode);
-		}
-		break;
-	case TGSI_OPCODE_SIN:
-		if (mask & 8) {
-			emit_precossin(pc, temp, src[0][3]);
-			emit_flop(pc, NV50_FLOP_SIN, dst[3], temp);
-			if (!(mask &= 7))
-				break;
-			if (temp == dst[3])
-				temp = brdc = temp_temp(pc, NULL);
-		}
-		emit_precossin(pc, temp, src[0][0]);
-		emit_flop(pc, NV50_FLOP_SIN, brdc, temp);
-		break;
-	case TGSI_OPCODE_SLT:
-	case TGSI_OPCODE_SGE:
-	case TGSI_OPCODE_SEQ:
-	case TGSI_OPCODE_SGT:
-	case TGSI_OPCODE_SLE:
-	case TGSI_OPCODE_SNE:
-	case TGSI_OPCODE_ISLT:
-	case TGSI_OPCODE_ISGE:
-	case TGSI_OPCODE_USEQ:
-	case TGSI_OPCODE_USGE:
-	case TGSI_OPCODE_USLT:
-	case TGSI_OPCODE_USNE:
-	{
-		uint8_t cc, ty;
-
-		map_tgsi_setop_hw(inst->Instruction.Opcode, &cc, &ty);
-
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_set(pc, cc, dst[c], -1, src[0][c], src[1][c], ty);
-		}
-	}
-		break;
-	case TGSI_OPCODE_SUB:
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_sub(pc, dst[c], src[0][c], src[1][c]);
-		}
-		break;
-	case TGSI_OPCODE_TEX:
-		emit_tex(pc, dst, mask, src[0], unit,
-			 inst->Texture.Texture, FALSE, 0);
-		break;
-	case TGSI_OPCODE_TXB:
-		emit_tex(pc, dst, mask, src[0], unit,
-			 inst->Texture.Texture, FALSE, -1);
-		break;
-	case TGSI_OPCODE_TXL:
-		emit_tex(pc, dst, mask, src[0], unit,
-			 inst->Texture.Texture, FALSE, 1);
-		break;
-	case TGSI_OPCODE_TXP:
-		emit_tex(pc, dst, mask, src[0], unit,
-			 inst->Texture.Texture, TRUE, 0);
-		break;
-	case TGSI_OPCODE_TRUNC:
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_cvt(pc, dst[c], src[0][c], -1,
-				 CVT_TRUNC | CVT_F32_F32 | CVT_RI);
-		}
-		break;
-	case TGSI_OPCODE_U2F:
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_cvt(pc, dst[c], src[0][c], -1, CVT_F32_U32);
-		}
-		break;
-	case TGSI_OPCODE_UADD:
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_add_b32(pc, dst[c], src[0][c], src[1][c]);
-		}
-		break;
-	case TGSI_OPCODE_UMAX:
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_minmax(pc, 0x084, dst[c], src[0][c], src[1][c]);
-		}
-		break;
-	case TGSI_OPCODE_UMIN:
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_minmax(pc, 0x0a4, dst[c], src[0][c], src[1][c]);
-		}
-		break;
-	case TGSI_OPCODE_UMAD:
-	{
-		assert(!temp);
-		temp = temp_temp(pc, NULL);
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_mul_u16(pc, temp, src[0][c], 0, src[1][c], 1);
-			emit_mad_u16(pc, temp, src[0][c], 1, src[1][c], 0,
-				     temp);
-			emit_shl_imm(pc, temp, temp, 16);
-			emit_mad_u16(pc, temp, src[0][c], 0, src[1][c], 0,
-				     temp);
-			emit_add_b32(pc, dst[c], temp, src[2][c]);
-		}
-	}
-		break;
-	case TGSI_OPCODE_UMUL:
-	{
-		assert(!temp);
-		temp = temp_temp(pc, NULL);
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			emit_mul_u16(pc, temp, src[0][c], 0, src[1][c], 1);
-			emit_mad_u16(pc, temp, src[0][c], 1, src[1][c], 0,
-				     temp);
-			emit_shl_imm(pc, temp, temp, 16);
-			emit_mad_u16(pc, dst[c], src[0][c], 0, src[1][c], 0,
-				     temp);
-		}
-	}
-		break;
-	case TGSI_OPCODE_XPD:
-		temp = temp_temp(pc, NULL);
-		if (mask & (1 << 0)) {
-			emit_mul(pc, temp, src[0][2], src[1][1]);
-			emit_msb(pc, dst[0], src[0][1], src[1][2], temp);
-		}
-		if (mask & (1 << 1)) {
-			emit_mul(pc, temp, src[0][0], src[1][2]);
-			emit_msb(pc, dst[1], src[0][2], src[1][0], temp);
-		}
-		if (mask & (1 << 2)) {
-			emit_mul(pc, temp, src[0][1], src[1][0]);
-			emit_msb(pc, dst[2], src[0][0], src[1][1], temp);
-		}
-		if (mask & (1 << 3))
-			emit_mov_immdval(pc, dst[3], 1.0);
-		break;
-	case TGSI_OPCODE_END:
-		if (pc->p->type == PIPE_SHADER_FRAGMENT)
-			nv50_fp_move_results(pc);
-
-		if (!pc->p->exec_tail ||
-		    is_immd(pc->p->exec_tail) ||
-		    is_join(pc->p->exec_tail) ||
-		    is_control_flow(pc->p->exec_tail))
-			emit_nop(pc);
-
-		/* last insn must be long so it can have the exit bit set */
-		if (!is_long(pc->p->exec_tail))
-			convert_to_long(pc, pc->p->exec_tail);
-
-		pc->p->exec_tail->inst[1] |= 1; /* set exit bit */
-
-		terminate_mbb(pc);
-		break;
-	default:
-		NOUVEAU_ERR("invalid opcode %d\n", inst->Instruction.Opcode);
-		return FALSE;
-	}
-
-	if (brdc) {
-		if (sat)
-			emit_sat(pc, brdc, brdc);
-		for (c = 0; c < 4; c++)
-			if ((mask & (1 << c)) && dst[c] != brdc)
-				emit_mov(pc, dst[c], brdc);
-	} else
-	if (sat) {
-		for (c = 0; c < 4; c++) {
-			if (!(mask & (1 << c)))
-				continue;
-			/* In this case we saturate later, and dst[c] won't
-			 * be another temp_temp (and thus lost), since rdst
-			 * already is TEMP (see above). */
-			if (rdst[c]->type == P_TEMP && rdst[c]->index < 0)
-				continue;
-			emit_sat(pc, rdst[c], dst[c]);
-		}
-	}
-
-	kill_temp_temp(pc, NULL);
-	pc->reg_instance_nr = 0;
-
-	return TRUE;
-}
-
-/* Pre-pass bookkeeping for one TGSI instruction.
- *
- * Stamps the acc field of each written TEMPORARY/OUTPUT component and of
- * each read TEMPORARY/INPUT component with the current pc->insn_nr, and
- * records the edge flag input when an INPUT is MOVed to the edge flag
- * output.
- */
-static void
-prep_inspect_insn(struct nv50_pc *pc, const struct tgsi_full_instruction *insn)
-{
-	const struct tgsi_dst_register *dst = &insn->Dst[0].Register;
-	struct nv50_reg *file = NULL;
-	unsigned s, chan;
-
-	switch (dst->File) {
-	case TGSI_FILE_TEMPORARY:
-		file = pc->temp;
-		break;
-	case TGSI_FILE_OUTPUT:
-		file = pc->result;
-
-		/* remember which input is copied to the edge flag output */
-		if (insn->Instruction.Opcode == TGSI_OPCODE_MOV &&
-		    dst->Index == pc->edgeflag_out &&
-		    insn->Src[0].Register.File == TGSI_FILE_INPUT)
-			pc->p->cfg.edgeflag_in = insn->Src[0].Register.Index;
-		break;
-	default:
-		break;
-	}
-
-	if (file) {
-		const unsigned wrmask = dst->WriteMask;
-
-		for (chan = 0; chan < 4; chan++)
-			if (wrmask & (1 << chan))
-				file[dst->Index * 4 + chan].acc = pc->insn_nr;
-	}
-
-	for (s = 0; s < insn->Instruction.NumSrcRegs; s++) {
-		const struct tgsi_full_src_register *src = &insn->Src[s];
-		unsigned rdmask;
-
-		switch (src->Register.File) {
-		case TGSI_FILE_TEMPORARY:
-			file = pc->temp;
-			break;
-		case TGSI_FILE_INPUT:
-			file = pc->attr;
-			break;
-		default:
-			continue; /* other register files are not tracked */
-		}
-
-		rdmask = nv50_tgsi_src_mask(insn, s);
-
-		for (chan = 0; chan < 4; chan++) {
-			struct nv50_reg *r;
-
-			if (!(rdmask & (1 << chan)))
-				continue;
-
-			r = &file[src->Register.Index * 4 +
-				  tgsi_util_get_full_src_register_swizzle(src, chan)];
-
-			/* If used before written, pre-allocate the reg,
-			 * lest we overwrite results from a subroutine.
-			 */
-			if (!r->acc && r->type == P_TEMP)
-				alloc_reg(pc, r);
-
-			r->acc = pc->insn_nr;
-		}
-	}
-}
-
-/* Returns a bitmask indicating which dst components need to be
- * written to temporaries first to avoid 'corrupting' sources.
- *
- * m[i]   (out) indicate component to write in the i-th position
- * rdep[c] (in) bitmasks of dst[i] that require dst[c] as source
- *
- * Bit i of the return value refers to the i-th position of the computed
- * order (i.e. to component m[i]), not to component i itself.
- */
-static unsigned
-nv50_revdep_reorder(unsigned m[4], unsigned rdep[4])
-{
-	unsigned i, c, x, unsafe = 0;
-
-	/* start from the identity order 0, 1, 2, 3 */
-	for (c = 0; c < 4; c++)
-		m[c] = c;
-
-	/* Swap as long as a dst component written earlier is depended on
-	 * by one written later, but the next one isn't depended on by it.
-	 */
-	for (c = 0; c < 3; c++) {
-		if (rdep[m[c + 1]] & (1 << m[c]))
-			continue; /* if next one is depended on by us */
-		for (i = c + 1; i < 4; i++)
-			/* if we are depended on by a later one */
-			if (rdep[m[c]] & (1 << m[i]))
-				break;
-		if (i == 4)
-			continue;
-		/* now, swap */
-		x = m[c];
-		m[c] = m[c + 1];
-		m[c + 1] = x;
-
-		/* restart; note the loop's c++ still runs after this
-		 * assignment, so scanning resumes at position 1 */
-		c = 0;
-	}
-
-	/* mark dependencies that could not be resolved by reordering */
-	for (i = 0; i < 3; ++i)
-		for (c = i + 1; c < 4; ++c)
-			if (rdep[m[i]] & (1 << m[c]))
-				unsafe |= (1 << i);
-
-	/* NOTE: $unsafe is with respect to order, not component */
-	return unsafe;
-}
-
-/* Select a suitable dst register for broadcasting scalar results,
- * or return NULL if we have to allocate an extra TEMP.
- *
- * If e.g. only 1 component is written, we may also emit the final
- * result to a write-only register.
- *
- * pc:   translation context, passed through to tgsi_dst
- * fd:   destination of the instruction being translated
- * mask: dst components that must not be picked for TEMPORARY destinations
- *       (the caller passes the components that also occur as sources)
- */
-static struct nv50_reg *
-tgsi_broadcast_dst(struct nv50_pc *pc,
-		   const struct tgsi_full_dst_register *fd, unsigned mask)
-{
-	const unsigned wrmask = fd->Register.WriteMask;
-
-	if (fd->Register.File == TGSI_FILE_TEMPORARY) {
-		/* first written component that is not excluded by mask */
-		int c = ffs(~mask & wrmask);
-		if (c)
-			return tgsi_dst(pc, c - 1, fd);
-	} else {
-		int c = ffs(wrmask) - 1;
-		/* c is -1 for an empty write mask; shifting by a negative
-		 * count is undefined, so test it before building the bit */
-		if (c >= 0 && (1u << c) == wrmask)
-			return tgsi_dst(pc, c, fd);
-	}
-
-	return NULL;
-}
-
-/* Scan source swizzles and return a bitmask indicating dst regs that
- * also occur among the src regs, and fill rdep for nv50_revdep_reorder.
- *
- * A source component is skipped when its sign mode is not covered by the
- * modifiers get_supported_mods reports for that source.
- */
-static unsigned
-nv50_tgsi_scan_swizzle(const struct tgsi_full_instruction *insn,
-		       unsigned rdep[4])
-{
-	const struct tgsi_full_dst_register *dst = &insn->Dst[0];
-	unsigned s, overlap = 0;
-
-	rdep[0] = rdep[1] = rdep[2] = rdep[3] = 0;
-
-	for (s = 0; s < insn->Instruction.NumSrcRegs; s++) {
-		const struct tgsi_full_src_register *src = &insn->Src[s];
-		const unsigned rdmask = nv50_tgsi_src_mask(insn, s);
-		const int mods = get_supported_mods(insn, s);
-		unsigned chan;
-
-		/* only sources naming the destination register matter */
-		if (src->Register.File != dst->Register.File ||
-		    src->Register.Index != dst->Register.Index)
-			continue;
-
-		for (chan = 0; chan < 4; ++chan) {
-			unsigned sign, comp;
-
-			if (!(rdmask & (1 << chan))) /* src is not read */
-				continue;
-			comp = tgsi_util_get_full_src_register_swizzle(src, chan);
-			sign = tgsi_util_get_full_src_register_sign_mode(src, chan);
-
-			/* the selected component is not overwritten */
-			if (!(dst->Register.WriteMask & (1 << comp)))
-				continue;
-
-			if ((sign == TGSI_UTIL_SIGN_TOGGLE &&
-			     !(mods & NV50_MOD_NEG)) ||
-			    (sign == TGSI_UTIL_SIGN_CLEAR &&
-			     !(mods & NV50_MOD_ABS)) ||
-			    (sign == TGSI_UTIL_SIGN_SET &&
-			     (mods & 3) != 3))
-				continue;
-
-			rdep[comp] |= nv50_tgsi_dst_revdep(
-				insn->Instruction.Opcode, s, chan);
-			overlap |= (1 << comp);
-		}
-	}
-
-	return overlap;
-}
-
-/* Translate one TGSI instruction token.
- *
- * Scalar ops are emitted once, with pc->r_brdc set to a register chosen
- * by tgsi_broadcast_dst or to a fresh temp_temp. Other instructions are
- * emitted directly unless the destination also occurs among the sources
- * with reverse dependencies; in that case the instruction is emitted once
- * per written component, in the order computed by nv50_revdep_reorder,
- * and components flagged as unsafe are first written to temporaries in
- * pc->r_dst[] and copied (or saturated) to the real destination at the end.
- *
- * Returns FALSE as soon as nv50_program_tx_insn fails, TRUE otherwise.
- */
-static boolean
-nv50_tgsi_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
-{
-	/* local copy: its write mask is narrowed per component below */
-	struct tgsi_full_instruction insn = tok->FullInstruction;
-	const struct tgsi_full_dst_register *fd;
-	unsigned i, deqs, rdep[4], m[4];
-
-	/* fd keeps pointing at the unmodified destination in the token */
-	fd = &tok->FullInstruction.Dst[0];
-	deqs = nv50_tgsi_scan_swizzle(&insn, rdep);
-
-	if (is_scalar_op(insn.Instruction.Opcode)) {
-		pc->r_brdc = tgsi_broadcast_dst(pc, fd, deqs);
-		if (!pc->r_brdc)
-			pc->r_brdc = temp_temp(pc, NULL);
-		return nv50_program_tx_insn(pc, &insn);
-	}
-	pc->r_brdc = NULL;
-
-	/* no overlap between dst and sources, or no reverse dependencies */
-	if (!deqs || (!rdep[0] && !rdep[1] && !rdep[2] && !rdep[3]))
-		return nv50_program_tx_insn(pc, &insn);
-
-	/* from here on deqs is indexed by position in m[], not component */
-	deqs = nv50_revdep_reorder(m, rdep);
-
-	for (i = 0; i < 4; ++i) {
-		assert(pc->r_dst[m[i]] == NULL);
-
-		insn.Dst[0].Register.WriteMask =
-			fd->Register.WriteMask & (1 << m[i]);
-
-		if (!insn.Dst[0].Register.WriteMask)
-			continue;
-
-		if (deqs & (1 << i))
-			pc->r_dst[m[i]] = alloc_temp(pc, NULL);
-
-		if (!nv50_program_tx_insn(pc, &insn))
-			return FALSE;
-	}
-
-	/* write the redirected components back and release the temps */
-	for (i = 0; i < 4; i++) {
-		struct nv50_reg *reg = pc->r_dst[i];
-		if (!reg)
-			continue;
-		pc->r_dst[i] = NULL;
-
-		if (insn.Instruction.Saturate == TGSI_SAT_ZERO_ONE)
-			emit_sat(pc, tgsi_dst(pc, i, fd), reg);
-		else
-			emit_mov(pc, tgsi_dst(pc, i, fd), reg);
-		free_temp(pc, reg);
-	}
-
-	return TRUE;
-}
-
-/* Emit the interpolation of input component reg.
- *
- * The interpolation mode is looked up by reg->index. For perspective
- * modes a helper register is needed; it is created on first use (one for
- * centroid, one for non-centroid inputs), cached in pc->iv_c / pc->iv_p,
- * interpolated and run through NV50_FLOP_RCP.
- */
-static void
-load_interpolant(struct nv50_pc *pc, struct nv50_reg *reg)
-{
-	const unsigned mode = pc->interp_mode[reg->index];
-	struct nv50_reg **slot;
-	struct nv50_reg *ivreg;
-
-	if (mode & INTERP_CENTROID)
-		slot = &pc->iv_c;
-	else
-		slot = &pc->iv_p;
-	ivreg = *slot;
-
-	if (!ivreg && (mode & INTERP_PERSPECTIVE)) {
-		ivreg = alloc_temp(pc, NULL);
-		*slot = ivreg;
-		ivreg->rhw = popcnt4(pc->p->cfg.regs[1] >> 24) - 1;
-
-		emit_interp(pc, ivreg, NULL, mode & INTERP_CENTROID);
-		emit_flop(pc, NV50_FLOP_RCP, ivreg, ivreg);
-
-		/* XXX: when loading interpolants dynamically, move these
-		 * to the program head, or make sure it can't be skipped.
-		 */
-	}
-
-	emit_interp(pc, reg, ivreg, mode);
-}
-
-/* The face input is always at v[255] (varying space), with a
- * value of 0 for back-facing, and 0xffffffff for front-facing.
- *
- * Emits code that loads that varying into a temporary and converts it
- * into sv with two cvt instructions, the second one on the inverted value.
- */
-static void
-load_frontfacing(struct nv50_pc *pc, struct nv50_reg *sv)
-{
-	struct nv50_reg *temp = alloc_temp(pc, NULL);
-	int r_pred = 0;
-
-	/* fetch v[255] without interpolation */
-	temp->rhw = 255;
-	emit_interp(pc, temp, NULL, INTERP_FLAT);
-
-	/* NOTE(review): presumably this cvt also writes condition register
-	 * r_pred, which the set_pred calls below then test - confirm
-	 * against emit_cvt / set_pred */
-	emit_cvt(pc, sv, temp, r_pred, CVT_ABS | CVT_F32_S32);
-
-	/* both of the following instructions get predicate 0x2 on r_pred */
-	emit_not(pc, temp, temp);
-	set_pred(pc, 0x2, r_pred, pc->p->exec_tail);
-	emit_cvt(pc, sv, temp, -1, CVT_F32_S32);
-	set_pred(pc, 0x2, r_pred, pc->p->exec_tail);
-
-	free_temp(pc, temp);
-}
-
-/* Emit sysval[index] + startInstance and make the sum the new value of
- * pc->sysval[index].
- *
- * The sum goes to a locally constructed TEMP register descriptor, which
- * then replaces pc->sysval[index] by value.
- */
-static void
-load_instance_id(struct nv50_pc *pc, unsigned index)
-{
-	struct nv50_reg reg, mem;
-
-	/* NOTE(review): the last two ctor_reg arguments look like TGSI
-	 * index and hw id (-1 = not assigned yet) - confirm */
-	ctor_reg(&reg, P_TEMP, -1, -1);
-	ctor_reg(&mem, P_CONST, -1, 24); /* startInstance */
-	mem.buf_index = 2;
-
-	emit_add_b32(pc, &reg, &pc->sysval[index], &mem);
-	pc->sysval[index] = reg;
-}
-
-/* Fill in the semantic name (sn) and semantic index (si) of every
- * cfg.in[] and cfg.out[] entry from the TGSI scan info, looked up through
- * the TGSI register index stored in the entry's id field.
- */
-static void
-copy_semantic_info(struct nv50_program *p)
-{
-	unsigned n;
-
-	for (n = 0; n < p->cfg.in_nr; ++n) {
-		const unsigned tgsi_id = p->cfg.in[n].id;
-
-		p->cfg.in[n].sn = p->info.input_semantic_name[tgsi_id];
-		p->cfg.in[n].si = p->info.input_semantic_index[tgsi_id];
-	}
-
-	for (n = 0; n < p->cfg.out_nr; ++n) {
-		const unsigned tgsi_id = p->cfg.out[n].id;
-
-		p->cfg.out[n].sn = p->info.output_semantic_name[tgsi_id];
-		p->cfg.out[n].si = p->info.output_semantic_index[tgsi_id];
-	}
-}
-
-/* First pass over the program's TGSI tokens, run before code emission.
- *
- * - immediates are registered through ctor_immd_4f32
- * - declarations set up outputs with special semantics, FP input
- *   interpolation modes and system values
- * - instructions are counted in pc->insn_nr and inspected by
- *   prep_inspect_insn
- *
- * Afterwards hardware register IDs are assigned to the accessed inputs
- * and outputs (depending on the shader type) and the pc->immd register
- * array is allocated.
- *
- * Returns TRUE on success, FALSE on an unknown declaration file or if
- * the pc->immd allocation fails. The cached interpolation helper temps
- * (pc->iv_p, pc->iv_c) and the parse context are released on both paths.
- */
-static boolean
-nv50_program_tx_prep(struct nv50_pc *pc)
-{
-	struct tgsi_parse_context tp;
-	struct nv50_program *p = pc->p;
-	boolean ret = FALSE;
-	unsigned i, c, instance_id = 0, vertex_id = 0, flat_nr = 0;
-
-	tgsi_parse_init(&tp, pc->p->pipe.tokens);
-	while (!tgsi_parse_end_of_tokens(&tp)) {
-		const union tgsi_full_token *tok = &tp.FullToken;
-
-		tgsi_parse_token(&tp);
-		switch (tok->Token.Type) {
-		case TGSI_TOKEN_TYPE_IMMEDIATE:
-		{
-			const struct tgsi_full_immediate *imm =
-				&tp.FullToken.FullImmediate;
-
-			ctor_immd_4f32(pc, imm->u[0].Float,
-				       imm->u[1].Float,
-				       imm->u[2].Float,
-				       imm->u[3].Float);
-		}
-			break;
-		case TGSI_TOKEN_TYPE_DECLARATION:
-		{
-			const struct tgsi_full_declaration *d;
-			unsigned si, last, first, mode;
-
-			d = &tp.FullToken.FullDeclaration;
-			first = d->Range.First;
-			last = d->Range.Last;
-
-			switch (d->Declaration.File) {
-			case TGSI_FILE_TEMPORARY:
-				break;
-			case TGSI_FILE_OUTPUT:
-				/* only semantic outputs of non-fragment
-				 * programs are of interest here */
-				if (!d->Declaration.Semantic ||
-				    p->type == PIPE_SHADER_FRAGMENT)
-					break;
-
-				si = d->Semantic.Index;
-				switch (d->Semantic.Name) {
-				case TGSI_SEMANTIC_BCOLOR:
-					p->cfg.two_side[si].hw = first;
-					if (p->cfg.out_nr > first)
-						p->cfg.out_nr = first;
-					break;
-				case TGSI_SEMANTIC_PSIZE:
-					p->cfg.psiz = first;
-					if (p->cfg.out_nr > first)
-						p->cfg.out_nr = first;
-					break;
-				case TGSI_SEMANTIC_EDGEFLAG:
-					pc->edgeflag_out = first;
-					break;
-					/*
-				case TGSI_SEMANTIC_CLIP_DISTANCE:
-					p->cfg.clpd = MIN2(p->cfg.clpd, first);
-					break;
-					*/
-				default:
-					break;
-				}
-				break;
-			case TGSI_FILE_INPUT:
-			{
-				/* interpolation only applies to FP inputs */
-				if (p->type != PIPE_SHADER_FRAGMENT)
-					break;
-
-				switch (d->Declaration.Interpolate) {
-				case TGSI_INTERPOLATE_CONSTANT:
-					mode = INTERP_FLAT;
-					flat_nr++;
-					break;
-				case TGSI_INTERPOLATE_PERSPECTIVE:
-					mode = INTERP_PERSPECTIVE;
-					p->cfg.regs[1] |= 0x08 << 24;
-					break;
-				default:
-					mode = INTERP_LINEAR;
-					break;
-				}
-				if (d->Declaration.Centroid)
-					mode |= INTERP_CENTROID;
-
-				assert(last < 32);
-				for (i = first; i <= last; i++)
-					pc->interp_mode[i] = mode;
-			}
-				break;
-			case TGSI_FILE_SYSTEM_VALUE:
-				assert(d->Declaration.Semantic);
-				switch (d->Semantic.Name) {
-				case TGSI_SEMANTIC_FACE:
-					assert(p->type == PIPE_SHADER_FRAGMENT);
-					load_frontfacing(pc,
-							 &pc->sysval[first]);
-					break;
-				case TGSI_SEMANTIC_INSTANCEID:
-					assert(p->type == PIPE_SHADER_VERTEX);
-					instance_id = first;
-					p->cfg.regs[0] |= (1 << 4);
-					break;
-				case TGSI_SEMANTIC_PRIMID:
-					assert(p->type != PIPE_SHADER_VERTEX);
-					p->cfg.prim_id = first;
-					break;
-					/*
-				case TGSI_SEMANTIC_PRIMIDIN:
-					assert(p->type == PIPE_SHADER_GEOMETRY);
-					pc->sysval[first].hw = 6;
-					p->cfg.regs[0] |= (1 << 8);
-					break;
-				case TGSI_SEMANTIC_VERTEXID:
-					assert(p->type == PIPE_SHADER_VERTEX);
-					vertex_id = first;
-					p->cfg.regs[0] |= (1 << 12) | (1 << 0);
-					break;
-					*/
-				}
-				break;
-			case TGSI_FILE_ADDRESS:
-			case TGSI_FILE_CONSTANT:
-			case TGSI_FILE_SAMPLER:
-				break;
-			default:
-				NOUVEAU_ERR("bad decl file %d\n",
-					    d->Declaration.File);
-				goto out_err;
-			}
-		}
-			break;
-		case TGSI_TOKEN_TYPE_INSTRUCTION:
-			pc->insn_nr++;
-			prep_inspect_insn(pc, &tok->FullInstruction);
-			break;
-		default:
-			break;
-		}
-	}
-
-	if (p->type == PIPE_SHADER_VERTEX || p->type == PIPE_SHADER_GEOMETRY) {
-		int rid = 0;
-
-		/* inputs: only components that were accessed get a hw id */
-		if (p->type == PIPE_SHADER_GEOMETRY) {
-			for (i = 0; i < pc->attr_nr; ++i) {
-				p->cfg.in[i].hw = rid;
-				p->cfg.in[i].id = i;
-
-				for (c = 0; c < 4; ++c) {
-					int n = i * 4 + c;
-					if (!pc->attr[n].acc)
-						continue;
-					pc->attr[n].hw = rid++;
-					p->cfg.in[i].mask |= 1 << c;
-				}
-			}
-		} else {
-			for (i = 0; i < pc->attr_nr * 4; ++i) {
-				if (pc->attr[i].acc) {
-					pc->attr[i].hw = rid++;
-					p->cfg.attr[i / 32] |= 1 << (i % 32);
-				}
-			}
-			/* NOTE(review): bit 0 of regs[0] is only set by the
-			 * commented-out VERTEXID case above */
-			if (p->cfg.regs[0] & (1 << 0))
-				pc->sysval[vertex_id].hw = rid++;
-			if (p->cfg.regs[0] & (1 << 4)) {
-				pc->sysval[instance_id].hw = rid++;
-				load_instance_id(pc, instance_id);
-			}
-		}
-
-		/* outputs: again only accessed components get a hw id */
-		for (i = 0, rid = 0; i < pc->result_nr; ++i) {
-			p->cfg.out[i].hw = rid;
-			p->cfg.out[i].id = i;
-
-			for (c = 0; c < 4; ++c) {
-				int n = i * 4 + c;
-				if (!pc->result[n].acc)
-					continue;
-				pc->result[n].hw = rid++;
-				p->cfg.out[i].mask |= 1 << c;
-			}
-		}
-		if (p->cfg.prim_id < 0x40) {
-			/* GP has to write to PrimitiveID */
-			ctor_reg(&pc->sysval[p->cfg.prim_id],
-				 P_RESULT, p->cfg.prim_id, rid);
-			p->cfg.prim_id = rid++;
-		}
-
-		/* replace the TGSI indices recorded while parsing by the
-		 * matching cfg.out[] entries / hw ids */
-		for (c = 0; c < 2; ++c)
-			if (p->cfg.two_side[c].hw < 0x40)
-				p->cfg.two_side[c] = p->cfg.out[
-					p->cfg.two_side[c].hw];
-
-		if (p->cfg.psiz < 0x40)
-			p->cfg.psiz = p->cfg.out[p->cfg.psiz].hw;
-
-		copy_semantic_info(p);
-	} else
-	if (p->type == PIPE_SHADER_FRAGMENT) {
-		int rid = 0, aid;
-		/* n fills cfg.in[] from the front (non-flat inputs), m fills
-		 * the slots after them (flat inputs) */
-		unsigned n = 0, m = pc->attr_nr - flat_nr;
-
-		pc->allow32 = TRUE;
-
-		/* do we read FragCoord ? */
-		if (pc->attr_nr &&
-		    p->info.input_semantic_name[0] == TGSI_SEMANTIC_POSITION) {
-			/* select FCRD components we want accessible */
-			for (c = 0; c < 4; ++c)
-				if (pc->attr[c].acc)
-					p->cfg.regs[1] |= 1 << (24 + c);
-			aid = 0;
-		} else /* offset by 1 if FCRD.w is needed for pinterp */
-			aid = popcnt4(p->cfg.regs[1] >> 24);
-
-		/* non-flat interpolants have to be mapped to
-		 * the lower hardware IDs, so sort them:
-		 */
-		for (i = 0; i < pc->attr_nr; i++) {
-			if (pc->interp_mode[i] == INTERP_FLAT)
-				p->cfg.in[m++].id = i;
-			else {
-				if (!(pc->interp_mode[i] & INTERP_PERSPECTIVE))
-					p->cfg.in[n].linear = TRUE;
-				p->cfg.in[n++].id = i;
-			}
-		}
-		copy_semantic_info(p);
-
-		/* assign hw ids in sorted order and emit the input loads */
-		for (n = 0; n < pc->attr_nr; ++n) {
-			p->cfg.in[n].hw = rid = aid;
-			i = p->cfg.in[n].id;
-
-			if (p->info.input_semantic_name[i] ==
-			    TGSI_SEMANTIC_FACE) {
-				load_frontfacing(pc, &pc->attr[i * 4]);
-				continue;
-			}
-
-			for (c = 0; c < 4; ++c) {
-				if (!pc->attr[i * 4 + c].acc)
-					continue;
-				pc->attr[i * 4 + c].rhw = rid++;
-				p->cfg.in[n].mask |= 1 << c;
-
-				load_interpolant(pc, &pc->attr[i * 4 + c]);
-			}
-			aid += popcnt4(p->cfg.in[n].mask);
-		}
-
-		/* m: number of enabled FCRD components */
-		m = popcnt4(p->cfg.regs[1] >> 24);
-
-		/* set count of non-position inputs and of non-flat
-		 * non-position inputs for FP_INTERPOLANT_CTRL
-		 */
-		p->cfg.regs[1] |= aid - m;
-
-		if (flat_nr) {
-			i = p->cfg.in[pc->attr_nr - flat_nr].hw;
-			p->cfg.regs[1] |= (i - m) << 16;
-		} else
-			p->cfg.regs[1] |= p->cfg.regs[1] << 16;
-
-		/* mark color semantic for light-twoside */
-		n = 0x80;
-		for (i = 0; i < p->cfg.in_nr; i++) {
-			if (p->cfg.in[i].sn == TGSI_SEMANTIC_COLOR) {
-				n = MIN2(n, p->cfg.in[i].hw - m);
-				p->cfg.two_side[p->cfg.in[i].si] = p->cfg.in[i];
-
-				p->cfg.regs[0] += /* increase colour count */
-					popcnt4(p->cfg.in[i].mask) << 16;
-			}
-		}
-		if (n < 0x80)
-			p->cfg.regs[0] += n;
-
-		if (p->cfg.prim_id < 0x40) {
-			pc->sysval[p->cfg.prim_id].rhw = rid++;
-			emit_interp(pc, &pc->sysval[p->cfg.prim_id], NULL,
-				    INTERP_FLAT);
-			/* increase FP_INTERPOLANT_CTRL_COUNT */
-			p->cfg.regs[1] += 1;
-		}
-
-		/* Initialize FP results:
-		 * FragDepth is always first TGSI and last hw output
-		 */
-		i = p->info.writes_z ? 4 : 0;
-		for (rid = 0; i < pc->result_nr * 4; i++)
-			pc->result[i].rhw = rid++;
-		if (p->info.writes_z)
-			pc->result[2].rhw = rid++;
-
-		p->cfg.high_result = rid;
-
-		/* separate/different colour results for MRTs ? */
-		if (pc->result_nr - (p->info.writes_z ? 1 : 0) > 1)
-			p->cfg.regs[2] |= 1;
-	}
-
-	/* one register descriptor per immediate component */
-	if (pc->immd_nr) {
-		int rid = 0;
-
-		pc->immd = MALLOC(pc->immd_nr * 4 * sizeof(struct nv50_reg));
-		if (!pc->immd)
-			goto out_err;
-
-		for (i = 0; i < pc->immd_nr; i++) {
-			for (c = 0; c < 4; c++, rid++)
-				ctor_reg(&pc->immd[rid], P_IMMD, i, rid);
-		}
-	}
-
-	ret = TRUE;
-out_err:
-	/* reached on success too: ret tells the two cases apart */
-	if (pc->iv_p)
-		free_temp(pc, pc->iv_p);
-	if (pc->iv_c)
-		free_temp(pc, pc->iv_c);
-
-	tgsi_parse_free(&tp);
-	return ret;
-}
-
-/* Release a translation context together with every register array
- * that ctor_nv50_pc and nv50_program_tx_prep / nv50_program_tx attached
- * to it. pc must not be used afterwards.
- */
-static void
-free_nv50_pc(struct nv50_pc *pc)
-{
-	if (pc->immd)
-		FREE(pc->immd);
-	if (pc->param)
-		FREE(pc->param);
-	if (pc->result)
-		FREE(pc->result);
-	if (pc->attr)
-		FREE(pc->attr);
-	if (pc->temp)
-		FREE(pc->temp);
-	/* allocated by ctor_nv50_pc when the program uses address regs;
-	 * it used to be leaked because it was missing from this list */
-	if (pc->addr)
-		FREE(pc->addr);
-	if (pc->sysval)
-		FREE(pc->sysval);
-	if (pc->insn_pos)
-		FREE(pc->insn_pos);
-
-	FREE(pc);
-}
-
-/* Translate a gallium PIPE_PRIM_* geometry shader output primitive into
- * the matching NV50TCL_GP_OUTPUT_PRIMITIVE_TYPE_* value. Any primitive
- * other than points, line strips and triangle strips is reported and
- * aborts the process.
- */
-static INLINE uint32_t
-nv50_map_gs_output_prim(unsigned pprim)
-{
-	if (pprim == PIPE_PRIM_POINTS)
-		return NV50TCL_GP_OUTPUT_PRIMITIVE_TYPE_POINTS;
-	if (pprim == PIPE_PRIM_LINE_STRIP)
-		return NV50TCL_GP_OUTPUT_PRIMITIVE_TYPE_LINE_STRIP;
-	if (pprim == PIPE_PRIM_TRIANGLE_STRIP)
-		return NV50TCL_GP_OUTPUT_PRIMITIVE_TYPE_TRIANGLE_STRIP;
-
-	NOUVEAU_ERR("invalid GS_OUTPUT_PRIMITIVE: %u\n", pprim);
-	abort();
-	return 0;
-}
-
-/* Initialize translation context pc for program p.
- *
- * Register file sizes are taken from the TGSI scan info (file_max + 1),
- * the shader type specific cfg defaults are set up, and the register
- * descriptor arrays (4 components per TGSI register) are allocated and
- * constructed.
- *
- * Returns FALSE if an allocation fails; arrays allocated up to that point
- * stay attached to pc for the caller to release.
- */
-static boolean
-ctor_nv50_pc(struct nv50_pc *pc, struct nv50_program *p)
-{
-	int i, c;
-	unsigned rtype[2] = { P_ATTR, P_RESULT };
-
-	pc->p = p;
-	pc->temp_nr = p->info.file_max[TGSI_FILE_TEMPORARY] + 1;
-	pc->attr_nr = p->info.file_max[TGSI_FILE_INPUT] + 1;
-	pc->result_nr = p->info.file_max[TGSI_FILE_OUTPUT] + 1;
-	pc->param_nr = p->info.file_max[TGSI_FILE_CONSTANT] + 1;
-	pc->addr_nr = p->info.file_max[TGSI_FILE_ADDRESS] + 1;
-	assert(pc->addr_nr <= 2);
-	pc->sysval_nr = p->info.file_max[TGSI_FILE_SYSTEM_VALUE] + 1;
-
-	p->cfg.high_temp = 4;
-
-	/* 0x40 (0x80 for geometry programs below) marks "not present" */
-	p->cfg.two_side[0].hw = 0x40;
-	p->cfg.two_side[1].hw = 0x40;
-	p->cfg.prim_id = 0x40;
-
-	p->cfg.edgeflag_in = pc->edgeflag_out = 0xff;
-
-	for (i = 0; i < p->info.num_properties; ++i) {
-		unsigned *data = &p->info.properties[i].data[0];
-
-		switch (p->info.properties[i].name) {
-		case TGSI_PROPERTY_GS_OUTPUT_PRIM:
-			p->cfg.prim_type = nv50_map_gs_output_prim(data[0]);
-			break;
-		case TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES:
-			p->cfg.vert_count = data[0];
-			break;
-		default:
-			break;
-		}
-	}
-
-	switch (p->type) {
-	case PIPE_SHADER_VERTEX:
-		p->cfg.psiz = 0x40;
-		p->cfg.clpd = 0x40;
-		p->cfg.out_nr = pc->result_nr;
-		break;
-	case PIPE_SHADER_GEOMETRY:
-		assert(p->cfg.prim_type);
-		assert(p->cfg.vert_count);
-
-		p->cfg.psiz = 0x80;
-		p->cfg.clpd = 0x80;
-		p->cfg.prim_id = 0x80;
-		p->cfg.out_nr = pc->result_nr;
-		p->cfg.in_nr = pc->attr_nr;
-
-		p->cfg.two_side[0].hw = 0x80;
-		p->cfg.two_side[1].hw = 0x80;
-		break;
-	case PIPE_SHADER_FRAGMENT:
-		/* FP inputs and results are constructed as P_TEMP */
-		rtype[0] = rtype[1] = P_TEMP;
-
-		p->cfg.regs[0] = 0x01000004;
-		p->cfg.in_nr = pc->attr_nr;
-
-		if (p->info.writes_z) {
-			p->cfg.regs[2] |= 0x00000100;
-			p->cfg.regs[3] |= 0x00000011;
-		}
-		if (p->info.uses_kill)
-			p->cfg.regs[2] |= 0x00100000;
-		break;
-	}
-
-	if (pc->temp_nr) {
-		pc->temp = MALLOC(pc->temp_nr * 4 * sizeof(struct nv50_reg));
-		if (!pc->temp)
-			return FALSE;
-
-		for (i = 0; i < pc->temp_nr * 4; ++i)
-			ctor_reg(&pc->temp[i], P_TEMP, i / 4, -1);
-	}
-
-	if (pc->attr_nr) {
-		pc->attr = MALLOC(pc->attr_nr * 4 * sizeof(struct nv50_reg));
-		if (!pc->attr)
-			return FALSE;
-
-		for (i = 0; i < pc->attr_nr * 4; ++i)
-			ctor_reg(&pc->attr[i], rtype[0], i / 4, -1);
-	}
-
-	if (pc->result_nr) {
-		unsigned nr = pc->result_nr * 4;
-
-		pc->result = MALLOC(nr * sizeof(struct nv50_reg));
-		if (!pc->result)
-			return FALSE;
-
-		for (i = 0; i < nr; ++i)
-			ctor_reg(&pc->result[i], rtype[1], i / 4, -1);
-	}
-
-	if (pc->param_nr) {
-		int rid = 0;
-
-		pc->param = MALLOC(pc->param_nr * 4 * sizeof(struct nv50_reg));
-		if (!pc->param)
-			return FALSE;
-
-		for (i = 0; i < pc->param_nr; ++i)
-			for (c = 0; c < 4; ++c, ++rid)
-				ctor_reg(&pc->param[rid], P_CONST, i, rid);
-	}
-
-	/* pc->addr holds pointers, hence the pointer-sized elements */
-	if (pc->addr_nr) {
-		pc->addr = CALLOC(pc->addr_nr * 4, sizeof(struct nv50_reg *));
-		if (!pc->addr)
-			return FALSE;
-	}
-	for (i = 0; i < NV50_SU_MAX_ADDR; ++i)
-		ctor_reg(&pc->r_addr[i], P_ADDR, -1, i + 1);
-
-	if (pc->sysval_nr) {
-		/* pc->sysval holds register descriptors by value (they are
-		 * constructed in place right below), so each element must be
-		 * a full struct nv50_reg; sizing them like pointers made
-		 * ctor_reg write past the end of the allocation */
-		pc->sysval = CALLOC(pc->sysval_nr, sizeof(struct nv50_reg));
-		if (!pc->sysval)
-			return FALSE;
-		/* will only ever use SYSTEM_VALUE[i].x (hopefully) */
-		for (i = 0; i < pc->sysval_nr; ++i)
-			ctor_reg(&pc->sysval[i], rtype[0], i, -1);
-	}
-
-	return TRUE;
-}
-
-/* Post-process the emitted instruction list.
- *
- * Converts 32 bit instructions that would otherwise stand alone into
- * 64 bit ones, shifting branch targets and the recorded per-instruction
- * positions (pc->insn_pos) that lie at or behind the growth point, and
- * finally resolves CALL targets from TGSI instruction numbers to
- * positions if the program contains any CAL opcode.
- */
-static void
-nv50_program_fixup_insns(struct nv50_pc *pc)
-{
-	struct nv50_program_exec *e, **bra_list;
-	unsigned i, n, pos;
-
-	/* NOTE(review): the allocation result is not checked before
-	 * bra_list is written below */
-	bra_list = CALLOC(pc->p->exec_size, sizeof(struct nv50_program_exec *));
-
-	/* Collect branch instructions, we need to adjust their offsets
-	 * when converting 32 bit instructions to 64 bit ones
-	 */
-	for (n = 0, e = pc->p->exec_head; e; e = e->next)
-		if (e->param.index >= 0 && !e->param.mask)
-			bra_list[n++] = e;
-
-	/* Make sure we don't have any single 32 bit instructions. */
-	for (e = pc->p->exec_head, pos = 0; e; e = e->next) {
-		pos += is_long(e) ? 2 : 1;
-
-		/* odd position followed by a long insn or the program end */
-		if ((pos & 1) && (!e->next || is_long(e->next))) {
-			for (i = 0; i < n; ++i)
-				if (bra_list[i]->param.index >= pos)
-					bra_list[i]->param.index += 1;
-			for (i = 0; i < pc->insn_nr; ++i)
-				if (pc->insn_pos[i] >= pos)
-					pc->insn_pos[i] += 1;
-			convert_to_long(pc, e);
-			++pos;
-		}
-	}
-
-	FREE(bra_list);
-
-	if (!pc->p->info.opcode_count[TGSI_OPCODE_CAL])
-		return;
-
-	/* fill in CALL offsets */
-	for (e = pc->p->exec_head; e; e = e->next) {
-		if ((e->inst[0] & 2) && (e->inst[0] >> 28) == 0x2)
-			e->param.index = pc->insn_pos[e->param.index];
-	}
-}
-
-/* Translate the TGSI tokens of program p into nv50 code.
- *
- * Builds a temporary translation context, runs the preparation pass
- * (nv50_program_tx_prep), translates the instructions one by one while
- * recording the position of each in pc->insn_pos, fixes up the result
- * (nv50_program_fixup_insns) and stores the parameter / immediate counts
- * and the immediate buffer in p.
- *
- * Returns TRUE on success and FALSE if an allocation, the preparation
- * pass or the translation of an instruction fails. The translation
- * context is released on every path.
- */
-static boolean
-nv50_program_tx(struct nv50_program *p)
-{
-	struct tgsi_parse_context parse;
-	struct nv50_pc *pc;
-	boolean ret;
-
-	pc = CALLOC_STRUCT(nv50_pc);
-	if (!pc)
-		return FALSE;
-
-	ret = ctor_nv50_pc(pc, p);
-	if (ret == FALSE)
-		goto out_cleanup;
-
-	ret = nv50_program_tx_prep(pc);
-	if (ret == FALSE)
-		goto out_cleanup;
-
-	pc->insn_pos = MALLOC(pc->insn_nr * sizeof(unsigned));
-	/* insn_pos is written once per instruction below, so fail cleanly
-	 * instead of dereferencing NULL; a program without instructions
-	 * never touches the array and may legitimately get NULL here */
-	if (!pc->insn_pos && pc->insn_nr) {
-		ret = FALSE;
-		goto out_cleanup;
-	}
-
-	tgsi_parse_init(&parse, pc->p->pipe.tokens);
-	while (!tgsi_parse_end_of_tokens(&parse)) {
-		const union tgsi_full_token *tok = &parse.FullToken;
-
-		/* previously allow32 was FALSE for first & last instruction */
-		pc->allow32 = TRUE;
-
-		tgsi_parse_token(&parse);
-
-		switch (tok->Token.Type) {
-		case TGSI_TOKEN_TYPE_INSTRUCTION:
-			/* position of this instruction in the output */
-			pc->insn_pos[pc->insn_cur] = pc->p->exec_size;
-			++pc->insn_cur;
-			ret = nv50_tgsi_insn(pc, tok);
-			if (ret == FALSE)
-				goto out_err;
-			break;
-		default:
-			break;
-		}
-	}
-
-	nv50_program_fixup_insns(pc);
-
-	p->param_nr = pc->param_nr * 4;
-	p->immd_nr = pc->immd_nr * 4;
-	p->immd = pc->immd_buf;
-
-out_err:
-	tgsi_parse_free(&parse);
-
-out_cleanup:
-	free_nv50_pc(pc);
-	return ret;
-}
-
-/* Translate program p and flag it as translated. A failed translation
- * only trips an assertion; p->translated is set in either case.
- */
-static void
-nv50_program_validate(struct nv50_context *nv50, struct nv50_program *p)
-{
-	const boolean translated = nv50_program_tx(p);
-
-	if (translated == FALSE)
-		assert(0);
-
-	p->translated = TRUE;
-}
-
-/* Push count 32 bit words from map into constant buffer cbuf, beginning
- * at word offset start. The data is split into packets of at most 2047
- * words, each preceded by a CB_ADDR method selecting buffer and offset.
- */
-static void
-nv50_program_upload_data(struct nv50_context *nv50, uint32_t *map,
-			unsigned start, unsigned count, unsigned cbuf)
-{
-	struct nouveau_channel *chan = nv50->screen->base.channel;
-	struct nouveau_grobj *tesla = nv50->screen->tesla;
-	unsigned chunk;
-
-	for (; count; map += chunk, start += chunk, count -= chunk) {
-		chunk = (count < 2047) ? count : 2047;
-
-		BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1);
-		OUT_RING  (chan, (cbuf << 0) | (start << 8));
-		BEGIN_RING(chan, tesla, NV50TCL_CB_DATA(0) | 0x40000000, chunk);
-		OUT_RINGp (chan, map, chunk);
-	}
-}
-
-/* Make the data used by program p available to the hardware.
- *
- * Immediates: if p has immediates but no allocation in the screen's
- * immediate heap yet, space is allocated (evicting the allocations of
- * other programs if the first attempt fails) and the immediates are
- * uploaded to NV50_CB_PMISC.
- *
- * Parameters: if p uses any, the constant buffer bound for p's shader
- * type is mapped and p->param_nr words are uploaded to the constant
- * buffer slot of that shader type.
- */
-static void
-nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p)
-{
-	struct pipe_context *pipe = &nv50->pipe;
-	struct pipe_transfer *transfer;
-
-	if (!p->data[0] && p->immd_nr) {
-		struct nouveau_resource *heap = nv50->screen->immd_heap;
-
-		if (nouveau_resource_alloc(heap, p->immd_nr, p, &p->data[0])) {
-			/* first attempt failed: free the allocation behind
-			 * the heap head until the head is large enough or
-			 * nothing is left to evict */
-			while (heap->next && heap->size < p->immd_nr) {
-				struct nv50_program *evict = heap->next->priv;
-				nouveau_resource_free(&evict->data[0]);
-			}
-
-			if (nouveau_resource_alloc(heap, p->immd_nr, p,
-						   &p->data[0]))
-				assert(0);
-		}
-
-		/* immediates only need to be uploaded again when freed */
-		nv50_program_upload_data(nv50, p->immd, p->data[0]->start,
-					 p->immd_nr, NV50_CB_PMISC);
-	}
-
-	assert(p->param_nr <= 16384);
-
-	if (p->param_nr) {
-		unsigned cb;
-		/* NOTE(review): the mapping result is passed on without a
-		 * NULL check */
-		uint32_t *map = pipe_buffer_map(pipe,
-						nv50->constbuf[p->type],
-						PIPE_TRANSFER_READ,
-						&transfer);
-		switch (p->type) {
-		case PIPE_SHADER_GEOMETRY: cb = NV50_CB_PGP; break;
-		case PIPE_SHADER_FRAGMENT: cb = NV50_CB_PFP; break;
-		default:
-			cb = NV50_CB_PVP;
-			assert(p->type == PIPE_SHADER_VERTEX);
-			break;
-		}
-
-		nv50_program_upload_data(nv50, map, 0, p->param_nr, cb);
-		pipe_buffer_unmap(pipe, nv50->constbuf[p->type],
-				  transfer);
-	}
-}
-
-/* Upload the code of program p to its buffer object if necessary.
- *
- * An upload happens when the buffer object has just been created or when
- * the immediates of p have moved (p->data[0]->start differs from the
- * start recorded in p->data_start[0]). While the instructions are copied
- * to a staging buffer their parameter fields are patched: constant
- * indices are rebased by the start of the data allocation (for buffer
- * selector 0) and branch targets are written in 8 byte units.
- */
-static void
-nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p)
-{
-	struct nouveau_channel *chan = nv50->screen->base.channel;
-	struct nouveau_grobj *tesla = nv50->screen->tesla;
-	struct nv50_program_exec *e;
-	uint32_t *up, i;
-	boolean upload = FALSE;
-	unsigned offset;
-	int width;
-
-	if (!p->bo) {
-		/* NOTE(review): the return value is ignored */
-		nouveau_bo_new(chan->device, NOUVEAU_BO_VRAM, 0x100,
-			       p->exec_size * 4, &p->bo);
-		upload = TRUE;
-	}
-
-	if (p->data[0] && p->data[0]->start != p->data_start[0])
-		upload = TRUE;
-
-	if (!upload)
-		return;
-
-	/* staging copy of the whole program; NOTE(review): not checked
-	 * for allocation failure */
-	up = MALLOC(p->exec_size * 4);
-
-	for (i = 0, e = p->exec_head; e; e = e->next) {
-		unsigned ei, ci, bs;
-
-		/* non-zero mask: param is an index into a constant buffer */
-		if (e->param.index >= 0 && e->param.mask) {
-			bs = (e->inst[1] >> 22) & 0x07;
-			assert(bs < 2);
-			ei = e->param.shift >> 5;
-			ci = e->param.index;
-			if (bs == 0)
-				ci += p->data[bs]->start;
-
-			e->inst[ei] &= ~e->param.mask;
-			e->inst[ei] |= (ci << e->param.shift);
-		} else
-		if (e->param.index >= 0) {
-			/* zero mask means param is a jump/branch offset */
-			assert(!(e->param.index & 1));
-			/* seem to be 8 byte steps */
-			ei = (e->param.index >> 1) + 0 /* START_ID */;
-
-			e->inst[0] &= 0xf0000fff;
-			e->inst[0] |= ei << 12;
-		}
-
-		up[i++] = e->inst[0];
-		if (is_long(e))
-			up[i++] = e->inst[1];
-	}
-	assert(i == p->exec_size);
-
-	/* remember which data start the uploaded code was patched for */
-	if (p->data[0])
-		p->data_start[0] = p->data[0]->start;
-
-#ifdef NV50_PROGRAM_DUMP
-	NOUVEAU_ERR("-------\n");
-	for (e = p->exec_head; e; e = e->next) {
-		NOUVEAU_ERR("0x%08x\n", e->inst[0]);
-		if (is_long(e))
-			NOUVEAU_ERR("0x%08x\n", e->inst[1]);
-	}
-#endif
-
-	/* SIFC_HEIGHT/SIFC_WIDTH of 65536 do not work, and are not reported
-	 * as data error either. hw bug ? */
-#define SIFC_MAX_WIDTH (65536 - 256)
-	/* offset and width are byte counts; up[] is indexed in words */
-	offset = 0;
-	width = p->exec_size * 4;
-	while (width > 0) {
-		nv50_upload_sifc(nv50, p->bo, offset, NOUVEAU_BO_VRAM,
-				 NV50_2D_DST_FORMAT_R8_UNORM, 65536, 1, 262144,
-				 &up[offset / 4], NV50_2D_SIFC_FORMAT_R8_UNORM,
-				 0, 0, 0, MIN2(SIFC_MAX_WIDTH, width), 1, 1);
-		width -= SIFC_MAX_WIDTH;
-		offset += SIFC_MAX_WIDTH;
-	}
-	BEGIN_RING(chan, tesla, NV50TCL_CODE_CB_FLUSH, 1);
-	OUT_RING  (chan, 0);
-
-	FREE(up);
-}
-
-/* Validate the bound vertex program.
- *
- * Translates it on first use, makes sure its data and code are uploaded,
- * and - only if NV50_NEW_VERTPROG is set in nv50->dirty - returns a new
- * state object with the program address, attribute enables, register
- * allocation counts and start offset. Returns NULL otherwise.
- */
-struct nouveau_stateobj *
-nv50_vertprog_validate(struct nv50_context *nv50)
-{
-	struct nouveau_grobj *tesla = nv50->screen->tesla;
-	struct nv50_program *p = nv50->vertprog;
-	struct nouveau_stateobj *so;
-
-	if (!p->translated) {
-		nv50_program_validate(nv50, p);
-		if (!p->translated)
-			assert(0);
-	}
-
-	/* data and code are validated even if the binding is unchanged */
-	nv50_program_validate_data(nv50, p);
-	nv50_program_validate_code(nv50, p);
-
-	if (!(nv50->dirty & NV50_NEW_VERTPROG))
-		return NULL;
-
-	so = so_new(5, 7, 2);
-	so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2);
-	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
-		  NOUVEAU_BO_HIGH, 0, 0);
-	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
-		  NOUVEAU_BO_LOW, 0, 0);
-	so_method(so, tesla, NV50TCL_VP_ATTR_EN_0, 2);
-	so_data  (so, p->cfg.attr[0]);
-	so_data  (so, p->cfg.attr[1]);
-	so_method(so, tesla, NV50TCL_VP_REG_ALLOC_RESULT, 1);
-	so_data  (so, p->cfg.high_result);
-	so_method(so, tesla, NV50TCL_VP_REG_ALLOC_TEMP, 1);
-	so_data  (so, p->cfg.high_temp);
-	so_method(so, tesla, NV50TCL_VP_START_ID, 1);
-	so_data  (so, 0); /* program start offset */
-	return so;
-}
-
-/* Validate the bound fragment program.
- *
- * Translates it on first use, makes sure its data and code are uploaded,
- * and - only if NV50_NEW_FRAGPROG is set in nv50->dirty - returns a new
- * state object with the program address, register/result counts, the
- * control words from cfg.regs[2] and cfg.regs[3] and the start offset.
- * Returns NULL otherwise.
- */
-struct nouveau_stateobj *
-nv50_fragprog_validate(struct nv50_context *nv50)
-{
-	struct nouveau_grobj *tesla = nv50->screen->tesla;
-	struct nv50_program *p = nv50->fragprog;
-	struct nouveau_stateobj *so;
-
-	if (!p->translated) {
-		nv50_program_validate(nv50, p);
-		if (!p->translated)
-			assert(0);
-	}
-
-	/* data and code are validated even if the binding is unchanged */
-	nv50_program_validate_data(nv50, p);
-	nv50_program_validate_code(nv50, p);
-
-	if (!(nv50->dirty & NV50_NEW_FRAGPROG))
-		return NULL;
-
-	so = so_new(6, 7, 2);
-	so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2);
-	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
-		      NOUVEAU_BO_HIGH, 0, 0);
-	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
-		      NOUVEAU_BO_LOW, 0, 0);
-	so_method(so, tesla, NV50TCL_FP_REG_ALLOC_TEMP, 1);
-	so_data  (so, p->cfg.high_temp);
-	so_method(so, tesla, NV50TCL_FP_RESULT_COUNT, 1);
-	so_data  (so, p->cfg.high_result);
-	so_method(so, tesla, NV50TCL_FP_CONTROL, 1);
-	so_data  (so, p->cfg.regs[2]);
-	so_method(so, tesla, NV50TCL_FP_CTRL_UNK196C, 1);
-	so_data  (so, p->cfg.regs[3]);
-	so_method(so, tesla, NV50TCL_FP_START_ID, 1);
-	so_data  (so, 0); /* program start offset */
-	return so;
-}
-
-/* Validate the bound geometry program.
- *
- * Translates it on first use, makes sure its data and code are uploaded,
- * and - only if NV50_NEW_GEOMPROG is set in nv50->dirty - returns a new
- * state object with the program address, register allocation counts,
- * output primitive type, maximum output vertex count and start offset.
- * Returns NULL otherwise.
- */
-struct nouveau_stateobj *
-nv50_geomprog_validate(struct nv50_context *nv50)
-{
-	struct nouveau_grobj *tesla = nv50->screen->tesla;
-	struct nv50_program *p = nv50->geomprog;
-	struct nouveau_stateobj *so;
-
-	if (!p->translated) {
-		nv50_program_validate(nv50, p);
-		if (!p->translated)
-			assert(0);
-	}
-
-	/* data and code are validated even if the binding is unchanged */
-	nv50_program_validate_data(nv50, p);
-	nv50_program_validate_code(nv50, p);
-
-	if (!(nv50->dirty & NV50_NEW_GEOMPROG))
-		return NULL;
-
-	so = so_new(6, 7, 2);
-	so_method(so, tesla, NV50TCL_GP_ADDRESS_HIGH, 2);
-	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
-		  NOUVEAU_BO_HIGH, 0, 0);
-	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
-		  NOUVEAU_BO_LOW, 0, 0);
-	so_method(so, tesla, NV50TCL_GP_REG_ALLOC_TEMP, 1);
-	so_data  (so, p->cfg.high_temp);
-	so_method(so, tesla, NV50TCL_GP_REG_ALLOC_RESULT, 1);
-	so_data  (so, p->cfg.high_result);
-	so_method(so, tesla, NV50TCL_GP_OUTPUT_PRIMITIVE_TYPE, 1);
-	so_data  (so, p->cfg.prim_type);
-	so_method(so, tesla, NV50TCL_GP_VERTEX_OUTPUT_COUNT, 1);
-	so_data  (so, p->cfg.vert_count);
-	so_method(so, tesla, NV50TCL_GP_START_ID, 1);
-	so_data  (so, 0); /* program start offset */
-	return so;
-}
-
-/* Build the point coordinate replacement map for the bound programs.
- *
- * pntc:  (out) 8 words, cleared first; one 4 bit entry per FP input
- *        component slot, set to component + 1 for replaced components
- * base:  slot number of the first FP input component
- *
- * A GENERIC FP input is replaced if the last pre-rasterization program
- * (geometry if bound, vertex otherwise) has no matching output, or if it
- * has one and the rasterizer's sprite_coord_enable bit for its semantic
- * index is set.
- *
- * Returns 0 if the sprite coordinate origin is the lower left corner and
- * 0x10 otherwise.
- */
-static uint32_t
-nv50_pntc_replace(struct nv50_context *nv50, uint32_t pntc[8], unsigned base)
-{
-	struct nv50_program *vp;
-	struct nv50_program *fp = nv50->fragprog;
-	unsigned i, c, m = base;
-	uint32_t origin = 0x00000010;
-
-	vp = nv50->geomprog ? nv50->geomprog : nv50->vertprog;
-
-	/* XXX: this might not work correctly in all cases yet - we'll
-	 * just assume that an FP generic input that is not written in
-	 * the VP is PointCoord.
-	 */
-	memset(pntc, 0, 8 * sizeof(uint32_t));
-
-	for (i = 0; i < fp->cfg.in_nr; i++) {
-		unsigned j, n = popcnt4(fp->cfg.in[i].mask);
-
-		/* non-generic inputs only advance the slot counter */
-		if (fp->cfg.in[i].sn != TGSI_SEMANTIC_GENERIC) {
-			m += n;
-			continue;
-		}
-
-		for (j = 0; j < vp->cfg.out_nr; ++j)
-			if (vp->cfg.out[j].sn == fp->cfg.in[i].sn &&
-			    vp->cfg.out[j].si == fp->cfg.in[i].si)
-				break;
-
-		/* "found" has to be tested against the bound of the search
-		 * loop above; comparing with info.num_outputs let an
-		 * unmatched j through whenever out_nr was smaller */
-		if (j < vp->cfg.out_nr) {
-			ubyte enable =
-				 (nv50->rasterizer->pipe.sprite_coord_enable >> vp->cfg.out[j].si) & 1;
-
-			if (enable == 0) {
-				m += n;
-				continue;
-			}
-		}
-
-		/* this is either PointCoord or replaced by sprite coords */
-		for (c = 0; c < 4; c++) {
-			if (!(fp->cfg.in[i].mask & (1 << c)))
-				continue;
-			pntc[m / 8] |= (c + 1) << ((m % 8) * 4);
-			++m;
-		}
-	}
-	return (nv50->rasterizer->pipe.sprite_coord_mode == PIPE_SPRITE_COORD_LOWER_LEFT ? 0 : origin);
-}
-
-static int
-nv50_vec4_map(uint32_t *map32, int mid, uint8_t zval, uint32_t lin[4],
-	      struct nv50_sreg4 *fpi, struct nv50_sreg4 *vpo)
-{
-	int c;
-	uint8_t mv = vpo->mask, mf = fpi->mask, oid = vpo->hw;
-	uint8_t *map = (uint8_t *)map32;
-
-	for (c = 0; c < 4; ++c) {
-		if (mf & 1) {
-			if (fpi->linear == TRUE)
-				lin[mid / 32] |= 1 << (mid % 32);
-			if (mv & 1)
-				map[mid] = oid;
-			else
-				map[mid] = (c == 3) ? (zval + 1) : zval;
-			++mid;
-		}
-
-		oid += mv & 1;
-		mf >>= 1;
-		mv >>= 1;
-	}
-
-	return mid;
-}
-
-struct nouveau_stateobj *
-nv50_fp_linkage_validate(struct nv50_context *nv50)
-{
-	struct nouveau_grobj *tesla = nv50->screen->tesla;
-	struct nv50_program *vp = nv50->vertprog;
-	struct nv50_program *fp = nv50->fragprog;
-	struct nouveau_stateobj *so;
-	struct nv50_sreg4 dummy;
-	int i, n, c, m = 0;
-	uint32_t map[16], lin[4], reg[6], pcrd[8];
-	uint8_t zval = 0x40;
-
-	if (nv50->geomprog) {
-		vp = nv50->geomprog;
-		zval = 0x80;
-	}
-	memset(map, 0, sizeof(map));
-	memset(lin, 0, sizeof(lin));
-
-	reg[1] = 0x00000004; /* low and high clip distance map ids */
-	reg[2] = 0x00000000; /* layer index map id (disabled, GP only) */
-	reg[3] = 0x00000000; /* point size map id & enable */
-	reg[5] = 0x00000000; /* primitive ID map slot */
-	reg[0] = fp->cfg.regs[0]; /* colour semantic reg */
-	reg[4] = fp->cfg.regs[1]; /* interpolant info */
-
-	dummy.linear = FALSE;
-	dummy.mask = 0xf; /* map all components of HPOS */
-	m = nv50_vec4_map(map, m, zval, lin, &dummy, &vp->cfg.out[0]);
-
-	dummy.mask = 0x0;
-
-	if (vp->cfg.clpd < 0x40) {
-		for (c = 0; c < vp->cfg.clpd_nr; ++c) {
-			map[m / 4] |= (vp->cfg.clpd + c) << ((m % 4) * 8);
-			++m;
-		}
-		reg[1] = (m << 8);
-	}
-
-	reg[0] |= m << 8; /* adjust BFC0 id */
-
-	/* if light_twoside is active, it seems FFC0_ID == BFC0_ID is bad */
-	if (nv50->rasterizer->pipe.light_twoside) {
-		struct nv50_sreg4 *vpo = &vp->cfg.two_side[0];
-		struct nv50_sreg4 *fpi = &fp->cfg.two_side[0];
-
-		m = nv50_vec4_map(map, m, zval, lin, &fpi[0], &vpo[0]);
-		m = nv50_vec4_map(map, m, zval, lin, &fpi[1], &vpo[1]);
-	}
-
-	reg[0] += m - 4; /* adjust FFC0 id */
-	reg[4] |= m << 8; /* set mid where 'normal' FP inputs start */
-
-	for (i = 0; i < fp->cfg.in_nr; i++) {
-		/* maybe even remove these from cfg.io */
-		if (fp->cfg.in[i].sn == TGSI_SEMANTIC_POSITION ||
-		    fp->cfg.in[i].sn == TGSI_SEMANTIC_FACE)
-			continue;
-
-		for (n = 0; n < vp->cfg.out_nr; ++n)
-			if (vp->cfg.out[n].sn == fp->cfg.in[i].sn &&
-			    vp->cfg.out[n].si == fp->cfg.in[i].si)
-				break;
-
-		m = nv50_vec4_map(map, m, zval, lin, &fp->cfg.in[i],
-				  (n < vp->cfg.out_nr) ?
-				  &vp->cfg.out[n] : &dummy);
-	}
-	/* PrimitiveID either is replaced by the system value, or
-	 * written by the geometry shader into an output register
-	 */
-	if (fp->cfg.prim_id < 0x40) {
-		map[m / 4] |= vp->cfg.prim_id << ((m % 4) * 8);
-		reg[5] = m++;
-	}
-
-	if (nv50->rasterizer->pipe.point_size_per_vertex) {
-		map[m / 4] |= vp->cfg.psiz << ((m % 4) * 8);
-		reg[3] = (m++ << 4) | 1;
-	}
-
-	/* now fill the stateobj (at most 28 so_data)  */
-	so = so_new(10, 54, 0);
-
-	n = (m + 3) / 4;
-	assert(m <= 64);
-	if (vp->type == PIPE_SHADER_GEOMETRY) {
-		so_method(so, tesla, NV50TCL_GP_RESULT_MAP_SIZE, 1);
-		so_data  (so, m);
-		so_method(so, tesla, NV50TCL_GP_RESULT_MAP(0), n);
-		so_datap (so, map, n);
-	} else {
-		so_method(so, tesla, NV50TCL_VP_GP_BUILTIN_ATTR_EN, 1);
-		so_data  (so, vp->cfg.regs[0]);
-
-		so_method(so, tesla, NV50TCL_MAP_SEMANTIC_4, 1);
-		so_data  (so, reg[5]);
-
-		so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 1);
-		so_data  (so, m);
-		so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), n);
-		so_datap (so, map, n);
-	}
-
-	so_method(so, tesla, NV50TCL_MAP_SEMANTIC_0, 4);
-	so_datap (so, reg, 4);
-
-	so_method(so, tesla, NV50TCL_FP_INTERPOLANT_CTRL, 1);
-	so_data  (so, reg[4]);
-
-	so_method(so, tesla, NV50TCL_NOPERSPECTIVE_BITMAP(0), 4);
-	so_datap (so, lin, 4);
-
-	if (nv50->rasterizer->pipe.sprite_coord_enable) {
-		so_method(so, tesla, NV50TCL_POINT_SPRITE_CTRL, 1);
-		so_data  (so,
-			  nv50_pntc_replace(nv50, pcrd, (reg[4] >> 8) & 0xff));
-
-		so_method(so, tesla, NV50TCL_POINT_COORD_REPLACE_MAP(0), 8);
-		so_datap (so, pcrd, 8);
-	}
-
-	so_method(so, tesla, NV50TCL_GP_ENABLE, 1);
-	so_data  (so, (vp->type == PIPE_SHADER_GEOMETRY) ? 1 : 0);
-
-	return so;
-}
-
-static int
-construct_vp_gp_mapping(uint32_t *map32, int m,
-			struct nv50_program *vp, struct nv50_program *gp)
-{
-	uint8_t *map = (uint8_t *)map32;
-	int i, j, c;
-
-        for (i = 0; i < gp->cfg.in_nr; ++i) {
-                uint8_t oid = 0, mv = 0, mg = gp->cfg.in[i].mask;
-
-                for (j = 0; j < vp->cfg.out_nr; ++j) {
-                        if (vp->cfg.out[j].sn == gp->cfg.in[i].sn &&
-                            vp->cfg.out[j].si == gp->cfg.in[i].si) {
-				mv = vp->cfg.out[j].mask;
-				oid = vp->cfg.out[j].hw;
-                                break;
-			}
-		}
-
-                for (c = 0; c < 4; ++c, mv >>= 1, mg >>= 1) {
-			if (mg & mv & 1)
-				map[m++] = oid;
-			else
-			if (mg & 1)
-				map[m++] = (c == 3) ? 0x41 : 0x40;
-                        oid += mv & 1;
-                }
-        }
-	return m;
-}
-
-struct nouveau_stateobj *
-nv50_gp_linkage_validate(struct nv50_context *nv50)
-{
-	struct nouveau_grobj *tesla = nv50->screen->tesla;
-	struct nouveau_stateobj *so;
-	struct nv50_program *vp = nv50->vertprog;
-	struct nv50_program *gp = nv50->geomprog;
-	uint32_t map[16];
-	int m = 0;
-
-	if (!gp)
-		return NULL;
-	memset(map, 0, sizeof(map));
-
-	m = construct_vp_gp_mapping(map, m, vp, gp);
-
-	so = so_new(3, 24 - 3, 0);
-
-	so_method(so, tesla, NV50TCL_VP_GP_BUILTIN_ATTR_EN, 1);
-	so_data  (so, vp->cfg.regs[0] | gp->cfg.regs[0]);
-
-	assert(m <= 32);
-	so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 1);
-	so_data  (so, m);
-
-	m = (m + 3) / 4;
-	so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), m);
-	so_datap (so, map, m);
-
-	return so;
-}
-
-void
-nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
-{
-	while (p->exec_head) {
-		struct nv50_program_exec *e = p->exec_head;
-
-		p->exec_head = e->next;
-		FREE(e);
-	}
-	p->exec_tail = NULL;
-	p->exec_size = 0;
+   nouveau_bo_ref(NULL, &p->bo);
 
-	nouveau_bo_ref(NULL, &p->bo);
+   so_ref(NULL, &p->so);
 
-	FREE(p->immd);
-	nouveau_resource_free(&p->data[0]);
+   if (p->code)
+      FREE(p->code);
 
-	p->translated = 0;
+   p->translated = FALSE;
 }
diff --git a/src/gallium/drivers/nv50/nv50_program.h b/src/gallium/drivers/nv50/nv50_program.h
index 1e3ad6bff0..654bce59f3 100644
--- a/src/gallium/drivers/nv50/nv50_program.h
+++ b/src/gallium/drivers/nv50/nv50_program.h
@@ -1,75 +1,116 @@
-#ifndef __NV50_PROGRAM_H__
-#define __NV50_PROGRAM_H__
+/*
+ * Copyright 2010 Ben Skeggs
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+ * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __NV50_PROG_H__
+#define __NV50_PROG_H__
 
 #include "pipe/p_state.h"
 #include "tgsi/tgsi_scan.h"
+#include "nouveau/nouveau_class.h"
 
-struct nv50_program_exec {
-	struct nv50_program_exec *next;
+struct nv50_varying {
+   uint8_t id; /* tgsi index */
+   uint8_t hw; /* hw index, nv50 wants flat FP inputs last */
 
-	unsigned inst[2];
-	struct {
-		int index;
-		unsigned mask;
-		unsigned shift;
-	} param;
-};
-
-struct nv50_sreg4 {
-	uint8_t hw; /* hw index, nv50 wants flat FP inputs last */
-	uint8_t id; /* tgsi index */
-
-	uint8_t mask;
-	boolean linear;
+   uint8_t mask   : 4;
+   uint8_t linear : 1;
+   uint8_t pad    : 3;
 
-	ubyte sn, si; /* semantic name & index */
+   ubyte sn; /* semantic name */
+   ubyte si; /* semantic index */
 };
 
 struct nv50_program {
-	struct pipe_shader_state pipe;
-	struct tgsi_shader_info info;
-	boolean translated;
-
-	unsigned type;
-	struct nv50_program_exec *exec_head;
-	struct nv50_program_exec *exec_tail;
-	unsigned exec_size;
-	struct nouveau_resource *data[1];
-	unsigned data_start[1];
-
-	struct nouveau_bo *bo;
-
-	uint32_t *immd;
-	unsigned immd_nr;
-	unsigned param_nr;
-
-	struct {
-		unsigned high_temp;
-		unsigned high_result;
-
-		uint32_t attr[2];
-		uint32_t regs[4];
-
-		/* for VPs, io_nr doesn't count 'private' results (PSIZ etc.) */
-		unsigned in_nr, out_nr;
-		struct nv50_sreg4 in[PIPE_MAX_SHADER_INPUTS];
-		struct nv50_sreg4 out[PIPE_MAX_SHADER_OUTPUTS];
-
-		/* FP colour inputs, VP/GP back colour outputs */
-		struct nv50_sreg4 two_side[2];
-
-		/* GP only */
-		unsigned vert_count;
-		uint8_t prim_type;
-
-		/* VP & GP only */
-		uint8_t clpd, clpd_nr;
-		uint8_t psiz;
-		uint8_t edgeflag_in;
-
-		/* FP & GP only */
-		uint8_t prim_id;
-	} cfg;
+   struct pipe_shader_state pipe;
+
+   ubyte type;
+   boolean translated;
+
+   struct nouveau_bo *bo;
+   struct nouveau_stateobj *so;
+
+   uint32_t *code;
+   unsigned code_size;
+   unsigned code_start; /* offset inside bo */
+   uint32_t *immd;
+   unsigned immd_size;
+   unsigned parm_size; /* size limit of uniform buffer */
+
+   ubyte max_gpr; /* REG_ALLOC_TEMP */
+   ubyte max_out; /* REG_ALLOC_RESULT or FP_RESULT_COUNT */
+
+   ubyte in_nr;
+   ubyte out_nr;
+   struct nv50_varying in[16];
+   struct nv50_varying out[16];
+
+   struct {
+      uint32_t attrs[3]; /* VP_ATTR_EN_0,1 and VP_GP_BUILTIN_ATTR_EN */
+      ubyte psiz;
+      ubyte bfc[2];
+      ubyte edgeflag;
+      ubyte clpd;
+      ubyte clpd_nr;
+   } vp;
+
+   struct {
+      uint32_t flags[2]; /* 0x19a8, 196c */
+      uint32_t interp; /* 0x1988 */
+      uint32_t colors; /* 0x1904 */
+   } fp;
+
+   struct {
+      ubyte primid; /* primitive id output register */
+      uint8_t vert_count;
+      uint8_t prim_type; /* point, line strip or tri strip */
+   } gp;
+
+   void *fixups;
+   unsigned num_fixups;
 };
 
-#endif
+#define NV50_INTERP_LINEAR   (1 << 0)
+#define NV50_INTERP_FLAT     (1 << 1)
+#define NV50_INTERP_CENTROID (1 << 2)
+
+struct nv50_translation_info {
+   struct nv50_program *p;
+   unsigned inst_nr;
+   ubyte input_file;
+   ubyte output_file;
+   ubyte input_map[PIPE_MAX_SHADER_INPUTS][4];
+   ubyte output_map[PIPE_MAX_SHADER_OUTPUTS][4];
+   ubyte interp_mode[PIPE_MAX_SHADER_INPUTS];
+   int input_access[PIPE_MAX_SHADER_INPUTS][4];
+   int output_access[PIPE_MAX_SHADER_OUTPUTS][4];
+   boolean indirect_inputs;
+   boolean indirect_outputs;
+   struct tgsi_shader_info scan;
+   uint32_t *immd32;
+   unsigned immd32_nr;
+   ubyte edgeflag_out;
+};
+
+int nv50_generate_code(struct nv50_translation_info *ti);
+boolean nv50_program_tx(struct nv50_program *p);
+
+#endif /* __NV50_PROG_H__ */
diff --git a/src/gallium/drivers/nv50/nv50_push.c b/src/gallium/drivers/nv50/nv50_push.c
index c3ac804146..481182dd8d 100644
--- a/src/gallium/drivers/nv50/nv50_push.c
+++ b/src/gallium/drivers/nv50/nv50_push.c
@@ -227,7 +227,7 @@ nv50_push_elements_instanced(struct pipe_context *pipe,
    ctx.idxbuf = NULL;
    ctx.vtx_size = 0;
    ctx.edgeflag = 0.5f;
-   ctx.edgeflag_attr = nv50->vertprog->cfg.edgeflag_in;
+   ctx.edgeflag_attr = nv50->vertprog->vp.edgeflag;
 
    /* map vertex buffers, determine vertex size */
    for (i = 0; i < nv50->vtxelt->num_elements; i++) {
diff --git a/src/gallium/drivers/nv50/nv50_shader_state.c b/src/gallium/drivers/nv50/nv50_shader_state.c
new file mode 100644
index 0000000000..f7e6355286
--- /dev/null
+++ b/src/gallium/drivers/nv50/nv50_shader_state.c
@@ -0,0 +1,619 @@
+/*
+ * Copyright 2008 Ben Skeggs
+ * Copyright 2010 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+ * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "util/u_inlines.h"
+
+#include "nv50_context.h"
+#include "nv50_transfer.h"
+
+static void
+nv50_transfer_constbuf(struct nv50_context *nv50,
+                       struct pipe_resource *buf, unsigned size, unsigned cbi)
+{
+   struct pipe_context *pipe = &nv50->pipe;
+   struct pipe_transfer *transfer;
+   struct nouveau_channel *chan = nv50->screen->base.channel;
+   struct nouveau_grobj *tesla = nv50->screen->tesla;
+   uint32_t *map;
+   unsigned count, start;
+
+   map = pipe_buffer_map(pipe, buf, PIPE_TRANSFER_READ, &transfer);
+   if (!map)
+      return;
+
+   count = MIN2(buf->width0, size);
+   start = 0;
+
+   while (count) {
+      unsigned nr = count;
+      nr = MIN2(nr, 2047);
+
+      /* FIXME: emit relocs for unsuiTed MM */
+      BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1);
+      OUT_RING  (chan, (start << 8) | cbi);
+      BEGIN_RING_NI(chan, tesla, NV50TCL_CB_DATA(0), nr);
+      OUT_RINGp (chan, map, nr);
+
+      count -= nr;
+      start += nr;
+      map += nr;
+   }
+
+   pipe_buffer_unmap(pipe, buf, transfer);
+}
+
+/* Upload p's immediates (to NV50_CB_PMISC) and its bound user constants. */
+static void
+nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p)
+{
+   struct nouveau_channel *chan = nv50->screen->base.channel;
+   struct nouveau_grobj *tesla = nv50->screen->tesla;
+   unsigned cbi;
+
+   if (p->immd_size) {
+      uint32_t *data = p->immd;
+      unsigned count = p->immd_size / 4;
+      unsigned start = 0;
+
+      while (count) {
+         unsigned nr = MIN2(count, 2047); /* cap each upload at 2047 words */
+
+         BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1);
+         OUT_RING  (chan, (start << 8) | NV50_CB_PMISC);
+         BEGIN_RING_NI(chan, tesla, NV50TCL_CB_DATA(0), nr);
+         OUT_RINGp (chan, data, nr);
+
+         count -= nr;
+         start += nr;
+         data += nr;
+      }
+   }
+
+   if (p->parm_size == 0)
+      return;
+
+   switch (p->type) {
+   case PIPE_SHADER_VERTEX:
+      cbi = NV50_CB_PVP;
+      break;
+   case PIPE_SHADER_FRAGMENT:
+      cbi = NV50_CB_PFP;
+      break;
+   case PIPE_SHADER_GEOMETRY:
+      cbi = NV50_CB_PGP;
+      break;
+   default:
+      assert(0);
+      return; /* asserts disabled: never pass an uninitialized cbi on */
+   }
+
+   nv50_transfer_constbuf(nv50, nv50->constbuf[p->type], p->parm_size, cbi);
+}
+
+static void
+nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p)
+{
+   struct nouveau_channel *chan = nv50->screen->base.channel;
+   struct nouveau_grobj *tesla = nv50->screen->tesla;
+   struct nouveau_grobj *eng2d = nv50->screen->eng2d;
+   int ret;
+   unsigned offset;
+   unsigned size = p->code_size;
+   uint32_t *data = p->code;
+
+   assert(p->translated);
+
+   /* TODO: use a single bo (for each type) for shader code */
+   if (p->bo)
+      return;
+   ret = nouveau_bo_new(chan->device, NOUVEAU_BO_VRAM, 0x100, size, &p->bo);
+   assert(!ret);
+
+   offset = p->code_start = 0;
+
+   BEGIN_RING(chan, eng2d, NV50_2D_DST_FORMAT, 2);
+   OUT_RING  (chan, NV50_2D_DST_FORMAT_R8_UNORM);
+   OUT_RING  (chan, 1);
+   BEGIN_RING(chan, eng2d, NV50_2D_DST_PITCH, 1);
+   OUT_RING  (chan, 0x40000);
+   BEGIN_RING(chan, eng2d, NV50_2D_DST_WIDTH, 2);
+   OUT_RING  (chan, 0x10000);
+   OUT_RING  (chan, 1);
+
+   while (size) {
+      unsigned nr = size / 4;
+
+      if (AVAIL_RING(chan) < 32)
+         FIRE_RING(chan);
+
+      nr = MIN2(nr, AVAIL_RING(chan) - 18);
+      nr = MIN2(nr, 1792);
+      if (nr < (size / 4))
+         nr &= ~0x3f;
+      assert(!(size & 3));
+
+      BEGIN_RING(chan, eng2d, NV50_2D_DST_ADDRESS_HIGH, 2);
+      OUT_RELOCh(chan, p->bo, offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+      OUT_RELOCl(chan, p->bo, offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+      BEGIN_RING(chan, eng2d, NV50_2D_SIFC_BITMAP_ENABLE, 2);
+      OUT_RING  (chan, 0);
+      OUT_RING  (chan, NV50_2D_SIFC_FORMAT_R8_UNORM);
+      BEGIN_RING(chan, eng2d, NV50_2D_SIFC_WIDTH, 10);
+      OUT_RING  (chan, nr * 4);
+      OUT_RING  (chan, 1);
+      OUT_RING  (chan, 0);
+      OUT_RING  (chan, 1);
+      OUT_RING  (chan, 0);
+      OUT_RING  (chan, 1);
+      OUT_RING  (chan, 0);
+      OUT_RING  (chan, 0);
+      OUT_RING  (chan, 0);
+      OUT_RING  (chan, 0);
+
+      BEGIN_RING_NI(chan, eng2d, NV50_2D_SIFC_DATA, nr);
+      OUT_RINGp (chan, data, nr);
+
+      data += nr;
+      offset += nr * 4;
+      size -= nr * 4;
+   }
+
+   BEGIN_RING(chan, tesla, NV50TCL_CODE_CB_FLUSH, 1);
+   OUT_RING  (chan, 0);
+}
+
+static void
+nv50_vp_update_stateobj(struct nv50_context *nv50, struct nv50_program *p)
+{
+   struct nouveau_grobj *tesla = nv50->screen->tesla;
+   struct nouveau_stateobj *so = so_new(5, 7, 2);
+
+   nv50_program_validate_code(nv50, p);
+
+   so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2);
+   so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
+             NOUVEAU_BO_HIGH, 0, 0);
+   so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
+             NOUVEAU_BO_LOW, 0, 0);
+   so_method(so, tesla, NV50TCL_VP_ATTR_EN_0, 2);
+   so_data  (so, p->vp.attrs[0]);
+   so_data  (so, p->vp.attrs[1]);
+   so_method(so, tesla, NV50TCL_VP_REG_ALLOC_RESULT, 1);
+   so_data  (so, p->max_out);
+   so_method(so, tesla, NV50TCL_VP_REG_ALLOC_TEMP, 1);
+   so_data  (so, p->max_gpr);
+   so_method(so, tesla, NV50TCL_VP_START_ID, 1);
+   so_data  (so, p->code_start);
+
+   so_ref(so, &p->so);
+   so_ref(NULL, &so);
+}
+
+static void
+nv50_fp_update_stateobj(struct nv50_context *nv50, struct nv50_program *p)
+{
+   struct nouveau_grobj *tesla = nv50->screen->tesla;
+	struct nouveau_stateobj *so = so_new(6, 7, 2);
+
+   nv50_program_validate_code(nv50, p);
+
+   so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2);
+   so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
+             NOUVEAU_BO_HIGH, 0, 0);
+   so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
+             NOUVEAU_BO_LOW, 0, 0);
+   so_method(so, tesla, NV50TCL_FP_REG_ALLOC_TEMP, 1);
+   so_data  (so, p->max_gpr);
+   so_method(so, tesla, NV50TCL_FP_RESULT_COUNT, 1);
+   so_data  (so, p->max_out);
+   so_method(so, tesla, NV50TCL_FP_CONTROL, 1);
+   so_data  (so, p->fp.flags[0]);
+   so_method(so, tesla, NV50TCL_FP_CTRL_UNK196C, 1);
+   so_data  (so, p->fp.flags[1]);
+   so_method(so, tesla, NV50TCL_FP_START_ID, 1);
+   so_data  (so, p->code_start);
+
+   so_ref(so, &p->so);
+   so_ref(NULL, &so);
+}
+
+static void
+nv50_gp_update_stateobj(struct nv50_context *nv50, struct nv50_program *p)
+{
+   struct nouveau_grobj *tesla = nv50->screen->tesla;
+	struct nouveau_stateobj *so = so_new(6, 7, 2);
+
+   nv50_program_validate_code(nv50, p);
+
+   so_method(so, tesla, NV50TCL_GP_ADDRESS_HIGH, 2);
+   so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
+             NOUVEAU_BO_HIGH, 0, 0);
+   so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
+             NOUVEAU_BO_LOW, 0, 0);
+   so_method(so, tesla, NV50TCL_GP_REG_ALLOC_TEMP, 1);
+   so_data  (so, p->max_gpr);
+   so_method(so, tesla, NV50TCL_GP_REG_ALLOC_RESULT, 1);
+   so_data  (so, p->max_out);
+   so_method(so, tesla, NV50TCL_GP_OUTPUT_PRIMITIVE_TYPE, 1);
+   so_data  (so, p->gp.prim_type);
+   so_method(so, tesla, NV50TCL_GP_VERTEX_OUTPUT_COUNT, 1);
+   so_data  (so, p->gp.vert_count);
+   so_method(so, tesla, NV50TCL_GP_START_ID, 1);
+   so_data  (so, p->code_start);
+
+   so_ref(so, &p->so);
+   so_ref(NULL, &so);
+}
+
+static boolean
+nv50_program_validate(struct nv50_program *p)
+{
+   p->translated = nv50_program_tx(p);
+   assert(p->translated);
+   return p->translated;
+}
+
+struct nouveau_stateobj *
+nv50_vertprog_validate(struct nv50_context *nv50)
+{
+   struct nv50_program *p = nv50->vertprog;
+   struct nouveau_stateobj *so = NULL;
+
+   if (!p->translated) {
+      if (nv50_program_validate(p))
+         nv50_vp_update_stateobj(nv50, p);
+      else
+         return NULL;
+   }
+
+   if (nv50->dirty & NV50_NEW_VERTPROG_CB)
+      nv50_program_validate_data(nv50, p);
+
+   if (!(nv50->dirty & NV50_NEW_VERTPROG))
+      return NULL;
+
+   nv50_program_validate_code(nv50, p);
+
+   so_ref(p->so, &so);
+   return so;
+}
+
+struct nouveau_stateobj *
+nv50_fragprog_validate(struct nv50_context *nv50)
+{
+   struct nv50_program *p = nv50->fragprog;
+   struct nouveau_stateobj *so = NULL;
+
+   if (!p->translated) {
+      if (nv50_program_validate(p))
+         nv50_fp_update_stateobj(nv50, p);
+      else
+         return NULL;
+   }
+
+   if (nv50->dirty & NV50_NEW_FRAGPROG_CB)
+      nv50_program_validate_data(nv50, p);
+
+   if (!(nv50->dirty & NV50_NEW_FRAGPROG))
+      return NULL;
+
+   nv50_program_validate_code(nv50, p);
+
+   so_ref(p->so, &so);
+   return so;
+}
+
+struct nouveau_stateobj *
+nv50_geomprog_validate(struct nv50_context *nv50)
+{
+   struct nv50_program *p = nv50->geomprog;
+   struct nouveau_stateobj *so = NULL;
+
+   if (!p->translated) {
+      if (nv50_program_validate(p))
+         nv50_gp_update_stateobj(nv50, p);
+      else
+         return NULL;
+   }
+
+   if (nv50->dirty & NV50_NEW_GEOMPROG_CB)
+      nv50_program_validate_data(nv50, p);
+
+   if (!(nv50->dirty & NV50_NEW_GEOMPROG))
+      return NULL;
+
+   nv50_program_validate_code(nv50, p);
+
+   so_ref(p->so, &so);
+   return so;
+}
+
+/* XXX: this might not work correctly in all cases yet: we assume that
+ * an FP generic input that is not written in the VP is gl_PointCoord.
+ */
+static uint32_t
+nv50_pntc_replace(struct nv50_context *nv50, uint32_t pntc[8], unsigned m)
+{
+   struct nv50_program *vp = nv50->vertprog;
+   struct nv50_program *fp = nv50->fragprog;
+   unsigned i, c;
+
+   memset(pntc, 0, 8 * sizeof(uint32_t));
+
+   if (nv50->geomprog)
+      vp = nv50->geomprog;
+
+   for (i = 0; i < fp->in_nr; i++) {
+      unsigned j, n = util_bitcount(fp->in[i].mask);
+
+      if (fp->in[i].sn != TGSI_SEMANTIC_GENERIC) {
+         m += n;
+         continue;
+      }
+
+      for (j = 0; j < vp->out_nr; ++j)
+         if (vp->out[j].sn == fp->in[i].sn && vp->out[j].si == fp->in[i].si)
+            break;
+
+      if (j < vp->out_nr) {
+         ubyte en = nv50->rasterizer->pipe.sprite_coord_enable;
+
+         if (!(en & (1 << vp->out[j].si))) {
+            m += n;
+            continue;
+         }
+      }
+
+      /* this is either PointCoord or replaced by sprite coords */
+      for (c = 0; c < 4; c++) {
+         if (!(fp->in[i].mask & (1 << c)))
+            continue;
+         pntc[m / 8] |= (c + 1) << ((m % 8) * 4);
+         ++m;
+      }
+   }
+   if (nv50->rasterizer->pipe.sprite_coord_mode == PIPE_SPRITE_COORD_LOWER_LEFT)
+      return 0;
+   return (1 << 4);
+}
+
+static int /* returns the first map slot after the ones assigned here */
+nv50_vec4_map(uint32_t *map32, int mid, uint32_t lin[4],
+              struct nv50_varying *in, struct nv50_varying *out)
+{
+   int c;
+   uint8_t mv = out->mask, mf = in->mask, oid = out->hw;
+   uint8_t *map = (uint8_t *)map32;
+   /* Give each component read by 'in' one byte slot, starting at slot 'mid'. */
+   for (c = 0; c < 4; ++c) {
+      if (mf & 1) {
+         if (in->linear)
+            lin[mid / 32] |= 1 << (mid % 32); /* flag this slot in 'lin' */
+         if (mv & 1)
+            map[mid] = oid; /* written by 'out': store its hw register index */
+         else
+         if (c == 3)
+            map[mid] |= 1; /* unwritten w: set bit 0 on the caller's prefill */
+         ++mid;
+      }
+
+      oid += mv & 1; /* hw index advances only over components 'out' writes */
+      mf >>= 1;
+      mv >>= 1;
+   }
+
+   return mid;
+}
+
+/* Build the stateobj linking VP/GP results to FP inputs: the result map, the
+ * semantic map ids, interpolation control and point sprite replacement. */
+struct nouveau_stateobj *
+nv50_fp_linkage_validate(struct nv50_context *nv50)
+{
+   struct nouveau_grobj *tesla = nv50->screen->tesla;
+   struct nv50_program *vp;
+   struct nv50_program *fp = nv50->fragprog;
+   struct nouveau_stateobj *so;
+   struct nv50_varying dummy = { 0 }; /* .hw is read by nv50_vec4_map */
+   int i, n, c, m;
+   uint32_t map[16], lin[4], pntc[8];
+   uint8_t *map8 = (uint8_t *)map; /* one byte per slot, as nv50_vec4_map */
+   uint32_t interp = fp->fp.interp;
+   uint32_t colors = fp->fp.colors;
+   uint32_t clip = 0x04;
+   uint32_t psiz = 0x000;
+   uint32_t primid = 0;
+   uint32_t sysval = 0;
+
+   if (nv50->geomprog) {
+      vp = nv50->geomprog;
+      memset(map, 0x80, sizeof(map)); /* every slot starts as 0x80 */
+   } else {
+      vp = nv50->vertprog;
+      memset(map, 0x40, sizeof(map)); /* every slot starts as 0x40 */
+   }
+   memset(lin, 0, sizeof(lin));
+
+   dummy.linear = 0;
+   dummy.mask = 0xf; /* map all components of HPOS */
+   m = nv50_vec4_map(map, 0, lin, &dummy, &vp->out[0]);
+
+   if (vp->vp.clpd < 0x40) {
+      for (c = 0; c < vp->vp.clpd_nr; ++c) {
+         map8[m] = vp->vp.clpd + c; /* assign: OR would keep the prefill bits */
+         ++m;
+      }
+      clip |= vp->vp.clpd_nr << 8;
+   }
+
+   colors |= m << 8; /* adjust BFC0 id */
+
+   /* if light_twoside is active, it seems FFC0_ID == BFC0_ID is bad */
+   if (nv50->rasterizer->pipe.light_twoside) {
+      for (i = 0; i < 2; ++i)
+         m = nv50_vec4_map(map, m, lin,
+                           &fp->in[fp->vp.bfc[i]],
+                           &vp->out[vp->vp.bfc[i]]);
+   }
+
+   colors += m - 4; /* adjust FFC0 id */
+   interp |= m << 8; /* set mid where 'normal' FP inputs start */
+
+   dummy.mask = 0x0; /* stands in for FP inputs the VP/GP does not write */
+   for (i = 0; i < fp->in_nr; i++) {
+      for (n = 0; n < vp->out_nr; ++n)
+         if (vp->out[n].sn == fp->in[i].sn &&
+             vp->out[n].si == fp->in[i].si)
+            break;
+
+      m = nv50_vec4_map(map, m, lin,
+                        &fp->in[i], (n < vp->out_nr) ? &vp->out[n] : &dummy);
+   }
+   /* PrimitiveID either is replaced by the system value, or
+    * written by the geometry shader into an output register
+    */
+   if (fp->gp.primid < 0x40) {
+      map8[m] = vp->gp.primid; /* assign: the slot is prefilled, not zero */
+      primid = m++;
+   }
+
+   if (nv50->rasterizer->pipe.point_size_per_vertex) {
+      map8[m] = vp->vp.psiz; /* assign: the slot is prefilled, not zero */
+      psiz = (m++ << 4) | 1;
+   }
+
+   /* now fill the stateobj (at most 28 so_data)  */
+   so = so_new(10, 54, 0);
+
+   n = (m + 3) / 4;
+   assert(m <= 64);
+   if (vp->type == PIPE_SHADER_GEOMETRY) {
+      so_method(so, tesla, NV50TCL_GP_RESULT_MAP_SIZE, 1);
+      so_data  (so, m);
+      so_method(so, tesla, NV50TCL_GP_RESULT_MAP(0), n);
+      so_datap (so, map, n);
+   } else {
+      so_method(so, tesla, NV50TCL_VP_GP_BUILTIN_ATTR_EN, 1);
+      so_data  (so, vp->vp.attrs[2]);
+
+      so_method(so, tesla, NV50TCL_MAP_SEMANTIC_4, 1);
+      so_data  (so, primid);
+
+      so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 1);
+      so_data  (so, m);
+      so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), n);
+      so_datap (so, map, n);
+   }
+
+   so_method(so, tesla, NV50TCL_MAP_SEMANTIC_0, 4);
+   so_data  (so, colors);
+   so_data  (so, clip);
+   so_data  (so, sysval);
+   so_data  (so, psiz);
+
+   so_method(so, tesla, NV50TCL_FP_INTERPOLANT_CTRL, 1);
+   so_data  (so, interp);
+
+   so_method(so, tesla, NV50TCL_NOPERSPECTIVE_BITMAP(0), 4);
+   so_datap (so, lin, 4);
+
+   if (nv50->rasterizer->pipe.sprite_coord_enable) {
+      so_method(so, tesla, NV50TCL_POINT_SPRITE_CTRL, 1);
+      so_data  (so,
+                nv50_pntc_replace(nv50, pntc, (interp >> 8) & 0xff));
+
+      so_method(so, tesla, NV50TCL_POINT_COORD_REPLACE_MAP(0), 8);
+      so_datap (so, pntc, 8);
+   }
+
+   so_method(so, tesla, NV50TCL_GP_ENABLE, 1);
+   so_data  (so, (vp->type == PIPE_SHADER_GEOMETRY) ? 1 : 0);
+
+   return so;
+}
+
+static int
+nv50_vp_gp_mapping(uint32_t *map32, int m,
+                   struct nv50_program *vp, struct nv50_program *gp)
+{
+   uint8_t *map = (uint8_t *)map32;
+   int i, j, c;
+
+   for (i = 0; i < gp->in_nr; ++i) {
+      uint8_t oid = 0, mv = 0, mg = gp->in[i].mask;
+
+      for (j = 0; j < vp->out_nr; ++j) {
+         if (vp->out[j].sn == gp->in[i].sn &&
+             vp->out[j].si == gp->in[i].si) {
+            mv = vp->out[j].mask;
+            oid = vp->out[j].hw;
+            break;
+         }
+      }
+
+      for (c = 0; c < 4; ++c, mv >>= 1, mg >>= 1) {
+         if (mg & mv & 1)
+            map[m++] = oid;
+         else
+         if (mg & 1)
+            map[m++] = (c == 3) ? 0x41 : 0x40;
+         oid += mv & 1;
+      }
+   }
+   return m;
+}
+
+struct nouveau_stateobj *
+nv50_gp_linkage_validate(struct nv50_context *nv50)
+{
+   struct nouveau_grobj *tesla = nv50->screen->tesla;
+   struct nouveau_stateobj *so;
+   struct nv50_program *vp = nv50->vertprog;
+   struct nv50_program *gp = nv50->geomprog;
+   uint32_t map[16];
+   int m = 0;
+
+   if (!gp)
+      return NULL;
+   memset(map, 0, sizeof(map));
+
+   m = nv50_vp_gp_mapping(map, m, vp, gp);
+
+   so = so_new(3, 24 - 3, 0);
+
+   so_method(so, tesla, NV50TCL_VP_GP_BUILTIN_ATTR_EN, 1);
+   so_data  (so, vp->vp.attrs[2] | gp->vp.attrs[2]);
+
+   assert(m <= 32);
+   so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 1);
+   so_data  (so, m);
+
+   m = (m + 3) / 4;
+   so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), m);
+   so_datap (so, map, m);
+
+   return so;
+}
diff --git a/src/gallium/drivers/nv50/nv50_state.c b/src/gallium/drivers/nv50/nv50_state.c
index 42c5a58318..0d744ab788 100644
--- a/src/gallium/drivers/nv50/nv50_state.c
+++ b/src/gallium/drivers/nv50/nv50_state.c
@@ -546,7 +546,6 @@ nv50_vp_state_create(struct pipe_context *pipe,
 
 	p->pipe.tokens = tgsi_dup_tokens(cso->tokens);
 	p->type = PIPE_SHADER_VERTEX;
-	tgsi_scan_shader(p->pipe.tokens, &p->info);
 	return (void *)p;
 }
 
@@ -578,7 +577,6 @@ nv50_fp_state_create(struct pipe_context *pipe,
 
 	p->pipe.tokens = tgsi_dup_tokens(cso->tokens);
 	p->type = PIPE_SHADER_FRAGMENT;
-	tgsi_scan_shader(p->pipe.tokens, &p->info);
 	return (void *)p;
 }
 
@@ -610,7 +608,6 @@ nv50_gp_state_create(struct pipe_context *pipe,
 
 	p->pipe.tokens = tgsi_dup_tokens(cso->tokens);
 	p->type = PIPE_SHADER_GEOMETRY;
-	tgsi_scan_shader(p->pipe.tokens, &p->info);
 	return (void *)p;
 }
 
diff --git a/src/gallium/drivers/nv50/nv50_state_validate.c b/src/gallium/drivers/nv50/nv50_state_validate.c
index 524696f35d..8d662d8f60 100644
--- a/src/gallium/drivers/nv50/nv50_state_validate.c
+++ b/src/gallium/drivers/nv50/nv50_state_validate.c
@@ -81,6 +81,9 @@ validate_fb(struct nv50_context *nv50)
 		case PIPE_FORMAT_R16G16B16A16_UNORM:
 			so_data(so, NV50TCL_RT_FORMAT_R16G16B16A16_UNORM);
 			break;
+		case PIPE_FORMAT_R16G16B16A16_FLOAT:
+			so_data(so, NV50TCL_RT_FORMAT_R16G16B16A16_FLOAT);
+			break;
 		case PIPE_FORMAT_R32G32B32A32_FLOAT:
 			so_data(so, NV50TCL_RT_FORMAT_R32G32B32A32_FLOAT);
 			break;
@@ -135,6 +138,12 @@ validate_fb(struct nv50_context *nv50)
 		case PIPE_FORMAT_Z32_FLOAT:
 			so_data(so, NV50TCL_ZETA_FORMAT_Z32_FLOAT);
 			break;
+		case PIPE_FORMAT_Z32_FLOAT_S8X24_USCALED:
+			so_data(so, NV50TCL_ZETA_FORMAT_Z32_FLOAT_X24S8_UNORM);
+			break;
+		case PIPE_FORMAT_Z16_UNORM:
+			so_data(so, NV50TCL_ZETA_FORMAT_Z16_UNORM);
+			break;
 		default:
 			NOUVEAU_ERR("AIIII unknown format %s\n",
 			            util_format_name(fb->zsbuf->format));
diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
new file mode 100644
index 0000000000..aa15917774
--- /dev/null
+++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
@@ -0,0 +1,1266 @@
+
+#include <unistd.h>
+
+#include "nv50_context.h"
+#include "nv50_pc.h"
+
+#include "pipe/p_shader_tokens.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_util.h"
+
+#include "util/u_simple_list.h"
+#include "tgsi/tgsi_dump.h"
+
+#define BLD_MAX_TEMPS 64
+#define BLD_MAX_ADDRS 4
+#define BLD_MAX_PREDS 4
+#define BLD_MAX_IMMDS 128
+
+#define BLD_MAX_COND_NESTING 4
+#define BLD_MAX_LOOP_NESTING 4
+#define BLD_MAX_CALL_NESTING 2
+
+/* collects all values assigned to the same TGSI register */
+struct bld_value_stack {
+   struct nv_value *top;   /* current value; moved to body by bld_push_value */
+   struct nv_value **body; /* earlier values, array grown 8 entries at a time */
+   unsigned size;          /* number of values stored in body */
+};
+
+/* Move the current top value of the stack into its body and clear top.
+ * The body array is enlarged by 8 entries each time it is full.
+ */
+static INLINE void
+bld_push_value(struct bld_value_stack *stk)
+{
+   const unsigned count = stk->size;
+
+   assert(count == 0 || stk->body[count - 1] != stk->top);
+
+   if ((count % 8) == 0) {
+      const unsigned bytes_old = count * sizeof(struct nv_value *);
+      const unsigned bytes_new = (count + 8) * sizeof(struct nv_value *);
+
+      stk->body = (struct nv_value **)REALLOC(stk->body, bytes_old, bytes_new);
+   }
+   stk->body[count] = stk->top;
+   stk->size = count + 1;
+   stk->top = NULL;
+}
+
+/* Push the top values of n consecutive 4-component registers.
+ * Components that have no current top value are left untouched.
+ */
+static INLINE void
+bld_push_values(struct bld_value_stack *stacks, int n)
+{
+   struct bld_value_stack *stk = stacks;
+   int remaining;
+
+   for (remaining = n * 4; remaining > 0; --remaining, ++stk) {
+      if (stk->top)
+         bld_push_value(stk);
+   }
+}
+
+/* Accessors for the current (top) value of a TGSI register component.
+ * All of them expect a 'struct bld_context *bld' in the calling scope.
+ */
+#define FETCH_TEMP(i, c)    (bld->tvs[i][c].top)
+#define STORE_TEMP(i, c, v) (bld->tvs[i][c].top = (v))
+#define FETCH_ADDR(i, c)    (bld->avs[i][c].top)
+#define STORE_ADDR(i, c, v) (bld->avs[i][c].top = (v))
+#define FETCH_PRED(i, c)    (bld->pvs[i][c].top)
+#define STORE_PRED(i, c, v) (bld->pvs[i][c].top = (v))
+#define FETCH_OUTR(i, c)    (bld->ovs[i][c].top)
+/* Besides storing the value, record the component in outputs_written:
+ * 4 bits per register, 8 registers per 32 bit word. The mask is built
+ * from an unsigned 1 so that selecting bit 31 does not overflow an int.
+ */
+#define STORE_OUTR(i, c, v)                                          \
+   do {                                                              \
+      bld->ovs[i][c].top = (v);                                      \
+      bld->outputs_written[(i) / 8] |= 1u << (((i) * 4 + (c)) % 32); \
+   } while (0)
+
+/* State kept while translating one TGSI program into the nv_pc IR. */
+struct bld_context {
+   struct nv50_translation_info *ti;
+
+   struct nv_pc *pc;
+   struct nv_basic_block *b;
+
+   struct tgsi_parse_context parse[BLD_MAX_CALL_NESTING];
+   int call_lvl;
+
+   /* basic blocks tracked per IF/ELSE/ENDIF nesting level (cond_lvl) */
+   struct nv_basic_block *cond_bb[BLD_MAX_COND_NESTING];
+   struct nv_basic_block *join_bb[BLD_MAX_COND_NESTING];
+   struct nv_basic_block *else_bb[BLD_MAX_COND_NESTING];
+   int cond_lvl;
+   struct nv_basic_block *loop_bb[BLD_MAX_LOOP_NESTING];
+   int loop_lvl;
+
+   /* one value stack per register component */
+   struct bld_value_stack tvs[BLD_MAX_TEMPS][4]; /* TGSI_FILE_TEMPORARY */
+   struct bld_value_stack avs[BLD_MAX_ADDRS][4]; /* TGSI_FILE_ADDRESS */
+   struct bld_value_stack pvs[BLD_MAX_PREDS][4]; /* TGSI_FILE_PREDICATE */
+   struct bld_value_stack ovs[PIPE_MAX_SHADER_OUTPUTS][4];
+
+   /* One bit per output component: component c of register i is bit
+    * (i * 4 + c) % 32 of word i / 8, so one word is needed for every
+    * 8 registers (not for every 32).
+    */
+   uint32_t outputs_written[(PIPE_MAX_SHADER_OUTPUTS + 7) / 8];
+
+   struct nv_value *frgcrd[4];
+   struct nv_value *sysval[4];
+
+   /* wipe on new BB */
+   struct nv_value *saved_addr[4][2];
+   struct nv_value *saved_inputs[128];
+   struct nv_value *saved_immd[BLD_MAX_IMMDS];
+   uint num_immds;
+};
+
+/* Make value the c-th definition of instruction i and return the value. */
+static INLINE struct nv_value *
+bld_def(struct nv_instruction *i, int c, struct nv_value *value)
+{
+   value->insn = i;
+   i->def[c] = value;
+
+   return value;
+}
+
+/* Return the most recent value on the stack whose defining instruction
+ * belongs to basic block b, or NULL if there is none.
+ */
+static INLINE struct nv_value *
+find_by_bb(struct bld_value_stack *stack, struct nv_basic_block *b)
+{
+   struct nv_value *cand = stack->top;
+   int pos = stack->size;
+
+   if (cand && cand->insn->bb == b)
+      return cand;
+
+   while (--pos >= 0) {
+      cand = stack->body[pos];
+      if (cand->insn->bb == b)
+         return cand;
+   }
+   return NULL;
+}
+
+/* Collect into vals[0 .. *n - 1] the value that was defined in basic
+ * block b; if b defines none, recurse into each of its predecessors.
+ * A value is never added twice.
+ */
+static void
+fetch_by_bb(struct bld_value_stack *stack,
+            struct nv_value **vals, int *n,
+            struct nv_basic_block *b)
+{
+   struct nv_value *found;
+   int k;
+
+   assert(*n < 16); /* MAX_COND_NESTING */
+
+   found = find_by_bb(stack, b);
+   if (!found) {
+      for (k = 0; k < b->num_in; ++k)
+         fetch_by_bb(stack, vals, n, b->in[k]);
+      return;
+   }
+
+   for (k = 0; k < *n; ++k) {
+      if (vals[k] == found)
+         return;
+   }
+   vals[*n] = found;
+   *n += 1;
+}
+
+/* Return the value a register stack holds when seen from the current
+ * basic block. If several definitions reach the block, PHI instructions
+ * merging them are created and the value of the last PHI is returned.
+ */
+static struct nv_value *
+bld_fetch_global(struct bld_context *bld, struct bld_value_stack *stack)
+{
+   struct nv_value *vals[16], *phi = NULL;
+   int j, i = 0, n = 0;
+
+   /* gather the candidate definitions, starting at the current block */
+   fetch_by_bb(stack, vals, &n, bld->pc->current_block);
+
+   assert(n);
+   if (n == 1)
+      return vals[0];
+
+   debug_printf("phi required: %i candidates\n", n);
+
+   /* Each PHI gets up to 4 sources. When more candidates remain, the next
+    * PHI takes the previous PHI's value as its source 0 and continues
+    * with the remaining candidates from source 1 on.
+    */
+   while (i < n) {
+      struct nv_instruction *insn = new_instruction(bld->pc, NV_OP_PHI);
+
+      j = phi ? 1 : 0;
+      if (phi)
+         insn->src[0] = new_ref(bld->pc, phi);
+
+      /* the result uses the file and type of the first candidate */
+      phi = new_value(bld->pc, vals[0]->reg.file, vals[0]->reg.type);
+
+      bld_def(insn, 0, phi);
+
+      for (; j < 4; ++j) {
+         insn->src[j] = new_ref(bld->pc, vals[i++]);
+         if (i == n)
+            break;
+      }
+      debug_printf("new phi: %i, %i in\n", phi->n, j);
+   }
+
+   /* insert_at_head(list, phi) is done at end of block */
+   return phi;
+}
+
+/* Return the immediate value holding the 32 bit pattern u. Immediates
+ * are cached in bld->saved_immd, a new one is only created on a miss.
+ */
+static INLINE struct nv_value *
+bld_imm_u32(struct bld_context *bld, uint32_t u)
+{
+   const unsigned count = bld->num_immds;
+   struct nv_value *imm;
+   unsigned k;
+
+   debug_printf("bld_imm_u32: 0x%08x\n", u);
+
+   for (k = 0; k < count; ++k) {
+      imm = bld->saved_immd[k];
+      if (imm->reg.imm.u32 == u)
+         return imm;
+   }
+   assert(count < BLD_MAX_IMMDS);
+
+   debug_printf("need new one\n");
+
+   imm = new_value(bld->pc, NV_FILE_IMM, NV_TYPE_U32);
+   imm->reg.imm.u32 = u;
+
+   bld->saved_immd[count] = imm;
+   bld->num_immds = count + 1;
+   return imm;
+}
+
+/* Return the immediate value holding the bit pattern of the float f. */
+static INLINE struct nv_value *
+bld_imm_f32(struct bld_context *bld, float f)
+{
+   const uint32_t bits = fui(f);
+
+   return bld_imm_u32(bld, bits);
+}
+
+/* Set the register type of value v to NV_TYPE_<t>. */
+#define SET_TYPE(v, t) ((v)->reg.type = NV_TYPE_##t)
+
+/* Emit an instruction with one source. The result is a new GPR value
+ * that inherits the register type of src0.
+ */
+static struct nv_value *
+bld_insn_1(struct bld_context *bld, uint opcode, struct nv_value *src0)
+{
+   struct nv_instruction *insn;
+   struct nv_value *result;
+
+   insn = new_instruction(bld->pc, opcode);
+   assert(insn);
+
+   /* NOTE: new_ref would suffice */
+   nv_reference(bld->pc, &insn->src[0], src0);
+
+   result = new_value(bld->pc, NV_FILE_GPR, src0->reg.type);
+   return bld_def(insn, 0, result);
+}
+
+/* Emit an instruction with two sources. The result is a new GPR value
+ * that inherits the register type of src0.
+ */
+static struct nv_value *
+bld_insn_2(struct bld_context *bld, uint opcode,
+           struct nv_value *src0, struct nv_value *src1)
+{
+   struct nv_instruction *insn;
+   struct nv_value *result;
+
+   insn = new_instruction(bld->pc, opcode);
+
+   nv_reference(bld->pc, &insn->src[0], src0);
+   nv_reference(bld->pc, &insn->src[1], src1);
+
+   result = new_value(bld->pc, NV_FILE_GPR, src0->reg.type);
+   return bld_def(insn, 0, result);
+}
+
+/* Emit an instruction with three sources. The result is a new GPR value
+ * that inherits the register type of src0.
+ */
+static struct nv_value *
+bld_insn_3(struct bld_context *bld, uint opcode,
+           struct nv_value *src0, struct nv_value *src1,
+           struct nv_value *src2)
+{
+   struct nv_instruction *insn;
+   struct nv_value *result;
+
+   insn = new_instruction(bld->pc, opcode);
+
+   nv_reference(bld->pc, &insn->src[0], src0);
+   nv_reference(bld->pc, &insn->src[1], src1);
+   nv_reference(bld->pc, &insn->src[2], src2);
+
+   result = new_value(bld->pc, NV_FILE_GPR, src0->reg.type);
+   return bld_def(insn, 0, result);
+}
+
+/* Emit the one-source instruction NV_OP_<op> on s0 and assign its result
+ * to d. The result gets type NV_TYPE_<dt> and the source reference is
+ * marked with typecast NV_TYPE_<s0t>.
+ * Expects a 'struct bld_context *bld' in the calling scope; d is
+ * evaluated more than once.
+ */
+#define BLD_INSN_1_EX(d, op, dt, s0, s0t)           \
+   do {                                             \
+      (d) = bld_insn_1(bld, (NV_OP_##op), (s0));    \
+      (d)->reg.type = NV_TYPE_##dt;                 \
+      (d)->insn->src[0]->typecast = NV_TYPE_##s0t;  \
+   } while(0)
+
+/* Emit instructions computing x to the power of e as 2^(e * log2(x)).
+ *
+ * Returns the value defined by the final EX2 instruction.
+ */
+static struct nv_value *
+bld_pow(struct bld_context *bld, struct nv_value *x, struct nv_value *e)
+{
+   struct nv_value *val;
+
+   BLD_INSN_1_EX(val, LG2, F32, x, F32);
+
+   /* The product needs both operands: the exponent and the logarithm
+    * computed above (a one-source MUL would drop the LG2 result).
+    */
+   val = bld_insn_2(bld, NV_OP_MUL, e, val);
+   val->reg.type = NV_TYPE_F32;
+
+   /* EX2 is always fed through PREEX2 in this file */
+   val = bld_insn_1(bld, NV_OP_PREEX2, val);
+   val = bld_insn_1(bld, NV_OP_EX2, val);
+
+   return val;
+}
+
+/* Emit a MOV that loads the float immediate f into a new value. */
+static INLINE struct nv_value *
+bld_load_imm_f32(struct bld_context *bld, float f)
+{
+   struct nv_value *imm = bld_imm_f32(bld, f);
+
+   return bld_insn_1(bld, NV_OP_MOV, imm);
+}
+
+/* Emit a MOV that loads the 32 bit immediate u into a new value. */
+static INLINE struct nv_value *
+bld_load_imm_u32(struct bld_context *bld, uint32_t u)
+{
+   struct nv_value *imm = bld_imm_u32(bld, u);
+
+   return bld_insn_1(bld, NV_OP_MOV, imm);
+}
+
+/* Return an address register value loaded with the immediate id.
+ * Up to 4 such values are cached in bld->saved_addr together with the
+ * indirect value they were requested for; a cached entry is reused when
+ * both id and indirect match. When all slots are taken, slot 0 is
+ * overwritten.
+ */
+static struct nv_value *
+bld_get_address(struct bld_context *bld, int id, struct nv_value *indirect)
+{
+   struct nv_value *addr;
+   int slot;
+
+   for (slot = 0; slot < 4; ++slot) {
+      addr = bld->saved_addr[slot][0];
+      if (!addr)
+         break; /* first free slot */
+      if (bld->saved_addr[slot][1] != indirect)
+         continue;
+      /* source 0 of the defining MOV is the immediate that was loaded */
+      if (addr->insn->src[0]->value->reg.imm.u32 == id)
+         return addr;
+   }
+   if (slot == 4)
+      slot = 0;
+
+   addr = bld_load_imm_u32(bld, id);
+   bld->saved_addr[slot][0] = addr;
+   addr->reg.file = NV_FILE_ADDR;
+   bld->saved_addr[slot][1] = indirect;
+   return addr;
+}
+
+
+/* Return a flags value that reflects src.
+ * The flags are attached to the instruction defining src, unless that
+ * instruction is an LDA, a PHI or lies in another basic block; in these
+ * cases a new CVT instruction reading src is emitted to define them.
+ */
+static struct nv_value *
+bld_predicate(struct bld_context *bld, struct nv_value *src)
+{
+   struct nv_instruction *def = src->insn;
+   struct nv_value *flags;
+   const int need_cvt = def->opcode == NV_OP_LDA ||
+                        def->opcode == NV_OP_PHI ||
+                        def->bb != bld->pc->current_block;
+
+   if (need_cvt) {
+      def = new_instruction(bld->pc, NV_OP_CVT);
+      nv_reference(bld->pc, &def->src[0], src);
+   }
+
+   if (def->flags_def)
+      return def->flags_def;
+
+   flags = new_value(bld->pc, NV_FILE_FLAGS, NV_TYPE_U16);
+   def->flags_def = flags;
+   flags->insn = def;
+   return flags;
+}
+
+/* Emit a KIL instruction predicated on the flags of src, using
+ * condition code NV_CC_LT.
+ */
+static void
+bld_kil(struct bld_context *bld, struct nv_value *src)
+{
+   struct nv_value *pred = bld_predicate(bld, src);
+   struct nv_instruction *kil = new_instruction(bld->pc, NV_OP_KIL);
+
+   kil->cc = NV_CC_LT;
+   kil->fixed = 1;
+   kil->flags_src = new_ref(bld->pc, pred);
+}
+
+/* Emit the terminating control flow instruction 'opcode' that is taken
+ * when the flags in src satisfy condition code cc. With plan_reconverge
+ * set, a fixed JOINAT instruction is emitted in front of it.
+ */
+static void
+bld_flow(struct bld_context *bld, uint opcode, ubyte cc,
+         struct nv_value *src, boolean plan_reconverge)
+{
+   struct nv_instruction *flow;
+
+   if (plan_reconverge) {
+      struct nv_instruction *joinat = new_instruction(bld->pc, NV_OP_JOINAT);
+
+      joinat->fixed = 1;
+   }
+
+   flow = new_instruction(bld->pc, opcode);
+   flow->flags_src = new_ref(bld->pc, src);
+   flow->cc = cc;
+   flow->is_terminator = 1;
+}
+
+/* Map a TGSI comparison opcode to the NV_CC_* condition code used for
+ * the SET instruction. Unknown opcodes assert and yield NV_CC_FL.
+ */
+static ubyte
+translate_setcc(unsigned opcode)
+{
+   switch (opcode) {
+   case TGSI_OPCODE_SLT:
+   case TGSI_OPCODE_ISLT:
+   case TGSI_OPCODE_USLT:
+      return NV_CC_LT;
+   case TGSI_OPCODE_SGE:
+   case TGSI_OPCODE_ISGE:
+   case TGSI_OPCODE_USGE:
+      return NV_CC_GE;
+   case TGSI_OPCODE_SEQ:
+   case TGSI_OPCODE_USEQ:
+      return NV_CC_EQ;
+   case TGSI_OPCODE_SGT:
+      return NV_CC_GT;
+   case TGSI_OPCODE_SLE:
+      return NV_CC_LE;
+   case TGSI_OPCODE_SNE:
+      /* the float variant additionally sets the NV_CC_U bit */
+      return NV_CC_NE | NV_CC_U;
+   case TGSI_OPCODE_USNE:
+      return NV_CC_NE;
+   case TGSI_OPCODE_STR:
+      return NV_CC_TR;
+   case TGSI_OPCODE_SFL:
+      return NV_CC_FL;
+   default:
+      assert(0);
+      return NV_CC_FL;
+   }
+}
+
+/* Map a TGSI opcode to the NV_OP_* opcode with a direct counterpart.
+ * Opcodes without an entry here yield NV_OP_NOP.
+ */
+static uint
+translate_opcode(uint opcode)
+{
+   switch (opcode) {
+   /* arithmetic */
+   case TGSI_OPCODE_ADD:
+   case TGSI_OPCODE_SUB:
+   case TGSI_OPCODE_UADD:  return NV_OP_ADD;
+   case TGSI_OPCODE_MUL:
+   case TGSI_OPCODE_UMUL:  return NV_OP_MUL;
+   case TGSI_OPCODE_MAD:
+   case TGSI_OPCODE_UMAD:  return NV_OP_MAD;
+   case TGSI_OPCODE_SAD:   return NV_OP_SAD;
+   case TGSI_OPCODE_MIN:
+   case TGSI_OPCODE_IMIN:
+   case TGSI_OPCODE_UMIN:  return NV_OP_MIN;
+   case TGSI_OPCODE_MAX:
+   case TGSI_OPCODE_IMAX:
+   case TGSI_OPCODE_UMAX:  return NV_OP_MAX;
+   case TGSI_OPCODE_ABS:   return NV_OP_ABS;
+   case TGSI_OPCODE_INEG:  return NV_OP_NEG;
+
+   /* special functions */
+   case TGSI_OPCODE_EX2:   return NV_OP_EX2;
+   case TGSI_OPCODE_LG2:   return NV_OP_LG2;
+   case TGSI_OPCODE_RCP:   return NV_OP_RCP;
+   case TGSI_OPCODE_RSQ:   return NV_OP_RSQ;
+
+   /* rounding, conversion and derivatives */
+   case TGSI_OPCODE_CEIL:  return NV_OP_CEIL;
+   case TGSI_OPCODE_FLR:   return NV_OP_FLOOR;
+   case TGSI_OPCODE_TRUNC: return NV_OP_TRUNC;
+   case TGSI_OPCODE_F2I:
+   case TGSI_OPCODE_F2U:
+   case TGSI_OPCODE_I2F:
+   case TGSI_OPCODE_U2F:   return NV_OP_CVT;
+   case TGSI_OPCODE_DDX:   return NV_OP_DFDX;
+   case TGSI_OPCODE_DDY:   return NV_OP_DFDY;
+
+   /* bitwise and shifts */
+   case TGSI_OPCODE_AND:   return NV_OP_AND;
+   case TGSI_OPCODE_OR:    return NV_OP_OR;
+   case TGSI_OPCODE_XOR:   return NV_OP_XOR;
+   case TGSI_OPCODE_SHL:   return NV_OP_SHL;
+   case TGSI_OPCODE_ISHR:
+   case TGSI_OPCODE_USHR:  return NV_OP_SHR;
+
+   /* comparisons */
+   case TGSI_OPCODE_SLT:
+   case TGSI_OPCODE_SGE:
+   case TGSI_OPCODE_SEQ:
+   case TGSI_OPCODE_SGT:
+   case TGSI_OPCODE_SLE:
+   case TGSI_OPCODE_SNE:
+   case TGSI_OPCODE_ISLT:
+   case TGSI_OPCODE_ISGE:
+   case TGSI_OPCODE_USEQ:
+   case TGSI_OPCODE_USGE:
+   case TGSI_OPCODE_USLT:
+   case TGSI_OPCODE_USNE:  return NV_OP_SET;
+
+   /* texturing */
+   case TGSI_OPCODE_TEX:
+   case TGSI_OPCODE_TXP:   return NV_OP_TEX;
+   case TGSI_OPCODE_TXB:   return NV_OP_TXB;
+   case TGSI_OPCODE_TXL:   return NV_OP_TXL;
+
+   default:
+      return NV_OP_NOP;
+   }
+}
+
+/* Return the NV_TYPE_* the sources of a TGSI opcode are read as.
+ * Everything not listed is treated as 32 bit float.
+ */
+static ubyte
+infer_src_type(unsigned opcode)
+{
+   switch (opcode) {
+   /* signed integer sources */
+   case TGSI_OPCODE_I2F:
+   case TGSI_OPCODE_IDIV:
+   case TGSI_OPCODE_IMAX:
+   case TGSI_OPCODE_IMIN:
+   case TGSI_OPCODE_INEG:
+   case TGSI_OPCODE_ISGE:
+   case TGSI_OPCODE_ISHR:
+   case TGSI_OPCODE_ISLT:
+      return NV_TYPE_S32;
+
+   /* unsigned integer or untyped 32 bit sources */
+   case TGSI_OPCODE_MOV:
+   case TGSI_OPCODE_AND:
+   case TGSI_OPCODE_OR:
+   case TGSI_OPCODE_XOR:
+   case TGSI_OPCODE_SAD:
+   case TGSI_OPCODE_U2F:
+   case TGSI_OPCODE_UADD:
+   case TGSI_OPCODE_UDIV:
+   case TGSI_OPCODE_UMOD:
+   case TGSI_OPCODE_UMAD:
+   case TGSI_OPCODE_UMUL:
+   case TGSI_OPCODE_UMAX:
+   case TGSI_OPCODE_UMIN:
+   case TGSI_OPCODE_USEQ:
+   case TGSI_OPCODE_USGE:
+   case TGSI_OPCODE_USLT:
+   case TGSI_OPCODE_USNE:
+   case TGSI_OPCODE_USHR:
+      return NV_TYPE_U32;
+
+   default:
+      break;
+   }
+   return NV_TYPE_F32;
+}
+
+/* Return the NV_TYPE_* of the result of a TGSI opcode.
+ * Everything not listed is treated as 32 bit float.
+ */
+static ubyte
+infer_dst_type(unsigned opcode)
+{
+   switch (opcode) {
+   /* signed integer results */
+   case TGSI_OPCODE_F2I:
+   case TGSI_OPCODE_IDIV:
+   case TGSI_OPCODE_IMAX:
+   case TGSI_OPCODE_IMIN:
+   case TGSI_OPCODE_INEG:
+   case TGSI_OPCODE_ISGE:
+   case TGSI_OPCODE_ISHR:
+   case TGSI_OPCODE_ISLT:
+      return NV_TYPE_S32;
+
+   /* unsigned integer or untyped 32 bit results */
+   case TGSI_OPCODE_MOV:
+   case TGSI_OPCODE_F2U:
+   case TGSI_OPCODE_AND:
+   case TGSI_OPCODE_OR:
+   case TGSI_OPCODE_XOR:
+   case TGSI_OPCODE_SAD:
+   case TGSI_OPCODE_UADD:
+   case TGSI_OPCODE_UDIV:
+   case TGSI_OPCODE_UMOD:
+   case TGSI_OPCODE_UMAD:
+   case TGSI_OPCODE_UMUL:
+   case TGSI_OPCODE_UMAX:
+   case TGSI_OPCODE_UMIN:
+   case TGSI_OPCODE_USEQ:
+   case TGSI_OPCODE_USGE:
+   case TGSI_OPCODE_USLT:
+   case TGSI_OPCODE_USNE:
+   case TGSI_OPCODE_USHR:
+      return NV_TYPE_U32;
+
+   default:
+      break;
+   }
+   return NV_TYPE_F32;
+}
+
+/* Store value as the new content of component chan of the first
+ * destination register of inst, applying the instruction's saturate
+ * mode first.
+ *
+ * Only OUTPUT, TEMPORARY and ADDRESS destinations are handled; the
+ * switch has no default, so any other register file is ignored.
+ */
+static void
+emit_store(struct bld_context *bld, const struct tgsi_full_instruction *inst,
+	   unsigned chan, struct nv_value *value)
+{
+   const struct tgsi_full_dst_register *reg = &inst->Dst[0];
+
+   assert(chan < 4);
+
+   /* a MOV keeps the type its source value already has */
+   if (inst->Instruction.Opcode != TGSI_OPCODE_MOV)
+      value->reg.type = infer_dst_type(inst->Instruction.Opcode);
+
+   switch (inst->Instruction.Saturate) {
+   case TGSI_SAT_NONE:
+      break;
+   case TGSI_SAT_ZERO_ONE:
+      BLD_INSN_1_EX(value, SAT, F32, value, F32);
+      break;
+   case TGSI_SAT_MINUS_PLUS_ONE:
+      /* clamp to [-1, 1] with a MAX followed by a MIN */
+      value = bld_insn_2(bld, NV_OP_MAX, value, bld_load_imm_f32(bld, -1.0f));
+      value = bld_insn_2(bld, NV_OP_MIN, value, bld_load_imm_f32(bld, 1.0f));
+      value->reg.type = NV_TYPE_F32;
+      break;
+   }
+
+   switch (reg->Register.File) {
+   case TGSI_FILE_OUTPUT:
+      /* outputs always get their own MOV into the output file */
+      value = bld_insn_1(bld, NV_OP_MOV, value);
+      value->reg.file = bld->ti->output_file;
+
+      if (bld->ti->p->type == PIPE_SHADER_FRAGMENT) {
+         /* only recorded here; see bld_export_outputs */
+         STORE_OUTR(reg->Register.Index, chan, value);
+      } else {
+         /* other shader types write the mapped output register directly */
+         value->insn->fixed = 1;
+         value->reg.id = bld->ti->output_map[reg->Register.Index][chan];
+      }
+      break;
+   case TGSI_FILE_TEMPORARY:
+      assert(reg->Register.Index < BLD_MAX_TEMPS);
+      value->reg.file = NV_FILE_GPR;
+      /* values defined in another block are copied into the current one */
+      if (value->insn->bb != bld->pc->current_block)
+         value = bld_insn_1(bld, NV_OP_MOV, value);
+      STORE_TEMP(reg->Register.Index, chan, value);
+      break;
+   case TGSI_FILE_ADDRESS:
+      assert(reg->Register.Index < BLD_MAX_ADDRS);
+      value->reg.file = NV_FILE_ADDR;
+      STORE_ADDR(reg->Register.Index, chan, value);
+      break;
+   }
+}
+
+/* Test the outputs_written bitmask.
+ *
+ * i: index of the output register
+ * c: component 0..3, or a negative value to test all four components
+ *
+ * Returns a non-zero value if the component (or, for c < 0, any component
+ * of the register) has been recorded as written.
+ */
+static INLINE uint32_t
+bld_is_output_written(struct bld_context *bld, int i, int c)
+{
+   const uint32_t word = bld->outputs_written[i / 8];
+   const unsigned shift = (i * 4) % 32; /* bit of component 0 */
+
+   /* unsigned masks: 0xf << 28 and 1 << 31 would overflow a signed int */
+   if (c < 0)
+      return word & (0xfu << shift);
+   return word & (1u << (shift + c));
+}
+
+/* For each output register with at least one written component, copy
+ * the written components with MOVs that carry the mapped output register
+ * ids, and emit one fixed EXPORT instruction that references them.
+ */
+static void
+bld_export_outputs(struct bld_context *bld)
+{
+   struct nv_value *comps[4];
+   struct nv_instruction *exp;
+   int reg, chan, count, k;
+
+   bld_push_values(&bld->ovs[0][0], PIPE_MAX_SHADER_OUTPUTS);
+
+   for (reg = 0; reg < PIPE_MAX_SHADER_OUTPUTS; ++reg) {
+      if (bld_is_output_written(bld, reg, -1)) {
+         count = 0;
+
+         for (chan = 0; chan < 4; ++chan) {
+            struct nv_value *out;
+
+            if (!bld_is_output_written(bld, reg, chan))
+               continue;
+            out = bld_fetch_global(bld, &bld->ovs[reg][chan]);
+            out = bld_insn_1(bld, NV_OP_MOV, out);
+            out->reg.id = bld->ti->output_map[reg][chan];
+            comps[count++] = out;
+         }
+         assert(count);
+
+         exp = new_instruction(bld->pc, NV_OP_EXPORT);
+         exp->fixed = 1;
+
+         for (k = 0; k < count; ++k)
+            exp->src[k] = new_ref(bld->pc, comps[k]);
+      }
+   }
+}
+
+/* Make b the current basic block.
+ *
+ * The current top values of all register stacks are pushed first, so that
+ * they remain reachable as definitions of the block being left, and the
+ * per-block caches of loaded values are reset.
+ */
+static void
+bld_new_block(struct bld_context *bld, struct nv_basic_block *b)
+{
+   const unsigned num_inputs =
+      sizeof(bld->saved_inputs) / sizeof(bld->saved_inputs[0]);
+   unsigned i;
+
+   bld_push_values(&bld->tvs[0][0], BLD_MAX_TEMPS);
+   bld_push_values(&bld->avs[0][0], BLD_MAX_ADDRS);
+   bld_push_values(&bld->pvs[0][0], BLD_MAX_PREDS);
+   bld_push_values(&bld->ovs[0][0], PIPE_MAX_SHADER_OUTPUTS);
+
+   bld->pc->current_block = b;
+
+   for (i = 0; i < 4; ++i)
+      bld->saved_addr[i][0] = NULL;
+
+   /* Cached input loads are defined by instructions of the block that is
+    * being left; reusing them in blocks that block does not dominate would
+    * reference values that were never computed on that path.
+    */
+   for (i = 0; i < num_inputs; ++i)
+      bld->saved_inputs[i] = NULL;
+}
+
+/* Return the cached value of component c of input register i, or NULL
+ * if none is cached. Only fragment programs use the cache.
+ */
+static struct nv_value *
+bld_saved_input(struct bld_context *bld, unsigned i, unsigned c)
+{
+   const unsigned slot = bld->ti->input_map[i][c];
+
+   if (bld->ti->p->type == PIPE_SHADER_FRAGMENT)
+      return bld->saved_inputs[slot]; /* NULL while nothing is cached */
+
+   return NULL;
+}
+
+/* Emit the interpolation of the fragment program input val.
+ * Modes with NV50_INTERP_LINEAR or NV50_INTERP_FLAT set use LINTERP,
+ * all others PINTERP with bld->frgcrd[3] as second source. The flat and
+ * centroid flags of the instruction are taken from the mode as well.
+ */
+static struct nv_value *
+bld_interpolate(struct bld_context *bld, unsigned mode, struct nv_value *val)
+{
+   const unsigned non_persp = mode & (NV50_INTERP_LINEAR | NV50_INTERP_FLAT);
+   struct nv_value *res;
+
+   if (!non_persp)
+      res = bld_insn_2(bld, NV_OP_PINTERP, val, bld->frgcrd[3]);
+   else
+      res = bld_insn_1(bld, NV_OP_LINTERP, val);
+
+   res->insn->flat = (mode & NV50_INTERP_FLAT) != 0;
+   res->insn->centroid = (mode & NV50_INTERP_CENTROID) != 0;
+
+   return res;
+}
+
+/* Return a value holding component chan (after swizzling) of source
+ * operand s of the TGSI instruction insn, with the operand's sign mode
+ * (abs / negate) applied.
+ *
+ * Aborts on register files or sign modes that are not handled.
+ */
+static struct nv_value *
+emit_fetch(struct bld_context *bld, const struct tgsi_full_instruction *insn,
+           const unsigned s, const unsigned chan)
+{
+   const struct tgsi_full_src_register *src = &insn->Src[s];
+   struct nv_value *res;
+   unsigned idx, swz, dim_idx, ind_idx, ind_swz;
+   ubyte type = infer_src_type(insn->Instruction.Opcode);
+
+   idx = src->Register.Index;
+   swz = tgsi_util_get_full_src_register_swizzle(src, chan);
+   dim_idx = -1;
+   ind_idx = -1;
+   ind_swz = 0;
+
+   if (src->Register.Indirect) {
+      ind_idx = src->Indirect.Index;
+      ind_swz = tgsi_util_get_src_register_swizzle(&src->Indirect, 0);
+   }
+
+   switch (src->Register.File) {
+   case TGSI_FILE_CONSTANT:
+      /* dimension 0 selects constant file 1, dimension n > 0 file n + 2;
+       * only file 1 is accepted so far
+       */
+      dim_idx = src->Dimension.Index ? src->Dimension.Index + 2 : 1;
+      assert(dim_idx < 14);
+      assert(dim_idx == 1); /* for now */
+
+      res = new_value(bld->pc, NV_FILE_MEM_C(dim_idx), type);
+      res->reg.type = type;
+      /* the register id only keeps the low 7 bits of the component index */
+      res->reg.id = (idx * 4 + swz) & 127;
+      res = bld_insn_1(bld, NV_OP_LDA, res);
+
+      if (src->Register.Indirect)
+         res->insn->src[4] = new_ref(bld->pc, FETCH_ADDR(ind_idx, ind_swz));
+      /* Registers from index 32 on get an address value loaded with
+       * (idx * 16) rounded down to a multiple of 512 as source 4.
+       * NOTE(review): this replaces the indirect address reference set
+       * just above when both conditions apply.
+       */
+      if (idx >= (128 / 4))
+         res->insn->src[4] =
+            new_ref(bld->pc, bld_get_address(bld, (idx * 16) & ~0x1ff, NULL));
+      break;
+   case TGSI_FILE_IMMEDIATE:
+      assert(idx < bld->ti->immd32_nr);
+      res = bld_load_imm_u32(bld, bld->ti->immd32[idx * 4 + swz]);
+      res->reg.type = type;
+      break;
+   case TGSI_FILE_INPUT:
+      /* reuse an already loaded input, except for TXP which always gets
+       * fresh instructions
+       */
+      res = bld_saved_input(bld, idx, swz);
+      if (res && (insn->Instruction.Opcode != TGSI_OPCODE_TXP))
+         return res;
+
+      res = new_value(bld->pc, bld->ti->input_file, type);
+      res->reg.id = bld->ti->input_map[idx][swz];
+
+      /* inputs in NV_FILE_MEM_V are interpolated, others loaded with LDA */
+      if (res->reg.file == NV_FILE_MEM_V) {
+         res = bld_interpolate(bld, bld->ti->interp_mode[idx], res);
+      } else {
+         assert(src->Dimension.Dimension == 0);
+         res = bld_insn_1(bld, NV_OP_LDA, res);
+      }
+      assert(res->reg.type == type);
+
+      bld->saved_inputs[bld->ti->input_map[idx][swz]] = res;
+      break;
+   case TGSI_FILE_TEMPORARY:
+      /* this should be load from l[], with reload elimination later on */
+      res = bld_fetch_global(bld, &bld->tvs[idx][swz]);
+      break;
+   case TGSI_FILE_ADDRESS:
+      res = bld_fetch_global(bld, &bld->avs[idx][swz]);
+      break;
+   case TGSI_FILE_PREDICATE:
+      res = bld_fetch_global(bld, &bld->pvs[idx][swz]);
+      break;
+   default:
+      NOUVEAU_ERR("illegal/unhandled src reg file: %d\n", src->Register.File);
+      abort();
+      break;	   
+   }
+
+   /* apply the source modifiers with explicit ABS / NEG instructions */
+   switch (tgsi_util_get_full_src_register_sign_mode(src, chan)) {
+   case TGSI_UTIL_SIGN_KEEP:
+      break;
+   case TGSI_UTIL_SIGN_CLEAR:
+      res = bld_insn_1(bld, NV_OP_ABS, res);
+      break;
+   case TGSI_UTIL_SIGN_TOGGLE:
+      res = bld_insn_1(bld, NV_OP_NEG, res);
+      break;
+   case TGSI_UTIL_SIGN_SET:
+      res = bld_insn_1(bld, NV_OP_ABS, res);
+      res = bld_insn_1(bld, NV_OP_NEG, res);
+      break;
+   default:
+      NOUVEAU_ERR("illegal/unhandled src reg sign mode\n");
+      abort();
+      break;
+   }
+
+   return res;
+}
+
+/* Build the TGSI LIT instruction. Results are written to dst0[] for the
+ * components enabled in the write mask:
+ *   x and w: the constant 1.0
+ *   y:       max(src.x, 0)
+ *   z:       a SELECT between pow(max(src.y, 0), clamped src.w) and a MOV
+ *            of 0 that is predicated (NV_CC_LE) on the flags of the y result
+ */
+static void
+bld_lit(struct bld_context *bld, struct nv_value *dst0[4],
+        const struct tgsi_full_instruction *insn)
+{
+   struct nv_value *val0, *zero;
+   unsigned mask = insn->Dst[0].Register.WriteMask;
+
+   /* both are set if either x or w is enabled */
+   if (mask & ((1 << 0) | (1 << 3)))
+      dst0[3] = dst0[0] = bld_load_imm_f32(bld, 1.0f);
+
+   /* max(src.x, 0) is needed for y itself and as the condition for z */
+   if (mask & (3 << 1)) {
+      zero = bld_load_imm_f32(bld, 0.0f);
+      val0 = bld_insn_2(bld, NV_OP_MAX, emit_fetch(bld, insn, 0, 0), zero);
+
+      if (mask & (1 << 1))
+         dst0[1] = val0;
+   }
+
+   if (mask & (1 << 2)) {
+      struct nv_value *val1, *val3, *src1, *src3;
+      struct nv_value *pos128 = bld_load_imm_f32(bld, 127.999999f);
+      struct nv_value *neg128 = bld_load_imm_f32(bld, -127.999999f);
+
+      src1 = emit_fetch(bld, insn, 0, 1);
+      src3 = emit_fetch(bld, insn, 0, 3);
+
+      /* let the MAX that produced val0 also define flags */
+      val0->insn->flags_def = new_value(bld->pc, NV_FILE_FLAGS, NV_TYPE_U16);
+      val0->insn->flags_def->insn = val0->insn;
+
+      /* the exponent src.w is clamped to [-127.999999, 127.999999] */
+      val1 = bld_insn_2(bld, NV_OP_MAX, src1, zero);
+      val3 = bld_insn_2(bld, NV_OP_MAX, src3, neg128);
+      val3 = bld_insn_2(bld, NV_OP_MIN, val3, pos128);
+      val3 = bld_pow(bld, val1, val3);
+
+      /* MOV of zero, conditional on the flags set above */
+      dst0[2] = bld_insn_1(bld, NV_OP_MOV, zero);
+      dst0[2]->insn->cc = NV_CC_LE;
+      dst0[2]->insn->flags_src = new_ref(bld->pc, val0->insn->flags_def);
+
+      dst0[2] = bld_insn_2(bld, NV_OP_SELECT, val3, dst0[2]);
+   }
+}
+
+/* Determine the coordinate layout of a texture instruction's target.
+ *
+ * dim: receives the number of texture coordinates
+ * arg: receives the number of arguments, which is dim plus one for
+ *      shadow targets
+ *
+ * An unknown target asserts; with asserts compiled out both outputs fall
+ * back to 2 (as for a 2D texture) instead of being left uninitialized.
+ */
+static INLINE void
+get_tex_dim(const struct tgsi_full_instruction *insn, int *dim, int *arg)
+{
+   switch (insn->Texture.Texture) {
+   case TGSI_TEXTURE_1D:
+      *arg = *dim = 1;
+      break;
+   case TGSI_TEXTURE_SHADOW1D:
+      *dim = 1;
+      *arg = 2;
+      break;
+   case TGSI_TEXTURE_UNKNOWN:
+   case TGSI_TEXTURE_2D:
+   case TGSI_TEXTURE_RECT:
+      *arg = *dim = 2;
+      break;
+   case TGSI_TEXTURE_SHADOW2D:
+   case TGSI_TEXTURE_SHADOWRECT:
+      *dim = 2;
+      *arg = 3;
+      break;
+   case TGSI_TEXTURE_3D:
+   case TGSI_TEXTURE_CUBE:
+      *dim = *arg = 3;
+      break;
+   default:
+      assert(0);
+      /* callers use both values as loop bounds, keep them defined */
+      *arg = *dim = 2;
+      break;
+   }
+}
+
+/* Fetch the first dim coordinates of a projective texture instruction
+ * into t[0 .. dim - 1], each multiplied by the reciprocal of the w
+ * component of source 0. The reciprocal itself is left in t[3].
+ */
+static void
+load_proj_tex_coords(struct bld_context *bld,
+		     struct nv_value *t[4], int dim,
+		     const struct tgsi_full_instruction *insn)
+{
+   int c, mask = 0;
+
+   t[3] = emit_fetch(bld, insn, 0, 3);
+
+   /* a PINTERP that produced w is turned into a LINTERP in place and its
+    * second source is dropped
+    */
+   if (t[3]->insn->opcode == NV_OP_PINTERP) {
+      t[3]->insn->opcode = NV_OP_LINTERP;
+      nv_reference(bld->pc, &t[3]->insn->src[1], NULL);
+   }
+
+   t[3] = bld_insn_1(bld, NV_OP_RCP, t[3]);
+
+   for (c = 0; c < dim; ++c) {
+      t[c] = emit_fetch(bld, insn, 0, c);
+      /* interpolated coordinates become PINTERPs that take the reciprocal
+       * as second source, so they need no separate multiplication
+       */
+      if (t[c]->insn->opcode == NV_OP_LINTERP)
+         t[c]->insn->opcode = NV_OP_PINTERP;
+
+      if (t[c]->insn->opcode == NV_OP_PINTERP)
+         nv_reference(bld->pc, &t[c]->insn->src[1], t[3]);
+      else
+         mask |= 1 << c; /* remember for the explicit MUL below */
+   }
+
+   /* all other coordinates are multiplied by the reciprocal explicitly */
+   for (c = 0; mask; ++c, mask >>= 1) {
+      if (!(mask & 1))
+         continue;
+      t[c] = bld_insn_2(bld, NV_OP_MUL, t[c], t[3]);
+   }
+}
+
+/* Build a texture instruction (TEX, TXP, TXB, TXL).
+ * The arguments are gathered from source 0: the coordinates (projected
+ * for TXP), then component z for targets with an extra argument, then
+ * component w for TXB and TXL. Each argument is copied with a MOV before
+ * use. All four result values are returned in dst0[].
+ */
+static void
+bld_tex(struct bld_context *bld, struct nv_value *dst0[4],
+        const struct tgsi_full_instruction *insn)
+{
+   const unsigned tgsi_op = insn->Instruction.Opcode;
+   const int is_cube = insn->Texture.Texture == TGSI_TEXTURE_CUBE;
+   struct nv_value *coord[4];
+   struct nv_instruction *tex;
+   int n_args, n_dims, k;
+
+   get_tex_dim(insn, &n_dims, &n_args);
+
+   if (tgsi_op != TGSI_OPCODE_TXP) {
+      for (k = 0; k < n_dims; ++k)
+         coord[k] = emit_fetch(bld, insn, 0, k);
+   } else {
+      load_proj_tex_coords(bld, coord, n_dims, insn);
+   }
+
+   if (n_args != n_dims)
+      coord[n_dims] = emit_fetch(bld, insn, 0, 2);
+
+   switch (tgsi_op) {
+   case TGSI_OPCODE_TXB:
+   case TGSI_OPCODE_TXL:
+      coord[n_args] = emit_fetch(bld, insn, 0, 3);
+      ++n_args;
+      break;
+   default:
+      break;
+   }
+
+   for (k = 0; k < n_args; ++k) {
+      coord[k] = bld_insn_1(bld, NV_OP_MOV, coord[k]);
+      coord[k]->reg.type = NV_TYPE_F32;
+   }
+
+   tex = new_instruction(bld->pc, translate_opcode(tgsi_op));
+
+   for (k = 0; k < 4; ++k) {
+      struct nv_value *def = new_value(bld->pc, NV_FILE_GPR, NV_TYPE_F32);
+
+      def->insn = tex;
+      tex->def[k] = def;
+      dst0[k] = def;
+   }
+   for (k = 0; k < n_args; ++k)
+      tex->src[k] = new_ref(bld->pc, coord[k]);
+
+   tex->tex_argc = n_args;
+   tex->tex_live = 0;
+   tex->tex_cube = is_cube ? 1 : 0;
+   tex->tex_mask = 0xf;
+   tex->tex_s = 0;
+   tex->tex_t = insn->Src[1].Register.Index;
+}
+
+/* Loop over the channels 0..3 that are enabled in the write mask of the
+ * first destination register of inst; chan must be an integer lvalue.
+ * The macro ends in an 'if', so the statement that follows it is its body.
+ */
+#define FOR_EACH_DST0_ENABLED_CHANNEL(chan, inst) \
+   for (chan = 0; chan < 4; ++chan)               \
+      if ((inst)->Dst[0].Register.WriteMask & (1 << (chan)))
+
+static void
+bld_instruction(struct bld_context *bld,
+                const struct tgsi_full_instruction *insn)
+{
+   struct nv_value *src0;
+   struct nv_value *src1;
+   struct nv_value *src2;
+   struct nv_value *dst0[4];
+   struct nv_value *temp;
+   int c;
+   uint opcode = translate_opcode(insn->Instruction.Opcode);
+
+   tgsi_dump_instruction(insn, 1);
+	
+   switch (insn->Instruction.Opcode) {
+   case TGSI_OPCODE_ADD:
+   case TGSI_OPCODE_MAX:
+   case TGSI_OPCODE_MIN:
+   case TGSI_OPCODE_MUL:
+      FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) {
+         src0 = emit_fetch(bld, insn, 0, c);
+         src1 = emit_fetch(bld, insn, 1, c);
+         dst0[c] = bld_insn_2(bld, opcode, src0, src1);
+      }
+      break;
+   case TGSI_OPCODE_CMP:
+      FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) {
+         src0 = emit_fetch(bld, insn, 0, c);
+         src1 = emit_fetch(bld, insn, 1, c);
+         src2 = emit_fetch(bld, insn, 2, c);
+         src0 = bld_predicate(bld, src0);
+
+         src1 = bld_insn_1(bld, NV_OP_MOV, src1);
+         src1->insn->flags_src = new_ref(bld->pc, src0);
+         src1->insn->cc = NV_CC_LT;
+
+         src2 = bld_insn_1(bld, NV_OP_MOV, src2);
+         src2->insn->flags_src = new_ref(bld->pc, src0);
+         src2->insn->cc = NV_CC_GE;
+
+         dst0[c] = bld_insn_2(bld, NV_OP_SELECT, src1, src2);
+      }
+      break;
+   case TGSI_OPCODE_COS:
+      src0 = emit_fetch(bld, insn, 0, 0);
+      temp = bld_insn_1(bld, NV_OP_PRESIN, src0);
+      if (insn->Dst[0].Register.WriteMask & 7)
+         temp = bld_insn_1(bld, NV_OP_COS, temp);
+      for (c = 0; c < 3; ++c)
+         if (insn->Dst[0].Register.WriteMask & (1 << c))
+            dst0[c] = temp;
+      if (!(insn->Dst[0].Register.WriteMask & (1 << 3)))
+         break;
+      /* XXX: if src0.x is src0.w, don't emit new insns */
+      src0 = emit_fetch(bld, insn, 0, 3);
+      temp = bld_insn_1(bld, NV_OP_PRESIN, src0);
+      dst0[3] = bld_insn_1(bld, NV_OP_COS, temp);
+      break;
+   case TGSI_OPCODE_DP3:
+      src0 = emit_fetch(bld, insn, 0, 0);
+      src1 = emit_fetch(bld, insn, 1, 0);
+      temp = bld_insn_2(bld, NV_OP_MUL, src0, src1);
+      for (c = 1; c < 3; ++c) {
+         src0 = emit_fetch(bld, insn, 0, c);
+         src1 = emit_fetch(bld, insn, 1, c);
+         temp = bld_insn_3(bld, NV_OP_MAD, src0, src1, temp);
+      }
+      FOR_EACH_DST0_ENABLED_CHANNEL(c, insn)
+         dst0[c] = temp;
+      break;
+   case TGSI_OPCODE_DP4:
+      src0 = emit_fetch(bld, insn, 0, 0);
+      src1 = emit_fetch(bld, insn, 1, 0);
+      temp = bld_insn_2(bld, NV_OP_MUL, src0, src1);
+      for (c = 1; c < 4; ++c) {
+         src0 = emit_fetch(bld, insn, 0, c);
+         src1 = emit_fetch(bld, insn, 1, c);
+         temp = bld_insn_3(bld, NV_OP_MAD, src0, src1, temp);
+      }
+      FOR_EACH_DST0_ENABLED_CHANNEL(c, insn)
+         dst0[c] = temp;
+      break;
+   case TGSI_OPCODE_EX2:
+      src0 = emit_fetch(bld, insn, 0, 0);
+      temp = bld_insn_1(bld, NV_OP_PREEX2, src0);
+      temp = bld_insn_1(bld, NV_OP_EX2, temp);
+      FOR_EACH_DST0_ENABLED_CHANNEL(c, insn)
+         dst0[c] = temp;
+      break;
+   case TGSI_OPCODE_FRC:
+      FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) {
+         src0 = emit_fetch(bld, insn, 0, c);
+         dst0[c] = bld_insn_1(bld, NV_OP_FLOOR, src0);
+         dst0[c] = bld_insn_2(bld, NV_OP_SUB, src0, dst0[c]);
+      }
+      break;
+   case TGSI_OPCODE_KIL:
+      for (c = 0; c < 4; ++c) {
+         src0 = emit_fetch(bld, insn, 0, c);
+         bld_kil(bld, src0);
+      }
+      break;
+   case TGSI_OPCODE_IF:
+   {
+      struct nv_basic_block *b = new_basic_block(bld->pc);
+
+      nvbb_attach_block(bld->pc->current_block, b);
+
+      bld->join_bb[bld->cond_lvl] = bld->pc->current_block;
+      bld->cond_bb[bld->cond_lvl] = bld->pc->current_block;
+
+      src1 = bld_predicate(bld, emit_fetch(bld, insn, 0, 0));
+
+      bld_flow(bld, NV_OP_BRA, NV_CC_EQ, src1, FALSE);
+
+      ++bld->cond_lvl;
+      bld_new_block(bld, b);
+   }
+      break;
+   case TGSI_OPCODE_ELSE:
+   {
+      struct nv_basic_block *b = new_basic_block(bld->pc);
+
+      --bld->cond_lvl;
+      nvbb_attach_block(bld->join_bb[bld->cond_lvl], b);
+
+      bld->cond_bb[bld->cond_lvl]->exit->target = b;
+      bld->cond_bb[bld->cond_lvl] = bld->pc->current_block;
+
+      new_instruction(bld->pc, NV_OP_BRA)->is_terminator = 1;
+
+      ++bld->cond_lvl;
+      bld_new_block(bld, b);
+   }
+      break;
+   case TGSI_OPCODE_ENDIF: /* XXX: deal with ENDIF; ENDIF; */
+   {
+      struct nv_basic_block *b = new_basic_block(bld->pc);
+
+      --bld->cond_lvl;
+      nvbb_attach_block(bld->pc->current_block, b);
+      nvbb_attach_block(bld->cond_bb[bld->cond_lvl], b);
+
+      bld->cond_bb[bld->cond_lvl]->exit->target = b;
+
+      if (0 && bld->join_bb[bld->cond_lvl]) {
+         bld->join_bb[bld->cond_lvl]->exit->prev->target = b;
+
+         new_instruction(bld->pc, NV_OP_NOP)->is_join = TRUE;
+      }
+
+      bld_new_block(bld, b);
+   }
+      break;
+   case TGSI_OPCODE_BGNLOOP:
+      assert(0);
+      break;
+   case TGSI_OPCODE_BRK:
+      assert(0);
+      break;
+   case TGSI_OPCODE_CONT:
+      assert(0);
+      break;
+   case TGSI_OPCODE_ENDLOOP:
+      assert(0);
+      break;
+   case TGSI_OPCODE_ABS:
+   case TGSI_OPCODE_CEIL:
+   case TGSI_OPCODE_FLR:
+   case TGSI_OPCODE_TRUNC:
+   case TGSI_OPCODE_DDX:
+   case TGSI_OPCODE_DDY:
+      FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) {
+         src0 = emit_fetch(bld, insn, 0, c);
+         dst0[c] = bld_insn_1(bld, opcode, src0);
+      }	   
+      break;
+   case TGSI_OPCODE_LIT:
+      bld_lit(bld, dst0, insn);
+      break;
+   case TGSI_OPCODE_LRP:
+      FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) {
+         src0 = emit_fetch(bld, insn, 0, c);
+         src1 = emit_fetch(bld, insn, 1, c);
+         src2 = emit_fetch(bld, insn, 2, c);
+         dst0[c] = bld_insn_2(bld, NV_OP_SUB, src1, src2);
+         dst0[c] = bld_insn_3(bld, NV_OP_MAD, dst0[c], src0, src2);
+      }
+      break;
+   case TGSI_OPCODE_MOV:
+      FOR_EACH_DST0_ENABLED_CHANNEL(c, insn)
+         dst0[c] = emit_fetch(bld, insn, 0, c);
+      break;
+   case TGSI_OPCODE_MAD:
+      FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) {
+         src0 = emit_fetch(bld, insn, 0, c);
+         src1 = emit_fetch(bld, insn, 1, c);
+         src2 = emit_fetch(bld, insn, 2, c);
+         dst0[c] = bld_insn_3(bld, opcode, src0, src1, src2);
+      }
+      break;
+   case TGSI_OPCODE_POW:
+      src0 = emit_fetch(bld, insn, 0, 0);
+      src1 = emit_fetch(bld, insn, 1, 0);
+      temp = bld_pow(bld, src0, src1);
+      FOR_EACH_DST0_ENABLED_CHANNEL(c, insn)
+         dst0[c] = temp;
+      break;
+   case TGSI_OPCODE_RCP:
+   case TGSI_OPCODE_LG2:
+      src0 = emit_fetch(bld, insn, 0, 0);
+      temp = bld_insn_1(bld, opcode, src0);
+      FOR_EACH_DST0_ENABLED_CHANNEL(c, insn)
+         dst0[c] = temp;
+      break;
+   case TGSI_OPCODE_RSQ:
+      src0 = emit_fetch(bld, insn, 0, 0);
+      temp = bld_insn_1(bld, NV_OP_ABS, src0);
+      temp = bld_insn_1(bld, NV_OP_RSQ, temp);
+      FOR_EACH_DST0_ENABLED_CHANNEL(c, insn)
+         dst0[c] = temp;
+      break;
+   case TGSI_OPCODE_SLT:
+   case TGSI_OPCODE_SGE:
+   case TGSI_OPCODE_SEQ:
+   case TGSI_OPCODE_SGT:
+   case TGSI_OPCODE_SLE:
+   case TGSI_OPCODE_SNE:
+   case TGSI_OPCODE_ISLT:
+   case TGSI_OPCODE_ISGE:
+   case TGSI_OPCODE_USEQ:
+   case TGSI_OPCODE_USGE:
+   case TGSI_OPCODE_USLT:
+   case TGSI_OPCODE_USNE:
+      FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) {
+         src0 = emit_fetch(bld, insn, 0, c);
+         src1 = emit_fetch(bld, insn, 1, c);
+         dst0[c] = bld_insn_2(bld, NV_OP_SET, src0, src1);
+         dst0[c]->insn->set_cond = translate_setcc(insn->Instruction.Opcode);
+         dst0[c]->reg.type = infer_dst_type(insn->Instruction.Opcode);
+
+         if (dst0[c]->reg.type != NV_TYPE_F32)
+            break;
+         dst0[c] = bld_insn_1(bld, NV_OP_ABS, dst0[c]);
+         dst0[c]->insn->src[0]->typecast = NV_TYPE_S32;
+         dst0[c]->reg.type = NV_TYPE_S32;
+         dst0[c] = bld_insn_1(bld, NV_OP_CVT, dst0[c]);
+         dst0[c]->reg.type = NV_TYPE_F32;
+      }
+      break;
+   case TGSI_OPCODE_SUB:
+      FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) {
+         src0 = emit_fetch(bld, insn, 0, c);
+         src1 = emit_fetch(bld, insn, 1, c);
+         dst0[c] = bld_insn_2(bld, NV_OP_ADD, src0, src1);
+         dst0[c]->insn->src[1]->mod ^= NV_MOD_NEG;
+      }
+      break;
+   case TGSI_OPCODE_TEX:
+   case TGSI_OPCODE_TXB:
+   case TGSI_OPCODE_TXL:
+   case TGSI_OPCODE_TXP:
+      bld_tex(bld, dst0, insn);
+      break;
+   case TGSI_OPCODE_XPD:
+      FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) {
+         if (c == 3) {
+            dst0[3] = bld_imm_f32(bld, 1.0f);
+            break;
+         }
+         src0 = emit_fetch(bld, insn, 0, (c + 1) % 3);
+         src1 = emit_fetch(bld, insn, 1, (c + 2) % 3);
+         dst0[c] = bld_insn_2(bld, NV_OP_MUL, src0, src1);
+
+         src0 = emit_fetch(bld, insn, 0, (c + 2) % 3);
+         src1 = emit_fetch(bld, insn, 1, (c + 1) % 3);
+         dst0[c] = bld_insn_3(bld, NV_OP_MAD, src0, src1, dst0[c]);
+
+         dst0[c]->insn->src[2]->mod ^= NV_MOD_NEG;
+      }
+      break;
+   case TGSI_OPCODE_END:
+      if (bld->ti->p->type == PIPE_SHADER_FRAGMENT)
+         bld_export_outputs(bld);
+      break;
+   default:
+      NOUVEAU_ERR("nv_bld: unhandled opcode %u\n", insn->Instruction.Opcode);
+      abort();
+      break;
+   }
+
+   FOR_EACH_DST0_ENABLED_CHANNEL(c, insn)
+      emit_store(bld, insn, c, dst0[c]);
+}
+
+/* Feed every instruction token of ti->p->pipe.tokens to bld_instruction(),
+ * rooted at a new basic block. Returns 0, or 1 if bld can't be allocated. */
+int
+nv50_tgsi_to_nc(struct nv_pc *pc, struct nv50_translation_info *ti)
+{
+   struct bld_context *bld = CALLOC_STRUCT(bld_context);
+   int c;
+
+   if (!bld) /* bail out before modifying pc; nothing can be built */
+      return 1;
+   bld->pc = pc;
+   bld->ti = ti;
+
+   pc->root = pc->current_block = new_basic_block(pc);
+   pc->loop_nesting_bound = 1; /* XXX: should work with 0 */
+
+   c = util_bitcount(bld->ti->p->fp.interp >> 24);
+   if (c && ti->p->type == PIPE_SHADER_FRAGMENT) {
+      /* frgcrd[3] = RCP(LINTERP(v)), v in NV_FILE_MEM_V with reg.id c - 1 */
+      bld->frgcrd[3] = new_value(pc, NV_FILE_MEM_V, NV_TYPE_F32);
+      bld->frgcrd[3]->reg.id = c - 1;
+      bld->frgcrd[3] = bld_insn_1(bld, NV_OP_LINTERP, bld->frgcrd[3]);
+      bld->frgcrd[3] = bld_insn_1(bld, NV_OP_RCP, bld->frgcrd[3]);
+   }
+
+   tgsi_parse_init(&bld->parse[0], ti->p->pipe.tokens);
+
+   while (!tgsi_parse_end_of_tokens(&bld->parse[bld->call_lvl])) {
+      const union tgsi_full_token *tok = &bld->parse[bld->call_lvl].FullToken;
+
+      tgsi_parse_token(&bld->parse[bld->call_lvl]);
+
+      /* only instruction tokens are translated; all others are skipped */
+      if (tok->Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION)
+         bld_instruction(bld, &tok->FullInstruction);
+   }
+
+   FREE(bld);
+   return 0;
+}
+
+#if 0
+/* If a variable is assigned in a loop, replace all references to the value
+ * from outside the loop with a phi value: every source (and flags source)
+ * referring to old_val in b, and in all blocks reachable from b through
+ * out[0]/out[1], is redirected to new_val. pass_seq marks visited blocks.
+ */
+static void
+bld_adjust_nv_refs(struct nv_pc *pc, struct nv_basic_block *b,
+                   struct nv_value *old_val, struct nv_value *new_val)
+{
+   struct nv_instruction *nvi;
+   int s;
+
+   for (nvi = b->entry; nvi; nvi = nvi->next) {
+      for (s = 0; s < 5; ++s)
+         if (nvi->src[s] && nvi->src[s]->value == old_val)
+            nv_reference(pc, &nvi->src[s], new_val);
+      if (nvi->flags_src && nvi->flags_src->value == old_val)
+         nv_reference(pc, &nvi->flags_src, new_val);
+   }
+   b->pass_seq = pc->pass_seq;
+
+   /* Recurse into the successors, not b itself: re-entering b would never
+    * update the successors' pass_seq, so the recursion would not terminate.
+    */
+   if (b->out[0] && b->out[0]->pass_seq < pc->pass_seq)
+      bld_adjust_nv_refs(pc, b->out[0], old_val, new_val);
+   if (b->out[1] && b->out[1]->pass_seq < pc->pass_seq)
+      bld_adjust_nv_refs(pc, b->out[1], old_val, new_val);
+}
+#endif
diff --git a/src/gallium/drivers/nv50/nv50_vbo.c b/src/gallium/drivers/nv50/nv50_vbo.c
index 864cb09352..6bd52884b5 100644
--- a/src/gallium/drivers/nv50/nv50_vbo.c
+++ b/src/gallium/drivers/nv50/nv50_vbo.c
@@ -519,7 +519,7 @@ nv50_vbo_static_attrib(struct nv50_context *nv50, unsigned attrib,
 		so_data  (so, fui(v[1]));
 		break;
 	case 1:
-		if (attrib == nv50->vertprog->cfg.edgeflag_in) {
+		if (attrib == nv50->vertprog->vp.edgeflag) {
 			so_method(so, tesla, NV50TCL_EDGEFLAG_ENABLE, 1);
 			so_data  (so, v[0] ? 1 : 0);
 		}
@@ -560,7 +560,7 @@ nv50_vbo_validate(struct nv50_context *nv50)
 
 	nv50->vbo_fifo = 0;
 	if (nv50->screen->force_push ||
-	    nv50->vertprog->cfg.edgeflag_in < 16)
+	    nv50->vertprog->vp.edgeflag < 16)
 		nv50->vbo_fifo = 0xffff;
 
 	for (i = 0; i < nv50->vtxbuf_nr; i++) {
-- 
cgit v1.2.3


From f3af1201c578443dd0f72e73470dd1763888a41d Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Sat, 24 Jul 2010 12:49:15 +0200
Subject: nouveau: update nouveau_class.h

Adds nvc0, new vertex formats, and dual source blending values.
---
 src/gallium/drivers/nouveau/nouveau_class.h | 1171 +++++++++++++++++++++++++--
 src/gallium/drivers/nv50/nv50_state.c       |   55 +-
 src/gallium/drivers/nv50/nv50_vbo.c         |   17 +-
 3 files changed, 1159 insertions(+), 84 deletions(-)

(limited to 'src')

diff --git a/src/gallium/drivers/nouveau/nouveau_class.h b/src/gallium/drivers/nouveau/nouveau_class.h
index adfdd37b1b..975fd8f35a 100644
--- a/src/gallium/drivers/nouveau/nouveau_class.h
+++ b/src/gallium/drivers/nouveau/nouveau_class.h
@@ -735,6 +735,45 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define  NV50_MEMORY_TO_MEMORY_FORMAT_OFFSET_OUT_HIGH					0x0000023c
 
 
+#define NVC0_MEMORY_TO_MEMORY_FORMAT							0x00009039
+
+#define  NVC0_MEMORY_TO_MEMORY_FORMAT_NOP						0x00000100
+#define  NVC0_MEMORY_TO_MEMORY_FORMAT_SERIALIZE						0x00000110
+#define  NVC0_MEMORY_TO_MEMORY_FORMAT_TILING_MODE_IN					0x00000204
+#define  NVC0_MEMORY_TO_MEMORY_FORMAT_TILING_PITCH_IN					0x00000208
+#define  NVC0_MEMORY_TO_MEMORY_FORMAT_TILING_HEIGHT_IN					0x0000020c
+#define  NVC0_MEMORY_TO_MEMORY_FORMAT_TILING_DEPTH_IN					0x00000210
+#define  NVC0_MEMORY_TO_MEMORY_FORMAT_TILING_POSITION_IN_Z				0x00000214
+#define  NVC0_MEMORY_TO_MEMORY_FORMAT_TILING_MODE_OUT					0x00000220
+#define  NVC0_MEMORY_TO_MEMORY_FORMAT_TILING_PITCH_OUT					0x00000224
+#define  NVC0_MEMORY_TO_MEMORY_FORMAT_TILING_HEIGHT_OUT					0x00000228
+#define  NVC0_MEMORY_TO_MEMORY_FORMAT_TILING_DEPTH_OUT					0x0000022c
+#define  NVC0_MEMORY_TO_MEMORY_FORMAT_TILING_POSITION_OUT_Z				0x00000230
+#define  NVC0_MEMORY_TO_MEMORY_FORMAT_OFFSET_OUT_HIGH					0x00000238
+#define  NVC0_MEMORY_TO_MEMORY_FORMAT_OFFSET_OUT_LOW					0x0000023c
+#define  NVC0_MEMORY_TO_MEMORY_FORMAT_EXEC						0x00000300
+#define   NVC0_MEMORY_TO_MEMORY_FORMAT_EXEC_PUSH					(1 <<  0)
+#define   NVC0_MEMORY_TO_MEMORY_FORMAT_EXEC_LINEAR_IN					(1 <<  4)
+#define   NVC0_MEMORY_TO_MEMORY_FORMAT_EXEC_LINEAR_OUT					(1 <<  8)
+#define   NVC0_MEMORY_TO_MEMORY_FORMAT_EXEC_NOTIFY					(1 << 13)
+#define   NVC0_MEMORY_TO_MEMORY_FORMAT_EXEC_INC_SHIFT					20
+#define   NVC0_MEMORY_TO_MEMORY_FORMAT_EXEC_INC_MASK					0x00f00000
+#define  NVC0_MEMORY_TO_MEMORY_FORMAT_DATA						0x00000304
+#define  NVC0_MEMORY_TO_MEMORY_FORMAT_OFFSET_IN_HIGH					0x0000030c
+#define  NVC0_MEMORY_TO_MEMORY_FORMAT_OFFSET_IN_LOW					0x00000310
+#define  NVC0_MEMORY_TO_MEMORY_FORMAT_PITCH_IN						0x00000314
+#define  NVC0_MEMORY_TO_MEMORY_FORMAT_PITCH_OUT						0x00000318
+#define  NVC0_MEMORY_TO_MEMORY_FORMAT_LINE_LENGTH_IN					0x0000031c
+#define  NVC0_MEMORY_TO_MEMORY_FORMAT_LINE_COUNT					0x00000320
+#define  NVC0_MEMORY_TO_MEMORY_FORMAT_NOTIFY_ADDRESS_HIGH				0x0000032c
+#define  NVC0_MEMORY_TO_MEMORY_FORMAT_NOTIFY_ADDRESS_LOW				0x00000330
+#define  NVC0_MEMORY_TO_MEMORY_FORMAT_NOTIFY						0x00000334
+#define  NVC0_MEMORY_TO_MEMORY_FORMAT_TILING_POSITION_IN_X				0x00000344
+#define  NVC0_MEMORY_TO_MEMORY_FORMAT_TILING_POSITION_IN_Y				0x00000348
+#define  NVC0_MEMORY_TO_MEMORY_FORMAT_TILING_POSITION_OUT_X				0x0000034c
+#define  NVC0_MEMORY_TO_MEMORY_FORMAT_TILING_POSITION_OUT_Y				0x00000350
+
+
 #define NV01_MEMORY_LOCAL_BANKED							0x0000003d
 
 
@@ -4507,6 +4546,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define   NV20TCL_VTXFMT_TYPE_SHIFT							0
 #define   NV20TCL_VTXFMT_TYPE_MASK							0x0000000f
 #define    NV20TCL_VTXFMT_TYPE_FLOAT							0x00000002
+#define    NV20TCL_VTXFMT_TYPE_HALF							0x00000003
 #define    NV20TCL_VTXFMT_TYPE_UBYTE							0x00000004
 #define    NV20TCL_VTXFMT_TYPE_USHORT							0x00000005
 #define   NV20TCL_VTXFMT_SIZE_SHIFT							4
@@ -6990,6 +7030,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define   NV40TCL_VTXFMT_TYPE_SHIFT							0
 #define   NV40TCL_VTXFMT_TYPE_MASK							0x0000000f
 #define    NV40TCL_VTXFMT_TYPE_FLOAT							0x00000002
+#define    NV40TCL_VTXFMT_TYPE_HALF							0x00000003
 #define    NV40TCL_VTXFMT_TYPE_UBYTE							0x00000004
 #define    NV40TCL_VTXFMT_TYPE_USHORT							0x00000005
 #define   NV40TCL_VTXFMT_SIZE_SHIFT							4
@@ -7699,7 +7740,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define  NV50TCL_DMA_TIC								0x000001a0
 #define  NV50TCL_DMA_TEXTURE								0x000001a4
 #define  NV50TCL_DMA_STRMOUT								0x000001a8
-#define  NV50TCL_DMA_UNK01AC								0x000001ac
+#define  NV50TCL_DMA_CLIPID								0x000001ac
 #define  NV50TCL_DMA_COLOR(x)								(0x000001c0+((x)*4))
 #define  NV50TCL_DMA_COLOR__SIZE							0x00000008
 #define  NV50TCL_RT_ADDRESS_HIGH(x)							(0x00000200+((x)*32))
@@ -7916,8 +7957,20 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define  NV50TCL_DEPTH_RANGE_FAR__SIZE							0x00000010
 #define  NV50TCL_VIEWPORT_CLIP_HORIZ(x)							(0x00000d00+((x)*8))
 #define  NV50TCL_VIEWPORT_CLIP_HORIZ__SIZE						0x00000008
+#define   NV50TCL_VIEWPORT_CLIP_HORIZ_MIN_SHIFT						0
+#define   NV50TCL_VIEWPORT_CLIP_HORIZ_MIN_MASK						0x0000ffff
+#define   NV50TCL_VIEWPORT_CLIP_HORIZ_MAX_SHIFT						16
+#define   NV50TCL_VIEWPORT_CLIP_HORIZ_MAX_MASK						0xffff0000
 #define  NV50TCL_VIEWPORT_CLIP_VERT(x)							(0x00000d04+((x)*8))
 #define  NV50TCL_VIEWPORT_CLIP_VERT__SIZE						0x00000008
+#define   NV50TCL_VIEWPORT_CLIP_VERT_MIN_SHIFT						0
+#define   NV50TCL_VIEWPORT_CLIP_VERT_MIN_MASK						0x0000ffff
+#define   NV50TCL_VIEWPORT_CLIP_VERT_MAX_SHIFT						16
+#define   NV50TCL_VIEWPORT_CLIP_VERT_MAX_MASK						0xffff0000
+#define  NV50TCL_CLIPID_REGION_HORIZ(x)							(0x00000d40+((x)*8))
+#define  NV50TCL_CLIPID_REGION_HORIZ__SIZE						0x00000004
+#define  NV50TCL_CLIPID_REGION_VERT(x)							(0x00000d44+((x)*8))
+#define  NV50TCL_CLIPID_REGION_VERT__SIZE						0x00000004
 #define  NV50TCL_VERTEX_BUFFER_FIRST							0x00000d74
 #define  NV50TCL_VERTEX_BUFFER_COUNT							0x00000d78
 #define  NV50TCL_CLEAR_COLOR(x)								(0x00000d80+((x)*4))
@@ -7975,14 +8028,16 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define  NV50TCL_GP_ADDRESS_LOW								0x00000f74
 #define  NV50TCL_VP_ADDRESS_HIGH							0x00000f7c
 #define  NV50TCL_VP_ADDRESS_LOW								0x00000f80
-#define  NV50TCL_UNK0F84_ADDRESS_HIGH							0x00000f84
-#define  NV50TCL_UNK0F84_ADDRESS_LOW							0x00000f88
+#define  NV50TCL_VERTEX_RUNOUT_HIGH							0x00000f84
+#define  NV50TCL_VERTEX_RUNOUT_LOW							0x00000f88
 #define  NV50TCL_DEPTH_BOUNDS(x)							(0x00000f9c+((x)*4))
 #define  NV50TCL_DEPTH_BOUNDS__SIZE							0x00000002
 #define  NV50TCL_FP_ADDRESS_HIGH							0x00000fa4
 #define  NV50TCL_FP_ADDRESS_LOW								0x00000fa8
 #define  NV50TCL_MSAA_MASK(x)								(0x00000fbc+((x)*4))
 #define  NV50TCL_MSAA_MASK__SIZE							0x00000004
+#define  NV50TCL_CLIPID_ADDRESS_HIGH							0x00000fcc
+#define  NV50TCL_CLIPID_ADDRESS_LOW							0x00000fd0
 #define  NV50TCL_ZETA_ADDRESS_HIGH							0x00000fe0
 #define  NV50TCL_ZETA_ADDRESS_LOW							0x00000fe4
 #define  NV50TCL_ZETA_FORMAT								0x00000fe8
@@ -8112,37 +8167,45 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define   NV50TCL_BLEND_EQUATION_RGB_FUNC_SUBTRACT					0x0000800a
 #define   NV50TCL_BLEND_EQUATION_RGB_FUNC_REVERSE_SUBTRACT				0x0000800b
 #define  NV50TCL_BLEND_FUNC_SRC_RGB							0x00001344
-#define   NV50TCL_BLEND_FUNC_SRC_RGB_ZERO						0x00000000
-#define   NV50TCL_BLEND_FUNC_SRC_RGB_ONE						0x00000001
-#define   NV50TCL_BLEND_FUNC_SRC_RGB_SRC_COLOR						0x00000300
-#define   NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_SRC_COLOR				0x00000301
-#define   NV50TCL_BLEND_FUNC_SRC_RGB_SRC_ALPHA						0x00000302
-#define   NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_SRC_ALPHA				0x00000303
-#define   NV50TCL_BLEND_FUNC_SRC_RGB_DST_ALPHA						0x00000304
-#define   NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_DST_ALPHA				0x00000305
-#define   NV50TCL_BLEND_FUNC_SRC_RGB_DST_COLOR						0x00000306
-#define   NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_DST_COLOR				0x00000307
-#define   NV50TCL_BLEND_FUNC_SRC_RGB_SRC_ALPHA_SATURATE					0x00000308
-#define   NV50TCL_BLEND_FUNC_SRC_RGB_CONSTANT_COLOR					0x00008001
-#define   NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_CONSTANT_COLOR				0x00008002
-#define   NV50TCL_BLEND_FUNC_SRC_RGB_CONSTANT_ALPHA					0x00008003
-#define   NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_CONSTANT_ALPHA				0x00008004
+#define   NV50TCL_BLEND_FUNC_SRC_RGB_ZERO						0x00004000
+#define   NV50TCL_BLEND_FUNC_SRC_RGB_ONE						0x00004001
+#define   NV50TCL_BLEND_FUNC_SRC_RGB_SRC_COLOR						0x00004300
+#define   NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_SRC_COLOR				0x00004301
+#define   NV50TCL_BLEND_FUNC_SRC_RGB_SRC_ALPHA						0x00004302
+#define   NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_SRC_ALPHA				0x00004303
+#define   NV50TCL_BLEND_FUNC_SRC_RGB_DST_ALPHA						0x00004304
+#define   NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_DST_ALPHA				0x00004305
+#define   NV50TCL_BLEND_FUNC_SRC_RGB_DST_COLOR						0x00004306
+#define   NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_DST_COLOR				0x00004307
+#define   NV50TCL_BLEND_FUNC_SRC_RGB_SRC_ALPHA_SATURATE					0x00004308
+#define   NV50TCL_BLEND_FUNC_SRC_RGB_CONSTANT_COLOR					0x0000c001
+#define   NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_CONSTANT_COLOR				0x0000c002
+#define   NV50TCL_BLEND_FUNC_SRC_RGB_CONSTANT_ALPHA					0x0000c003
+#define   NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_CONSTANT_ALPHA				0x0000c004
+#define   NV50TCL_BLEND_FUNC_SRC_RGB_SRC1_COLOR						0x0000c900
+#define   NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_SRC1_COLOR				0x0000c901
+#define   NV50TCL_BLEND_FUNC_SRC_RGB_SRC1_ALPHA						0x0000c902
+#define   NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_SRC1_ALPHA				0x0000c903
 #define  NV50TCL_BLEND_FUNC_DST_RGB							0x00001348
-#define   NV50TCL_BLEND_FUNC_DST_RGB_ZERO						0x00000000
-#define   NV50TCL_BLEND_FUNC_DST_RGB_ONE						0x00000001
-#define   NV50TCL_BLEND_FUNC_DST_RGB_SRC_COLOR						0x00000300
-#define   NV50TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_SRC_COLOR				0x00000301
-#define   NV50TCL_BLEND_FUNC_DST_RGB_SRC_ALPHA						0x00000302
-#define   NV50TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_SRC_ALPHA				0x00000303
-#define   NV50TCL_BLEND_FUNC_DST_RGB_DST_ALPHA						0x00000304
-#define   NV50TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_DST_ALPHA				0x00000305
-#define   NV50TCL_BLEND_FUNC_DST_RGB_DST_COLOR						0x00000306
-#define   NV50TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_DST_COLOR				0x00000307
-#define   NV50TCL_BLEND_FUNC_DST_RGB_SRC_ALPHA_SATURATE					0x00000308
-#define   NV50TCL_BLEND_FUNC_DST_RGB_CONSTANT_COLOR					0x00008001
-#define   NV50TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_CONSTANT_COLOR				0x00008002
-#define   NV50TCL_BLEND_FUNC_DST_RGB_CONSTANT_ALPHA					0x00008003
-#define   NV50TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_CONSTANT_ALPHA				0x00008004
+#define   NV50TCL_BLEND_FUNC_DST_RGB_ZERO						0x00004000
+#define   NV50TCL_BLEND_FUNC_DST_RGB_ONE						0x00004001
+#define   NV50TCL_BLEND_FUNC_DST_RGB_SRC_COLOR						0x00004300
+#define   NV50TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_SRC_COLOR				0x00004301
+#define   NV50TCL_BLEND_FUNC_DST_RGB_SRC_ALPHA						0x00004302
+#define   NV50TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_SRC_ALPHA				0x00004303
+#define   NV50TCL_BLEND_FUNC_DST_RGB_DST_ALPHA						0x00004304
+#define   NV50TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_DST_ALPHA				0x00004305
+#define   NV50TCL_BLEND_FUNC_DST_RGB_DST_COLOR						0x00004306
+#define   NV50TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_DST_COLOR				0x00004307
+#define   NV50TCL_BLEND_FUNC_DST_RGB_SRC_ALPHA_SATURATE					0x00004308
+#define   NV50TCL_BLEND_FUNC_DST_RGB_CONSTANT_COLOR					0x0000c001
+#define   NV50TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_CONSTANT_COLOR				0x0000c002
+#define   NV50TCL_BLEND_FUNC_DST_RGB_CONSTANT_ALPHA					0x0000c003
+#define   NV50TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_CONSTANT_ALPHA				0x0000c004
+#define   NV50TCL_BLEND_FUNC_DST_RGB_SRC1_COLOR						0x0000c900
+#define   NV50TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_SRC1_COLOR				0x0000c901
+#define   NV50TCL_BLEND_FUNC_DST_RGB_SRC1_ALPHA						0x0000c902
+#define   NV50TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_SRC1_ALPHA				0x0000c903
 #define  NV50TCL_BLEND_EQUATION_ALPHA							0x0000134c
 #define   NV50TCL_BLEND_EQUATION_ALPHA_FUNC_ADD						0x00008006
 #define   NV50TCL_BLEND_EQUATION_ALPHA_MIN						0x00008007
@@ -8150,37 +8213,45 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define   NV50TCL_BLEND_EQUATION_ALPHA_FUNC_SUBTRACT					0x0000800a
 #define   NV50TCL_BLEND_EQUATION_ALPHA_FUNC_REVERSE_SUBTRACT				0x0000800b
 #define  NV50TCL_BLEND_FUNC_SRC_ALPHA							0x00001350
-#define   NV50TCL_BLEND_FUNC_SRC_ALPHA_ZERO						0x00000000
-#define   NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE						0x00000001
-#define   NV50TCL_BLEND_FUNC_SRC_ALPHA_SRC_COLOR					0x00000300
-#define   NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_SRC_COLOR				0x00000301
-#define   NV50TCL_BLEND_FUNC_SRC_ALPHA_SRC_ALPHA					0x00000302
-#define   NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_SRC_ALPHA				0x00000303
-#define   NV50TCL_BLEND_FUNC_SRC_ALPHA_DST_ALPHA					0x00000304
-#define   NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_DST_ALPHA				0x00000305
-#define   NV50TCL_BLEND_FUNC_SRC_ALPHA_DST_COLOR					0x00000306
-#define   NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_DST_COLOR				0x00000307
-#define   NV50TCL_BLEND_FUNC_SRC_ALPHA_SRC_ALPHA_SATURATE				0x00000308
-#define   NV50TCL_BLEND_FUNC_SRC_ALPHA_CONSTANT_COLOR					0x00008001
-#define   NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_CONSTANT_COLOR				0x00008002
-#define   NV50TCL_BLEND_FUNC_SRC_ALPHA_CONSTANT_ALPHA					0x00008003
-#define   NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_CONSTANT_ALPHA				0x00008004
+#define   NV50TCL_BLEND_FUNC_SRC_ALPHA_ZERO						0x00004000
+#define   NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE						0x00004001
+#define   NV50TCL_BLEND_FUNC_SRC_ALPHA_SRC_COLOR					0x00004300
+#define   NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_SRC_COLOR				0x00004301
+#define   NV50TCL_BLEND_FUNC_SRC_ALPHA_SRC_ALPHA					0x00004302
+#define   NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_SRC_ALPHA				0x00004303
+#define   NV50TCL_BLEND_FUNC_SRC_ALPHA_DST_ALPHA					0x00004304
+#define   NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_DST_ALPHA				0x00004305
+#define   NV50TCL_BLEND_FUNC_SRC_ALPHA_DST_COLOR					0x00004306
+#define   NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_DST_COLOR				0x00004307
+#define   NV50TCL_BLEND_FUNC_SRC_ALPHA_SRC_ALPHA_SATURATE				0x00004308
+#define   NV50TCL_BLEND_FUNC_SRC_ALPHA_CONSTANT_COLOR					0x0000c001
+#define   NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_CONSTANT_COLOR				0x0000c002
+#define   NV50TCL_BLEND_FUNC_SRC_ALPHA_CONSTANT_ALPHA					0x0000c003
+#define   NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_CONSTANT_ALPHA				0x0000c004
+#define   NV50TCL_BLEND_FUNC_SRC_ALPHA_SRC1_COLOR					0x0000c900
+#define   NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_SRC1_COLOR				0x0000c901
+#define   NV50TCL_BLEND_FUNC_SRC_ALPHA_SRC1_ALPHA					0x0000c902
+#define   NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_SRC1_ALPHA				0x0000c903
 #define  NV50TCL_BLEND_FUNC_DST_ALPHA							0x00001358
-#define   NV50TCL_BLEND_FUNC_DST_ALPHA_ZERO						0x00000000
-#define   NV50TCL_BLEND_FUNC_DST_ALPHA_ONE						0x00000001
-#define   NV50TCL_BLEND_FUNC_DST_ALPHA_SRC_COLOR					0x00000300
-#define   NV50TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_SRC_COLOR				0x00000301
-#define   NV50TCL_BLEND_FUNC_DST_ALPHA_SRC_ALPHA					0x00000302
-#define   NV50TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_SRC_ALPHA				0x00000303
-#define   NV50TCL_BLEND_FUNC_DST_ALPHA_DST_ALPHA					0x00000304
-#define   NV50TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_DST_ALPHA				0x00000305
-#define   NV50TCL_BLEND_FUNC_DST_ALPHA_DST_COLOR					0x00000306
-#define   NV50TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_DST_COLOR				0x00000307
-#define   NV50TCL_BLEND_FUNC_DST_ALPHA_SRC_ALPHA_SATURATE				0x00000308
-#define   NV50TCL_BLEND_FUNC_DST_ALPHA_CONSTANT_COLOR					0x00008001
-#define   NV50TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_CONSTANT_COLOR				0x00008002
-#define   NV50TCL_BLEND_FUNC_DST_ALPHA_CONSTANT_ALPHA					0x00008003
-#define   NV50TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_CONSTANT_ALPHA				0x00008004
+#define   NV50TCL_BLEND_FUNC_DST_ALPHA_ZERO						0x00004000
+#define   NV50TCL_BLEND_FUNC_DST_ALPHA_ONE						0x00004001
+#define   NV50TCL_BLEND_FUNC_DST_ALPHA_SRC_COLOR					0x00004300
+#define   NV50TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_SRC_COLOR				0x00004301
+#define   NV50TCL_BLEND_FUNC_DST_ALPHA_SRC_ALPHA					0x00004302
+#define   NV50TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_SRC_ALPHA				0x00004303
+#define   NV50TCL_BLEND_FUNC_DST_ALPHA_DST_ALPHA					0x00004304
+#define   NV50TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_DST_ALPHA				0x00004305
+#define   NV50TCL_BLEND_FUNC_DST_ALPHA_DST_COLOR					0x00004306
+#define   NV50TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_DST_COLOR				0x00004307
+#define   NV50TCL_BLEND_FUNC_DST_ALPHA_SRC_ALPHA_SATURATE				0x00004308
+#define   NV50TCL_BLEND_FUNC_DST_ALPHA_CONSTANT_COLOR					0x0000c001
+#define   NV50TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_CONSTANT_COLOR				0x0000c002
+#define   NV50TCL_BLEND_FUNC_DST_ALPHA_CONSTANT_ALPHA					0x0000c003
+#define   NV50TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_CONSTANT_ALPHA				0x0000c004
+#define   NV50TCL_BLEND_FUNC_DST_ALPHA_SRC1_COLOR					0x0000c900
+#define   NV50TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_SRC1_COLOR				0x0000c901
+#define   NV50TCL_BLEND_FUNC_DST_ALPHA_SRC1_ALPHA					0x0000c902
+#define   NV50TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_SRC1_ALPHA				0x0000c903
 #define  NV50TCL_BLEND_ENABLE(x)							(0x00001360+((x)*4))
 #define  NV50TCL_BLEND_ENABLE__SIZE							0x00000008
 #define  NV50TCL_STENCIL_FRONT_ENABLE							0x00001380
@@ -8239,6 +8310,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define  NV50TCL_FP_START_ID								0x00001414
 #define  NV50TCL_GP_VERTEX_OUTPUT_COUNT							0x00001420
 #define  NV50TCL_VB_ELEMENT_BASE							0x00001434
+#define  NV50TCL_INSTANCE_BASE								0x00001438
 #define  NV50TCL_CODE_CB_FLUSH								0x00001440
 #define  NV50TCL_BIND_TSC(x)								(0x00001444+((x)*8))
 #define  NV50TCL_BIND_TSC__SIZE								0x00000003
@@ -8256,6 +8328,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define   NV50TCL_BIND_TIC_TIC_MASK							0x7ffffe00
 #define  NV50TCL_STRMOUT_MAP(x)								(0x00001480+((x)*4))
 #define  NV50TCL_STRMOUT_MAP__SIZE							0x00000020
+#define  NV50TCL_CLIPID_HEIGHT								0x00001504
 #define  NV50TCL_VP_CLIP_DISTANCE_ENABLE						0x00001510
 #define   NV50TCL_VP_CLIP_DISTANCE_ENABLE_0						(1 <<  0)
 #define   NV50TCL_VP_CLIP_DISTANCE_ENABLE_1						(1 <<  1)
@@ -8340,7 +8413,14 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define  NV50TCL_GP_BUILTIN_RESULT_EN							0x000015cc
 #define   NV50TCL_GP_BUILTIN_RESULT_EN_VPORT_IDX					(1 <<  0)
 #define   NV50TCL_GP_BUILTIN_RESULT_EN_LAYER_IDX					(1 << 16)
-#define  NV50TCL_MULTISAMPLE_SAMPLES_LOG2						0x000015d0
+#define  NV50TCL_MULTISAMPLE_MODE							0x000015d0
+#define   NV50TCL_MULTISAMPLE_MODE_1X							0x00000000
+#define   NV50TCL_MULTISAMPLE_MODE_2XMS							0x00000001
+#define   NV50TCL_MULTISAMPLE_MODE_4XMS							0x00000002
+#define   NV50TCL_MULTISAMPLE_MODE_8XMS							0x00000004
+#define   NV50TCL_MULTISAMPLE_MODE_4XMS_4XCS						0x00000008
+#define   NV50TCL_MULTISAMPLE_MODE_4XMS_12XCS						0x00000009
+#define   NV50TCL_MULTISAMPLE_MODE_8XMS_8XCS						0x0000000a
 #define  NV50TCL_VERTEX_BEGIN								0x000015dc
 #define   NV50TCL_VERTEX_BEGIN_POINTS							0x00000000
 #define   NV50TCL_VERTEX_BEGIN_LINES							0x00000001
@@ -8356,6 +8436,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define   NV50TCL_VERTEX_BEGIN_LINE_STRIP_ADJACENCY					0x0000000b
 #define   NV50TCL_VERTEX_BEGIN_TRIANGLES_ADJACENCY					0x0000000c
 #define   NV50TCL_VERTEX_BEGIN_TRIANGLE_STRIP_ADJACENCY					0x0000000d
+#define   NV50TCL_VERTEX_BEGIN_PATCHES							0x0000000e
 #define  NV50TCL_VERTEX_END								0x000015e0
 #define  NV50TCL_EDGEFLAG_ENABLE							0x000015e4
 #define  NV50TCL_VB_ELEMENT_U32								0x000015e8
@@ -8369,6 +8450,8 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define   NV50TCL_VB_ELEMENT_U16_I0_MASK						0x0000ffff
 #define   NV50TCL_VB_ELEMENT_U16_I1_SHIFT						16
 #define   NV50TCL_VB_ELEMENT_U16_I1_MASK						0xffff0000
+#define  NV50TCL_VERTEX_BASE_HIGH							0x000015f4
+#define  NV50TCL_VERTEX_BASE_LOW							0x000015f8
 #define  NV50TCL_VERTEX_DATA								0x00001640
 #define  NV50TCL_PRIM_RESTART_ENABLE							0x00001644
 #define  NV50TCL_PRIM_RESTART_INDEX							0x00001648
@@ -8754,7 +8837,14 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define  NV50TCL_VIEWPORT_TRANSFORM_EN							0x0000192c
 #define  NV50TCL_VIEW_VOLUME_CLIP_CTRL							0x0000193c
 #define  NV50TCL_VIEWPORT_CLIP_RECTS_EN							0x0000194c
+#define  NV50TCL_VIEWPORT_CLIP_MODE							0x00001950
+#define   NV50TCL_VIEWPORT_CLIP_MODE_INCLUDE						0x00000000
+#define   NV50TCL_VIEWPORT_CLIP_MODE_EXCLUDE						0x00000001
+#define   NV50TCL_VIEWPORT_CLIP_MODE_UNKNOWN						0x00000002
 #define  NV50TCL_FP_CTRL_UNK196C							0x0000196c
+#define  NV50TCL_CLIPID_ENABLE								0x0000197c
+#define  NV50TCL_CLIPID_WIDTH								0x00001980
+#define  NV50TCL_CLIPID_ID								0x00001984
 #define  NV50TCL_FP_INTERPOLANT_CTRL							0x00001988
 #define   NV50TCL_FP_INTERPOLANT_CTRL_UMASK_SHIFT					24
 #define   NV50TCL_FP_INTERPOLANT_CTRL_UMASK_MASK					0xff000000
@@ -8855,19 +8945,20 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define    NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_8_8					0x00c00000
 #define    NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_16					0x00d80000
 #define    NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_8						0x00e80000
+#define    NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_2_10_10_10				0x01800000
 #define   NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_SHIFT					25
-#define   NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_MASK						0x7e000000
-#define    NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_FLOAT					0x7e000000
-#define    NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_UNORM					0x24000000
-#define    NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_SNORM					0x12000000
-#define    NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_USCALED					0x5a000000
-#define    NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_SSCALED					0x6c000000
-#define    NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_UINT					0x48000000
-#define    NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_SINT					0x36000000
+#define   NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_MASK						0x0e000000
+#define    NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_FLOAT					0x0e000000
+#define    NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_UNORM					0x02000000
+#define    NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_SNORM					0x04000000
+#define    NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_USCALED					0x0a000000
+#define    NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_SSCALED					0x0c000000
+#define    NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_UINT					0x08000000
+#define    NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_SINT					0x06000000
 #define   NV50TCL_VERTEX_ARRAY_ATTRIB_BGRA						(1 << 31)
 #define  NV50TCL_QUERY_ADDRESS_HIGH							0x00001b00
 #define  NV50TCL_QUERY_ADDRESS_LOW							0x00001b04
-#define  NV50TCL_QUERY_COUNTER								0x00001b08
+#define  NV50TCL_QUERY_SEQUENCE								0x00001b08
 #define  NV50TCL_QUERY_GET								0x00001b0c
 
 
@@ -9022,4 +9113,938 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define  NV50_COMPUTE_USER_PARAM__SIZE							0x00000040
 
 
+#define NVC0TCL										0x00009097
+
+#define  NVC0TCL_SEMAPHORE_ADDRESS_HIGH							0x00000010
+#define  NVC0TCL_SEMAPHORE_ADDRESS_LOW							0x00000014
+#define  NVC0TCL_NOP									0x00000100
+#define  NVC0TCL_NOTIFY_ADDRESS_HIGH							0x00000104
+#define  NVC0TCL_NOTIFY_ADDRESS_LOW							0x00000108
+#define  NVC0TCL_NOTIFY									0x0000010c
+#define  NVC0TCL_SERIALIZE								0x00000110
+#define  NVC0TCL_EARLY_FRAGMENT_TESTS							0x00000210
+#define  NVC0TCL_TESS_MODE								0x00000320
+#define   NVC0TCL_TESS_MODE_PRIM_SHIFT							0
+#define   NVC0TCL_TESS_MODE_PRIM_MASK							0x0000000f
+#define    NVC0TCL_TESS_MODE_PRIM_ISOLINES						0x00000000
+#define    NVC0TCL_TESS_MODE_PRIM_TRIANGLES						0x00000001
+#define    NVC0TCL_TESS_MODE_PRIM_QUADS							0x00000002
+#define   NVC0TCL_TESS_MODE_SPACING_SHIFT						4
+#define   NVC0TCL_TESS_MODE_SPACING_MASK						0x000000f0
+#define    NVC0TCL_TESS_MODE_SPACING_EQUAL						0x00000000
+#define    NVC0TCL_TESS_MODE_SPACING_FRACTIONAL_ODD					0x00000010
+#define    NVC0TCL_TESS_MODE_SPACING_FRACTIONAL_EVEN					0x00000020
+#define   NVC0TCL_TESS_MODE_CW								(1 <<  8)
+#define   NVC0TCL_TESS_MODE_CONNECTED							(1 <<  9)
+#define  NVC0TCL_TESS_LEVEL_OUTER(x)							(0x00000324+((x)*4))
+#define  NVC0TCL_TESS_LEVEL_OUTER__SIZE							0x00000004
+#define  NVC0TCL_TESS_LEVEL_INNER(x)							(0x00000334+((x)*4))
+#define  NVC0TCL_TESS_LEVEL_INNER__SIZE							0x00000002
+#define  NVC0TCL_RASTERIZE_ENABLE							0x0000037c
+#define  NVC0TCL_TFB_BUFFER_ENABLE(x)							(0x00000380+((x)*32))
+#define  NVC0TCL_TFB_BUFFER_ENABLE__SIZE						0x00000004
+#define  NVC0TCL_TFB_ADDRESS_HIGH(x)							(0x00000384+((x)*32))
+#define  NVC0TCL_TFB_ADDRESS_HIGH__SIZE							0x00000004
+#define  NVC0TCL_TFB_ADDRESS_LOW(x)							(0x00000388+((x)*32))
+#define  NVC0TCL_TFB_ADDRESS_LOW__SIZE							0x00000004
+#define  NVC0TCL_TFB_BUFFER_SIZE(x)							(0x0000038c+((x)*32))
+#define  NVC0TCL_TFB_BUFFER_SIZE__SIZE							0x00000004
+#define  NVC0TCL_TFB_PRIMITIVE_ID(x)							(0x00000390+((x)*32))
+#define  NVC0TCL_TFB_PRIMITIVE_ID__SIZE							0x00000004
+#define  NVC0TCL_TFB_UNK0700(x)								(0x00000700+((x)*16))
+#define  NVC0TCL_TFB_UNK0700__SIZE							0x00000004
+#define  NVC0TCL_TFB_VARYING_COUNT(x)							(0x00000704+((x)*16))
+#define  NVC0TCL_TFB_VARYING_COUNT__SIZE						0x00000004
+#define  NVC0TCL_TFB_BUFFER_STRIDE(x)							(0x00000708+((x)*16))
+#define  NVC0TCL_TFB_BUFFER_STRIDE__SIZE						0x00000004
+#define  NVC0TCL_TFB_ENABLE								0x00000744
+#define  NVC0TCL_LOCAL_BASE								0x0000077c
+#define  NVC0TCL_UNK0790_ADDRESS_HIGH							0x00000790
+#define  NVC0TCL_UNK0790_ADDRESS_LOW							0x00000794
+#define  NVC0TCL_RT_ADDRESS_HIGH(x)							(0x00000800+((x)*32))
+#define  NVC0TCL_RT_ADDRESS_HIGH__SIZE							0x00000008
+#define  NVC0TCL_RT_ADDRESS_LOW(x)							(0x00000804+((x)*32))
+#define  NVC0TCL_RT_ADDRESS_LOW__SIZE							0x00000008
+#define  NVC0TCL_RT_HORIZ(x)								(0x00000808+((x)*32))
+#define  NVC0TCL_RT_HORIZ__SIZE								0x00000008
+#define  NVC0TCL_RT_VERT(x)								(0x0000080c+((x)*32))
+#define  NVC0TCL_RT_VERT__SIZE								0x00000008
+#define  NVC0TCL_RT_FORMAT(x)								(0x00000810+((x)*32))
+#define  NVC0TCL_RT_FORMAT__SIZE							0x00000008
+#define   NVC0TCL_RT_FORMAT_R32G32B32A32_FLOAT						0x000000c0
+#define   NVC0TCL_RT_FORMAT_R32G32B32A32_SINT						0x000000c1
+#define   NVC0TCL_RT_FORMAT_R32G32B32A32_UINT						0x000000c2
+#define   NVC0TCL_RT_FORMAT_R32G32B32X32_FLOAT						0x000000c3
+#define   NVC0TCL_RT_FORMAT_R16G16B16A16_UNORM						0x000000c6
+#define   NVC0TCL_RT_FORMAT_R16G16B16A16_SNORM						0x000000c7
+#define   NVC0TCL_RT_FORMAT_R16G16B16A16_SINT						0x000000c8
+#define   NVC0TCL_RT_FORMAT_R16G16B16A16_UINT						0x000000c9
+#define   NVC0TCL_RT_FORMAT_R16G16B16A16_FLOAT						0x000000ca
+#define   NVC0TCL_RT_FORMAT_R32G32_FLOAT						0x000000cb
+#define   NVC0TCL_RT_FORMAT_R32G32_SINT							0x000000cc
+#define   NVC0TCL_RT_FORMAT_R32G32_UINT							0x000000cd
+#define   NVC0TCL_RT_FORMAT_R16G16B16X16_FLOAT						0x000000ce
+#define   NVC0TCL_RT_FORMAT_A8R8G8B8_UNORM						0x000000cf
+#define   NVC0TCL_RT_FORMAT_A8R8G8B8_SRGB						0x000000d0
+#define   NVC0TCL_RT_FORMAT_A2B10G10R10_UNORM						0x000000d1
+#define   NVC0TCL_RT_FORMAT_A2B10G10R10_UINT						0x000000d2
+#define   NVC0TCL_RT_FORMAT_A8B8G8R8_UNORM						0x000000d5
+#define   NVC0TCL_RT_FORMAT_A8B8G8R8_SRGB						0x000000d6
+#define   NVC0TCL_RT_FORMAT_A8B8G8R8_SNORM						0x000000d7
+#define   NVC0TCL_RT_FORMAT_A8B8G8R8_SINT						0x000000d8
+#define   NVC0TCL_RT_FORMAT_A8B8G8R8_UINT						0x000000d9
+#define   NVC0TCL_RT_FORMAT_R16G16_UNORM						0x000000da
+#define   NVC0TCL_RT_FORMAT_R16G16_SNORM						0x000000db
+#define   NVC0TCL_RT_FORMAT_R16G16_SINT							0x000000dc
+#define   NVC0TCL_RT_FORMAT_R16G16_UINT							0x000000dd
+#define   NVC0TCL_RT_FORMAT_R16G16_FLOAT						0x000000de
+#define   NVC0TCL_RT_FORMAT_A2R10G10B10_UNORM						0x000000df
+#define   NVC0TCL_RT_FORMAT_B10G11R11_FLOAT						0x000000e0
+#define   NVC0TCL_RT_FORMAT_R32_FLOAT							0x000000e5
+#define   NVC0TCL_RT_FORMAT_X8R8G8B8_UNORM						0x000000e6
+#define   NVC0TCL_RT_FORMAT_X8R8G8B8_SRGB						0x000000e7
+#define   NVC0TCL_RT_FORMAT_R5G6B5_UNORM						0x000000e8
+#define   NVC0TCL_RT_FORMAT_A1R5G5B5_UNORM						0x000000e9
+#define   NVC0TCL_RT_FORMAT_R8G8_UNORM							0x000000ea
+#define   NVC0TCL_RT_FORMAT_R8G8_SNORM							0x000000eb
+#define   NVC0TCL_RT_FORMAT_R8G8_SINT							0x000000ec
+#define   NVC0TCL_RT_FORMAT_R8G8_UINT							0x000000ed
+#define   NVC0TCL_RT_FORMAT_R16_UNORM							0x000000ee
+#define   NVC0TCL_RT_FORMAT_R16_SNORM							0x000000ef
+#define   NVC0TCL_RT_FORMAT_R16_SINT							0x000000f0
+#define   NVC0TCL_RT_FORMAT_R16_UINT							0x000000f1
+#define   NVC0TCL_RT_FORMAT_R16_FLOAT							0x000000f2
+#define   NVC0TCL_RT_FORMAT_R8_UNORM							0x000000f3
+#define   NVC0TCL_RT_FORMAT_R8_SNORM							0x000000f4
+#define   NVC0TCL_RT_FORMAT_R8_SINT							0x000000f5
+#define   NVC0TCL_RT_FORMAT_R8_UINT							0x000000f6
+#define   NVC0TCL_RT_FORMAT_A8_UNORM							0x000000f7
+#define   NVC0TCL_RT_FORMAT_X1R5G5B5_UNORM						0x000000f8
+#define   NVC0TCL_RT_FORMAT_X8B8G8R8_UNORM						0x000000f9
+#define   NVC0TCL_RT_FORMAT_X8B8G8R8_SRGB						0x000000fa
+#define  NVC0TCL_RT_TILE_MODE(x)							(0x00000814+((x)*32))
+#define  NVC0TCL_RT_TILE_MODE__SIZE							0x00000008
+#define  NVC0TCL_RT_ARRAY_MODE(x)							(0x00000818+((x)*32))
+#define  NVC0TCL_RT_ARRAY_MODE__SIZE							0x00000008
+#define   NVC0TCL_RT_ARRAY_MODE_LAYERS_SHIFT						0
+#define   NVC0TCL_RT_ARRAY_MODE_LAYERS_MASK						0x0000ffff
+#define   NVC0TCL_RT_ARRAY_MODE_VOLUME							(1 << 16)
+#define  NVC0TCL_RT_LAYER_STRIDE(x)							(0x0000081c+((x)*32))
+#define  NVC0TCL_RT_LAYER_STRIDE__SIZE							0x00000008
+#define  NVC0TCL_VIEWPORT_SCALE_X(x)							(0x00000a00+((x)*32))
+#define  NVC0TCL_VIEWPORT_SCALE_X__SIZE							0x00000010
+#define  NVC0TCL_VIEWPORT_SCALE_Y(x)							(0x00000a04+((x)*32))
+#define  NVC0TCL_VIEWPORT_SCALE_Y__SIZE							0x00000010
+#define  NVC0TCL_VIEWPORT_SCALE_Z(x)							(0x00000a08+((x)*32))
+#define  NVC0TCL_VIEWPORT_SCALE_Z__SIZE							0x00000010
+#define  NVC0TCL_VIEWPORT_TRANSLATE_X(x)						(0x00000a0c+((x)*32))
+#define  NVC0TCL_VIEWPORT_TRANSLATE_X__SIZE						0x00000010
+#define  NVC0TCL_VIEWPORT_TRANSLATE_Y(x)						(0x00000a10+((x)*32))
+#define  NVC0TCL_VIEWPORT_TRANSLATE_Y__SIZE						0x00000010
+#define  NVC0TCL_VIEWPORT_TRANSLATE_Z(x)						(0x00000a14+((x)*32))
+#define  NVC0TCL_VIEWPORT_TRANSLATE_Z__SIZE						0x00000010
+#define  NVC0TCL_VIEWPORT_HORIZ(x)							(0x00000c00+((x)*16))
+#define  NVC0TCL_VIEWPORT_HORIZ__SIZE							0x00000010
+#define   NVC0TCL_VIEWPORT_HORIZ_X_SHIFT						0
+#define   NVC0TCL_VIEWPORT_HORIZ_X_MASK							0x0000ffff
+#define   NVC0TCL_VIEWPORT_HORIZ_W_SHIFT						16
+#define   NVC0TCL_VIEWPORT_HORIZ_W_MASK							0xffff0000
+#define  NVC0TCL_VIEWPORT_VERT(x)							(0x00000c04+((x)*16))
+#define  NVC0TCL_VIEWPORT_VERT__SIZE							0x00000010
+#define   NVC0TCL_VIEWPORT_VERT_Y_SHIFT							0
+#define   NVC0TCL_VIEWPORT_VERT_Y_MASK							0x0000ffff
+#define   NVC0TCL_VIEWPORT_VERT_H_SHIFT							16
+#define   NVC0TCL_VIEWPORT_VERT_H_MASK							0xffff0000
+#define  NVC0TCL_DEPTH_RANGE_NEAR(x)							(0x00000c08+((x)*16))
+#define  NVC0TCL_DEPTH_RANGE_NEAR__SIZE							0x00000010
+#define  NVC0TCL_DEPTH_RANGE_FAR(x)							(0x00000c0c+((x)*16))
+#define  NVC0TCL_DEPTH_RANGE_FAR__SIZE							0x00000010
+#define  NVC0TCL_VIEWPORT_CLIP_HORIZ(x)							(0x00000d00+((x)*8))
+#define  NVC0TCL_VIEWPORT_CLIP_HORIZ__SIZE						0x00000008
+#define   NVC0TCL_VIEWPORT_CLIP_HORIZ_MIN_SHIFT						0
+#define   NVC0TCL_VIEWPORT_CLIP_HORIZ_MIN_MASK						0x0000ffff
+#define   NVC0TCL_VIEWPORT_CLIP_HORIZ_MAX_SHIFT						16
+#define   NVC0TCL_VIEWPORT_CLIP_HORIZ_MAX_MASK						0xffff0000
+#define  NVC0TCL_VIEWPORT_CLIP_VERT(x)							(0x00000d04+((x)*8))
+#define  NVC0TCL_VIEWPORT_CLIP_VERT__SIZE						0x00000008
+#define   NVC0TCL_VIEWPORT_CLIP_VERT_MIN_SHIFT						0
+#define   NVC0TCL_VIEWPORT_CLIP_VERT_MIN_MASK						0x0000ffff
+#define   NVC0TCL_VIEWPORT_CLIP_VERT_MAX_SHIFT						16
+#define   NVC0TCL_VIEWPORT_CLIP_VERT_MAX_MASK						0xffff0000
+#define  NVC0TCL_CLIPID_REGION_HORIZ(x)							(0x00000d40+((x)*8))
+#define  NVC0TCL_CLIPID_REGION_HORIZ__SIZE						0x00000004
+#define  NVC0TCL_CLIPID_REGION_VERT(x)							(0x00000d44+((x)*8))
+#define  NVC0TCL_CLIPID_REGION_VERT__SIZE						0x00000004
+#define  NVC0TCL_VERTEX_BUFFER_FIRST							0x00000d74
+#define  NVC0TCL_VERTEX_BUFFER_COUNT							0x00000d78
+#define  NVC0TCL_CLEAR_COLOR(x)								(0x00000d80+((x)*4))
+#define  NVC0TCL_CLEAR_COLOR__SIZE							0x00000004
+#define  NVC0TCL_CLEAR_DEPTH								0x00000d90
+#define  NVC0TCL_STACK_ADDRESS_HIGH							0x00000d94
+#define  NVC0TCL_STACK_ADDRESS_LOW							0x00000d98
+#define  NVC0TCL_STACK_SIZE_LOG								0x00000d9c
+#define  NVC0TCL_CLEAR_STENCIL								0x00000da0
+#define  NVC0TCL_POLYGON_SMOOTH_ENABLE							0x00000db4
+#define  NVC0TCL_POLYGON_OFFSET_POINT_ENABLE						0x00000dc0
+#define  NVC0TCL_POLYGON_OFFSET_LINE_ENABLE						0x00000dc4
+#define  NVC0TCL_POLYGON_OFFSET_FILL_ENABLE						0x00000dc8
+#define  NVC0TCL_PATCH_VERTICES								0x00000dcc
+#define  NVC0TCL_WATCHDOG_TIMER								0x00000de4
+#define  NVC0TCL_WINDOW_OFFSET_X							0x00000df8
+#define  NVC0TCL_WINDOW_OFFSET_Y							0x00000dfc
+#define  NVC0TCL_SCISSOR_ENABLE(x)							(0x00000e00+((x)*16))
+#define  NVC0TCL_SCISSOR_ENABLE__SIZE							0x00000010
+#define  NVC0TCL_SCISSOR_HORIZ(x)							(0x00000e04+((x)*16))
+#define  NVC0TCL_SCISSOR_HORIZ__SIZE							0x00000010
+#define   NVC0TCL_SCISSOR_HORIZ_MIN_SHIFT						0
+#define   NVC0TCL_SCISSOR_HORIZ_MIN_MASK						0x0000ffff
+#define   NVC0TCL_SCISSOR_HORIZ_MAX_SHIFT						16
+#define   NVC0TCL_SCISSOR_HORIZ_MAX_MASK						0xffff0000
+#define  NVC0TCL_SCISSOR_VERT(x)							(0x00000e08+((x)*16))
+#define  NVC0TCL_SCISSOR_VERT__SIZE							0x00000010
+#define   NVC0TCL_SCISSOR_VERT_MIN_SHIFT						0
+#define   NVC0TCL_SCISSOR_VERT_MIN_MASK							0x0000ffff
+#define   NVC0TCL_SCISSOR_VERT_MAX_SHIFT						16
+#define   NVC0TCL_SCISSOR_VERT_MAX_MASK							0xffff0000
+#define  NVC0TCL_LOCAL_WARPS_LOG_ALLOC							0x00000f44
+#define  NVC0TCL_LOCAL_WARPS_NO_CLAMP							0x00000f48
+#define  NVC0TCL_STACK_WARPS_LOG_ALLOC							0x00000f4c
+#define  NVC0TCL_STACK_WARPS_NO_CLAMP							0x00000f50
+#define  NVC0TCL_STENCIL_BACK_FUNC_REF							0x00000f54
+#define  NVC0TCL_STENCIL_BACK_MASK							0x00000f58
+#define  NVC0TCL_STENCIL_BACK_FUNC_MASK							0x00000f5c
+#define  NVC0TCL_VERTEX_RUNOUT_HIGH							0x00000f84
+#define  NVC0TCL_VERTEX_RUNOUT_LOW							0x00000f88
+#define  NVC0TCL_DEPTH_BOUNDS(x)							(0x00000f9c+((x)*4))
+#define  NVC0TCL_DEPTH_BOUNDS__SIZE							0x00000002
+#define  NVC0TCL_MSAA_MASK(x)								(0x00000fbc+((x)*4))
+#define  NVC0TCL_MSAA_MASK__SIZE							0x00000004
+#define  NVC0TCL_CLIPID_ADDRESS_HIGH							0x00000fcc
+#define  NVC0TCL_CLIPID_ADDRESS_LOW							0x00000fd0
+#define  NVC0TCL_ZETA_ADDRESS_HIGH							0x00000fe0
+#define  NVC0TCL_ZETA_ADDRESS_LOW							0x00000fe4
+#define  NVC0TCL_ZETA_FORMAT								0x00000fe8
+#define   NVC0TCL_ZETA_FORMAT_Z32_FLOAT							0x0000000a
+#define   NVC0TCL_ZETA_FORMAT_Z16_UNORM							0x00000013
+#define   NVC0TCL_ZETA_FORMAT_Z24S8_UNORM						0x00000014
+#define   NVC0TCL_ZETA_FORMAT_X8Z24_UNORM						0x00000015
+#define   NVC0TCL_ZETA_FORMAT_S8Z24_UNORM						0x00000016
+#define   NVC0TCL_ZETA_FORMAT_Z32_FLOAT_X24S8_UNORM					0x00000019
+#define  NVC0TCL_ZETA_TILE_MODE								0x00000fec
+#define  NVC0TCL_ZETA_LAYER_STRIDE							0x00000ff0
+#define  NVC0TCL_SCREEN_SCISSOR_HORIZ							0x00000ff4
+#define   NVC0TCL_SCREEN_SCISSOR_HORIZ_W_SHIFT						16
+#define   NVC0TCL_SCREEN_SCISSOR_HORIZ_W_MASK						0xffff0000
+#define   NVC0TCL_SCREEN_SCISSOR_HORIZ_X_SHIFT						0
+#define   NVC0TCL_SCREEN_SCISSOR_HORIZ_X_MASK						0x0000ffff
+#define  NVC0TCL_SCREEN_SCISSOR_VERT							0x00000ff8
+#define   NVC0TCL_SCREEN_SCISSOR_VERT_H_SHIFT						16
+#define   NVC0TCL_SCREEN_SCISSOR_VERT_H_MASK						0xffff0000
+#define   NVC0TCL_SCREEN_SCISSOR_VERT_Y_SHIFT						0
+#define   NVC0TCL_SCREEN_SCISSOR_VERT_Y_MASK						0x0000ffff
+#define  NVC0TCL_VTX_ATTR_DEFINE							0x0000114c
+#define   NVC0TCL_VTX_ATTR_DEFINE_ATTR_SHIFT						0
+#define   NVC0TCL_VTX_ATTR_DEFINE_ATTR_MASK						0x0000003f
+#define   NVC0TCL_VTX_ATTR_DEFINE_COMP_SHIFT						8
+#define   NVC0TCL_VTX_ATTR_DEFINE_COMP_MASK						0x00000f00
+#define   NVC0TCL_VTX_ATTR_DEFINE_SIZE_SHIFT						12
+#define   NVC0TCL_VTX_ATTR_DEFINE_SIZE_MASK						0x0000f000
+#define   NVC0TCL_VTX_ATTR_DEFINE_TYPE_SHIFT						16
+#define   NVC0TCL_VTX_ATTR_DEFINE_TYPE_MASK						0x000f0000
+#define    NVC0TCL_VTX_ATTR_DEFINE_TYPE_FLOAT						0x00070000
+#define    NVC0TCL_VTX_ATTR_DEFINE_TYPE_UNORM						0x00010000
+#define    NVC0TCL_VTX_ATTR_DEFINE_TYPE_SNORM						0x00020000
+#define    NVC0TCL_VTX_ATTR_DEFINE_TYPE_USCALED						0x00050000
+#define    NVC0TCL_VTX_ATTR_DEFINE_TYPE_SSCALED						0x00060000
+#define    NVC0TCL_VTX_ATTR_DEFINE_TYPE_UINT						0x00040000
+#define    NVC0TCL_VTX_ATTR_DEFINE_TYPE_SINT						0x00030000
+#define  NVC0TCL_VTX_ATTR_DATA(x)							(0x00001150+((x)*4))
+#define  NVC0TCL_VTX_ATTR_DATA__SIZE							0x00000004
+#define  NVC0TCL_VERTEX_ATTRIB_FORMAT(x)						(0x00001160+((x)*4))
+#define  NVC0TCL_VERTEX_ATTRIB_FORMAT__SIZE						0x00000020
+#define   NVC0TCL_VERTEX_ATTRIB_FORMAT_BUFFER_SHIFT					0
+#define   NVC0TCL_VERTEX_ATTRIB_FORMAT_BUFFER_MASK					0x0000003f
+#define   NVC0TCL_VERTEX_ATTRIB_FORMAT_CONST						(1 <<  6)
+#define   NVC0TCL_VERTEX_ATTRIB_FORMAT_OFFSET_SHIFT					7
+#define   NVC0TCL_VERTEX_ATTRIB_FORMAT_OFFSET_MASK					0x001fff80
+#define   NVC0TCL_VERTEX_ATTRIB_FORMAT_FORMAT_SHIFT					21
+#define   NVC0TCL_VERTEX_ATTRIB_FORMAT_FORMAT_MASK					0x07e00000
+#define    NVC0TCL_VERTEX_ATTRIB_FORMAT_FORMAT_32_32_32_32				0x00200000
+#define    NVC0TCL_VERTEX_ATTRIB_FORMAT_FORMAT_32_32_32					0x00400000
+#define    NVC0TCL_VERTEX_ATTRIB_FORMAT_FORMAT_16_16_16_16				0x00600000
+#define    NVC0TCL_VERTEX_ATTRIB_FORMAT_FORMAT_32_32					0x00800000
+#define    NVC0TCL_VERTEX_ATTRIB_FORMAT_FORMAT_16_16_16					0x00a00000
+#define    NVC0TCL_VERTEX_ATTRIB_FORMAT_FORMAT_8_8_8_8					0x01400000
+#define    NVC0TCL_VERTEX_ATTRIB_FORMAT_FORMAT_16_16					0x01e00000
+#define    NVC0TCL_VERTEX_ATTRIB_FORMAT_FORMAT_32					0x02400000
+#define    NVC0TCL_VERTEX_ATTRIB_FORMAT_FORMAT_8_8_8					0x02600000
+#define    NVC0TCL_VERTEX_ATTRIB_FORMAT_FORMAT_8_8					0x03000000
+#define    NVC0TCL_VERTEX_ATTRIB_FORMAT_FORMAT_16					0x03600000
+#define    NVC0TCL_VERTEX_ATTRIB_FORMAT_FORMAT_8					0x03a00000
+#define    NVC0TCL_VERTEX_ATTRIB_FORMAT_FORMAT_2_10_10_10				0x06000000
+#define   NVC0TCL_VERTEX_ATTRIB_FORMAT_TYPE_SHIFT					27
+#define   NVC0TCL_VERTEX_ATTRIB_FORMAT_TYPE_MASK					0x78000000
+#define    NVC0TCL_VERTEX_ATTRIB_FORMAT_TYPE_FLOAT					0x38000000
+#define    NVC0TCL_VERTEX_ATTRIB_FORMAT_TYPE_UNORM					0x08000000
+#define    NVC0TCL_VERTEX_ATTRIB_FORMAT_TYPE_SNORM					0x10000000
+#define    NVC0TCL_VERTEX_ATTRIB_FORMAT_TYPE_USCALED					0x28000000
+#define    NVC0TCL_VERTEX_ATTRIB_FORMAT_TYPE_SSCALED					0x30000000
+#define    NVC0TCL_VERTEX_ATTRIB_FORMAT_TYPE_UINT					0x20000000
+#define    NVC0TCL_VERTEX_ATTRIB_FORMAT_TYPE_SINT					0x18000000
+#define  NVC0TCL_RT_CONTROL								0x0000121c
+#define   NVC0TCL_RT_CONTROL_COUNT_SHIFT						0
+#define   NVC0TCL_RT_CONTROL_COUNT_MASK							0x0000000f
+#define   NVC0TCL_RT_CONTROL_MAP0_SHIFT							4
+#define   NVC0TCL_RT_CONTROL_MAP0_MASK							0x00000070
+#define   NVC0TCL_RT_CONTROL_MAP1_SHIFT							7
+#define   NVC0TCL_RT_CONTROL_MAP1_MASK							0x00000380
+#define   NVC0TCL_RT_CONTROL_MAP2_SHIFT							10
+#define   NVC0TCL_RT_CONTROL_MAP2_MASK							0x00001c00
+#define   NVC0TCL_RT_CONTROL_MAP3_SHIFT							13
+#define   NVC0TCL_RT_CONTROL_MAP3_MASK							0x0000e000
+#define   NVC0TCL_RT_CONTROL_MAP4_SHIFT							16
+#define   NVC0TCL_RT_CONTROL_MAP4_MASK							0x00070000
+#define   NVC0TCL_RT_CONTROL_MAP5_SHIFT							19
+#define   NVC0TCL_RT_CONTROL_MAP5_MASK							0x00380000
+#define   NVC0TCL_RT_CONTROL_MAP6_SHIFT							22
+#define   NVC0TCL_RT_CONTROL_MAP6_MASK							0x01c00000
+#define   NVC0TCL_RT_CONTROL_MAP7_SHIFT							25
+#define   NVC0TCL_RT_CONTROL_MAP7_MASK							0x0e000000
+#define  NVC0TCL_ZETA_HORIZ								0x00001228
+#define  NVC0TCL_ZETA_VERT								0x0000122c
+#define  NVC0TCL_ZETA_ARRAY_MODE							0x00001230
+#define   NVC0TCL_ZETA_ARRAY_MODE_LAYERS_SHIFT						0
+#define   NVC0TCL_ZETA_ARRAY_MODE_LAYERS_MASK						0x0000ffff
+#define   NVC0TCL_ZETA_ARRAY_MODE_UNK							(1 << 16)
+#define  NVC0TCL_LINKED_TSC								0x00001234
+#define  NVC0TCL_FP_RESULT_COUNT							0x00001298
+#define  NVC0TCL_DEPTH_TEST_ENABLE							0x000012cc
+#define  NVC0TCL_SHADE_MODEL								0x000012d4
+#define   NVC0TCL_SHADE_MODEL_FLAT							0x00001d00
+#define   NVC0TCL_SHADE_MODEL_SMOOTH							0x00001d01
+#define  NVC0TCL_BLEND_INDEPENDENT							0x000012e4
+#define  NVC0TCL_DEPTH_WRITE_ENABLE							0x000012e8
+#define  NVC0TCL_ALPHA_TEST_ENABLE							0x000012ec
+#define  NVC0TCL_PM_SET(x)								(0x000012f0+((x)*4))
+#define  NVC0TCL_PM_SET__SIZE								0x00000004
+#define  NVC0TCL_VB_ELEMENT_U8_SETUP							0x00001300
+#define   NVC0TCL_VB_ELEMENT_U8_SETUP_OFFSET_SHIFT					30
+#define   NVC0TCL_VB_ELEMENT_U8_SETUP_OFFSET_MASK					0xc0000000
+#define   NVC0TCL_VB_ELEMENT_U8_SETUP_COUNT_SHIFT					0
+#define   NVC0TCL_VB_ELEMENT_U8_SETUP_COUNT_MASK					0x3fffffff
+#define  NVC0TCL_VB_ELEMENT_U8								0x00001304
+#define   NVC0TCL_VB_ELEMENT_U8_I0_SHIFT						0
+#define   NVC0TCL_VB_ELEMENT_U8_I0_MASK							0x000000ff
+#define   NVC0TCL_VB_ELEMENT_U8_I1_SHIFT						8
+#define   NVC0TCL_VB_ELEMENT_U8_I1_MASK							0x0000ff00
+#define   NVC0TCL_VB_ELEMENT_U8_I2_SHIFT						16
+#define   NVC0TCL_VB_ELEMENT_U8_I2_MASK							0x00ff0000
+#define   NVC0TCL_VB_ELEMENT_U8_I3_SHIFT						24
+#define   NVC0TCL_VB_ELEMENT_U8_I3_MASK							0xff000000
+#define  NVC0TCL_DEPTH_TEST_FUNC							0x0000130c
+#define   NVC0TCL_DEPTH_TEST_FUNC_NEVER							0x00000200
+#define   NVC0TCL_DEPTH_TEST_FUNC_LESS							0x00000201
+#define   NVC0TCL_DEPTH_TEST_FUNC_EQUAL							0x00000202
+#define   NVC0TCL_DEPTH_TEST_FUNC_LEQUAL						0x00000203
+#define   NVC0TCL_DEPTH_TEST_FUNC_GREATER						0x00000204
+#define   NVC0TCL_DEPTH_TEST_FUNC_NOTEQUAL						0x00000205
+#define   NVC0TCL_DEPTH_TEST_FUNC_GEQUAL						0x00000206
+#define   NVC0TCL_DEPTH_TEST_FUNC_ALWAYS						0x00000207
+#define  NVC0TCL_ALPHA_TEST_REF								0x00001310
+#define  NVC0TCL_ALPHA_TEST_FUNC							0x00001314
+#define   NVC0TCL_ALPHA_TEST_FUNC_NEVER							0x00000200
+#define   NVC0TCL_ALPHA_TEST_FUNC_LESS							0x00000201
+#define   NVC0TCL_ALPHA_TEST_FUNC_EQUAL							0x00000202
+#define   NVC0TCL_ALPHA_TEST_FUNC_LEQUAL						0x00000203
+#define   NVC0TCL_ALPHA_TEST_FUNC_GREATER						0x00000204
+#define   NVC0TCL_ALPHA_TEST_FUNC_NOTEQUAL						0x00000205
+#define   NVC0TCL_ALPHA_TEST_FUNC_GEQUAL						0x00000206
+#define   NVC0TCL_ALPHA_TEST_FUNC_ALWAYS						0x00000207
+#define  NVC0TCL_BLEND_COLOR(x)								(0x0000131c+((x)*4))
+#define  NVC0TCL_BLEND_COLOR__SIZE							0x00000004
+#define  NVC0TCL_TIC_FLUSH								0x00001330
+#define  NVC0TCL_TSC_FLUSH								0x00001334
+#define  NVC0TCL_TEX_CACHE_CTL								0x00001338
+#define  NVC0TCL_BLEND_EQUATION_RGB							0x00001340
+#define   NVC0TCL_BLEND_EQUATION_RGB_FUNC_ADD						0x00008006
+#define   NVC0TCL_BLEND_EQUATION_RGB_MIN						0x00008007
+#define   NVC0TCL_BLEND_EQUATION_RGB_MAX						0x00008008
+#define   NVC0TCL_BLEND_EQUATION_RGB_FUNC_SUBTRACT					0x0000800a
+#define   NVC0TCL_BLEND_EQUATION_RGB_FUNC_REVERSE_SUBTRACT				0x0000800b
+#define  NVC0TCL_BLEND_FUNC_SRC_RGB							0x00001344
+#define   NVC0TCL_BLEND_FUNC_SRC_RGB_ZERO						0x00004000
+#define   NVC0TCL_BLEND_FUNC_SRC_RGB_ONE						0x00004001
+#define   NVC0TCL_BLEND_FUNC_SRC_RGB_SRC_COLOR						0x00004300
+#define   NVC0TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_SRC_COLOR				0x00004301
+#define   NVC0TCL_BLEND_FUNC_SRC_RGB_SRC_ALPHA						0x00004302
+#define   NVC0TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_SRC_ALPHA				0x00004303
+#define   NVC0TCL_BLEND_FUNC_SRC_RGB_DST_ALPHA						0x00004304
+#define   NVC0TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_DST_ALPHA				0x00004305
+#define   NVC0TCL_BLEND_FUNC_SRC_RGB_DST_COLOR						0x00004306
+#define   NVC0TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_DST_COLOR				0x00004307
+#define   NVC0TCL_BLEND_FUNC_SRC_RGB_SRC_ALPHA_SATURATE					0x00004308
+#define   NVC0TCL_BLEND_FUNC_SRC_RGB_CONSTANT_COLOR					0x0000c001
+#define   NVC0TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_CONSTANT_COLOR				0x0000c002
+#define   NVC0TCL_BLEND_FUNC_SRC_RGB_CONSTANT_ALPHA					0x0000c003
+#define   NVC0TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_CONSTANT_ALPHA				0x0000c004
+#define   NVC0TCL_BLEND_FUNC_SRC_RGB_SRC1_COLOR						0x0000c900
+#define   NVC0TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_SRC1_COLOR				0x0000c901
+#define   NVC0TCL_BLEND_FUNC_SRC_RGB_SRC1_ALPHA						0x0000c902
+#define   NVC0TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_SRC1_ALPHA				0x0000c903
+#define  NVC0TCL_BLEND_FUNC_DST_RGB							0x00001348
+#define   NVC0TCL_BLEND_FUNC_DST_RGB_ZERO						0x00004000
+#define   NVC0TCL_BLEND_FUNC_DST_RGB_ONE						0x00004001
+#define   NVC0TCL_BLEND_FUNC_DST_RGB_SRC_COLOR						0x00004300
+#define   NVC0TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_SRC_COLOR				0x00004301
+#define   NVC0TCL_BLEND_FUNC_DST_RGB_SRC_ALPHA						0x00004302
+#define   NVC0TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_SRC_ALPHA				0x00004303
+#define   NVC0TCL_BLEND_FUNC_DST_RGB_DST_ALPHA						0x00004304
+#define   NVC0TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_DST_ALPHA				0x00004305
+#define   NVC0TCL_BLEND_FUNC_DST_RGB_DST_COLOR						0x00004306
+#define   NVC0TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_DST_COLOR				0x00004307
+#define   NVC0TCL_BLEND_FUNC_DST_RGB_SRC_ALPHA_SATURATE					0x00004308
+#define   NVC0TCL_BLEND_FUNC_DST_RGB_CONSTANT_COLOR					0x0000c001
+#define   NVC0TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_CONSTANT_COLOR				0x0000c002
+#define   NVC0TCL_BLEND_FUNC_DST_RGB_CONSTANT_ALPHA					0x0000c003
+#define   NVC0TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_CONSTANT_ALPHA				0x0000c004
+#define   NVC0TCL_BLEND_FUNC_DST_RGB_SRC1_COLOR						0x0000c900
+#define   NVC0TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_SRC1_COLOR				0x0000c901
+#define   NVC0TCL_BLEND_FUNC_DST_RGB_SRC1_ALPHA						0x0000c902
+#define   NVC0TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_SRC1_ALPHA				0x0000c903
+#define  NVC0TCL_BLEND_EQUATION_ALPHA							0x0000134c
+#define   NVC0TCL_BLEND_EQUATION_ALPHA_FUNC_ADD						0x00008006
+#define   NVC0TCL_BLEND_EQUATION_ALPHA_MIN						0x00008007
+#define   NVC0TCL_BLEND_EQUATION_ALPHA_MAX						0x00008008
+#define   NVC0TCL_BLEND_EQUATION_ALPHA_FUNC_SUBTRACT					0x0000800a
+#define   NVC0TCL_BLEND_EQUATION_ALPHA_FUNC_REVERSE_SUBTRACT				0x0000800b
+#define  NVC0TCL_BLEND_FUNC_SRC_ALPHA							0x00001350
+#define   NVC0TCL_BLEND_FUNC_SRC_ALPHA_ZERO						0x00004000
+#define   NVC0TCL_BLEND_FUNC_SRC_ALPHA_ONE						0x00004001
+#define   NVC0TCL_BLEND_FUNC_SRC_ALPHA_SRC_COLOR					0x00004300
+#define   NVC0TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_SRC_COLOR				0x00004301
+#define   NVC0TCL_BLEND_FUNC_SRC_ALPHA_SRC_ALPHA					0x00004302
+#define   NVC0TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_SRC_ALPHA				0x00004303
+#define   NVC0TCL_BLEND_FUNC_SRC_ALPHA_DST_ALPHA					0x00004304
+#define   NVC0TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_DST_ALPHA				0x00004305
+#define   NVC0TCL_BLEND_FUNC_SRC_ALPHA_DST_COLOR					0x00004306
+#define   NVC0TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_DST_COLOR				0x00004307
+#define   NVC0TCL_BLEND_FUNC_SRC_ALPHA_SRC_ALPHA_SATURATE				0x00004308
+#define   NVC0TCL_BLEND_FUNC_SRC_ALPHA_CONSTANT_COLOR					0x0000c001
+#define   NVC0TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_CONSTANT_COLOR				0x0000c002
+#define   NVC0TCL_BLEND_FUNC_SRC_ALPHA_CONSTANT_ALPHA					0x0000c003
+#define   NVC0TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_CONSTANT_ALPHA				0x0000c004
+#define   NVC0TCL_BLEND_FUNC_SRC_ALPHA_SRC1_COLOR					0x0000c900
+#define   NVC0TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_SRC1_COLOR				0x0000c901
+#define   NVC0TCL_BLEND_FUNC_SRC_ALPHA_SRC1_ALPHA					0x0000c902
+#define   NVC0TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_SRC1_ALPHA				0x0000c903
+#define  NVC0TCL_BLEND_FUNC_DST_ALPHA							0x00001358
+#define   NVC0TCL_BLEND_FUNC_DST_ALPHA_ZERO						0x00004000
+#define   NVC0TCL_BLEND_FUNC_DST_ALPHA_ONE						0x00004001
+#define   NVC0TCL_BLEND_FUNC_DST_ALPHA_SRC_COLOR					0x00004300
+#define   NVC0TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_SRC_COLOR				0x00004301
+#define   NVC0TCL_BLEND_FUNC_DST_ALPHA_SRC_ALPHA					0x00004302
+#define   NVC0TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_SRC_ALPHA				0x00004303
+#define   NVC0TCL_BLEND_FUNC_DST_ALPHA_DST_ALPHA					0x00004304
+#define   NVC0TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_DST_ALPHA				0x00004305
+#define   NVC0TCL_BLEND_FUNC_DST_ALPHA_DST_COLOR					0x00004306
+#define   NVC0TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_DST_COLOR				0x00004307
+#define   NVC0TCL_BLEND_FUNC_DST_ALPHA_SRC_ALPHA_SATURATE				0x00004308
+#define   NVC0TCL_BLEND_FUNC_DST_ALPHA_CONSTANT_COLOR					0x0000c001
+#define   NVC0TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_CONSTANT_COLOR				0x0000c002
+#define   NVC0TCL_BLEND_FUNC_DST_ALPHA_CONSTANT_ALPHA					0x0000c003
+#define   NVC0TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_CONSTANT_ALPHA				0x0000c004
+#define   NVC0TCL_BLEND_FUNC_DST_ALPHA_SRC1_COLOR					0x0000c900
+#define   NVC0TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_SRC1_COLOR				0x0000c901
+#define   NVC0TCL_BLEND_FUNC_DST_ALPHA_SRC1_ALPHA					0x0000c902
+#define   NVC0TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_SRC1_ALPHA				0x0000c903
+#define  NVC0TCL_STENCIL_ENABLE								0x00001380
+#define  NVC0TCL_STENCIL_FRONT_OP_FAIL							0x00001384
+#define   NVC0TCL_STENCIL_FRONT_OP_FAIL_ZERO						0x00000000
+#define   NVC0TCL_STENCIL_FRONT_OP_FAIL_INVERT						0x0000150a
+#define   NVC0TCL_STENCIL_FRONT_OP_FAIL_KEEP						0x00001e00
+#define   NVC0TCL_STENCIL_FRONT_OP_FAIL_REPLACE						0x00001e01
+#define   NVC0TCL_STENCIL_FRONT_OP_FAIL_INCR						0x00001e02
+#define   NVC0TCL_STENCIL_FRONT_OP_FAIL_DECR						0x00001e03
+#define   NVC0TCL_STENCIL_FRONT_OP_FAIL_INCR_WRAP					0x00008507
+#define   NVC0TCL_STENCIL_FRONT_OP_FAIL_DECR_WRAP					0x00008508
+#define  NVC0TCL_STENCIL_FRONT_OP_ZFAIL							0x00001388
+#define   NVC0TCL_STENCIL_FRONT_OP_ZFAIL_ZERO						0x00000000
+#define   NVC0TCL_STENCIL_FRONT_OP_ZFAIL_INVERT						0x0000150a
+#define   NVC0TCL_STENCIL_FRONT_OP_ZFAIL_KEEP						0x00001e00
+#define   NVC0TCL_STENCIL_FRONT_OP_ZFAIL_REPLACE					0x00001e01
+#define   NVC0TCL_STENCIL_FRONT_OP_ZFAIL_INCR						0x00001e02
+#define   NVC0TCL_STENCIL_FRONT_OP_ZFAIL_DECR						0x00001e03
+#define   NVC0TCL_STENCIL_FRONT_OP_ZFAIL_INCR_WRAP					0x00008507
+#define   NVC0TCL_STENCIL_FRONT_OP_ZFAIL_DECR_WRAP					0x00008508
+#define  NVC0TCL_STENCIL_FRONT_OP_ZPASS							0x0000138c
+#define   NVC0TCL_STENCIL_FRONT_OP_ZPASS_ZERO						0x00000000
+#define   NVC0TCL_STENCIL_FRONT_OP_ZPASS_INVERT						0x0000150a
+#define   NVC0TCL_STENCIL_FRONT_OP_ZPASS_KEEP						0x00001e00
+#define   NVC0TCL_STENCIL_FRONT_OP_ZPASS_REPLACE					0x00001e01
+#define   NVC0TCL_STENCIL_FRONT_OP_ZPASS_INCR						0x00001e02
+#define   NVC0TCL_STENCIL_FRONT_OP_ZPASS_DECR						0x00001e03
+#define   NVC0TCL_STENCIL_FRONT_OP_ZPASS_INCR_WRAP					0x00008507
+#define   NVC0TCL_STENCIL_FRONT_OP_ZPASS_DECR_WRAP					0x00008508
+#define  NVC0TCL_STENCIL_FRONT_FUNC_FUNC						0x00001390
+#define   NVC0TCL_STENCIL_FRONT_FUNC_FUNC_NEVER						0x00000200
+#define   NVC0TCL_STENCIL_FRONT_FUNC_FUNC_LESS						0x00000201
+#define   NVC0TCL_STENCIL_FRONT_FUNC_FUNC_EQUAL						0x00000202
+#define   NVC0TCL_STENCIL_FRONT_FUNC_FUNC_LEQUAL					0x00000203
+#define   NVC0TCL_STENCIL_FRONT_FUNC_FUNC_GREATER					0x00000204
+#define   NVC0TCL_STENCIL_FRONT_FUNC_FUNC_NOTEQUAL					0x00000205
+#define   NVC0TCL_STENCIL_FRONT_FUNC_FUNC_GEQUAL					0x00000206
+#define   NVC0TCL_STENCIL_FRONT_FUNC_FUNC_ALWAYS					0x00000207
+#define  NVC0TCL_STENCIL_FRONT_FUNC_REF							0x00001394
+#define  NVC0TCL_STENCIL_FRONT_MASK							0x00001398
+#define  NVC0TCL_STENCIL_FRONT_FUNC_MASK						0x0000139c
+#define  NVC0TCL_FRAG_COLOR_CLAMP_EN							0x000013a8
+#define  NVC0TCL_Y_ORIGIN_BOTTOM							0x000013ac
+#define  NVC0TCL_LINE_WIDTH(x)								(0x000013b0+((x)*4))
+#define  NVC0TCL_LINE_WIDTH__SIZE							0x00000002
+#define  NVC0TCL_POINT_COORD_REPLACE_MAP(x)						(0x000013c0+((x)*4))
+#define  NVC0TCL_POINT_COORD_REPLACE_MAP__SIZE						0x00000008
+#define  NVC0TCL_GP_VERTEX_OUTPUT_COUNT							0x00001420
+#define  NVC0TCL_FENCE									0x0000142c
+#define  NVC0TCL_VB_ELEMENT_BASE							0x00001434
+#define  NVC0TCL_INSTANCE_BASE								0x00001438
+#define  NVC0TCL_CODE_CB_FLUSH								0x00001440
+#define  NVC0TCL_CLIPID_HEIGHT								0x00001504
+#define  NVC0TCL_VP_CLIP_DISTANCE_ENABLE						0x00001510
+#define   NVC0TCL_VP_CLIP_DISTANCE_ENABLE_0						(1 <<  0)
+#define   NVC0TCL_VP_CLIP_DISTANCE_ENABLE_1						(1 <<  1)
+#define   NVC0TCL_VP_CLIP_DISTANCE_ENABLE_2						(1 <<  2)
+#define   NVC0TCL_VP_CLIP_DISTANCE_ENABLE_3						(1 <<  3)
+#define   NVC0TCL_VP_CLIP_DISTANCE_ENABLE_4						(1 <<  4)
+#define   NVC0TCL_VP_CLIP_DISTANCE_ENABLE_5						(1 <<  5)
+#define   NVC0TCL_VP_CLIP_DISTANCE_ENABLE_6						(1 <<  6)
+#define   NVC0TCL_VP_CLIP_DISTANCE_ENABLE_7						(1 <<  7)
+#define  NVC0TCL_SAMPLECNT_ENABLE							0x00001514
+#define  NVC0TCL_POINT_SIZE								0x00001518
+#define  NVC0TCL_POINT_SPRITE_ENABLE							0x00001520
+#define  NVC0TCL_SAMPLECNT_RESET							0x00001530
+#define  NVC0TCL_MULTISAMPLE_ZETA_ENABLE						0x00001534
+#define  NVC0TCL_ZETA_ENABLE								0x00001538
+#define  NVC0TCL_MULTISAMPLE_CTRL							0x0000153c
+#define   NVC0TCL_MULTISAMPLE_CTRL_ALPHA_TO_COVERAGE					(1 <<  0)
+#define   NVC0TCL_MULTISAMPLE_CTRL_ALPHA_TO_ONE						(1 <<  4)
+#define  NVC0TCL_NOPERSPECTIVE_BITMAP(x)						(0x00001540+((x)*4))
+#define  NVC0TCL_NOPERSPECTIVE_BITMAP__SIZE						0x00000004
+#define  NVC0TCL_COND_ADDRESS_HIGH							0x00001550
+#define  NVC0TCL_COND_ADDRESS_LOW							0x00001554
+#define  NVC0TCL_COND_MODE								0x00001558
+#define   NVC0TCL_COND_MODE_NEVER							0x00000000
+#define   NVC0TCL_COND_MODE_ALWAYS							0x00000001
+#define   NVC0TCL_COND_MODE_RES								0x00000002
+#define   NVC0TCL_COND_MODE_NOT_RES_AND_NOT_ID						0x00000003
+#define   NVC0TCL_COND_MODE_RES_OR_ID							0x00000004
+#define  NVC0TCL_TSC_ADDRESS_HIGH							0x0000155c
+#define  NVC0TCL_TSC_ADDRESS_LOW							0x00001560
+#define  NVC0TCL_TSC_LIMIT								0x00001564
+#define  NVC0TCL_POLYGON_OFFSET_FACTOR							0x0000156c
+#define  NVC0TCL_LINE_SMOOTH_ENABLE							0x00001570
+#define  NVC0TCL_TIC_ADDRESS_HIGH							0x00001574
+#define  NVC0TCL_TIC_ADDRESS_LOW							0x00001578
+#define  NVC0TCL_TIC_LIMIT								0x0000157c
+#define  NVC0TCL_PM_CONTROL(x)								(0x00001580+((x)*4))
+#define  NVC0TCL_PM_CONTROL__SIZE							0x00000004
+#define   NVC0TCL_PM_CONTROL_UNK0							(1 <<  0)
+#define   NVC0TCL_PM_CONTROL_UNK1_SHIFT							4
+#define   NVC0TCL_PM_CONTROL_UNK1_MASK							0x00000070
+#define   NVC0TCL_PM_CONTROL_UNK2_SHIFT							8
+#define   NVC0TCL_PM_CONTROL_UNK2_MASK							0xffffff00
+#define  NVC0TCL_STENCIL_TWO_SIDE_ENABLE						0x00001594
+#define  NVC0TCL_STENCIL_BACK_OP_FAIL							0x00001598
+#define   NVC0TCL_STENCIL_BACK_OP_FAIL_ZERO						0x00000000
+#define   NVC0TCL_STENCIL_BACK_OP_FAIL_INVERT						0x0000150a
+#define   NVC0TCL_STENCIL_BACK_OP_FAIL_KEEP						0x00001e00
+#define   NVC0TCL_STENCIL_BACK_OP_FAIL_REPLACE						0x00001e01
+#define   NVC0TCL_STENCIL_BACK_OP_FAIL_INCR						0x00001e02
+#define   NVC0TCL_STENCIL_BACK_OP_FAIL_DECR						0x00001e03
+#define   NVC0TCL_STENCIL_BACK_OP_FAIL_INCR_WRAP					0x00008507
+#define   NVC0TCL_STENCIL_BACK_OP_FAIL_DECR_WRAP					0x00008508
+#define  NVC0TCL_STENCIL_BACK_OP_ZFAIL							0x0000159c
+#define   NVC0TCL_STENCIL_BACK_OP_ZFAIL_ZERO						0x00000000
+#define   NVC0TCL_STENCIL_BACK_OP_ZFAIL_INVERT						0x0000150a
+#define   NVC0TCL_STENCIL_BACK_OP_ZFAIL_KEEP						0x00001e00
+#define   NVC0TCL_STENCIL_BACK_OP_ZFAIL_REPLACE						0x00001e01
+#define   NVC0TCL_STENCIL_BACK_OP_ZFAIL_INCR						0x00001e02
+#define   NVC0TCL_STENCIL_BACK_OP_ZFAIL_DECR						0x00001e03
+#define   NVC0TCL_STENCIL_BACK_OP_ZFAIL_INCR_WRAP					0x00008507
+#define   NVC0TCL_STENCIL_BACK_OP_ZFAIL_DECR_WRAP					0x00008508
+#define  NVC0TCL_STENCIL_BACK_OP_ZPASS							0x000015a0
+#define   NVC0TCL_STENCIL_BACK_OP_ZPASS_ZERO						0x00000000
+#define   NVC0TCL_STENCIL_BACK_OP_ZPASS_INVERT						0x0000150a
+#define   NVC0TCL_STENCIL_BACK_OP_ZPASS_KEEP						0x00001e00
+#define   NVC0TCL_STENCIL_BACK_OP_ZPASS_REPLACE						0x00001e01
+#define   NVC0TCL_STENCIL_BACK_OP_ZPASS_INCR						0x00001e02
+#define   NVC0TCL_STENCIL_BACK_OP_ZPASS_DECR						0x00001e03
+#define   NVC0TCL_STENCIL_BACK_OP_ZPASS_INCR_WRAP					0x00008507
+#define   NVC0TCL_STENCIL_BACK_OP_ZPASS_DECR_WRAP					0x00008508
+#define  NVC0TCL_STENCIL_BACK_FUNC_FUNC							0x000015a4
+#define   NVC0TCL_STENCIL_BACK_FUNC_FUNC_NEVER						0x00000200
+#define   NVC0TCL_STENCIL_BACK_FUNC_FUNC_LESS						0x00000201
+#define   NVC0TCL_STENCIL_BACK_FUNC_FUNC_EQUAL						0x00000202
+#define   NVC0TCL_STENCIL_BACK_FUNC_FUNC_LEQUAL						0x00000203
+#define   NVC0TCL_STENCIL_BACK_FUNC_FUNC_GREATER					0x00000204
+#define   NVC0TCL_STENCIL_BACK_FUNC_FUNC_NOTEQUAL					0x00000205
+#define   NVC0TCL_STENCIL_BACK_FUNC_FUNC_GEQUAL						0x00000206
+#define   NVC0TCL_STENCIL_BACK_FUNC_FUNC_ALWAYS						0x00000207
+#define  NVC0TCL_MULTISAMPLE_COLOR_ENABLE						0x000015b4
+#define  NVC0TCL_FRAMEBUFFER_SRGB							0x000015b8
+#define  NVC0TCL_POLYGON_OFFSET_UNITS							0x000015bc
+#define  NVC0TCL_GP_BUILTIN_RESULT_EN							0x000015cc
+#define   NVC0TCL_GP_BUILTIN_RESULT_EN_VPORT						(1 <<  0)
+#define   NVC0TCL_GP_BUILTIN_RESULT_EN_LAYER						(1 << 16)
+#define  NVC0TCL_MULTISAMPLE_MODE							0x000015d0
+#define   NVC0TCL_MULTISAMPLE_MODE_1X							0x00000000
+#define   NVC0TCL_MULTISAMPLE_MODE_2XMS							0x00000001
+#define   NVC0TCL_MULTISAMPLE_MODE_4XMS							0x00000002
+#define   NVC0TCL_MULTISAMPLE_MODE_8XMS							0x00000004
+#define   NVC0TCL_MULTISAMPLE_MODE_4XMS_4XCS						0x00000008
+#define   NVC0TCL_MULTISAMPLE_MODE_4XMS_12XCS						0x00000009
+#define   NVC0TCL_MULTISAMPLE_MODE_8XMS_8XCS						0x0000000a
+#define  NVC0TCL_EDGEFLAG_ENABLE							0x000015e4
+#define  NVC0TCL_VB_ELEMENT_U32								0x000015e8
+#define  NVC0TCL_VB_ELEMENT_U16_SETUP							0x000015ec
+#define   NVC0TCL_VB_ELEMENT_U16_SETUP_OFFSET_SHIFT					30
+#define   NVC0TCL_VB_ELEMENT_U16_SETUP_OFFSET_MASK					0xc0000000
+#define   NVC0TCL_VB_ELEMENT_U16_SETUP_COUNT_SHIFT					0
+#define   NVC0TCL_VB_ELEMENT_U16_SETUP_COUNT_MASK					0x3fffffff
+#define  NVC0TCL_VB_ELEMENT_U16								0x000015f0
+#define   NVC0TCL_VB_ELEMENT_U16_I0_SHIFT						0
+#define   NVC0TCL_VB_ELEMENT_U16_I0_MASK						0x0000ffff
+#define   NVC0TCL_VB_ELEMENT_U16_I1_SHIFT						16
+#define   NVC0TCL_VB_ELEMENT_U16_I1_MASK						0xffff0000
+#define  NVC0TCL_VERTEX_BASE_HIGH							0x000015f4
+#define  NVC0TCL_VERTEX_BASE_LOW							0x000015f8
+#define  NVC0TCL_CODE_ADDRESS_HIGH							0x00001608
+#define  NVC0TCL_CODE_ADDRESS_LOW							0x0000160c
+#define  NVC0TCL_VERTEX_BEGIN								0x00001618
+#define   NVC0TCL_VERTEX_BEGIN_MODE_SHIFT						0
+#define   NVC0TCL_VERTEX_BEGIN_MODE_MASK						0x0000000f
+#define    NVC0TCL_VERTEX_BEGIN_MODE_POINTS						0x00000000
+#define    NVC0TCL_VERTEX_BEGIN_MODE_LINES						0x00000001
+#define    NVC0TCL_VERTEX_BEGIN_MODE_LINE_LOOP						0x00000002
+#define    NVC0TCL_VERTEX_BEGIN_MODE_LINE_STRIP						0x00000003
+#define    NVC0TCL_VERTEX_BEGIN_MODE_TRIANGLES						0x00000004
+#define    NVC0TCL_VERTEX_BEGIN_MODE_TRIANGLE_STRIP					0x00000005
+#define    NVC0TCL_VERTEX_BEGIN_MODE_TRIANGLE_FAN					0x00000006
+#define    NVC0TCL_VERTEX_BEGIN_MODE_QUADS						0x00000007
+#define    NVC0TCL_VERTEX_BEGIN_MODE_QUAD_STRIP						0x00000008
+#define    NVC0TCL_VERTEX_BEGIN_MODE_POLYGON						0x00000009
+#define    NVC0TCL_VERTEX_BEGIN_MODE_LINES_ADJACENCY					0x0000000a
+#define    NVC0TCL_VERTEX_BEGIN_MODE_LINE_STRIP_ADJACENCY				0x0000000b
+#define    NVC0TCL_VERTEX_BEGIN_MODE_TRIANGLES_ADJACENCY				0x0000000c
+#define    NVC0TCL_VERTEX_BEGIN_MODE_TRIANGLE_STRIP_ADJACENCY				0x0000000d
+#define    NVC0TCL_VERTEX_BEGIN_MODE_PATCHES						0x0000000e
+#define   NVC0TCL_VERTEX_BEGIN_INSTANCE							(1 << 26)
+#define  NVC0TCL_VERTEX_END								0x00001614
+#define  NVC0TCL_VERTEX_DATA								0x00001640
+#define  NVC0TCL_PRIM_RESTART_ENABLE							0x00001644
+#define  NVC0TCL_PRIM_RESTART_INDEX							0x00001648
+#define  NVC0TCL_POINT_SMOOTH_ENABLE							0x00001658
+#define  NVC0TCL_POINT_SPRITE_CTRL							0x00001660
+#define  NVC0TCL_LINE_STIPPLE_ENABLE							0x0000166c
+#define  NVC0TCL_LINE_STIPPLE_PATTERN							0x00001680
+#define  NVC0TCL_PROVOKING_VERTEX_LAST							0x00001684
+#define  NVC0TCL_VERTEX_TWO_SIDE_ENABLE							0x00001688
+#define  NVC0TCL_POLYGON_STIPPLE_ENABLE							0x0000168c
+#define  NVC0TCL_POLYGON_STIPPLE_PATTERN(x)						(0x00001700+((x)*4))
+#define  NVC0TCL_POLYGON_STIPPLE_PATTERN__SIZE						0x00000020
+#define  NVC0TCL_UNK17BC_ADDRESS_HIGH							0x000017bc
+#define  NVC0TCL_UNK17BC_ADDRESS_LOW							0x000017c0
+#define  NVC0TCL_UNK17BC_LIMIT								0x000017c4
+#define  NVC0TCL_VP_POINT_SIZE_EN							0x00001910
+#define  NVC0TCL_CULL_FACE_ENABLE							0x00001918
+#define  NVC0TCL_FRONT_FACE								0x0000191c
+#define   NVC0TCL_FRONT_FACE_CW								0x00000900
+#define   NVC0TCL_FRONT_FACE_CCW							0x00000901
+#define  NVC0TCL_CULL_FACE								0x00001920
+#define   NVC0TCL_CULL_FACE_FRONT							0x00000404
+#define   NVC0TCL_CULL_FACE_BACK							0x00000405
+#define   NVC0TCL_CULL_FACE_FRONT_AND_BACK						0x00000408
+#define  NVC0TCL_VIEWPORT_TRANSFORM_EN							0x0000192c
+#define  NVC0TCL_VIEW_VOLUME_CLIP_CTRL							0x0000193c
+#define  NVC0TCL_VIEWPORT_CLIP_RECTS_EN							0x0000194c
+#define  NVC0TCL_VIEWPORT_CLIP_MODE							0x00001950
+#define   NVC0TCL_VIEWPORT_CLIP_MODE_INCLUDE						0x00000000
+#define   NVC0TCL_VIEWPORT_CLIP_MODE_EXCLUDE						0x00000001
+#define   NVC0TCL_VIEWPORT_CLIP_MODE_UNKNOWN						0x00000002
+#define  NVC0TCL_FP_ZORDER_CTRL								0x0000196c
+#define  NVC0TCL_CLIPID_ENABLE								0x0000197c
+#define  NVC0TCL_CLIPID_WIDTH								0x00001980
+#define  NVC0TCL_CLIPID_ID								0x00001984
+#define  NVC0TCL_REG_MODE								0x000019a0
+#define   NVC0TCL_REG_MODE_PACKED							0x00000001
+#define   NVC0TCL_REG_MODE_STRIPED							0x00000002
+#define  NVC0TCL_FP_CONTROL								0x000019a8
+#define   NVC0TCL_FP_CONTROL_MULTIPLE_RESULTS						(1 <<  0)
+#define   NVC0TCL_FP_CONTROL_EXPORTS_Z							(1 <<  8)
+#define   NVC0TCL_FP_CONTROL_USES_KIL							(1 << 20)
+#define  NVC0TCL_DEPTH_BOUNDS_EN							0x000019bc
+#define  NVC0TCL_LOGIC_OP_ENABLE							0x000019c4
+#define  NVC0TCL_LOGIC_OP								0x000019c8
+#define   NVC0TCL_LOGIC_OP_CLEAR							0x00001500
+#define   NVC0TCL_LOGIC_OP_AND								0x00001501
+#define   NVC0TCL_LOGIC_OP_AND_REVERSE							0x00001502
+#define   NVC0TCL_LOGIC_OP_COPY								0x00001503
+#define   NVC0TCL_LOGIC_OP_AND_INVERTED							0x00001504
+#define   NVC0TCL_LOGIC_OP_NOOP								0x00001505
+#define   NVC0TCL_LOGIC_OP_XOR								0x00001506
+#define   NVC0TCL_LOGIC_OP_OR								0x00001507
+#define   NVC0TCL_LOGIC_OP_NOR								0x00001508
+#define   NVC0TCL_LOGIC_OP_EQUIV							0x00001509
+#define   NVC0TCL_LOGIC_OP_INVERT							0x0000150a
+#define   NVC0TCL_LOGIC_OP_OR_REVERSE							0x0000150b
+#define   NVC0TCL_LOGIC_OP_COPY_INVERTED						0x0000150c
+#define   NVC0TCL_LOGIC_OP_OR_INVERTED							0x0000150d
+#define   NVC0TCL_LOGIC_OP_NAND								0x0000150e
+#define   NVC0TCL_LOGIC_OP_SET								0x0000150f
+#define  NVC0TCL_CLEAR_BUFFERS								0x000019d0
+#define   NVC0TCL_CLEAR_BUFFERS_Z							(1 <<  0)
+#define   NVC0TCL_CLEAR_BUFFERS_S							(1 <<  1)
+#define   NVC0TCL_CLEAR_BUFFERS_R							(1 <<  2)
+#define   NVC0TCL_CLEAR_BUFFERS_G							(1 <<  3)
+#define   NVC0TCL_CLEAR_BUFFERS_B							(1 <<  4)
+#define   NVC0TCL_CLEAR_BUFFERS_A							(1 <<  5)
+#define   NVC0TCL_CLEAR_BUFFERS_RT_SHIFT						6
+#define   NVC0TCL_CLEAR_BUFFERS_RT_MASK							0x000003c0
+#define   NVC0TCL_CLEAR_BUFFERS_LAYER_SHIFT						10
+#define   NVC0TCL_CLEAR_BUFFERS_LAYER_MASK						0x0007fc00
+#define  NVC0TCL_COLOR_MASK(x)								(0x00001a00+((x)*4))
+#define  NVC0TCL_COLOR_MASK__SIZE							0x00000008
+#define   NVC0TCL_COLOR_MASK_R_SHIFT							0
+#define   NVC0TCL_COLOR_MASK_R_MASK							0x0000000f
+#define   NVC0TCL_COLOR_MASK_G_SHIFT							4
+#define   NVC0TCL_COLOR_MASK_G_MASK							0x000000f0
+#define   NVC0TCL_COLOR_MASK_B_SHIFT							8
+#define   NVC0TCL_COLOR_MASK_B_MASK							0x00000f00
+#define   NVC0TCL_COLOR_MASK_A_SHIFT							12
+#define   NVC0TCL_COLOR_MASK_A_MASK							0x0000f000
+#define  NVC0TCL_QUERY_ADDRESS_HIGH							0x00001b00
+#define  NVC0TCL_QUERY_ADDRESS_LOW							0x00001b04
+#define  NVC0TCL_QUERY_SEQUENCE								0x00001b08
+#define  NVC0TCL_QUERY_GET								0x00001b0c
+#define  NVC0TCL_VERTEX_ARRAY_FETCH(x)							(0x00001c00+((x)*16))
+#define  NVC0TCL_VERTEX_ARRAY_FETCH__SIZE						0x00000020
+#define   NVC0TCL_VERTEX_ARRAY_FETCH_STRIDE_SHIFT					0
+#define   NVC0TCL_VERTEX_ARRAY_FETCH_STRIDE_MASK					0x00000fff
+#define   NVC0TCL_VERTEX_ARRAY_FETCH_ENABLE						(1 << 12)
+#define  NVC0TCL_BLEND_EQUATIONI_RGB(x)							(0x00001e04+((x)*32))
+#define  NVC0TCL_BLEND_EQUATIONI_RGB__SIZE						0x00000008
+#define   NVC0TCL_BLEND_EQUATIONI_RGB_FUNC_ADD						0x00008006
+#define   NVC0TCL_BLEND_EQUATIONI_RGB_MIN						0x00008007
+#define   NVC0TCL_BLEND_EQUATIONI_RGB_MAX						0x00008008
+#define   NVC0TCL_BLEND_EQUATIONI_RGB_FUNC_SUBTRACT					0x0000800a
+#define   NVC0TCL_BLEND_EQUATIONI_RGB_FUNC_REVERSE_SUBTRACT				0x0000800b
+#define  NVC0TCL_BLEND_FUNCI_SRC_RGB(x)							(0x00001e08+((x)*32))
+#define  NVC0TCL_BLEND_FUNCI_SRC_RGB__SIZE						0x00000008
+#define   NVC0TCL_BLEND_FUNCI_SRC_RGB_ZERO						0x00004000
+#define   NVC0TCL_BLEND_FUNCI_SRC_RGB_ONE						0x00004001
+#define   NVC0TCL_BLEND_FUNCI_SRC_RGB_SRC_COLOR						0x00004300
+#define   NVC0TCL_BLEND_FUNCI_SRC_RGB_ONE_MINUS_SRC_COLOR				0x00004301
+#define   NVC0TCL_BLEND_FUNCI_SRC_RGB_SRC_ALPHA						0x00004302
+#define   NVC0TCL_BLEND_FUNCI_SRC_RGB_ONE_MINUS_SRC_ALPHA				0x00004303
+#define   NVC0TCL_BLEND_FUNCI_SRC_RGB_DST_ALPHA						0x00004304
+#define   NVC0TCL_BLEND_FUNCI_SRC_RGB_ONE_MINUS_DST_ALPHA				0x00004305
+#define   NVC0TCL_BLEND_FUNCI_SRC_RGB_DST_COLOR						0x00004306
+#define   NVC0TCL_BLEND_FUNCI_SRC_RGB_ONE_MINUS_DST_COLOR				0x00004307
+#define   NVC0TCL_BLEND_FUNCI_SRC_RGB_SRC_ALPHA_SATURATE				0x00004308
+#define   NVC0TCL_BLEND_FUNCI_SRC_RGB_CONSTANT_COLOR					0x0000c001
+#define   NVC0TCL_BLEND_FUNCI_SRC_RGB_ONE_MINUS_CONSTANT_COLOR				0x0000c002
+#define   NVC0TCL_BLEND_FUNCI_SRC_RGB_CONSTANT_ALPHA					0x0000c003
+#define   NVC0TCL_BLEND_FUNCI_SRC_RGB_ONE_MINUS_CONSTANT_ALPHA				0x0000c004
+#define   NVC0TCL_BLEND_FUNCI_SRC_RGB_SRC1_COLOR					0x0000c900
+#define   NVC0TCL_BLEND_FUNCI_SRC_RGB_ONE_MINUS_SRC1_COLOR				0x0000c901
+#define   NVC0TCL_BLEND_FUNCI_SRC_RGB_SRC1_ALPHA					0x0000c902
+#define   NVC0TCL_BLEND_FUNCI_SRC_RGB_ONE_MINUS_SRC1_ALPHA				0x0000c903
+#define  NVC0TCL_BLEND_FUNCI_DST_RGB(x)							(0x00001e0c+((x)*32))
+#define  NVC0TCL_BLEND_FUNCI_DST_RGB__SIZE						0x00000008
+#define   NVC0TCL_BLEND_FUNCI_DST_RGB_ZERO						0x00004000
+#define   NVC0TCL_BLEND_FUNCI_DST_RGB_ONE						0x00004001
+#define   NVC0TCL_BLEND_FUNCI_DST_RGB_SRC_COLOR						0x00004300
+#define   NVC0TCL_BLEND_FUNCI_DST_RGB_ONE_MINUS_SRC_COLOR				0x00004301
+#define   NVC0TCL_BLEND_FUNCI_DST_RGB_SRC_ALPHA						0x00004302
+#define   NVC0TCL_BLEND_FUNCI_DST_RGB_ONE_MINUS_SRC_ALPHA				0x00004303
+#define   NVC0TCL_BLEND_FUNCI_DST_RGB_DST_ALPHA						0x00004304
+#define   NVC0TCL_BLEND_FUNCI_DST_RGB_ONE_MINUS_DST_ALPHA				0x00004305
+#define   NVC0TCL_BLEND_FUNCI_DST_RGB_DST_COLOR						0x00004306
+#define   NVC0TCL_BLEND_FUNCI_DST_RGB_ONE_MINUS_DST_COLOR				0x00004307
+#define   NVC0TCL_BLEND_FUNCI_DST_RGB_SRC_ALPHA_SATURATE				0x00004308
+#define   NVC0TCL_BLEND_FUNCI_DST_RGB_CONSTANT_COLOR					0x0000c001
+#define   NVC0TCL_BLEND_FUNCI_DST_RGB_ONE_MINUS_CONSTANT_COLOR				0x0000c002
+#define   NVC0TCL_BLEND_FUNCI_DST_RGB_CONSTANT_ALPHA					0x0000c003
+#define   NVC0TCL_BLEND_FUNCI_DST_RGB_ONE_MINUS_CONSTANT_ALPHA				0x0000c004
+#define   NVC0TCL_BLEND_FUNCI_DST_RGB_SRC1_COLOR					0x0000c900
+#define   NVC0TCL_BLEND_FUNCI_DST_RGB_ONE_MINUS_SRC1_COLOR				0x0000c901
+#define   NVC0TCL_BLEND_FUNCI_DST_RGB_SRC1_ALPHA					0x0000c902
+#define   NVC0TCL_BLEND_FUNCI_DST_RGB_ONE_MINUS_SRC1_ALPHA				0x0000c903
+#define  NVC0TCL_BLEND_EQUATIONI_ALPHA(x)						(0x00001e10+((x)*32))
+#define  NVC0TCL_BLEND_EQUATIONI_ALPHA__SIZE						0x00000008
+#define   NVC0TCL_BLEND_EQUATIONI_ALPHA_FUNC_ADD					0x00008006
+#define   NVC0TCL_BLEND_EQUATIONI_ALPHA_MIN						0x00008007
+#define   NVC0TCL_BLEND_EQUATIONI_ALPHA_MAX						0x00008008
+#define   NVC0TCL_BLEND_EQUATIONI_ALPHA_FUNC_SUBTRACT					0x0000800a
+#define   NVC0TCL_BLEND_EQUATIONI_ALPHA_FUNC_REVERSE_SUBTRACT				0x0000800b
+#define  NVC0TCL_BLEND_FUNCI_SRC_ALPHA(x)						(0x00001e14+((x)*32))
+#define  NVC0TCL_BLEND_FUNCI_SRC_ALPHA__SIZE						0x00000008
+#define   NVC0TCL_BLEND_FUNCI_SRC_ALPHA_ZERO						0x00004000
+#define   NVC0TCL_BLEND_FUNCI_SRC_ALPHA_ONE						0x00004001
+#define   NVC0TCL_BLEND_FUNCI_SRC_ALPHA_SRC_COLOR					0x00004300
+#define   NVC0TCL_BLEND_FUNCI_SRC_ALPHA_ONE_MINUS_SRC_COLOR				0x00004301
+#define   NVC0TCL_BLEND_FUNCI_SRC_ALPHA_SRC_ALPHA					0x00004302
+#define   NVC0TCL_BLEND_FUNCI_SRC_ALPHA_ONE_MINUS_SRC_ALPHA				0x00004303
+#define   NVC0TCL_BLEND_FUNCI_SRC_ALPHA_DST_ALPHA					0x00004304
+#define   NVC0TCL_BLEND_FUNCI_SRC_ALPHA_ONE_MINUS_DST_ALPHA				0x00004305
+#define   NVC0TCL_BLEND_FUNCI_SRC_ALPHA_DST_COLOR					0x00004306
+#define   NVC0TCL_BLEND_FUNCI_SRC_ALPHA_ONE_MINUS_DST_COLOR				0x00004307
+#define   NVC0TCL_BLEND_FUNCI_SRC_ALPHA_SRC_ALPHA_SATURATE				0x00004308
+#define   NVC0TCL_BLEND_FUNCI_SRC_ALPHA_CONSTANT_COLOR					0x0000c001
+#define   NVC0TCL_BLEND_FUNCI_SRC_ALPHA_ONE_MINUS_CONSTANT_COLOR			0x0000c002
+#define   NVC0TCL_BLEND_FUNCI_SRC_ALPHA_CONSTANT_ALPHA					0x0000c003
+#define   NVC0TCL_BLEND_FUNCI_SRC_ALPHA_ONE_MINUS_CONSTANT_ALPHA			0x0000c004
+#define   NVC0TCL_BLEND_FUNCI_SRC_ALPHA_SRC1_COLOR					0x0000c900
+#define   NVC0TCL_BLEND_FUNCI_SRC_ALPHA_ONE_MINUS_SRC1_COLOR				0x0000c901
+#define   NVC0TCL_BLEND_FUNCI_SRC_ALPHA_SRC1_ALPHA					0x0000c902
+#define   NVC0TCL_BLEND_FUNCI_SRC_ALPHA_ONE_MINUS_SRC1_ALPHA				0x0000c903
+#define  NVC0TCL_BLEND_FUNCI_DST_ALPHA(x)						(0x00001e18+((x)*32))
+#define  NVC0TCL_BLEND_FUNCI_DST_ALPHA__SIZE						0x00000008
+#define   NVC0TCL_BLEND_FUNCI_DST_ALPHA_ZERO						0x00004000
+#define   NVC0TCL_BLEND_FUNCI_DST_ALPHA_ONE						0x00004001
+#define   NVC0TCL_BLEND_FUNCI_DST_ALPHA_SRC_COLOR					0x00004300
+#define   NVC0TCL_BLEND_FUNCI_DST_ALPHA_ONE_MINUS_SRC_COLOR				0x00004301
+#define   NVC0TCL_BLEND_FUNCI_DST_ALPHA_SRC_ALPHA					0x00004302
+#define   NVC0TCL_BLEND_FUNCI_DST_ALPHA_ONE_MINUS_SRC_ALPHA				0x00004303
+#define   NVC0TCL_BLEND_FUNCI_DST_ALPHA_DST_ALPHA					0x00004304
+#define   NVC0TCL_BLEND_FUNCI_DST_ALPHA_ONE_MINUS_DST_ALPHA				0x00004305
+#define   NVC0TCL_BLEND_FUNCI_DST_ALPHA_DST_COLOR					0x00004306
+#define   NVC0TCL_BLEND_FUNCI_DST_ALPHA_ONE_MINUS_DST_COLOR				0x00004307
+#define   NVC0TCL_BLEND_FUNCI_DST_ALPHA_SRC_ALPHA_SATURATE				0x00004308
+#define   NVC0TCL_BLEND_FUNCI_DST_ALPHA_CONSTANT_COLOR					0x0000c001
+#define   NVC0TCL_BLEND_FUNCI_DST_ALPHA_ONE_MINUS_CONSTANT_COLOR			0x0000c002
+#define   NVC0TCL_BLEND_FUNCI_DST_ALPHA_CONSTANT_ALPHA					0x0000c003
+#define   NVC0TCL_BLEND_FUNCI_DST_ALPHA_ONE_MINUS_CONSTANT_ALPHA			0x0000c004
+#define   NVC0TCL_BLEND_FUNCI_DST_ALPHA_SRC1_COLOR					0x0000c900
+#define   NVC0TCL_BLEND_FUNCI_DST_ALPHA_ONE_MINUS_SRC1_COLOR				0x0000c901
+#define   NVC0TCL_BLEND_FUNCI_DST_ALPHA_SRC1_ALPHA					0x0000c902
+#define   NVC0TCL_BLEND_FUNCI_DST_ALPHA_ONE_MINUS_SRC1_ALPHA				0x0000c903
+#define  NVC0TCL_SP_SELECT(x)								(0x00002000+((x)*64))
+#define  NVC0TCL_SP_SELECT__SIZE							0x00000006
+#define   NVC0TCL_SP_SELECT_ENABLE							(1 <<  0)
+#define   NVC0TCL_SP_SELECT_PROGRAM_SHIFT						4
+#define   NVC0TCL_SP_SELECT_PROGRAM_MASK						0x000000f0
+#define  NVC0TCL_SP_START_ID(x)								(0x00002004+((x)*64))
+#define  NVC0TCL_SP_START_ID__SIZE							0x00000006
+#define  NVC0TCL_SP_GPR_ALLOC(x)							(0x0000200c+((x)*64))
+#define  NVC0TCL_SP_GPR_ALLOC__SIZE							0x00000006
+#define  NVC0TCL_CB_SIZE								0x00002380
+#define  NVC0TCL_CB_BIND(x)								(0x00002410+((x)*32))
+#define  NVC0TCL_CB_BIND__SIZE								0x00000005
+#define   NVC0TCL_CB_BIND_VALID								(1 <<  0)
+#define   NVC0TCL_CB_BIND_INDEX_SHIFT							4
+#define   NVC0TCL_CB_BIND_INDEX_MASK							0x000000f0
+#define  NVC0TCL_BIND_TIC(x)								(0x00002404+((x)*32))
+#define  NVC0TCL_BIND_TIC__SIZE								0x00000005
+#define   NVC0TCL_BIND_TIC_ACTIVE							(1 <<  0)
+#define   NVC0TCL_BIND_TIC_TEXTURE_SHIFT						1
+#define   NVC0TCL_BIND_TIC_TEXTURE_MASK							0x000001fe
+#define   NVC0TCL_BIND_TIC_TIC_SHIFT							9
+#define   NVC0TCL_BIND_TIC_TIC_MASK							0x7ffffe00
+#define  NVC0TCL_TEX_LIMITS(x)								(0x00002200+((x)*16))
+#define  NVC0TCL_TEX_LIMITS__SIZE							0x00000005
+#define   NVC0TCL_TEX_LIMITS_SAMPLERS_LOG2_SHIFT					0
+#define   NVC0TCL_TEX_LIMITS_SAMPLERS_LOG2_MASK						0x0000000f
+#define   NVC0TCL_TEX_LIMITS_TEXTURES_LOG2_SHIFT					4
+#define   NVC0TCL_TEX_LIMITS_TEXTURES_LOG2_MASK						0x000000f0
+#define  NVC0TCL_CB_ADDR_HIGH								0x00002384
+#define  NVC0TCL_CB_ADDR_LOW								0x00002388
+#define  NVC0TCL_CB_POS									0x0000238c
+#define  NVC0TCL_CB_DATA(x)								(0x00002390+((x)*4))
+#define  NVC0TCL_CB_DATA__SIZE								0x00000010
+#define  NVC0TCL_TFB_VARYING_LOCS(x)							(0x00002800+((x)*4))
+#define  NVC0TCL_TFB_VARYING_LOCS__SIZE							0x00000080
+#define  NVC0TCL_UNK_UPLOAD_POS								0x00003800
+#define  NVC0TCL_UNK_UPLOAD_DATA							0x00003804
+#define  NVC0TCL_VERTEX_ARRAY_SELECT							0x00003820
+#define  NVC0TCL_VERTEX_ARRAY_ADDRESS							0x00003824
+#define  NVC0TCL_BLEND_ENABLEI								0x00003858
+#define  NVC0TCL_POLYGON_MODE_FRONT							0x00003868
+#define   NVC0TCL_POLYGON_MODE_FRONT_POINT						0x00001b00
+#define   NVC0TCL_POLYGON_MODE_FRONT_LINE						0x00001b01
+#define   NVC0TCL_POLYGON_MODE_FRONT_FILL						0x00001b02
+#define  NVC0TCL_POLYGON_MODE_BACK							0x00003870
+#define   NVC0TCL_POLYGON_MODE_BACK_POINT						0x00001b00
+#define   NVC0TCL_POLYGON_MODE_BACK_LINE						0x00001b01
+#define   NVC0TCL_POLYGON_MODE_BACK_FILL						0x00001b02
+#define  NVC0TCL_GP_SELECT								0x00003878
+#define   NVC0TCL_GP_SELECT_ENABLE							(1 <<  0)
+#define   NVC0TCL_GP_SELECT_PROGRAM_SHIFT						4
+#define   NVC0TCL_GP_SELECT_PROGRAM_MASK						0x000000f0
+#define  NVC0TCL_TEP_SELECT								0x00003880
+#define   NVC0TCL_TEP_SELECT_ENABLE							(1 <<  0)
+#define   NVC0TCL_TEP_SELECT_PROGRAM_SHIFT						4
+#define   NVC0TCL_TEP_SELECT_PROGRAM_MASK						0x000000f0
+
+
+#define NVC0_COMPUTE									0x000090c0
+
+#define  NVC0_COMPUTE_NOP								0x00000100
+#define  NVC0_COMPUTE_NOTIFY								0x00000104
+#define  NVC0_COMPUTE_SERIALIZE								0x00000110
+#define  NVC0_COMPUTE_LOCAL_SIZE							0x00000204
+#define  NVC0_COMPUTE_SHARED_BASE							0x00000214
+#define  NVC0_COMPUTE_GRIDDIM_YX							0x00000238
+#define   NVC0_COMPUTE_GRIDDIM_YX_X_SHIFT						0
+#define   NVC0_COMPUTE_GRIDDIM_YX_X_MASK						0x0000ffff
+#define   NVC0_COMPUTE_GRIDDIM_YX_Y_SHIFT						16
+#define   NVC0_COMPUTE_GRIDDIM_YX_Y_MASK						0xffff0000
+#define  NVC0_COMPUTE_GRIDDIM_Z								0x0000023c
+#define  NVC0_COMPUTE_SHARED_SIZE							0x0000024c
+#define  NVC0_COMPUTE_BLOCK_ALLOC							0x00000250
+#define   NVC0_COMPUTE_BLOCK_ALLOC_THREADS_SHIFT					0
+#define   NVC0_COMPUTE_BLOCK_ALLOC_THREADS_MASK						0x0000ffff
+#define   NVC0_COMPUTE_BLOCK_ALLOC_BARRIERS_SHIFT					16
+#define   NVC0_COMPUTE_BLOCK_ALLOC_BARRIERS_MASK					0xffff0000
+#define  NVC0_COMPUTE_CP_GPR_ALLOC							0x000002c0
+#define  NVC0_COMPUTE_GLOBAL_BASE							0x000002c8
+#define   NVC0_COMPUTE_GLOBAL_BASE_HIGH_SHIFT						0
+#define   NVC0_COMPUTE_GLOBAL_BASE_HIGH_MASK						0x000000ff
+#define   NVC0_COMPUTE_GLOBAL_BASE_INDEX_SHIFT						16
+#define   NVC0_COMPUTE_GLOBAL_BASE_INDEX_MASK						0x00ff0000
+#define   NVC0_COMPUTE_GLOBAL_BASE_FLAGS_SHIFT						28
+#define   NVC0_COMPUTE_GLOBAL_BASE_FLAGS_MASK						0xf0000000
+#define  NVC0_COMPUTE_LAUNCH								0x00000368
+#define  NVC0_COMPUTE_BLOCKDIM_YX							0x000003ac
+#define   NVC0_COMPUTE_BLOCKDIM_YX_X_SHIFT						0
+#define   NVC0_COMPUTE_BLOCKDIM_YX_X_MASK						0x0000ffff
+#define   NVC0_COMPUTE_BLOCKDIM_YX_Y_SHIFT						16
+#define   NVC0_COMPUTE_BLOCKDIM_YX_Y_MASK						0xffff0000
+#define  NVC0_COMPUTE_BLOCKDIM_Z							0x000003b0
+#define  NVC0_COMPUTE_CP_START_ID							0x000003b4
+#define  NVC0_COMPUTE_LOCAL_BASE							0x0000077c
+#define  NVC0_COMPUTE_UNK0790_ADDRESS_HIGH						0x00000790
+#define  NVC0_COMPUTE_UNK0790_ADDRESS_LOW						0x00000794
+#define  NVC0_COMPUTE_LINKED_TSC							0x00001234
+#define  NVC0_COMPUTE_TSC_ADDRESS_HIGH							0x0000155c
+#define  NVC0_COMPUTE_TSC_ADDRESS_LOW							0x00001560
+#define  NVC0_COMPUTE_TSC_LIMIT								0x00001564
+#define  NVC0_COMPUTE_TIC_ADDRESS_HIGH							0x00001574
+#define  NVC0_COMPUTE_TIC_ADDRESS_LOW							0x00001578
+#define  NVC0_COMPUTE_TIC_LIMIT								0x0000157c
+#define  NVC0_COMPUTE_CODE_ADDRESS_HIGH							0x00001608
+#define  NVC0_COMPUTE_CODE_ADDRESS_LOW							0x0000160c
+#define  NVC0_COMPUTE_CB_BIND								0x00001694
+#define   NVC0_COMPUTE_CB_BIND_INDEX_SHIFT						1
+#define   NVC0_COMPUTE_CB_BIND_INDEX_MASK						0xfffffffe
+#define   NVC0_COMPUTE_CB_BIND_VALID							(1 <<  0)
+#define  NVC0_COMPUTE_QUERY_ADDRESS_HIGH						0x00001b00
+#define  NVC0_COMPUTE_QUERY_ADDRESS_LOW							0x00001b04
+#define  NVC0_COMPUTE_QUERY_SEQUENCE							0x00001b08
+#define  NVC0_COMPUTE_QUERY_GET								0x00001b0c
+#define  NVC0_COMPUTE_CB_ADDRESS_HIGH							0x00002384
+#define  NVC0_COMPUTE_CB_ADDRESS_LOW							0x00002388
+#define  NVC0_COMPUTE_CB_POS								0x0000238c
+#define  NVC0_COMPUTE_CB_DATA								0x00002390
+
+
 #endif /* NOUVEAU_REG_H */
diff --git a/src/gallium/drivers/nv50/nv50_state.c b/src/gallium/drivers/nv50/nv50_state.c
index 0d744ab788..88fee3630b 100644
--- a/src/gallium/drivers/nv50/nv50_state.c
+++ b/src/gallium/drivers/nv50/nv50_state.c
@@ -48,6 +48,53 @@ nv50_colormask(unsigned mask)
 	return cmask;
 }
 
+/*
+ * Translate a gallium PIPE_BLENDFACTOR_* value into the NV50TCL blend
+ * factor encoding.
+ *
+ * Only the NV50TCL_BLEND_FUNC_SRC_RGB_* names appear here; the blend state
+ * creation code emits the returned value for the source and destination
+ * factors of both the RGB and the alpha equation.
+ *
+ * A factor without a case below yields the ZERO encoding.
+ */
+static INLINE uint32_t
+nv50_blend_func(unsigned factor)
+{
+/* One switch label per factor: BF_CASE(pf, hw) returns the
+ * NV50TCL_BLEND_FUNC_SRC_RGB_<hw> value for PIPE_BLENDFACTOR_<pf>. */
+#define BF_CASE(pf, hw) \
+	case PIPE_BLENDFACTOR_##pf: return NV50TCL_BLEND_FUNC_SRC_RGB_##hw
+
+	switch (factor) {
+	BF_CASE(ZERO, ZERO);
+	BF_CASE(ONE, ONE);
+	BF_CASE(SRC_COLOR, SRC_COLOR);
+	BF_CASE(INV_SRC_COLOR, ONE_MINUS_SRC_COLOR);
+	BF_CASE(SRC_ALPHA, SRC_ALPHA);
+	BF_CASE(INV_SRC_ALPHA, ONE_MINUS_SRC_ALPHA);
+	BF_CASE(DST_ALPHA, DST_ALPHA);
+	BF_CASE(INV_DST_ALPHA, ONE_MINUS_DST_ALPHA);
+	BF_CASE(DST_COLOR, DST_COLOR);
+	BF_CASE(INV_DST_COLOR, ONE_MINUS_DST_COLOR);
+	BF_CASE(SRC_ALPHA_SATURATE, SRC_ALPHA_SATURATE);
+	BF_CASE(CONST_COLOR, CONSTANT_COLOR);
+	BF_CASE(INV_CONST_COLOR, ONE_MINUS_CONSTANT_COLOR);
+	BF_CASE(CONST_ALPHA, CONSTANT_ALPHA);
+	BF_CASE(INV_CONST_ALPHA, ONE_MINUS_CONSTANT_ALPHA);
+	BF_CASE(SRC1_COLOR, SRC1_COLOR);
+	BF_CASE(INV_SRC1_COLOR, ONE_MINUS_SRC1_COLOR);
+	BF_CASE(SRC1_ALPHA, SRC1_ALPHA);
+	BF_CASE(INV_SRC1_ALPHA, ONE_MINUS_SRC1_ALPHA);
+	default:
+		break;
+	}
+#undef BF_CASE
+
+	/* no case matched: fall back to the ZERO encoding */
+	return NV50TCL_BLEND_FUNC_SRC_RGB_ZERO;
+}
+
 static void *
 nv50_blend_state_create(struct pipe_context *pipe,
 			const struct pipe_blend_state *cso)
@@ -80,12 +127,12 @@ nv50_blend_state_create(struct pipe_context *pipe,
 	if (blend_enabled) {
 		so_method(so, tesla, NV50TCL_BLEND_EQUATION_RGB, 5);
 		so_data  (so, nvgl_blend_eqn(cso->rt[0].rgb_func));
-		so_data  (so, 0x4000 | nvgl_blend_func(cso->rt[0].rgb_src_factor));
-		so_data  (so, 0x4000 | nvgl_blend_func(cso->rt[0].rgb_dst_factor));
+		so_data  (so, nv50_blend_func(cso->rt[0].rgb_src_factor));
+		so_data  (so, nv50_blend_func(cso->rt[0].rgb_dst_factor));
 		so_data  (so, nvgl_blend_eqn(cso->rt[0].alpha_func));
-		so_data  (so, 0x4000 | nvgl_blend_func(cso->rt[0].alpha_src_factor));
+		so_data  (so, nv50_blend_func(cso->rt[0].alpha_src_factor));
 		so_method(so, tesla, NV50TCL_BLEND_FUNC_DST_ALPHA, 1);
-		so_data  (so, 0x4000 | nvgl_blend_func(cso->rt[0].alpha_dst_factor));
+		so_data  (so, nv50_blend_func(cso->rt[0].alpha_dst_factor));
 	}
 
 	if (cso->logicop_enable == 0 ) {
diff --git a/src/gallium/drivers/nv50/nv50_vbo.c b/src/gallium/drivers/nv50/nv50_vbo.c
index 6bd52884b5..996844b18f 100644
--- a/src/gallium/drivers/nv50/nv50_vbo.c
+++ b/src/gallium/drivers/nv50/nv50_vbo.c
@@ -29,6 +29,9 @@
 #include "nv50_context.h"
 #include "nv50_resource.h"
 
+/* VERTEX_ARRAY_ATTRIB_TYPE is written twice, 3 bits apart; reason unknown */
+#define NV50_VAT(x) (((x) << 3) | (x))
+
 static INLINE uint32_t
 nv50_vbo_type_to_hw(enum pipe_format format)
 {
@@ -39,22 +42,22 @@ nv50_vbo_type_to_hw(enum pipe_format format)
 
 	switch (desc->channel[0].type) {
 	case UTIL_FORMAT_TYPE_FLOAT:
-		return NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_FLOAT;
+		return NV50_VAT(NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_FLOAT);
 	case UTIL_FORMAT_TYPE_UNSIGNED:
 		if (desc->channel[0].normalized) {
-			return NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_UNORM;
+			return NV50_VAT(NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_UNORM);
 		}
-		return NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_USCALED;
+		return NV50_VAT(NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_USCALED);
 	case UTIL_FORMAT_TYPE_SIGNED:
 		if (desc->channel[0].normalized) {
-			return NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_SNORM;
+			return NV50_VAT(NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_SNORM);
 		}
-		return NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_SSCALED;
+		return NV50_VAT(NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_SSCALED);
 	/*
 	case PIPE_FORMAT_TYPE_UINT:
-		return NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_UINT;
+		return NV50_VAT(NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_UINT);
 	case PIPE_FORMAT_TYPE_SINT:
-		return NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_SINT; */
+		return NV50_VAT(NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_SINT); */
 	default:
 		return 0;
 	}
-- 
cgit v1.2.3


From d7aac107e64e1c4c1af30806817a2888e7a4a96c Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Sat, 24 Jul 2010 14:46:44 +0200
Subject: nv50: introduce the big formats table

---
 src/gallium/drivers/nv50/Makefile              |   1 +
 src/gallium/drivers/nv50/SConscript            |   1 +
 src/gallium/drivers/nv50/nv50_formats.c        | 427 +++++++++++++++++++++++++
 src/gallium/drivers/nv50/nv50_miptree.c        |   3 +
 src/gallium/drivers/nv50/nv50_screen.c         |  81 ++---
 src/gallium/drivers/nv50/nv50_screen.h         |   9 +
 src/gallium/drivers/nv50/nv50_state_validate.c |  75 +----
 src/gallium/drivers/nv50/nv50_tex.c            |  52 +--
 src/gallium/drivers/nv50/nv50_texture.h        |   9 +
 src/gallium/drivers/nv50/nv50_vbo.c            | 100 +-----
 10 files changed, 486 insertions(+), 272 deletions(-)
 create mode 100644 src/gallium/drivers/nv50/nv50_formats.c

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/Makefile b/src/gallium/drivers/nv50/Makefile
index 3943a9e257..bf1e8201a0 100644
--- a/src/gallium/drivers/nv50/Makefile
+++ b/src/gallium/drivers/nv50/Makefile
@@ -8,6 +8,7 @@ C_SOURCES = \
 	nv50_clear.c \
 	nv50_context.c \
 	nv50_draw.c \
+	nv50_formats.c \
 	nv50_miptree.c \
 	nv50_query.c \
 	nv50_resource.c \
diff --git a/src/gallium/drivers/nv50/SConscript b/src/gallium/drivers/nv50/SConscript
index 8625f92622..e4a93c15ce 100644
--- a/src/gallium/drivers/nv50/SConscript
+++ b/src/gallium/drivers/nv50/SConscript
@@ -9,6 +9,7 @@ nv50 = env.ConvenienceLibrary(
         'nv50_clear.c',
         'nv50_context.c',
         'nv50_draw.c',
+        'nv50_formats.c',
         'nv50_miptree.c',
         'nv50_query.c',
         'nv50_program.c',
diff --git a/src/gallium/drivers/nv50/nv50_formats.c b/src/gallium/drivers/nv50/nv50_formats.c
new file mode 100644
index 0000000000..5b65cdaa02
--- /dev/null
+++ b/src/gallium/drivers/nv50/nv50_formats.c
@@ -0,0 +1,427 @@
+
+#include "nv50_screen.h"
+#include "nv50_texture.h"
+#include "nouveau/nouveau_class.h"
+#include "pipe/p_defines.h"
+
+#define A_(cr, cg, cb, ca, t0, t1, t2, t3, sz, r)	    \
+   NV50TIC_0_0_MAPR_##cr | NV50TIC_0_0_TYPER_##t0 |         \
+   NV50TIC_0_0_MAPG_##cg | NV50TIC_0_0_TYPEG_##t1 |	    \
+   NV50TIC_0_0_MAPB_##cb | NV50TIC_0_0_TYPEB_##t2 |	    \
+   NV50TIC_0_0_MAPA_##ca | NV50TIC_0_0_TYPEA_##t3 |         \
+   NV50TIC_0_0_FMT_##sz, /* end of .tic, .vtx follows */    \
+   NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_##sz |                \
+   NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_##t0 |                  \
+   (NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_##t0 << 3) | ((uint32_t)(r) << 31) /* r: BGRA flag in bit 31; unsigned shift, 1 << 31 on int is UB */
+
+#define B_(cr, cg, cb, ca, t0, t1, t2, t3, sz, r)   \
+   NV50TIC_0_0_MAPR_##cr | NV50TIC_0_0_TYPER_##t0 | \
+   NV50TIC_0_0_MAPG_##cg | NV50TIC_0_0_TYPEG_##t1 | \
+   NV50TIC_0_0_MAPB_##cb | NV50TIC_0_0_TYPEB_##t2 | \
+   NV50TIC_0_0_MAPA_##ca | NV50TIC_0_0_TYPEA_##t3 | \
+   NV50TIC_0_0_FMT_##sz, 0
+
+#define VERTEX_BUFFER PIPE_BIND_VERTEX_BUFFER
+#define SAMPLER_VIEW  PIPE_BIND_SAMPLER_VIEW
+#define RENDER_TARGET PIPE_BIND_RENDER_TARGET
+#define DEPTH_STENCIL PIPE_BIND_DEPTH_STENCIL
+#define SCANOUT       PIPE_BIND_SCANOUT
+
+/* for vertex buffers: */
+#define NV50TIC_0_0_FMT_8_8_8 NV50TIC_0_0_FMT_8_8_8_8
+#define NV50TIC_0_0_FMT_16_16_16 NV50TIC_0_0_FMT_16_16_16_16
+#define NV50TIC_0_0_FMT_32_32_32 NV50TIC_0_0_FMT_32_32_32_32
+
+const struct nv50_format nv50_format_table[PIPE_FORMAT_COUNT] =
+{
+   /* COMMON FORMATS */
+
+   [PIPE_FORMAT_B8G8R8A8_UNORM] = { NV50TCL_RT_FORMAT_A8R8G8B8_UNORM,
+    A_(C2, C1, C0, C3, UNORM, UNORM, UNORM, UNORM, 8_8_8_8, 1),
+    VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET | SCANOUT },
+
+   [PIPE_FORMAT_B8G8R8X8_UNORM] = { NV50TCL_RT_FORMAT_X8R8G8B8_UNORM,
+    A_(C2, C1, C0, ONE, UNORM, UNORM, UNORM, UNORM, 8_8_8_8, 1),
+    VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET | SCANOUT },
+
+   [PIPE_FORMAT_B8G8R8A8_SRGB] = { NV50TCL_RT_FORMAT_A8R8G8B8_SRGB,
+    A_(C2, C1, C0, C3, UNORM, UNORM, UNORM, UNORM, 8_8_8_8, 1),
+    VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET },
+
+   [PIPE_FORMAT_B8G8R8X8_SRGB] = { NV50TCL_RT_FORMAT_X8R8G8B8_SRGB,
+    A_(C2, C1, C0, ONE, UNORM, UNORM, UNORM, UNORM, 8_8_8_8, 1),
+    VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET },
+
+   [PIPE_FORMAT_B5G6R5_UNORM] = { NV50TCL_RT_FORMAT_R5G6B5_UNORM,
+    B_(C2, C1, C0, ONE, UNORM, UNORM, UNORM, UNORM, 5_6_5, 1),
+    SAMPLER_VIEW | RENDER_TARGET | SCANOUT },
+
+   [PIPE_FORMAT_B5G5R5A1_UNORM] = { NV50TCL_RT_FORMAT_A1R5G5B5_UNORM,
+    B_(C2, C1, C0, C3, UNORM, UNORM, UNORM, UNORM, 1_5_5_5, 1),
+    SAMPLER_VIEW | RENDER_TARGET | SCANOUT },
+
+   [PIPE_FORMAT_B4G4R4A4_UNORM] = { 0,
+    B_(C2, C1, C0, C3, UNORM, UNORM, UNORM, UNORM, 4_4_4_4, 1),
+    SAMPLER_VIEW },
+
+   [PIPE_FORMAT_R10G10B10A2_UNORM] = { NV50TCL_RT_FORMAT_A2B10G10R10_UNORM,
+    A_(C0, C1, C2, C3, UNORM, UNORM, UNORM, UNORM, 2_10_10_10, 1),
+    SAMPLER_VIEW | RENDER_TARGET | SCANOUT },
+
+   [PIPE_FORMAT_B10G10R10A2_UNORM] = { NV50TCL_RT_FORMAT_A2R10G10B10_UNORM,
+    A_(C2, C1, C0, C3, UNORM, UNORM, UNORM, UNORM, 2_10_10_10, 1),
+    SAMPLER_VIEW | RENDER_TARGET },
+
+   /* DEPTH/STENCIL FORMATS */
+
+   [PIPE_FORMAT_Z16_UNORM] = { NV50TCL_ZETA_FORMAT_Z16_UNORM,
+    B_(C0, C0, C0, ONE, UNORM, UINT, UINT, UINT, 16_DEPTH, 0),
+    SAMPLER_VIEW | DEPTH_STENCIL },
+
+   [PIPE_FORMAT_Z24_UNORM_S8_USCALED] = { NV50TCL_ZETA_FORMAT_S8Z24_UNORM,
+    B_(C0, C0, C0, ONE, UNORM, UINT, UINT, UINT, 8_24, 0),
+    SAMPLER_VIEW | DEPTH_STENCIL },
+
+   [PIPE_FORMAT_Z24X8_UNORM] = { NV50TCL_ZETA_FORMAT_X8Z24_UNORM,
+    B_(C0, C0, C0, ONE, UNORM, UINT, UINT, UINT, 8_24, 0),
+    SAMPLER_VIEW | DEPTH_STENCIL },
+
+   [PIPE_FORMAT_S8_USCALED_Z24_UNORM] = { NV50TCL_ZETA_FORMAT_Z24S8_UNORM,
+    B_(C1, C1, C1, ONE, UINT, UNORM, UINT, UINT, 24_8, 0),
+    SAMPLER_VIEW | DEPTH_STENCIL },
+
+   [PIPE_FORMAT_Z32_FLOAT] = { NV50TCL_ZETA_FORMAT_Z32_FLOAT,
+    B_(C0, C0, C0, ONE, FLOAT, UINT, UINT, UINT, 32_DEPTH, 0),
+    SAMPLER_VIEW | DEPTH_STENCIL },
+
+   [PIPE_FORMAT_Z32_FLOAT_S8X24_USCALED] = {
+    NV50TCL_ZETA_FORMAT_Z32_FLOAT_X24S8_UNORM,
+    B_(C0, C0, C0, ONE, FLOAT, UINT, UINT, UINT, 32_8, 0),
+    SAMPLER_VIEW | DEPTH_STENCIL },
+
+   /* LUMINANCE, ALPHA, INTENSITY */
+
+   [PIPE_FORMAT_L8_UNORM] = { 0,
+    A_(C0, C0, C0, ONE, UNORM, UNORM, UNORM, UNORM, 8, 0),
+    SAMPLER_VIEW },
+
+   [PIPE_FORMAT_L8_SRGB] = { 0,
+    A_(C0, C0, C0, ONE, UNORM, UNORM, UNORM, UNORM, 8, 0),
+    SAMPLER_VIEW },
+
+   [PIPE_FORMAT_I8_UNORM] = { 0,
+    A_(C0, C0, C0, C0, UNORM, UNORM, UNORM, UNORM, 8, 0),
+    SAMPLER_VIEW },
+
+   [PIPE_FORMAT_A8_UNORM] = { NV50TCL_RT_FORMAT_A8_UNORM,
+    A_(ZERO, ZERO, ZERO, C0, UNORM, UNORM, UNORM, UNORM, 8, 0),
+    SAMPLER_VIEW | RENDER_TARGET },
+
+   [PIPE_FORMAT_L8A8_UNORM] = { 0,
+    A_(C0, C0, C0, C1, UNORM, UNORM, UNORM, UNORM, 8_8, 0),
+    SAMPLER_VIEW },
+
+   [PIPE_FORMAT_L8A8_SRGB] = { 0,
+    A_(C0, C0, C0, C1, UNORM, UNORM, UNORM, UNORM, 8_8, 0),
+    SAMPLER_VIEW },
+
+   /* DXT, RGTC */
+
+   [PIPE_FORMAT_DXT1_RGB] = { 0,
+    B_(C0, C1, C2, ONE, UNORM, UNORM, UNORM, UNORM, DXT1, 0),
+    SAMPLER_VIEW },
+
+   [PIPE_FORMAT_DXT1_RGBA] = { 0,
+    B_(C0, C1, C2, C3, UNORM, UNORM, UNORM, UNORM, DXT1, 0),
+    SAMPLER_VIEW },
+
+   [PIPE_FORMAT_DXT3_RGBA] = { 0,
+    B_(C0, C1, C2, C3, UNORM, UNORM, UNORM, UNORM, DXT3, 0),
+    SAMPLER_VIEW },
+
+   [PIPE_FORMAT_DXT5_RGBA] = { 0,
+    B_(C0, C1, C2, C3, UNORM, UNORM, UNORM, UNORM, DXT5, 0),
+    SAMPLER_VIEW },
+
+   [PIPE_FORMAT_RGTC1_UNORM] = { 0,
+    B_(C0, ZERO, ZERO, ONE, UNORM, UNORM, UNORM, UNORM, RGTC1, 0),
+    SAMPLER_VIEW },
+
+   [PIPE_FORMAT_RGTC2_UNORM] = { 0,
+    B_(C0, C1, ZERO, ONE, UNORM, UNORM, UNORM, UNORM, RGTC2, 0),
+    SAMPLER_VIEW },
+
+   [PIPE_FORMAT_RGTC2_SNORM] = { 0,
+    B_(C0, C1, ZERO, ONE, SNORM, SNORM, SNORM, SNORM, RGTC2, 0),
+    SAMPLER_VIEW },
+
+   /* FLOAT 16 */
+
+   [PIPE_FORMAT_R16G16B16A16_FLOAT] = { NV50TCL_RT_FORMAT_R16G16B16A16_FLOAT,
+    A_(C0, C1, C2, C3, FLOAT, FLOAT, FLOAT, FLOAT, 16_16_16_16, 0),
+    VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET },
+
+   [PIPE_FORMAT_R16G16B16_FLOAT] = { NV50TCL_RT_FORMAT_R16G16B16X16_FLOAT,
+    A_(C0, C1, C2, ONE, FLOAT, FLOAT, FLOAT, FLOAT, 16_16_16, 0),
+    VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET },
+
+   [PIPE_FORMAT_R16G16_FLOAT] = { NV50TCL_RT_FORMAT_R16G16_FLOAT,
+    A_(C0, C1, ZERO, ONE, FLOAT, FLOAT, FLOAT, FLOAT, 16_16, 0),
+    VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET },
+
+   [PIPE_FORMAT_R16_FLOAT] = { NV50TCL_RT_FORMAT_R16_FLOAT,
+    A_(C0, ZERO, ZERO, ONE, FLOAT, FLOAT, FLOAT, FLOAT, 16, 0),
+    VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET },
+
+   /* FLOAT 32 */
+
+   [PIPE_FORMAT_R32G32B32A32_FLOAT] = { NV50TCL_RT_FORMAT_R32G32B32A32_FLOAT,
+    A_(C0, C1, C2, C3, FLOAT, FLOAT, FLOAT, FLOAT, 32_32_32_32, 0),
+    VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET },
+
+   [PIPE_FORMAT_R32G32B32_FLOAT] = { NV50TCL_RT_FORMAT_R32G32B32X32_FLOAT,
+    A_(C0, C1, C2, ONE, FLOAT, FLOAT, FLOAT, FLOAT, 32_32_32, 0),
+    VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET },
+
+   [PIPE_FORMAT_R32G32_FLOAT] = { NV50TCL_RT_FORMAT_R32G32_FLOAT,
+    A_(C0, C1, ZERO, ONE, FLOAT, FLOAT, FLOAT, FLOAT, 32_32, 0),
+    VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET },
+
+   [PIPE_FORMAT_R32_FLOAT] = { NV50TCL_RT_FORMAT_R32_FLOAT,
+    A_(C0, ZERO, ZERO, ONE, FLOAT, FLOAT, FLOAT, FLOAT, 32, 0),
+    VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET },
+
+   /* ODD FORMATS */
+
+   [PIPE_FORMAT_R11G11B10_FLOAT] = { NV50TCL_RT_FORMAT_B10G11R11_FLOAT,
+    B_(C0, C1, C2, ONE, FLOAT, FLOAT, FLOAT, FLOAT, 10_11_11, 0),
+    SAMPLER_VIEW | RENDER_TARGET },
+
+   [PIPE_FORMAT_R9G9B9E5_FLOAT] = { 0,
+    B_(C0, C1, C2, ONE, FLOAT, FLOAT, FLOAT, FLOAT, 5_9_9_9, 0),
+    SAMPLER_VIEW },
+
+   /* SNORM 32 */
+
+   [PIPE_FORMAT_R32G32B32A32_SNORM] = { 0,
+    A_(C0, C1, C2, C3, SNORM, SNORM, SNORM, SNORM, 32_32_32_32, 0),
+    VERTEX_BUFFER | SAMPLER_VIEW },
+
+   [PIPE_FORMAT_R32G32B32_SNORM] = { 0,
+    A_(C0, C1, C2, ONE, SNORM, SNORM, SNORM, SNORM, 32_32_32, 0),
+    VERTEX_BUFFER | SAMPLER_VIEW },
+
+   [PIPE_FORMAT_R32G32_SNORM] = { 0,
+    A_(C0, C1, ZERO, ONE, SNORM, SNORM, SNORM, SNORM, 32_32, 0),
+    VERTEX_BUFFER | SAMPLER_VIEW },
+
+   [PIPE_FORMAT_R32_SNORM] = { 0,
+    A_(C0, ZERO, ZERO, ONE, SNORM, SNORM, SNORM, SNORM, 32, 0),
+    VERTEX_BUFFER | SAMPLER_VIEW },
+
+   /* UNORM 32 */
+
+   [PIPE_FORMAT_R32G32B32A32_UNORM] = { 0,
+    A_(C0, C1, C2, C3, UNORM, UNORM, UNORM, UNORM, 32_32_32_32, 0),
+    VERTEX_BUFFER | SAMPLER_VIEW },
+
+   [PIPE_FORMAT_R32G32B32_UNORM] = { 0,
+    A_(C0, C1, C2, ONE, UNORM, UNORM, UNORM, UNORM, 32_32_32, 0),
+    VERTEX_BUFFER | SAMPLER_VIEW },
+
+   [PIPE_FORMAT_R32G32_UNORM] = { 0,
+    A_(C0, C1, ZERO, ONE, UNORM, UNORM, UNORM, UNORM, 32_32, 0),
+    VERTEX_BUFFER | SAMPLER_VIEW },
+
+   [PIPE_FORMAT_R32_UNORM] = { 0,
+    A_(C0, ZERO, ZERO, ONE, UNORM, UNORM, UNORM, UNORM, 32, 0),
+    VERTEX_BUFFER | SAMPLER_VIEW },
+
+   /* SNORM 16 */
+
+   [PIPE_FORMAT_R16G16B16A16_SNORM] = { NV50TCL_RT_FORMAT_R16G16B16A16_SNORM,
+    A_(C0, C1, C2, C3, SNORM, SNORM, SNORM, SNORM, 16_16_16_16, 0),
+    VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET },
+
+   [PIPE_FORMAT_R16G16B16_SNORM] = { 0,
+    A_(C0, C1, C2, ONE, SNORM, SNORM, SNORM, SNORM, 16_16_16, 0),
+    VERTEX_BUFFER | SAMPLER_VIEW },
+
+   [PIPE_FORMAT_R16G16_SNORM] = { NV50TCL_RT_FORMAT_R16G16_SNORM,
+    A_(C0, C1, ZERO, ONE, SNORM, SNORM, SNORM, SNORM, 16_16, 0),
+    VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET },
+
+   [PIPE_FORMAT_R16_SNORM] = { 0,
+    A_(C0, ZERO, ZERO, ONE, SNORM, SNORM, SNORM, SNORM, 16, 0),
+    VERTEX_BUFFER | SAMPLER_VIEW },
+
+   /* UNORM 16 */
+
+   [PIPE_FORMAT_R16G16B16A16_UNORM] = { NV50TCL_RT_FORMAT_R16G16B16A16_UNORM,
+    A_(C0, C1, C2, C3, UNORM, UNORM, UNORM, UNORM, 16_16_16_16, 0),
+    VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET },
+
+   [PIPE_FORMAT_R16G16B16_UNORM] = { 0,
+    A_(C0, C1, C2, ONE, UNORM, UNORM, UNORM, UNORM, 16_16_16, 0),
+    VERTEX_BUFFER | SAMPLER_VIEW },
+
+   [PIPE_FORMAT_R16G16_UNORM] = { NV50TCL_RT_FORMAT_R16G16_UNORM,
+    A_(C0, C1, ZERO, ONE, UNORM, UNORM, UNORM, UNORM, 16_16, 0),
+    VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET },
+
+   [PIPE_FORMAT_R16_UNORM] = { 0,
+    A_(C0, ZERO, ZERO, ONE, UNORM, UNORM, UNORM, UNORM, 16, 0),
+    VERTEX_BUFFER | SAMPLER_VIEW },
+
+   /* SNORM 8 */
+
+   [PIPE_FORMAT_R8G8B8A8_SNORM] = { NV50TCL_RT_FORMAT_A8B8G8R8_SNORM,
+    A_(C0, C1, C2, C3, SNORM, SNORM, SNORM, SNORM, 8_8_8_8, 0),
+    VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET },
+
+   [PIPE_FORMAT_R8G8B8_SNORM] = { 0,
+    A_(C0, C1, C2, ONE, SNORM, SNORM, SNORM, SNORM, 8_8_8, 0),
+    VERTEX_BUFFER | SAMPLER_VIEW },
+
+   [PIPE_FORMAT_R8G8_SNORM] = { NV50TCL_RT_FORMAT_R8G8_SNORM,
+    A_(C0, C1, ZERO, ONE, SNORM, SNORM, SNORM, SNORM, 8_8, 0),
+    VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET },
+
+   [PIPE_FORMAT_R8_SNORM] = { NV50TCL_RT_FORMAT_R8_SNORM,
+    A_(C0, ZERO, ZERO, ONE, SNORM, SNORM, SNORM, SNORM, 8, 0),
+    VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET },
+
+   /* UNORM 8 */
+
+   [PIPE_FORMAT_R8G8B8A8_UNORM] = { NV50TCL_RT_FORMAT_A8B8G8R8_UNORM,
+    A_(C0, C1, C2, C3, UNORM, UNORM, UNORM, UNORM, 8_8_8_8, 0),
+    VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET },
+
+   [PIPE_FORMAT_R8G8B8A8_SRGB] = { NV50TCL_RT_FORMAT_A8B8G8R8_SRGB,
+    A_(C0, C1, C2, C3, UNORM, UNORM, UNORM, UNORM, 8_8_8_8, 0),
+    SAMPLER_VIEW | RENDER_TARGET },
+
+   [PIPE_FORMAT_R8G8B8_UNORM] = { NV50TCL_RT_FORMAT_X8B8G8R8_UNORM,
+    A_(C0, C1, C2, ONE, UNORM, UNORM, UNORM, UNORM, 8_8_8, 0),
+    VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET },
+
+   [PIPE_FORMAT_R8G8B8_SRGB] = { NV50TCL_RT_FORMAT_X8B8G8R8_SRGB,
+    A_(C0, C1, C2, ONE, UNORM, UNORM, UNORM, UNORM, 8_8_8, 0),
+    SAMPLER_VIEW | RENDER_TARGET },
+
+   [PIPE_FORMAT_R8G8_UNORM] = { NV50TCL_RT_FORMAT_R8G8_UNORM,
+    A_(C0, C1, ZERO, ONE, UNORM, UNORM, UNORM, UNORM, 8_8, 0),
+    VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET },
+
+   [PIPE_FORMAT_R8_UNORM] = { NV50TCL_RT_FORMAT_R8_UNORM,
+    A_(C0, ZERO, ZERO, ONE, UNORM, UNORM, UNORM, UNORM, 8, 0),
+    VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET },
+
+   /* SSCALED 32 */
+
+   [PIPE_FORMAT_R32G32B32A32_SSCALED] = { 0,
+    A_(C0, C1, C2, C3, SSCALED, SSCALED, SSCALED, SSCALED, 32_32_32_32, 0),
+    VERTEX_BUFFER | SAMPLER_VIEW },
+
+   [PIPE_FORMAT_R32G32B32_SSCALED] = { 0,
+    A_(C0, C1, C2, ONE, SSCALED, SSCALED, SSCALED, SSCALED, 32_32_32, 0),
+    VERTEX_BUFFER | SAMPLER_VIEW },
+
+   [PIPE_FORMAT_R32G32_SSCALED] = { 0,
+    A_(C0, C1, ZERO, ONE, SSCALED, SSCALED, SSCALED, SSCALED, 32_32, 0),
+    VERTEX_BUFFER | SAMPLER_VIEW },
+
+   [PIPE_FORMAT_R32_SSCALED] = { 0,
+    A_(C0, ZERO, ZERO, ONE, SSCALED, SSCALED, SSCALED, SSCALED, 32, 0),
+    VERTEX_BUFFER | SAMPLER_VIEW },
+
+   /* USCALED 32 */
+
+   [PIPE_FORMAT_R32G32B32A32_USCALED] = { 0,
+    A_(C0, C1, C2, C3, USCALED, USCALED, USCALED, USCALED, 32_32_32_32, 0),
+    VERTEX_BUFFER | SAMPLER_VIEW },
+
+   [PIPE_FORMAT_R32G32B32_USCALED] = { 0,
+    A_(C0, C1, C2, ONE, USCALED, USCALED, USCALED, USCALED, 32_32_32, 0),
+    VERTEX_BUFFER | SAMPLER_VIEW },
+
+   [PIPE_FORMAT_R32G32_USCALED] = { 0,
+    A_(C0, C1, ZERO, ONE, USCALED, USCALED, USCALED, USCALED, 32_32, 0),
+    VERTEX_BUFFER | SAMPLER_VIEW },
+
+   [PIPE_FORMAT_R32_USCALED] = { 0,
+    A_(C0, ZERO, ZERO, ONE, USCALED, USCALED, USCALED, USCALED, 32, 0),
+    VERTEX_BUFFER | SAMPLER_VIEW },
+
+   /* SSCALED 16 */
+
+   [PIPE_FORMAT_R16G16B16A16_SSCALED] = { 0,
+    A_(C0, C1, C2, C3, SSCALED, SSCALED, SSCALED, SSCALED, 16_16_16_16, 0),
+    VERTEX_BUFFER | SAMPLER_VIEW },
+
+   [PIPE_FORMAT_R16G16B16_SSCALED] = { 0,
+    A_(C0, C1, C2, ONE, SSCALED, SSCALED, SSCALED, SSCALED, 16_16_16, 0),
+    VERTEX_BUFFER | SAMPLER_VIEW },
+
+   [PIPE_FORMAT_R16G16_SSCALED] = { 0,
+    A_(C0, C1, ZERO, ONE, SSCALED, SSCALED, SSCALED, SSCALED, 16_16, 0),
+    VERTEX_BUFFER | SAMPLER_VIEW },
+
+   [PIPE_FORMAT_R16_SSCALED] = { 0,
+    A_(C0, ZERO, ZERO, ONE, SSCALED, SSCALED, SSCALED, SSCALED, 16, 0),
+    VERTEX_BUFFER | SAMPLER_VIEW },
+
+   /* USCALED 16 */
+
+   [PIPE_FORMAT_R16G16B16A16_USCALED] = { 0,
+    A_(C0, C1, C2, C3, USCALED, USCALED, USCALED, USCALED, 16_16_16_16, 0),
+    VERTEX_BUFFER | SAMPLER_VIEW },
+
+   [PIPE_FORMAT_R16G16B16_USCALED] = { 0,
+    A_(C0, C1, C2, ONE, USCALED, USCALED, USCALED, USCALED, 16_16_16, 0),
+    VERTEX_BUFFER | SAMPLER_VIEW },
+
+   [PIPE_FORMAT_R16G16_USCALED] = { 0,
+    A_(C0, C1, ZERO, ONE, USCALED, USCALED, USCALED, USCALED, 16_16, 0),
+    VERTEX_BUFFER | SAMPLER_VIEW },
+
+   [PIPE_FORMAT_R16_USCALED] = { 0,
+    A_(C0, ZERO, ZERO, ONE, USCALED, USCALED, USCALED, USCALED, 16, 0),
+    VERTEX_BUFFER | SAMPLER_VIEW },
+
+   /* SSCALED 8 */
+
+   [PIPE_FORMAT_R8G8B8A8_SSCALED] = { 0,
+    A_(C0, C1, C2, C3, SSCALED, SSCALED, SSCALED, SSCALED, 8_8_8_8, 0),
+    VERTEX_BUFFER | SAMPLER_VIEW },
+
+   [PIPE_FORMAT_R8G8B8_SSCALED] = { 0,
+    A_(C0, C1, C2, ONE, SSCALED, SSCALED, SSCALED, SSCALED, 8_8_8, 0),
+    VERTEX_BUFFER | SAMPLER_VIEW },
+
+   [PIPE_FORMAT_R8G8_SSCALED] = { 0,
+    A_(C0, C1, ZERO, ONE, SSCALED, SSCALED, SSCALED, SSCALED, 8_8, 0),
+    VERTEX_BUFFER | SAMPLER_VIEW },
+
+   [PIPE_FORMAT_R8_SSCALED] = { 0,
+    A_(C0, ZERO, ZERO, ONE, SSCALED, SSCALED, SSCALED, SSCALED, 8, 0),
+    VERTEX_BUFFER | SAMPLER_VIEW },
+
+   /* USCALED 8 */
+
+   [PIPE_FORMAT_R8G8B8A8_USCALED] = { 0,
+    A_(C0, C1, C2, C3, USCALED, USCALED, USCALED, USCALED, 8_8_8_8, 0),
+    VERTEX_BUFFER | SAMPLER_VIEW },
+
+   [PIPE_FORMAT_R8G8B8_USCALED] = { 0,
+    A_(C0, C1, C2, ONE, USCALED, USCALED, USCALED, USCALED, 8_8_8, 0),
+    VERTEX_BUFFER | SAMPLER_VIEW },
+
+   [PIPE_FORMAT_R8G8_USCALED] = { 0,
+    A_(C0, C1, ZERO, ONE, USCALED, USCALED, USCALED, USCALED, 8_8, 0),
+    VERTEX_BUFFER | SAMPLER_VIEW },
+
+   [PIPE_FORMAT_R8_USCALED] = { 0,
+    A_(C0, ZERO, ZERO, ONE, USCALED, USCALED, USCALED, USCALED, 8, 0),
+    VERTEX_BUFFER | SAMPLER_VIEW },
+};
diff --git a/src/gallium/drivers/nv50/nv50_miptree.c b/src/gallium/drivers/nv50/nv50_miptree.c
index b7cd92158f..12b5ad106c 100644
--- a/src/gallium/drivers/nv50/nv50_miptree.c
+++ b/src/gallium/drivers/nv50/nv50_miptree.c
@@ -159,6 +159,9 @@ nv50_miptree_create(struct pipe_screen *pscreen, const struct pipe_resource *tmp
 	case PIPE_FORMAT_Z24_UNORM_S8_USCALED:
 		tile_flags = 0x2800;
 		break;
+	case PIPE_FORMAT_Z32_FLOAT_S8X24_USCALED:
+		tile_flags = 0xe000;
+		break;
 	case PIPE_FORMAT_R32G32B32A32_FLOAT:
 	case PIPE_FORMAT_R32G32B32_FLOAT:
 		tile_flags = 0x7400;
diff --git a/src/gallium/drivers/nv50/nv50_screen.c b/src/gallium/drivers/nv50/nv50_screen.c
index ca4b01b12b..e0c06c29ba 100644
--- a/src/gallium/drivers/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nv50/nv50_screen.c
@@ -34,75 +34,38 @@ nv50_screen_is_format_supported(struct pipe_screen *pscreen,
 				enum pipe_format format,
 				enum pipe_texture_target target,
 				unsigned sample_count,
-				unsigned tex_usage, unsigned geom_flags)
+				unsigned usage, unsigned geom_flags)
 {
 	if (sample_count > 1)
 		return FALSE;
 
-	if (tex_usage & PIPE_BIND_RENDER_TARGET) {
+	if (!util_format_s3tc_enabled) {
 		switch (format) {
-		case PIPE_FORMAT_B8G8R8X8_UNORM:
-		case PIPE_FORMAT_B8G8R8A8_UNORM:
-		case PIPE_FORMAT_B5G6R5_UNORM:
-		case PIPE_FORMAT_R16G16B16A16_SNORM:
-		case PIPE_FORMAT_R16G16B16A16_UNORM:
-		case PIPE_FORMAT_R32G32B32A32_FLOAT:
-		case PIPE_FORMAT_R16G16_SNORM:
-		case PIPE_FORMAT_R16G16_UNORM:
-			return TRUE;
-		default:
-			break;
-		}
-	} else
-	if (tex_usage & PIPE_BIND_DEPTH_STENCIL) {
-		switch (format) {
-		case PIPE_FORMAT_Z32_FLOAT:
-		case PIPE_FORMAT_S8_USCALED_Z24_UNORM:
-		case PIPE_FORMAT_Z24X8_UNORM:
-		case PIPE_FORMAT_Z24_UNORM_S8_USCALED:
-			return TRUE;
-		default:
-			break;
-		}
-	} else {
-		if (tex_usage & PIPE_BIND_SAMPLER_VIEW) {
-			switch (format) {
-			case PIPE_FORMAT_DXT1_RGB:
-			case PIPE_FORMAT_DXT1_RGBA:
-			case PIPE_FORMAT_DXT3_RGBA:
-			case PIPE_FORMAT_DXT5_RGBA:
-				return util_format_s3tc_enabled;
-			default:
-				break;
-			}
-		}
-		switch (format) {
-		case PIPE_FORMAT_B8G8R8A8_UNORM:
-		case PIPE_FORMAT_B8G8R8X8_UNORM:
-		case PIPE_FORMAT_B8G8R8A8_SRGB:
-		case PIPE_FORMAT_B8G8R8X8_SRGB:
-		case PIPE_FORMAT_B5G5R5A1_UNORM:
-		case PIPE_FORMAT_B4G4R4A4_UNORM:
-		case PIPE_FORMAT_B5G6R5_UNORM:
-		case PIPE_FORMAT_L8_UNORM:
-		case PIPE_FORMAT_A8_UNORM:
-		case PIPE_FORMAT_I8_UNORM:
-		case PIPE_FORMAT_L8A8_UNORM:
-		case PIPE_FORMAT_S8_USCALED_Z24_UNORM:
-		case PIPE_FORMAT_Z24_UNORM_S8_USCALED:
-		case PIPE_FORMAT_Z32_FLOAT:
-		case PIPE_FORMAT_R16G16B16A16_SNORM:
-		case PIPE_FORMAT_R16G16B16A16_UNORM:
-		case PIPE_FORMAT_R32G32B32A32_FLOAT:
-		case PIPE_FORMAT_R16G16_SNORM:
-		case PIPE_FORMAT_R16G16_UNORM:
-			return TRUE;
+		case PIPE_FORMAT_DXT1_RGB:
+		case PIPE_FORMAT_DXT1_RGBA:
+		case PIPE_FORMAT_DXT3_RGBA:
+		case PIPE_FORMAT_DXT5_RGBA:
+			return FALSE;
 		default:
 			break;
 		}
 	}
 
-	return FALSE;
+	switch (format) {
+	case PIPE_FORMAT_Z16_UNORM:
+		if ((nouveau_screen(pscreen)->device->chipset & 0xf0) != 0xa0)
+			return FALSE;
+		break;
+	default:
+		break;
+	}
+
+	/* transfers & shared are always supported */
+	usage &= ~(PIPE_BIND_TRANSFER_READ |
+		   PIPE_BIND_TRANSFER_WRITE |
+		   PIPE_BIND_SHARED);
+
+	return (nv50_format_table[format].usage & usage) == usage;
 }
 
 static int
diff --git a/src/gallium/drivers/nv50/nv50_screen.h b/src/gallium/drivers/nv50/nv50_screen.h
index fbf15a7596..a491ba31b2 100644
--- a/src/gallium/drivers/nv50/nv50_screen.h
+++ b/src/gallium/drivers/nv50/nv50_screen.h
@@ -38,4 +38,13 @@ nv50_screen(struct pipe_screen *screen)
 
 extern void nv50_screen_relocs(struct nv50_screen *);
 
+struct nv50_format { /* per-pipe_format hardware description, indexed by enum pipe_format */
+	uint32_t rt;    /* NV50TCL_RT_FORMAT_* or NV50TCL_ZETA_FORMAT_* value emitted by validate_fb; 0 in entries without one */
+	uint32_t tic;   /* NV50TIC_0_0_* map/type/format bits; copied into tic[0] by nv50_tex_construct */
+	uint32_t vtx;   /* NV50TCL_VERTEX_ARRAY_ATTRIB_* word; copied into cso->hw[] by nv50_vtxelt_construct */
+	uint32_t usage; /* mask of PIPE_BIND_* flags checked by nv50_screen_is_format_supported */
+};
+
+extern const struct nv50_format nv50_format_table[];
+
 #endif
diff --git a/src/gallium/drivers/nv50/nv50_state_validate.c b/src/gallium/drivers/nv50/nv50_state_validate.c
index 8d662d8f60..f1d8202dff 100644
--- a/src/gallium/drivers/nv50/nv50_state_validate.c
+++ b/src/gallium/drivers/nv50/nv50_state_validate.c
@@ -56,6 +56,8 @@ validate_fb(struct nv50_context *nv50)
 			assert(h == fb->cbufs[i]->height);
 		}
 
+		assert(nv50_format_table[fb->cbufs[i]->format].rt);
+
 		so_method(so, tesla, NV50TCL_RT_HORIZ(i), 2);
 		so_data  (so, fb->cbufs[i]->width);
 		so_data  (so, fb->cbufs[i]->height);
@@ -65,42 +67,9 @@ validate_fb(struct nv50_context *nv50)
 			      NOUVEAU_BO_HIGH | NOUVEAU_BO_RDWR, 0, 0);
 		so_reloc (so, bo, fb->cbufs[i]->offset, NOUVEAU_BO_VRAM |
 			      NOUVEAU_BO_LOW | NOUVEAU_BO_RDWR, 0, 0);
-		switch (fb->cbufs[i]->format) {
-		case PIPE_FORMAT_B8G8R8A8_UNORM:
-			so_data(so, NV50TCL_RT_FORMAT_A8R8G8B8_UNORM);
-			break;
-		case PIPE_FORMAT_B8G8R8X8_UNORM:
-			so_data(so, NV50TCL_RT_FORMAT_X8R8G8B8_UNORM);
-			break;
-		case PIPE_FORMAT_B5G6R5_UNORM:
-			so_data(so, NV50TCL_RT_FORMAT_R5G6B5_UNORM);
-			break;
-		case PIPE_FORMAT_R16G16B16A16_SNORM:
-			so_data(so, NV50TCL_RT_FORMAT_R16G16B16A16_SNORM);
-			break;
-		case PIPE_FORMAT_R16G16B16A16_UNORM:
-			so_data(so, NV50TCL_RT_FORMAT_R16G16B16A16_UNORM);
-			break;
-		case PIPE_FORMAT_R16G16B16A16_FLOAT:
-			so_data(so, NV50TCL_RT_FORMAT_R16G16B16A16_FLOAT);
-			break;
-		case PIPE_FORMAT_R32G32B32A32_FLOAT:
-			so_data(so, NV50TCL_RT_FORMAT_R32G32B32A32_FLOAT);
-			break;
-		case PIPE_FORMAT_R16G16_SNORM:
-			so_data(so, NV50TCL_RT_FORMAT_R16G16_SNORM);
-			break;
-		case PIPE_FORMAT_R16G16_UNORM:
-			so_data(so, NV50TCL_RT_FORMAT_R16G16_UNORM);
-			break;
-		default:
-			NOUVEAU_ERR("AIIII unknown format %s\n",
-			            util_format_name(fb->cbufs[i]->format));
-			so_data(so, NV50TCL_RT_FORMAT_X8R8G8B8_UNORM);
-			break;
-		}
-		so_data(so, nv50_miptree(pt)->
-				level[fb->cbufs[i]->level].tile_mode << 4);
+		so_data  (so, nv50_format_table[fb->cbufs[i]->format].rt);
+		so_data  (so, nv50_miptree(pt)->
+			      level[fb->cbufs[i]->level].tile_mode << 4);
 		so_data(so, 0x00000000);
 
 		so_method(so, tesla, NV50TCL_RT_ARRAY_MODE, 1);
@@ -120,39 +89,17 @@ validate_fb(struct nv50_context *nv50)
 			assert(h == fb->zsbuf->height);
 		}
 
+		assert(nv50_format_table[fb->zsbuf->format].rt);
+
 		so_method(so, tesla, NV50TCL_ZETA_ADDRESS_HIGH, 5);
 		so_reloc (so, bo, fb->zsbuf->offset, NOUVEAU_BO_VRAM |
 			      NOUVEAU_BO_HIGH | NOUVEAU_BO_RDWR, 0, 0);
 		so_reloc (so, bo, fb->zsbuf->offset, NOUVEAU_BO_VRAM |
 			      NOUVEAU_BO_LOW | NOUVEAU_BO_RDWR, 0, 0);
-		switch (fb->zsbuf->format) {
-		case PIPE_FORMAT_Z24_UNORM_S8_USCALED:
-			so_data(so, NV50TCL_ZETA_FORMAT_S8Z24_UNORM);
-			break;
-		case PIPE_FORMAT_Z24X8_UNORM:
-			so_data(so, NV50TCL_ZETA_FORMAT_X8Z24_UNORM);
-			break;
-		case PIPE_FORMAT_S8_USCALED_Z24_UNORM:
-			so_data(so, NV50TCL_ZETA_FORMAT_Z24S8_UNORM);
-			break;
-		case PIPE_FORMAT_Z32_FLOAT:
-			so_data(so, NV50TCL_ZETA_FORMAT_Z32_FLOAT);
-			break;
-		case PIPE_FORMAT_Z32_FLOAT_S8X24_USCALED:
-			so_data(so, NV50TCL_ZETA_FORMAT_Z32_FLOAT_X24S8_UNORM);
-			break;
-		case PIPE_FORMAT_Z16_UNORM:
-			so_data(so, NV50TCL_ZETA_FORMAT_Z16_UNORM);
-			break;
-		default:
-			NOUVEAU_ERR("AIIII unknown format %s\n",
-			            util_format_name(fb->zsbuf->format));
-			so_data(so, NV50TCL_ZETA_FORMAT_S8Z24_UNORM);
-			break;
-		}
-		so_data(so, nv50_miptree(pt)->
-				level[fb->zsbuf->level].tile_mode << 4);
-		so_data(so, 0x00000000);
+		so_data  (so, nv50_format_table[fb->zsbuf->format].rt);
+		so_data  (so, nv50_miptree(pt)->
+			      level[fb->zsbuf->level].tile_mode << 4);
+		so_data  (so, 0x00000000);
 
 		so_method(so, tesla, NV50TCL_ZETA_ENABLE, 1);
 		so_data  (so, 1);
diff --git a/src/gallium/drivers/nv50/nv50_tex.c b/src/gallium/drivers/nv50/nv50_tex.c
index 5ea0c1d726..5535818370 100644
--- a/src/gallium/drivers/nv50/nv50_tex.c
+++ b/src/gallium/drivers/nv50/nv50_tex.c
@@ -29,56 +29,6 @@
 
 #include "util/u_format.h"
 
-#define _MIXED(pf, t0, t1, t2, t3, cr, cg, cb, ca, f)		\
-[PIPE_FORMAT_##pf] = (						\
-	NV50TIC_0_0_MAPR_##cr | NV50TIC_0_0_TYPER_##t0 |	\
-	NV50TIC_0_0_MAPG_##cg | NV50TIC_0_0_TYPEG_##t1 |	\
-	NV50TIC_0_0_MAPB_##cb | NV50TIC_0_0_TYPEB_##t2 |	\
-	NV50TIC_0_0_MAPA_##ca | NV50TIC_0_0_TYPEA_##t3 |	\
-	NV50TIC_0_0_FMT_##f)
-
-#define _(pf, t, cr, cg, cb, ca, f) _MIXED(pf, t, t, t, t, cr, cg, cb, ca, f)
-
-static const uint32_t nv50_texture_formats[PIPE_FORMAT_COUNT] =
-{
-	_(B8G8R8A8_UNORM, UNORM, C2, C1, C0, C3,  8_8_8_8),
-	_(B8G8R8A8_SRGB,  UNORM, C2, C1, C0, C3,  8_8_8_8),
-	_(B8G8R8X8_UNORM, UNORM, C2, C1, C0, ONE, 8_8_8_8),
-	_(B8G8R8X8_SRGB,  UNORM, C2, C1, C0, ONE, 8_8_8_8),
-	_(B5G5R5A1_UNORM, UNORM, C2, C1, C0, C3,  1_5_5_5),
-	_(B4G4R4A4_UNORM, UNORM, C2, C1, C0, C3,  4_4_4_4),
-
-	_(B5G6R5_UNORM, UNORM, C2, C1, C0, ONE, 5_6_5),
-
-	_(L8_UNORM, UNORM, C0, C0, C0, ONE, 8),
-	_(L8_SRGB,  UNORM, C0, C0, C0, ONE, 8),
-	_(A8_UNORM, UNORM, ZERO, ZERO, ZERO, C0, 8),
-	_(I8_UNORM, UNORM, C0, C0, C0, C0, 8),
-
-	_(L8A8_UNORM, UNORM, C0, C0, C0, C1, 8_8),
-	_(L8A8_SRGB,  UNORM, C0, C0, C0, C1, 8_8),
-
-	_(DXT1_RGB, UNORM, C0, C1, C2, ONE, DXT1),
-	_(DXT1_RGBA, UNORM, C0, C1, C2, C3, DXT1),
-	_(DXT3_RGBA, UNORM, C0, C1, C2, C3, DXT3),
-	_(DXT5_RGBA, UNORM, C0, C1, C2, C3, DXT5),
-
-	_MIXED(S8_USCALED_Z24_UNORM, UINT, UNORM, UINT, UINT, C1, C1, C1, ONE, 24_8),
-	_MIXED(Z24_UNORM_S8_USCALED, UNORM, UINT, UINT, UINT, C0, C0, C0, ONE, 8_24),
-
-	_(R16G16B16A16_SNORM, UNORM, C0, C1, C2, C3, 16_16_16_16),
-	_(R16G16B16A16_UNORM, SNORM, C0, C1, C2, C3, 16_16_16_16),
-	_(R32G32B32A32_FLOAT, FLOAT, C0, C1, C2, C3, 32_32_32_32),
-
-	_(R16G16_SNORM, SNORM, C0, C1, ZERO, ONE, 16_16),
-	_(R16G16_UNORM, UNORM, C0, C1, ZERO, ONE, 16_16),
-
-	_MIXED(Z32_FLOAT, FLOAT, UINT, UINT, UINT, C0, C0, C0, ONE, 32_DEPTH)
-};
-
-#undef _
-#undef _MIXED
-
 static INLINE uint32_t
 nv50_tic_swizzle(uint32_t tc, unsigned swz)
 {
@@ -106,7 +56,7 @@ nv50_tex_construct(struct nv50_sampler_view *view)
 	struct nv50_miptree *mt = nv50_miptree(view->pipe.texture);
 	uint32_t swz[4], *tic = view->tic;
 
-	tic[0] = nv50_texture_formats[view->pipe.format];
+	tic[0] = nv50_format_table[view->pipe.format].tic;
 
 	swz[0] = nv50_tic_swizzle(tic[0], view->pipe.swizzle_r);
 	swz[1] = nv50_tic_swizzle(tic[0], view->pipe.swizzle_g);
diff --git a/src/gallium/drivers/nv50/nv50_texture.h b/src/gallium/drivers/nv50/nv50_texture.h
index 3475d3e432..b4939943e8 100644
--- a/src/gallium/drivers/nv50/nv50_texture.h
+++ b/src/gallium/drivers/nv50/nv50_texture.h
@@ -45,24 +45,32 @@
 #define NV50TIC_0_0_TYPEA_SNORM                                   0x00008000
 #define NV50TIC_0_0_TYPEA_SINT                                    0x00018000
 #define NV50TIC_0_0_TYPEA_UINT                                    0x00020000
+#define NV50TIC_0_0_TYPEA_SSCALED                                 0x00028000
+#define NV50TIC_0_0_TYPEA_USCALED                                 0x00030000
 #define NV50TIC_0_0_TYPEA_FLOAT                                   0x00038000
 #define NV50TIC_0_0_TYPEB_MASK                                    0x00007000
 #define NV50TIC_0_0_TYPEB_UNORM                                   0x00002000
 #define NV50TIC_0_0_TYPEB_SNORM                                   0x00001000
 #define NV50TIC_0_0_TYPEB_SINT                                    0x00003000
 #define NV50TIC_0_0_TYPEB_UINT                                    0x00004000
+#define NV50TIC_0_0_TYPEB_SSCALED                                 0x00005000
+#define NV50TIC_0_0_TYPEB_USCALED                                 0x00006000
 #define NV50TIC_0_0_TYPEB_FLOAT                                   0x00007000
 #define NV50TIC_0_0_TYPEG_MASK                                    0x00000e00
 #define NV50TIC_0_0_TYPEG_UNORM                                   0x00000400
 #define NV50TIC_0_0_TYPEG_SNORM                                   0x00000200
 #define NV50TIC_0_0_TYPEG_SINT                                    0x00000600
 #define NV50TIC_0_0_TYPEG_UINT                                    0x00000800
+#define NV50TIC_0_0_TYPEG_SSCALED                                 0x00000a00
+#define NV50TIC_0_0_TYPEG_USCALED                                 0x00000c00
 #define NV50TIC_0_0_TYPEG_FLOAT                                   0x00000e00
 #define NV50TIC_0_0_TYPER_MASK                                    0x000001c0
 #define NV50TIC_0_0_TYPER_UNORM                                   0x00000080
 #define NV50TIC_0_0_TYPER_SNORM                                   0x00000040
 #define NV50TIC_0_0_TYPER_SINT                                    0x000000c0
 #define NV50TIC_0_0_TYPER_UINT                                    0x00000100
+#define NV50TIC_0_0_TYPER_SSCALED                                 0x00000140
+#define NV50TIC_0_0_TYPER_USCALED                                 0x00000180
 #define NV50TIC_0_0_TYPER_FLOAT                                   0x000001c0
 #define NV50TIC_0_0_FMT_MASK                                      0x0000003f
 #define NV50TIC_0_0_FMT_32_32_32_32                               0x00000001
@@ -90,6 +98,7 @@
 #define NV50TIC_0_0_FMT_8_24                                      0x0000002a
 #define NV50TIC_0_0_FMT_32_DEPTH                                  0x0000002f
 #define NV50TIC_0_0_FMT_32_8                                      0x00000030
+#define NV50TIC_0_0_FMT_16_DEPTH                                  0x0000003a
 
 #define NV50TIC_0_1_OFFSET_LOW_MASK                               0xffffffff
 #define NV50TIC_0_1_OFFSET_LOW_SHIFT                                       0
diff --git a/src/gallium/drivers/nv50/nv50_vbo.c b/src/gallium/drivers/nv50/nv50_vbo.c
index 996844b18f..4fe0df5683 100644
--- a/src/gallium/drivers/nv50/nv50_vbo.c
+++ b/src/gallium/drivers/nv50/nv50_vbo.c
@@ -29,99 +29,6 @@
 #include "nv50_context.h"
 #include "nv50_resource.h"
 
-/* VERTEX_ARRAY_ATTRIB_TYPE is duplicated for unknown reason */
-#define NV50_VAT(x) ((x) | ((x) << 3))
-
-static INLINE uint32_t
-nv50_vbo_type_to_hw(enum pipe_format format)
-{
-	const struct util_format_description *desc;
-
-	desc = util_format_description(format);
-	assert(desc);
-
-	switch (desc->channel[0].type) {
-	case UTIL_FORMAT_TYPE_FLOAT:
-		return NV50_VAT(NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_FLOAT);
-	case UTIL_FORMAT_TYPE_UNSIGNED:
-		if (desc->channel[0].normalized) {
-			return NV50_VAT(NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_UNORM);
-		}
-		return NV50_VAT(NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_USCALED);
-	case UTIL_FORMAT_TYPE_SIGNED:
-		if (desc->channel[0].normalized) {
-			return NV50_VAT(NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_SNORM);
-		}
-		return NV50_VAT(NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_SSCALED);
-	/*
-	case PIPE_FORMAT_TYPE_UINT:
-		return NV50_VAT(NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_UINT);
-	case PIPE_FORMAT_TYPE_SINT:
-		return NV50_VAT(NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_SINT); */
-	default:
-		return 0;
-	}
-}
-
-static INLINE uint32_t
-nv50_vbo_size_to_hw(unsigned size, unsigned nr_c)
-{
-	static const uint32_t hw_values[] = {
-		0, 0, 0, 0,
-		NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_8,
-		NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_8_8,
-		NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_8_8_8,
-		NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_8_8_8_8,
-		NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_16,
-		NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_16_16,
-		NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_16_16_16,
-		NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_16_16_16_16,
-		0, 0, 0, 0,
-		NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_32,
-		NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_32_32,
-		NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_32_32_32,
-		NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_32_32_32_32 };
-
-	/* we'd also have R11G11B10 and R10G10B10A2 */
-
-	assert(nr_c > 0 && nr_c <= 4);
-
-	if (size > 32)
-		return 0;
-	size >>= (3 - 2);
-
-	return hw_values[size + (nr_c - 1)];
-}
-
-static INLINE uint32_t
-nv50_vbo_vtxelt_to_hw(struct pipe_vertex_element *ve)
-{
-	uint32_t hw_type, hw_size;
-	enum pipe_format pf = ve->src_format;
-	const struct util_format_description *desc;
-	unsigned size, nr_components;
-
-	desc = util_format_description(pf);
-	assert(desc);
-
-	size = util_format_get_component_bits(pf, UTIL_FORMAT_COLORSPACE_RGB, 0);
-	nr_components = util_format_get_nr_components(pf);
-
-	hw_type = nv50_vbo_type_to_hw(pf);
-	hw_size = nv50_vbo_size_to_hw(size, nr_components);
-
-	if (!hw_type || !hw_size) {
-		NOUVEAU_ERR("unsupported vbo format: %s\n", util_format_name(pf));
-		abort();
-		return 0x24e80000;
-	}
-
-	if (desc->swizzle[0] == UTIL_FORMAT_SWIZZLE_Z) /* BGRA */
-		hw_size |= (1 << 31); /* no real swizzle bits :-( */
-
-	return (hw_type | hw_size);
-}
-
 struct instance {
 	struct nouveau_bo *bo;
 	unsigned delta;
@@ -543,11 +450,8 @@ nv50_vtxelt_construct(struct nv50_vtxelt_stateobj *cso)
 {
 	unsigned i;
 
-	for (i = 0; i < cso->num_elements; ++i) {
-		struct pipe_vertex_element *ve = &cso->pipe[i];
-
-		cso->hw[i] = nv50_vbo_vtxelt_to_hw(ve);
-	}
+	for (i = 0; i < cso->num_elements; ++i)
+		cso->hw[i] = nv50_format_table[cso->pipe[i].src_format].vtx;
 }
 
 struct nouveau_stateobj *
-- 
cgit v1.2.3


From 1d1bb206122b719d6959eceddd511a0294816a9a Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Sat, 24 Jul 2010 21:17:21 +0200
Subject: nv50: don't produce MOV immediate to output reg in store opt

---
 src/gallium/drivers/nv50/nv50_pc_emit.c     | 12 ++++++------
 src/gallium/drivers/nv50/nv50_pc_optimize.c | 12 ++++++++----
 2 files changed, 14 insertions(+), 10 deletions(-)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_pc_emit.c b/src/gallium/drivers/nv50/nv50_pc_emit.c
index b917d23232..51304670a1 100644
--- a/src/gallium/drivers/nv50/nv50_pc_emit.c
+++ b/src/gallium/drivers/nv50/nv50_pc_emit.c
@@ -539,26 +539,26 @@ emit_mov(struct nv_pc *pc, struct nv_instruction *i)
       pc->emit[1] |= DREG(i->def[0])->id << 4;
    } else
    if (SFILE(i, 0) == NV_FILE_IMM) {
-      if (i->opcode == NV_OP_LDA)
+      if (i->opcode == NV_OP_LDA) {
          emit_ld(pc, i);
-      else {
+      } else {
          pc->emit[0] = 0x10008001;
          pc->emit[1] = 0x00000003;
 
-	 emit_form_IMM(pc, i, 0);
+         emit_form_IMM(pc, i, 0);
       }
    } else {
       pc->emit[0] = 0x10000000;
       pc->emit[0] |= DREG(i->def[0])->id << 2;
       pc->emit[0] |= SREG(i->src[0])->id << 9;
 
-      if (!i->is_long)
+      if (!i->is_long) {
          pc->emit[0] |= 0x8000;
-      else {
+      } else {
          pc->emit[0] |= 0x00000001;
          pc->emit[1] = 0x0403c000;
 
-	 set_pred(pc, i);
+         set_pred(pc, i);
       }
    }
 
diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c
index 0811420e42..f81384f00d 100644
--- a/src/gallium/drivers/nv50/nv50_pc_optimize.c
+++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c
@@ -266,11 +266,10 @@ nv_pass_fold_stores(struct nv_pass *ctx, struct nv_basic_block *b)
    int j;
 
    for (sti = b->entry; sti; sti = sti->next) {
-      if (!sti->def[0])
+      if (!sti->def[0] || sti->def[0]->reg.file != NV_FILE_OUT)
          continue;
 
-      if (sti->def[0]->reg.file != NV_FILE_OUT)
-         continue;
+      /* only handling MOV to $oX here */
       if (sti->opcode != NV_OP_MOV && sti->opcode != NV_OP_STA)
          continue;
 
@@ -282,8 +281,13 @@ nv_pass_fold_stores(struct nv_pass *ctx, struct nv_basic_block *b)
       if (nvi->def[0]->refc > 1)
          continue;
 
+      /* cannot MOV immediate to $oX */
+      if (nvi->src[0]->value->reg.file == NV_FILE_IMM)
+         continue;
+
       nvi->def[0] = sti->def[0];
-      nvi->fixed = 1;
+      sti->def[0] = NULL;
+      nvi->fixed = sti->fixed;
       sti->fixed = 0;
    }
    DESCEND_ARBITRARY(j, nv_pass_fold_stores);
-- 
cgit v1.2.3


From 4baaf1d4c32053a191d8718e46dab95d25f119a5 Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Sat, 24 Jul 2010 21:18:51 +0200
Subject: nv50: change back accidentally swapped UNORM,SNORM vertex type

---
 src/gallium/drivers/nouveau/nouveau_class.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'src')

diff --git a/src/gallium/drivers/nouveau/nouveau_class.h b/src/gallium/drivers/nouveau/nouveau_class.h
index 975fd8f35a..f44979e562 100644
--- a/src/gallium/drivers/nouveau/nouveau_class.h
+++ b/src/gallium/drivers/nouveau/nouveau_class.h
@@ -8949,8 +8949,8 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define   NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_SHIFT					25
 #define   NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_MASK						0x0e000000
 #define    NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_FLOAT					0x0e000000
-#define    NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_UNORM					0x02000000
-#define    NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_SNORM					0x04000000
+#define    NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_SNORM					0x02000000
+#define    NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_UNORM					0x04000000
 #define    NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_USCALED					0x0a000000
 #define    NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_SSCALED					0x0c000000
 #define    NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_UINT					0x08000000
@@ -9352,8 +9352,8 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define   NVC0TCL_VTX_ATTR_DEFINE_TYPE_SHIFT						16
 #define   NVC0TCL_VTX_ATTR_DEFINE_TYPE_MASK						0x000f0000
 #define    NVC0TCL_VTX_ATTR_DEFINE_TYPE_FLOAT						0x00070000
-#define    NVC0TCL_VTX_ATTR_DEFINE_TYPE_UNORM						0x00010000
-#define    NVC0TCL_VTX_ATTR_DEFINE_TYPE_SNORM						0x00020000
+#define    NVC0TCL_VTX_ATTR_DEFINE_TYPE_SNORM						0x00010000
+#define    NVC0TCL_VTX_ATTR_DEFINE_TYPE_UNORM						0x00020000
 #define    NVC0TCL_VTX_ATTR_DEFINE_TYPE_USCALED						0x00050000
 #define    NVC0TCL_VTX_ATTR_DEFINE_TYPE_SSCALED						0x00060000
 #define    NVC0TCL_VTX_ATTR_DEFINE_TYPE_UINT						0x00040000
@@ -9385,8 +9385,8 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define   NVC0TCL_VERTEX_ATTRIB_FORMAT_TYPE_SHIFT					27
 #define   NVC0TCL_VERTEX_ATTRIB_FORMAT_TYPE_MASK					0x78000000
 #define    NVC0TCL_VERTEX_ATTRIB_FORMAT_TYPE_FLOAT					0x38000000
-#define    NVC0TCL_VERTEX_ATTRIB_FORMAT_TYPE_UNORM					0x08000000
-#define    NVC0TCL_VERTEX_ATTRIB_FORMAT_TYPE_SNORM					0x10000000
+#define    NVC0TCL_VERTEX_ATTRIB_FORMAT_TYPE_SNORM					0x08000000
+#define    NVC0TCL_VERTEX_ATTRIB_FORMAT_TYPE_UNORM					0x10000000
 #define    NVC0TCL_VERTEX_ATTRIB_FORMAT_TYPE_USCALED					0x28000000
 #define    NVC0TCL_VERTEX_ATTRIB_FORMAT_TYPE_SSCALED					0x30000000
 #define    NVC0TCL_VERTEX_ATTRIB_FORMAT_TYPE_UINT					0x20000000
-- 
cgit v1.2.3


From bb9d634730b7e97050e50d9238764a99099fbc7f Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Sat, 24 Jul 2010 22:16:05 +0200
Subject: nv50: add/fix some license headers

---
 src/gallium/drivers/nv50/nv50_formats.c     | 21 ++++++++++++++++
 src/gallium/drivers/nv50/nv50_pc.c          | 21 ++++++++++++++++
 src/gallium/drivers/nv50/nv50_pc.h          | 37 ++++++++++++++++-------------
 src/gallium/drivers/nv50/nv50_pc_emit.c     | 37 ++++++++++++++++-------------
 src/gallium/drivers/nv50/nv50_pc_optimize.c | 21 ++++++++++++++++
 src/gallium/drivers/nv50/nv50_pc_print.c    | 21 ++++++++++++++++
 src/gallium/drivers/nv50/nv50_pc_regalloc.c | 22 +++++++++++++++++
 src/gallium/drivers/nv50/nv50_tgsi_to_nc.c  | 21 ++++++++++++++++
 8 files changed, 169 insertions(+), 32 deletions(-)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_formats.c b/src/gallium/drivers/nv50/nv50_formats.c
index 5b65cdaa02..433c74e611 100644
--- a/src/gallium/drivers/nv50/nv50_formats.c
+++ b/src/gallium/drivers/nv50/nv50_formats.c
@@ -1,3 +1,24 @@
+/*
+ * Copyright 2010 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+ * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
 
 #include "nv50_screen.h"
 #include "nv50_texture.h"
diff --git a/src/gallium/drivers/nv50/nv50_pc.c b/src/gallium/drivers/nv50/nv50_pc.c
index 8aba0a32b7..89dbc7aa20 100644
--- a/src/gallium/drivers/nv50/nv50_pc.c
+++ b/src/gallium/drivers/nv50/nv50_pc.c
@@ -1,3 +1,24 @@
+/*
+ * Copyright 2010 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+ * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
 
 #include "nv50_pc.h"
 #include "nv50_program.h"
diff --git a/src/gallium/drivers/nv50/nv50_pc.h b/src/gallium/drivers/nv50/nv50_pc.h
index 3ab48d0afd..3db300dabb 100644
--- a/src/gallium/drivers/nv50/nv50_pc.h
+++ b/src/gallium/drivers/nv50/nv50_pc.h
@@ -1,19 +1,24 @@
-/*************************************************************************/
-/* Copyright (C) 2010 I                                                  */
-/*                                                                       */
-/* This program is free software: you can redistribute it and/or modify  */
-/* it under the terms of the GNU General Public License as published by  */
-/* the Free Software Foundation, either version 3 of the License, or     */
-/* (at your option) any later version.                                   */
-/*                                                                       */
-/* This program is distributed in the hope that it will be useful,       */
-/* but WITHOUT ANY WARRANTY; without even the implied warranty of        */
-/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         */
-/* GNU General Public License for more details.                          */
-/*                                                                       */
-/* You should have received a copy of the GNU General Public License     */
-/* along with this program.  If not, see <http://www.gnu.org/licenses/>. */
-/*************************************************************************/
+/*
+ * Copyright 2010 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+ * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
 
 #ifndef __NV50_COMPILER_H__
 #define __NV50_COMPILER_H__
diff --git a/src/gallium/drivers/nv50/nv50_pc_emit.c b/src/gallium/drivers/nv50/nv50_pc_emit.c
index 51304670a1..728e2b145d 100644
--- a/src/gallium/drivers/nv50/nv50_pc_emit.c
+++ b/src/gallium/drivers/nv50/nv50_pc_emit.c
@@ -1,19 +1,24 @@
-/*************************************************************************/
-/* Copyright (C) 2009                                                    */
-/*                                                                       */
-/* This program is free software: you can redistribute it and/or modify  */
-/* it under the terms of the GNU General Public License as published by  */
-/* the Free Software Foundation, either version 3 of the License, or     */
-/* (at your option) any later version.                                   */
-/*                                                                       */
-/* This program is distributed in the hope that it will be useful,       */
-/* but WITHOUT ANY WARRANTY; without even the implied warranty of        */
-/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         */
-/* GNU General Public License for more details.                          */
-/*                                                                       */
-/* You should have received a copy of the GNU General Public License     */
-/* along with this program.  If not, see <http://www.gnu.org/licenses/>. */
-/*************************************************************************/
+/*
+ * Copyright 2010 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+ * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
 
 #include "nv50_context.h"
 #include "nv50_pc.h"
diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c
index f81384f00d..a514c59e6a 100644
--- a/src/gallium/drivers/nv50/nv50_pc_optimize.c
+++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c
@@ -1,3 +1,24 @@
+/*
+ * Copyright 2010 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+ * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
 
 #include "nv50_pc.h"
 
diff --git a/src/gallium/drivers/nv50/nv50_pc_print.c b/src/gallium/drivers/nv50/nv50_pc_print.c
index 09512ffb88..00b50b4edc 100644
--- a/src/gallium/drivers/nv50/nv50_pc_print.c
+++ b/src/gallium/drivers/nv50/nv50_pc_print.c
@@ -1,3 +1,24 @@
+/*
+ * Copyright 2010 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+ * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
 
 #include "nv50_context.h"
 #include "nv50_pc.h"
diff --git a/src/gallium/drivers/nv50/nv50_pc_regalloc.c b/src/gallium/drivers/nv50/nv50_pc_regalloc.c
index eb446d641a..3cec219d1a 100644
--- a/src/gallium/drivers/nv50/nv50_pc_regalloc.c
+++ b/src/gallium/drivers/nv50/nv50_pc_regalloc.c
@@ -1,3 +1,25 @@
+/*
+ * Copyright 2010 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+ * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 /*
  * XXX: phi function live intervals start at first ordinary instruction,
  *      add_range should be taking care of that already ...
diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
index aa15917774..5b69d520bc 100644
--- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
+++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
@@ -1,3 +1,24 @@
+/*
+ * Copyright 2010 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+ * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
 
 #include <unistd.h>
 
-- 
cgit v1.2.3


From 5811c6926450c4aafd2f9c87a2c6fe73b517f2c6 Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Sun, 25 Jul 2010 22:21:38 +0200
Subject: nv50: simple reload elimination and local CSE

---
 src/gallium/drivers/nv50/nv50_pc.c          |  18 +++
 src/gallium/drivers/nv50/nv50_pc.h          |  10 +-
 src/gallium/drivers/nv50/nv50_pc_optimize.c | 168 +++++++++++++++++++++++++---
 src/gallium/drivers/nv50/nv50_pc_print.c    |   6 +-
 4 files changed, 178 insertions(+), 24 deletions(-)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_pc.c b/src/gallium/drivers/nv50/nv50_pc.c
index 89dbc7aa20..e09f94074d 100644
--- a/src/gallium/drivers/nv50/nv50_pc.c
+++ b/src/gallium/drivers/nv50/nv50_pc.c
@@ -163,6 +163,24 @@ nv_nvi_refcount(struct nv_instruction *nvi)
    return rc;
 }
 
+/* Repoint every ref to old_val at new_val; returns how many were changed. */
+int
+nvcg_replace_value(struct nv_pc *pc, struct nv_value *old_val,
+                   struct nv_value *new_val)
+{
+   int idx, replaced = 0;
+
+   if (old_val == new_val)
+      return old_val->refc; /* nothing to rewrite */
+   for (idx = 0; idx < pc->num_refs; ++idx) {
+      if (pc->refs[idx]->value != old_val)
+         continue;
+      nv_reference(pc, &pc->refs[idx], new_val);
+      ++replaced;
+   }
+   return replaced;
+}
+
 static void
 nv_pc_free_refs(struct nv_pc *pc)
 {
diff --git a/src/gallium/drivers/nv50/nv50_pc.h b/src/gallium/drivers/nv50/nv50_pc.h
index 3db300dabb..ffcdaf44af 100644
--- a/src/gallium/drivers/nv50/nv50_pc.h
+++ b/src/gallium/drivers/nv50/nv50_pc.h
@@ -363,11 +363,11 @@ new_ref(struct nv_pc *pc, struct nv_value *val)
       const unsigned old_size = pc->num_refs * sizeof(struct nv_ref *);
       const unsigned new_size = (pc->num_refs + 64) * sizeof(struct nv_ref *);
 
-	   pc->refs = REALLOC(pc->refs, old_size, new_size);
+      pc->refs = REALLOC(pc->refs, old_size, new_size);
 
-	   ref = CALLOC(64, sizeof(struct nv_ref));
-	   for (i = 0; i < 64; ++i)
-		   pc->refs[pc->num_refs + i] = &ref[i];
+      ref = CALLOC(64, sizeof(struct nv_ref));
+      for (i = 0; i < 64; ++i)
+         pc->refs[pc->num_refs + i] = &ref[i];
    }
 
    ref = pc->refs[pc->num_refs++];
@@ -426,6 +426,8 @@ int nv_nvi_refcount(struct nv_instruction *);
 void nv_nvi_delete(struct nv_instruction *);
 void nv_nvi_permute(struct nv_instruction *, struct nv_instruction *);
 void nvbb_attach_block(struct nv_basic_block *parent, struct nv_basic_block *);
+int nvcg_replace_value(struct nv_pc *pc, struct nv_value *old_val,
+                       struct nv_value *new_val);
 
 int nv_pc_exec_pass0(struct nv_pc *pc);
 int nv_pc_exec_pass1(struct nv_pc *pc);
diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c
index a514c59e6a..0018131fb5 100644
--- a/src/gallium/drivers/nv50/nv50_pc_optimize.c
+++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c
@@ -570,31 +570,99 @@ nv_pass_lower_cond(struct nv_pass *ctx, struct nv_basic_block *b)
 }
 #endif
 
-/* TODO: reload elimination, redundant store elimination */
+/* TODO: redundant store elimination */
 
-struct nv_pass_reldelim {
+struct load_record {
+   struct load_record *next; /* next record in the same list */
+   uint64_t data;            /* key: source reg.id, or the immediate's u32 */
+   struct nv_value *value;   /* def[0] of the load that created the record */
+};
+
+#define LOAD_RECORD_POOL_SIZE 1024
+
+struct nv_pass_reld_elim {
    struct nv_pc *pc;
+   /* list heads, selected by the opcode and register file of the source */
+   struct load_record *imm;
+   struct load_record *mem_s;
+   struct load_record *mem_v;
+   struct load_record *mem_c[16];
+   struct load_record *mem_l;
+   /* record storage; alloc is the number of pool entries in use */
+   struct load_record pool[LOAD_RECORD_POOL_SIZE];
+   int alloc;
 };
 
 static int
-nv_pass_reload_elim(struct nv_pass_reldelim *ctx, struct nv_basic_block *b)
+nv_pass_reload_elim(struct nv_pass_reld_elim *ctx, struct nv_basic_block *b)
 {
-   int j;
+   struct load_record **rec, *it;
    struct nv_instruction *ld, *next;
+   uint64_t data;
+   struct nv_value *val;
+   int j;
 
    for (ld = b->entry; ld; ld = next) {
       next = ld->next;
+      if (!ld->src[0])
+         continue;
+      val = ld->src[0]->value;
+      rec = NULL;
 
       if (ld->opcode == NV_OP_LINTERP || ld->opcode == NV_OP_PINTERP) {
-
+         data = val->reg.id;
+         rec = &ctx->mem_v;
       } else
       if (ld->opcode == NV_OP_LDA) {
-         
+         data = val->reg.id;
+         if (val->reg.file >= NV_FILE_MEM_C(0) &&
+             val->reg.file <= NV_FILE_MEM_C(15))
+            rec = &ctx->mem_c[val->reg.file - NV_FILE_MEM_C(0)];
+         else
+         if (val->reg.file == NV_FILE_MEM_S)
+            rec = &ctx->mem_s;
+         else
+         if (val->reg.file == NV_FILE_MEM_L)
+            rec = &ctx->mem_l;
       } else
-      if (ld->opcode == NV_OP_MOV) {
-         
+      if ((ld->opcode == NV_OP_MOV) && (val->reg.file == NV_FILE_IMM)) {
+         data = val->reg.imm.u32;
+         rec = &ctx->imm;
+      }
+      /* reg.id alone cannot key an indirect (src[4]) or predicated load */
+      if (!rec || ld->src[4] || ld->flags_src || !ld->def[0]->refc)
+         continue;
+      /* look for an earlier load with the same key in this block */
+      for (it = *rec; it; it = it->next)
+         if (it->data == data)
+            break;
+
+      if (it) {
+#if 1
+         nvcg_replace_value(ctx->pc, ld->def[0], it->value);
+#else
+         ld->opcode = NV_OP_MOV;
+         nv_reference(ctx->pc, &ld->src[0], it->value);
+#endif
+      } else {
+         if (ctx->alloc == LOAD_RECORD_POOL_SIZE)
+            continue;
+         it = &ctx->pool[ctx->alloc++];
+         it->next = *rec;
+         it->data = data;
+         it->value = ld->def[0];
+         *rec = it;
+      }
+   }
+   /* records are only valid inside one block: reset before descending */
+   ctx->imm = NULL;
+   ctx->mem_s = NULL;
+   ctx->mem_v = NULL;
+   for (j = 0; j < 16; ++j)
+      ctx->mem_c[j] = NULL;
+   ctx->mem_l = NULL;
+   ctx->alloc = 0;
+
    DESCEND_ARBITRARY(j, nv_pass_reload_elim);
 
    return 0;
@@ -678,23 +746,74 @@ nv_pass_flatten(struct nv_pass *ctx, struct nv_basic_block *b)
    return 0;
 }
 
+/* Local (per basic block) common subexpression elimination, O(n^2) scan. */
+static int
+nv_pass_cse(struct nv_pass *ctx, struct nv_basic_block *b)
+{
+   struct nv_instruction *ir, *ik, *next;
+   struct nv_instruction *entry = b->phi ? b->phi : b->entry;
+   int s;
+   unsigned int reps;
+   /* repeat until a full scan of the block merges nothing */
+   do {
+      reps = 0;
+      for (ir = entry; ir; ir = next) {
+         next = ir->next;
+         for (ik = entry; ik != ir; ik = ik->next) {
+            if (ir->opcode != ik->opcode || !ir->def[0] || !ik->def[0])
+               continue; /* only ops defining a value can be merged */
+
+            if (ik->opcode == NV_OP_LDA ||
+                ik->opcode == NV_OP_STA ||
+                ik->opcode == NV_OP_MOV ||
+                nv_is_vector_op(ik->opcode))
+               continue; /* ignore loads, stores & moves */
+
+            if (ik->src[4] || ir->src[4] || ik->flags_src || ir->flags_src)
+               continue; /* don't mess with address regs or predicates */
+
+            for (s = 0; s < 3; ++s) {
+               struct nv_value *va, *vb;
+
+               if (!ik->src[s] || !ir->src[s]) {
+                  if (ik->src[s] != ir->src[s])
+                     break; /* one op has a source the other lacks */
+                  continue;
+               }
+               if (ik->src[s]->mod != ir->src[s]->mod)
+                  break;
+               va = ik->src[s]->value;
+               vb = ir->src[s]->value;
+               if (va == vb)
+                  continue;
+               if (va->reg.file != vb->reg.file ||
+                   va->reg.id < 0 ||
+                   va->reg.id != vb->reg.id)
+                  break;
+            }
+            if (s == 3) {
+               nv_nvi_delete(ir);
+               ++reps;
+               nvcg_replace_value(ctx->pc, ir->def[0], ik->def[0]);
+               break;
+            }
+         }
+      }
+   } while(reps);
+
+   DESCEND_ARBITRARY(s, nv_pass_cse);
+
+   return 0;
+}
+
 int
 nv_pc_exec_pass0(struct nv_pc *pc)
 {
-   struct nv_pass_reldelim *reldelim;
+   struct nv_pass_reld_elim *reldelim;
    struct nv_pass pass;
    struct nv_pass_dce dce;
    int ret;
 
-   reldelim = CALLOC_STRUCT(nv_pass_reldelim);
-   reldelim->pc = pc;
-
-   ret = nv_pass_reload_elim(reldelim, pc->root);
-
-   FREE(reldelim);
-   if (ret)
-      return ret;
-
    pass.pc = pc;
 
    pc->pass_seq++;
@@ -720,6 +839,19 @@ nv_pc_exec_pass0(struct nv_pc *pc)
    if (ret)
       return ret;
 
+   reldelim = CALLOC_STRUCT(nv_pass_reld_elim);
+   reldelim->pc = pc;
+   pc->pass_seq++;
+   ret = nv_pass_reload_elim(reldelim, pc->root);
+   FREE(reldelim);
+   if (ret)
+      return ret;
+
+   pc->pass_seq++;
+   ret = nv_pass_cse(&pass, pc->root);
+   if (ret)
+      return ret;
+
    pc->pass_seq++;
    ret = nv_pass_lower_mods(&pass, pc->root);
    if (ret)
diff --git a/src/gallium/drivers/nv50/nv50_pc_print.c b/src/gallium/drivers/nv50/nv50_pc_print.c
index 00b50b4edc..82080779c3 100644
--- a/src/gallium/drivers/nv50/nv50_pc_print.c
+++ b/src/gallium/drivers/nv50/nv50_pc_print.c
@@ -181,7 +181,7 @@ nv_print_address(const char c, int buf, struct nv_value *a, int offset)
 static INLINE void
 nv_print_cond(struct nv_instruction *nvi)
 {
-   PRINT("%s%s%s$c%i ",
+   PRINT("%s%s %s$c%i ",
          gree, nv_cond_name(nvi->cc),
          mgta, nv_value_id(nvi->flags_src->value));
 }
@@ -198,7 +198,7 @@ nv_print_value(struct nv_value *value, struct nv_value *ind, ubyte type)
       PRINT(" %s%s", gree, nv_type_name(type));
 
    if (!nv_value_allocated(value))
-      reg_pfx = '%';
+      reg_pfx = nv_value_allocated(value->join) ? '&' : '%';
 
    switch (value->reg.file) {
    case NV_FILE_GPR:
@@ -268,6 +268,8 @@ nv_print_instruction(struct nv_instruction *i)
 {
    int j;
 
+   PRINT("%i: ", i->serial);
+
    if (i->flags_src)
       nv_print_cond(i);
 
-- 
cgit v1.2.3


From a3ba99b3037bad629622766d4e08d48ab6d20aae Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Sun, 25 Jul 2010 23:32:18 +0200
Subject: nv50: fix constant_operand opt mul by 2 case

---
 src/gallium/drivers/nv50/nv50_pc_optimize.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c
index 0018131fb5..107ef0f4bf 100644
--- a/src/gallium/drivers/nv50/nv50_pc_optimize.c
+++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c
@@ -465,11 +465,7 @@ constant_operand(struct nv_pc *pc,
       if ((type == NV_TYPE_F32 && val->reg.imm.f32 == 2.0f) ||
           (NV_TYPE_ISINT(type) && val->reg.imm.u32 == 2)) {
          nvi->opcode = NV_OP_ADD;
-         nv_reference(pc, &nvi->src[s], NULL);
-         if (!s) {
-            nvi->src[0] = nvi->src[1];
-            nvi->src[1] = NULL;
-         }
+         nv_reference(pc, &nvi->src[s], nvi->src[t]->value);
       } else
       if (type == NV_TYPE_F32 && val->reg.imm.f32 == -1.0f) {
          nvi->opcode = NV_OP_NEG;
-- 
cgit v1.2.3


From e1ad3bd2f25832147814fcfe72166898bc07f11a Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Mon, 26 Jul 2010 00:56:12 +0200
Subject: nv50: permit usage of undefined TGSI TEMPs

---
 src/gallium/drivers/nv50/nv50_tgsi_to_nc.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
index 5b69d520bc..3d5843ee0e 100644
--- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
+++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
@@ -174,7 +174,8 @@ bld_fetch_global(struct bld_context *bld, struct bld_value_stack *stack)
 
    fetch_by_bb(stack, vals, &n, bld->pc->current_block);
 
-   assert(n);
+   if (n == 0)
+      return NULL;
    if (n == 1)
       return vals[0];
 
@@ -606,6 +607,7 @@ bld_export_outputs(struct bld_context *bld)
          if (!bld_is_output_written(bld, i, c))
             continue;
          vals[n] = bld_fetch_global(bld, &bld->ovs[i][c]);
+         assert(vals[n]);
          vals[n] = bld_insn_1(bld, NV_OP_MOV, vals[n]);
          vals[n++]->reg.id = bld->ti->output_map[i][c];
       }
@@ -734,6 +736,10 @@ emit_fetch(struct bld_context *bld, const struct tgsi_full_instruction *insn,
       abort();
       break;	   
    }
+   if (!res) {
+      debug_printf("WARNING: undefined source value in TGSI instruction\n");
+      return bld_load_imm_u32(bld, 0);
+   }
 
    switch (tgsi_util_get_full_src_register_sign_mode(src, chan)) {
    case TGSI_UTIL_SIGN_KEEP:
-- 
cgit v1.2.3


From 7d34e79e449284c6a833c2e58c714ea1e48669dd Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Mon, 26 Jul 2010 11:18:56 +0200
Subject: nv50: add missing 2nd source for POW multiplication

---
 src/gallium/drivers/nv50/nv50_tgsi_to_nc.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
index 3d5843ee0e..da7fe746f4 100644
--- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
+++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
@@ -278,13 +278,21 @@ bld_insn_3(struct bld_context *bld, uint opcode,
       (d)->insn->src[0]->typecast = NV_TYPE_##s0t;  \
    } while(0)
 
+#define BLD_INSN_2_EX(d, op, dt, s0, s0t, s1, s1t)       \
+   do {                                                  \
+      (d) = bld_insn_2(bld, (NV_OP_##op), (s0), (s1));   \
+      (d)->reg.type = NV_TYPE_##dt;                      \
+      (d)->insn->src[0]->typecast = NV_TYPE_##s0t;       \
+      (d)->insn->src[1]->typecast = NV_TYPE_##s1t;       \
+   } while(0)
+
 static struct nv_value *
 bld_pow(struct bld_context *bld, struct nv_value *x, struct nv_value *e)
 {
    struct nv_value *val;
 
    BLD_INSN_1_EX(val, LG2, F32, x, F32);
-   BLD_INSN_1_EX(val, MUL, F32, e, F32);
+   BLD_INSN_2_EX(val, MUL, F32, e, F32, val, F32);
    val = bld_insn_1(bld, NV_OP_PREEX2, val);
    val = bld_insn_1(bld, NV_OP_EX2, val);
 
-- 
cgit v1.2.3


From 28ded2585ca856b67b8cc0dd7c1de000b3fc729b Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Mon, 26 Jul 2010 11:32:27 +0200
Subject: nv50: add signed RGTC1 to format table, allow 2_10_10_10 for vbufs

---
 src/gallium/drivers/nv50/nv50_formats.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_formats.c b/src/gallium/drivers/nv50/nv50_formats.c
index 433c74e611..e1c7dae306 100644
--- a/src/gallium/drivers/nv50/nv50_formats.c
+++ b/src/gallium/drivers/nv50/nv50_formats.c
@@ -86,12 +86,12 @@ const struct nv50_format nv50_format_table[PIPE_FORMAT_COUNT] =
     SAMPLER_VIEW },
 
    [PIPE_FORMAT_R10G10B10A2_UNORM] = { NV50TCL_RT_FORMAT_A2B10G10R10_UNORM,
-    A_(C0, C1, C2, C3, UNORM, UNORM, UNORM, UNORM, 2_10_10_10, 1),
-    SAMPLER_VIEW | RENDER_TARGET | SCANOUT },
+    A_(C0, C1, C2, C3, UNORM, UNORM, UNORM, UNORM, 2_10_10_10, 0),
+    SAMPLER_VIEW | RENDER_TARGET | VERTEX_BUFFER | SCANOUT },
 
    [PIPE_FORMAT_B10G10R10A2_UNORM] = { NV50TCL_RT_FORMAT_A2R10G10B10_UNORM,
     A_(C2, C1, C0, C3, UNORM, UNORM, UNORM, UNORM, 2_10_10_10, 1),
-    SAMPLER_VIEW | RENDER_TARGET },
+    SAMPLER_VIEW | RENDER_TARGET | VERTEX_BUFFER },
 
    /* DEPTH/STENCIL FORMATS */
 
@@ -168,6 +168,10 @@ const struct nv50_format nv50_format_table[PIPE_FORMAT_COUNT] =
     B_(C0, ZERO, ZERO, ONE, UNORM, UNORM, UNORM, UNORM, RGTC1, 0),
     SAMPLER_VIEW },
 
+   [PIPE_FORMAT_RGTC1_SNORM] = { 0,
+    B_(C0, ZERO, ZERO, ONE, SNORM, SNORM, SNORM, SNORM, RGTC1, 0),
+    SAMPLER_VIEW },
+
    [PIPE_FORMAT_RGTC2_UNORM] = { 0,
     B_(C0, C1, ZERO, ONE, UNORM, UNORM, UNORM, UNORM, RGTC2, 0),
     SAMPLER_VIEW },
-- 
cgit v1.2.3


From 582311ca979ac2316807cdffb15e7a25000693f4 Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Mon, 26 Jul 2010 15:06:58 +0200
Subject: nv50: fix for empty BBs

---
 src/gallium/drivers/nv50/nv50_pc.c          | 30 +++++++++++-----------------
 src/gallium/drivers/nv50/nv50_pc.h          |  1 -
 src/gallium/drivers/nv50/nv50_pc_optimize.c | 31 +++++++++++++++++------------
 src/gallium/drivers/nv50/nv50_pc_print.c    |  7 ++++---
 4 files changed, 34 insertions(+), 35 deletions(-)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_pc.c b/src/gallium/drivers/nv50/nv50_pc.c
index e09f94074d..0e8aadf5a9 100644
--- a/src/gallium/drivers/nv50/nv50_pc.c
+++ b/src/gallium/drivers/nv50/nv50_pc.c
@@ -254,7 +254,7 @@ nv50_emit_program(struct nv_pc *pc)
    assert(pc->emit == &code[pc->bin_size / 4]);
 
    /* XXX: we can do better than this ... */
-   if ((pc->emit[-1] & 3) == 3) {
+   if ((pc->emit[-2] & 2) || (pc->emit[-1] & 3) == 3) {
       pc->emit[0] = 0xf0000001;
       pc->emit[1] = 0xe0000000;
       pc->bin_size += 8;
@@ -347,16 +347,16 @@ nvbb_insert_phi(struct nv_basic_block *b, struct nv_instruction *i)
          b->entry->prev = i;
       } else {
          b->entry = i;
-	 b->exit = i;
+         b->exit = i;
       }
    } else {
       assert(b->entry);
       if (b->entry->opcode == NV_OP_PHI) { /* insert after entry */
-	 assert(b->entry == b->exit);
+         assert(b->entry == b->exit);
          b->entry->next = i;
          i->prev = b->entry;
          b->entry = i;
-	 b->exit = i;
+         b->exit = i;
       } else { /* insert before entry */
          assert(b->entry->prev && b->exit);
          i->next = b->entry;
@@ -396,12 +396,9 @@ nv_nvi_delete(struct nv_instruction *nvi)
 
    debug_printf("REM: "); nv_print_instruction(nvi);
 
-   for (j = 0; j < 4; ++j) {
-      if (!nvi->src[j])
-         break;
-      --(nvi->src[j]->value->refc);
-      nvi->src[j] = NULL;
-   }	       
+   for (j = 0; j < 5; ++j)
+      nv_reference(NULL, &nvi->src[j], NULL);
+   nv_reference(NULL, &nvi->flags_src, NULL);
 
    if (nvi->next)
       nvi->next->prev = nvi->prev;
@@ -414,19 +411,16 @@ nv_nvi_delete(struct nv_instruction *nvi)
       nvi->prev->next = nvi->next;
 
    if (nvi == b->entry) {
-      assert(nvi->opcode != NV_OP_PHI || !nvi->next);
-
-      if (!nvi->next || (nvi->opcode == NV_OP_PHI))
-         b->entry = nvi->prev;
-      else
-         b->entry = nvi->next;
+      /* PHIs don't get hooked to b->entry */
+      b->entry = nvi->next;
+      assert(!nvi->prev || nvi->prev->opcode == NV_OP_PHI);
    }
 
    if (nvi == b->phi) {
-      assert(!nvi->prev);
       if (nvi->opcode != NV_OP_PHI)
-         debug_printf("WARN: b->phi points to non-PHI instruction\n");
+         debug_printf("NOTE: b->phi points to non-PHI instruction\n");
 
+      assert(!nvi->prev);
       if (!nvi->next || nvi->next->opcode != NV_OP_PHI)
          b->phi = NULL;
       else
diff --git a/src/gallium/drivers/nv50/nv50_pc.h b/src/gallium/drivers/nv50/nv50_pc.h
index ffcdaf44af..da3f984783 100644
--- a/src/gallium/drivers/nv50/nv50_pc.h
+++ b/src/gallium/drivers/nv50/nv50_pc.h
@@ -402,7 +402,6 @@ nv_reference(struct nv_pc *pc, struct nv_ref **d, struct nv_value *s)
          ++(s->refc);
       }
    } else {
-      assert(*d);
       *d = NULL;
    }
 }
diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c
index 107ef0f4bf..42f3a8634e 100644
--- a/src/gallium/drivers/nv50/nv50_pc_optimize.c
+++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c
@@ -122,15 +122,29 @@ nvi_isnop(struct nv_instruction *nvi)
 static void
 nv_pc_pass_pre_emission(struct nv_pc *pc, struct nv_basic_block *b)
 {
+   struct nv_basic_block *in;
    struct nv_instruction *nvi, *next;
    int j;
    uint size, n32 = 0;
 
    b->priv = 0;
 
-   if (pc->num_blocks)
-      b->bin_pos = pc->bb_list[pc->num_blocks - 1]->bin_pos +
-                   pc->bb_list[pc->num_blocks - 1]->bin_size;
+   for (j = pc->num_blocks - 1; j >= 0 && !pc->bb_list[j]->bin_size; --j);
+   if (j >= 0) {
+      in = pc->bb_list[j];
+
+      /* check for no-op branches (BRA $PC+8) */
+      if (in->exit && in->exit->opcode == NV_OP_BRA && in->exit->target == b) {
+         in->bin_size -= 8;
+         pc->bin_size -= 8;
+
+         for (++j; j < pc->num_blocks; ++j)
+            pc->bb_list[j]->bin_pos -= 8;
+
+         nv_nvi_delete(in->exit);
+      }
+      b->bin_pos = in->bin_pos + in->bin_size;
+   }
 
    pc->bb_list[pc->num_blocks++] = b;
 
@@ -183,7 +197,7 @@ nv_pc_pass_pre_emission(struct nv_pc *pc, struct nv_basic_block *b)
          b->exit->prev->is_long = 1;
       }
    }
-   assert(!b->exit || b->exit->is_long);
+   assert(!b->entry || (b->exit && b->exit->is_long));
 
    pc->bin_size += b->bin_size *= 4;
 
@@ -194,15 +208,6 @@ nv_pc_pass_pre_emission(struct nv_pc *pc, struct nv_basic_block *b)
    if (!b->out[1] && ++(b->out[0]->priv) != b->out[0]->num_in)
       return;
 
-#if 0
-   /* delete ELSE branch */
-   if (b->entry &&
-       b->entry->opcode == NV_OP_BRA && b->entry->target == b->out[0]) {
-      nv_nvi_delete(b->entry);
-      b->bin_size -= 2;
-      pc->bin_size -= 8;
-   }
-#endif
    for (j = 0; j < 2; ++j)
       if (b->out[j] && b->out[j] != b)
          nv_pc_pass_pre_emission(pc, b->out[j]);
diff --git a/src/gallium/drivers/nv50/nv50_pc_print.c b/src/gallium/drivers/nv50/nv50_pc_print.c
index 82080779c3..c2c3eb25bc 100644
--- a/src/gallium/drivers/nv50/nv50_pc_print.c
+++ b/src/gallium/drivers/nv50/nv50_pc_print.c
@@ -290,6 +290,9 @@ nv_print_instruction(struct nv_instruction *i)
     */
    if (i->def[0])
       nv_print_value(i->def[0], NULL, NV_TYPE_ANY);
+   else
+   if (i->target)
+      PRINT(" %s(BB:%i)", orng, i->target->id);
    else
       PRINT(" #");
 
@@ -304,7 +307,5 @@ nv_print_instruction(struct nv_instruction *i)
                    (j == nv50_indirect_opnd(i)) ?
                    i->src[4]->value : NULL);
    }
-   if (!i->is_long)
-      PRINT(" %ss", norm);
-   PRINT("\n");
+   PRINT(" %s%c\n", norm, i->is_long ? 'l' : 's');
 }
-- 
cgit v1.2.3


From 5de5e4fd5c7c6d55e9b3aadbaae0ca34e2662e2c Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Tue, 27 Jul 2010 17:56:13 +0200
Subject: nv50: insert MOVs also for PHI sources from dominating block

Otherwise we get live range conflicts for operands that are written
in only one branch, e.g. in an ELSE block but not in the IF block.
---
 src/gallium/drivers/nv50/nv50_pc_print.c    | 12 ++++---
 src/gallium/drivers/nv50/nv50_pc_regalloc.c | 53 ++++++++++++++---------------
 2 files changed, 32 insertions(+), 33 deletions(-)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_pc_print.c b/src/gallium/drivers/nv50/nv50_pc_print.c
index c2c3eb25bc..c812dbd066 100644
--- a/src/gallium/drivers/nv50/nv50_pc_print.c
+++ b/src/gallium/drivers/nv50/nv50_pc_print.c
@@ -181,9 +181,11 @@ nv_print_address(const char c, int buf, struct nv_value *a, int offset)
 static INLINE void
 nv_print_cond(struct nv_instruction *nvi)
 {
-   PRINT("%s%s %s$c%i ",
+   char pfx = nv_value_allocated(nvi->flags_src->value->join) ? '$' : '%';
+
+   PRINT("%s%s %s%cc%i ",
          gree, nv_cond_name(nvi->cc),
-         mgta, nv_value_id(nvi->flags_src->value));
+         mgta, pfx, nv_value_id(nvi->flags_src->value));
 }
 
 static INLINE void
@@ -197,8 +199,8 @@ nv_print_value(struct nv_value *value, struct nv_value *ind, ubyte type)
    if (value->reg.file != NV_FILE_FLAGS)
       PRINT(" %s%s", gree, nv_type_name(type));
 
-   if (!nv_value_allocated(value))
-      reg_pfx = nv_value_allocated(value->join) ? '&' : '%';
+   if (!nv_value_allocated(value->join))
+      reg_pfx = '%';
 
    switch (value->reg.file) {
    case NV_FILE_GPR:
@@ -301,7 +303,7 @@ nv_print_instruction(struct nv_instruction *i)
          continue;
 
       if (i->src[j]->mod)
-         PRINT(" %s", nv_modifier_string(i->src[j]->mod));
+         PRINT(" %s%s", gree, nv_modifier_string(i->src[j]->mod));
 
       nv_print_ref(i->src[j],
                    (j == nv50_indirect_opnd(i)) ?
diff --git a/src/gallium/drivers/nv50/nv50_pc_regalloc.c b/src/gallium/drivers/nv50/nv50_pc_regalloc.c
index 3cec219d1a..568384fd82 100644
--- a/src/gallium/drivers/nv50/nv50_pc_regalloc.c
+++ b/src/gallium/drivers/nv50/nv50_pc_regalloc.c
@@ -56,6 +56,25 @@ struct nv_pc_pass {
    uint pass_seq;
 };
 
+/* check if bf (future) can be reached from bp (past) */
+static boolean
+bb_reachable_by(struct nv_basic_block *bf, struct nv_basic_block *bp,
+                struct nv_basic_block *bt)
+{
+   if (bf == bp)
+      return TRUE;
+   if (bp == bt)
+      return FALSE;
+
+   if (bp->out[0] && bp->out[0] != bp &&
+       bb_reachable_by(bf, bp->out[0], bt))
+      return TRUE;
+   if (bp->out[1] && bp->out[1] != bp &&
+       bb_reachable_by(bf, bp->out[1], bt))
+      return TRUE;
+   return FALSE;
+}
+
 static void
 ranges_coalesce(struct nv_range *range)
 {
@@ -422,7 +441,7 @@ pass_generate_phi_movs(struct nv_pc_pass *ctx, struct nv_basic_block *b)
             if (!i->src[j])
                j = 3;
             else
-            if (i->src[j]->value->insn->bb == p)
+            if (bb_reachable_by(pn, i->src[j]->value->insn->bb, b))
                break;
          }
          if (j >= 4)
@@ -580,25 +599,6 @@ live_set_test(struct nv_basic_block *b, struct nv_ref *ref)
    return b->live_set[n / 32] & (1 << (n % 32));
 }
 
-/* check if bf (future) can be reached from bp (past) */
-static boolean
-bb_reachable_by(struct nv_basic_block *bf, struct nv_basic_block *bp,
-		struct nv_basic_block *bt)
-{
-   if (bf == bp)
-      return TRUE;
-   if (bp == bt)
-      return FALSE;
-
-   if (bp->out[0] && bp->out[0] != bp &&
-       bb_reachable_by(bf, bp->out[0], bt))
-      return TRUE;
-   if (bp->out[1] && bp->out[1] != bp &&
-       bb_reachable_by(bf, bp->out[1], bt))
-      return TRUE;
-   return FALSE;
-}
-
 /* The live set of a block contains those values that are live immediately
  * before the beginning of the block.
  */
@@ -918,12 +918,6 @@ pass_linear_scan(struct nv_pc_pass *ctx, int iter)
    return 0;
 }
 
-static int
-pass_eliminate_moves(struct nv_pc_pass *ctx)
-{
-   return 0;
-}
-
 int
 nv_pc_exec_pass1(struct nv_pc *pc)
 {
@@ -971,6 +965,11 @@ nv_pc_exec_pass1(struct nv_pc *pc)
       goto out;
    }
 
+#ifdef NV50_RA_DEBUG_LIVEI
+   for (i = 0; i < pc->num_values; ++i)
+      livei_print(&pc->values[i]);
+#endif
+
    for (i = 0; i < 2; ++i) {
       ret = pass_join_values(ctx, i);
       if (ret)
@@ -981,8 +980,6 @@ nv_pc_exec_pass1(struct nv_pc *pc)
    }
    assert(!ret && "joining");
 
-   ret = pass_eliminate_moves(ctx);
-
    for (i = 0; i < pc->num_values; ++i)
       livei_release(&pc->values[i]);
 
-- 
cgit v1.2.3


From 5705b45b6a050f908120779e6049853931a8025a Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Tue, 27 Jul 2010 18:25:37 +0200
Subject: nv50: explicitly set src type for SET ops

Need to do this more nicely for all ops.
---
 src/gallium/drivers/nv50/nv50_tgsi_to_nc.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
index da7fe746f4..aafb5e8295 100644
--- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
+++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
@@ -20,6 +20,8 @@
  * SOFTWARE.
  */
 
+/* XXX: need to clean this up so we get the typecasting right more naturally */
+
 #include <unistd.h>
 
 #include "nv50_context.h"
@@ -1173,6 +1175,10 @@ bld_instruction(struct bld_context *bld,
          dst0[c]->insn->set_cond = translate_setcc(insn->Instruction.Opcode);
          dst0[c]->reg.type = infer_dst_type(insn->Instruction.Opcode);
 
+         dst0[c]->insn->src[0]->typecast =
+         dst0[c]->insn->src[1]->typecast =
+            infer_src_type(insn->Instruction.Opcode);
+
          if (dst0[c]->reg.type != NV_TYPE_F32)
             break;
          dst0[c] = bld_insn_1(bld, NV_OP_ABS, dst0[c]);
-- 
cgit v1.2.3


From fa67cabe7a9f1343e96c7c8a105e82dc05e3de44 Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Sat, 31 Jul 2010 17:52:54 +0200
Subject: nv50: fixes for nested IFs

---
 src/gallium/drivers/nv50/nv50_pc.c          |  15 +++
 src/gallium/drivers/nv50/nv50_pc.h          |   1 +
 src/gallium/drivers/nv50/nv50_pc_optimize.c |   1 +
 src/gallium/drivers/nv50/nv50_pc_regalloc.c | 175 +++++++++++++++++-----------
 src/gallium/drivers/nv50/nv50_tgsi_to_nc.c  |   4 -
 5 files changed, 127 insertions(+), 69 deletions(-)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_pc.c b/src/gallium/drivers/nv50/nv50_pc.c
index 0e8aadf5a9..614982db2d 100644
--- a/src/gallium/drivers/nv50/nv50_pc.c
+++ b/src/gallium/drivers/nv50/nv50_pc.c
@@ -464,3 +464,18 @@ void nvbb_attach_block(struct nv_basic_block *parent, struct nv_basic_block *b)
 
    b->in[b->num_in++] = parent;
 }
+
+int
+nvbb_dominated_by(struct nv_basic_block *b, struct nv_basic_block *d)
+{
+   int j, n;
+
+   if (b == d)
+      return 1;
+
+   n = 0;
+   for (j = 0; j < b->num_in; ++j)
+      n += nvbb_dominated_by(b->in[j], d);
+
+   return n && (n == b->num_in);
+}
diff --git a/src/gallium/drivers/nv50/nv50_pc.h b/src/gallium/drivers/nv50/nv50_pc.h
index da3f984783..4b191c508a 100644
--- a/src/gallium/drivers/nv50/nv50_pc.h
+++ b/src/gallium/drivers/nv50/nv50_pc.h
@@ -425,6 +425,7 @@ int nv_nvi_refcount(struct nv_instruction *);
 void nv_nvi_delete(struct nv_instruction *);
 void nv_nvi_permute(struct nv_instruction *, struct nv_instruction *);
 void nvbb_attach_block(struct nv_basic_block *parent, struct nv_basic_block *);
+int nvbb_dominated_by(struct nv_basic_block *, struct nv_basic_block *);
 int nvcg_replace_value(struct nv_pc *pc, struct nv_value *old_val,
                        struct nv_value *new_val);
 
diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c
index 42f3a8634e..1f2f1630f4 100644
--- a/src/gallium/drivers/nv50/nv50_pc_optimize.c
+++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c
@@ -815,6 +815,7 @@ nv_pc_exec_pass0(struct nv_pc *pc)
    struct nv_pass_dce dce;
    int ret;
 
+   pass.n = 0;
    pass.pc = pc;
 
    pc->pass_seq++;
diff --git a/src/gallium/drivers/nv50/nv50_pc_regalloc.c b/src/gallium/drivers/nv50/nv50_pc_regalloc.c
index 568384fd82..941ec9f6f8 100644
--- a/src/gallium/drivers/nv50/nv50_pc_regalloc.c
+++ b/src/gallium/drivers/nv50/nv50_pc_regalloc.c
@@ -20,19 +20,6 @@
  * SOFTWARE.
  */
 
-/*
- * XXX: phi function live intervals start at first ordinary instruction,
- *      add_range should be taking care of that already ...
- *
- * XXX: TEX must choose TEX's def as representative
- *
- * XXX: Aieee! Must materialize MOVs if source is in other basic block!
- *       -- absolutely, or we cannot execute the MOV conditionally at all
- * XXX: Aieee! Must include PHIs in LVA so we pull through liveness if
- *      PHI source is e.g. in dominator block.
- *       -- seems we lose liveness somehow, track that
- */
-
 #include "nv50_context.h"
 #include "nv50_pc.h"
 
@@ -143,7 +130,6 @@ add_range(struct nv_value *val, struct nv_basic_block *b, int end)
    bgn = val->insn->serial;
    if (bgn < b->entry->serial || bgn > b->exit->serial)
       bgn = b->entry->serial;
-   // debug_printf("add_range(value %i): [%i, %i)\n", val->n, bgn, end);
 
    if (bgn > end) {
       debug_printf("Aieee! BLOCK [%i, %i], RANGE [%i, %i)\n",
@@ -391,25 +377,45 @@ try_join_values(struct nv_pc_pass *ctx, struct nv_value *a, struct nv_value *b)
    do_join_values(ctx, a, b);
 }
 
-/* For each operand of each phi in b, generate a new value by inserting a MOV
- * at the end of the block it is coming from and replace the operand with it.
- * This eliminates liveness conflicts.
+/* For phi functions with sources from blocks that are not direct predecessors,
+ * if such a source is to be used in an earlier predecessor, we need to add an
+ * additional phi function. Used when inserting the MOVs below.
+ */
+static struct nv_value *
+propagate_phi(struct nv_pc *pc, struct nv_instruction *phi, int s)
+{
+   struct nv_basic_block *b = pc->current_block;
+   struct nv_value *val = phi->src[s]->value;
+   struct nv_instruction *nvi = new_instruction(pc, NV_OP_PHI);
+   int i, k;
+
+   (nvi->def[0] = new_value(pc, val->reg.file, val->reg.type))->insn = nvi;
+
+   for (k = 0, i = 0; i < 4 && phi->src[i]; ++i) {
+      if (bb_reachable_by(b, phi->src[i]->value->insn->bb, b))
+         nvi->src[k++] = new_ref(pc, phi->src[i]->value);
+   }
+   return nvi->def[0];
+}
+
+/* For IF blocks without ELSE blocks, insert an empty block for the MOVs.
+ * Insert additional PHIs for cases where a direct MOV wouldn't be valid.
  */
 static int
-pass_generate_phi_movs(struct nv_pc_pass *ctx, struct nv_basic_block *b)
+pass_generate_phi_movs_1(struct nv_pc_pass *ctx, struct nv_basic_block *b)
 {
-   struct nv_instruction *i, *i2;
-   struct nv_basic_block *p, *pn;
+   struct nv_instruction *i, *ni;
    struct nv_value *val;
+   struct nv_basic_block *p, *pn;
    int n, j;
 
    b->pass_seq = ctx->pc->pass_seq;
 
    for (n = 0; n < b->num_in; ++n) {
-      p = b->in[n];
+      p = pn = b->in[n];
       assert(p);
 
-      if (b->num_in > 1 && p->out[0] && p->out[1]) { /* if without else */
+      if (b->num_in > 1 && p->out[0] && p->out[1]) {
          pn = new_basic_block(ctx->pc);
 
          if (p->out[0] == b)
@@ -426,58 +432,99 @@ pass_generate_phi_movs(struct nv_pc_pass *ctx, struct nv_basic_block *b)
                break;
             }
          }
-
          pn->out[0] = b;
          pn->in[0] = p;
          pn->num_in = 1;
-      } else
-         pn = p;
+      }
 
       ctx->pc->current_block = pn;
 
-      /* every block with PHIs will also have other operations */
       for (i = b->phi; i && i->opcode == NV_OP_PHI; i = i->next) {
-         for (j = 0; j < 4; ++j) {
-            if (!i->src[j])
-               j = 3;
-            else
+         for (j = 0; j < 4 && i->src[j]; ++j) {
             if (bb_reachable_by(pn, i->src[j]->value->insn->bb, b))
                break;
          }
-         if (j >= 4)
+         if (j >= 4 || !i->src[j])
             continue;
-         assert(i->src[j]);
          val = i->src[j]->value;
 
-         /* XXX: should probably not insert this after terminator */
-         i2 = new_instruction(ctx->pc, NV_OP_MOV);
-
-         i2->def[0] = new_value(ctx->pc, val->reg.file, val->reg.type);
-         i2->src[0] = new_ref  (ctx->pc, val);
-         i2->def[0]->insn = i2;
-
-         nv_reference(ctx->pc, &i->src[j], i2->def[0]);
+         if (!nvbb_dominated_by(pn, val->insn->bb))
+            nv_reference(ctx->pc, &i->src[j], propagate_phi(ctx->pc, i, j));
       }
       if (pn != p && pn->exit) {
-         /* XXX: this branch should probably be eliminated */
          ctx->pc->current_block = b->in[n ? 0 : 1];
-         i2 = new_instruction(ctx->pc, NV_OP_BRA);
-         i2->target = b;
-         i2->is_terminator = 1;
+         ni = new_instruction(ctx->pc, NV_OP_BRA);
+         ni->target = b;
+         ni->is_terminator = 1;
       }
    }
 
-   if (b->out[0] && b->out[0]->pass_seq < ctx->pc->pass_seq) {
-      pass_generate_phi_movs(ctx, b->out[0]);
-   }
+   for (j = 0; j < 2; ++j)
+      if (b->out[j] && b->out[j]->pass_seq < ctx->pc->pass_seq)
+         pass_generate_phi_movs_1(ctx, b->out[j]);
+
+   return 0;
+}
+
+/* Now everything should be in order and we can insert the MOVs. */
+static int
+pass_generate_phi_movs_2(struct nv_pc_pass *ctx, struct nv_basic_block *b)
+{
+   struct nv_instruction *i, *mov;
+   struct nv_value *val;
+   struct nv_basic_block *p;
+   int n, j;
+
+   b->pass_seq = ctx->pc->pass_seq;
+
+   for (n = 0; n < b->num_in; ++n) {
+      ctx->pc->current_block = p = b->in[n];
+
+      for (i = b->phi; i && i->opcode == NV_OP_PHI; i = i->next) {
+         for (j = 0; j < 4 && i->src[j]; ++j) {
+            if (bb_reachable_by(p, i->src[j]->value->insn->bb, b))
+               break;
+         }
+         if (j >= 4 || !i->src[j])
+            continue;
+         val = i->src[j]->value;
+
+         mov = new_instruction(ctx->pc, NV_OP_MOV);
+
+         /* TODO: insert instruction at correct position in the first place */
+         if (mov->prev && mov->prev->target)
+            nv_nvi_permute(mov->prev, mov);
+
+         mov->def[0] = new_value(ctx->pc, val->reg.file, val->reg.type);
+         mov->def[0]->insn = mov;
+         mov->src[0] = new_ref(ctx->pc, val);
 
-   if (b->out[1] && b->out[1]->pass_seq < ctx->pc->pass_seq) {
-      pass_generate_phi_movs(ctx, b->out[1]);
+         nv_reference(ctx->pc, &i->src[j], mov->def[0]);
+      }
    }
 
+   for (j = 1; j >= 0; --j) /* different order for the sake of diversity */
+      if (b->out[j] && b->out[j]->pass_seq < ctx->pc->pass_seq)
+         pass_generate_phi_movs_2(ctx, b->out[j]);
+
    return 0;
 }
 
+/* For each operand of each PHI in b, generate a new value by inserting a MOV
+ * at the end of the block it is coming from and replace the operand with its
+ * result. This eliminates liveness conflicts and enables us to let values be
+ * copied to the right register if such a conflict exists nonetheless.
+ */
+static INLINE int
+pass_generate_phi_movs(struct nv_pc_pass *ctx, struct nv_basic_block *b)
+{
+   if (pass_generate_phi_movs_1(ctx, b))
+      return 1;
+
+   ++ctx->pc->pass_seq;
+   return pass_generate_phi_movs_2(ctx, b);
+}
+
 static int
 pass_join_values(struct nv_pc_pass *ctx, int iter)
 {
@@ -525,6 +572,7 @@ pass_join_values(struct nv_pc_pass *ctx, int iter)
    return 0;
 }
 
+/* Order the instructions so that live intervals can be expressed in numbers. */
 static int
 pass_order_instructions(struct nv_pc_pass *ctx, struct nv_basic_block *b)
 {
@@ -560,7 +608,7 @@ bb_live_set_print(struct nv_pc *pc, struct nv_basic_block *b)
    int j;
    struct nv_value *val;
 
-   debug_printf("live_set of %p: ", b);
+   debug_printf("LIVE-INs of BB:%i: ", b->id);
 
    for (j = 0; j < pc->num_values; ++j) {
       if (!(b->live_set[j / 32] & (1 << (j % 32))))
@@ -579,16 +627,12 @@ live_set_add(struct nv_basic_block *b, struct nv_value *val)
 {
    if (!val->insn) /* don't add non-def values */
       return;
-   /* debug_printf("live[%p] <- %i\n", b, val->n); */
-
    b->live_set[val->n / 32] |= 1 << (val->n % 32);
 }
 
 static INLINE void
 live_set_rem(struct nv_basic_block *b, struct nv_value *val)
 {
-   /* if (val->insn)
-      debug_printf("live[%p] -> %i\n", b, val->n); */
    b->live_set[val->n / 32] &= ~(1 << (val->n % 32));
 }
 
@@ -600,7 +644,7 @@ live_set_test(struct nv_basic_block *b, struct nv_ref *ref)
 }
 
 /* The live set of a block contains those values that are live immediately
- * before the beginning of the block.
+ * before the beginning of the block, so do a backwards scan.
  */
 static int
 pass_build_live_sets(struct nv_pc_pass *ctx, struct nv_basic_block *b)
@@ -608,6 +652,14 @@ pass_build_live_sets(struct nv_pc_pass *ctx, struct nv_basic_block *b)
    struct nv_instruction *i;
    int j, n, ret = 0;
 
+   debug_printf("pass_build_live_sets BB:%i\n", b->id);
+
+   if (b->pass_seq >= ctx->pc->pass_seq) {
+      debug_printf("already visited\n");
+      return 0;
+   }
+   b->pass_seq = ctx->pc->pass_seq;
+
    /* slight hack for undecidedness: set phi = entry if it's undefined */
    if (!b->phi)
       b->phi = b->entry;
@@ -638,23 +690,18 @@ pass_build_live_sets(struct nv_pc_pass *ctx, struct nv_basic_block *b)
 
             if (bb_reachable_by(b, i->src[j]->value->insn->bb, b->out[n])) {
                live_set_add(b, i->src[j]->value);
-               debug_printf("%p: live set + %i\n", b, i->src[j]->value->n);
+               debug_printf("BB:%i liveset + %i\n", b->id, i->src[j]->value->n);
             } else {
                live_set_rem(b, i->src[j]->value);
-               debug_printf("%p: live set - %i\n", b, i->src[j]->value->n);
+               debug_printf("BB:%i liveset - %i\n", b->id, i->src[j]->value->n);
             }
          }
       }
    }
 
-   if (b->pass_seq >= ctx->pc->pass_seq)
-      return 0;
-   b->pass_seq = ctx->pc->pass_seq;
-
-   debug_printf("%s: visiting block %p\n", __FUNCTION__, b);
-
    if (!b->entry)
       return 0;
+
    bb_live_set_print(ctx->pc, b);
 
    for (i = b->exit; i; i = i->prev) {
@@ -786,8 +833,6 @@ pass_build_intervals(struct nv_pc_pass *ctx, struct nv_basic_block *b)
    if (b->out[1] && b->out[1]->pass_seq < ctx->pc->pass_seq)
       pass_build_intervals(ctx, b->out[1]);
 
-   debug_printf("built intervals for block %p\n", b);
-
    return 0;
 }
 
diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
index aafb5e8295..8846ef08b5 100644
--- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
+++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
@@ -212,15 +212,11 @@ bld_imm_u32(struct bld_context *bld, uint32_t u)
    int i;
    unsigned n = bld->num_immds;
 
-   debug_printf("bld_imm_u32: 0x%08x\n", u);
-
    for (i = 0; i < n; ++i)
       if (bld->saved_immd[i]->reg.imm.u32 == u)
          return bld->saved_immd[i];
    assert(n < BLD_MAX_IMMDS);
 
-   debug_printf("need new one\n");
-
    bld->num_immds++;
 
    bld->saved_immd[n] = new_value(bld->pc, NV_FILE_IMM, NV_TYPE_U32);
-- 
cgit v1.2.3


From 2c695d38e6b194572becf82300fba5e34b1fd7d7 Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Sat, 31 Jul 2010 20:56:42 +0200
Subject: nv50: don't eliminate loads to dedicated values

---
 src/gallium/drivers/nv50/nv50_pc_optimize.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c
index 1f2f1630f4..324f8bb2da 100644
--- a/src/gallium/drivers/nv50/nv50_pc_optimize.c
+++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c
@@ -639,12 +639,10 @@ nv_pass_reload_elim(struct nv_pass_reld_elim *ctx, struct nv_basic_block *b)
             break;
 
       if (it) {
-#if 1
-         nvcg_replace_value(ctx->pc, ld->def[0], it->value);
-#else
-         ld->opcode = NV_OP_MOV;
-         nv_reference(ctx->pc, &ld->src[0], it->value);
-#endif
+         if (ld->def[0]->reg.id >= 0)
+            it->value = ld->def[0];
+         else
+            nvcg_replace_value(ctx->pc, ld->def[0], it->value);
       } else {
          if (ctx->alloc == LOAD_RECORD_POOL_SIZE)
             continue;
-- 
cgit v1.2.3


From 720e0c430d0a66cbf5adfcf40030f27e55ad6c6a Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Sat, 31 Jul 2010 21:30:35 +0200
Subject: nv50: fix constbuf validation

Previously we only uploaded constants up to the highest offset the
current program uses. If the constant buffer isn't changed when a new
program is used, that program is missing the constants beyond that offset.

Might want to introduce a "fill state" for user mem constbufs.
---
 src/gallium/drivers/nv50/nv50_shader_state.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_shader_state.c b/src/gallium/drivers/nv50/nv50_shader_state.c
index f7e6355286..3d5df596ef 100644
--- a/src/gallium/drivers/nv50/nv50_shader_state.c
+++ b/src/gallium/drivers/nv50/nv50_shader_state.c
@@ -44,7 +44,7 @@ nv50_transfer_constbuf(struct nv50_context *nv50,
    if (!map)
       return;
 
-   count = MIN2(buf->width0, size);
+   count = buf->width0; /* MIN2(buf->width0, size); */
    start = 0;
 
    while (count) {
@@ -92,8 +92,13 @@ nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p)
       }
    }
 
+   /* If the state tracker doesn't change the constbuf, and it is first
+    * validated with a program that doesn't use it, this check prevents
+    * it from even being uploaded. */
+   /*
    if (p->parm_size == 0)
       return;
+   */
 
    switch (p->type) {
    case PIPE_SHADER_VERTEX:
-- 
cgit v1.2.3


From aaa8802a22d83fd89d7e306b7d03fa587a19aa0a Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Thu, 5 Aug 2010 00:11:56 +0200
Subject: nv50: build proper phi functions in the first place

---
 src/gallium/drivers/nv50/nv50_pc.c          |  39 +++++++-
 src/gallium/drivers/nv50/nv50_pc.h          |   3 +
 src/gallium/drivers/nv50/nv50_pc_optimize.c |   4 +
 src/gallium/drivers/nv50/nv50_pc_regalloc.c | 140 +++++-----------------------
 src/gallium/drivers/nv50/nv50_tgsi_to_nc.c  | 127 +++++++++++++++++++------
 5 files changed, 166 insertions(+), 147 deletions(-)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_pc.c b/src/gallium/drivers/nv50/nv50_pc.c
index 614982db2d..e32d28a9ce 100644
--- a/src/gallium/drivers/nv50/nv50_pc.c
+++ b/src/gallium/drivers/nv50/nv50_pc.c
@@ -394,7 +394,7 @@ nv_nvi_delete(struct nv_instruction *nvi)
    struct nv_basic_block *b = nvi->bb;
    int j;
 
-   debug_printf("REM: "); nv_print_instruction(nvi);
+   /* debug_printf("REM: "); nv_print_instruction(nvi); */
 
    for (j = 0; j < 5; ++j)
       nv_reference(NULL, &nvi->src[j], NULL);
@@ -477,5 +477,40 @@ nvbb_dominated_by(struct nv_basic_block *b, struct nv_basic_block *d)
    for (j = 0; j < b->num_in; ++j)
       n += nvbb_dominated_by(b->in[j], d);
 
-   return n && (n == b->num_in);
+   return (n && (n == b->num_in)) ? 1 : 0;
+}
+
+/* check if bf (future) can be reached from bp (past) */
+boolean
+nvbb_reachable_by(struct nv_basic_block *bf, struct nv_basic_block *bp,
+                  struct nv_basic_block *bt)
+{
+   if (bf == bp)
+      return TRUE;
+   if (bp == bt)
+      return FALSE;
+
+   if (bp->out[0] && bp->out[0] != bp &&
+       nvbb_reachable_by(bf, bp->out[0], bt))
+      return TRUE;
+   if (bp->out[1] && bp->out[1] != bp &&
+       nvbb_reachable_by(bf, bp->out[1], bt))
+      return TRUE;
+   return FALSE;
+}
+
+struct nv_basic_block *
+nvbb_dom_frontier(struct nv_basic_block *b)
+{
+   struct nv_basic_block *df = b->out[0];
+
+   assert(df);
+   while (nvbb_dominated_by(df, b) ||
+          (!nvbb_dominated_by(df->in[0], b) &&
+           (!df->in[1] || !nvbb_dominated_by(df->in[1], b)))) {
+      df = df->out[0];
+      assert(df);
+   }
+   assert(df);
+   return df;
 }
diff --git a/src/gallium/drivers/nv50/nv50_pc.h b/src/gallium/drivers/nv50/nv50_pc.h
index 4b191c508a..987043c7a0 100644
--- a/src/gallium/drivers/nv50/nv50_pc.h
+++ b/src/gallium/drivers/nv50/nv50_pc.h
@@ -426,6 +426,9 @@ void nv_nvi_delete(struct nv_instruction *);
 void nv_nvi_permute(struct nv_instruction *, struct nv_instruction *);
 void nvbb_attach_block(struct nv_basic_block *parent, struct nv_basic_block *);
 int nvbb_dominated_by(struct nv_basic_block *, struct nv_basic_block *);
+boolean nvbb_reachable_by(struct nv_basic_block *, struct nv_basic_block *,
+                          struct nv_basic_block *);
+struct nv_basic_block *nvbb_dom_frontier(struct nv_basic_block *);
 int nvcg_replace_value(struct nv_pc *pc, struct nv_value *old_val,
                        struct nv_value *new_val);
 
diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c
index 324f8bb2da..f2f8d0eaa3 100644
--- a/src/gallium/drivers/nv50/nv50_pc_optimize.c
+++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c
@@ -771,6 +771,10 @@ nv_pass_cse(struct nv_pass *ctx, struct nv_basic_block *b)
             if (ik->src[4] || ir->src[4])
                continue; /* don't mess with address registers */
 
+            if (ik->flags_src || ir->flags_src ||
+                ik->flags_def || ir->flags_def)
+               continue; /* and also not with flags, for now */
+
             for (s = 0; s < 3; ++s) {
                struct nv_value *a, *b;
 
diff --git a/src/gallium/drivers/nv50/nv50_pc_regalloc.c b/src/gallium/drivers/nv50/nv50_pc_regalloc.c
index 941ec9f6f8..172e44f62b 100644
--- a/src/gallium/drivers/nv50/nv50_pc_regalloc.c
+++ b/src/gallium/drivers/nv50/nv50_pc_regalloc.c
@@ -43,25 +43,6 @@ struct nv_pc_pass {
    uint pass_seq;
 };
 
-/* check if bf (future) can be reached from bp (past) */
-static boolean
-bb_reachable_by(struct nv_basic_block *bf, struct nv_basic_block *bp,
-                struct nv_basic_block *bt)
-{
-   if (bf == bp)
-      return TRUE;
-   if (bp == bt)
-      return FALSE;
-
-   if (bp->out[0] && bp->out[0] != bp &&
-       bb_reachable_by(bf, bp->out[0], bt))
-      return TRUE;
-   if (bp->out[1] && bp->out[1] != bp &&
-       bb_reachable_by(bf, bp->out[1], bt))
-      return TRUE;
-   return FALSE;
-}
-
 static void
 ranges_coalesce(struct nv_range *range)
 {
@@ -377,32 +358,13 @@ try_join_values(struct nv_pc_pass *ctx, struct nv_value *a, struct nv_value *b)
    do_join_values(ctx, a, b);
 }
 
-/* For phi functions with sources from blocks that are not direct predecessors,
- * if such a source is to be used in an earlier predecessor, we need to add an
- * additional phi function. Used when inserting the MOVs below.
- */
-static struct nv_value *
-propagate_phi(struct nv_pc *pc, struct nv_instruction *phi, int s)
-{
-   struct nv_basic_block *b = pc->current_block;
-   struct nv_value *val = phi->src[s]->value;
-   struct nv_instruction *nvi = new_instruction(pc, NV_OP_PHI);
-   int i, k;
-
-   (nvi->def[0] = new_value(pc, val->reg.file, val->reg.type))->insn = nvi;
-
-   for (k = 0, i = 0; i < 4 && phi->src[i]; ++i) {
-      if (bb_reachable_by(b, phi->src[i]->value->insn->bb, b))
-         nvi->src[k++] = new_ref(pc, phi->src[i]->value);
-   }
-   return nvi->def[0];
-}
-
-/* For IF blocks without ELSE blocks, insert an empty block for the MOVs.
- * Insert additional PHIs for cases where a direct MOV wouldn't be valid.
+/* For each operand of each PHI in b, generate a new value by inserting a MOV
+ * at the end of the block it is coming from and replace the operand with its
+ * result. This eliminates liveness conflicts and enables us to let values be
+ * copied to the right register if such a conflict exists nonetheless.
  */
 static int
-pass_generate_phi_movs_1(struct nv_pc_pass *ctx, struct nv_basic_block *b)
+pass_generate_phi_movs(struct nv_pc_pass *ctx, struct nv_basic_block *b)
 {
    struct nv_instruction *i, *ni;
    struct nv_value *val;
@@ -426,31 +388,36 @@ pass_generate_phi_movs_1(struct nv_pc_pass *ctx, struct nv_basic_block *b)
          if (p->exit->target == b) /* target to new else-block */
             p->exit->target = pn;
 
-         for (j = 0; j < b->num_in; ++j) {
-            if (b->in[j] == p) {
-               b->in[j] = pn;
-               break;
-            }
-         }
+         b->in[n] = pn;
+
          pn->out[0] = b;
          pn->in[0] = p;
          pn->num_in = 1;
       }
-
       ctx->pc->current_block = pn;
 
       for (i = b->phi; i && i->opcode == NV_OP_PHI; i = i->next) {
          for (j = 0; j < 4 && i->src[j]; ++j) {
-            if (bb_reachable_by(pn, i->src[j]->value->insn->bb, b))
+            if (nvbb_reachable_by(p, i->src[j]->value->insn->bb, b))
                break;
          }
          if (j >= 4 || !i->src[j])
             continue;
          val = i->src[j]->value;
 
-         if (!nvbb_dominated_by(pn, val->insn->bb))
-            nv_reference(ctx->pc, &i->src[j], propagate_phi(ctx->pc, i, j));
+         ni = new_instruction(ctx->pc, NV_OP_MOV);
+
+         /* TODO: insert instruction at correct position in the first place */
+         if (ni->prev && ni->prev->target)
+            nv_nvi_permute(ni->prev, ni);
+
+         ni->def[0] = new_value(ctx->pc, val->reg.file, val->reg.type);
+         ni->def[0]->insn = ni;
+         ni->src[0] = new_ref(ctx->pc, val);
+
+         nv_reference(ctx->pc, &i->src[j], ni->def[0]);
       }
+
       if (pn != p && pn->exit) {
          ctx->pc->current_block = b->in[n ? 0 : 1];
          ni = new_instruction(ctx->pc, NV_OP_BRA);
@@ -461,70 +428,11 @@ pass_generate_phi_movs_1(struct nv_pc_pass *ctx, struct nv_basic_block *b)
 
    for (j = 0; j < 2; ++j)
       if (b->out[j] && b->out[j]->pass_seq < ctx->pc->pass_seq)
-         pass_generate_phi_movs_1(ctx, b->out[j]);
+         pass_generate_phi_movs(ctx, b->out[j]);
 
    return 0;
 }
 
-/* Now everything should be in order and we can insert the MOVs. */
-static int
-pass_generate_phi_movs_2(struct nv_pc_pass *ctx, struct nv_basic_block *b)
-{
-   struct nv_instruction *i, *mov;
-   struct nv_value *val;
-   struct nv_basic_block *p;
-   int n, j;
-
-   b->pass_seq = ctx->pc->pass_seq;
-
-   for (n = 0; n < b->num_in; ++n) {
-      ctx->pc->current_block = p = b->in[n];
-
-      for (i = b->phi; i && i->opcode == NV_OP_PHI; i = i->next) {
-         for (j = 0; j < 4 && i->src[j]; ++j) {
-            if (bb_reachable_by(p, i->src[j]->value->insn->bb, b))
-               break;
-         }
-         if (j >= 4 || !i->src[j])
-            continue;
-         val = i->src[j]->value;
-
-         mov = new_instruction(ctx->pc, NV_OP_MOV);
-
-         /* TODO: insert instruction at correct position in the first place */
-         if (mov->prev && mov->prev->target)
-            nv_nvi_permute(mov->prev, mov);
-
-         mov->def[0] = new_value(ctx->pc, val->reg.file, val->reg.type);
-         mov->def[0]->insn = mov;
-         mov->src[0] = new_ref(ctx->pc, val);
-
-         nv_reference(ctx->pc, &i->src[j], mov->def[0]);
-      }
-   }
-
-   for (j = 1; j >= 0; --j) /* different order for the sake of diversity */
-      if (b->out[j] && b->out[j]->pass_seq < ctx->pc->pass_seq)
-         pass_generate_phi_movs_2(ctx, b->out[j]);
-
-   return 0;
-}
-
-/* For each operand of each PHI in b, generate a new value by inserting a MOV
- * at the end of the block it is coming from and replace the operand with its
- * result. This eliminates liveness conflicts and enables us to let values be
- * copied to the right register if such a conflict exists nonetheless.
- */
-static INLINE int
-pass_generate_phi_movs(struct nv_pc_pass *ctx, struct nv_basic_block *b)
-{
-   if (pass_generate_phi_movs_1(ctx, b))
-      return 1;
-
-   ++ctx->pc->pass_seq;
-   return pass_generate_phi_movs_2(ctx, b);
-}
-
 static int
 pass_join_values(struct nv_pc_pass *ctx, int iter)
 {
@@ -688,7 +596,7 @@ pass_build_live_sets(struct nv_pc_pass *ctx, struct nv_basic_block *b)
                break;
             assert(i->src[j]->value->insn);
 
-            if (bb_reachable_by(b, i->src[j]->value->insn->bb, b->out[n])) {
+            if (nvbb_reachable_by(b, i->src[j]->value->insn->bb, b->out[n])) {
                live_set_add(b, i->src[j]->value);
                debug_printf("BB:%i liveset + %i\n", b->id, i->src[j]->value->n);
             } else {
@@ -774,7 +682,7 @@ pass_build_intervals(struct nv_pc_pass *ctx, struct nv_basic_block *b)
             if (!i->src[s])
                break;
             assert(i->src[s]->value->insn);
-            if (bb_reachable_by(b, i->src[s]->value->insn->bb, b->out[j]))
+            if (nvbb_reachable_by(b, i->src[s]->value->insn->bb, b->out[j]))
                live_set_add(b, i->src[s]->value);
             else
                live_set_rem(b, i->src[s]->value);
@@ -978,7 +886,7 @@ nv_pc_exec_pass1(struct nv_pc *pc)
 
    nv_print_program(ctx->pc->root);
 
-   ctx->insns = CALLOC(pc->num_instructions, sizeof(struct nv_instruction *));
+   ctx->insns = CALLOC(NV_PC_MAX_INSTRUCTIONS, sizeof(struct nv_instruction *));
 
    pc->pass_seq++;
    ret = pass_generate_phi_movs(ctx, pc->root);
diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
index 8846ef08b5..6a9259c898 100644
--- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
+++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
@@ -51,16 +51,22 @@ struct bld_value_stack {
 };
 
 static INLINE void
-bld_push_value(struct bld_value_stack *stk)
+bld_vals_push_val(struct bld_value_stack *stk, struct nv_value *val)
 {
-   assert(!stk->size || (stk->body[stk->size - 1] != stk->top));
+   assert(!stk->size || (stk->body[stk->size - 1] != val));
 
    if (!(stk->size % 8)) {
       unsigned old_sz = (stk->size + 0) * sizeof(struct nv_value *);
       unsigned new_sz = (stk->size + 8) * sizeof(struct nv_value *);
       stk->body = (struct nv_value **)REALLOC(stk->body, old_sz, new_sz);
    }
-   stk->body[stk->size++] = stk->top;
+   stk->body[stk->size++] = val;
+}
+
+static INLINE void
+bld_vals_push(struct bld_value_stack *stk)
+{
+   bld_vals_push_val(stk, stk->top);
    stk->top = NULL;
 }
 
@@ -72,7 +78,7 @@ bld_push_values(struct bld_value_stack *stacks, int n)
    for (i = 0; i < n; ++i)
       for (c = 0; c < 4; ++c)
          if (stacks[i * 4 + c].top)
-            bld_push_value(&stacks[i * 4 + c]);
+            bld_vals_push(&stacks[i * 4 + c]);
 }
 
 #define FETCH_TEMP(i, c)    (bld->tvs[i][c].top)
@@ -121,6 +127,17 @@ struct bld_context {
    uint num_immds;
 };
 
+static INLINE void
+bld_warn_uninitialized(struct bld_context *bld, int kind,
+                       struct bld_value_stack *stk, struct nv_basic_block *b)
+{
+   long i = (stk - &bld->tvs[0][0]) / 4;
+   long c = (stk - &bld->tvs[0][0]) & 3;
+
+   debug_printf("WARNING: TEMP[%li].%li %s used uninitialized in BB:%i\n",
+                i, c, kind ? "may be" : "is", b->id);
+}
+
 static INLINE struct nv_value *
 bld_def(struct nv_instruction *i, int c, struct nv_value *value)
 {
@@ -168,42 +185,91 @@ fetch_by_bb(struct bld_value_stack *stack,
       fetch_by_bb(stack, vals, n, b->in[i]);
 }
 
+static INLINE struct nv_value *
+bld_load_imm_u32(struct bld_context *bld, uint32_t u);
+
 static struct nv_value *
-bld_fetch_global(struct bld_context *bld, struct bld_value_stack *stack)
+bld_phi(struct bld_context *bld, struct nv_basic_block *b,
+        struct bld_value_stack *stack)
 {
-   struct nv_value *vals[16], *phi = NULL;
-   int j, i = 0, n = 0;
+   struct nv_basic_block *in;
+   struct nv_value *vals[16], *val;
+   struct nv_instruction *phi;
+   int i, j, n;
+
+   do {
+      i = n = 0;
+      fetch_by_bb(stack, vals, &n, b);
+
+      if (!n) {
+         bld_warn_uninitialized(bld, 0, stack, b);
+         return NULL;
+      }
 
-   fetch_by_bb(stack, vals, &n, bld->pc->current_block);
+      if (n == 1) {
+         if (nvbb_dominated_by(b, vals[0]->insn->bb))
+            break;
 
-   if (n == 0)
-      return NULL;
-   if (n == 1)
-      return vals[0];
+         bld_warn_uninitialized(bld, 1, stack, b);
+
+         /* back-tracking to insert missing value of other path */
+         in = b;
+         while (in->in[0]) {
+            if (in->num_in == 1) {
+               in = in->in[0];
+            } else {
+               if (!nvbb_reachable_by(in->in[0], vals[0]->insn->bb, b)) {
+                  in = in->in[0];
+                  break;
+               }
+               if (!nvbb_reachable_by(in->in[1], vals[0]->insn->bb, b)) {
+                  in = in->in[1];
+                  break;
+               }
+               in = in->in[0];
+            }
+         }
+         bld->pc->current_block = in;
+
+         /* should make this a no-op */
+         bld_vals_push_val(stack, bld_load_imm_u32(bld, 0));
+         continue;
+      }
 
-   debug_printf("phi required: %i candidates\n", n);
+      for (i = 0; i < n; ++i) {
+         if (nvbb_dominated_by(b, vals[i]->insn->bb))
+            continue;
 
-   while (i < n) {
-      struct nv_instruction *insn = new_instruction(bld->pc, NV_OP_PHI);
+         for (j = 0; j < b->num_in; ++j)
+            if (nvbb_dominated_by(b->in[j], vals[i]->insn->bb))
+               break;
+         if (j == b->num_in) {
+            in = nvbb_dom_frontier(vals[i]->insn->bb);
+            val = bld_phi(bld, in, stack);
+            bld_vals_push_val(stack, val);
+            break;
+         }
+      }
+   } while(i < n);
 
-      j = phi ? 1 : 0;
-      if (phi)
-         insn->src[0] = new_ref(bld->pc, phi);
+   bld->pc->current_block = b;
 
-      phi = new_value(bld->pc, vals[0]->reg.file, vals[0]->reg.type);
+   if (n == 1)
+      return vals[0];
 
-      bld_def(insn, 0, phi);
+   phi = new_instruction(bld->pc, NV_OP_PHI);
 
-      for (; j < 4; ++j) {
-         insn->src[j] = new_ref(bld->pc, vals[i++]);
-         if (i == n)
-            break;
-      }
-      debug_printf("new phi: %i, %i in\n", phi->n, j);
-   }
+   bld_def(phi, 0, new_value(bld->pc, vals[0]->reg.file, vals[0]->reg.type));
+   for (i = 0; i < n; ++i)
+      phi->src[i] = new_ref(bld->pc, vals[i]);
 
-   /* insert_at_head(list, phi) is done at end of block */
-   return phi;
+   return phi->def[0];
+}
+
+static INLINE struct nv_value *
+bld_fetch_global(struct bld_context *bld, struct bld_value_stack *stack)
+{
+   return bld_phi(bld, bld->pc->current_block, stack);
 }
 
 static INLINE struct nv_value *
@@ -640,6 +706,9 @@ bld_new_block(struct bld_context *bld, struct nv_basic_block *b)
 
    for (i = 0; i < 4; ++i)
       bld->saved_addr[i][0] = NULL;
+
+   for (i = 0; i < 128; ++i)
+      bld->saved_inputs[i] = NULL;
 }
 
 static struct nv_value *
-- 
cgit v1.2.3


From fc1d72d15d929b629be399d977ad05611f01fc59 Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Thu, 5 Aug 2010 12:29:23 +0200
Subject: nv50: fix reg count

---
 src/gallium/drivers/nv50/nv50_pc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_pc.c b/src/gallium/drivers/nv50/nv50_pc.c
index e32d28a9ce..ed92261488 100644
--- a/src/gallium/drivers/nv50/nv50_pc.c
+++ b/src/gallium/drivers/nv50/nv50_pc.c
@@ -312,8 +312,8 @@ nv50_generate_code(struct nv50_translation_info *ti)
    ti->p->immd_size = pc->immd_count * 4;
    ti->p->immd = pc->immd_buf;
 
-   ti->p->max_gpr = (pc->max_reg[NV_FILE_GPR] + 1) >> 1;
-   ti->p->max_gpr++;
+   /* highest 16 bit reg to num of 32 bit regs */
+   ti->p->max_gpr = (pc->max_reg[NV_FILE_GPR] >> 1) + 1;
 
    ti->p->fixups = pc->fixups;
    ti->p->num_fixups = pc->num_fixups;
-- 
cgit v1.2.3


From 3a68fcfb6b406cf864afbf200e436fc384fd0865 Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Tue, 10 Aug 2010 17:36:25 +0200
Subject: nv50: begin implementing loops

---
 src/gallium/drivers/nv50/nv50_pc.c          | 168 +++++++++++++-----
 src/gallium/drivers/nv50/nv50_pc.h          |  20 ++-
 src/gallium/drivers/nv50/nv50_pc_emit.c     |   2 +-
 src/gallium/drivers/nv50/nv50_pc_optimize.c |  28 +--
 src/gallium/drivers/nv50/nv50_pc_regalloc.c |  52 +++---
 src/gallium/drivers/nv50/nv50_program.c     |   3 +
 src/gallium/drivers/nv50/nv50_program.h     |  11 ++
 src/gallium/drivers/nv50/nv50_tgsi_to_nc.c  | 259 ++++++++++++++++++++++++----
 8 files changed, 416 insertions(+), 127 deletions(-)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_pc.c b/src/gallium/drivers/nv50/nv50_pc.c
index ed92261488..7601049126 100644
--- a/src/gallium/drivers/nv50/nv50_pc.c
+++ b/src/gallium/drivers/nv50/nv50_pc.c
@@ -75,7 +75,8 @@ nv50_nvi_can_use_imm(struct nv_instruction *nvi, int s)
    case NV_OP_XOR:
    case NV_OP_SHL:
    case NV_OP_SHR:
-      return (s == 1) && (nvi->def[0]->reg.file == NV_FILE_GPR);
+      return (s == 1) && (nvi->src[0]->value->reg.file == NV_FILE_GPR) &&
+         (nvi->def[0]->reg.file == NV_FILE_GPR);
    case NV_OP_MOV:
       assert(s == 0);
       return (nvi->def[0]->reg.file == NV_FILE_GPR);
@@ -87,6 +88,12 @@ nv50_nvi_can_use_imm(struct nv_instruction *nvi, int s)
 boolean
 nv50_nvi_can_load(struct nv_instruction *nvi, int s, struct nv_value *value)
 {
+   int i;
+
+   for (i = 0; i < 3 && nvi->src[i]; ++i)
+      if (nvi->src[i]->value->reg.file == NV_FILE_IMM)
+         return FALSE;
+
    switch (nvi->opcode) {
    case NV_OP_ABS:
    case NV_OP_ADD:
@@ -189,37 +196,89 @@ nv_pc_free_refs(struct nv_pc *pc)
       FREE(pc->refs[i]);
 }
 
+static const char *
+edge_name(ubyte type)
+{
+   switch (type) {
+   case CFG_EDGE_FORWARD: return "forward";
+   case CFG_EDGE_BACK: return "back";
+   case CFG_EDGE_LOOP_ENTER: return "loop";
+   case CFG_EDGE_LOOP_LEAVE: return "break";
+   default:
+      return "?";
+   }
+}
+
 void
-nv_print_program(struct nv_basic_block *b)
+nv_pc_pass_in_order(struct nv_basic_block *root, nv_pc_pass_func f, void *priv)
 {
-   struct nv_instruction *i = b->phi;
+   struct nv_basic_block *bb[64], *bbb[16], *b;
+   int j, p, pp;
+
+   bb[0] = root;
+   p = 1;
+   pp = 0;
+
+   while (p > 0) {
+      b = bb[--p];
+      b->priv = 0;
+
+      for (j = 1; j >= 0; --j) {
+         if (!b->out[j])
+            continue;
+
+         switch (b->out_kind[j]) {
+         case CFG_EDGE_BACK:
+            continue;
+         case CFG_EDGE_FORWARD:
+            if (++b->out[j]->priv == b->out[j]->num_in)
+               bb[p++] = b->out[j];
+            break;
+         case CFG_EDGE_LOOP_ENTER:
+            bb[p++] = b->out[j];
+            break;
+         case CFG_EDGE_LOOP_LEAVE:
+            bbb[pp++] = b->out[j];
+            break;
+         default:
+            assert(0);
+            break;
+         }
+      }
+
+      f(priv, b);
 
-   b->priv = 0;
+      if (!p)
+         while (pp > 0)
+            bb[p++] = bbb[--pp];
+   }
+}
+
+static void
+nv_do_print_program(void *priv, struct nv_basic_block *b)
+{
+   struct nv_instruction *i = b->phi;
 
    debug_printf("=== BB %i ", b->id);
    if (b->out[0])
-      debug_printf("(--0> %i) ", b->out[0]->id);
+      debug_printf("[%s -> %i] ", edge_name(b->out_kind[0]), b->out[0]->id);
    if (b->out[1])
-      debug_printf("(--1> %i) ", b->out[1]->id);
+      debug_printf("[%s -> %i] ", edge_name(b->out_kind[1]), b->out[1]->id);
    debug_printf("===\n");
 
+   i = b->phi;
    if (!i)
       i = b->entry;
    for (; i; i = i->next)
       nv_print_instruction(i);
+}
 
-   if (!b->out[0]) {
-      debug_printf("END\n\n");
-      return;
-   }
-   if (!b->out[1] && ++(b->out[0]->priv) != b->out[0]->num_in)
-      return;
-
-   if (b->out[0] != b)
-      nv_print_program(b->out[0]);
+void
+nv_print_program(struct nv_basic_block *root)
+{
+   nv_pc_pass_in_order(root, nv_do_print_program, root);
 
-   if (b->out[1] && b->out[1] != b)
-      nv_print_program(b->out[1]);
+   debug_printf("END\n\n");
 }
 
 static INLINE void
@@ -254,7 +313,7 @@ nv50_emit_program(struct nv_pc *pc)
    assert(pc->emit == &code[pc->bin_size / 4]);
 
    /* XXX: we can do better than this ... */
-   if ((pc->emit[-2] & 2) || (pc->emit[-1] & 3) == 3) {
+   if (!(pc->emit[-2] & 1) || (pc->emit[-2] & 2) || (pc->emit[-1] & 3) == 3) {
       pc->emit[0] = 0xf0000001;
       pc->emit[1] = 0xe0000000;
       pc->bin_size += 8;
@@ -281,6 +340,7 @@ nv50_generate_code(struct nv50_translation_info *ti)
    ret = nv50_tgsi_to_nc(pc, ti);
    if (ret)
       goto out;
+   nv_print_program(pc->root);
 
    /* optimization */
    ret = nv_pc_exec_pass0(pc);
@@ -454,30 +514,40 @@ nv_nvi_permute(struct nv_instruction *i1, struct nv_instruction *i2)
       i1->next->prev = i1;
 }
 
-void nvbb_attach_block(struct nv_basic_block *parent, struct nv_basic_block *b)
+void
+nvbb_attach_block(struct nv_basic_block *parent,
+                  struct nv_basic_block *b, ubyte edge_kind)
 {
+   assert(b->num_in < 8);
+
    if (parent->out[0]) {
       assert(!parent->out[1]);
       parent->out[1] = b;
-   } else
+      parent->out_kind[1] = edge_kind;
+   } else {
       parent->out[0] = b;
+      parent->out_kind[0] = edge_kind;
+   }
 
-   b->in[b->num_in++] = parent;
+   b->in[b->num_in] = parent;
+   b->in_kind[b->num_in++] = edge_kind;
 }
 
-int
+/* NOTE: all BRKs are treated as conditional, so there are 2 outgoing BBs */
+
+boolean
 nvbb_dominated_by(struct nv_basic_block *b, struct nv_basic_block *d)
 {
-   int j, n;
+   int j;
 
    if (b == d)
-      return 1;
+      return TRUE;
 
-   n = 0;
    for (j = 0; j < b->num_in; ++j)
-      n += nvbb_dominated_by(b->in[j], d);
+      if ((b->in_kind[j] != CFG_EDGE_BACK) && !nvbb_dominated_by(b->in[j], d))
+         return FALSE;
 
-   return (n && (n == b->num_in)) ? 1 : 0;
+   return j ? TRUE : FALSE;
 }
 
 /* check if bf (future) can be reached from bp (past) */
@@ -490,27 +560,45 @@ nvbb_reachable_by(struct nv_basic_block *bf, struct nv_basic_block *bp,
    if (bp == bt)
       return FALSE;
 
-   if (bp->out[0] && bp->out[0] != bp &&
+   if (bp->out[0] && bp->out_kind[0] != CFG_EDGE_BACK &&
        nvbb_reachable_by(bf, bp->out[0], bt))
       return TRUE;
-   if (bp->out[1] && bp->out[1] != bp &&
+   if (bp->out[1] && bp->out_kind[1] != CFG_EDGE_BACK &&
        nvbb_reachable_by(bf, bp->out[1], bt))
       return TRUE;
    return FALSE;
 }
 
+static struct nv_basic_block *
+nvbb_find_dom_frontier(struct nv_basic_block *b, struct nv_basic_block *df)
+{
+   int i;
+
+   if (!nvbb_dominated_by(df, b)) {
+      for (i = 0; i < df->num_in; ++i) {
+         if (df->in_kind[i] == CFG_EDGE_BACK)
+            continue;
+         if (nvbb_dominated_by(df->in[i], b))
+            return df;
+      }
+   }
+   for (i = 0; i < 2 && b->out[i]; ++i) {
+      if (b->out_kind[i] == CFG_EDGE_BACK)
+         continue;
+      if ((df = nvbb_find_dom_frontier(b, b->out[i])))
+         return df;
+   }
+   return NULL;
+}
+
 struct nv_basic_block *
 nvbb_dom_frontier(struct nv_basic_block *b)
 {
-   struct nv_basic_block *df = b->out[0];
-
-   assert(df);
-   while (nvbb_dominated_by(df, b) ||
-          (!nvbb_dominated_by(df->in[0], b) &&
-           (!df->in[1] || !nvbb_dominated_by(df->in[1], b)))) {
-      df = df->out[0];
-      assert(df);
-   }
-   assert(df);
-   return df;
+   struct nv_basic_block *df;
+   int i;
+
+   for (i = 0; i < 2 && b->out[i]; ++i)
+      if ((df = nvbb_find_dom_frontier(b, b->out[i])))
+         return df;
+   return NULL;
 }
diff --git a/src/gallium/drivers/nv50/nv50_pc.h b/src/gallium/drivers/nv50/nv50_pc.h
index 987043c7a0..8b1c9b3a72 100644
--- a/src/gallium/drivers/nv50/nv50_pc.h
+++ b/src/gallium/drivers/nv50/nv50_pc.h
@@ -246,6 +246,11 @@ struct nv_instruction {
    ubyte quadop;
 };
 
+#define CFG_EDGE_FORWARD     0
+#define CFG_EDGE_BACK        1
+#define CFG_EDGE_LOOP_ENTER  2
+#define CFG_EDGE_LOOP_LEAVE  4
+
 struct nv_basic_block {
    struct nv_instruction *entry; /* first non-phi instruction */
    struct nv_instruction *exit;
@@ -253,8 +258,10 @@ struct nv_basic_block {
    int num_instructions;
 
    struct nv_basic_block *out[2]; /* no indirect branches -> 2 */
-   struct nv_basic_block **in;
+   struct nv_basic_block *in[8]; /* hope that suffices */
    uint num_in;
+   ubyte out_kind[2];
+   ubyte in_kind[8];
 
    int id;
    struct nv_basic_block *last_visitor;
@@ -383,7 +390,6 @@ new_basic_block(struct nv_pc *pc)
 {
    struct nv_basic_block *bb = CALLOC_STRUCT(nv_basic_block);
 
-   bb->in = CALLOC(sizeof(struct nv_basic_block *), 4);
    bb->id = pc->num_blocks++;
    return bb;
 }
@@ -414,6 +420,7 @@ const char *nv_opcode_name(uint opcode);
 void nv_print_instruction(struct nv_instruction *);
 
 /* nv50_pc.c */
+
 void nv_print_program(struct nv_basic_block *b);
 
 boolean nv_op_commutative(uint opcode);
@@ -424,14 +431,19 @@ ubyte nv50_supported_src_mods(uint opcode, int s);
 int nv_nvi_refcount(struct nv_instruction *);
 void nv_nvi_delete(struct nv_instruction *);
 void nv_nvi_permute(struct nv_instruction *, struct nv_instruction *);
-void nvbb_attach_block(struct nv_basic_block *parent, struct nv_basic_block *);
-int nvbb_dominated_by(struct nv_basic_block *, struct nv_basic_block *);
+void nvbb_attach_block(struct nv_basic_block *parent,
+                       struct nv_basic_block *, ubyte edge_kind);
+boolean nvbb_dominated_by(struct nv_basic_block *, struct nv_basic_block *);
 boolean nvbb_reachable_by(struct nv_basic_block *, struct nv_basic_block *,
                           struct nv_basic_block *);
 struct nv_basic_block *nvbb_dom_frontier(struct nv_basic_block *);
 int nvcg_replace_value(struct nv_pc *pc, struct nv_value *old_val,
                        struct nv_value *new_val);
 
+typedef void (*nv_pc_pass_func)(void *priv, struct nv_basic_block *b);
+
+void nv_pc_pass_in_order(struct nv_basic_block *, nv_pc_pass_func, void *);
+
 int nv_pc_exec_pass0(struct nv_pc *pc);
 int nv_pc_exec_pass1(struct nv_pc *pc);
 int nv_pc_exec_pass2(struct nv_pc *pc);
diff --git a/src/gallium/drivers/nv50/nv50_pc_emit.c b/src/gallium/drivers/nv50/nv50_pc_emit.c
index 728e2b145d..35bd5ff10f 100644
--- a/src/gallium/drivers/nv50/nv50_pc_emit.c
+++ b/src/gallium/drivers/nv50/nv50_pc_emit.c
@@ -694,7 +694,7 @@ emit_flow(struct nv_pc *pc, struct nv_instruction *i, ubyte flow_op)
 
    set_pred(pc, i);
 
-   if (i->target) {
+   if (i->target && (i->opcode != NV_OP_BREAK)) {
       new_fixup(pc, NV_FIXUP_CFLOW_RELOC, i->target->bin_pos, 0x7ff800, 11);
       pc->emit[0] |= (i->target->bin_pos / 4) << 11;
    }
diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c
index f2f8d0eaa3..e4b5d321db 100644
--- a/src/gallium/drivers/nv50/nv50_pc_optimize.c
+++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c
@@ -120,15 +120,14 @@ nvi_isnop(struct nv_instruction *nvi)
 }
 
 static void
-nv_pc_pass_pre_emission(struct nv_pc *pc, struct nv_basic_block *b)
+nv_pc_pass_pre_emission(void *priv, struct nv_basic_block *b)
 {
+   struct nv_pc *pc = (struct nv_pc *)priv;
    struct nv_basic_block *in;
    struct nv_instruction *nvi, *next;
    int j;
    uint size, n32 = 0;
 
-   b->priv = 0;
-
    for (j = pc->num_blocks - 1; j >= 0 && !pc->bb_list[j]->bin_size; --j);
    if (j >= 0) {
       in = pc->bb_list[j];
@@ -200,17 +199,6 @@ nv_pc_pass_pre_emission(struct nv_pc *pc, struct nv_basic_block *b)
    assert(!b->entry || (b->exit && b->exit->is_long));
 
    pc->bin_size += b->bin_size *= 4;
-
-   /* descend CFG */
-
-   if (!b->out[0])
-      return;
-   if (!b->out[1] && ++(b->out[0]->priv) != b->out[0]->num_in)
-      return;
-
-   for (j = 0; j < 2; ++j)
-      if (b->out[j] && b->out[j] != b)
-         nv_pc_pass_pre_emission(pc, b->out[j]);
 }
 
 int
@@ -219,9 +207,9 @@ nv_pc_exec_pass2(struct nv_pc *pc)
    debug_printf("preparing %u blocks for emission\n", pc->num_blocks);
 
    pc->bb_list = CALLOC(pc->num_blocks, sizeof(struct nv_basic_block *));
-  
    pc->num_blocks = 0;
-   nv_pc_pass_pre_emission(pc, pc->root);
+
+   nv_pc_pass_in_order(pc->root, nv_pc_pass_pre_emission, pc);
 
    return 0;
 }
@@ -307,8 +295,11 @@ nv_pass_fold_stores(struct nv_pass *ctx, struct nv_basic_block *b)
       if (nvi->def[0]->refc > 1)
          continue;
 
-      /* cannot MOV immediate to $oX */
-      if (nvi->src[0]->value->reg.file == NV_FILE_IMM)
+      /* cannot write to $oX when using immediate */
+      for (j = 0; j < 4 && nvi->src[j]; ++j)
+         if (nvi->src[j]->value->reg.file == NV_FILE_IMM)
+            break;
+      if (j < 4)
          continue;
 
       nvi->def[0] = sti->def[0];
@@ -339,7 +330,6 @@ nv_pass_fold_loads(struct nv_pass *ctx, struct nv_basic_block *b)
 
          if (is_immd_move(ld) && nv50_nvi_can_use_imm(nvi, j)) {
             nv_reference(ctx->pc, &nvi->src[j], ld->src[0]->value);
-            debug_printf("folded immediate %i\n", ld->def[0]->n);
             continue;
          }
 
diff --git a/src/gallium/drivers/nv50/nv50_pc_regalloc.c b/src/gallium/drivers/nv50/nv50_pc_regalloc.c
index 172e44f62b..d45dd7f95f 100644
--- a/src/gallium/drivers/nv50/nv50_pc_regalloc.c
+++ b/src/gallium/drivers/nv50/nv50_pc_regalloc.c
@@ -358,6 +358,18 @@ try_join_values(struct nv_pc_pass *ctx, struct nv_value *a, struct nv_value *b)
    do_join_values(ctx, a, b);
 }
 
+static INLINE boolean
+need_new_else_block(struct nv_basic_block *b, struct nv_basic_block *p)
+{
+   int i = 0, n = 0;
+   /* n = number of out-edges of p that are not CFG_EDGE_LOOP_LEAVE */
+   for (; i < 2; ++i)
+      if (p->out[i] && p->out_kind[i] != CFG_EDGE_LOOP_LEAVE)
+         ++n;
+   /* TRUE iff b has several in-edges and p has two such out-edges */
+   return (b->num_in > 1) && (n == 2);
+}
+
 /* For each operand of each PHI in b, generate a new value by inserting a MOV
  * at the end of the block it is coming from and replace the operand with its
  * result. This eliminates liveness conflicts and enables us to let values be
@@ -377,7 +389,7 @@ pass_generate_phi_movs(struct nv_pc_pass *ctx, struct nv_basic_block *b)
       p = pn = b->in[n];
       assert(p);
 
-      if (b->num_in > 1 && p->out[0] && p->out[1]) {
+      if (need_new_else_block(b, p)) {
          pn = new_basic_block(ctx->pc);
 
          if (p->out[0] == b)
@@ -481,32 +493,19 @@ pass_join_values(struct nv_pc_pass *ctx, int iter)
 }
 
 /* Order the instructions so that live intervals can be expressed in numbers. */
-static int
-pass_order_instructions(struct nv_pc_pass *ctx, struct nv_basic_block *b)
+static void
+pass_order_instructions(void *priv, struct nv_basic_block *b)
 {
+   struct nv_pc_pass *ctx = (struct nv_pc_pass *)priv;
    struct nv_instruction *i;
 
-   b->priv = 0;
+   b->pass_seq = ctx->pc->pass_seq;
 
    assert(!b->exit || !b->exit->next);
    for (i = b->phi; i; i = i->next) {
       i->serial = ctx->num_insns;
       ctx->insns[ctx->num_insns++] = i;
    }
-
-   b->pass_seq = ctx->pc->pass_seq;
-
-   if (!b->out[0])
-      return 0;
-   if (!b->out[1] && ++(b->out[0]->priv) != b->out[0]->num_in)
-      return 0;
-
-   if (b->out[0] != b)
-      pass_order_instructions(ctx, b->out[0]);
-   if (b->out[1] && b->out[1] != b)
-      pass_order_instructions(ctx, b->out[1]);
-
-   return 0;
 }
 
 static void
@@ -691,13 +690,15 @@ pass_build_intervals(struct nv_pc_pass *ctx, struct nv_basic_block *b)
    }
 
    /* remaining live-outs are live until the end */
-   for (j = 0; j < ctx->pc->num_values; ++j) {
-      if (!(b->live_set[j / 32] & (1 << (j % 32))))
-         continue;
+   if (b->exit) {
+      for (j = 0; j < ctx->pc->num_values; ++j) {
+         if (!(b->live_set[j / 32] & (1 << (j % 32))))
+            continue;
 #ifdef NV50_RA_DEBUG_LIVEI
-      debug_printf("adding range for live value %i\n", j);
+         debug_printf("adding range for live value %i\n", j);
 #endif
-      add_range(&ctx->pc->values[j], b, b->exit->serial + 1);
+         add_range(&ctx->pc->values[j], b, b->exit->serial + 1);
+      }
    }
    debug_printf("%s: looping through instructions now\n", __func__);
 
@@ -905,10 +906,7 @@ nv_pc_exec_pass1(struct nv_pc *pc)
    }
 
    pc->pass_seq++;
-   ret = pass_order_instructions(ctx, pc->root);
-   assert(!ret && "order instructions");
-   if (ret)
-      goto out;
+   nv_pc_pass_in_order(pc->root, pass_order_instructions, ctx);
 
    pc->pass_seq++;
    ret = pass_build_intervals(ctx, pc->root);
diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c
index 26d1be8db8..54cd36f868 100644
--- a/src/gallium/drivers/nv50/nv50_program.c
+++ b/src/gallium/drivers/nv50/nv50_program.c
@@ -27,6 +27,7 @@
 #include "pipe/p_shader_tokens.h"
 #include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_util.h"
+#include "tgsi/tgsi_dump.h"
 
 static INLINE unsigned
 bitcount4(const uint32_t val)
@@ -186,6 +187,8 @@ prog_immediate(struct nv50_translation_info *ti,
    int c;
    unsigned n = ++ti->immd32_nr;
 
+   tgsi_dump_immediate(imm);
+
    if (n == (1 << (ffs(n) - 1)))
       ti->immd32 = REALLOC(ti->immd32, (n / 2) * 16, (n * 2) * 16);
 
diff --git a/src/gallium/drivers/nv50/nv50_program.h b/src/gallium/drivers/nv50/nv50_program.h
index 654bce59f3..1184d9be3b 100644
--- a/src/gallium/drivers/nv50/nv50_program.h
+++ b/src/gallium/drivers/nv50/nv50_program.h
@@ -92,6 +92,15 @@ struct nv50_program {
 #define NV50_INTERP_FLAT     (1 << 1)
 #define NV50_INTERP_CENTROID (1 << 2)
 
+#define NV50_PROG_MAX_SUBROUTINES 8
+
+/* analyze TGSI and see which TEMP[] are used as subroutine inputs/outputs */
+struct nv50_subroutine {
+   int id;
+   uint32_t argv[4][1]; /* 4 bitmasks, for each of xyzw, only allow 32 TEMPs */
+   uint32_t retv[4][1];
+};
+
 struct nv50_translation_info {
    struct nv50_program *p;
    unsigned inst_nr;
@@ -108,6 +117,8 @@ struct nv50_translation_info {
    uint32_t *immd32;
    unsigned immd32_nr;
    ubyte edgeflag_out;
+   struct nv50_subroutine subr[NV50_PROG_MAX_SUBROUTINES];
+   int subr_nr;
 };
 
 int nv50_generate_code(struct nv50_translation_info *ti);
diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
index 6a9259c898..da33adcaa4 100644
--- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
+++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
@@ -22,6 +22,19 @@
 
 /* XXX: need to clean this up so we get the typecasting right more naturally */
 
+/* LOOP FIXME 1
+ * In bld_store_loop_var, only replace values that belong to the TGSI register
+ * written.
+ * For TGSI MOV, we only associate the source value with the value tracker of
+ * the destination, instead of generating an actual MOV.
+ *
+ * Possible solution: generate PHI functions in loop headers in advance.
+ */
+/* LOOP FIXME 2:
+ * In fetch_by_bb, when going back through a break-block, we miss all of the
+ * definitions from inside the loop.
+ */
+
 #include <unistd.h>
 
 #include "nv50_context.h"
@@ -48,6 +61,8 @@ struct bld_value_stack {
    struct nv_value *top;
    struct nv_value **body;
    unsigned size;
+   uint16_t loop_use; /* 1 bit per loop level, indicates if used/defd */
+   uint16_t loop_def;
 };
 
 static INLINE void
@@ -81,19 +96,6 @@ bld_push_values(struct bld_value_stack *stacks, int n)
             bld_vals_push(&stacks[i * 4 + c]);
 }
 
-#define FETCH_TEMP(i, c)    (bld->tvs[i][c].top)
-#define STORE_TEMP(i, c, v) (bld->tvs[i][c].top = (v))
-#define FETCH_ADDR(i, c)    (bld->avs[i][c].top)
-#define STORE_ADDR(i, c, v) (bld->avs[i][c].top = (v))
-#define FETCH_PRED(i, c)    (bld->pvs[i][c].top)
-#define STORE_PRED(i, c, v) (bld->pvs[i][c].top = (v))
-#define FETCH_OUTR(i, c)    (bld->ovs[i][c].top)
-#define STORE_OUTR(i, c, v)                                         \
-   do {                                                             \
-      bld->ovs[i][c].top = (v);                                     \
-      bld->outputs_written[(i) / 8] |= 1 << (((i) * 4 + (c)) % 32); \
-   } while (0)
-
 struct bld_context {
    struct nv50_translation_info *ti;
 
@@ -108,6 +110,7 @@ struct bld_context {
    struct nv_basic_block *else_bb[BLD_MAX_COND_NESTING];
    int cond_lvl;
    struct nv_basic_block *loop_bb[BLD_MAX_LOOP_NESTING];
+   struct nv_basic_block *brkt_bb[BLD_MAX_LOOP_NESTING];
    int loop_lvl;
 
    struct bld_value_stack tvs[BLD_MAX_TEMPS][4]; /* TGSI_FILE_TEMPORARY */
@@ -127,6 +130,51 @@ struct bld_context {
    uint num_immds;
 };
 
+static INLINE struct nv_value *
+bld_fetch(struct bld_context *bld, struct bld_value_stack *stk, int i, int c)
+{ /* read component c of entry i; stk is flat, 4 stacks per entry */
+   stk[i * 4 + c].loop_use |= 1 << bld->loop_lvl;
+   /* the use is now recorded for the current loop level */
+   return stk[i * 4 + c].top;
+}
+
+static void
+bld_store_loop_var(struct bld_context *, struct bld_value_stack *);
+
+static INLINE void
+bld_store(struct bld_context *bld, struct bld_value_stack *stk, int i, int c,
+          struct nv_value *val)
+{
+   bld_store_loop_var(bld, &stk[i * 4 + c]); /* loop bookkeeping first */
+   /* then make val the current value of component c of entry i */
+   stk[i * 4 + c].top = val;
+}
+
+static INLINE void
+bld_clear_def_use(struct bld_value_stack *stk, int n, int lvl)
+{
+   int i;
+   const uint16_t mask = ~(1 << lvl); /* every bit except bit lvl */
+   /* clear the lvl bit of loop_def and loop_use in all n * 4 stacks */
+   for (i = 0; i < n * 4; ++i) {
+      stk[i].loop_def &= mask;
+      stk[i].loop_use &= mask;
+   }
+}
+
+#define FETCH_TEMP(i, c)    bld_fetch(bld, &bld->tvs[0][0], i, c)
+#define STORE_TEMP(i, c, v) bld_store(bld, &bld->tvs[0][0], i, c, (v))
+#define FETCH_ADDR(i, c)    bld_fetch(bld, &bld->avs[0][0], i, c)
+#define STORE_ADDR(i, c, v) bld_store(bld, &bld->avs[0][0], i, c, (v))
+#define FETCH_PRED(i, c)    bld_fetch(bld, &bld->pvs[0][0], i, c)
+#define STORE_PRED(i, c, v) bld_store(bld, &bld->pvs[0][0], i, c, (v))
+
+#define STORE_OUTR(i, c, v)                                         \
+   do {                                                             \
+      bld->ovs[i][c].top = (v);                                     \
+      bld->outputs_written[(i) / 8] |= 1 << (((i) * 4 + (c)) % 32); \
+   } while (0)
+
 static INLINE void
 bld_warn_uninitialized(struct bld_context *bld, int kind,
                        struct bld_value_stack *stk, struct nv_basic_block *b)
@@ -134,8 +182,8 @@ bld_warn_uninitialized(struct bld_context *bld, int kind,
    long i = (stk - &bld->tvs[0][0]) / 4;
    long c = (stk - &bld->tvs[0][0]) & 3;
 
-   debug_printf("WARNING: TEMP[%li].%li %s used uninitialized in BB:%i\n",
-                i, c, kind ? "may be" : "is", b->id);
+   debug_printf("WARNING: TEMP[%li].%c %s used uninitialized in BB:%i\n",
+                i, (int)('x' + c), kind ? "may be" : "is", b->id);
 }
 
 static INLINE struct nv_value *
@@ -182,7 +230,8 @@ fetch_by_bb(struct bld_value_stack *stack,
       return;
    }
    for (i = 0; i < b->num_in; ++i)
-      fetch_by_bb(stack, vals, n, b->in[i]);
+      if (b->in_kind[i] != CFG_EDGE_BACK)
+         fetch_by_bb(stack, vals, n, b->in[i]);
 }
 
 static INLINE struct nv_value *
@@ -237,12 +286,15 @@ bld_phi(struct bld_context *bld, struct nv_basic_block *b,
       }
 
       for (i = 0; i < n; ++i) {
+         /* if value dominates b, continue to the redefinitions */
          if (nvbb_dominated_by(b, vals[i]->insn->bb))
             continue;
 
+         /* if value dominates any in-block, b should be the dom frontier */
          for (j = 0; j < b->num_in; ++j)
             if (nvbb_dominated_by(b->in[j], vals[i]->insn->bb))
                break;
+         /* otherwise, find the dominance frontier and put the phi there */
          if (j == b->num_in) {
             in = nvbb_dom_frontier(vals[i]->insn->bb);
             val = bld_phi(bld, in, stack);
@@ -269,6 +321,7 @@ bld_phi(struct bld_context *bld, struct nv_basic_block *b,
 static INLINE struct nv_value *
 bld_fetch_global(struct bld_context *bld, struct bld_value_stack *stack)
 {
+   stack->loop_use |= 1 << bld->loop_lvl;
    return bld_phi(bld, bld->pc->current_block, stack);
 }
 
@@ -290,6 +343,79 @@ bld_imm_u32(struct bld_context *bld, uint32_t u)
    return bld->saved_immd[n];
 }
 
+static void
+bld_replace_value(struct nv_pc *, struct nv_basic_block *, struct nv_value *,
+                  struct nv_value *);
+
+/* When setting a variable inside a loop, and we have used it before in the
+ * loop, we need to insert a phi function in the loop header.
+ */
+static void
+bld_store_loop_var(struct bld_context *bld, struct bld_value_stack *stk)
+{
+   struct nv_basic_block *bb;
+   struct nv_instruction *phi;
+   struct nv_value *val;
+   int ll;
+   uint16_t loop_def = stk->loop_def; /* snapshot taken before this store */
+   /* loop_lvl == 0: nothing to track */
+   if (!(ll = bld->loop_lvl))
+      return;
+   stk->loop_def |= 1 << ll; /* record a definition at this loop level */
+   /* no phi if not used at this level yet, or if it was defined before */
+   if ((~stk->loop_use | loop_def) & (1 << ll))
+      return;
+
+#if 0
+   debug_printf("TEMP[%li].%c used before loop redef (def=%x/use=%x)\n",
+                (stk - &bld->tvs[0][0]) / 4,
+                (int)('x' + ((stk - &bld->tvs[0][0]) & 3)),
+                loop_def, stk->loop_use);
+#endif
+
+   stk->loop_def |= 1 << ll; /* no-op: this bit was already set above */
+
+   assert(bld->loop_bb[ll - 1]->num_in == 1);
+
+   /* get last assignment from outside this loop, could be from bld_phi */
+   val = stk->body[stk->size - 1];
+
+   /* create the phi in the loop entry block */
+
+   bb = bld->pc->current_block;
+   bld->pc->current_block = bld->loop_bb[ll - 1];
+
+   phi = new_instruction(bld->pc, NV_OP_PHI);
+
+   bld_def(phi, 0, new_value(bld->pc, val->reg.file, val->reg.type));
+   /* new pass_seq so bld_replace_value visits each block only once */
+   bld->pc->pass_seq++;
+   bld_replace_value(bld->pc, bld->loop_bb[ll - 1], val, phi->def[0]);
+
+   assert(!stk->top);
+   bld_vals_push_val(stk, phi->def[0]);
+
+   phi->target = (struct nv_basic_block *)stk; /* cheat: for bld_loop_end */
+   /* src[1] is a placeholder; bld_loop_end sets the loop-side value */
+   nv_reference(bld->pc, &phi->src[0], val);
+   nv_reference(bld->pc, &phi->src[1], phi->def[0]);
+
+   bld->pc->current_block = bb; /* restore the insertion point */
+}
+
+static void
+bld_loop_end(struct bld_context *bld, struct nv_basic_block *bb)
+{
+   struct nv_instruction *phi;
+   struct nv_value *val;
+   /* walk the phis at the head of bb; phi->target is read as a value stack */
+   for (phi = bb->phi; phi && phi->opcode == NV_OP_PHI; phi = phi->next) {
+      val = bld_fetch_global(bld, (struct bld_value_stack *)phi->target);
+      nv_reference(bld->pc, &phi->src[1], val); /* loop-side operand */
+      phi->target = NULL; /* the stack pointer is no longer needed */
+   }
+}
+
 static INLINE struct nv_value *
 bld_imm_f32(struct bld_context *bld, float f)
 {
@@ -432,7 +558,8 @@ bld_kil(struct bld_context *bld, struct nv_value *src)
 
 static void
 bld_flow(struct bld_context *bld, uint opcode, ubyte cc,
-         struct nv_value *src, boolean plan_reconverge)
+         struct nv_value *src, struct nv_basic_block *target,
+         boolean plan_reconverge)
 {
    struct nv_instruction *nvi;
 
@@ -442,7 +569,9 @@ bld_flow(struct bld_context *bld, uint opcode, ubyte cc,
    nvi = new_instruction(bld->pc, opcode);
    nvi->is_terminator = 1;
    nvi->cc = cc;
-   nvi->flags_src = new_ref(bld->pc, src);
+   nvi->target = target;
+   if (src)
+      nvi->flags_src = new_ref(bld->pc, src);
 }
 
 static ubyte
@@ -1105,14 +1234,14 @@ bld_instruction(struct bld_context *bld,
    {
       struct nv_basic_block *b = new_basic_block(bld->pc);
 
-      nvbb_attach_block(bld->pc->current_block, b);
+      nvbb_attach_block(bld->pc->current_block, b, CFG_EDGE_FORWARD);
 
       bld->join_bb[bld->cond_lvl] = bld->pc->current_block;
       bld->cond_bb[bld->cond_lvl] = bld->pc->current_block;
 
       src1 = bld_predicate(bld, emit_fetch(bld, insn, 0, 0));
 
-      bld_flow(bld, NV_OP_BRA, NV_CC_EQ, src1, FALSE);
+      bld_flow(bld, NV_OP_BRA, NV_CC_EQ, src1, NULL, FALSE);
 
       ++bld->cond_lvl;
       bld_new_block(bld, b);
@@ -1123,7 +1252,7 @@ bld_instruction(struct bld_context *bld,
       struct nv_basic_block *b = new_basic_block(bld->pc);
 
       --bld->cond_lvl;
-      nvbb_attach_block(bld->join_bb[bld->cond_lvl], b);
+      nvbb_attach_block(bld->join_bb[bld->cond_lvl], b, CFG_EDGE_FORWARD);
 
       bld->cond_bb[bld->cond_lvl]->exit->target = b;
       bld->cond_bb[bld->cond_lvl] = bld->pc->current_block;
@@ -1134,13 +1263,13 @@ bld_instruction(struct bld_context *bld,
       bld_new_block(bld, b);
    }
       break;
-   case TGSI_OPCODE_ENDIF: /* XXX: deal with ENDIF; ENDIF; */
+   case TGSI_OPCODE_ENDIF:
    {
       struct nv_basic_block *b = new_basic_block(bld->pc);
 
       --bld->cond_lvl;
-      nvbb_attach_block(bld->pc->current_block, b);
-      nvbb_attach_block(bld->cond_bb[bld->cond_lvl], b);
+      nvbb_attach_block(bld->pc->current_block, b, CFG_EDGE_FORWARD);
+      nvbb_attach_block(bld->cond_bb[bld->cond_lvl], b, CFG_EDGE_FORWARD);
 
       bld->cond_bb[bld->cond_lvl]->exit->target = b;
 
@@ -1154,16 +1283,58 @@ bld_instruction(struct bld_context *bld,
    }
       break;
    case TGSI_OPCODE_BGNLOOP:
-      assert(0);
+   {
+      struct nv_basic_block *bl = new_basic_block(bld->pc);
+      struct nv_basic_block *bb = new_basic_block(bld->pc);
+
+      bld->loop_bb[bld->loop_lvl] = bl;
+      bld->brkt_bb[bld->loop_lvl] = bb;
+
+      bld_flow(bld, NV_OP_BREAKADDR, NV_CC_TR, NULL, bb, FALSE);
+
+      nvbb_attach_block(bld->pc->current_block, bl, CFG_EDGE_LOOP_ENTER);
+
+      bld_new_block(bld, bld->loop_bb[bld->loop_lvl++]);
+
+      if (bld->loop_lvl == bld->pc->loop_nesting_bound)
+         bld->pc->loop_nesting_bound++;
+
+      bld_clear_def_use(&bld->tvs[0][0], BLD_MAX_TEMPS, bld->loop_lvl);
+      bld_clear_def_use(&bld->avs[0][0], BLD_MAX_ADDRS, bld->loop_lvl);
+      bld_clear_def_use(&bld->pvs[0][0], BLD_MAX_PREDS, bld->loop_lvl);
+   }
       break;
    case TGSI_OPCODE_BRK:
-      assert(0);
+   {
+      struct nv_basic_block *bb = bld->brkt_bb[bld->loop_lvl - 1];
+
+      bld_flow(bld, NV_OP_BREAK, NV_CC_TR, NULL, bb, FALSE);
+
+      /* XXX: don't do this for redundant BRKs */
+      nvbb_attach_block(bld->pc->current_block, bb, CFG_EDGE_LOOP_LEAVE);
+   }
       break;
    case TGSI_OPCODE_CONT:
-      assert(0);
+   {
+      struct nv_basic_block *bb = bld->loop_bb[bld->loop_lvl - 1];
+
+      bld_flow(bld, NV_OP_BRA, NV_CC_TR, NULL, bb, FALSE);
+
+      nvbb_attach_block(bld->pc->current_block, bb, CFG_EDGE_BACK);
+   }
       break;
    case TGSI_OPCODE_ENDLOOP:
-      assert(0);
+   {
+      struct nv_basic_block *bb = bld->loop_bb[--bld->loop_lvl];
+
+      bld_flow(bld, NV_OP_BRA, NV_CC_TR, NULL, bb, FALSE);
+
+      nvbb_attach_block(bld->pc->current_block, bb, CFG_EDGE_BACK);
+
+      bld_loop_end(bld, bb); /* replace loop-side operand of the phis */
+
+      bld_new_block(bld, bld->brkt_bb[bld->loop_lvl]);
+   }
       break;
    case TGSI_OPCODE_ABS:
    case TGSI_OPCODE_CEIL:
@@ -1298,6 +1469,17 @@ bld_instruction(struct bld_context *bld,
       emit_store(bld, insn, c, dst0[c]);
 }
 
+static INLINE void
+bld_free_value_trackers(struct bld_value_stack *base, int n)
+{
+   int i, c;
+   /* free the body array of each of the n * 4 value stacks, if allocated */
+   for (i = 0; i < n; ++i)
+      for (c = 0; c < 4; ++c)
+         if (base[i * 4 + c].body)
+            FREE(base[i * 4 + c].body);
+}
+
 int
 nv50_tgsi_to_nc(struct nv_pc *pc, struct nv50_translation_info *ti)
 {
@@ -1309,7 +1491,7 @@ nv50_tgsi_to_nc(struct nv_pc *pc, struct nv50_translation_info *ti)
    bld->pc = pc;
    bld->ti = ti;
 
-   pc->loop_nesting_bound = 1; /* XXX: should work with 0 */
+   pc->loop_nesting_bound = 1;
 
    c = util_bitcount(bld->ti->p->fp.interp >> 24);
    if (c && ti->p->type == PIPE_SHADER_FRAGMENT) {
@@ -1335,18 +1517,23 @@ nv50_tgsi_to_nc(struct nv_pc *pc, struct nv50_translation_info *ti)
       }
    }
 
+   bld_free_value_trackers(&bld->tvs[0][0], BLD_MAX_TEMPS);
+   bld_free_value_trackers(&bld->avs[0][0], BLD_MAX_ADDRS);
+   bld_free_value_trackers(&bld->pvs[0][0], BLD_MAX_PREDS);
+
+   bld_free_value_trackers(&bld->ovs[0][0], PIPE_MAX_SHADER_OUTPUTS);
+
    FREE(bld);
    return 0;
 }
 
-#if 0
 /* If a variable is assigned in a loop, replace all references to the value
  * from outside the loop with a phi value.
  */
 static void
-bld_adjust_nv_refs(struct nv_pc *pc, struct nv_basic_block *b,
-                   struct nv_value *old_val,
-                   struct nv_value *new_val)
+bld_replace_value(struct nv_pc *pc, struct nv_basic_block *b,
+                  struct nv_value *old_val,
+                  struct nv_value *new_val)
 {
    struct nv_instruction *nvi;
 
@@ -1361,12 +1548,12 @@ bld_adjust_nv_refs(struct nv_pc *pc, struct nv_basic_block *b,
       if (nvi->flags_src && nvi->flags_src->value == old_val)
          nv_reference(pc, &nvi->flags_src, new_val);
    }
+
    b->pass_seq = pc->pass_seq;
 
    if (b->out[0] && b->out[0]->pass_seq < pc->pass_seq)
-      bld_adjust_nv_refs(pc, b, old_val, new_val);
+      bld_replace_value(pc, b->out[0], old_val, new_val);
 
    if (b->out[1] && b->out[1]->pass_seq < pc->pass_seq)
-      bld_adjust_nv_refs(pc, b, old_val, new_val);
+      bld_replace_value(pc, b->out[1], old_val, new_val);
 }
-#endif
-- 
cgit v1.2.3


From 34e0db4c509fd669a7713c63848a98d89463ce1a Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Wed, 11 Aug 2010 18:44:26 +0200
Subject: nv50: more constant folding

---
 src/gallium/drivers/nv50/nv50_pc_optimize.c | 204 ++++++++++++++++++++++++----
 1 file changed, 177 insertions(+), 27 deletions(-)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c
index e4b5d321db..64ffeaf430 100644
--- a/src/gallium/drivers/nv50/nv50_pc_optimize.c
+++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c
@@ -248,18 +248,24 @@ check_swap_src_0_1(struct nv_instruction *nvi)
       return;
    assert(src0 && src1);
 
+   if (src1->value->reg.file == NV_FILE_IMM) {
+      /* should only be present from folding a constant MUL part of a MAD */
+      assert(nvi->opcode == NV_OP_ADD);
+      return;
+   }
+
    if (is_cmem_load(src0->value->insn)) {
       if (!is_cmem_load(src1->value->insn)) {
          nvi->src[0] = src1;
-	 nvi->src[1] = src0;
-	 /* debug_printf("swapping cmem load to 1\n"); */
+         nvi->src[1] = src0;
+         /* debug_printf("swapping cmem load to 1\n"); */
       }
    } else
    if (is_smem_load(src1->value->insn)) {
       if (!is_smem_load(src0->value->insn)) {
          nvi->src[0] = src1;
-	 nvi->src[1] = src0;
-	 /* debug_printf("swapping smem load to 0\n"); */
+         nvi->src[1] = src0;
+         /* debug_printf("swapping smem load to 0\n"); */
       }
    }
 
@@ -435,47 +441,168 @@ find_immediate(struct nv_ref *ref)
    return (src->reg.file == NV_FILE_IMM) ? src : NULL;
 }
 
+static void
+modifiers_apply(uint32_t *val, ubyte type, ubyte mod)
+{ /* evaluate source modifiers (ABS first, then NEG) on an immediate */
+   if (mod & NV_MOD_ABS) {
+      if (type == NV_TYPE_F32)
+         *val &= 0x7fffffff; /* clear the float sign bit */
+      else
+      if ((*val) & 0x80000000) /* negative integer: negate it */
+         *val = ~(*val) + 1;
+   }
+   if (mod & NV_MOD_NEG) {
+      if (type == NV_TYPE_F32)
+         *val ^= 0x80000000; /* flip the float sign bit */
+      else
+         *val = ~(*val) + 1; /* two's complement negation */
+   }
+}
+
+static INLINE uint
+modifiers_opcode(ubyte mod)
+{ /* opcode equivalent to applying mod alone; NV_OP_NOP if there is none */
+   switch (mod) {
+   case NV_MOD_NEG: return NV_OP_NEG;
+   case NV_MOD_ABS: return NV_OP_ABS;
+   case 0:
+      return NV_OP_MOV;
+   default: /* any other modifier value */
+      return NV_OP_NOP;
+   }
+}
+
+static void
+constant_expression(struct nv_pc *pc, struct nv_instruction *nvi,
+                    struct nv_value *src0, struct nv_value *src1)
+{
+   struct nv_value *val;
+   union {
+      float f32;
+      uint32_t u32;
+      int32_t s32;
+   } u0, u1, u;
+   ubyte type;
+   /* fold "op imm0, imm1" into MOV imm; a MAD becomes ADD src2, imm */
+   if (!nvi->def[0])
+      return;
+   type = nvi->def[0]->reg.type;
+
+   u.u32 = 0;
+   u0.u32 = src0->reg.imm.u32;
+   u1.u32 = src1->reg.imm.u32;
+   /* each immediate takes the modifiers of its own source reference */
+   modifiers_apply(&u0.u32, type, nvi->src[0]->mod);
+   modifiers_apply(&u1.u32, type, nvi->src[1]->mod);
+
+   switch (nvi->opcode) {
+   case NV_OP_MAD:
+      if (nvi->src[2]->value->reg.file != NV_FILE_GPR)
+         return;
+      /* fall through */
+   case NV_OP_MUL:
+      switch (type) {
+      case NV_TYPE_F32: u.f32 = u0.f32 * u1.f32; break;
+      case NV_TYPE_U32: u.u32 = u0.u32 * u1.u32; break;
+      case NV_TYPE_S32: u.s32 = u0.s32 * u1.s32; break;
+      default:
+         assert(0);
+         break;
+      }
+      break;
+   case NV_OP_ADD:
+      switch (type) {
+      case NV_TYPE_F32: u.f32 = u0.f32 + u1.f32; break;
+      case NV_TYPE_U32: u.u32 = u0.u32 + u1.u32; break;
+      case NV_TYPE_S32: u.s32 = u0.s32 + u1.s32; break;
+      default:
+         assert(0);
+         break;
+      }
+      break;
+   case NV_OP_SUB:
+      switch (type) {
+      case NV_TYPE_F32: u.f32 = u0.f32 - u1.f32; break;
+      case NV_TYPE_U32: u.u32 = u0.u32 - u1.u32; break;
+      case NV_TYPE_S32: u.s32 = u0.s32 - u1.s32; break;
+      default:
+         assert(0);
+         break;
+      }
+      break;
+   default: /* opcode not handled here: leave the instruction untouched */
+      return;
+   }
+   /* replace both immediate sources with one new immediate result */
+   nvi->opcode = NV_OP_MOV;
+
+   val = new_value(pc, NV_FILE_IMM, type);
+
+   val->reg.imm.u32 = u.u32;
+
+   nv_reference(pc, &nvi->src[1], NULL);
+   nv_reference(pc, &nvi->src[0], val);
+
+   if (nvi->src[2]) { /* from MAD */
+      nvi->src[1] = nvi->src[0];
+      nvi->src[0] = nvi->src[2];
+      nvi->src[2] = NULL;
+      nvi->opcode = NV_OP_ADD;
+   }
+}
+
 static void
 constant_operand(struct nv_pc *pc,
                  struct nv_instruction *nvi, struct nv_value *val, int s)
 {
+   union {
+      float f32;
+      uint32_t u32;
+      int32_t s32;
+   } u;
    int t = s ? 0 : 1;
+   uint op;
    ubyte type;
 
    if (!nvi->def[0])
       return;
    type = nvi->def[0]->reg.type;
 
+   u.u32 = val->reg.imm.u32;
+   modifiers_apply(&u.u32, type, nvi->src[s]->mod);
+
    switch (nvi->opcode) {
    case NV_OP_MUL:
-      if ((type == NV_TYPE_F32 && val->reg.imm.f32 == 1.0f) ||
-          (NV_TYPE_ISINT(type) && val->reg.imm.u32 == 1)) {
-         nvi->opcode = NV_OP_MOV;
+      if ((type == NV_TYPE_F32 && u.f32 == 1.0f) ||
+          (NV_TYPE_ISINT(type) && u.u32 == 1)) {
+         if ((op = modifiers_opcode(nvi->src[t]->mod)) == NV_OP_NOP)
+            break;
+         nvi->opcode = op;
          nv_reference(pc, &nvi->src[s], NULL);
-         if (!s) {
-            nvi->src[0] = nvi->src[1];
-            nvi->src[1] = NULL;
-         }
+         nvi->src[0] = nvi->src[t];
+         nvi->src[1] = NULL;
       } else
-      if ((type == NV_TYPE_F32 && val->reg.imm.f32 == 2.0f) ||
-          (NV_TYPE_ISINT(type) && val->reg.imm.u32 == 2)) {
+      if ((type == NV_TYPE_F32 && u.f32 == 2.0f) ||
+          (NV_TYPE_ISINT(type) && u.u32 == 2)) {
          nvi->opcode = NV_OP_ADD;
          nv_reference(pc, &nvi->src[s], nvi->src[t]->value);
+         nvi->src[s]->mod = nvi->src[t]->mod;
       } else
-      if (type == NV_TYPE_F32 && val->reg.imm.f32 == -1.0f) {
-         nvi->opcode = NV_OP_NEG;
+      if (type == NV_TYPE_F32 && u.f32 == -1.0f) {
+         if (nvi->src[t]->mod & NV_MOD_NEG)
+            nvi->opcode = NV_OP_MOV;
+         else
+            nvi->opcode = NV_OP_NEG;
          nv_reference(pc, &nvi->src[s], NULL);
          nvi->src[0] = nvi->src[t];
          nvi->src[1] = NULL;
       } else
-      if (type == NV_TYPE_F32 && val->reg.imm.f32 == -2.0f) {
+      if (type == NV_TYPE_F32 && u.f32 == -2.0f) {
          nvi->opcode = NV_OP_ADD;
-         assert(!nvi->src[s]->mod);
          nv_reference(pc, &nvi->src[s], nvi->src[t]->value);
-         nvi->src[t]->mod ^= NV_MOD_NEG;
-         nvi->src[s]->mod |= NV_MOD_NEG;
+         nvi->src[s]->mod = (nvi->src[t]->mod ^= NV_MOD_NEG);
       } else
-      if (val->reg.imm.u32 == 0) {
+      if (u.u32 == 0) {
          nvi->opcode = NV_OP_MOV;
          nv_reference(pc, &nvi->src[t], NULL);
          if (s) {
@@ -485,13 +612,29 @@ constant_operand(struct nv_pc *pc,
       }
       break;
    case NV_OP_ADD:
-      if (val->reg.imm.u32 == 0) {
-         nvi->opcode = NV_OP_MOV;
+      if (u.u32 == 0) {
+         if ((op = modifiers_opcode(nvi->src[t]->mod)) == NV_OP_NOP)
+            break;
+         nvi->opcode = op;
          nv_reference(pc, &nvi->src[s], NULL);
          nvi->src[0] = nvi->src[t];
          nvi->src[1] = NULL;
       }
       break;
+   case NV_OP_RCP:
+      u.f32 = 1.0f / u.f32;
+      (val = new_value(pc, NV_FILE_IMM, NV_TYPE_F32))->reg.imm.f32 = u.f32;
+      nvi->opcode = NV_OP_MOV;
+      assert(s == 0);
+      nv_reference(pc, &nvi->src[0], val);
+      break;
+   case NV_OP_RSQ:
+      u.f32 = 1.0f / sqrtf(u.f32);
+      (val = new_value(pc, NV_FILE_IMM, NV_TYPE_F32))->reg.imm.f32 = u.f32;
+      nvi->opcode = NV_OP_MOV;
+      assert(s == 0);
+      nv_reference(pc, &nvi->src[0], val);
+      break;
    default:
       break;
    }
@@ -509,11 +652,18 @@ nv_pass_lower_arith(struct nv_pass *ctx, struct nv_basic_block *b)
 
       next = nvi->next;
 
-      if ((src = find_immediate(nvi->src[0])) != NULL)
-         constant_operand(ctx->pc, nvi, src, 0);
-      else
-      if ((src = find_immediate(nvi->src[1])) != NULL)
-         constant_operand(ctx->pc, nvi, src, 1);
+      src0 = find_immediate(nvi->src[0]);
+      src1 = find_immediate(nvi->src[1]);
+
+      if (src0 && src1)
+         constant_expression(ctx->pc, nvi, src0, src1);
+      else {
+         if (src0)
+            constant_operand(ctx->pc, nvi, src0, 0);
+         else
+         if (src1)
+            constant_operand(ctx->pc, nvi, src1, 1);
+      }
 
       /* try to combine MUL, ADD into MAD */
       if (nvi->opcode != NV_OP_ADD)
-- 
cgit v1.2.3


From 4de293bb9acd1ecda683f735af32f7485a0f213e Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Sun, 15 Aug 2010 21:37:50 +0200
Subject: nv50: loops part 2

At least the mesa demo glsl/mandelbrot should work now.
---
 src/gallium/drivers/nv50/nv50_pc.h          |   8 +-
 src/gallium/drivers/nv50/nv50_pc_emit.c     |   1 +
 src/gallium/drivers/nv50/nv50_pc_optimize.c |   4 +-
 src/gallium/drivers/nv50/nv50_pc_print.c    |   2 +-
 src/gallium/drivers/nv50/nv50_screen.c      |  27 ++++
 src/gallium/drivers/nv50/nv50_screen.h      |   4 +-
 src/gallium/drivers/nv50/nv50_tgsi_to_nc.c  | 233 ++++++++++++++++++----------
 7 files changed, 189 insertions(+), 90 deletions(-)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_pc.h b/src/gallium/drivers/nv50/nv50_pc.h
index 8b1c9b3a72..b24a3067b8 100644
--- a/src/gallium/drivers/nv50/nv50_pc.h
+++ b/src/gallium/drivers/nv50/nv50_pc.h
@@ -47,7 +47,7 @@
 #define NV_OP_SHL       17
 #define NV_OP_SHR       18
 #define NV_OP_RCP       19
-/* gap */
+#define NV_OP_UNDEF     20
 #define NV_OP_RSQ       21
 #define NV_OP_LG2       22
 #define NV_OP_SIN       23
@@ -360,6 +360,12 @@ new_value(struct nv_pc *pc, ubyte file, ubyte type)
    return value;
 }
 
+static INLINE struct nv_value *
+new_value_like(struct nv_pc *pc, struct nv_value *like)
+{
+   return new_value(pc, like->reg.file, like->reg.type); /* same file/type */
+}
+
 static INLINE struct nv_ref *
 new_ref(struct nv_pc *pc, struct nv_value *val)
 {
diff --git a/src/gallium/drivers/nv50/nv50_pc_emit.c b/src/gallium/drivers/nv50/nv50_pc_emit.c
index 35bd5ff10f..fe44b327ab 100644
--- a/src/gallium/drivers/nv50/nv50_pc_emit.c
+++ b/src/gallium/drivers/nv50/nv50_pc_emit.c
@@ -1130,6 +1130,7 @@ nv50_emit_instruction(struct nv_pc *pc, struct nv_instruction *i)
       pc->emit[1] = 0xe0000000;
       break;
    case NV_OP_PHI:
+   case NV_OP_UNDEF:
    case NV_OP_SUB:
       NOUVEAU_ERR("operation \"%s\" should have been eliminated\n",
 		  nv_opcode_name(i->opcode));
diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c
index 64ffeaf430..daf63a3d20 100644
--- a/src/gallium/drivers/nv50/nv50_pc_optimize.c
+++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c
@@ -89,7 +89,7 @@ inst_cullable(struct nv_instruction *nvi)
 static INLINE boolean
 nvi_isnop(struct nv_instruction *nvi)
 {
-   if (nvi->opcode == NV_OP_EXPORT)
+   if (nvi->opcode == NV_OP_EXPORT || nvi->opcode == NV_OP_UNDEF)
       return TRUE;
 
    if (nvi->fixed ||
@@ -849,7 +849,7 @@ nv_pass_dce(struct nv_pass_dce *ctx, struct nv_basic_block *b)
    int j;
    struct nv_instruction *nvi, *next;
 
-   for (nvi = b->entry; nvi; nvi = next) {
+   for (nvi = b->phi ? b->phi : b->entry; nvi; nvi = next) {
       next = nvi->next;
 
       if (inst_cullable(nvi)) {
diff --git a/src/gallium/drivers/nv50/nv50_pc_print.c b/src/gallium/drivers/nv50/nv50_pc_print.c
index c812dbd066..a4f567bde4 100644
--- a/src/gallium/drivers/nv50/nv50_pc_print.c
+++ b/src/gallium/drivers/nv50/nv50_pc_print.c
@@ -59,7 +59,7 @@ static const char *nv_opcode_names[NV_OP_COUNT + 1] = {
    "shl",
    "shr",
    "rcp",
-   "(undefined)",
+   "undef",
    "rsqrt",
    "lg2",
    "sin",
diff --git a/src/gallium/drivers/nv50/nv50_screen.c b/src/gallium/drivers/nv50/nv50_screen.c
index e0c06c29ba..78137d6940 100644
--- a/src/gallium/drivers/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nv50/nv50_screen.c
@@ -253,14 +253,23 @@ nv50_screen_relocs(struct nv50_screen *screen)
 	}
 }
 
+#ifndef NOUVEAU_GETPARAM_GRAPH_UNITS
+# define NOUVEAU_GETPARAM_GRAPH_UNITS 13
+#endif
+
+extern int nouveau_device_get_param(struct nouveau_device *dev,
+                                    uint64_t param, uint64_t *value);
+
 struct pipe_screen *
 nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 {
 	struct nv50_screen *screen = CALLOC_STRUCT(nv50_screen);
 	struct nouveau_channel *chan;
 	struct pipe_screen *pscreen;
+	uint64_t value;
 	unsigned chipset = dev->chipset;
 	unsigned tesla_class = 0;
+	unsigned stack_size;
 	int ret, i;
 	const unsigned rl = NOUVEAU_BO_VRAM | NOUVEAU_BO_RD;
 
@@ -478,6 +487,24 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 	OUT_RING  (chan, 0x121 | (NV50_CB_PGP << 12));
 	OUT_RING  (chan, 0x131 | (NV50_CB_PFP << 12));
 
+	/* shader stack */
+	nouveau_device_get_param(dev, NOUVEAU_GETPARAM_GRAPH_UNITS, &value);
+
+	stack_size  = util_bitcount(value & 0xffff);
+	stack_size *= util_bitcount((value >> 24) & 0xf);
+	stack_size *= 32 * 64 * 8;
+
+	ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 1 << 16,
+			     stack_size, &screen->stack_bo);
+	if (ret) {
+		nv50_screen_destroy(pscreen);
+		return NULL;
+	}
+	BEGIN_RING(chan, screen->tesla, NV50TCL_STACK_ADDRESS_HIGH, 3);
+	OUT_RELOCh(chan, screen->stack_bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+	OUT_RELOCl(chan, screen->stack_bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+	OUT_RING  (chan, 4);
+
 	/* Vertex array limits - max them out */
 	for (i = 0; i < 16; i++) {
 		BEGIN_RING(chan, screen->tesla,
diff --git a/src/gallium/drivers/nv50/nv50_screen.h b/src/gallium/drivers/nv50/nv50_screen.h
index a491ba31b2..1517f5608f 100644
--- a/src/gallium/drivers/nv50/nv50_screen.h
+++ b/src/gallium/drivers/nv50/nv50_screen.h
@@ -22,11 +22,11 @@ struct nv50_screen {
 
 	struct nouveau_resource *immd_heap;
 
-	struct pipe_resource *strm_vbuf[16];
-
 	struct nouveau_bo *tic;
 	struct nouveau_bo *tsc;
 
+	struct nouveau_bo *stack_bo;
+
 	boolean force_push;
 };
 
diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
index da33adcaa4..7e77ed6ef6 100644
--- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
+++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
@@ -22,19 +22,6 @@
 
 /* XXX: need to clean this up so we get the typecasting right more naturally */
 
-/* LOOP FIXME 1
- * In bld_store_loop_var, only replace values that belong to the TGSI register
- * written.
- * For TGSI MOV, we only associate the source value with the value tracker of
- * the destination, instead of generating an actual MOV.
- *
- * Possible solution: generate PHI functions in loop headers in advance.
- */
-/* LOOP FIXME 2:
- * In fetch_by_bb, when going back through a break-block, we miss all of the
- * definitions from inside the loop.
- */
-
 #include <unistd.h>
 
 #include "nv50_context.h"
@@ -78,6 +65,24 @@ bld_vals_push_val(struct bld_value_stack *stk, struct nv_value *val)
    stk->body[stk->size++] = val;
 }
 
+static INLINE boolean
+bld_vals_del_val(struct bld_value_stack *stk, struct nv_value *val)
+{
+   unsigned i;
+   /* Remove the topmost occurrence of val; return FALSE if it is absent. */
+   for (i = stk->size; i > 0; --i) /* i is index + 1; 0 means not found */
+      if (stk->body[i - 1] == val)
+         break;
+   if (!i)
+      return FALSE;
+
+   if (i != stk->size) /* not the top entry: move the top one into the gap */
+      stk->body[i - 1] = stk->body[stk->size - 1];
+
+   --stk->size; /* XXX: old size in REALLOC */
+   return TRUE;
+}
+
 static INLINE void
 bld_vals_push(struct bld_value_stack *stk)
 {
@@ -118,7 +123,7 @@ struct bld_context {
    struct bld_value_stack pvs[BLD_MAX_PREDS][4]; /* TGSI_FILE_PREDICATE */
    struct bld_value_stack ovs[PIPE_MAX_SHADER_OUTPUTS][4];
 
-   uint32_t outputs_written[PIPE_MAX_SHADER_OUTPUTS / 32];
+   uint32_t outputs_written[(PIPE_MAX_SHADER_OUTPUTS + 31) / 32];
 
    struct nv_value *frgcrd[4];
    struct nv_value *sysval[4];
@@ -130,6 +135,21 @@ struct bld_context {
    uint num_immds;
 };
 
+static INLINE ubyte
+bld_stack_file(struct bld_context *bld, struct bld_value_stack *stk)
+{
+   if (stk < &bld->avs[0][0]) /* stk is classified by its address in bld */
+      return NV_FILE_GPR;
+   else
+   if (stk < &bld->pvs[0][0]) /* at or past avs, before pvs */
+      return NV_FILE_ADDR;
+   else
+   if (stk < &bld->ovs[0][0]) /* at or past pvs, before ovs */
+      return NV_FILE_FLAGS;
+   else
+      return NV_FILE_OUT; /* at or past ovs */
+}
+
 static INLINE struct nv_value *
 bld_fetch(struct bld_context *bld, struct bld_value_stack *stk, int i, int c)
 {
@@ -138,16 +158,29 @@ bld_fetch(struct bld_context *bld, struct bld_value_stack *stk, int i, int c)
    return stk[i * 4 + c].top;
 }
 
-static void
-bld_store_loop_var(struct bld_context *, struct bld_value_stack *);
+static struct nv_value *
+bld_loop_phi(struct bld_context *, struct bld_value_stack *, struct nv_value *);
 
+/* If a variable is defined in a loop without prior use, we don't need
+ * a phi in the loop header to account for backwards flow.
+ *
+ * However, if this variable is then also used outside the loop, we do
+ * need a phi after all. But we must not use this phi's def inside the
+ * loop, so we can eliminate the phi if it is unused later.
+ */
 static INLINE void
 bld_store(struct bld_context *bld, struct bld_value_stack *stk, int i, int c,
           struct nv_value *val)
 {
-   bld_store_loop_var(bld, &stk[i * 4 + c]);
+   const uint16_t m = 1 << bld->loop_lvl;
+
+   stk = &stk[i * 4 + c];
 
-   stk[i * 4 + c].top = val;
+   if (bld->loop_lvl && !(m & (stk->loop_def | stk->loop_use)))
+      bld_loop_phi(bld, stk, val);
+
+   stk->top = val;
+   stk->loop_def |= 1 << bld->loop_lvl;
 }
 
 static INLINE void
@@ -182,6 +215,9 @@ bld_warn_uninitialized(struct bld_context *bld, int kind,
    long i = (stk - &bld->tvs[0][0]) / 4;
    long c = (stk - &bld->tvs[0][0]) & 3;
 
+   if (c == 3)
+      c = -1;
+
    debug_printf("WARNING: TEMP[%li].%c %s used uninitialized in BB:%i\n",
                 i, (int)('x' + c), kind ? "may be" : "is", b->id);
 }
@@ -237,6 +273,14 @@ fetch_by_bb(struct bld_value_stack *stack,
 static INLINE struct nv_value *
 bld_load_imm_u32(struct bld_context *bld, uint32_t u);
 
+static INLINE struct nv_value *
+bld_undef(struct bld_context *bld, ubyte file)
+{
+   struct nv_instruction *nvi = new_instruction(bld->pc, NV_OP_UNDEF);
+   /* def 0 of the UNDEF insn is a new U32 value in register file 'file' */
+   return bld_def(nvi, 0, new_value(bld->pc, file, NV_TYPE_U32));
+}
+
 static struct nv_value *
 bld_phi(struct bld_context *bld, struct nv_basic_block *b,
         struct bld_value_stack *stack)
@@ -267,21 +311,19 @@ bld_phi(struct bld_context *bld, struct nv_basic_block *b,
             if (in->num_in == 1) {
                in = in->in[0];
             } else {
-               if (!nvbb_reachable_by(in->in[0], vals[0]->insn->bb, b)) {
+               if (!nvbb_reachable_by(in->in[0], vals[0]->insn->bb, b))
                   in = in->in[0];
-                  break;
-               }
-               if (!nvbb_reachable_by(in->in[1], vals[0]->insn->bb, b)) {
+               else
+               if (!nvbb_reachable_by(in->in[1], vals[0]->insn->bb, b))
                   in = in->in[1];
-                  break;
-               }
-               in = in->in[0];
+               else
+                  in = in->in[0];
             }
          }
          bld->pc->current_block = in;
 
          /* should make this a no-op */
-         bld_vals_push_val(stack, bld_load_imm_u32(bld, 0));
+         bld_vals_push_val(stack, bld_undef(bld, vals[0]->reg.file));
          continue;
       }
 
@@ -318,10 +360,55 @@ bld_phi(struct bld_context *bld, struct nv_basic_block *b,
    return phi->def[0];
 }
 
+static struct nv_value *
+bld_loop_phi(struct bld_context *bld, struct bld_value_stack *stack,
+             struct nv_value *def)
+{
+   struct nv_basic_block *bb = bld->pc->current_block;
+   struct nv_instruction *phi;
+   struct nv_value *val;
+   /* Create a PHI for 'stack' with loop_bb[loop_lvl - 1] as current block. */
+   val = bld_phi(bld, bld->pc->current_block, stack);
+   if (!val) {
+      bld->pc->current_block = bld->loop_bb[bld->loop_lvl - 1]->in[0];
+      /* no value found: create an UNDEF with that in[0] block as current */
+      val = bld_undef(bld, bld_stack_file(bld, stack));
+   }
+
+   bld->pc->current_block = bld->loop_bb[bld->loop_lvl - 1];
+
+   phi = new_instruction(bld->pc, NV_OP_PHI);
+
+   bld_def(phi, 0, new_value_like(bld->pc, val));
+   if (!def)
+      def = phi->def[0]; /* no def given: the phi's own def is source 1 */
+
+   bld_vals_push_val(stack, phi->def[0]);
+
+   phi->target = (struct nv_basic_block *)stack; /* cheat: read back in bld_loop_end */
+
+   nv_reference(bld->pc, &phi->src[0], val);
+   nv_reference(bld->pc, &phi->src[1], def);
+
+   bld->pc->current_block = bb; /* restore the block saved on entry */
+
+   return phi->def[0];
+}
+
 static INLINE struct nv_value *
 bld_fetch_global(struct bld_context *bld, struct bld_value_stack *stack)
 {
-   stack->loop_use |= 1 << bld->loop_lvl;
+   const uint16_t m = 1 << bld->loop_lvl;
+   const uint16_t use = stack->loop_use;
+
+   stack->loop_use |= m;
+
+   /* If neither used nor def'd inside the loop, build a phi in foresight,
+    * so we don't have to replace stuff later on, which requires tracking.
+    */
+   if (bld->loop_lvl && !((use | stack->loop_def) & m))
+      return bld_loop_phi(bld, stack, NULL);
+
    return bld_phi(bld, bld->pc->current_block, stack);
 }
 
@@ -347,72 +434,50 @@ static void
 bld_replace_value(struct nv_pc *, struct nv_basic_block *, struct nv_value *,
                   struct nv_value *);
 
-/* When setting a variable inside a loop, and we have used it before in the
- * loop, we need to insert a phi function in the loop header.
+/* Replace the source of the phi in the loop header by the last assignment,
+ * or eliminate the phi function if there is no assignment inside the loop.
+ *
+ * Redundancy situation 1 - (used) but (not redefined) value:
+ *  %3 = phi %0, %3 = %3 is used
+ *  %3 = phi %0, %4 = is new definition
+ *
+ * Redundancy situation 2 - (not used) but (redefined) value:
+ *  %3 = phi %0, %2 = %2 is used, %3 could be used outside, deleted by DCE
  */
 static void
-bld_store_loop_var(struct bld_context *bld, struct bld_value_stack *stk)
+bld_loop_end(struct bld_context *bld, struct nv_basic_block *bb)
 {
-   struct nv_basic_block *bb;
-   struct nv_instruction *phi;
+   struct nv_instruction *phi, *next;
    struct nv_value *val;
-   int ll;
-   uint16_t loop_def = stk->loop_def;
-
-   if (!(ll = bld->loop_lvl))
-      return;
-   stk->loop_def |= 1 << ll;
-
-   if ((~stk->loop_use | loop_def) & (1 << ll))
-      return;
-
-#if 0
-   debug_printf("TEMP[%li].%c used before loop redef (def=%x/use=%x)\n",
-                (stk - &bld->tvs[0][0]) / 4,
-                (int)('x' + ((stk - &bld->tvs[0][0]) & 3)),
-                loop_def, stk->loop_use);
-#endif
+   struct bld_value_stack *stk;
+   int s;
 
-   stk->loop_def |= 1 << ll;
+   for (phi = bb->phi; phi && phi->opcode == NV_OP_PHI; phi = next) {
+      next = phi->next;
 
-   assert(bld->loop_bb[ll - 1]->num_in == 1);
-
-   /* get last assignment from outside this loop, could be from bld_phi */
-   val = stk->body[stk->size - 1];
-
-   /* create the phi in the loop entry block */
-
-   bb = bld->pc->current_block;
-   bld->pc->current_block = bld->loop_bb[ll - 1];
-
-   phi = new_instruction(bld->pc, NV_OP_PHI);
+      stk = (struct bld_value_stack *)phi->target;
+      phi->target = NULL;
 
-   bld_def(phi, 0, new_value(bld->pc, val->reg.file, val->reg.type));
+      val = bld_fetch_global(bld, stk);
 
-   bld->pc->pass_seq++;
-   bld_replace_value(bld->pc, bld->loop_bb[ll - 1], val, phi->def[0]);
+      nv_reference(bld->pc, &phi->src[1], val);
 
-   assert(!stk->top);
-   bld_vals_push_val(stk, phi->def[0]);
+      s = -1;
+      if (phi->src[0]->value == phi->def[0] ||
+          phi->src[0]->value == phi->src[1]->value)
+         s = 1;
+      else
+      if (phi->src[1]->value == phi->def[0])
+         s = 0;
 
-   phi->target = (struct nv_basic_block *)stk; /* cheat */
+      if (s >= 0) {
+         bld_vals_del_val(stk, phi->def[0]);
 
-   nv_reference(bld->pc, &phi->src[0], val);
-   nv_reference(bld->pc, &phi->src[1], phi->def[0]);
+         ++bld->pc->pass_seq;
+         bld_replace_value(bld->pc, bb, phi->def[0], phi->src[s]->value);
 
-   bld->pc->current_block = bb;
-}
-
-static void
-bld_loop_end(struct bld_context *bld, struct nv_basic_block *bb)
-{
-   struct nv_instruction *phi;
-   struct nv_value *val;
-
-   for (phi = bb->phi; phi && phi->opcode == NV_OP_PHI; phi = phi->next) {
-      val = bld_fetch_global(bld, (struct bld_value_stack *)phi->target);
-      nv_reference(bld->pc, &phi->src[1], val);
-      phi->target = NULL;
+         nv_nvi_delete(phi);
+      }
    }
 }
 
@@ -437,7 +502,7 @@ bld_insn_1(struct bld_context *bld, uint opcode, struct nv_value *src0)
 
 static struct nv_value *
 bld_insn_2(struct bld_context *bld, uint opcode,
-	      struct nv_value *src0, struct nv_value *src1)
+           struct nv_value *src0, struct nv_value *src1)
 {
    struct nv_instruction *insn = new_instruction(bld->pc, opcode);
 
@@ -449,8 +514,8 @@ bld_insn_2(struct bld_context *bld, uint opcode,
 
 static struct nv_value *
 bld_insn_3(struct bld_context *bld, uint opcode,
-              struct nv_value *src0, struct nv_value *src1,
-              struct nv_value *src2)
+           struct nv_value *src0, struct nv_value *src1,
+           struct nv_value *src2)
 {
    struct nv_instruction *insn = new_instruction(bld->pc, opcode);
 
-- 
cgit v1.2.3


From e7a0bfa69a6ce45bb53baa8220eae418225c5649 Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Mon, 16 Aug 2010 15:21:23 +0200
Subject: nv50: flatten simple IF/ELSE/ENDIF constructs

Less branching means fewer instructions and less thread divergence.
---
 src/gallium/drivers/nv50/nv50_pc.c          |  14 ++++
 src/gallium/drivers/nv50/nv50_pc.h          |   1 +
 src/gallium/drivers/nv50/nv50_pc_optimize.c | 116 +++++++++++++++++++++++-----
 src/gallium/drivers/nv50/nv50_tgsi_to_nc.c  |  16 +++-
 4 files changed, 123 insertions(+), 24 deletions(-)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_pc.c b/src/gallium/drivers/nv50/nv50_pc.c
index 7601049126..5041fc7505 100644
--- a/src/gallium/drivers/nv50/nv50_pc.c
+++ b/src/gallium/drivers/nv50/nv50_pc.c
@@ -125,6 +125,20 @@ nv50_nvi_can_load(struct nv_instruction *nvi, int s, struct nv_value *value)
    }
 }
 
+/* Return whether this instruction can be executed conditionally. */
+boolean
+nv50_nvi_can_predicate(struct nv_instruction *nvi)
+{
+   int i;
+   /* FALSE if a flags source is already set or any source is an immediate */
+   if (nvi->flags_src)
+      return FALSE;
+   for (i = 0; i < 4 && nvi->src[i]; ++i)
+      if (nvi->src[i]->value->reg.file == NV_FILE_IMM)
+         return FALSE;
+   return TRUE;
+}
+
 ubyte
 nv50_supported_src_mods(uint opcode, int s)
 {
diff --git a/src/gallium/drivers/nv50/nv50_pc.h b/src/gallium/drivers/nv50/nv50_pc.h
index b24a3067b8..28208ad247 100644
--- a/src/gallium/drivers/nv50/nv50_pc.h
+++ b/src/gallium/drivers/nv50/nv50_pc.h
@@ -432,6 +432,7 @@ void nv_print_program(struct nv_basic_block *b);
 boolean nv_op_commutative(uint opcode);
 int nv50_indirect_opnd(struct nv_instruction *);
 boolean nv50_nvi_can_use_imm(struct nv_instruction *, int s);
+boolean nv50_nvi_can_predicate(struct nv_instruction *);
 boolean nv50_nvi_can_load(struct nv_instruction *, int s, struct nv_value *);
 ubyte nv50_supported_src_mods(uint opcode, int s);
 int nv_nvi_refcount(struct nv_instruction *);
diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c
index daf63a3d20..4cf387257d 100644
--- a/src/gallium/drivers/nv50/nv50_pc_optimize.c
+++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c
@@ -119,6 +119,15 @@ nvi_isnop(struct nv_instruction *nvi)
    return values_equal(nvi->def[0], nvi->src[0]->value);
 }
 
+struct nv_pass {
+   struct nv_pc *pc;
+   int n;
+   void *priv;
+};
+
+static int
+nv_pass_flatten(struct nv_pass *ctx, struct nv_basic_block *b);
+
 static void
 nv_pc_pass_pre_emission(void *priv, struct nv_basic_block *b)
 {
@@ -204,6 +213,13 @@ nv_pc_pass_pre_emission(void *priv, struct nv_basic_block *b)
 int
 nv_pc_exec_pass2(struct nv_pc *pc)
 {
+   struct nv_pass pass;
+
+   pass.pc = pc;
+
+   pc->pass_seq++;
+   nv_pass_flatten(&pass, pc->root);
+
    debug_printf("preparing %u blocks for emission\n", pc->num_blocks);
 
    pc->bb_list = CALLOC(pc->num_blocks, sizeof(struct nv_basic_block *));
@@ -273,12 +289,6 @@ check_swap_src_0_1(struct nv_instruction *nvi)
       nvi->set_cond = cc_swapped[nvi->set_cond];
 }
 
-struct nv_pass {
-   struct nv_pc *pc;
-   int n;
-   void *priv;
-};
-
 static int
 nv_pass_fold_stores(struct nv_pass *ctx, struct nv_basic_block *b)
 {
@@ -863,24 +873,95 @@ nv_pass_dce(struct nv_pass_dce *ctx, struct nv_basic_block *b)
    return 0;
 }
 
+/* Register allocation inserted ELSE blocks for all IF/ENDIF without ELSE.
+ * Returns TRUE if @bb initiates an IF/ELSE/ENDIF clause, or is an IF with
+ * BREAK and dummy ELSE block.
+ */
 static INLINE boolean
-bb_simple_if_endif(struct nv_basic_block *bb)
+bb_is_if_else_endif(struct nv_basic_block *bb)
+{
+   if (!bb->out[0] || !bb->out[1])
+      return FALSE;
+
+   if (bb->out[0]->out_kind[0] == CFG_EDGE_LOOP_LEAVE) {
+      return (bb->out[0]->out[1] == bb->out[1]->out[0] &&
+              !bb->out[1]->out[1]);
+   } else {
+      return (bb->out[0]->out[0] == bb->out[1]->out[0] &&
+              !bb->out[0]->out[1] &&
+              !bb->out[1]->out[1]);
+   }
+}
+
+/* predicate instructions and remove branch at the end */
+static void
+predicate_instructions(struct nv_pc *pc, struct nv_basic_block *b,
+                       struct nv_value *p, ubyte cc)
 {
-   return (bb->out[0] && bb->out[1] &&
-           bb->out[0]->out[0] == bb->out[1] &&
-           !bb->out[0]->out[1]);
+   struct nv_instruction *nvi;
+
+   if (!b->entry)
+      return;
+   for (nvi = b->entry; nvi->next; nvi = nvi->next) {
+      if (!nvi_isnop(nvi)) {
+         nvi->cc = cc;
+         nv_reference(pc, &nvi->flags_src, p);
+      }
+   }
+
+   if (nvi->opcode == NV_OP_BRA)
+      nv_nvi_delete(nvi);
+   else
+   if (!nvi_isnop(nvi)) {
+      nvi->cc = cc;
+      nv_reference(pc, &nvi->flags_src, p);
+   }
 }
 
+/* NOTE: Run this after register allocation, we can just cut out the cflow
+ * instructions and hook the predicates to the conditional OPs if they are
+ * not using immediates; better than inserting SELECT to join definitions.
+ *
+ * NOTE: Should adapt prior optimization to make this possible more often.
+ */
 static int
 nv_pass_flatten(struct nv_pass *ctx, struct nv_basic_block *b)
 {
-   int j;
+   struct nv_instruction *nvi;
+   struct nv_value *pred;
+   int i;
+   int n0 = 0, n1 = 0;
+
+   if (bb_is_if_else_endif(b)) {
+
+      debug_printf("nv_pass_flatten: IF/ELSE/ENDIF construct at BB:%i\n", b->id);
 
-   if (bb_simple_if_endif(b)) {
-      ++ctx->n;
-      debug_printf("nv_pass_flatten: total IF/ENDIF constructs: %i\n", ctx->n);
+      for (n0 = 0, nvi = b->out[0]->entry; nvi; nvi = nvi->next, ++n0)
+         if (!nv50_nvi_can_predicate(nvi))
+            break;
+      if (!nvi) {
+         for (n1 = 0, nvi = b->out[1]->entry; nvi; nvi = nvi->next, ++n1)
+            if (!nv50_nvi_can_predicate(nvi))
+               break;
+         if (nvi) {
+            debug_printf("cannot predicate: "); nv_print_instruction(nvi);
+         }
+      } else {
+         debug_printf("cannot predicate: "); nv_print_instruction(nvi);
+      }
+
+      if (!nvi && n0 < 12 && n1 < 12) { /* 12 as arbitrary limit */
+         assert(b->exit && b->exit->flags_src);
+         pred = b->exit->flags_src->value;
+
+         predicate_instructions(ctx->pc, b->out[0], pred, NV_CC_NE | NV_CC_U);
+         predicate_instructions(ctx->pc, b->out[1], pred, NV_CC_EQ);
+
+         assert(b->exit && b->exit->opcode == NV_OP_BRA);
+         nv_nvi_delete(b->exit);
+      }
    }
-   DESCEND_ARBITRARY(j, nv_pass_flatten);
+   DESCEND_ARBITRARY(i, nv_pass_flatten);
 
    return 0;
 }
@@ -960,11 +1041,6 @@ nv_pc_exec_pass0(struct nv_pc *pc)
    pass.n = 0;
    pass.pc = pc;
 
-   pc->pass_seq++;
-   ret = nv_pass_flatten(&pass, pc->root);
-   if (ret)
-      return ret;
-
    /* Do this first, so we don't have to pay attention
     * to whether sources are supported memory loads.
     */
diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
index 7e77ed6ef6..b23c285dc1 100644
--- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
+++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
@@ -591,7 +591,7 @@ bld_get_address(struct bld_context *bld, int id, struct nv_value *indirect)
 
 
 static struct nv_value *
-bld_predicate(struct bld_context *bld, struct nv_value *src)
+bld_predicate(struct bld_context *bld, struct nv_value *src, boolean bool_only)
 {
    struct nv_instruction *nvi = src->insn;
 
@@ -600,6 +600,14 @@ bld_predicate(struct bld_context *bld, struct nv_value *src)
        nvi->bb != bld->pc->current_block) {
       nvi = new_instruction(bld->pc, NV_OP_CVT);
       nv_reference(bld->pc, &nvi->src[0], src);
+   } else
+   if (bool_only) {
+      /* TGSI SET gets conversion to f32, we only need source 0/~0; stop at
+       * a predicated insn or at a source without a defining instruction */
+      while ((nvi->opcode == NV_OP_ABS || nvi->opcode == NV_OP_CVT ||
+              nvi->opcode == NV_OP_NEG) &&
+             !nvi->def[0]->insn->flags_src && nvi->src[0]->value->insn)
+         nvi = nvi->src[0]->value->insn;
    }
 
    if (!nvi->flags_def) {
@@ -614,7 +622,7 @@ bld_kil(struct bld_context *bld, struct nv_value *src)
 {
    struct nv_instruction *nvi;
 
-   src = bld_predicate(bld, src);
+   src = bld_predicate(bld, src, FALSE);
    nvi = new_instruction(bld->pc, NV_OP_KIL);
    nvi->fixed = 1;
    nvi->flags_src = new_ref(bld->pc, src);
@@ -1223,7 +1231,7 @@ bld_instruction(struct bld_context *bld,
          src0 = emit_fetch(bld, insn, 0, c);
          src1 = emit_fetch(bld, insn, 1, c);
          src2 = emit_fetch(bld, insn, 2, c);
-         src0 = bld_predicate(bld, src0);
+         src0 = bld_predicate(bld, src0, FALSE);
 
          src1 = bld_insn_1(bld, NV_OP_MOV, src1);
          src1->insn->flags_src = new_ref(bld->pc, src0);
@@ -1304,7 +1312,7 @@ bld_instruction(struct bld_context *bld,
       bld->join_bb[bld->cond_lvl] = bld->pc->current_block;
       bld->cond_bb[bld->cond_lvl] = bld->pc->current_block;
 
-      src1 = bld_predicate(bld, emit_fetch(bld, insn, 0, 0));
+      src1 = bld_predicate(bld, emit_fetch(bld, insn, 0, 0), TRUE);
 
       bld_flow(bld, NV_OP_BRA, NV_CC_EQ, src1, NULL, FALSE);
 
-- 
cgit v1.2.3


From 6c5c55723d32f8933ffb5fc6b5beb209eca84ca8 Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Mon, 16 Aug 2010 17:18:30 +0200
Subject: nv50: fix thinko in store to output reg possible check

---
 src/gallium/drivers/nv50/nv50_pc_optimize.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c
index 4cf387257d..5d575461ca 100644
--- a/src/gallium/drivers/nv50/nv50_pc_optimize.c
+++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c
@@ -315,7 +315,7 @@ nv_pass_fold_stores(struct nv_pass *ctx, struct nv_basic_block *b)
       for (j = 0; j < 4 && nvi->src[j]; ++j)
          if (nvi->src[j]->value->reg.file == NV_FILE_IMM)
             break;
-      if (j < 4)
+      if (j < 4 && nvi->src[j])
          continue;
 
       nvi->def[0] = sti->def[0];
-- 
cgit v1.2.3


From 62f933a6f617050a267079b27360eaae2d0e1a70 Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Mon, 16 Aug 2010 18:00:39 +0200
Subject: nv50: generate JOINs for outermost IF clauses

---
 src/gallium/drivers/nv50/nv50_pc.h          |  3 ++-
 src/gallium/drivers/nv50/nv50_pc_emit.c     | 11 ++++++++++-
 src/gallium/drivers/nv50/nv50_pc_optimize.c | 16 +++++++++++++---
 src/gallium/drivers/nv50/nv50_pc_print.c    |  1 +
 src/gallium/drivers/nv50/nv50_tgsi_to_nc.c  | 11 +++++------
 5 files changed, 31 insertions(+), 11 deletions(-)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_pc.h b/src/gallium/drivers/nv50/nv50_pc.h
index 28208ad247..d24375100d 100644
--- a/src/gallium/drivers/nv50/nv50_pc.h
+++ b/src/gallium/drivers/nv50/nv50_pc.h
@@ -83,7 +83,8 @@
 #define NV_OP_NOP       53
 #define NV_OP_SELECT    54
 #define NV_OP_EXPORT    55
-#define NV_OP_COUNT     56
+#define NV_OP_JOIN      56
+#define NV_OP_COUNT     57
 
 #define NV_FILE_GPR      0
 #define NV_FILE_OUT      1
diff --git a/src/gallium/drivers/nv50/nv50_pc_emit.c b/src/gallium/drivers/nv50/nv50_pc_emit.c
index fe44b327ab..3a3b277c13 100644
--- a/src/gallium/drivers/nv50/nv50_pc_emit.c
+++ b/src/gallium/drivers/nv50/nv50_pc_emit.c
@@ -38,7 +38,7 @@ const ubyte nv50_inst_min_size_tab[NV_OP_COUNT] =
    0, 0, 0, 8, 8, 4, 4, 4, 8, 4, 4, 8, 8, 8, 8, 8, /* 15 */
    8, 8, 8, 4, 0, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, /* 31 */
    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, /* 47 */
-   4, 8, 8, 8, 8, 8, 0, 0
+   4, 8, 8, 8, 8, 8, 0, 0, 8
 };
 
 /* XXX: silence, you ! */
@@ -71,6 +71,9 @@ nv50_inst_min_size(struct nv_instruction *i)
    if (i->flags_def || i->flags_src || i->src[4])
       return 8;
 
+   if (i->is_join)
+      return 8;
+
    if (i->src[2]) {
       if (i->saturate || i->src[2]->mod)
          return 8;
@@ -1126,6 +1129,7 @@ nv50_emit_instruction(struct nv_pc *pc, struct nv_instruction *i)
       emit_flow(pc, i, 0xa);
       break;
    case NV_OP_NOP:
+   case NV_OP_JOIN:
       pc->emit[0] = 0xf0000001;
       pc->emit[1] = 0xe0000000;
       break;
@@ -1141,5 +1145,10 @@ nv50_emit_instruction(struct nv_pc *pc, struct nv_instruction *i)
       break;
    }
 
+   if (i->is_join) {
+      assert(i->is_long && !(pc->emit[1] & 1));
+      pc->emit[1] |= 2;
+   }
+
    assert((pc->emit[0] & 1) == i->is_long);
 }
diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c
index 5d575461ca..b35dd72841 100644
--- a/src/gallium/drivers/nv50/nv50_pc_optimize.c
+++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c
@@ -80,7 +80,7 @@ inst_commutation_legal(struct nv_instruction *a,
 static INLINE boolean
 inst_cullable(struct nv_instruction *nvi)
 {
-   return (!(nvi->is_terminator ||
+   return (!(nvi->is_terminator || nvi->is_join ||
              nvi->target ||
              nvi->fixed ||
              nv_nvi_refcount(nvi)));
@@ -95,7 +95,8 @@ nvi_isnop(struct nv_instruction *nvi)
    if (nvi->fixed ||
        nvi->is_terminator ||
        nvi->flags_src ||
-       nvi->flags_def)
+       nvi->flags_def ||
+       nvi->is_join)
       return FALSE;
 
    if (nvi->def[0]->join->reg.id < 0)
@@ -934,7 +935,7 @@ nv_pass_flatten(struct nv_pass *ctx, struct nv_basic_block *b)
 
    if (bb_is_if_else_endif(b)) {
 
-      debug_printf("nv_pass_flatten: IF/ELSE/ENDIF construct at BB:%i\n", b->id);
+      debug_printf("pass_flatten: IF/ELSE/ENDIF construct at BB:%i\n", b->id);
 
       for (n0 = 0, nvi = b->out[0]->entry; nvi; nvi = nvi->next, ++n0)
          if (!nv50_nvi_can_predicate(nvi))
@@ -959,6 +960,15 @@ nv_pass_flatten(struct nv_pass *ctx, struct nv_basic_block *b)
 
          assert(b->exit && b->exit->opcode == NV_OP_BRA);
          nv_nvi_delete(b->exit);
+
+         if (b->exit && b->exit->opcode == NV_OP_JOINAT)
+            nv_nvi_delete(b->exit);
+
+         if ((nvi = b->out[0]->out[0]->entry)) {
+            nvi->is_join = 0;
+            if (nvi->opcode == NV_OP_JOIN)
+               nv_nvi_delete(nvi);
+         }
       }
    }
    DESCEND_ARBITRARY(i, nv_pass_flatten);
diff --git a/src/gallium/drivers/nv50/nv50_pc_print.c b/src/gallium/drivers/nv50/nv50_pc_print.c
index a4f567bde4..7bdeb1c78d 100644
--- a/src/gallium/drivers/nv50/nv50_pc_print.c
+++ b/src/gallium/drivers/nv50/nv50_pc_print.c
@@ -95,6 +95,7 @@ static const char *nv_opcode_names[NV_OP_COUNT + 1] = {
    "nop",
    "select",
    "export",
+   "join",
    "BAD_OP"
 };
 
diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
index b23c285dc1..d6c5a8d660 100644
--- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
+++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
@@ -1314,7 +1314,7 @@ bld_instruction(struct bld_context *bld,
 
       src1 = bld_predicate(bld, emit_fetch(bld, insn, 0, 0), TRUE);
 
-      bld_flow(bld, NV_OP_BRA, NV_CC_EQ, src1, NULL, FALSE);
+      bld_flow(bld, NV_OP_BRA, NV_CC_EQ, src1, NULL, (bld->cond_lvl == 0));
 
       ++bld->cond_lvl;
       bld_new_block(bld, b);
@@ -1346,13 +1346,12 @@ bld_instruction(struct bld_context *bld,
 
       bld->cond_bb[bld->cond_lvl]->exit->target = b;
 
-      if (0 && bld->join_bb[bld->cond_lvl]) {
-         bld->join_bb[bld->cond_lvl]->exit->prev->target = b;
+      bld_new_block(bld, b);
 
-         new_instruction(bld->pc, NV_OP_NOP)->is_join = TRUE;
+      if (!bld->cond_lvl && bld->join_bb[bld->cond_lvl]) {
+         bld->join_bb[bld->cond_lvl]->exit->prev->target = b;
+         new_instruction(bld->pc, NV_OP_JOIN)->is_join = TRUE;
       }
-
-      bld_new_block(bld, b);
    }
       break;
    case TGSI_OPCODE_BGNLOOP:
-- 
cgit v1.2.3


From ce1629564d1cce80b2762d266640e3181a68e848 Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Tue, 17 Aug 2010 11:51:51 +0200
Subject: nv50: more TGSI opcodes (SIN, SCS, ARL, RET, KILP)

---
 src/gallium/drivers/nv50/nv50_pc_emit.c     | 22 +++++++++++++++++
 src/gallium/drivers/nv50/nv50_pc_optimize.c | 13 +++++-----
 src/gallium/drivers/nv50/nv50_tgsi_to_nc.c  | 38 ++++++++++++++++++++++++++---
 3 files changed, 63 insertions(+), 10 deletions(-)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_pc_emit.c b/src/gallium/drivers/nv50/nv50_pc_emit.c
index 3a3b277c13..b5f4383aa1 100644
--- a/src/gallium/drivers/nv50/nv50_pc_emit.c
+++ b/src/gallium/drivers/nv50/nv50_pc_emit.c
@@ -747,9 +747,31 @@ emit_bitop2(struct nv_pc *pc, struct nv_instruction *i)
    }
 }
 
+static void
+emit_arl(struct nv_pc *pc, struct nv_instruction *i)
+{
+   assert(SFILE(i, 0) == NV_FILE_GPR);
+   assert(SFILE(i, 1) == NV_FILE_IMM);
+   /* called from emit_shift when def 0 is in NV_FILE_ADDR; no flags output */
+   assert(!i->flags_def);
+
+   pc->emit[0] = 0x00000001;
+   pc->emit[1] = 0xc0000000;
+
+   set_dst(pc, i->def[0]);
+   set_pred(pc, i);
+   set_src_0(pc, i->src[0]);
+   pc->emit[0] |= (get_immd_u32(i->src[1]) & 0x3f) << 16; /* low 6 bits only */
+}
+
 static void
 emit_shift(struct nv_pc *pc, struct nv_instruction *i)
 {
+   if (DFILE(i, 0) == NV_FILE_ADDR) {
+      emit_arl(pc, i);
+      return;
+   }
+
    pc->emit[0] = 0x30000001;
    pc->emit[1] = 0xc4000000;
 
diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c
index b35dd72841..3e6e09a904 100644
--- a/src/gallium/drivers/nv50/nv50_pc_optimize.c
+++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c
@@ -293,14 +293,15 @@ check_swap_src_0_1(struct nv_instruction *nvi)
 static int
 nv_pass_fold_stores(struct nv_pass *ctx, struct nv_basic_block *b)
 {
-   struct nv_instruction *nvi, *sti;
+   struct nv_instruction *nvi, *sti, *next;
    int j;
 
-   for (sti = b->entry; sti; sti = sti->next) {
-      if (!sti->def[0] || sti->def[0]->reg.file != NV_FILE_OUT)
-         continue;
+   for (sti = b->entry; sti; sti = next) {
+      next = sti->next;
 
       /* only handling MOV to $oX here */
+      if (!sti->def[0] || sti->def[0]->reg.file != NV_FILE_OUT)
+         continue;
       if (sti->opcode != NV_OP_MOV && sti->opcode != NV_OP_STA)
          continue;
 
@@ -320,9 +321,9 @@ nv_pass_fold_stores(struct nv_pass *ctx, struct nv_basic_block *b)
          continue;
 
       nvi->def[0] = sti->def[0];
-      sti->def[0] = NULL;
       nvi->fixed = sti->fixed;
-      sti->fixed = 0;
+
+      nv_nvi_delete(sti);
    }
    DESCEND_ARBITRARY(j, nv_pass_fold_stores);
 
diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
index d6c5a8d660..dafff725b8 100644
--- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
+++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
@@ -685,6 +685,8 @@ translate_opcode(uint opcode)
    case TGSI_OPCODE_CEIL: return NV_OP_CEIL;
    case TGSI_OPCODE_FLR: return NV_OP_FLOOR;
    case TGSI_OPCODE_TRUNC: return NV_OP_TRUNC;
+   case TGSI_OPCODE_COS: return NV_OP_COS;
+   case TGSI_OPCODE_SIN: return NV_OP_SIN;
    case TGSI_OPCODE_DDX: return NV_OP_DFDX;
    case TGSI_OPCODE_DDY: return NV_OP_DFDY;
    case TGSI_OPCODE_F2I:
@@ -1226,6 +1228,14 @@ bld_instruction(struct bld_context *bld,
          dst0[c] = bld_insn_2(bld, opcode, src0, src1);
       }
       break;
+   case TGSI_OPCODE_ARL:
+      src1 = bld_imm_u32(bld, 4); /* shift count for the SHL below */
+      FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) {
+         src0 = emit_fetch(bld, insn, 0, c);
+         (temp = bld_insn_1(bld, NV_OP_FLOOR, src0))->reg.type = NV_TYPE_S32;
+         dst0[c] = bld_insn_2(bld, NV_OP_SHL, temp, src1);
+      }
+      break;
    case TGSI_OPCODE_CMP:
       FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) {
          src0 = emit_fetch(bld, insn, 0, c);
@@ -1245,19 +1255,19 @@ bld_instruction(struct bld_context *bld,
       }
       break;
    case TGSI_OPCODE_COS:
+   case TGSI_OPCODE_SIN:
       src0 = emit_fetch(bld, insn, 0, 0);
       temp = bld_insn_1(bld, NV_OP_PRESIN, src0);
       if (insn->Dst[0].Register.WriteMask & 7)
-         temp = bld_insn_1(bld, NV_OP_COS, temp);
+         temp = bld_insn_1(bld, opcode, temp);
       for (c = 0; c < 3; ++c)
          if (insn->Dst[0].Register.WriteMask & (1 << c))
             dst0[c] = temp;
       if (!(insn->Dst[0].Register.WriteMask & (1 << 3)))
          break;
-      /* XXX: if src0.x is src0.w, don't emit new insns */
       src0 = emit_fetch(bld, insn, 0, 3);
       temp = bld_insn_1(bld, NV_OP_PRESIN, src0);
-      dst0[3] = bld_insn_1(bld, NV_OP_COS, temp);
+      dst0[3] = bld_insn_1(bld, opcode, temp);
       break;
    case TGSI_OPCODE_DP3:
       src0 = emit_fetch(bld, insn, 0, 0);
@@ -1303,6 +1313,9 @@ bld_instruction(struct bld_context *bld,
          bld_kil(bld, src0);
       }
       break;
+   case TGSI_OPCODE_KILP:
+      (new_instruction(bld->pc, NV_OP_KIL))->fixed = 1;
+      break;
    case TGSI_OPCODE_IF:
    {
       struct nv_basic_block *b = new_basic_block(bld->pc);
@@ -1496,6 +1509,20 @@ bld_instruction(struct bld_context *bld,
          dst0[c]->reg.type = NV_TYPE_F32;
       }
       break;
+   case TGSI_OPCODE_SCS:
+      if (insn->Dst[0].Register.WriteMask & 0x3) {
+         src0 = emit_fetch(bld, insn, 0, 0);
+         temp = bld_insn_1(bld, NV_OP_PRESIN, src0);
+         if (insn->Dst[0].Register.WriteMask & 0x1)
+            dst0[0] = bld_insn_1(bld, NV_OP_COS, temp);
+         if (insn->Dst[0].Register.WriteMask & 0x2)
+            dst0[1] = bld_insn_1(bld, NV_OP_SIN, temp);
+      }
+      if (insn->Dst[0].Register.WriteMask & 0x4)
+         dst0[2] = bld_imm_f32(bld, 0.0f);
+      if (insn->Dst[0].Register.WriteMask & 0x8)
+         dst0[3] = bld_imm_f32(bld, 1.0f);
+      break;
    case TGSI_OPCODE_SUB:
       FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) {
          src0 = emit_fetch(bld, insn, 0, c);
@@ -1527,12 +1554,15 @@ bld_instruction(struct bld_context *bld,
          dst0[c]->insn->src[2]->mod ^= NV_MOD_NEG;
       }
       break;
+   case TGSI_OPCODE_RET:
+      (new_instruction(bld->pc, NV_OP_RET))->fixed = 1;
+      break;
    case TGSI_OPCODE_END:
       if (bld->ti->p->type == PIPE_SHADER_FRAGMENT)
          bld_export_outputs(bld);
       break;
    default:
-      NOUVEAU_ERR("nv_bld: unhandled opcode %u\n", insn->Instruction.Opcode);
+      NOUVEAU_ERR("unhandled opcode %u\n", insn->Instruction.Opcode);
       abort();
       break;
    }
-- 
cgit v1.2.3


From cb75082768d516d684a69588266b92b06e19b7bd Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Tue, 17 Aug 2010 13:07:12 +0200
Subject: nv50: fix PSIZ and PRIMID mapping

The map is now initialized to 0x40 (0x80) instead of 0, so the target
byte has to be cleared before the PSIZ / PRIMID slot is OR'ed in.
---
 src/gallium/drivers/nv50/nv50_shader_state.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_shader_state.c b/src/gallium/drivers/nv50/nv50_shader_state.c
index 3d5df596ef..5f70df3662 100644
--- a/src/gallium/drivers/nv50/nv50_shader_state.c
+++ b/src/gallium/drivers/nv50/nv50_shader_state.c
@@ -496,16 +496,19 @@ nv50_fp_linkage_validate(struct nv50_context *nv50)
       m = nv50_vec4_map(map, m, lin,
                         &fp->in[i], (n < vp->out_nr) ? &vp->out[n] : &dummy);
 	}
+
    /* PrimitiveID either is replaced by the system value, or
     * written by the geometry shader into an output register
     */
    if (fp->gp.primid < 0x40) {
-      map[m / 4] |= vp->gp.primid << ((m % 4) * 8);
+      i = (m % 4) * 8;
+      map[m / 4] = (map[m / 4] & ~(0xff << i)) | (vp->gp.primid << i);
       primid = m++;
    }
 
    if (nv50->rasterizer->pipe.point_size_per_vertex) {
-      map[m / 4] |= vp->vp.psiz << ((m % 4) * 8);
+      i = (m % 4) * 8;
+      map[m / 4] = (map[m / 4] & ~(0xff << i)) | (vp->vp.psiz << i);
       psiz = (m++ << 4) | 1;
    }
 
@@ -532,7 +535,6 @@ nv50_fp_linkage_validate(struct nv50_context *nv50)
       so_datap (so, map, n);
    }
 
-   //colors = 0x01000404;
    so_method(so, tesla, NV50TCL_MAP_SEMANTIC_0, 4);
    so_data  (so, colors);
    so_data  (so, clip);
-- 
cgit v1.2.3


From 3e27785f3ebe6620805f97cb5c17ec8bd28bc1e8 Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Tue, 17 Aug 2010 15:27:56 +0200
Subject: nv50: check dst compatibility in CSE

---
 src/gallium/drivers/nv50/nv50_pc_optimize.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c
index 3e6e09a904..80f3bb34b0 100644
--- a/src/gallium/drivers/nv50/nv50_pc_optimize.c
+++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c
@@ -1007,6 +1007,13 @@ nv_pass_cse(struct nv_pass *ctx, struct nv_basic_block *b)
                 ik->flags_def || ir->flags_def)
                continue; /* and also not with flags, for now */
 
+            assert(ik->def[0] && ir->def[0]);
+
+            if (ik->def[0]->reg.file == NV_FILE_OUT ||
+                ir->def[0]->reg.file == NV_FILE_OUT ||
+                !values_equal(ik->def[0], ir->def[0]))
+               continue;
+
             for (s = 0; s < 3; ++s) {
                struct nv_value *a, *b;
 
-- 
cgit v1.2.3


From 1bbbc8e0c8230d33cb1eae89cc47b5296edefc10 Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Tue, 17 Aug 2010 19:03:11 +0200
Subject: nv50: initialize edgeflag input index

---
 src/gallium/drivers/nv50/nv50_program.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c
index 54cd36f868..d47941d3b1 100644
--- a/src/gallium/drivers/nv50/nv50_program.c
+++ b/src/gallium/drivers/nv50/nv50_program.c
@@ -484,6 +484,7 @@ nv50_prog_scan(struct nv50_translation_info *ti)
    struct tgsi_parse_context parse;
    int ret;
 
+   p->vp.edgeflag = 0x40;
    p->vp.psiz = 0x40;
    p->vp.bfc[0] = 0x40;
    p->vp.bfc[1] = 0x40;
-- 
cgit v1.2.3


From eaab76457818fad0926b84c663440e8987e1f19f Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Wed, 18 Aug 2010 14:36:47 +0200
Subject: nv50: emit predicate for interp

---
 src/gallium/drivers/nv50/nv50_pc_emit.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_pc_emit.c b/src/gallium/drivers/nv50/nv50_pc_emit.c
index b5f4383aa1..bc151c3a80 100644
--- a/src/gallium/drivers/nv50/nv50_pc_emit.c
+++ b/src/gallium/drivers/nv50/nv50_pc_emit.c
@@ -596,8 +596,12 @@ emit_interp(struct nv_pc *pc, struct nv_instruction *i)
    if (i->centroid)
       pc->emit[0] |= 1 << 24;
 
+   assert(i->is_long || !i->flags_src);
+
    if (i->is_long) {
-      pc->emit[1] |= 0x0780 |
+      set_pred(pc, i);
+
+      pc->emit[1] |=
 	      (pc->emit[0] & (3 << 24)) >> (24 - 16) |
 	      (pc->emit[0] & (1 <<  8)) >> (18 -  8);
 
-- 
cgit v1.2.3


From 33f45c5a8afd353ad9bbd8647fa5c6dfc59cdfd7 Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Sun, 22 Aug 2010 22:59:01 +0200
Subject: nv50: DP2, fix ARL

---
 src/gallium/drivers/nv50/nv50_tgsi_to_nc.c | 44 ++++++++++++++++++------------
 1 file changed, 27 insertions(+), 17 deletions(-)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
index dafff725b8..7b2ccef704 100644
--- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
+++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
@@ -1199,6 +1199,25 @@ bld_tex(struct bld_context *bld, struct nv_value *dst0[4],
    nvi->tex_argc = arg;
 }
 
+static INLINE struct nv_value *
+bld_dot(struct bld_context *bld, const struct tgsi_full_instruction *insn,
+	int n)
+{
+   struct nv_value *dotp, *src0, *src1;
+   int c;
+
+   src0 = emit_fetch(bld, insn, 0, 0);
+   src1 = emit_fetch(bld, insn, 1, 0);
+   dotp = bld_insn_2(bld, NV_OP_MUL, src0, src1);
+
+   for (c = 1; c < n; ++c) {
+      src0 = emit_fetch(bld, insn, 0, c);
+      src1 = emit_fetch(bld, insn, 1, c);
+      dotp = bld_insn_3(bld, NV_OP_MAD, src0, src1, dotp);
+   }
+   return dotp;
+}
+
 #define FOR_EACH_DST0_ENABLED_CHANNEL(chan, inst) \
    for (chan = 0; chan < 4; ++chan)               \
       if ((inst)->Dst[0].Register.WriteMask & (1 << chan))
@@ -1232,7 +1251,7 @@ bld_instruction(struct bld_context *bld,
       src1 = bld_imm_u32(bld, 4);
       FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) {
          src0 = emit_fetch(bld, insn, 0, c);
-         (temp = bld_insn_1(bld, NV_OP_FLOOR, temp))->reg.type = NV_TYPE_S32;
+         (temp = bld_insn_1(bld, NV_OP_FLOOR, src0))->reg.type = NV_TYPE_S32;
          dst0[c] = bld_insn_2(bld, NV_OP_SHL, temp, src1);
       }
       break;
@@ -1269,27 +1288,18 @@ bld_instruction(struct bld_context *bld,
       temp = bld_insn_1(bld, NV_OP_PRESIN, src0);
       dst0[3] = bld_insn_1(bld, opcode, temp);
       break;
+   case TGSI_OPCODE_DP2:
+      temp = bld_dot(bld, insn, 2);
+      FOR_EACH_DST0_ENABLED_CHANNEL(c, insn)
+         dst0[c] = temp;
+      break;
    case TGSI_OPCODE_DP3:
-      src0 = emit_fetch(bld, insn, 0, 0);
-      src1 = emit_fetch(bld, insn, 1, 0);
-      temp = bld_insn_2(bld, NV_OP_MUL, src0, src1);
-      for (c = 1; c < 3; ++c) {
-         src0 = emit_fetch(bld, insn, 0, c);
-         src1 = emit_fetch(bld, insn, 1, c);
-         temp = bld_insn_3(bld, NV_OP_MAD, src0, src1, temp);
-      }
+      temp = bld_dot(bld, insn, 3);
       FOR_EACH_DST0_ENABLED_CHANNEL(c, insn)
          dst0[c] = temp;
       break;
    case TGSI_OPCODE_DP4:
-      src0 = emit_fetch(bld, insn, 0, 0);
-      src1 = emit_fetch(bld, insn, 1, 0);
-      temp = bld_insn_2(bld, NV_OP_MUL, src0, src1);
-      for (c = 1; c < 4; ++c) {
-         src0 = emit_fetch(bld, insn, 0, c);
-         src1 = emit_fetch(bld, insn, 1, c);
-         temp = bld_insn_3(bld, NV_OP_MAD, src0, src1, temp);
-      }
+      temp = bld_dot(bld, insn, 4);
       FOR_EACH_DST0_ENABLED_CHANNEL(c, insn)
          dst0[c] = temp;
       break;
-- 
cgit v1.2.3


From 0df5e84b01f5420e37006a32c916835af2aa4314 Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Sun, 22 Aug 2010 23:09:55 +0200
Subject: nv50: yet another case we need a nop.exit

---
 src/gallium/drivers/nv50/nv50_pc.c           | 2 +-
 src/gallium/drivers/nv50/nv50_shader_state.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_pc.c b/src/gallium/drivers/nv50/nv50_pc.c
index 5041fc7505..b9d274414d 100644
--- a/src/gallium/drivers/nv50/nv50_pc.c
+++ b/src/gallium/drivers/nv50/nv50_pc.c
@@ -327,7 +327,7 @@ nv50_emit_program(struct nv_pc *pc)
    assert(pc->emit == &code[pc->bin_size / 4]);
 
    /* XXX: we can do better than this ... */
-   if (!(pc->emit[-2] & 1) || (pc->emit[-2] & 2) || (pc->emit[-1] & 3) == 3) {
+   if (!(pc->emit[-2] & 1) || (pc->emit[-2] & 2) || (pc->emit[-1] & 3)) {
       pc->emit[0] = 0xf0000001;
       pc->emit[1] = 0xe0000000;
       pc->bin_size += 8;
diff --git a/src/gallium/drivers/nv50/nv50_shader_state.c b/src/gallium/drivers/nv50/nv50_shader_state.c
index 5f70df3662..a244753c4d 100644
--- a/src/gallium/drivers/nv50/nv50_shader_state.c
+++ b/src/gallium/drivers/nv50/nv50_shader_state.c
@@ -547,7 +547,7 @@ nv50_fp_linkage_validate(struct nv50_context *nv50)
    so_method(so, tesla, NV50TCL_NOPERSPECTIVE_BITMAP(0), 4);
    so_datap (so, lin, 4);
 
-   if (nv50->rasterizer->pipe.sprite_coord_enable) {
+   if (nv50->rasterizer->pipe.sprite_coord_enable) { /* XXX: gl_PointCoord */
       so_method(so, tesla, NV50TCL_POINT_SPRITE_CTRL, 1);
       so_data  (so,
                 nv50_pntc_replace(nv50, pntc, (interp >> 8) & 0xff));
-- 
cgit v1.2.3


From bae181f78d6ff5e37ef3c022563b2077c0247c2b Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Mon, 23 Aug 2010 14:25:13 +0200
Subject: nv50: fix check for sprite/point coord enable

---
 src/gallium/drivers/nv50/nv50_shader_state.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_shader_state.c b/src/gallium/drivers/nv50/nv50_shader_state.c
index a244753c4d..f187a074e6 100644
--- a/src/gallium/drivers/nv50/nv50_shader_state.c
+++ b/src/gallium/drivers/nv50/nv50_shader_state.c
@@ -384,7 +384,7 @@ nv50_pntc_replace(struct nv50_context *nv50, uint32_t pntc[8], unsigned m)
             break;
 
       if (j < vp->out_nr) {
-         ubyte en = nv50->rasterizer->pipe.sprite_coord_enable;
+         uint32_t en = nv50->rasterizer->pipe.sprite_coord_enable;
 
          if (!(en & (1 << vp->out[j].si))) {
             m += n;
@@ -547,7 +547,7 @@ nv50_fp_linkage_validate(struct nv50_context *nv50)
    so_method(so, tesla, NV50TCL_NOPERSPECTIVE_BITMAP(0), 4);
    so_datap (so, lin, 4);
 
-   if (nv50->rasterizer->pipe.sprite_coord_enable) { /* XXX: gl_PointCoord */
+   if (nv50->rasterizer->pipe.point_quad_rasterization) {
       so_method(so, tesla, NV50TCL_POINT_SPRITE_CTRL, 1);
       so_data  (so,
                 nv50_pntc_replace(nv50, pntc, (interp >> 8) & 0xff));
-- 
cgit v1.2.3


From db1874272c325e3e19fb7f386ec82f36e7a24496 Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Tue, 24 Aug 2010 11:21:06 +0200
Subject: nv50: handle TEXTURE_SWIZZLE and GEOMETRY_SHADER4 caps

GEOMETRY_SHADER4 is reported as unsupported for now; GP support will
probably be re-added soon.
---
 src/gallium/drivers/nv50/nv50_screen.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_screen.c b/src/gallium/drivers/nv50/nv50_screen.c
index 78137d6940..fc75d81d54 100644
--- a/src/gallium/drivers/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nv50/nv50_screen.c
@@ -84,6 +84,8 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 		return 1;
 	case PIPE_CAP_GLSL:
 		return 1;
+	case PIPE_CAP_GEOMETRY_SHADER4:
+		return 0;
 	case PIPE_CAP_ANISOTROPIC_FILTER:
 		return 1;
 	case PIPE_CAP_POINT_SPRITE:
@@ -105,6 +107,8 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
 	case PIPE_CAP_TEXTURE_MIRROR_REPEAT:
 		return 1;
+	case PIPE_CAP_TEXTURE_SWIZZLE:
+		return 1;
 	case PIPE_CAP_TGSI_CONT_SUPPORTED:
 		return 1;
 	case PIPE_CAP_BLEND_EQUATION_SEPARATE:
-- 
cgit v1.2.3


From 3844c365947082550565accefd996c10fbb15cc4 Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Sat, 28 Aug 2010 17:05:11 +0200
Subject: nv50: set the FragDepth output index

---
 src/gallium/drivers/nv50/nv50_program.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c
index d47941d3b1..d4a75dc64a 100644
--- a/src/gallium/drivers/nv50/nv50_program.c
+++ b/src/gallium/drivers/nv50/nv50_program.c
@@ -461,7 +461,7 @@ nv50_fragprog_prepare(struct nv50_translation_info *ti)
    }
    if (depr < p->out_nr) {
       p->out[depr].mask = 0x4;
-      p->out[depr].hw = p->max_out++;
+      p->out[depr].hw = ti->output_map[depr][2] = p->max_out++;
    }
 
    return 0;
-- 
cgit v1.2.3


From d90502b2b468732e2a42985580bbbe9d9fdfd14e Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Tue, 31 Aug 2010 13:17:07 +0200
Subject: nv50: turn off verbose debug output by default

---
 src/gallium/drivers/nv50/nv50_pc.c          | 12 ++++++---
 src/gallium/drivers/nv50/nv50_pc.h          |  6 +++++
 src/gallium/drivers/nv50/nv50_pc_emit.c     | 11 ++++----
 src/gallium/drivers/nv50/nv50_pc_optimize.c | 30 ++++++----------------
 src/gallium/drivers/nv50/nv50_pc_regalloc.c | 40 ++++++-----------------------
 src/gallium/drivers/nv50/nv50_program.c     |  9 ++++---
 src/gallium/drivers/nv50/nv50_tgsi_to_nc.c  | 12 +++++----
 7 files changed, 49 insertions(+), 71 deletions(-)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_pc.c b/src/gallium/drivers/nv50/nv50_pc.c
index b9d274414d..1c12fe1b9e 100644
--- a/src/gallium/drivers/nv50/nv50_pc.c
+++ b/src/gallium/drivers/nv50/nv50_pc.c
@@ -20,6 +20,8 @@
  * SOFTWARE.
  */
 
+/* #define NV50PC_DEBUG */
+
 #include "nv50_pc.h"
 #include "nv50_program.h"
 
@@ -311,7 +313,7 @@ nv50_emit_program(struct nv_pc *pc)
    uint32_t *code = pc->emit;
    int n;
 
-   debug_printf("emitting program: size = %u\n", pc->bin_size);
+   NV50_DBGMSG("emitting program: size = %u\n", pc->bin_size);
 
    for (n = 0; n < pc->num_blocks; ++n) {
       struct nv_instruction *i;
@@ -336,7 +338,9 @@ nv50_emit_program(struct nv_pc *pc)
    pc->emit = code;
    code[pc->bin_size / 4 - 1] |= 1;
 
+#ifdef NV50PC_DEBUG
    nvcg_show_bincode(pc);
+#endif
 
    return 0;
 }
@@ -354,7 +358,9 @@ nv50_generate_code(struct nv50_translation_info *ti)
    ret = nv50_tgsi_to_nc(pc, ti);
    if (ret)
       goto out;
+#ifdef NV50PC_DEBUG
    nv_print_program(pc->root);
+#endif
 
    /* optimization */
    ret = nv_pc_exec_pass0(pc);
@@ -392,7 +398,7 @@ nv50_generate_code(struct nv50_translation_info *ti)
    ti->p->fixups = pc->fixups;
    ti->p->num_fixups = pc->num_fixups;
 
-   debug_printf("SHADER TRANSLATION - %s\n", ret ? "failure" : "success");
+   NV50_DBGMSG("SHADER TRANSLATION - %s\n", ret ? "failure" : "success");
 
 out:
    nv_pc_free_refs(pc);
@@ -492,7 +498,7 @@ nv_nvi_delete(struct nv_instruction *nvi)
 
    if (nvi == b->phi) {
       if (nvi->opcode != NV_OP_PHI)
-         debug_printf("NOTE: b->phi points to non-PHI instruction\n");
+         NV50_DBGMSG("NOTE: b->phi points to non-PHI instruction\n");
 
       assert(!nvi->prev);
       if (!nvi->next || nvi->next->opcode != NV_OP_PHI)
diff --git a/src/gallium/drivers/nv50/nv50_pc.h b/src/gallium/drivers/nv50/nv50_pc.h
index d24375100d..48918f46d5 100644
--- a/src/gallium/drivers/nv50/nv50_pc.h
+++ b/src/gallium/drivers/nv50/nv50_pc.h
@@ -23,6 +23,12 @@
 #ifndef __NV50_COMPILER_H__
 #define __NV50_COMPILER_H__
 
+#ifdef NV50PC_DEBUG
+# define NV50_DBGMSG(args...) debug_printf(args)
+#else
+# define NV50_DBGMSG(args...)
+#endif
+
 #include "pipe/p_defines.h"
 #include "util/u_inlines.h"
 #include "util/u_memory.h"
diff --git a/src/gallium/drivers/nv50/nv50_pc_emit.c b/src/gallium/drivers/nv50/nv50_pc_emit.c
index bc151c3a80..7808335e50 100644
--- a/src/gallium/drivers/nv50/nv50_pc_emit.c
+++ b/src/gallium/drivers/nv50/nv50_pc_emit.c
@@ -239,8 +239,7 @@ set_dst(struct nv_pc *pc, struct nv_value *value)
    struct nv_reg *reg = &value->join->reg;
 
    if (reg->id < 0) {
-      debug_printf("WARNING: unused dst, hope we can bucket it !\n");
-      pc->emit[0] |= 127 << 2;
+      pc->emit[0] |= (127 << 2) | 1; /* set 'long'-bit to catch bugs */
       pc->emit[1] |= 0x8;
       return;
    }
@@ -249,7 +248,7 @@ set_dst(struct nv_pc *pc, struct nv_value *value)
       pc->emit[1] |= 0x8;
    else
    if (reg->file == NV_FILE_ADDR)
-	   assert(0);
+      assert(0);
 
    pc->emit[0] |= reg->id << 2;
 }
@@ -801,8 +800,8 @@ emit_flop(struct nv_pc *pc, struct nv_instruction *i)
 
    pc->emit[0] = 0x90000000;
 
-   assert(SREG(src0)->type == NV_TYPE_F32);
-   assert(SREG(src0)->file == NV_FILE_GPR);
+   assert(STYPE(i, 0) == NV_TYPE_F32);
+   assert(SFILE(i, 0) == NV_FILE_GPR);
 
    if (!i->is_long) {
       emit_form_MUL(pc, i);
@@ -1057,7 +1056,7 @@ emit_ddy(struct nv_pc *pc, struct nv_instruction *i)
 void
 nv50_emit_instruction(struct nv_pc *pc, struct nv_instruction *i)
 {
-   // nv_print_instruction(i);
+   /* nv_print_instruction(i); */
 
    switch (i->opcode) {
    case NV_OP_MOV:
diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c
index 80f3bb34b0..4b1cd56fc1 100644
--- a/src/gallium/drivers/nv50/nv50_pc_optimize.c
+++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c
@@ -20,6 +20,8 @@
  * SOFTWARE.
  */
 
+/* #define NV50PC_DEBUG */
+
 #include "nv50_pc.h"
 
 #define DESCEND_ARBITRARY(j, f)                                 \
@@ -109,7 +111,7 @@ nvi_isnop(struct nv_instruction *nvi)
       return FALSE;
 
    if (nvi->src[0]->value->join->reg.id < 0) {
-      debug_printf("nvi_isnop: orphaned value detected\n");
+      NV50_DBGMSG("nvi_isnop: orphaned value detected\n");
       return TRUE;
    }
 
@@ -176,9 +178,6 @@ nv_pc_pass_pre_emission(void *priv, struct nv_basic_block *b)
           nv50_inst_min_size(nvi->next) == 4 &&
           inst_commutation_legal(nvi, nvi->next)) {
          ++n32;
-         debug_printf("permuting: ");
-         nv_print_instruction(nvi);
-         nv_print_instruction(nvi->next);
          nv_nvi_permute(nvi, nvi->next);
          next = nvi;
       } else {
@@ -193,7 +192,7 @@ nv_pc_pass_pre_emission(void *priv, struct nv_basic_block *b)
    }
 
    if (!b->entry) {
-      debug_printf("block %p is now empty\n", b);
+      NV50_DBGMSG("block %p is now empty\n", b);
    } else
    if (!b->exit->is_long) {
       assert(n32);
@@ -221,7 +220,7 @@ nv_pc_exec_pass2(struct nv_pc *pc)
    pc->pass_seq++;
    nv_pass_flatten(&pass, pc->root);
 
-   debug_printf("preparing %u blocks for emission\n", pc->num_blocks);
+   NV50_DBGMSG("preparing %u blocks for emission\n", pc->num_blocks);
 
    pc->bb_list = CALLOC(pc->num_blocks, sizeof(struct nv_basic_block *));
    pc->num_blocks = 0;
@@ -708,21 +707,6 @@ nv_pass_lower_arith(struct nv_pass *ctx, struct nv_basic_block *b)
    return 0;
 }
 
-/*
-set $r2 g f32 $r2 $r3
-cvt abs rn f32 $r2 s32 $r2
-cvt f32 $c0 # f32 $r2
-e $c0 bra 0x80
-*/
-#if 0
-static int
-nv_pass_lower_cond(struct nv_pass *ctx, struct nv_basic_block *b)
-{
-   /* XXX: easier in IR builder for now */
-   return 0;
-}
-#endif
-
 /* TODO: redundant store elimination */
 
 struct load_record {
@@ -936,7 +920,7 @@ nv_pass_flatten(struct nv_pass *ctx, struct nv_basic_block *b)
 
    if (bb_is_if_else_endif(b)) {
 
-      debug_printf("pass_flatten: IF/ELSE/ENDIF construct at BB:%i\n", b->id);
+      NV50_DBGMSG("pass_flatten: IF/ELSE/ENDIF construct at BB:%i\n", b->id);
 
       for (n0 = 0, nvi = b->out[0]->entry; nvi; nvi = nvi->next, ++n0)
          if (!nv50_nvi_can_predicate(nvi))
@@ -945,11 +929,13 @@ nv_pass_flatten(struct nv_pass *ctx, struct nv_basic_block *b)
          for (n1 = 0, nvi = b->out[1]->entry; nvi; nvi = nvi->next, ++n1)
             if (!nv50_nvi_can_predicate(nvi))
                break;
+#ifdef NV50_PC_DEBUG
          if (nvi) {
             debug_printf("cannot predicate: "); nv_print_instruction(nvi);
          }
       } else {
          debug_printf("cannot predicate: "); nv_print_instruction(nvi);
+#endif
       }
 
       if (!nvi && n0 < 12 && n1 < 12) { /* 12 as arbitrary limit */
diff --git a/src/gallium/drivers/nv50/nv50_pc_regalloc.c b/src/gallium/drivers/nv50/nv50_pc_regalloc.c
index d45dd7f95f..59462cc11e 100644
--- a/src/gallium/drivers/nv50/nv50_pc_regalloc.c
+++ b/src/gallium/drivers/nv50/nv50_pc_regalloc.c
@@ -20,6 +20,8 @@
  * SOFTWARE.
  */
 
+/* #define NV50PC_DEBUG */
+
 #include "nv50_context.h"
 #include "nv50_pc.h"
 
@@ -112,15 +114,8 @@ add_range(struct nv_value *val, struct nv_basic_block *b, int end)
    if (bgn < b->entry->serial || bgn > b->exit->serial)
       bgn = b->entry->serial;
 
-   if (bgn > end) {
-      debug_printf("Aieee! BLOCK [%i, %i], RANGE [%i, %i)\n",
-                   b->entry->serial, b->exit->serial, bgn, end);
-   }
    assert(bgn <= end);
 
-   if (bgn < val->insn->serial)
-      debug_printf("WARNING: leaking value %i ?\n", val->n);
-
    add_range_ex(val, bgn, end, NULL);
 }
 
@@ -559,12 +554,8 @@ pass_build_live_sets(struct nv_pc_pass *ctx, struct nv_basic_block *b)
    struct nv_instruction *i;
    int j, n, ret = 0;
 
-   debug_printf("pass_build_live_sets BB:%i\n", b->id);
-
-   if (b->pass_seq >= ctx->pc->pass_seq) {
-      debug_printf("already visited\n");
+   if (b->pass_seq >= ctx->pc->pass_seq)
       return 0;
-   }
    b->pass_seq = ctx->pc->pass_seq;
 
    /* slight hack for undecidedness: set phi = entry if it's undefined */
@@ -595,13 +586,10 @@ pass_build_live_sets(struct nv_pc_pass *ctx, struct nv_basic_block *b)
                break;
             assert(i->src[j]->value->insn);
 
-            if (nvbb_reachable_by(b, i->src[j]->value->insn->bb, b->out[n])) {
+            if (nvbb_reachable_by(b, i->src[j]->value->insn->bb, b->out[n]))
                live_set_add(b, i->src[j]->value);
-               debug_printf("BB:%i liveset + %i\n", b->id, i->src[j]->value->n);
-            } else {
+            else
                live_set_rem(b, i->src[j]->value);
-               debug_printf("BB:%i liveset - %i\n", b->id, i->src[j]->value->n);
-            }
          }
       }
    }
@@ -653,7 +641,7 @@ static void collect_live_values(struct nv_basic_block *b, const int n)
    }
 }
 
-/* NOTE: the live intervals of phi functions start the the first non-phi instruction */
+/* NOTE: the live intervals of phi functions start at the first non-phi insn. */
 static int
 pass_build_intervals(struct nv_pc_pass *ctx, struct nv_basic_block *b)
 {
@@ -661,8 +649,6 @@ pass_build_intervals(struct nv_pc_pass *ctx, struct nv_basic_block *b)
    int j, s;
    const int n = (ctx->pc->num_values + 31) / 32;
 
-   debug_printf("building intervals for BB %i\n", b->id);
-
    /* verify that first block does not have live-in values */
    if (b->num_in == 0)
       for (j = 0; j < n; ++j)
@@ -700,7 +686,6 @@ pass_build_intervals(struct nv_pc_pass *ctx, struct nv_basic_block *b)
          add_range(&ctx->pc->values[j], b, b->exit->serial + 1);
       }
    }
-   debug_printf("%s: looping through instructions now\n", __func__);
 
    i_stop = b->entry ? b->entry->prev : NULL;
 
@@ -763,8 +748,6 @@ insert_ordered_tail(struct nv_value *list, struct nv_value *nval)
 {
    struct nv_value *elem = list->prev;
 
-   // debug_printf("inserting value %i\n", nval->n);
-
    for (elem = list->prev;
 	elem != list && elem->livei->bgn > nval->livei->bgn;
 	elem = elem->prev);
@@ -818,8 +801,6 @@ pass_linear_scan(struct nv_pc_pass *ctx, int iter)
    foreach_s(cur, tmp[0], &unhandled) {
       remove_from_list(cur);
 
-      /* debug_printf("handling value %i\n", cur->n); */
-
       foreach_s(val, tmp[1], &active) {
          if (livei_end(val) <= cur->livei->bgn) {
             reg_release(&free, val);
@@ -878,23 +859,19 @@ nv_pc_exec_pass1(struct nv_pc *pc)
    struct nv_pc_pass *ctx;
    int i, ret;
 
-   debug_printf("REGISTER ALLOCATION - entering\n");
+   NV50_DBGMSG("REGISTER ALLOCATION - entering\n");
 
    ctx = CALLOC_STRUCT(nv_pc_pass);
    if (!ctx)
       return -1;
    ctx->pc = pc;
 
-   nv_print_program(ctx->pc->root);
-
    ctx->insns = CALLOC(NV_PC_MAX_INSTRUCTIONS, sizeof(struct nv_instruction *));
 
    pc->pass_seq++;
    ret = pass_generate_phi_movs(ctx, pc->root);
    assert(!ret);
 
-   nv_print_program(ctx->pc->root);
-
    for (i = 0; i < pc->loop_nesting_bound; ++i) {
       pc->pass_seq++;
       ret = pass_build_live_sets(ctx, pc->root);
@@ -934,8 +911,7 @@ nv_pc_exec_pass1(struct nv_pc *pc)
    for (i = 0; i < pc->num_values; ++i)
       livei_release(&pc->values[i]);
 
-   debug_printf("REGISTER ALLOCATION - leaving\n");
-   nv_print_program(ctx->pc->root);
+   NV50_DBGMSG("REGISTER ALLOCATION - leaving\n");
 
 out:
    FREE(ctx);
diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c
index d4a75dc64a..182a591eb3 100644
--- a/src/gallium/drivers/nv50/nv50_program.c
+++ b/src/gallium/drivers/nv50/nv50_program.c
@@ -20,6 +20,8 @@
  * SOFTWARE.
  */
 
+/* #define NV50_PROGRAM_DEBUG */
+
 #include "nv50_program.h"
 #include "nv50_pc.h"
 #include "nv50_context.h"
@@ -187,8 +189,6 @@ prog_immediate(struct nv50_translation_info *ti,
    int c;
    unsigned n = ++ti->immd32_nr;
 
-   tgsi_dump_immediate(imm);
-
    if (n == (1 << (ffs(n) - 1)))
       ti->immd32 = REALLOC(ti->immd32, (n / 2) * 16, (n * 2) * 16);
 
@@ -228,7 +228,6 @@ prog_decl(struct nv50_translation_info *ti,
       sn = decl->Semantic.Name;
       si = decl->Semantic.Index;
    }
-   tgsi_dump_declaration(decl);
 
    switch (decl->Declaration.File) {
    case TGSI_FILE_INPUT:
@@ -492,6 +491,10 @@ nv50_prog_scan(struct nv50_translation_info *ti)
 
    tgsi_scan_shader(p->pipe.tokens, &ti->scan);
 
+#ifdef NV50_PROGRAM_DEBUG
+   tgsi_dump(p->pipe.tokens, 0);
+#endif
+
    tgsi_parse_init(&parse, p->pipe.tokens);
    while (!tgsi_parse_end_of_tokens(&parse)) {
       tgsi_parse_token(&parse);
diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
index 7b2ccef704..115b5df939 100644
--- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
+++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
@@ -20,6 +20,8 @@
  * SOFTWARE.
  */
 
+/* #define NV50_TGSI2NC_DEBUG */
+
 /* XXX: need to clean this up so we get the typecasting right more naturally */
 
 #include <unistd.h>
@@ -1015,10 +1017,8 @@ emit_fetch(struct bld_context *bld, const struct tgsi_full_instruction *insn,
       abort();
       break;	   
    }
-   if (!res) {
-      debug_printf("WARNING: undefined source value in TGSI instruction\n");
-      return bld_load_imm_u32(bld, 0);
-   }
+   if (!res)
+      return bld_undef(bld, NV_FILE_GPR);
 
    switch (tgsi_util_get_full_src_register_sign_mode(src, chan)) {
    case TGSI_UTIL_SIGN_KEEP:
@@ -1234,7 +1234,9 @@ bld_instruction(struct bld_context *bld,
    int c;
    uint opcode = translate_opcode(insn->Instruction.Opcode);
 
-   tgsi_dump_instruction(insn, 1);
+#ifdef NV50_TGSI2NC_DEBUG
+   debug_printf("bld_instruction:"); tgsi_dump_instruction(insn, 1);
+#endif
 	
    switch (insn->Instruction.Opcode) {
    case TGSI_OPCODE_ADD:
-- 
cgit v1.2.3


From 0a8292e096bc37eeb225bf7d3854b6b6edc4bceb Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Wed, 1 Sep 2010 17:54:56 +0200
Subject: nv50: attempt at making more complicated loops work

Nested loops, and loops with multiple exits (BREAK, CONT).
---
 src/gallium/drivers/nv50/nv50_pc.c          | 20 +++++--
 src/gallium/drivers/nv50/nv50_pc.h          |  6 ++
 src/gallium/drivers/nv50/nv50_pc_optimize.c | 14 +++--
 src/gallium/drivers/nv50/nv50_pc_regalloc.c | 85 ++++++++++++++++++-----------
 src/gallium/drivers/nv50/nv50_tgsi_to_nc.c  | 70 +++++++++++++++++++-----
 5 files changed, 138 insertions(+), 57 deletions(-)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_pc.c b/src/gallium/drivers/nv50/nv50_pc.c
index 1c12fe1b9e..b03f5b27f6 100644
--- a/src/gallium/drivers/nv50/nv50_pc.c
+++ b/src/gallium/drivers/nv50/nv50_pc.c
@@ -220,6 +220,7 @@ edge_name(ubyte type)
    case CFG_EDGE_BACK: return "back";
    case CFG_EDGE_LOOP_ENTER: return "loop";
    case CFG_EDGE_LOOP_LEAVE: return "break";
+   case CFG_EDGE_FAKE: return "fake";
    default:
       return "?";
    }
@@ -247,6 +248,7 @@ nv_pc_pass_in_order(struct nv_basic_block *root, nv_pc_pass_func f, void *priv)
          case CFG_EDGE_BACK:
             continue;
          case CFG_EDGE_FORWARD:
+         case CFG_EDGE_FAKE:
             if (++b->out[j]->priv == b->out[j]->num_in)
                bb[p++] = b->out[j];
             break;
@@ -264,9 +266,11 @@ nv_pc_pass_in_order(struct nv_basic_block *root, nv_pc_pass_func f, void *priv)
 
       f(priv, b);
 
-      if (!p)
-         while (pp > 0)
-            bb[p++] = bbb[--pp];
+      if (!p) {
+         p = pp;
+         for (; pp > 0; --pp)
+            bb[pp - 1] = bbb[pp - 1];
+      }
    }
 }
 
@@ -366,11 +370,17 @@ nv50_generate_code(struct nv50_translation_info *ti)
    ret = nv_pc_exec_pass0(pc);
    if (ret)
       goto out;
+#ifdef NV50PC_DEBUG
+   nv_print_program(pc->root);
+#endif
 
    /* register allocation */
    ret = nv_pc_exec_pass1(pc);
    if (ret)
       goto out;
+#ifdef NV50PC_DEBUG
+   nv_print_program(pc->root);
+#endif
 
    /* prepare for emission */
    ret = nv_pc_exec_pass2(pc);
@@ -580,10 +590,10 @@ nvbb_reachable_by(struct nv_basic_block *bf, struct nv_basic_block *bp,
    if (bp == bt)
       return FALSE;
 
-   if (bp->out[0] && bp->out_kind[0] != CFG_EDGE_BACK &&
+   if (bp->out[0] && !IS_WALL_EDGE(bp->out_kind[0]) &&
        nvbb_reachable_by(bf, bp->out[0], bt))
       return TRUE;
-   if (bp->out[1] && bp->out_kind[1] != CFG_EDGE_BACK &&
+   if (bp->out[1] && !IS_WALL_EDGE(bp->out_kind[1]) &&
        nvbb_reachable_by(bf, bp->out[1], bt))
       return TRUE;
    return FALSE;
diff --git a/src/gallium/drivers/nv50/nv50_pc.h b/src/gallium/drivers/nv50/nv50_pc.h
index 48918f46d5..2bb3ea4374 100644
--- a/src/gallium/drivers/nv50/nv50_pc.h
+++ b/src/gallium/drivers/nv50/nv50_pc.h
@@ -257,6 +257,12 @@ struct nv_instruction {
 #define CFG_EDGE_BACK        1
 #define CFG_EDGE_LOOP_ENTER  2
 #define CFG_EDGE_LOOP_LEAVE  4
+#define CFG_EDGE_FAKE        8
+
+/* A 'WALL' edge is one that the reachability check does not follow */
+/* A 'LOOP' edge is one that just has to do with loops */
+#define IS_LOOP_EDGE(k) ((k) & 7) /* 7 = BACK | LOOP_ENTER | LOOP_LEAVE */
+#define IS_WALL_EDGE(k) ((k) & 9) /* 9 = BACK | FAKE */
 
 struct nv_basic_block {
    struct nv_instruction *entry; /* first non-phi instruction */
diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c
index 4b1cd56fc1..1d2710a8ac 100644
--- a/src/gallium/drivers/nv50/nv50_pc_optimize.c
+++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c
@@ -362,6 +362,9 @@ nv_pass_fold_loads(struct nv_pass *ctx, struct nv_basic_block *b)
          nv_reference(ctx->pc, &nvi->src[j], ld->src[0]->value);
          if (ld->src[4])
             nv_reference(ctx->pc, &nvi->src[4], ld->src[4]->value);
+
+         if (!nv_nvi_refcount(ld))
+            nv_nvi_delete(ld);
       }
    }
    DESCEND_ARBITRARY(j, nv_pass_fold_loads);
@@ -504,7 +507,7 @@ constant_expression(struct nv_pc *pc, struct nv_instruction *nvi,
    u1.u32 = src1->reg.imm.u32;
 
    modifiers_apply(&u0.u32, type, nvi->src[0]->mod);
-   modifiers_apply(&u0.u32, type, nvi->src[1]->mod);
+   modifiers_apply(&u1.u32, type, nvi->src[1]->mod);
 
    switch (nvi->opcode) {
    case NV_OP_MAD:
@@ -951,7 +954,9 @@ nv_pass_flatten(struct nv_pass *ctx, struct nv_basic_block *b)
          if (b->exit && b->exit->opcode == NV_OP_JOINAT)
             nv_nvi_delete(b->exit);
 
-         if ((nvi = b->out[0]->out[0]->entry)) {
+         i = (b->out[0]->out_kind[0] == CFG_EDGE_LOOP_LEAVE) ? 1 : 0;
+
+         if ((nvi = b->out[0]->out[i]->entry)) {
             nvi->is_join = 0;
             if (nvi->opcode == NV_OP_JOIN)
                nv_nvi_delete(nvi);
@@ -980,7 +985,8 @@ nv_pass_cse(struct nv_pass *ctx, struct nv_basic_block *b)
             if (ir->opcode != ik->opcode)
                continue;
 
-            if (ik->opcode == NV_OP_LDA ||
+            if (!ir->def[0] || !ik->def[0] ||
+                ik->opcode == NV_OP_LDA ||
                 ik->opcode == NV_OP_STA ||
                 ik->opcode == NV_OP_MOV ||
                 nv_is_vector_op(ik->opcode))
@@ -993,8 +999,6 @@ nv_pass_cse(struct nv_pass *ctx, struct nv_basic_block *b)
                 ik->flags_def || ir->flags_def)
                continue; /* and also not with flags, for now */
 
-            assert(ik->def[0] && ir->def[0]);
-
             if (ik->def[0]->reg.file == NV_FILE_OUT ||
                 ir->def[0]->reg.file == NV_FILE_OUT ||
                 !values_equal(ik->def[0], ir->def[0]))
diff --git a/src/gallium/drivers/nv50/nv50_pc_regalloc.c b/src/gallium/drivers/nv50/nv50_pc_regalloc.c
index 59462cc11e..81decf8d4a 100644
--- a/src/gallium/drivers/nv50/nv50_pc_regalloc.c
+++ b/src/gallium/drivers/nv50/nv50_pc_regalloc.c
@@ -22,6 +22,10 @@
 
 /* #define NV50PC_DEBUG */
 
+/* #define NV50_RA_DEBUG_LIVEI */
+/* #define NV50_RA_DEBUG_LIVE_SETS */
+/* #define NV50_RA_DEBUG_JOIN */
+
 #include "nv50_context.h"
 #include "nv50_pc.h"
 
@@ -119,7 +123,7 @@ add_range(struct nv_value *val, struct nv_basic_block *b, int end)
    add_range_ex(val, bgn, end, NULL);
 }
 
-#ifdef NV50_RA_DEBUG_JOIN
+#if defined(NV50_RA_DEBUG_JOIN) || defined(NV50_RA_DEBUG_LIVEI)
 static void
 livei_print(struct nv_value *a)
 {
@@ -359,16 +363,37 @@ need_new_else_block(struct nv_basic_block *b, struct nv_basic_block *p)
    int i = 0, n = 0;
 
    for (; i < 2; ++i)
-      if (p->out[i] && p->out_kind[i] != CFG_EDGE_LOOP_LEAVE)
+      if (p->out[i] && !IS_LOOP_EDGE(p->out_kind[i]))
          ++n;
 
    return (b->num_in > 1) && (n == 2);
 }
 
+static int /* index into phi->src[] of the operand chosen for b, or -1 */
+phi_opnd_for_bb(struct nv_instruction *phi, struct nv_basic_block *b,
+                struct nv_basic_block *tb)
+{
+   int i, j; /* i: operand being examined, j: best candidate so far */
+
+   for (j = -1, i = 0; i < 4 && phi->src[i]; ++i) { /* up to 4 operands */
+      if (!nvbb_reachable_by(b, phi->src[i]->value->insn->bb, tb))
+         continue; /* operand's defining block fails the check for b */
+      /* NOTE: back and fake (IS_WALL_EDGE) edges are ignored by the check */
+      if (j < 0 || !nvbb_reachable_by(phi->src[j]->value->insn->bb,
+                                      phi->src[i]->value->insn->bb, tb))
+         j = i; /* first match, or candidate j fails the check against i */
+   }
+   return j; /* still -1 if no operand passed the reachability check */
+}
+
 /* For each operand of each PHI in b, generate a new value by inserting a MOV
  * at the end of the block it is coming from and replace the operand with its
  * result. This eliminates liveness conflicts and enables us to let values be
  * copied to the right register if such a conflict exists nonetheless.
+ *
+ * These MOVs are also crucial in making sure the live intervals of phi sources
+ * are extended until the end of the loop, since they are not included in the
+ * live-in sets.
  */
 static int
 pass_generate_phi_movs(struct nv_pc_pass *ctx, struct nv_basic_block *b)
@@ -404,14 +429,17 @@ pass_generate_phi_movs(struct nv_pc_pass *ctx, struct nv_basic_block *b)
       ctx->pc->current_block = pn;
 
       for (i = b->phi; i && i->opcode == NV_OP_PHI; i = i->next) {
-         for (j = 0; j < 4 && i->src[j]; ++j) {
-            if (nvbb_reachable_by(p, i->src[j]->value->insn->bb, b))
-               break;
-         }
-         if (j >= 4 || !i->src[j])
+         if ((j = phi_opnd_for_bb(i, p, b)) < 0)
             continue;
          val = i->src[j]->value;
 
+         if (i->src[j]->flags) {
+            val = val->insn->src[0]->value;
+            while (j < 4 && i->src[j])
+               ++j;
+            assert(j < 4);
+         }
+
          ni = new_instruction(ctx->pc, NV_OP_MOV);
 
          /* TODO: insert instruction at correct position in the first place */
@@ -423,6 +451,8 @@ pass_generate_phi_movs(struct nv_pc_pass *ctx, struct nv_basic_block *b)
          ni->src[0] = new_ref(ctx->pc, val);
 
          nv_reference(ctx->pc, &i->src[j], ni->def[0]);
+
+         i->src[j]->flags = 1;
       }
 
       if (pn != p && pn->exit) {
@@ -452,8 +482,8 @@ pass_join_values(struct nv_pc_pass *ctx, int iter)
       case NV_OP_PHI:
          if (!iter)
             continue;
-         try_join_values(ctx, i->src[0]->value, i->src[1]->value);
-         try_join_values(ctx, i->def[0], i->src[0]->value);
+         for (c = 0; c < 4 && i->src[c]; ++c)
+            try_join_values(ctx, i->def[0], i->src[c]->value);
          break;
       case NV_OP_MOV:
          if (iter && i->src[0]->value->insn &&
@@ -576,22 +606,6 @@ pass_build_live_sets(struct nv_pc_pass *ctx, struct nv_basic_block *b)
          for (j = 0; j < (ctx->pc->num_values + 31) / 32; ++j)
             b->live_set[j] |= b->out[n]->live_set[j];
       }
-
-      /* Kick values out of our live set that are created in incoming
-       * blocks of our successors that are not us.
-       */
-      for (i = b->out[n]->phi; i && i->opcode == NV_OP_PHI; i = i->next) {
-         for (j = 0; j < 4; ++j) {
-            if (!i->src[j])
-               break;
-            assert(i->src[j]->value->insn);
-
-            if (nvbb_reachable_by(b, i->src[j]->value->insn->bb, b->out[n]))
-               live_set_add(b, i->src[j]->value);
-            else
-               live_set_rem(b, i->src[j]->value);
-         }
-      }
    }
 
    if (!b->entry)
@@ -599,7 +613,7 @@ pass_build_live_sets(struct nv_pc_pass *ctx, struct nv_basic_block *b)
 
    bb_live_set_print(ctx->pc, b);
 
-   for (i = b->exit; i; i = i->prev) {
+   for (i = b->exit; i != b->entry->prev; i = i->prev) {
       for (j = 0; j < 4; j++) {
          if (!i->def[j])
             break;
@@ -617,6 +631,9 @@ pass_build_live_sets(struct nv_pc_pass *ctx, struct nv_basic_block *b)
       if (i->flags_src)
          live_set_add(b, i->flags_src->value);
    }
+   for (i = b->phi; i && i->opcode == NV_OP_PHI; i = i->next)
+      live_set_rem(b, i->def[0]);
+
    bb_live_set_print(ctx->pc, b);
 
    return 0;
@@ -680,10 +697,12 @@ pass_build_intervals(struct nv_pc_pass *ctx, struct nv_basic_block *b)
       for (j = 0; j < ctx->pc->num_values; ++j) {
          if (!(b->live_set[j / 32] & (1 << (j % 32))))
             continue;
+         add_range(&ctx->pc->values[j], b, b->exit->serial + 1);
 #ifdef NV50_RA_DEBUG_LIVEI
-         debug_printf("adding range for live value %i\n", j);
+         debug_printf("adding range for live value %i: ", j);
+         livei_print(&ctx->pc->values[j]);
 #endif
-         add_range(&ctx->pc->values[j], b, b->exit->serial + 1);
+
       }
    }
 
@@ -702,20 +721,22 @@ pass_build_intervals(struct nv_pc_pass *ctx, struct nv_basic_block *b)
       for (j = 0; j < 5; ++j) {
          if (i->src[j] && !live_set_test(b, i->src[j])) {
             live_set_add(b, i->src[j]->value);
+            add_range(i->src[j]->value, b, i->serial);
 #ifdef NV50_RA_DEBUG_LIVEI
-            debug_printf("adding range for source that ends living: %i\n",
+            debug_printf("adding range for source %i (ends living): ",
                          i->src[j]->value->n);
+            livei_print(i->src[j]->value);
 #endif
-            add_range(i->src[j]->value, b, i->serial);
          }
       }
       if (i->flags_src && !live_set_test(b, i->flags_src)) {
          live_set_add(b, i->flags_src->value);
+         add_range(i->flags_src->value, b, i->serial);
 #ifdef NV50_RA_DEBUG_LIVEI
-         debug_printf("adding range for source that ends living: %i\n",
+         debug_printf("adding range for source %i (ends living): ",
                       i->flags_src->value->n);
+         livei_print(i->flags_src->value);
 #endif
-         add_range(i->flags_src->value, b, i->serial);
       }
    }
 
diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
index 115b5df939..8b18a9c025 100644
--- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
+++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
@@ -120,6 +120,8 @@ struct bld_context {
    struct nv_basic_block *brkt_bb[BLD_MAX_LOOP_NESTING];
    int loop_lvl;
 
+   ubyte out_kind; /* CFG_EDGE_FORWARD, or FAKE in case of BREAK/CONT */
+
    struct bld_value_stack tvs[BLD_MAX_TEMPS][4]; /* TGSI_FILE_TEMPORARY */
    struct bld_value_stack avs[BLD_MAX_ADDRS][4]; /* TGSI_FILE_ADDRESS */
    struct bld_value_stack pvs[BLD_MAX_PREDS][4]; /* TGSI_FILE_PREDICATE */
@@ -268,7 +270,7 @@ fetch_by_bb(struct bld_value_stack *stack,
       return;
    }
    for (i = 0; i < b->num_in; ++i)
-      if (b->in_kind[i] != CFG_EDGE_BACK)
+      if (!IS_WALL_EDGE(b->in_kind[i]))
          fetch_by_bb(stack, vals, n, b->in[i]);
 }
 
@@ -362,18 +364,31 @@ bld_phi(struct bld_context *bld, struct nv_basic_block *b,
    return phi->def[0];
 }
 
+/* Insert a phi function in the loop header.
+ * For nested loops, we need to insert phi functions in all the outer
+ * loop headers if they don't have one yet.
+ *
+ * @def: redefinition from inside loop, or NULL if to be replaced later
+ */
 static struct nv_value *
 bld_loop_phi(struct bld_context *bld, struct bld_value_stack *stack,
              struct nv_value *def)
 {
-   struct nv_basic_block *bb = bld->pc->current_block;
    struct nv_instruction *phi;
-   struct nv_value *val;
+   struct nv_basic_block *bb = bld->pc->current_block;
+   struct nv_value *val = NULL;
 
-   val = bld_phi(bld, bld->pc->current_block, stack);
+   if (bld->loop_lvl > 1) {
+      --bld->loop_lvl;
+      if (!((stack->loop_def | stack->loop_use) & (1 << bld->loop_lvl)))
+         val = bld_loop_phi(bld, stack, NULL);
+      ++bld->loop_lvl;
+   }
+
+   if (!val)
+      val = bld_phi(bld, bld->pc->current_block, stack); /* old definition */
    if (!val) {
       bld->pc->current_block = bld->loop_bb[bld->loop_lvl - 1]->in[0];
-
       val = bld_undef(bld, bld_stack_file(bld, stack));
    }
 
@@ -449,10 +464,11 @@ bld_replace_value(struct nv_pc *, struct nv_basic_block *, struct nv_value *,
 static void
 bld_loop_end(struct bld_context *bld, struct nv_basic_block *bb)
 {
+   struct nv_basic_block *save = bld->pc->current_block;
    struct nv_instruction *phi, *next;
    struct nv_value *val;
    struct bld_value_stack *stk;
-   int s;
+   int i, s, n;
 
    for (phi = bb->phi; phi && phi->opcode == NV_OP_PHI; phi = next) {
       next = phi->next;
@@ -460,19 +476,33 @@ bld_loop_end(struct bld_context *bld, struct nv_basic_block *bb)
       stk = (struct bld_value_stack *)phi->target;
       phi->target = NULL;
 
-      val = bld_fetch_global(bld, stk);
+      for (s = 1, n = 0; n < bb->num_in; ++n) {
+         if (bb->in_kind[n] != CFG_EDGE_BACK)
+            continue;
 
-      nv_reference(bld->pc, &phi->src[1], val);
+         assert(s < 4);
+         bld->pc->current_block = bb->in[n];
+         val = bld_fetch_global(bld, stk);
+
+         for (i = 0; i < 4; ++i)
+            if (phi->src[i] && phi->src[i]->value == val)
+               break;
+         if (i == 4)
+            nv_reference(bld->pc, &phi->src[s++], val);
+      }
+      bld->pc->current_block = save;
 
-      s = -1;
       if (phi->src[0]->value == phi->def[0] ||
           phi->src[0]->value == phi->src[1]->value)
          s = 1;
       else
       if (phi->src[1]->value == phi->def[0])
          s = 0;
+      else
+         continue;
 
       if (s >= 0) {
+         /* eliminate the phi */
          bld_vals_del_val(stk, phi->def[0]);
 
          ++bld->pc->pass_seq;
@@ -915,6 +945,8 @@ bld_new_block(struct bld_context *bld, struct nv_basic_block *b)
 
    for (i = 0; i < 128; ++i)
       bld->saved_inputs[i] = NULL;
+
+   bld->out_kind = CFG_EDGE_FORWARD;
 }
 
 static struct nv_value *
@@ -1366,7 +1398,7 @@ bld_instruction(struct bld_context *bld,
       struct nv_basic_block *b = new_basic_block(bld->pc);
 
       --bld->cond_lvl;
-      nvbb_attach_block(bld->pc->current_block, b, CFG_EDGE_FORWARD);
+      nvbb_attach_block(bld->pc->current_block, b, bld->out_kind);
       nvbb_attach_block(bld->cond_bb[bld->cond_lvl], b, CFG_EDGE_FORWARD);
 
       bld->cond_bb[bld->cond_lvl]->exit->target = b;
@@ -1407,8 +1439,10 @@ bld_instruction(struct bld_context *bld,
 
       bld_flow(bld, NV_OP_BREAK, NV_CC_TR, NULL, bb, FALSE);
 
-      /* XXX: don't do this for redundant BRKs */
-      nvbb_attach_block(bld->pc->current_block, bb, CFG_EDGE_LOOP_LEAVE);
+      if (bld->out_kind == CFG_EDGE_FORWARD) /* else we already had BRK/CONT */
+         nvbb_attach_block(bld->pc->current_block, bb, CFG_EDGE_LOOP_LEAVE);
+
+      bld->out_kind = CFG_EDGE_FAKE;
    }
       break;
    case TGSI_OPCODE_CONT:
@@ -1418,11 +1452,17 @@ bld_instruction(struct bld_context *bld,
       bld_flow(bld, NV_OP_BRA, NV_CC_TR, NULL, bb, FALSE);
 
       nvbb_attach_block(bld->pc->current_block, bb, CFG_EDGE_BACK);
+
+      if ((bb = bld->join_bb[bld->cond_lvl - 1])) {
+         bld->join_bb[bld->cond_lvl - 1] = NULL;
+         nv_nvi_delete(bb->exit->prev);
+      }
+      bld->out_kind = CFG_EDGE_FAKE;
    }
       break;
    case TGSI_OPCODE_ENDLOOP:
    {
-      struct nv_basic_block *bb = bld->loop_bb[--bld->loop_lvl];
+      struct nv_basic_block *bb = bld->loop_bb[bld->loop_lvl - 1];
 
       bld_flow(bld, NV_OP_BRA, NV_CC_TR, NULL, bb, FALSE);
 
@@ -1430,7 +1470,7 @@ bld_instruction(struct bld_context *bld,
 
       bld_loop_end(bld, bb); /* replace loop-side operand of the phis */
 
-      bld_new_block(bld, bld->brkt_bb[bld->loop_lvl]);
+      bld_new_block(bld, bld->brkt_bb[--bld->loop_lvl]);
    }
       break;
    case TGSI_OPCODE_ABS:
@@ -1651,7 +1691,7 @@ bld_replace_value(struct nv_pc *pc, struct nv_basic_block *b,
 {
    struct nv_instruction *nvi;
 
-   for (nvi = b->entry; nvi; nvi = nvi->next) {
+   for (nvi = b->phi ? b->phi : b->entry; nvi; nvi = nvi->next) {
       int s;
       for (s = 0; s < 5; ++s) {
          if (!nvi->src[s])
-- 
cgit v1.2.3


From 7145ab214f1bd0d84671936dddb87db05f2861f6 Mon Sep 17 00:00:00 2001
From: Ben Skeggs <bskeggs@redhat.com>
Date: Sat, 28 Aug 2010 18:08:26 +0200
Subject: nv50: DST

---
 src/gallium/drivers/nv50/nv50_tgsi_to_nc.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
index 8b18a9c025..0ea2912846 100644
--- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
+++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
@@ -1337,6 +1337,19 @@ bld_instruction(struct bld_context *bld,
       FOR_EACH_DST0_ENABLED_CHANNEL(c, insn)
          dst0[c] = temp;
       break;
+   case TGSI_OPCODE_DST:
+      if (insn->Dst[0].Register.WriteMask & 1)
+         dst0[0] = bld_imm_f32(bld, 1.0f);
+      if (insn->Dst[0].Register.WriteMask & 2) {
+         src0 = emit_fetch(bld, insn, 0, 1);
+         src1 = emit_fetch(bld, insn, 1, 1);
+         dst0[1] = bld_insn_2(bld, NV_OP_MUL, src0, src1);
+      }
+      if (insn->Dst[0].Register.WriteMask & 4)
+         dst0[2] = emit_fetch(bld, insn, 0, 2);
+      if (insn->Dst[0].Register.WriteMask & 8)
+         dst0[3] = emit_fetch(bld, insn, 1, 3);
+      break;
    case TGSI_OPCODE_EX2:
       src0 = emit_fetch(bld, insn, 0, 0);
       temp = bld_insn_1(bld, NV_OP_PREEX2, src0);
-- 
cgit v1.2.3


From e02c63bc10fd935537441917a10fef63fb3f9bfa Mon Sep 17 00:00:00 2001
From: Ben Skeggs <bskeggs@redhat.com>
Date: Sat, 28 Aug 2010 18:10:09 +0200
Subject: nv50: DPH

---
 src/gallium/drivers/nv50/nv50_tgsi_to_nc.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
index 0ea2912846..5ac61f108e 100644
--- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
+++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
@@ -1337,6 +1337,13 @@ bld_instruction(struct bld_context *bld,
       FOR_EACH_DST0_ENABLED_CHANNEL(c, insn)
          dst0[c] = temp;
       break;
+   case TGSI_OPCODE_DPH:
+      src0 = bld_dot(bld, insn, 3);
+      src1 = emit_fetch(bld, insn, 1, 3);
+      temp = bld_insn_2(bld, NV_OP_ADD, src0, src1);
+      FOR_EACH_DST0_ENABLED_CHANNEL(c, insn)
+         dst0[c] = temp;
+      break;
    case TGSI_OPCODE_DST:
       if (insn->Dst[0].Register.WriteMask & 1)
          dst0[0] = bld_imm_f32(bld, 1.0f);
-- 
cgit v1.2.3


From 917c79b384af9da95d2fe3ad86d488478d0d7718 Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Tue, 31 Aug 2010 19:03:35 +0200
Subject: nv50: SSG

---
 src/gallium/drivers/nv50/nv50_pc_emit.c     |  4 ++--
 src/gallium/drivers/nv50/nv50_pc_optimize.c | 13 +++++--------
 src/gallium/drivers/nv50/nv50_tgsi_to_nc.c  | 11 +++++++++++
 3 files changed, 18 insertions(+), 10 deletions(-)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_pc_emit.c b/src/gallium/drivers/nv50/nv50_pc_emit.c
index 7808335e50..e1d7bc6459 100644
--- a/src/gallium/drivers/nv50/nv50_pc_emit.c
+++ b/src/gallium/drivers/nv50/nv50_pc_emit.c
@@ -729,7 +729,7 @@ emit_bitop2(struct nv_pc *pc, struct nv_instruction *i)
 {
    pc->emit[0] = 0xd0000000;
 
-   if (SFILE(i, 0) == NV_FILE_IMM) {
+   if (SFILE(i, 1) == NV_FILE_IMM) {
       emit_form_IMM(pc, i, 0);
 
       if (i->opcode == NV_OP_OR)
@@ -761,7 +761,7 @@ emit_arl(struct nv_pc *pc, struct nv_instruction *i)
    pc->emit[0] = 0x00000001;
    pc->emit[1] = 0xc0000000;
 
-   set_dst(pc, i->def[0]);
+   pc->emit[0] |= (i->def[0]->reg.id + 1) << 2;
    set_pred(pc, i);
    set_src_0(pc, i->src[0]);
    pc->emit[0] |= (get_immd_u32(i->src[1]) & 0x3f) << 16;
diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c
index 1d2710a8ac..4a3a51512e 100644
--- a/src/gallium/drivers/nv50/nv50_pc_optimize.c
+++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c
@@ -264,11 +264,8 @@ check_swap_src_0_1(struct nv_instruction *nvi)
       return;
    assert(src0 && src1);
 
-   if (src1->value->reg.file == NV_FILE_IMM) {
-      /* should only be present from folding a constant MUL part of a MAD */
-      assert(nvi->opcode == NV_OP_ADD);
+   if (src1->value->reg.file == NV_FILE_IMM)
       return;
-   }
 
    if (is_cmem_load(src0->value->insn)) {
       if (!is_cmem_load(src1->value->insn)) {
@@ -305,7 +302,7 @@ nv_pass_fold_stores(struct nv_pass *ctx, struct nv_basic_block *b)
          continue;
 
       nvi = sti->src[0]->value->insn;
-      if (!nvi || nvi->opcode == NV_OP_PHI)
+      if (!nvi || nvi->opcode == NV_OP_PHI || nv_is_vector_op(nvi->opcode))
          continue;
       assert(nvi->def[0] == sti->src[0]->value);
 
@@ -536,9 +533,9 @@ constant_expression(struct nv_pc *pc, struct nv_instruction *nvi,
       break;
    case NV_OP_SUB:
       switch (type) {
-      case NV_TYPE_F32: u.f32 = u0.f32 - u1.f32;
-      case NV_TYPE_U32: u.u32 = u0.u32 - u1.u32;
-      case NV_TYPE_S32: u.s32 = u0.s32 - u1.s32;
+      case NV_TYPE_F32: u.f32 = u0.f32 - u1.f32; break;
+      case NV_TYPE_U32: u.u32 = u0.u32 - u1.u32; break;
+      case NV_TYPE_S32: u.s32 = u0.s32 - u1.s32; break;
       default:
          assert(0);
          break;
diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
index 5ac61f108e..0a4c88c817 100644
--- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
+++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
@@ -1595,6 +1595,17 @@ bld_instruction(struct bld_context *bld,
       if (insn->Dst[0].Register.WriteMask & 0x8)
          dst0[3] = bld_imm_f32(bld, 1.0f);
       break;
+   case TGSI_OPCODE_SSG:
+      FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) {
+         src0 = emit_fetch(bld, insn, 0, c);
+         src1 = bld_predicate(bld, src0, FALSE);
+         temp = bld_insn_2(bld, NV_OP_AND, src0, bld_imm_u32(bld, 0x80000000));
+         temp = bld_insn_2(bld, NV_OP_OR,  temp, bld_imm_f32(bld, 1.0f));
+         dst0[c] = bld_insn_2(bld, NV_OP_XOR, temp, temp);
+         dst0[c]->insn->cc = NV_CC_EQ;
+         nv_reference(bld->pc, &dst0[c]->insn->flags_src, src1);
+      }
+      break;
    case TGSI_OPCODE_SUB:
       FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) {
          src0 = emit_fetch(bld, insn, 0, c);
-- 
cgit v1.2.3


From 07fe7c2f02dbf4e0c385aaf3f21ee858f0ae974c Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Tue, 31 Aug 2010 19:09:15 +0200
Subject: nv50: make FrontFacing -1 or +1

---
 src/gallium/drivers/nv50/nv50_tgsi_to_nc.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
index 0a4c88c817..c98d5e126a 100644
--- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
+++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
@@ -964,6 +964,14 @@ bld_saved_input(struct bld_context *bld, unsigned i, unsigned c)
 static struct nv_value *
 bld_interpolate(struct bld_context *bld, unsigned mode, struct nv_value *val)
 {
+   if (val->reg.id == 255) {
+      /* gl_FrontFacing: 0/~0 to -1.0/+1.0 */
+      val = bld_insn_1(bld, NV_OP_LINTERP, val);
+      val = bld_insn_2(bld, NV_OP_SHL, val, bld_imm_u32(bld, 31));
+      val->insn->src[0]->typecast = NV_TYPE_U32;
+      val = bld_insn_2(bld, NV_OP_XOR, val, bld_imm_f32(bld, -1.0f));
+      val->insn->src[0]->typecast = NV_TYPE_U32;
+   } else
    if (mode & (NV50_INTERP_LINEAR | NV50_INTERP_FLAT))
       val = bld_insn_1(bld, NV_OP_LINTERP, val);
    else
@@ -1029,9 +1037,8 @@ emit_fetch(struct bld_context *bld, const struct tgsi_full_instruction *insn,
       } else {
          assert(src->Dimension.Dimension == 0);
          res = bld_insn_1(bld, NV_OP_LDA, res);
+         assert(res->reg.type == type);
       }
-      assert(res->reg.type == type);
-
       bld->saved_inputs[bld->ti->input_map[idx][swz]] = res;
       break;
    case TGSI_FILE_TEMPORARY:
-- 
cgit v1.2.3


From 6f9978050eb8648888a728fc09b99e279c2b7b15 Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Tue, 31 Aug 2010 19:17:46 +0200
Subject: nv50: re-add proper TEXBIAS sequence

---
 src/gallium/drivers/nv50/nv50_pc.c          |  29 ++++
 src/gallium/drivers/nv50/nv50_pc.h          |   9 +-
 src/gallium/drivers/nv50/nv50_pc_emit.c     |  28 +++-
 src/gallium/drivers/nv50/nv50_pc_optimize.c |  34 ++---
 src/gallium/drivers/nv50/nv50_pc_print.c    |   5 +-
 src/gallium/drivers/nv50/nv50_pc_regalloc.c |   8 +-
 src/gallium/drivers/nv50/nv50_tgsi_to_nc.c  | 212 ++++++++++++++++++++++++----
 7 files changed, 258 insertions(+), 67 deletions(-)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_pc.c b/src/gallium/drivers/nv50/nv50_pc.c
index b03f5b27f6..28e32eadb7 100644
--- a/src/gallium/drivers/nv50/nv50_pc.c
+++ b/src/gallium/drivers/nv50/nv50_pc.c
@@ -204,6 +204,35 @@ nvcg_replace_value(struct nv_pc *pc, struct nv_value *old_val,
    return n;
 }
 
+struct nv_value * /* the constant value behind ref, or NULL if there is none */
+nvcg_find_constant(struct nv_ref *ref)
+{
+   struct nv_value *src;
+
+   if (!ref)
+      return NULL; /* a missing operand has no constant */
+
+   src = ref->value;
+   while (src->insn && src->insn->opcode == NV_OP_MOV) { /* skip MOV chains */
+      assert(!src->insn->src[0]->mod); /* MOV sources must be unmodified */
+      src = src->insn->src[0]->value;
+   }
+   if ((src->reg.file == NV_FILE_IMM) || /* an immediate, or ... */
+       (src->insn && src->insn->opcode == NV_OP_LDA && /* ... an LDA from */
+        src->insn->src[0]->value->reg.file >= NV_FILE_MEM_C(0) && /* files */
+        src->insn->src[0]->value->reg.file <= NV_FILE_MEM_C(15))) /* C0..C15 */
+      return src;
+   return NULL; /* neither an immediate nor a load from a MEM_C file */
+}
+
+struct nv_value * /* the immediate value behind ref, or NULL if there is none */
+nvcg_find_immediate(struct nv_ref *ref)
+{
+   struct nv_value *src = nvcg_find_constant(ref); /* may be NULL */
+
+   return (src && src->reg.file == NV_FILE_IMM) ? src : NULL; /* IMM only */
+}
+
 static void
 nv_pc_free_refs(struct nv_pc *pc)
 {
diff --git a/src/gallium/drivers/nv50/nv50_pc.h b/src/gallium/drivers/nv50/nv50_pc.h
index 2bb3ea4374..adc46dec8d 100644
--- a/src/gallium/drivers/nv50/nv50_pc.h
+++ b/src/gallium/drivers/nv50/nv50_pc.h
@@ -126,6 +126,7 @@
 #define NV_TYPE_ISINT(t) ((t) <= 5)
 #define NV_TYPE_ISFLT(t) ((t) & 0x08)
 
+/* $cX registers contain 4 bits: OCSZ (Z is bit 0) */
 #define NV_CC_FL 0x0
 #define NV_CC_LT 0x1
 #define NV_CC_EQ 0x2
@@ -135,6 +136,10 @@
 #define NV_CC_GE 0x6
 #define NV_CC_U  0x8
 #define NV_CC_TR 0xf
+#define NV_CC_O  0x10
+#define NV_CC_C  0x11
+#define NV_CC_A  0x12
+#define NV_CC_S  0x13
 
 #define NV_PC_MAX_INSTRUCTIONS 2048
 #define NV_PC_MAX_VALUES (NV_PC_MAX_INSTRUCTIONS * 4)
@@ -241,7 +246,7 @@ struct nv_instruction {
    ubyte saturate : 1;
    ubyte centroid : 1;
    ubyte flat     : 1;
-   ubyte padding  : 4;
+   ubyte lanes    : 4;
    ubyte tex_live : 1;
    /* */
    ubyte tex_t; /* TIC binding */
@@ -459,6 +464,8 @@ boolean nvbb_reachable_by(struct nv_basic_block *, struct nv_basic_block *,
 struct nv_basic_block *nvbb_dom_frontier(struct nv_basic_block *);
 int nvcg_replace_value(struct nv_pc *pc, struct nv_value *old_val,
                        struct nv_value *new_val);
+struct nv_value *nvcg_find_immediate(struct nv_ref *);
+struct nv_value *nvcg_find_constant(struct nv_ref *);
 
 typedef void (*nv_pc_pass_func)(void *priv, struct nv_basic_block *b);
 
diff --git a/src/gallium/drivers/nv50/nv50_pc_emit.c b/src/gallium/drivers/nv50/nv50_pc_emit.c
index e1d7bc6459..bb0a6f32d1 100644
--- a/src/gallium/drivers/nv50/nv50_pc_emit.c
+++ b/src/gallium/drivers/nv50/nv50_pc_emit.c
@@ -540,8 +540,9 @@ emit_mov(struct nv_pc *pc, struct nv_instruction *i)
       set_a16_bits(pc, SREG(i->src[0])->id);
    } else
    if (DFILE(i, 0) == NV_FILE_FLAGS) {
-      pc->emit[0] = 0x000001fd;
-      pc->emit[1] = 0xa0000788 | (1 << 6);
+      pc->emit[0] = 0x00000001;
+      pc->emit[1] = 0xa0000000 | (1 << 6);
+      set_pred(pc, i);
       pc->emit[0] |= SREG(i->src[0])->id << 9;
       pc->emit[1] |= DREG(i->def[0])->id << 4;
    } else
@@ -984,7 +985,7 @@ emit_tex(struct nv_pc *pc, struct nv_instruction *i)
    pc->emit[0] |= i->tex_t << 9;
    pc->emit[0] |= i->tex_s << 17;
 
-   pc->emit[0] |= i->tex_argc << 22;
+   pc->emit[0] |= (i->tex_argc - 1) << 22;
 
    pc->emit[0] |= (i->tex_mask & 0x3) << 25;
    pc->emit[1] |= (i->tex_mask & 0xc) << 12;
@@ -1000,8 +1001,6 @@ emit_tex(struct nv_pc *pc, struct nv_instruction *i)
    else
    if (i->opcode == NV_OP_TXL)
       pc->emit[1] |= 0x40000000;
-   else
-      pc->emit[0] -= 1 << 22;
 }
 
 static void
@@ -1053,6 +1052,20 @@ emit_ddy(struct nv_pc *pc, struct nv_instruction *i)
    set_pred_wr(pc, i);
 }
 
+static void /* writes the two-word encoding of a QUADOP into pc->emit[] */
+emit_quadop(struct nv_pc *pc, struct nv_instruction *i)
+{
+   pc->emit[0] = 0xc0000000; /* fixed bits of the first word */
+   pc->emit[1] = 0x80000000; /* fixed bits of the second word */
+
+   emit_form_ADD(pc, i); /* presumably fills in the operand fields - confirm */
+
+   pc->emit[0] |= i->lanes << 16; /* 4-bit 'lanes' bitfield at bits 16..19 */
+
+   pc->emit[0] |= (i->quadop & 0x03) << 20; /* quadop bits 0..1 -> word 0 */
+   pc->emit[1] |= (i->quadop & 0xfc) << 20; /* quadop bits 2..7 -> word 1 */
+}
+
 void
 nv50_emit_instruction(struct nv_pc *pc, struct nv_instruction *i)
 {
@@ -1132,6 +1145,9 @@ nv50_emit_instruction(struct nv_pc *pc, struct nv_instruction *i)
    case NV_OP_TXL:
       emit_tex(pc, i);
       break;
+   case NV_OP_QUADOP:
+      emit_quadop(pc, i);
+      break;
    case NV_OP_KIL:
       emit_flow(pc, i, 0x0);
       break;
@@ -1162,7 +1178,7 @@ nv50_emit_instruction(struct nv_pc *pc, struct nv_instruction *i)
    case NV_OP_UNDEF:
    case NV_OP_SUB:
       NOUVEAU_ERR("operation \"%s\" should have been eliminated\n",
-		  nv_opcode_name(i->opcode));
+                  nv_opcode_name(i->opcode));
       break;
    default:
       NOUVEAU_ERR("unhandled NV_OP: %d\n", i->opcode);
diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c
index 4a3a51512e..fb95da30f2 100644
--- a/src/gallium/drivers/nv50/nv50_pc_optimize.c
+++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c
@@ -94,14 +94,17 @@ nvi_isnop(struct nv_instruction *nvi)
    if (nvi->opcode == NV_OP_EXPORT || nvi->opcode == NV_OP_UNDEF)
       return TRUE;
 
-   if (nvi->fixed ||
-       nvi->is_terminator ||
-       nvi->flags_src ||
+   /* NOTE: 'fixed' now only means that it shouldn't be optimized away,
+    *  but we can still remove it if it is a no-op move.
+    */
+   if (/* nvi->fixed || */
+       /* nvi->flags_src || */ /* cond. MOV to same register is still NOP */
        nvi->flags_def ||
+       nvi->is_terminator ||
        nvi->is_join)
       return FALSE;
 
-   if (nvi->def[0]->join->reg.id < 0)
+   if (nvi->def[0] && nvi->def[0]->join->reg.id < 0)
       return TRUE;
 
    if (nvi->opcode != NV_OP_MOV && nvi->opcode != NV_OP_SELECT)
@@ -436,22 +439,6 @@ nv_pass_lower_mods(struct nv_pass *ctx, struct nv_basic_block *b)
 
 #define SRC_IS_MUL(s) ((s)->insn && (s)->insn->opcode == NV_OP_MUL)
 
-static struct nv_value *
-find_immediate(struct nv_ref *ref)
-{
-   struct nv_value *src;
-
-   if (!ref)
-      return NULL;
-
-   src = ref->value;
-   while (src->insn && src->insn->opcode == NV_OP_MOV) {
-      assert(!src->insn->src[0]->mod);
-      src = src->insn->src[0]->value;
-   }
-   return (src->reg.file == NV_FILE_IMM) ? src : NULL;
-}
-
 static void
 modifiers_apply(uint32_t *val, ubyte type, ubyte mod)
 {
@@ -663,8 +650,8 @@ nv_pass_lower_arith(struct nv_pass *ctx, struct nv_basic_block *b)
 
       next = nvi->next;
 
-      src0 = find_immediate(nvi->src[0]);
-      src1 = find_immediate(nvi->src[1]);
+      src0 = nvcg_find_immediate(nvi->src[0]);
+      src1 = nvcg_find_immediate(nvi->src[1]);
 
       if (src0 && src1)
          constant_expression(ctx->pc, nvi, src0, src1);
@@ -778,6 +765,7 @@ nv_pass_reload_elim(struct nv_pass_reld_elim *ctx, struct nv_basic_block *b)
          if (ld->def[0]->reg.id >= 0)
             it->value = ld->def[0];
          else
+         if (!ld->fixed)
             nvcg_replace_value(ctx->pc, ld->def[0], it->value);
       } else {
          if (ctx->alloc == LOAD_RECORD_POOL_SIZE)
@@ -979,7 +967,7 @@ nv_pass_cse(struct nv_pass *ctx, struct nv_basic_block *b)
       for (ir = entry; ir; ir = next) {
          next = ir->next;
          for (ik = entry; ik != ir; ik = ik->next) {
-            if (ir->opcode != ik->opcode)
+            if (ir->opcode != ik->opcode || ir->fixed)
                continue;
 
             if (!ir->def[0] || !ik->def[0] ||
diff --git a/src/gallium/drivers/nv50/nv50_pc_print.c b/src/gallium/drivers/nv50/nv50_pc_print.c
index 7bdeb1c78d..01a6f00997 100644
--- a/src/gallium/drivers/nv50/nv50_pc_print.c
+++ b/src/gallium/drivers/nv50/nv50_pc_print.c
@@ -102,7 +102,8 @@ static const char *nv_opcode_names[NV_OP_COUNT + 1] = {
 static const char *nv_cond_names[] =
 {
    "never", "lt" , "eq" , "le" , "gt" , "ne" , "ge" , "",
-   "never", "ltu", "equ", "leu", "gtu", "neu", "geu", ""
+   "never", "ltu", "equ", "leu", "gtu", "neu", "geu", "",
+   "o", "c", "a", "s"
 };
 
 static const char *nv_modifier_strings[] =
@@ -144,7 +145,7 @@ nv_type_name(ubyte type)
 static INLINE const char *
 nv_cond_name(ubyte cc)
 {
-   return nv_cond_names[MIN2(cc, 15)];
+   return nv_cond_names[MIN2(cc, 19)];
 }
 
 static INLINE const char *
diff --git a/src/gallium/drivers/nv50/nv50_pc_regalloc.c b/src/gallium/drivers/nv50/nv50_pc_regalloc.c
index 81decf8d4a..e689d349f1 100644
--- a/src/gallium/drivers/nv50/nv50_pc_regalloc.c
+++ b/src/gallium/drivers/nv50/nv50_pc_regalloc.c
@@ -493,10 +493,10 @@ pass_join_values(struct nv_pc_pass *ctx, int iter)
       case NV_OP_SELECT:
          if (!iter)
             break;
-         assert(join_allowed(ctx, i->def[0], i->src[0]->value));
-         assert(join_allowed(ctx, i->def[0], i->src[1]->value));
-         do_join_values(ctx, i->def[0], i->src[0]->value);
-         do_join_values(ctx, i->def[0], i->src[1]->value);
+         for (c = 0; c < 4 && i->src[c]; ++c) {
+            assert(join_allowed(ctx, i->def[0], i->src[c]->value));
+            do_join_values(ctx, i->def[0], i->src[c]->value);
+         }
          break;
       case NV_OP_TEX:
       case NV_OP_TXB:
diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
index c98d5e126a..27d851e9fd 100644
--- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
+++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
@@ -1156,8 +1156,8 @@ get_tex_dim(const struct tgsi_full_instruction *insn, int *dim, int *arg)
 
 static void
 load_proj_tex_coords(struct bld_context *bld,
-		     struct nv_value *t[4], int dim,
-		     const struct tgsi_full_instruction *insn)
+                     struct nv_value *t[4], int dim,
+                     const struct tgsi_full_instruction *insn)
 {
    int c, mask = 0;
 
@@ -1188,59 +1188,209 @@ load_proj_tex_coords(struct bld_context *bld,
    }
 }
 
+/* For each thread of a quad (top left, top right, bottom left, bottom right
+ * pixel), do a different operation, and take src0 from a specific thread.
+ */
+#define QOP_ADD 0
+#define QOP_SUBR 1
+#define QOP_SUB 2
+#define QOP_MOV1 3
+
+#define QOP(a, b, c, d) \
+   ((QOP_##a << 0) | (QOP_##b << 2) | (QOP_##c << 4) | (QOP_##d << 6))
+
+static INLINE struct nv_value *
+bld_quadop(struct bld_context *bld, ubyte qop, struct nv_value *src0, int lane,
+           struct nv_value *src1, boolean wp)
+{
+   struct nv_value *val = bld_insn_2(bld, NV_OP_QUADOP, src0, src1);
+   val->insn->lanes = lane;
+   val->insn->quadop = qop;
+   if (wp) {
+      val->insn->flags_def = new_value(bld->pc, NV_FILE_FLAGS, NV_TYPE_U16);
+      val->insn->flags_def->insn = val->insn;
+   }
+   return val;
+}
+
+static INLINE struct nv_value *
+bld_cmov(struct bld_context *bld,
+         struct nv_value *src, ubyte cc, struct nv_value *cr)
+{
+   src = bld_insn_1(bld, NV_OP_MOV, src);
+
+   src->insn->cc = cc;
+   src->insn->flags_src = new_ref(bld->pc, cr);
+
+   return src;
+}
+
+static struct nv_instruction *
+emit_tex(struct bld_context *bld, uint opcode,
+         struct nv_value *dst[4], struct nv_value *t_in[4],
+         int argc, int tic, int tsc, int cube)
+{
+   struct nv_value *t[4];
+   struct nv_instruction *nvi;
+   int c;
+
+   /* the inputs to a tex instruction must be separate values */
+   for (c = 0; c < argc; ++c) {
+      t[c] = bld_insn_1(bld, NV_OP_MOV, t_in[c]);
+      t[c]->reg.type = NV_TYPE_F32;
+      t[c]->insn->fixed = 1;
+   }
+
+   nvi = new_instruction(bld->pc, opcode);
+
+   for (c = 0; c < 4; ++c)
+      dst[c] = bld_def(nvi, c, new_value(bld->pc, NV_FILE_GPR, NV_TYPE_F32));
+
+   for (c = 0; c < argc; ++c)
+      nvi->src[c] = new_ref(bld->pc, t[c]);
+
+   nvi->tex_t = tic;
+   nvi->tex_s = tsc;
+   nvi->tex_mask = 0xf;
+   nvi->tex_cube = cube;
+   nvi->tex_live = 0;
+   nvi->tex_argc = argc;
+
+   return nvi;
+}
+
+static void
+bld_texlod_sequence(struct bld_context *bld,
+                    struct nv_value *dst[4], struct nv_value *t[4], int arg,
+                    int tic, int tsc, int cube)
+{
+   emit_tex(bld, NV_OP_TXL, dst, t, arg, tic, tsc, cube); /* TODO */
+}
+
+
+/* The lanes of a quad are grouped by the bit in the condition register
+ * they have set, which is selected by differing bias values.
+ * Move the input values for TEX into a new register set for each group
+ * and execute TEX only for a specific group.
+ * We always need to use 4 new registers for the inputs/outputs because
+ * the implicitly calculated derivatives must be correct.
+ */
+static void
+bld_texbias_sequence(struct bld_context *bld,
+                     struct nv_value *dst[4], struct nv_value *t[4], int arg,
+                     int tic, int tsc, int cube)
+{
+   struct nv_instruction *sel, *tex;
+   struct nv_value *bit[4], *cr[4], *res[4][4], *val;
+   int l, c;
+
+   const ubyte cc[4] = { NV_CC_EQ, NV_CC_S, NV_CC_C, NV_CC_O };
+
+   for (l = 0; l < 4; ++l) {
+      bit[l] = bld_load_imm_u32(bld, 1 << l);
+
+      val = bld_quadop(bld, QOP(SUBR, SUBR, SUBR, SUBR),
+                       t[arg - 1], l, t[arg - 1], TRUE);
+
+      cr[l] = bld_cmov(bld, bit[l], NV_CC_EQ, val->insn->flags_def);
+
+      cr[l]->reg.file = NV_FILE_FLAGS;
+      cr[l]->reg.type = NV_TYPE_U16;
+   }
+
+   sel = new_instruction(bld->pc, NV_OP_SELECT);
+
+   for (l = 0; l < 4; ++l)
+      sel->src[l] = new_ref(bld->pc, cr[l]);
+
+   bld_def(sel, 0, new_value(bld->pc, NV_FILE_FLAGS, NV_TYPE_U16));
+
+   for (l = 0; l < 4; ++l) {
+      tex = emit_tex(bld, NV_OP_TXB, dst, t, arg, tic, tsc, cube);
+
+      tex->cc = cc[l];
+      tex->flags_src = new_ref(bld->pc, sel->def[0]);
+
+      for (c = 0; c < 4; ++c)
+         res[l][c] = tex->def[c];
+   }
+
+   for (l = 0; l < 4; ++l)
+      for (c = 0; c < 4; ++c)
+         res[l][c] = bld_cmov(bld, res[l][c], cc[l], sel->def[0]);
+
+   for (c = 0; c < 4; ++c) {
+      sel = new_instruction(bld->pc, NV_OP_SELECT);
+
+      for (l = 0; l < 4; ++l)
+         sel->src[l] = new_ref(bld->pc, res[l][c]);
+
+      bld_def(sel, 0, (dst[c] = new_value(bld->pc, NV_FILE_GPR, NV_TYPE_F32)));
+   }
+}
+
+static boolean
+bld_is_constant(struct nv_value *val)
+{
+   if (val->reg.file == NV_FILE_IMM)
+      return TRUE;
+   return val->insn && nvcg_find_constant(val->insn->src[0]);
+}
+
 static void
 bld_tex(struct bld_context *bld, struct nv_value *dst0[4],
         const struct tgsi_full_instruction *insn)
 {
-   struct nv_value *t[4];
-   struct nv_instruction *nvi;
+   struct nv_value *t[4], *s[3];
    uint opcode = translate_opcode(insn->Instruction.Opcode);
    int arg, dim, c;
+   const int tic = insn->Src[1].Register.Index;
+   const int tsc = 0;
+   const int cube = (insn->Texture.Texture  == TGSI_TEXTURE_CUBE) ? 1 : 0;
 
    get_tex_dim(insn, &dim, &arg);
 
-   if (insn->Texture.Texture == TGSI_TEXTURE_CUBE) {
-   }
-   // else
-   if (insn->Instruction.Opcode == TGSI_OPCODE_TXP) {
+   if (!cube && insn->Instruction.Opcode == TGSI_OPCODE_TXP)
       load_proj_tex_coords(bld, t, dim, insn);
-   } else
+   else
       for (c = 0; c < dim; ++c)
          t[c] = emit_fetch(bld, insn, 0, c);
 
-   if (arg != dim)
-      t[dim] = emit_fetch(bld, insn, 0, 2);
+   if (cube) {
+      assert(dim >= 3);
+      for (c = 0; c < 3; ++c)
+         s[c] = bld_insn_1(bld, NV_OP_ABS, t[c]);
 
-   if (insn->Instruction.Opcode == TGSI_OPCODE_TXB ||
-       insn->Instruction.Opcode == TGSI_OPCODE_TXL) {
-      t[arg++] = emit_fetch(bld, insn, 0, 3);
-   }
+      s[0] = bld_insn_2(bld, NV_OP_MAX, s[0], s[1]);
+      s[0] = bld_insn_2(bld, NV_OP_MAX, s[0], s[2]);
+      s[0] = bld_insn_1(bld, NV_OP_RCP, s[0]);
 
-   for (c = 0; c < arg; ++c) {
-      t[c] = bld_insn_1(bld, NV_OP_MOV, t[c]);
-      t[c]->reg.type = NV_TYPE_F32;
+      for (c = 0; c < 3; ++c)
+         t[c] = bld_insn_2(bld, NV_OP_MUL, t[c], s[0]);
    }
 
-   nvi = new_instruction(bld->pc, opcode);
+   if (arg != dim)
+      t[dim] = emit_fetch(bld, insn, 0, 2);
 
-   for (c = 0; c < 4; ++c) {
-      nvi->def[c] = dst0[c] = new_value(bld->pc, NV_FILE_GPR, NV_TYPE_F32);
-      nvi->def[c]->insn = nvi;
+   if (opcode == NV_OP_TXB || opcode == NV_OP_TXL) {
+      t[arg++] = emit_fetch(bld, insn, 0, 3);
+
+      if ((bld->ti->p->type == PIPE_SHADER_FRAGMENT) &&
+          !bld_is_constant(t[arg - 1])) {
+         if (opcode == NV_OP_TXB)
+            bld_texbias_sequence(bld, dst0, t, arg, tic, tsc, cube);
+         else
+            bld_texlod_sequence(bld, dst0, t, arg, tic, tsc, cube);
+         return;
+      }
    }
-   for (c = 0; c < arg; ++c)
-      nvi->src[c] = new_ref(bld->pc, t[c]);
 
-   nvi->tex_t = insn->Src[1].Register.Index;
-   nvi->tex_s = 0;
-   nvi->tex_mask = 0xf;
-   nvi->tex_cube = (insn->Texture.Texture == TGSI_TEXTURE_CUBE) ? 1 : 0;
-   nvi->tex_live = 0;
-   nvi->tex_argc = arg;
+   emit_tex(bld, opcode, dst0, t, arg, tic, tsc, cube);
 }
 
 static INLINE struct nv_value *
 bld_dot(struct bld_context *bld, const struct tgsi_full_instruction *insn,
-	int n)
+        int n)
 {
    struct nv_value *dotp, *src0, *src1;
    int c;
-- 
cgit v1.2.3


From e08f70a41d1012a0270468866614485a3415168e Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Tue, 31 Aug 2010 20:36:45 +0200
Subject: nv50: make use of TGSI immediate type

---
 src/gallium/drivers/nv50/nv50_program.c    | 14 ++++++++++----
 src/gallium/drivers/nv50/nv50_program.h    |  1 +
 src/gallium/drivers/nv50/nv50_tgsi_to_nc.c | 10 +++++++++-
 3 files changed, 20 insertions(+), 5 deletions(-)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c
index 182a591eb3..523603ca3a 100644
--- a/src/gallium/drivers/nv50/nv50_program.c
+++ b/src/gallium/drivers/nv50/nv50_program.c
@@ -187,13 +187,14 @@ prog_immediate(struct nv50_translation_info *ti,
                const struct tgsi_full_immediate *imm)
 {
    int c;
-   unsigned n = ++ti->immd32_nr;
+   unsigned n = ti->immd32_nr++;
 
-   if (n == (1 << (ffs(n) - 1)))
-      ti->immd32 = REALLOC(ti->immd32, (n / 2) * 16, (n * 2) * 16);
+   assert(ti->immd32_nr <= ti->scan.immediate_count);
 
    for (c = 0; c < 4; ++c)
-      ti->immd32[(n - 1) * 4 + c] = imm->u[c].Uint;
+      ti->immd32[n * 4 + c] = imm->u[c].Uint;
+
+   ti->immd32_ty[n] = imm->Immediate.DataType;
 }
 
 static INLINE unsigned
@@ -495,6 +496,9 @@ nv50_prog_scan(struct nv50_translation_info *ti)
    tgsi_dump(p->pipe.tokens, 0);
 #endif
 
+   ti->immd32 = (uint32_t *)MALLOC(ti->scan.immediate_count * 16);
+   ti->immd32_ty = (ubyte *)MALLOC(ti->scan.immediate_count * sizeof(ubyte));
+
    tgsi_parse_init(&parse, p->pipe.tokens);
    while (!tgsi_parse_end_of_tokens(&parse)) {
       tgsi_parse_token(&parse);
@@ -561,6 +565,8 @@ nv50_program_tx(struct nv50_program *p)
 out:
    if (ti->immd32)
       FREE(ti->immd32);
+   if (ti->immd32_ty)
+      FREE(ti->immd32_ty);
    FREE(ti);
    return ret ? FALSE : TRUE;
 }
diff --git a/src/gallium/drivers/nv50/nv50_program.h b/src/gallium/drivers/nv50/nv50_program.h
index 1184d9be3b..639f06217e 100644
--- a/src/gallium/drivers/nv50/nv50_program.h
+++ b/src/gallium/drivers/nv50/nv50_program.h
@@ -116,6 +116,7 @@ struct nv50_translation_info {
    struct tgsi_shader_info scan;
    uint32_t *immd32;
    unsigned immd32_nr;
+   ubyte *immd32_ty;
    ubyte edgeflag_out;
    struct nv50_subroutine subr[NV50_PROG_MAX_SUBROUTINES];
    int subr_nr;
diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
index 27d851e9fd..141d2cd325 100644
--- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
+++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
@@ -1022,7 +1022,15 @@ emit_fetch(struct bld_context *bld, const struct tgsi_full_instruction *insn,
    case TGSI_FILE_IMMEDIATE:
       assert(idx < bld->ti->immd32_nr);
       res = bld_load_imm_u32(bld, bld->ti->immd32[idx * 4 + swz]);
-      res->reg.type = type;
+
+      switch (bld->ti->immd32_ty[idx]) {
+      case TGSI_IMM_FLOAT32: res->reg.type = NV_TYPE_F32; break;
+      case TGSI_IMM_UINT32: res->reg.type = NV_TYPE_U32; break;
+      case TGSI_IMM_INT32: res->reg.type = NV_TYPE_S32; break;
+      default:
+         res->reg.type = type;
+         break;
+      }
       break;
    case TGSI_FILE_INPUT:
       res = bld_saved_input(bld, idx, swz);
-- 
cgit v1.2.3


From 8e6ba3c8cc41701b4391d0772bf2318604972ae9 Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Wed, 1 Sep 2010 12:41:59 +0200
Subject: nv50: must join SELECT inputs before MOV inputs

---
 src/gallium/drivers/nv50/nv50_pc_regalloc.c | 32 +++++++++++++++++------------
 1 file changed, 19 insertions(+), 13 deletions(-)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_pc_regalloc.c b/src/gallium/drivers/nv50/nv50_pc_regalloc.c
index e689d349f1..d401706b5b 100644
--- a/src/gallium/drivers/nv50/nv50_pc_regalloc.c
+++ b/src/gallium/drivers/nv50/nv50_pc_regalloc.c
@@ -480,18 +480,18 @@ pass_join_values(struct nv_pc_pass *ctx, int iter)
 
       switch (i->opcode) {
       case NV_OP_PHI:
-         if (!iter)
-            continue;
+         if (iter != 2)
+            break;
          for (c = 0; c < 4 && i->src[c]; ++c)
             try_join_values(ctx, i->def[0], i->src[c]->value);
          break;
       case NV_OP_MOV:
-         if (iter && i->src[0]->value->insn &&
+         if ((iter == 2) && i->src[0]->value->insn &&
              !nv_is_vector_op(i->src[0]->value->join->insn->opcode))
             try_join_values(ctx, i->def[0], i->src[0]->value);
          break;
       case NV_OP_SELECT:
-         if (!iter)
+         if (iter != 1)
             break;
          for (c = 0; c < 4 && i->src[c]; ++c) {
             assert(join_allowed(ctx, i->def[0], i->src[c]->value));
@@ -919,15 +919,21 @@ nv_pc_exec_pass1(struct nv_pc *pc)
       livei_print(&pc->values[i]);
 #endif
 
-   for (i = 0; i < 2; ++i) {
-      ret = pass_join_values(ctx, i);
-      if (ret)
-         goto out;
-      ret = pass_linear_scan(ctx, i);
-      if (ret)
-         goto out;
-   }
-   assert(!ret && "joining");
+   ret = pass_join_values(ctx, 0);
+   if (ret)
+      goto out;
+   ret = pass_linear_scan(ctx, 0);
+   if (ret)
+      goto out;
+   ret = pass_join_values(ctx, 1);
+   if (ret)
+      goto out;
+   ret = pass_join_values(ctx, 2);
+   if (ret)
+      goto out;
+   ret = pass_linear_scan(ctx, 1);
+   if (ret)
+      goto out;
 
    for (i = 0; i < pc->num_values; ++i)
       livei_release(&pc->values[i]);
-- 
cgit v1.2.3


From a79da61a4b5dd94fdacc0e7196510e8d27c8a157 Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Wed, 1 Sep 2010 12:42:15 +0200
Subject: nv50: fix XPD, was negated

---
 src/gallium/drivers/nv50/nv50_tgsi_to_nc.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
index 141d2cd325..6bd2de4c74 100644
--- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
+++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
@@ -1791,12 +1791,12 @@ bld_instruction(struct bld_context *bld,
             dst0[3] = bld_imm_f32(bld, 1.0f);
             break;
          }
-         src0 = emit_fetch(bld, insn, 0, (c + 1) % 3);
-         src1 = emit_fetch(bld, insn, 1, (c + 2) % 3);
+         src0 = emit_fetch(bld, insn, 1, (c + 1) % 3);
+         src1 = emit_fetch(bld, insn, 0, (c + 2) % 3);
          dst0[c] = bld_insn_2(bld, NV_OP_MUL, src0, src1);
 
-         src0 = emit_fetch(bld, insn, 0, (c + 2) % 3);
-         src1 = emit_fetch(bld, insn, 1, (c + 1) % 3);
+         src0 = emit_fetch(bld, insn, 0, (c + 1) % 3);
+         src1 = emit_fetch(bld, insn, 1, (c + 2) % 3);
          dst0[c] = bld_insn_3(bld, NV_OP_MAD, src0, src1, dst0[c]);
 
          dst0[c]->insn->src[2]->mod ^= NV_MOD_NEG;
-- 
cgit v1.2.3


From 9f9ae4eee1939dd15853b8cd1a4fad2c7197aa9a Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Thu, 2 Sep 2010 18:28:39 +0200
Subject: nv50: fix find_dom_frontier

---
 src/gallium/drivers/nv50/nv50_pc.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_pc.c b/src/gallium/drivers/nv50/nv50_pc.c
index 28e32eadb7..c2f2ab3ef3 100644
--- a/src/gallium/drivers/nv50/nv50_pc.c
+++ b/src/gallium/drivers/nv50/nv50_pc.c
@@ -631,6 +631,7 @@ nvbb_reachable_by(struct nv_basic_block *bf, struct nv_basic_block *bp,
 static struct nv_basic_block *
 nvbb_find_dom_frontier(struct nv_basic_block *b, struct nv_basic_block *df)
 {
+   struct nv_basic_block *out;
    int i;
 
    if (!nvbb_dominated_by(df, b)) {
@@ -641,11 +642,11 @@ nvbb_find_dom_frontier(struct nv_basic_block *b, struct nv_basic_block *df)
             return df;
       }
    }
-   for (i = 0; i < 2 && b->out[i]; ++i) {
-      if (b->out_kind[i] == CFG_EDGE_BACK)
+   for (i = 0; i < 2 && df->out[i]; ++i) {
+      if (df->out_kind[i] == CFG_EDGE_BACK)
          continue;
-      if ((df = nvbb_find_dom_frontier(b, b->out[i])))
-         return df;
+      if ((out = nvbb_find_dom_frontier(b, df->out[i])))
+         return out;
    }
    return NULL;
 }
-- 
cgit v1.2.3


From 443abc80db9e1a288ce770e76cccd43664348098 Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Thu, 2 Sep 2010 18:27:01 +0200
Subject: nv50: fix build-predicate function

---
 src/gallium/drivers/nv50/nv50_pc.c          | 15 ++++++++++++-
 src/gallium/drivers/nv50/nv50_pc.h          | 20 ++++++++++++++++-
 src/gallium/drivers/nv50/nv50_pc_optimize.c |  9 ++++++++
 src/gallium/drivers/nv50/nv50_tgsi_to_nc.c  | 34 +++++++++++++++++++----------
 4 files changed, 65 insertions(+), 13 deletions(-)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_pc.c b/src/gallium/drivers/nv50/nv50_pc.c
index c2f2ab3ef3..e34c0553eb 100644
--- a/src/gallium/drivers/nv50/nv50_pc.c
+++ b/src/gallium/drivers/nv50/nv50_pc.c
@@ -121,7 +121,7 @@ nv50_nvi_can_load(struct nv_instruction *nvi, int s, struct nv_value *value)
       return FALSE;
    case NV_OP_MOV:
       assert(s == 0);
-      return TRUE;
+      return /* TRUE */ FALSE; /* don't turn MOVs into loads */
    default:
       return FALSE;
    }
@@ -507,6 +507,19 @@ nvbb_insert_tail(struct nv_basic_block *b, struct nv_instruction *i)
    b->num_instructions++;
 }
 
+void
+nvi_insert_after(struct nv_instruction *at, struct nv_instruction *ni)
+{
+   if (!at->next) {
+      nvbb_insert_tail(at->bb, ni);
+      return;
+   }
+   ni->next = at->next;
+   ni->prev = at;
+   ni->next->prev = ni;
+   ni->prev->next = ni;
+}
+
 void
 nv_nvi_delete(struct nv_instruction *nvi)
 {
diff --git a/src/gallium/drivers/nv50/nv50_pc.h b/src/gallium/drivers/nv50/nv50_pc.h
index adc46dec8d..703d32d334 100644
--- a/src/gallium/drivers/nv50/nv50_pc.h
+++ b/src/gallium/drivers/nv50/nv50_pc.h
@@ -347,9 +347,10 @@ struct nv_pc {
 };
 
 void nvbb_insert_tail(struct nv_basic_block *, struct nv_instruction *);
+void nvi_insert_after(struct nv_instruction *, struct nv_instruction *);
 
 static INLINE struct nv_instruction *
-new_instruction(struct nv_pc *pc, uint opcode)
+nv_alloc_instruction(struct nv_pc *pc, uint opcode)
 {
    struct nv_instruction *insn;
 
@@ -359,10 +360,27 @@ new_instruction(struct nv_pc *pc, uint opcode)
    insn->cc = NV_CC_TR;
    insn->opcode = opcode;
 
+   return insn;
+}
+
+static INLINE struct nv_instruction *
+new_instruction(struct nv_pc *pc, uint opcode)
+{
+   struct nv_instruction *insn = nv_alloc_instruction(pc, opcode);
+
    nvbb_insert_tail(pc->current_block, insn);
    return insn;
 }
 
+static INLINE struct nv_instruction *
+new_instruction_at(struct nv_pc *pc, struct nv_instruction *at, uint opcode)
+{
+   struct nv_instruction *insn = nv_alloc_instruction(pc, opcode);
+
+   nvi_insert_after(at, insn);
+   return insn;
+}
+
 static INLINE struct nv_value *
 new_value(struct nv_pc *pc, ubyte file, ubyte type)
 {
diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c
index fb95da30f2..1ed5032175 100644
--- a/src/gallium/drivers/nv50/nv50_pc_optimize.c
+++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c
@@ -636,6 +636,15 @@ constant_operand(struct nv_pc *pc,
    default:
       break;
    }
+
+   if (nvi->opcode == NV_OP_MOV && nvi->flags_def) {
+      struct nv_instruction *cvt = new_instruction_at(pc, nvi, NV_OP_CVT);
+
+      nv_reference(pc, &cvt->src[0], nvi->def[0]);
+
+      cvt->flags_def = nvi->flags_def;
+      nvi->flags_def = NULL;
+   }
 }
 
 static int
diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
index 6bd2de4c74..e1c6ed87bf 100644
--- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
+++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
@@ -625,23 +625,35 @@ bld_get_address(struct bld_context *bld, int id, struct nv_value *indirect)
 static struct nv_value *
 bld_predicate(struct bld_context *bld, struct nv_value *src, boolean bool_only)
 {
-   struct nv_instruction *nvi = src->insn;
+   struct nv_instruction *s0i, *nvi = src->insn;
 
-   if (nvi->opcode == NV_OP_LDA ||
-       nvi->opcode == NV_OP_PHI ||
-       nvi->bb != bld->pc->current_block) {
-      nvi = new_instruction(bld->pc, NV_OP_CVT);
-      nv_reference(bld->pc, &nvi->src[0], src);
+   if (!nvi) {
+      nvi = bld_insn_1(bld,
+                       (src->reg.file == NV_FILE_IMM) ? NV_OP_MOV : NV_OP_LDA,
+                       src)->insn;
+      src = nvi->def[0];
    } else
    if (bool_only) {
-      while (nvi->opcode == NV_OP_ABS || nvi->opcode == NV_OP_CVT ||
-             nvi->opcode == NV_OP_NEG) {
-         /* TGSI SET gets conversion to f32, we only need source 0/~0 */
-         if (!nvi->def[0]->insn->flags_src)
-            nvi = nvi->src[0]->value->insn;
+      while (nvi->opcode == NV_OP_ABS || nvi->opcode == NV_OP_NEG ||
+             nvi->opcode == NV_OP_CVT) {
+         s0i = nvi->src[0]->value->insn;
+         if (!s0i ||
+             s0i->opcode == NV_OP_LDA ||
+             s0i->opcode == NV_OP_MOV ||
+             s0i->opcode == NV_OP_PHI)
+            break;
+         nvi = s0i;
+         assert(!nvi->flags_src);
       }
    }
 
+   if (nvi->opcode == NV_OP_LDA ||
+       nvi->opcode == NV_OP_MOV ||
+       nvi->opcode == NV_OP_PHI || nvi->bb != bld->pc->current_block) {
+      nvi = new_instruction(bld->pc, NV_OP_CVT);
+      nv_reference(bld->pc, &nvi->src[0], src);
+   }
+
    if (!nvi->flags_def) {
       nvi->flags_def = new_value(bld->pc, NV_FILE_FLAGS, NV_TYPE_U16);
       nvi->flags_def->insn = nvi;
-- 
cgit v1.2.3


From 9e4901402cf50405be28ce6311f10e22196fbc35 Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Fri, 3 Sep 2010 14:26:47 +0200
Subject: nv50: load address register before using it, not after

---
 src/gallium/drivers/nv50/nv50_tgsi_to_nc.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
index e1c6ed87bf..386dbda423 100644
--- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
+++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
@@ -1000,6 +1000,7 @@ emit_fetch(struct bld_context *bld, const struct tgsi_full_instruction *insn,
 {
    const struct tgsi_full_src_register *src = &insn->Src[s];
    struct nv_value *res;
+   struct nv_value *ptr = NULL;
    unsigned idx, swz, dim_idx, ind_idx, ind_swz;
    ubyte type = infer_src_type(insn->Instruction.Opcode);
 
@@ -1012,7 +1013,11 @@ emit_fetch(struct bld_context *bld, const struct tgsi_full_instruction *insn,
    if (src->Register.Indirect) {
       ind_idx = src->Indirect.Index;
       ind_swz = tgsi_util_get_src_register_swizzle(&src->Indirect, 0);
+
+      ptr = FETCH_ADDR(ind_idx, ind_swz);
    }
+   if (idx >= (128 / 4) && src->Register.File == TGSI_FILE_CONSTANT)
+      ptr = bld_get_address(bld, (idx * 16) & ~0x1ff, ptr);
 
    switch (src->Register.File) {
    case TGSI_FILE_CONSTANT:
@@ -1025,11 +1030,8 @@ emit_fetch(struct bld_context *bld, const struct tgsi_full_instruction *insn,
       res->reg.id = (idx * 4 + swz) & 127;
       res = bld_insn_1(bld, NV_OP_LDA, res);
 
-      if (src->Register.Indirect)
-         res->insn->src[4] = new_ref(bld->pc, FETCH_ADDR(ind_idx, ind_swz));
-      if (idx >= (128 / 4))
-         res->insn->src[4] =
-            new_ref(bld->pc, bld_get_address(bld, (idx * 16) & ~0x1ff, NULL));
+      if (ptr)
+         res->insn->src[4] = new_ref(bld->pc, ptr);
       break;
    case TGSI_FILE_IMMEDIATE:
       assert(idx < bld->ti->immd32_nr);
-- 
cgit v1.2.3


From 217542a061ef31150b1b04f1b45b6099bcc153fe Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Sun, 5 Sep 2010 19:06:17 +0200
Subject: nv50: save tgsi instructions

---
 src/gallium/drivers/nv50/nv50_program.c | 5 +++++
 src/gallium/drivers/nv50/nv50_program.h | 1 +
 2 files changed, 6 insertions(+)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c
index 523603ca3a..d7d3030e2f 100644
--- a/src/gallium/drivers/nv50/nv50_program.c
+++ b/src/gallium/drivers/nv50/nv50_program.c
@@ -499,6 +499,8 @@ nv50_prog_scan(struct nv50_translation_info *ti)
    ti->immd32 = (uint32_t *)MALLOC(ti->scan.immediate_count * 16);
    ti->immd32_ty = (ubyte *)MALLOC(ti->scan.immediate_count * sizeof(ubyte));
 
+   ti->insns = MALLOC(ti->scan.num_instructions * sizeof(ti->insns[0]));
+
    tgsi_parse_init(&parse, p->pipe.tokens);
    while (!tgsi_parse_end_of_tokens(&parse)) {
       tgsi_parse_token(&parse);
@@ -511,6 +513,7 @@ nv50_prog_scan(struct nv50_translation_info *ti)
          prog_decl(ti, &parse.FullToken.FullDeclaration);
          break;
       case TGSI_TOKEN_TYPE_INSTRUCTION:
+         ti->insns[ti->inst_nr] = parse.FullToken.FullInstruction;
          prog_inst(ti, &parse.FullToken.FullInstruction, ++ti->inst_nr);
          break;
       }
@@ -567,6 +570,8 @@ out:
       FREE(ti->immd32);
    if (ti->immd32_ty)
       FREE(ti->immd32_ty);
+   if (ti->insns)
+      FREE(ti->insns);
    FREE(ti);
    return ret ? FALSE : TRUE;
 }
diff --git a/src/gallium/drivers/nv50/nv50_program.h b/src/gallium/drivers/nv50/nv50_program.h
index 639f06217e..3c3f1f7f97 100644
--- a/src/gallium/drivers/nv50/nv50_program.h
+++ b/src/gallium/drivers/nv50/nv50_program.h
@@ -104,6 +104,7 @@ struct nv50_subroutine {
 struct nv50_translation_info {
    struct nv50_program *p;
    unsigned inst_nr;
+   struct tgsi_full_instruction *insns;
    ubyte input_file;
    ubyte output_file;
    ubyte input_map[PIPE_MAX_SHADER_INPUTS][4];
-- 
cgit v1.2.3


From d91b8865ec2bb41f9b58ad5ce2df7f6f48f98281 Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Tue, 7 Sep 2010 15:40:34 +0200
Subject: nv50: prepare for having multiple functions

At some point we'll want to support real subroutines instead of
just inlining them into the main shader.

Since recursive calls are forbidden, we can just save all used
registers to a fixed local memory region and restore them on
return; no stack pointer is needed.
---
 src/gallium/drivers/nv50/nv50_pc.c          | 48 ++++++++++++++++------
 src/gallium/drivers/nv50/nv50_pc.h          | 12 +++---
 src/gallium/drivers/nv50/nv50_pc_optimize.c | 56 +++++++++++++++++--------
 src/gallium/drivers/nv50/nv50_pc_regalloc.c | 23 ++++++++---
 src/gallium/drivers/nv50/nv50_program.c     | 63 +++++++++++++++++++++++++++--
 src/gallium/drivers/nv50/nv50_program.h     | 16 ++++----
 src/gallium/drivers/nv50/nv50_screen.c      |  3 +-
 src/gallium/drivers/nv50/nv50_tgsi_to_nc.c  |  2 +-
 8 files changed, 171 insertions(+), 52 deletions(-)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_pc.c b/src/gallium/drivers/nv50/nv50_pc.c
index e34c0553eb..c54f16e4c5 100644
--- a/src/gallium/drivers/nv50/nv50_pc.c
+++ b/src/gallium/drivers/nv50/nv50_pc.c
@@ -304,7 +304,7 @@ nv_pc_pass_in_order(struct nv_basic_block *root, nv_pc_pass_func f, void *priv)
 }
 
 static void
-nv_do_print_program(void *priv, struct nv_basic_block *b)
+nv_do_print_function(void *priv, struct nv_basic_block *b)
 {
    struct nv_instruction *i = b->phi;
 
@@ -323,11 +323,23 @@ nv_do_print_program(void *priv, struct nv_basic_block *b)
 }
 
 void
-nv_print_program(struct nv_basic_block *root)
+nv_print_function(struct nv_basic_block *root)
 {
-   nv_pc_pass_in_order(root, nv_do_print_program, root);
+   if (root->subroutine)
+      debug_printf("SUBROUTINE %i\n", root->subroutine);
+   else
+      debug_printf("MAIN\n");
 
-   debug_printf("END\n\n");
+   nv_pc_pass_in_order(root, nv_do_print_function, root);
+}
+
+void
+nv_print_program(struct nv_pc *pc)
+{
+   int i;
+   for (i = 0; i < pc->num_subroutines + 1; ++i)
+      if (pc->root[i])
+         nv_print_function(pc->root[i]);
 }
 
 static INLINE void
@@ -388,11 +400,18 @@ nv50_generate_code(struct nv50_translation_info *ti)
    if (!pc)
       return 1;
 
+   pc->root = CALLOC(ti->subr_nr + 1, sizeof(pc->root[0]));
+   if (!pc->root) {
+      FREE(pc);
+      return 1;
+   }
+   pc->num_subroutines = ti->subr_nr;
+
    ret = nv50_tgsi_to_nc(pc, ti);
    if (ret)
       goto out;
 #ifdef NV50PC_DEBUG
-   nv_print_program(pc->root);
+   nv_print_program(pc);
 #endif
 
    /* optimization */
@@ -400,7 +419,7 @@ nv50_generate_code(struct nv50_translation_info *ti)
    if (ret)
       goto out;
 #ifdef NV50PC_DEBUG
-   nv_print_program(pc->root);
+   nv_print_program(pc);
 #endif
 
    /* register allocation */
@@ -408,7 +427,7 @@ nv50_generate_code(struct nv50_translation_info *ti)
    if (ret)
       goto out;
 #ifdef NV50PC_DEBUG
-   nv_print_program(pc->root);
+   nv_print_program(pc);
 #endif
 
    /* prepare for emission */
@@ -441,16 +460,19 @@ nv50_generate_code(struct nv50_translation_info *ti)
 
 out:
    nv_pc_free_refs(pc);
-   if (ret) {
+
+   if (pc->bb_list)
+      FREE(pc->bb_list);
+
+   if (ret) { /* on success, these will be referenced by nv50_program */
       if (pc->emit)
-         free(pc->emit);
+         FREE(pc->emit);
       if (pc->immd_buf)
-         free(pc->immd_buf);
+         FREE(pc->immd_buf);
       if (pc->fixups)
-         free(pc->fixups);
+         FREE(pc->fixups);
    }
-   free(pc);
-
+   FREE(pc);
    return ret;
 }
 
diff --git a/src/gallium/drivers/nv50/nv50_pc.h b/src/gallium/drivers/nv50/nv50_pc.h
index 703d32d334..d9cc775572 100644
--- a/src/gallium/drivers/nv50/nv50_pc.h
+++ b/src/gallium/drivers/nv50/nv50_pc.h
@@ -282,7 +282,7 @@ struct nv_basic_block {
    ubyte in_kind[8];
 
    int id;
-   struct nv_basic_block *last_visitor;
+   int subroutine;
    uint priv;
    uint pass_seq;
 
@@ -314,10 +314,10 @@ nv_fixup_apply(uint32_t *bin, struct nv_fixup *fixup, uint32_t data)
    bin[fixup->offset / 4] = val;
 }
 
-struct nv_pc {
-   struct nv50_translation_info *ti;
+struct nv50_translation_info;
 
-   struct nv_basic_block *root;
+struct nv_pc {
+   struct nv_basic_block **root;
    struct nv_basic_block *current_block;
    struct nv_basic_block *parent_block;
 
@@ -332,6 +332,7 @@ struct nv_pc {
    int num_instructions;
    int num_refs;
    int num_blocks;
+   int num_subroutines;
 
    int max_reg[4];
 
@@ -463,7 +464,8 @@ void nv_print_instruction(struct nv_instruction *);
 
 /* nv50_pc.c */
 
-void nv_print_program(struct nv_basic_block *b);
+void nv_print_function(struct nv_basic_block *root);
+void nv_print_program(struct nv_pc *);
 
 boolean nv_op_commutative(uint opcode);
 int nv50_indirect_opnd(struct nv_instruction *);
diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c
index 1ed5032175..4f5bdc1f9f 100644
--- a/src/gallium/drivers/nv50/nv50_pc_optimize.c
+++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c
@@ -213,23 +213,36 @@ nv_pc_pass_pre_emission(void *priv, struct nv_basic_block *b)
    pc->bin_size += b->bin_size *= 4;
 }
 
-int
-nv_pc_exec_pass2(struct nv_pc *pc)
+static int
+nv_pc_pass2(struct nv_pc *pc, struct nv_basic_block *root)
 {
    struct nv_pass pass;
 
    pass.pc = pc;
 
    pc->pass_seq++;
-   nv_pass_flatten(&pass, pc->root);
+
+   nv_pass_flatten(&pass, root);
+
+   nv_pc_pass_in_order(root, nv_pc_pass_pre_emission, pc);
+
+   return 0;
+}
+
+int
+nv_pc_exec_pass2(struct nv_pc *pc)
+{
+   int i, ret;
 
    NV50_DBGMSG("preparing %u blocks for emission\n", pc->num_blocks);
 
-   pc->bb_list = CALLOC(pc->num_blocks, sizeof(struct nv_basic_block *));
-   pc->num_blocks = 0;
+   pc->bb_list = CALLOC(pc->num_blocks, sizeof(pc->bb_list[0]));
 
-   nv_pc_pass_in_order(pc->root, nv_pc_pass_pre_emission, pc);
+   pc->num_blocks = 0;
 
+   for (i = 0; i < pc->num_subroutines + 1; ++i)
+      if (pc->root[i] && (ret = nv_pc_pass2(pc, pc->root[i])))
+         return ret;
    return 0;
 }
 
@@ -1032,8 +1045,8 @@ nv_pass_cse(struct nv_pass *ctx, struct nv_basic_block *b)
    return 0;
 }
 
-int
-nv_pc_exec_pass0(struct nv_pc *pc)
+static int
+nv_pc_pass0(struct nv_pc *pc, struct nv_basic_block *root)
 {
    struct nv_pass_reld_elim *reldelim;
    struct nv_pass pass;
@@ -1047,35 +1060,35 @@ nv_pc_exec_pass0(struct nv_pc *pc)
     * to whether sources are supported memory loads.
     */
    pc->pass_seq++;
-   ret = nv_pass_lower_arith(&pass, pc->root);
+   ret = nv_pass_lower_arith(&pass, root);
    if (ret)
       return ret;
 
    pc->pass_seq++;
-   ret = nv_pass_fold_loads(&pass, pc->root);
+   ret = nv_pass_fold_loads(&pass, root);
    if (ret)
       return ret;
 
    pc->pass_seq++;
-   ret = nv_pass_fold_stores(&pass, pc->root);
+   ret = nv_pass_fold_stores(&pass, root);
    if (ret)
       return ret;
 
    reldelim = CALLOC_STRUCT(nv_pass_reld_elim);
    reldelim->pc = pc;
    pc->pass_seq++;
-   ret = nv_pass_reload_elim(reldelim, pc->root);
+   ret = nv_pass_reload_elim(reldelim, root);
    FREE(reldelim);
    if (ret)
       return ret;
 
    pc->pass_seq++;
-   ret = nv_pass_cse(&pass, pc->root);
+   ret = nv_pass_cse(&pass, root);
    if (ret)
       return ret;
 
    pc->pass_seq++;
-   ret = nv_pass_lower_mods(&pass, pc->root);
+   ret = nv_pass_lower_mods(&pass, root);
    if (ret)
       return ret;
 
@@ -1083,14 +1096,25 @@ nv_pc_exec_pass0(struct nv_pc *pc)
    do {
       dce.removed = 0;
       pc->pass_seq++;
-      ret = nv_pass_dce(&dce, pc->root);
+      ret = nv_pass_dce(&dce, root);
       if (ret)
          return ret;
    } while (dce.removed);
 
-   ret = nv_pass_tex_mask(&pass, pc->root);
+   ret = nv_pass_tex_mask(&pass, root);
    if (ret)
       return ret;
 
    return ret;
 }
+
+int
+nv_pc_exec_pass0(struct nv_pc *pc)
+{
+   int fn, err = 0; /* root[0] .. root[num_subroutines], stop on first error */
+
+   for (fn = 0; !err && fn <= pc->num_subroutines; ++fn)
+      if (pc->root[fn])
+         err = nv_pc_pass0(pc, pc->root[fn]);
+   return err;
+}
diff --git a/src/gallium/drivers/nv50/nv50_pc_regalloc.c b/src/gallium/drivers/nv50/nv50_pc_regalloc.c
index d401706b5b..2998343db5 100644
--- a/src/gallium/drivers/nv50/nv50_pc_regalloc.c
+++ b/src/gallium/drivers/nv50/nv50_pc_regalloc.c
@@ -874,8 +874,8 @@ pass_linear_scan(struct nv_pc_pass *ctx, int iter)
    return 0;
 }
 
-int
-nv_pc_exec_pass1(struct nv_pc *pc)
+static int
+nv_pc_pass1(struct nv_pc *pc, struct nv_basic_block *root)
 {
    struct nv_pc_pass *ctx;
    int i, ret;
@@ -890,12 +890,12 @@ nv_pc_exec_pass1(struct nv_pc *pc)
    ctx->insns = CALLOC(NV_PC_MAX_INSTRUCTIONS, sizeof(struct nv_instruction *));
 
    pc->pass_seq++;
-   ret = pass_generate_phi_movs(ctx, pc->root);
+   ret = pass_generate_phi_movs(ctx, root);
    assert(!ret);
 
    for (i = 0; i < pc->loop_nesting_bound; ++i) {
       pc->pass_seq++;
-      ret = pass_build_live_sets(ctx, pc->root);
+      ret = pass_build_live_sets(ctx, root);
       assert(!ret && "live sets");
       if (ret) {
          NOUVEAU_ERR("failed to build live sets (iteration %d)\n", i);
@@ -904,10 +904,10 @@ nv_pc_exec_pass1(struct nv_pc *pc)
    }
 
    pc->pass_seq++;
-   nv_pc_pass_in_order(pc->root, pass_order_instructions, ctx);
+   nv_pc_pass_in_order(root, pass_order_instructions, ctx);
 
    pc->pass_seq++;
-   ret = pass_build_intervals(ctx, pc->root);
+   ret = pass_build_intervals(ctx, root);
    assert(!ret && "build intervals");
    if (ret) {
       NOUVEAU_ERR("failed to build live intervals\n");
@@ -944,3 +944,14 @@ out:
    FREE(ctx);
    return ret;
 }
+
+int
+nv_pc_exec_pass1(struct nv_pc *pc)
+{
+   int fn, err = 0; /* root[0] .. root[num_subroutines], stop on first error */
+
+   for (fn = 0; !err && fn <= pc->num_subroutines; ++fn)
+      if (pc->root[fn])
+         err = nv_pc_pass1(pc, pc->root[fn]);
+   return err;
+}
diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c
index d7d3030e2f..925028700c 100644
--- a/src/gallium/drivers/nv50/nv50_program.c
+++ b/src/gallium/drivers/nv50/nv50_program.c
@@ -147,10 +147,17 @@ prog_inst(struct nv50_translation_info *ti,
    int s, c, k;
    unsigned mask;
 
+   if (inst->Instruction.Opcode == TGSI_OPCODE_BGNSUB) {
+      ti->subr[ti->subr_nr].pos = id - 1;
+      ti->subr[ti->subr_nr].id = ti->subr_nr + 1; /* id 0 is main program */
+      ++ti->subr_nr;
+   }
+
    if (inst->Dst[0].Register.File == TGSI_FILE_OUTPUT) {
+      dst = &inst->Dst[0].Register;
+
       for (c = 0; c < 4; ++c) {
-         dst = &inst->Dst[0].Register;
-         if (inst->Dst[0].Register.Indirect)
+         if (dst->Indirect)
             nv50_indirect_outputs(ti, id);
          if (!(dst->WriteMask & (1 << c)))
             continue;
@@ -182,6 +189,44 @@ prog_inst(struct nv50_translation_info *ti,
    }
 }
 
+/* Guess subroutine inputs/outputs: TEMPs read before written -> argv, TEMPs
+ * written -> retv. A struct tgsi_function_declaration would be more robust.
+ */
+static void
+prog_subroutine_inst(struct nv50_subroutine *subr,
+                     const struct tgsi_full_instruction *inst)
+{
+   const struct tgsi_dst_register *dst;
+   const struct tgsi_src_register *src;
+   int s, c, k;
+   unsigned mask;
+
+   for (s = 0; s < inst->Instruction.NumSrcRegs; ++s) {
+      src = &inst->Src[s].Register;
+      if (src->File != TGSI_FILE_TEMPORARY)
+         continue;
+      mask = nv50_tgsi_src_mask(inst, s);
+
+      assert(!inst->Src[s].Register.Indirect);
+
+      for (c = 0; c < 4; ++c) {
+         k = tgsi_util_get_full_src_register_swizzle(&inst->Src[s], c);
+         /* <= so that W counts too; 1u avoids shifting into the sign bit */
+         if ((mask & (1 << c)) && k <= TGSI_SWIZZLE_W)
+            if (!(subr->retv[src->Index / 32][k] & (1u << (src->Index % 32))))
+               subr->argv[src->Index / 32][k] |= 1u << (src->Index % 32);
+      }
+   }
+
+   if (inst->Dst[0].Register.File == TGSI_FILE_TEMPORARY) {
+      dst = &inst->Dst[0].Register;
+
+      for (c = 0; c < 4; ++c)
+         if (dst->WriteMask & (1 << c))
+            subr->retv[dst->Index / 32][c] |= 1u << (dst->Index % 32);
+   }
+}
+
 static void
 prog_immediate(struct nv50_translation_info *ti,
                const struct tgsi_full_immediate *imm)
@@ -482,7 +527,7 @@ nv50_prog_scan(struct nv50_translation_info *ti)
 {
    struct nv50_program *p = ti->p;
    struct tgsi_parse_context parse;
-   int ret;
+   int ret, i;
 
    p->vp.edgeflag = 0x40;
    p->vp.psiz = 0x40;
@@ -496,6 +541,9 @@ nv50_prog_scan(struct nv50_translation_info *ti)
    tgsi_dump(p->pipe.tokens, 0);
 #endif
 
+   ti->subr =
+      CALLOC(ti->scan.opcode_count[TGSI_OPCODE_BGNSUB], sizeof(ti->subr[0]));
+
    ti->immd32 = (uint32_t *)MALLOC(ti->scan.immediate_count * 16);
    ti->immd32_ty = (ubyte *)MALLOC(ti->scan.immediate_count * sizeof(ubyte));
 
@@ -519,6 +567,13 @@ nv50_prog_scan(struct nv50_translation_info *ti)
       }
    }
 
+   /* Scan to determine which registers are inputs/outputs of a subroutine. */
+   for (i = 0; i < ti->subr_nr; ++i) {
+      int pc = ti->subr[i].pos; /* insn index; .id is only the subr number */
+      while (ti->insns[pc].Instruction.Opcode != TGSI_OPCODE_ENDSUB)
+         prog_subroutine_inst(&ti->subr[i], &ti->insns[pc++]);
+   }
+
    p->in_nr = ti->scan.file_max[TGSI_FILE_INPUT] + 1;
    p->out_nr = ti->scan.file_max[TGSI_FILE_OUTPUT] + 1;
 
@@ -572,6 +627,8 @@ out:
       FREE(ti->immd32_ty);
    if (ti->insns)
       FREE(ti->insns);
+   if (ti->subr)
+      FREE(ti->subr);
    FREE(ti);
    return ret ? FALSE : TRUE;
 }
diff --git a/src/gallium/drivers/nv50/nv50_program.h b/src/gallium/drivers/nv50/nv50_program.h
index 3c3f1f7f97..918baf325f 100644
--- a/src/gallium/drivers/nv50/nv50_program.h
+++ b/src/gallium/drivers/nv50/nv50_program.h
@@ -27,6 +27,8 @@
 #include "tgsi/tgsi_scan.h"
 #include "nouveau/nouveau_class.h"
 
+#define NV50_CAP_MAX_PROGRAM_TEMPS (128 / 4)
+
 struct nv50_varying {
    uint8_t id; /* tgsi index */
    uint8_t hw; /* hw index, nv50 wants flat FP inputs last */
@@ -92,13 +94,13 @@ struct nv50_program {
 #define NV50_INTERP_FLAT     (1 << 1)
 #define NV50_INTERP_CENTROID (1 << 2)
 
-#define NV50_PROG_MAX_SUBROUTINES 8
-
 /* analyze TGSI and see which TEMP[] are used as subroutine inputs/outputs */
 struct nv50_subroutine {
-   int id;
-   uint32_t argv[4][1]; /* 4 bitmasks, for each of xyzw, only allow 32 TEMPs */
-   uint32_t retv[4][1];
+   unsigned id;
+   unsigned pos;
+   /* function inputs and outputs */
+   uint32_t argv[NV50_CAP_MAX_PROGRAM_TEMPS][4];
+   uint32_t retv[NV50_CAP_MAX_PROGRAM_TEMPS][4];
 };
 
 struct nv50_translation_info {
@@ -119,8 +121,8 @@ struct nv50_translation_info {
    unsigned immd32_nr;
    ubyte *immd32_ty;
    ubyte edgeflag_out;
-   struct nv50_subroutine subr[NV50_PROG_MAX_SUBROUTINES];
-   int subr_nr;
+   struct nv50_subroutine *subr;
+   unsigned subr_nr;
 };
 
 int nv50_generate_code(struct nv50_translation_info *ti);
diff --git a/src/gallium/drivers/nv50/nv50_screen.c b/src/gallium/drivers/nv50/nv50_screen.c
index fc75d81d54..c1efa443da 100644
--- a/src/gallium/drivers/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nv50/nv50_screen.c
@@ -26,6 +26,7 @@
 #include "nv50_context.h"
 #include "nv50_screen.h"
 #include "nv50_resource.h"
+#include "nv50_program.h"
 
 #include "nouveau/nouveau_stateobj.h"
 
@@ -152,7 +153,7 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 		return 0;
 	case PIPE_CAP_MAX_VS_TEMPS:
 	case PIPE_CAP_MAX_FS_TEMPS: /* no spilling atm */
-		return 128 / 4;
+		return NV50_CAP_MAX_PROGRAM_TEMPS;
 	case PIPE_CAP_DEPTH_CLAMP:
 		return 1;
 	default:
diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
index 386dbda423..dea8fa0663 100644
--- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
+++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
@@ -1850,7 +1850,7 @@ nv50_tgsi_to_nc(struct nv_pc *pc, struct nv50_translation_info *ti)
    struct bld_context *bld = CALLOC_STRUCT(bld_context);
    int c;
 
-   pc->root = pc->current_block = new_basic_block(pc);
+   pc->root[0] = pc->current_block = new_basic_block(pc);
 
    bld->pc = pc;
    bld->ti = ti;
-- 
cgit v1.2.3


From d8dcff79702860eae92d3d35b461c9b71114c1c5 Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Tue, 7 Sep 2010 19:02:10 +0200
Subject: nv50: don't parse again in tgsi_2_nc

---
 src/gallium/drivers/nv50/nv50_tgsi_to_nc.c | 18 +++---------------
 1 file changed, 3 insertions(+), 15 deletions(-)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
index dea8fa0663..983fcb2fbf 100644
--- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
+++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
@@ -1849,6 +1849,7 @@ nv50_tgsi_to_nc(struct nv_pc *pc, struct nv50_translation_info *ti)
 {
    struct bld_context *bld = CALLOC_STRUCT(bld_context);
    int c;
+   unsigned ip;
 
    pc->root[0] = pc->current_block = new_basic_block(pc);
 
@@ -1865,21 +1866,8 @@ nv50_tgsi_to_nc(struct nv_pc *pc, struct nv50_translation_info *ti)
       bld->frgcrd[3] = bld_insn_1(bld, NV_OP_RCP, bld->frgcrd[3]);
    }
 
-   tgsi_parse_init(&bld->parse[0], ti->p->pipe.tokens);
-
-   while (!tgsi_parse_end_of_tokens(&bld->parse[bld->call_lvl])) {
-      const union tgsi_full_token *tok = &bld->parse[bld->call_lvl].FullToken;
-
-      tgsi_parse_token(&bld->parse[bld->call_lvl]);
-
-      switch (tok->Token.Type) {
-      case TGSI_TOKEN_TYPE_INSTRUCTION:
-         bld_instruction(bld, &tok->FullInstruction);
-         break;
-      default:
-         break;
-      }
-   }
+   for (ip = 0; ip < ti->inst_nr; ++ip)
+      bld_instruction(bld, &ti->insns[ip]);
 
    bld_free_value_trackers(&bld->tvs[0][0], BLD_MAX_TEMPS);
    bld_free_value_trackers(&bld->avs[0][0], BLD_MAX_ADDRS);
-- 
cgit v1.2.3


From f30810cb68a53c4fef360778a230126ed0ee0ee3 Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Thu, 9 Sep 2010 19:12:54 +0200
Subject: nv50: use actual loads/stores if TEMPs are accessed indirectly

---
 src/gallium/drivers/nv50/nv50_pc.c          |  2 ++
 src/gallium/drivers/nv50/nv50_pc.h          |  3 ++
 src/gallium/drivers/nv50/nv50_pc_emit.c     | 28 +++++++++++----
 src/gallium/drivers/nv50/nv50_pc_optimize.c | 19 ++++++----
 src/gallium/drivers/nv50/nv50_pc_print.c    |  3 ++
 src/gallium/drivers/nv50/nv50_program.c     |  7 ++++
 src/gallium/drivers/nv50/nv50_program.h     |  1 +
 src/gallium/drivers/nv50/nv50_screen.c      | 25 ++++++++++---
 src/gallium/drivers/nv50/nv50_screen.h      |  3 +-
 src/gallium/drivers/nv50/nv50_tgsi_to_nc.c  | 54 ++++++++++++++++++++++++++---
 10 files changed, 122 insertions(+), 23 deletions(-)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_pc.c b/src/gallium/drivers/nv50/nv50_pc.c
index c54f16e4c5..637b3cf2fe 100644
--- a/src/gallium/drivers/nv50/nv50_pc.c
+++ b/src/gallium/drivers/nv50/nv50_pc.c
@@ -414,6 +414,8 @@ nv50_generate_code(struct nv50_translation_info *ti)
    nv_print_program(pc);
 #endif
 
+   pc->opt_reload_elim = ti->store_to_memory ? FALSE : TRUE;
+
    /* optimization */
    ret = nv_pc_exec_pass0(pc);
    if (ret)
diff --git a/src/gallium/drivers/nv50/nv50_pc.h b/src/gallium/drivers/nv50/nv50_pc.h
index d9cc775572..ba32ab08ab 100644
--- a/src/gallium/drivers/nv50/nv50_pc.h
+++ b/src/gallium/drivers/nv50/nv50_pc.h
@@ -345,6 +345,9 @@ struct nv_pc {
 
    struct nv_fixup *fixups;
    int num_fixups;
+
+   /* optimization enables */
+   boolean opt_reload_elim;
 };
 
 void nvbb_insert_tail(struct nv_basic_block *, struct nv_instruction *);
diff --git a/src/gallium/drivers/nv50/nv50_pc_emit.c b/src/gallium/drivers/nv50/nv50_pc_emit.c
index bb0a6f32d1..8c64b19875 100644
--- a/src/gallium/drivers/nv50/nv50_pc_emit.c
+++ b/src/gallium/drivers/nv50/nv50_pc_emit.c
@@ -412,25 +412,25 @@ emit_form_IMM(struct nv_pc *pc, struct nv_instruction *i, ubyte mod_mask)
 }
 
 static void
-set_ld_st_size(struct nv_pc *pc, ubyte type)
+set_ld_st_size(struct nv_pc *pc, int s, ubyte type)
 {
    switch (type) {
    case NV_TYPE_F64:
-      pc->emit[1] |= 0x8000;
+      pc->emit[1] |= 0x8000 << s;
       break;
    case NV_TYPE_F32:
    case NV_TYPE_S32:
    case NV_TYPE_U32:
-      pc->emit[1] |= 0xc000;
+      pc->emit[1] |= 0xc000 << s;
       break;
    case NV_TYPE_S16:
-      pc->emit[1] |= 0x6000;
+      pc->emit[1] |= 0x6000 << s;
       break;
    case NV_TYPE_U16:
-      pc->emit[1] |= 0x4000;
+      pc->emit[1] |= 0x4000 << s;
       break;
    case NV_TYPE_S8:
-      pc->emit[1] |= 0x2000;
+      pc->emit[1] |= 0x2000 << s;
       break;
    default:
       break;
@@ -473,12 +473,14 @@ emit_ld(struct nv_pc *pc, struct nv_instruction *i)
    if (sf == NV_FILE_MEM_L) {
       pc->emit[0] = 0xd0000001;
       pc->emit[1] = 0x40000000;
+
+      set_addr(pc, i);
    } else {
       NOUVEAU_ERR("invalid ld source file\n");
       abort();
    }
 
-   set_ld_st_size(pc, STYPE(i, 0));
+   set_ld_st_size(pc, (sf == NV_FILE_MEM_L) ? 8 : 0, STYPE(i, 0));
 
    set_dst(pc, i->def[0]);
    set_pred_wr(pc, i);
@@ -495,7 +497,19 @@ emit_ld(struct nv_pc *pc, struct nv_instruction *i)
 static void
 emit_st(struct nv_pc *pc, struct nv_instruction *i)
 {
+   assert(SFILE(i, 1) == NV_FILE_GPR);
+   assert(SFILE(i, 0) == NV_FILE_MEM_L);
+
+   pc->emit[0] = 0xd0000001;
+   pc->emit[1] = 0x60000000;
 
+   SID(pc, i->src[1], 2);
+   SID(pc, i->src[0], 9);
+
+   set_ld_st_size(pc, 8, STYPE(i, 1));
+
+   set_addr(pc, i);
+   set_pred(pc, i);
 }
 
 static int
diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c
index 4f5bdc1f9f..09d232abda 100644
--- a/src/gallium/drivers/nv50/nv50_pc_optimize.c
+++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c
@@ -82,6 +82,8 @@ inst_commutation_legal(struct nv_instruction *a,
 static INLINE boolean
 inst_cullable(struct nv_instruction *nvi)
 {
+   if (nvi->opcode == NV_OP_STA)
+      return FALSE;
    return (!(nvi->is_terminator || nvi->is_join ||
              nvi->target ||
              nvi->fixed ||
@@ -739,6 +741,7 @@ struct nv_pass_reld_elim {
    int alloc;
 };
 
+/* TODO: properly handle loads from l[] memory in the presence of stores */
 static int
 nv_pass_reload_elim(struct nv_pass_reld_elim *ctx, struct nv_basic_block *b)
 {
@@ -1074,13 +1077,15 @@ nv_pc_pass0(struct nv_pc *pc, struct nv_basic_block *root)
    if (ret)
       return ret;
 
-   reldelim = CALLOC_STRUCT(nv_pass_reld_elim);
-   reldelim->pc = pc;
-   pc->pass_seq++;
-   ret = nv_pass_reload_elim(reldelim, root);
-   FREE(reldelim);
-   if (ret)
-      return ret;
+   if (pc->opt_reload_elim) {
+      reldelim = CALLOC_STRUCT(nv_pass_reld_elim);
+      reldelim->pc = pc;
+      pc->pass_seq++;
+      ret = nv_pass_reload_elim(reldelim, root);
+      FREE(reldelim);
+      if (ret)
+         return ret;
+   }
 
    pc->pass_seq++;
    ret = nv_pass_cse(&pass, root);
diff --git a/src/gallium/drivers/nv50/nv50_pc_print.c b/src/gallium/drivers/nv50/nv50_pc_print.c
index 01a6f00997..74c3970f40 100644
--- a/src/gallium/drivers/nv50/nv50_pc_print.c
+++ b/src/gallium/drivers/nv50/nv50_pc_print.c
@@ -217,6 +217,9 @@ nv_print_value(struct nv_value *value, struct nv_value *ind, ubyte type)
    case NV_FILE_FLAGS:
       PRINT(" %s%cc%i", mgta, reg_pfx, nv_value_id(value));
       break;
+   case NV_FILE_MEM_L:
+      nv_print_address('l', -1, ind, 4 * nv_value_id(value));
+      break;
    case NV_FILE_MEM_S:
       nv_print_address('s', -1, ind, 4 * nv_value_id(value));
       break;
diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c
index 925028700c..24952f70f1 100644
--- a/src/gallium/drivers/nv50/nv50_program.c
+++ b/src/gallium/drivers/nv50/nv50_program.c
@@ -168,10 +168,17 @@ prog_inst(struct nv50_translation_info *ti,
           inst->Src[0].Register.File == TGSI_FILE_INPUT &&
           dst->Index == ti->edgeflag_out)
          ti->p->vp.edgeflag = inst->Src[0].Register.Index;
+   } else
+   if (inst->Dst[0].Register.File == TGSI_FILE_TEMPORARY) {
+      if (inst->Dst[0].Register.Indirect)
+         ti->store_to_memory = TRUE;
    }
 
    for (s = 0; s < inst->Instruction.NumSrcRegs; ++s) {
       src = &inst->Src[s].Register;
+      if (src->File == TGSI_FILE_TEMPORARY)
+         if (inst->Src[s].Register.Indirect)
+            ti->store_to_memory = TRUE;
       if (src->File != TGSI_FILE_INPUT)
          continue;
       mask = nv50_tgsi_src_mask(inst, s);
diff --git a/src/gallium/drivers/nv50/nv50_program.h b/src/gallium/drivers/nv50/nv50_program.h
index 918baf325f..a1b2bde97b 100644
--- a/src/gallium/drivers/nv50/nv50_program.h
+++ b/src/gallium/drivers/nv50/nv50_program.h
@@ -116,6 +116,7 @@ struct nv50_translation_info {
    int output_access[PIPE_MAX_SHADER_OUTPUTS][4];
    boolean indirect_inputs;
    boolean indirect_outputs;
+   boolean store_to_memory;
    struct tgsi_shader_info scan;
    uint32_t *immd32;
    unsigned immd32_nr;
diff --git a/src/gallium/drivers/nv50/nv50_screen.c b/src/gallium/drivers/nv50/nv50_screen.c
index c1efa443da..24a6d8055c 100644
--- a/src/gallium/drivers/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nv50/nv50_screen.c
@@ -274,7 +274,7 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 	uint64_t value;
 	unsigned chipset = dev->chipset;
 	unsigned tesla_class = 0;
-	unsigned stack_size;
+	unsigned stack_size, local_size, max_warps;
 	int ret, i;
 	const unsigned rl = NOUVEAU_BO_VRAM | NOUVEAU_BO_RD;
 
@@ -495,9 +495,10 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 	/* shader stack */
 	nouveau_device_get_param(dev, NOUVEAU_GETPARAM_GRAPH_UNITS, &value);
 
-	stack_size  = util_bitcount(value & 0xffff);
-	stack_size *= util_bitcount((value >> 24) & 0xf);
-	stack_size *= 32 * 64 * 8;
+	max_warps  = util_bitcount(value & 0xffff);
+	max_warps *= util_bitcount((value >> 24) & 0xf) * 32;
+
+	stack_size = max_warps * 64 * 8;
 
 	ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 1 << 16,
 			     stack_size, &screen->stack_bo);
@@ -510,6 +511,22 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 	OUT_RELOCl(chan, screen->stack_bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
 	OUT_RING  (chan, 4);
 
+	local_size = (NV50_CAP_MAX_PROGRAM_TEMPS * 16) * max_warps * 32;
+
+	ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 1 << 16,
+			     local_size, &screen->local_bo);
+	if (ret) {
+		nv50_screen_destroy(pscreen);
+		return NULL;
+	}
+
+	local_size = NV50_CAP_MAX_PROGRAM_TEMPS * 16;
+
+	BEGIN_RING(chan, screen->tesla, NV50TCL_LOCAL_ADDRESS_HIGH, 3);
+	OUT_RELOCh(chan, screen->local_bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+	OUT_RELOCl(chan, screen->local_bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+	OUT_RING  (chan, util_unsigned_logbase2(local_size / 8));
+
 	/* Vertex array limits - max them out */
 	for (i = 0; i < 16; i++) {
 		BEGIN_RING(chan, screen->tesla,
diff --git a/src/gallium/drivers/nv50/nv50_screen.h b/src/gallium/drivers/nv50/nv50_screen.h
index 1517f5608f..ad6bdeb27c 100644
--- a/src/gallium/drivers/nv50/nv50_screen.h
+++ b/src/gallium/drivers/nv50/nv50_screen.h
@@ -25,7 +25,8 @@ struct nv50_screen {
 	struct nouveau_bo *tic;
 	struct nouveau_bo *tsc;
 
-	struct nouveau_bo *stack_bo;
+	struct nouveau_bo *stack_bo; /* control flow stack */
+	struct nouveau_bo *local_bo; /* l[] memory */
 
 	boolean force_push;
 };
diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
index 983fcb2fbf..f4fee4e0f2 100644
--- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
+++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
@@ -558,6 +558,38 @@ bld_insn_3(struct bld_context *bld, uint opcode,
    return bld_def(insn, 0, new_value(bld->pc, NV_FILE_GPR, src0->reg.type));
 }
 
+static void
+bld_lmem_store(struct bld_context *bld, struct nv_value *ptr, int ofst,
+               struct nv_value *val)
+{
+   struct nv_pc *const pc = bld->pc;
+   struct nv_instruction *sta = new_instruction(pc, NV_OP_STA);
+   struct nv_value *slot = new_value(pc, NV_FILE_MEM_L, NV_TYPE_U32);
+
+   slot->reg.id = ofst * 4; /* presumably 32-bit word index -> byte offset */
+
+   /* src[0]: l[] slot, src[1]: value to store, src[4]: address operand */
+   nv_reference(pc, &sta->src[0], slot);
+   nv_reference(pc, &sta->src[1], val);
+   nv_reference(pc, &sta->src[4], ptr);
+}
+
+static struct nv_value *
+bld_lmem_load(struct bld_context *bld, struct nv_value *ptr, int ofst)
+{
+   struct nv_value *slot = new_value(bld->pc, NV_FILE_MEM_L, NV_TYPE_U32);
+   struct nv_value *res;
+
+   slot->reg.id = ofst * 4; /* presumably 32-bit word index -> byte offset */
+
+   /* load from the l[] slot, then attach the address operand as src[4] */
+   res = bld_insn_1(bld, NV_OP_LDA, slot);
+
+   nv_reference(bld->pc, &res->insn->src[4], ptr);
+
+   return res;
+}
+
 #define BLD_INSN_1_EX(d, op, dt, s0, s0t)           \
    do {                                             \
       (d) = bld_insn_1(bld, (NV_OP_##op), (s0));    \
@@ -854,10 +886,18 @@ infer_dst_type(unsigned opcode)
 
 static void
 emit_store(struct bld_context *bld, const struct tgsi_full_instruction *inst,
-	   unsigned chan, struct nv_value *value)
+           unsigned chan, struct nv_value *value)
 {
+   struct nv_value *ptr;
    const struct tgsi_full_dst_register *reg = &inst->Dst[0];
 
+   if (reg->Register.Indirect) {
+      ptr = FETCH_ADDR(reg->Indirect.Index,
+                       tgsi_util_get_src_register_swizzle(&reg->Indirect, 0));
+   } else {
+      ptr = NULL;
+   }
+
    assert(chan < 4);
 
    if (inst->Instruction.Opcode != TGSI_OPCODE_MOV)
@@ -893,7 +933,11 @@ emit_store(struct bld_context *bld, const struct tgsi_full_instruction *inst,
       value->reg.file = NV_FILE_GPR;
       if (value->insn->bb != bld->pc->current_block)
          value = bld_insn_1(bld, NV_OP_MOV, value);
-      STORE_TEMP(reg->Register.Index, chan, value);
+
+      if (bld->ti->store_to_memory)
+         bld_lmem_store(bld, ptr, reg->Register.Index * 4 + chan, value);
+      else
+         STORE_TEMP(reg->Register.Index, chan, value);
       break;
    case TGSI_FILE_ADDRESS:
       assert(reg->Register.Index < BLD_MAX_ADDRS);
@@ -1064,8 +1108,10 @@ emit_fetch(struct bld_context *bld, const struct tgsi_full_instruction *insn,
       bld->saved_inputs[bld->ti->input_map[idx][swz]] = res;
       break;
    case TGSI_FILE_TEMPORARY:
-      /* this should be load from l[], with reload elimination later on */
-      res = bld_fetch_global(bld, &bld->tvs[idx][swz]);
+      if (bld->ti->store_to_memory)
+         res = bld_lmem_load(bld, ptr, idx * 4 + swz);
+      else
+         res = bld_fetch_global(bld, &bld->tvs[idx][swz]);
       break;
    case TGSI_FILE_ADDRESS:
       res = bld_fetch_global(bld, &bld->avs[idx][swz]);
-- 
cgit v1.2.3


From 9cc80e25db3d0bfd38015a197de3a1a80b6733ab Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Thu, 9 Sep 2010 19:17:55 +0200
Subject: nv50: create value references with the right type

Since atm our OPs aren't typed but instead values are, we need to
take care if they're used as different types (e.g. a load makes a
value u32 by default).

Maybe this should be changed (also to match TGSI), but it should
work as well if done properly.
---
 src/gallium/drivers/nv50/nv50_pc.h         |  9 ++++--
 src/gallium/drivers/nv50/nv50_tgsi_to_nc.c | 46 +++++++++++++++---------------
 2 files changed, 29 insertions(+), 26 deletions(-)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_pc.h b/src/gallium/drivers/nv50/nv50_pc.h
index ba32ab08ab..ccddae063c 100644
--- a/src/gallium/drivers/nv50/nv50_pc.h
+++ b/src/gallium/drivers/nv50/nv50_pc.h
@@ -189,6 +189,7 @@ struct nv_reg {
    int id;
    ubyte file;
    ubyte type; /* type of generating instruction's result */
+   ubyte as_type; /* default type for new references to this value */
    union {
       float f32;
       double f64;
@@ -396,14 +397,16 @@ new_value(struct nv_pc *pc, ubyte file, ubyte type)
    value->join = value;
    value->reg.id = -1;
    value->reg.file = file;
-   value->reg.type = type;
+   value->reg.type = value->reg.as_type = type;
    return value;
 }
 
 static INLINE struct nv_value *
 new_value_like(struct nv_pc *pc, struct nv_value *like)
 {
-   return new_value(pc, like->reg.file, like->reg.type);
+   struct nv_value *val = new_value(pc, like->reg.file, like->reg.type);
+   val->reg.as_type = like->reg.as_type;
+   return val;
 }
 
 static INLINE struct nv_ref *
@@ -425,7 +428,7 @@ new_ref(struct nv_pc *pc, struct nv_value *val)
 
    ref = pc->refs[pc->num_refs++];
    ref->value = val;
-   ref->typecast = val->reg.type;
+   ref->typecast = val->reg.as_type;
 
    ++val->refc;
    return ref;
diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
index f4fee4e0f2..50f0151b53 100644
--- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
+++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
@@ -22,8 +22,6 @@
 
 /* #define NV50_TGSI2NC_DEBUG */
 
-/* XXX: need to clean this up so we get the typecasting right more naturally */
-
 #include <unistd.h>
 
 #include "nv50_context.h"
@@ -519,17 +517,16 @@ bld_imm_f32(struct bld_context *bld, float f)
    return bld_imm_u32(bld, fui(f));
 }
 
-#define SET_TYPE(v, t) ((v)->reg.type = NV_TYPE_##t)
+#define SET_TYPE(v, t) ((v)->reg.type = (v)->reg.as_type = (t))
 
 static struct nv_value *
 bld_insn_1(struct bld_context *bld, uint opcode, struct nv_value *src0)
 {
    struct nv_instruction *insn = new_instruction(bld->pc, opcode);
-   assert(insn);
 
-   nv_reference(bld->pc, &insn->src[0], src0); /* NOTE: new_ref would suffice */
+   nv_reference(bld->pc, &insn->src[0], src0);
    
-   return bld_def(insn, 0, new_value(bld->pc, NV_FILE_GPR, src0->reg.type));
+   return bld_def(insn, 0, new_value(bld->pc, NV_FILE_GPR, src0->reg.as_type));
 }
 
 static struct nv_value *
@@ -541,7 +538,7 @@ bld_insn_2(struct bld_context *bld, uint opcode,
    nv_reference(bld->pc, &insn->src[0], src0);
    nv_reference(bld->pc, &insn->src[1], src1);
 
-   return bld_def(insn, 0, new_value(bld->pc, NV_FILE_GPR, src0->reg.type));
+   return bld_def(insn, 0, new_value(bld->pc, NV_FILE_GPR, src0->reg.as_type));
 }
 
 static struct nv_value *
@@ -555,7 +552,7 @@ bld_insn_3(struct bld_context *bld, uint opcode,
    nv_reference(bld->pc, &insn->src[1], src1);
    nv_reference(bld->pc, &insn->src[2], src2);
 
-   return bld_def(insn, 0, new_value(bld->pc, NV_FILE_GPR, src0->reg.type));
+   return bld_def(insn, 0, new_value(bld->pc, NV_FILE_GPR, src0->reg.as_type));
 }
 
 static void
@@ -593,14 +590,14 @@ bld_lmem_load(struct bld_context *bld, struct nv_value *ptr, int ofst)
 #define BLD_INSN_1_EX(d, op, dt, s0, s0t)           \
    do {                                             \
       (d) = bld_insn_1(bld, (NV_OP_##op), (s0));    \
-      (d)->reg.type = NV_TYPE_##dt;                 \
+      SET_TYPE(d, NV_TYPE_##dt);                    \
       (d)->insn->src[0]->typecast = NV_TYPE_##s0t;  \
    } while(0)
 
 #define BLD_INSN_2_EX(d, op, dt, s0, s0t, s1, s1t)       \
    do {                                                  \
       (d) = bld_insn_2(bld, (NV_OP_##op), (s0), (s1));   \
-      (d)->reg.type = NV_TYPE_##dt;                      \
+      SET_TYPE(d, NV_TYPE_##dt);                         \
       (d)->insn->src[0]->typecast = NV_TYPE_##s0t;       \
       (d)->insn->src[1]->typecast = NV_TYPE_##s1t;       \
    } while(0)
@@ -910,9 +907,9 @@ emit_store(struct bld_context *bld, const struct tgsi_full_instruction *inst,
       BLD_INSN_1_EX(value, SAT, F32, value, F32);
       break;
    case TGSI_SAT_MINUS_PLUS_ONE:
+      value->reg.as_type = NV_TYPE_F32;
       value = bld_insn_2(bld, NV_OP_MAX, value, bld_load_imm_f32(bld, -1.0f));
       value = bld_insn_2(bld, NV_OP_MIN, value, bld_load_imm_f32(bld, 1.0f));
-      value->reg.type = NV_TYPE_F32;
       break;
    }
 
@@ -1070,7 +1067,7 @@ emit_fetch(struct bld_context *bld, const struct tgsi_full_instruction *insn,
       assert(dim_idx == 1); /* for now */
 
       res = new_value(bld->pc, NV_FILE_MEM_C(dim_idx), type);
-      res->reg.type = type;
+      SET_TYPE(res, type);
       res->reg.id = (idx * 4 + swz) & 127;
       res = bld_insn_1(bld, NV_OP_LDA, res);
 
@@ -1082,11 +1079,11 @@ emit_fetch(struct bld_context *bld, const struct tgsi_full_instruction *insn,
       res = bld_load_imm_u32(bld, bld->ti->immd32[idx * 4 + swz]);
 
       switch (bld->ti->immd32_ty[idx]) {
-      case TGSI_IMM_FLOAT32: res->reg.type = NV_TYPE_F32; break;
-      case TGSI_IMM_UINT32: res->reg.type = NV_TYPE_U32; break;
-      case TGSI_IMM_INT32: res->reg.type = NV_TYPE_S32; break;
+      case TGSI_IMM_FLOAT32: SET_TYPE(res, NV_TYPE_F32); break;
+      case TGSI_IMM_UINT32: SET_TYPE(res, NV_TYPE_U32); break;
+      case TGSI_IMM_INT32: SET_TYPE(res, NV_TYPE_S32); break;
       default:
-         res->reg.type = type;
+         SET_TYPE(res, type);
          break;
       }
       break;
@@ -1127,6 +1124,9 @@ emit_fetch(struct bld_context *bld, const struct tgsi_full_instruction *insn,
    if (!res)
       return bld_undef(bld, NV_FILE_GPR);
 
+   if (insn->Instruction.Opcode != TGSI_OPCODE_MOV)
+      res->reg.as_type = type;
+
    switch (tgsi_util_get_full_src_register_sign_mode(src, chan)) {
    case TGSI_UTIL_SIGN_KEEP:
       break;
@@ -1305,7 +1305,7 @@ emit_tex(struct bld_context *bld, uint opcode,
    /* the inputs to a tex instruction must be separate values */
    for (c = 0; c < argc; ++c) {
       t[c] = bld_insn_1(bld, NV_OP_MOV, t_in[c]);
-      t[c]->reg.type = NV_TYPE_F32;
+      SET_TYPE(t[c], NV_TYPE_F32);
       t[c]->insn->fixed = 1;
    }
 
@@ -1363,7 +1363,7 @@ bld_texbias_sequence(struct bld_context *bld,
       cr[l] = bld_cmov(bld, bit[l], NV_CC_EQ, val->insn->flags_def);
 
       cr[l]->reg.file = NV_FILE_FLAGS;
-      cr[l]->reg.type = NV_TYPE_U16;
+      SET_TYPE(cr[l], NV_TYPE_U16);
    }
 
    sel = new_instruction(bld->pc, NV_OP_SELECT);
@@ -1510,7 +1510,8 @@ bld_instruction(struct bld_context *bld,
       src1 = bld_imm_u32(bld, 4);
       FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) {
          src0 = emit_fetch(bld, insn, 0, c);
-         (temp = bld_insn_1(bld, NV_OP_FLOOR, src0))->reg.type = NV_TYPE_S32;
+         temp = bld_insn_1(bld, NV_OP_FLOOR, src0);
+         SET_TYPE(temp, NV_TYPE_S32);
          dst0[c] = bld_insn_2(bld, NV_OP_SHL, temp, src1);
       }
       break;
@@ -1791,7 +1792,7 @@ bld_instruction(struct bld_context *bld,
          src1 = emit_fetch(bld, insn, 1, c);
          dst0[c] = bld_insn_2(bld, NV_OP_SET, src0, src1);
          dst0[c]->insn->set_cond = translate_setcc(insn->Instruction.Opcode);
-         dst0[c]->reg.type = infer_dst_type(insn->Instruction.Opcode);
+         SET_TYPE(dst0[c], infer_dst_type(insn->Instruction.Opcode));
 
          dst0[c]->insn->src[0]->typecast =
          dst0[c]->insn->src[1]->typecast =
@@ -1799,11 +1800,10 @@ bld_instruction(struct bld_context *bld,
 
          if (dst0[c]->reg.type != NV_TYPE_F32)
             break;
+         dst0[c]->reg.as_type = NV_TYPE_S32;
          dst0[c] = bld_insn_1(bld, NV_OP_ABS, dst0[c]);
-         dst0[c]->insn->src[0]->typecast = NV_TYPE_S32;
-         dst0[c]->reg.type = NV_TYPE_S32;
          dst0[c] = bld_insn_1(bld, NV_OP_CVT, dst0[c]);
-         dst0[c]->reg.type = NV_TYPE_F32;
+         SET_TYPE(dst0[c], NV_TYPE_F32);
       }
       break;
    case TGSI_OPCODE_SCS:
-- 
cgit v1.2.3


From 246ebd7df1854db22a7f46302ecb1b5d56b68855 Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Thu, 9 Sep 2010 19:18:42 +0200
Subject: nv50: duplicate interps in load_proj_tex_coords

Otherwise we might clobber the original interpolation result or
use the result of the RCP before its definition.
---
 src/gallium/drivers/nv50/nv50_tgsi_to_nc.c | 39 +++++++++++++++++++++++++++---
 1 file changed, 35 insertions(+), 4 deletions(-)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
index 50f0151b53..4168bbbc95 100644
--- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
+++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
@@ -555,6 +555,34 @@ bld_insn_3(struct bld_context *bld, uint opcode,
    return bld_def(insn, 0, new_value(bld->pc, NV_FILE_GPR, src0->reg.as_type));
 }
 
+static struct nv_value *
+bld_duplicate_insn(struct bld_context *bld, struct nv_instruction *nvi)
+{
+   struct nv_instruction *dupi = new_instruction(bld->pc, nvi->opcode);
+   int c;
+
+   if (nvi->def[0])
+      bld_def(dupi, 0, new_value_like(bld->pc, nvi->def[0]));
+
+   if (nvi->flags_def) {
+      dupi->flags_def = new_value_like(bld->pc, nvi->flags_def);
+      dupi->flags_def->insn = dupi;
+   }
+
+   for (c = 0; c < 5; ++c)
+      if (nvi->src[c])
+         nv_reference(bld->pc, &dupi->src[c], nvi->src[c]->value);
+   if (nvi->flags_src)
+      nv_reference(bld->pc, &dupi->flags_src, nvi->flags_src->value);
+
+   dupi->cc = nvi->cc;
+   dupi->saturate = nvi->saturate;
+   dupi->centroid = nvi->centroid;
+   dupi->flat = nvi->flat;
+
+   return dupi->def[0];
+}
+
 static void
 bld_lmem_store(struct bld_context *bld, struct nv_value *ptr, int ofst,
                struct nv_value *val)
@@ -1232,6 +1260,7 @@ load_proj_tex_coords(struct bld_context *bld,
    t[3] = emit_fetch(bld, insn, 0, 3);
 
    if (t[3]->insn->opcode == NV_OP_PINTERP) {
+      t[3] = bld_duplicate_insn(bld, t[3]->insn);
       t[3]->insn->opcode = NV_OP_LINTERP;
       nv_reference(bld->pc, &t[3]->insn->src[1], NULL);
    }
@@ -1240,13 +1269,15 @@ load_proj_tex_coords(struct bld_context *bld,
 
    for (c = 0; c < dim; ++c) {
       t[c] = emit_fetch(bld, insn, 0, c);
-      if (t[c]->insn->opcode == NV_OP_LINTERP)
-         t[c]->insn->opcode = NV_OP_PINTERP;
 
-      if (t[c]->insn->opcode == NV_OP_PINTERP)
+      if (t[c]->insn->opcode == NV_OP_LINTERP ||
+          t[c]->insn->opcode == NV_OP_PINTERP) {
+         t[c] = bld_duplicate_insn(bld, t[c]->insn);
+         t[c]->insn->opcode = NV_OP_PINTERP;
          nv_reference(bld->pc, &t[c]->insn->src[1], t[3]);
-      else
+      } else {
          mask |= 1 << c;
+      }
    }
 
    for (c = 0; mask; ++c, mask >>= 1) {
-- 
cgit v1.2.3


From 6b14a3eb191ab798e524f2413180256fbcc2b33e Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Thu, 9 Sep 2010 19:19:08 +0200
Subject: nv50: address regs are 16 bit

---
 src/gallium/drivers/nv50/nv50_pc_print.c   | 4 +++-
 src/gallium/drivers/nv50/nv50_tgsi_to_nc.c | 2 ++
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_pc_print.c b/src/gallium/drivers/nv50/nv50_pc_print.c
index 74c3970f40..a71401979c 100644
--- a/src/gallium/drivers/nv50/nv50_pc_print.c
+++ b/src/gallium/drivers/nv50/nv50_pc_print.c
@@ -171,12 +171,14 @@ nv_value_allocated(struct nv_value *value)
 static INLINE void
 nv_print_address(const char c, int buf, struct nv_value *a, int offset)
 {
+   const char ac =  (a && nv_value_allocated(a)) ? '$' : '%';
+
    if (buf >= 0)
       PRINT(" %s%c%i[", cyan, c, buf);
    else
       PRINT(" %s%c[", cyan, c);
    if (a)
-      PRINT("%s$a%i%s+", mgta, nv_value_id(a), cyan);
+      PRINT("%s%ca%i%s+", mgta, ac, nv_value_id(a), cyan);
    PRINT("%s0x%x%s]", orng, offset, cyan);
 }
 
diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
index 4168bbbc95..6fd749b35f 100644
--- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
+++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
@@ -674,6 +674,7 @@ bld_get_address(struct bld_context *bld, int id, struct nv_value *indirect)
 
    bld->saved_addr[i][0] = bld_load_imm_u32(bld, id);
    bld->saved_addr[i][0]->reg.file = NV_FILE_ADDR;
+   bld->saved_addr[i][0]->reg.type = NV_TYPE_U16;
    bld->saved_addr[i][1] = indirect;
    return bld->saved_addr[i][0];
 }
@@ -967,6 +968,7 @@ emit_store(struct bld_context *bld, const struct tgsi_full_instruction *inst,
    case TGSI_FILE_ADDRESS:
       assert(reg->Register.Index < BLD_MAX_ADDRS);
       value->reg.file = NV_FILE_ADDR;
+      value->reg.type = NV_TYPE_U16;
       STORE_ADDR(reg->Register.Index, chan, value);
       break;
    }
-- 
cgit v1.2.3


From 6997da9f3cf22b9d11ffdfa6ad25b68ef4913fc3 Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Thu, 9 Sep 2010 19:09:38 +0200
Subject: nv50: fix can_load check for 3rd source

---
 src/gallium/drivers/nv50/nv50_pc.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_pc.c b/src/gallium/drivers/nv50/nv50_pc.c
index 637b3cf2fe..e4df742a80 100644
--- a/src/gallium/drivers/nv50/nv50_pc.c
+++ b/src/gallium/drivers/nv50/nv50_pc.c
@@ -112,13 +112,11 @@ nv50_nvi_can_load(struct nv_instruction *nvi, int s, struct nv_value *value)
       if (s == 0 && (value->reg.file == NV_FILE_MEM_S ||
                      value->reg.file == NV_FILE_MEM_P))
          return TRUE;
-      if (s == 1 &&
-          value->reg.file >= NV_FILE_MEM_C(0) &&
-          value->reg.file <= NV_FILE_MEM_C(15))
-         return TRUE;
-      if (s == 2 && nvi->src[1]->value->reg.file == NV_FILE_GPR)
-         return TRUE;
-      return FALSE;
+      if (value->reg.file < NV_FILE_MEM_C(0) ||
+          value->reg.file > NV_FILE_MEM_C(15))
+         return FALSE;
+      return (s == 1) ||
+         ((s == 2) && (nvi->src[1]->value->reg.file == NV_FILE_GPR));
    case NV_OP_MOV:
       assert(s == 0);
       return /* TRUE */ FALSE; /* don't turn MOVs into loads */
-- 
cgit v1.2.3


From 7a4a537be1460b09b192fdf4d92680aad6c9e951 Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Sun, 12 Sep 2010 00:46:38 +0200
Subject: nv50: reduce bb_reachable_by runtime from exponential to linear

As a by-product, remove the memory leak of nv_basic_blocks.
---
 src/gallium/drivers/nv50/nv50_pc.c          | 105 ++++++++++++++++++++++++----
 src/gallium/drivers/nv50/nv50_pc.h          |  16 +++--
 src/gallium/drivers/nv50/nv50_pc_optimize.c |   4 +-
 3 files changed, 104 insertions(+), 21 deletions(-)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_pc.c b/src/gallium/drivers/nv50/nv50_pc.c
index e4df742a80..e063888eb5 100644
--- a/src/gallium/drivers/nv50/nv50_pc.c
+++ b/src/gallium/drivers/nv50/nv50_pc.c
@@ -340,6 +340,66 @@ nv_print_program(struct nv_pc *pc)
          nv_print_function(pc->root[i]);
 }
 
+#ifdef NV50_PC_DEBUG
+static void
+nv_do_print_cfgraph(struct nv_pc *pc, FILE *f, struct nv_basic_block *b)
+{
+   int i;
+
+   b->pass_seq = pc->pass_seq;
+
+   fprintf(f, "\t%i [shape=box]\n", b->id);
+
+   for (i = 0; i < 2; ++i) {
+      if (!b->out[i])
+         continue;
+      switch (b->out_kind[i]) {
+      case CFG_EDGE_FORWARD:
+         fprintf(f, "\t%i -> %i;\n", b->id, b->out[i]->id);
+         break;
+      case CFG_EDGE_LOOP_ENTER:
+         fprintf(f, "\t%i -> %i [color=green];\n", b->id, b->out[i]->id);
+         break;
+      case CFG_EDGE_LOOP_LEAVE:
+         fprintf(f, "\t%i -> %i [color=red];\n", b->id, b->out[i]->id);
+         break;
+      case CFG_EDGE_BACK:
+         fprintf(f, "\t%i -> %i;\n", b->id, b->out[i]->id);
+         continue;
+      case CFG_EDGE_FAKE:
+         fprintf(f, "\t%i -> %i [style=dotted];\n", b->id, b->out[i]->id);
+         break;
+      default:
+         assert(0);
+         break;
+      }
+      if (b->out[i]->pass_seq < pc->pass_seq)
+         nv_do_print_cfgraph(pc, f, b->out[i]);
+   }
+}
+
+/* Append the CFG of subroutine @subr (0 == MAIN) to @filepath as a dot graph. */
+static void
+nv_print_cfgraph(struct nv_pc *pc, const char *filepath, int subr)
+{
+   FILE *f;
+
+   f = fopen(filepath, "a");
+   if (!f)
+      return;
+
+   fprintf(f, "digraph G {\n");
+
+   ++pc->pass_seq;
+
+   nv_do_print_cfgraph(pc, f, pc->root[subr]);
+
+   fprintf(f, "}\n");
+
+   fclose(f);
+}
+#endif
+
 static INLINE void
 nvcg_show_bincode(struct nv_pc *pc)
 {
@@ -393,6 +453,7 @@ nv50_generate_code(struct nv50_translation_info *ti)
 {
    struct nv_pc *pc;
    int ret;
+   int i;
 
    pc = CALLOC_STRUCT(nv_pc);
    if (!pc)
@@ -428,6 +489,7 @@ nv50_generate_code(struct nv50_translation_info *ti)
       goto out;
 #ifdef NV50PC_DEBUG
    nv_print_program(pc);
+   nv_print_cfgraph(pc, "nv50_shader_cfgraph.dot", 0);
 #endif
 
    /* prepare for emission */
@@ -461,8 +523,8 @@ nv50_generate_code(struct nv50_translation_info *ti)
 out:
    nv_pc_free_refs(pc);
 
-   if (pc->bb_list)
-      FREE(pc->bb_list);
+   for (i = 0; i < pc->num_blocks; ++i)
+      FREE(pc->bb_list[i]);
 
    if (ret) { /* on success, these will be referenced by nv50_program */
       if (pc->emit)
@@ -644,23 +706,38 @@ nvbb_dominated_by(struct nv_basic_block *b, struct nv_basic_block *d)
    return j ? TRUE : FALSE;
 }
 
-/* check if bf (future) can be reached from bp (past) */
+/* check if @bf (future) can be reached from @bp (past), stop at @bt */
 boolean
 nvbb_reachable_by(struct nv_basic_block *bf, struct nv_basic_block *bp,
                   struct nv_basic_block *bt)
 {
-   if (bf == bp)
-      return TRUE;
-   if (bp == bt)
-      return FALSE;
+   struct nv_basic_block *q[NV_PC_MAX_BASIC_BLOCKS], *b;
+   int i, p, n;
 
-   if (bp->out[0] && !IS_WALL_EDGE(bp->out_kind[0]) &&
-       nvbb_reachable_by(bf, bp->out[0], bt))
-      return TRUE;
-   if (bp->out[1] && !IS_WALL_EDGE(bp->out_kind[1]) &&
-       nvbb_reachable_by(bf, bp->out[1], bt))
-      return TRUE;
-   return FALSE;
+   p = 0;
+   n = 1;
+   q[0] = bp;
+
+   while (p < n) {
+      b = q[p++];
+
+      if (b == bf)
+         break;
+      if (b == bt)
+         continue;
+      assert(n <= (1024 - 2));
+
+      for (i = 0; i < 2; ++i) {
+         if (b->out[i] && !IS_WALL_EDGE(b->out_kind[i]) && !b->out[i]->priv) {
+            q[n] = b->out[i];
+            q[n++]->priv = 1;
+         }
+      }
+   }
+   for (--n; n >= 0; --n)
+      q[n]->priv = 0;
+
+   return (b == bf);
 }
 
 static struct nv_basic_block *
diff --git a/src/gallium/drivers/nv50/nv50_pc.h b/src/gallium/drivers/nv50/nv50_pc.h
index ccddae063c..e8d9942307 100644
--- a/src/gallium/drivers/nv50/nv50_pc.h
+++ b/src/gallium/drivers/nv50/nv50_pc.h
@@ -144,6 +144,8 @@
 #define NV_PC_MAX_INSTRUCTIONS 2048
 #define NV_PC_MAX_VALUES (NV_PC_MAX_INSTRUCTIONS * 4)
 
+#define NV_PC_MAX_BASIC_BLOCKS 1024
+
 static INLINE boolean
 nv_is_vector_op(uint opcode)
 {
@@ -284,7 +286,7 @@ struct nv_basic_block {
 
    int id;
    int subroutine;
-   uint priv;
+   uint priv; /* reset to 0 after you're done */
    uint pass_seq;
 
    uint32_t bin_pos; /* position, size in emitted code */
@@ -328,7 +330,7 @@ struct nv_pc {
    struct nv_value values[NV_PC_MAX_VALUES];
    struct nv_instruction instructions[NV_PC_MAX_INSTRUCTIONS];
    struct nv_ref **refs;
-   struct nv_basic_block **bb_list;
+   struct nv_basic_block *bb_list[NV_PC_MAX_BASIC_BLOCKS];
    int num_values;
    int num_instructions;
    int num_refs;
@@ -437,9 +439,15 @@ new_ref(struct nv_pc *pc, struct nv_value *val)
 static INLINE struct nv_basic_block *
 new_basic_block(struct nv_pc *pc)
 {
-   struct nv_basic_block *bb = CALLOC_STRUCT(nv_basic_block);
+   struct nv_basic_block *bb;
+
+   if (pc->num_blocks >= NV_PC_MAX_BASIC_BLOCKS)
+      return NULL;
+
+   bb = CALLOC_STRUCT(nv_basic_block);
 
-   bb->id = pc->num_blocks++;
+   bb->id = pc->num_blocks;
+   pc->bb_list[pc->num_blocks++] = bb;
    return bb;
 }
 
diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c
index 09d232abda..edda6c0691 100644
--- a/src/gallium/drivers/nv50/nv50_pc_optimize.c
+++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c
@@ -238,9 +238,7 @@ nv_pc_exec_pass2(struct nv_pc *pc)
 
    NV50_DBGMSG("preparing %u blocks for emission\n", pc->num_blocks);
 
-   pc->bb_list = CALLOC(pc->num_blocks, sizeof(pc->bb_list[0]));
-
-   pc->num_blocks = 0;
+   pc->num_blocks = 0; /* will reorder bb_list */
 
    for (i = 0; i < pc->num_subroutines + 1; ++i)
       if (pc->root[i] && (ret = nv_pc_pass2(pc, pc->root[i])))
-- 
cgit v1.2.3


From fc31a25afa2d28dea9bbda08ce8deab5aa96b684 Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Sun, 12 Sep 2010 00:56:16 +0200
Subject: nv50: minor compiler fixes and cleanups

---
 src/gallium/drivers/nv50/nv50_pc.c           |  4 +++-
 src/gallium/drivers/nv50/nv50_pc_regalloc.c  |  5 +++++
 src/gallium/drivers/nv50/nv50_shader_state.c |  2 +-
 src/gallium/drivers/nv50/nv50_tgsi_to_nc.c   | 16 ++++++++++------
 4 files changed, 19 insertions(+), 8 deletions(-)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_pc.c b/src/gallium/drivers/nv50/nv50_pc.c
index e063888eb5..26ad9b4e3d 100644
--- a/src/gallium/drivers/nv50/nv50_pc.c
+++ b/src/gallium/drivers/nv50/nv50_pc.c
@@ -237,6 +237,7 @@ nv_pc_free_refs(struct nv_pc *pc)
    int i;
    for (i = 0; i < pc->num_refs; i += 64)
       FREE(pc->refs[i]);
+   FREE(pc->refs);
 }
 
 static const char *
@@ -525,7 +526,8 @@ out:
 
    for (i = 0; i < pc->num_blocks; ++i)
       FREE(pc->bb_list[i]);
-
+   if (pc->root)
+      FREE(pc->root);
    if (ret) { /* on success, these will be referenced by nv50_program */
       if (pc->emit)
          FREE(pc->emit);
diff --git a/src/gallium/drivers/nv50/nv50_pc_regalloc.c b/src/gallium/drivers/nv50/nv50_pc_regalloc.c
index 2998343db5..b9d5ba5ef6 100644
--- a/src/gallium/drivers/nv50/nv50_pc_regalloc.c
+++ b/src/gallium/drivers/nv50/nv50_pc_regalloc.c
@@ -888,6 +888,10 @@ nv_pc_pass1(struct nv_pc *pc, struct nv_basic_block *root)
    ctx->pc = pc;
 
    ctx->insns = CALLOC(NV_PC_MAX_INSTRUCTIONS, sizeof(struct nv_instruction *));
+   if (!ctx->insns) {
+      FREE(ctx);
+      return -1;
+   }
 
    pc->pass_seq++;
    ret = pass_generate_phi_movs(ctx, root);
@@ -941,6 +945,7 @@ nv_pc_pass1(struct nv_pc *pc, struct nv_basic_block *root)
    NV50_DBGMSG("REGISTER ALLOCATION - leaving\n");
 
 out:
+   FREE(ctx->insns);
    FREE(ctx);
    return ret;
 }
diff --git a/src/gallium/drivers/nv50/nv50_shader_state.c b/src/gallium/drivers/nv50/nv50_shader_state.c
index f187a074e6..564f7e5324 100644
--- a/src/gallium/drivers/nv50/nv50_shader_state.c
+++ b/src/gallium/drivers/nv50/nv50_shader_state.c
@@ -44,7 +44,7 @@ nv50_transfer_constbuf(struct nv50_context *nv50,
    if (!map)
       return;
 
-   count = buf->width0; /* MIN2(buf->width0, size); */
+   count = (buf->width0 + 3) / 4;
    start = 0;
 
    while (count) {
diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
index 6fd749b35f..5994d1c27e 100644
--- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
+++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
@@ -39,7 +39,7 @@
 #define BLD_MAX_PREDS 4
 #define BLD_MAX_IMMDS 128
 
-#define BLD_MAX_COND_NESTING 4
+#define BLD_MAX_COND_NESTING 8
 #define BLD_MAX_LOOP_NESTING 4
 #define BLD_MAX_CALL_NESTING 2
 
@@ -70,14 +70,14 @@ bld_vals_del_val(struct bld_value_stack *stk, struct nv_value *val)
 {
    unsigned i;
 
-   for (i = stk->size - 1; i >= 0; --i)
-      if (stk->body[i] == val)
+   for (i = stk->size; i > 0; --i)
+      if (stk->body[i - 1] == val)
          break;
-   if (i < 0)
+   if (!i)
       return FALSE;
 
-   if (i != stk->size - 1)
-      stk->body[i] = stk->body[stk->size - 1];
+   if (i != stk->size)
+      stk->body[i - 1] = stk->body[stk->size - 1];
 
    --stk->size; /* XXX: old size in REALLOC */
    return TRUE;
@@ -1643,6 +1643,8 @@ bld_instruction(struct bld_context *bld,
    {
       struct nv_basic_block *b = new_basic_block(bld->pc);
 
+      assert(bld->cond_lvl < BLD_MAX_COND_NESTING);
+
       nvbb_attach_block(bld->pc->current_block, b, CFG_EDGE_FORWARD);
 
       bld->join_bb[bld->cond_lvl] = bld->pc->current_block;
@@ -1695,6 +1697,8 @@ bld_instruction(struct bld_context *bld,
       struct nv_basic_block *bl = new_basic_block(bld->pc);
       struct nv_basic_block *bb = new_basic_block(bld->pc);
 
+      assert(bld->loop_lvl < BLD_MAX_LOOP_NESTING);
+
       bld->loop_bb[bld->loop_lvl] = bl;
       bld->brkt_bb[bld->loop_lvl] = bb;
 
-- 
cgit v1.2.3


From 9b39fb1b6127fecf2fbb41926caca2bbb559a1d0 Mon Sep 17 00:00:00 2001
From: Xavier Chantry <chantry.xavier@gmail.com>
Date: Sat, 11 Sep 2010 20:18:25 +0200
Subject: nv50: fix size of outputs_written array

---
 src/gallium/drivers/nv50/nv50_tgsi_to_nc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
index 5994d1c27e..978bba4d57 100644
--- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
+++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
@@ -125,7 +125,7 @@ struct bld_context {
    struct bld_value_stack pvs[BLD_MAX_PREDS][4]; /* TGSI_FILE_PREDICATE */
    struct bld_value_stack ovs[PIPE_MAX_SHADER_OUTPUTS][4];
 
-   uint32_t outputs_written[(PIPE_MAX_SHADER_OUTPUTS + 31) / 32];
+   uint32_t outputs_written[(PIPE_MAX_SHADER_OUTPUTS + 7) / 8];
 
    struct nv_value *frgcrd[4];
    struct nv_value *sysval[4];
-- 
cgit v1.2.3


From d4fd11a628b0e48d76fab4a0b94470a7592faf26 Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Sun, 12 Sep 2010 11:19:24 +0200
Subject: nv50: cannot move from local mem to output reg directly

---
 src/gallium/drivers/nv50/nv50_pc.c          | 3 ++-
 src/gallium/drivers/nv50/nv50_pc_optimize.c | 3 ++-
 src/gallium/drivers/nv50/nv50_pc_print.c    | 2 +-
 3 files changed, 5 insertions(+), 3 deletions(-)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_pc.c b/src/gallium/drivers/nv50/nv50_pc.c
index 26ad9b4e3d..0511acfd57 100644
--- a/src/gallium/drivers/nv50/nv50_pc.c
+++ b/src/gallium/drivers/nv50/nv50_pc.c
@@ -57,6 +57,7 @@ nv50_indirect_opnd(struct nv_instruction *i)
    switch (i->opcode) {
    case NV_OP_MOV:
    case NV_OP_LDA:
+   case NV_OP_STA:
       return 0;
    default:
       return 1;
@@ -341,7 +342,7 @@ nv_print_program(struct nv_pc *pc)
          nv_print_function(pc->root[i]);
 }
 
-#ifdef NV50_PC_DEBUG
+#ifdef NV50PC_DEBUG
 static void
 nv_do_print_cfgraph(struct nv_pc *pc, FILE *f, struct nv_basic_block *b)
 {
diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c
index edda6c0691..8653bc6e63 100644
--- a/src/gallium/drivers/nv50/nv50_pc_optimize.c
+++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c
@@ -327,7 +327,8 @@ nv_pass_fold_stores(struct nv_pass *ctx, struct nv_basic_block *b)
 
       /* cannot write to $oX when using immediate */
       for (j = 0; j < 4 && nvi->src[j]; ++j)
-         if (nvi->src[j]->value->reg.file == NV_FILE_IMM)
+         if (nvi->src[j]->value->reg.file == NV_FILE_IMM ||
+             nvi->src[j]->value->reg.file == NV_FILE_MEM_L)
             break;
       if (j < 4 && nvi->src[j])
          continue;
diff --git a/src/gallium/drivers/nv50/nv50_pc_print.c b/src/gallium/drivers/nv50/nv50_pc_print.c
index a71401979c..984f6cbe17 100644
--- a/src/gallium/drivers/nv50/nv50_pc_print.c
+++ b/src/gallium/drivers/nv50/nv50_pc_print.c
@@ -220,7 +220,7 @@ nv_print_value(struct nv_value *value, struct nv_value *ind, ubyte type)
       PRINT(" %s%cc%i", mgta, reg_pfx, nv_value_id(value));
       break;
    case NV_FILE_MEM_L:
-      nv_print_address('l', -1, ind, 4 * nv_value_id(value));
+      nv_print_address('l', -1, ind, nv_value_id(value));
       break;
    case NV_FILE_MEM_S:
       nv_print_address('s', -1, ind, 4 * nv_value_id(value));
-- 
cgit v1.2.3


From fdb00ac1efc7c12aeed1a7e705c5a5dd258b7d54 Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Sun, 12 Sep 2010 11:37:07 +0200
Subject: nv50: newlines in shader bincode printing

---
 src/gallium/drivers/nv50/nv50_pc.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_pc.c b/src/gallium/drivers/nv50/nv50_pc.c
index 0511acfd57..c934450d42 100644
--- a/src/gallium/drivers/nv50/nv50_pc.c
+++ b/src/gallium/drivers/nv50/nv50_pc.c
@@ -405,10 +405,13 @@ nv_print_cfgraph(struct nv_pc *pc, const char *filepath, int subr)
 static INLINE void
 nvcg_show_bincode(struct nv_pc *pc)
 {
-   int i;
+   unsigned i;
 
-   for (i = 0; i < pc->bin_size / 4; ++i)
+   for (i = 0; i < pc->bin_size / 4; ++i) {
       debug_printf("0x%08x ", pc->emit[i]);
+      if ((i % 16) == 15)
+         debug_printf("\n");
+   }
    debug_printf("\n");
 }
 
-- 
cgit v1.2.3


From 1fa812d84aa4dcb03f3e64fd46abe5b02ac985d1 Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Sun, 12 Sep 2010 11:37:45 +0200
Subject: nv50: match TEMP limit with nv50 ir builder

Mesa doesn't respect it anyway, but this makes it assert rather
than letting threads access areas of l[] that don't belong to them.
---
 src/gallium/drivers/nv50/nv50_program.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_program.h b/src/gallium/drivers/nv50/nv50_program.h
index a1b2bde97b..d8b6e8d6d1 100644
--- a/src/gallium/drivers/nv50/nv50_program.h
+++ b/src/gallium/drivers/nv50/nv50_program.h
@@ -27,7 +27,7 @@
 #include "tgsi/tgsi_scan.h"
 #include "nouveau/nouveau_class.h"
 
-#define NV50_CAP_MAX_PROGRAM_TEMPS (128 / 4)
+#define NV50_CAP_MAX_PROGRAM_TEMPS 64
 
 struct nv50_varying {
    uint8_t id; /* tgsi index */
-- 
cgit v1.2.3


From 98c87c382d080ff5a048564e942e649fbaf43879 Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Sun, 12 Sep 2010 14:31:01 +0200
Subject: nv50: handle TGSI EXP and LOG again

---
 src/gallium/drivers/nv50/nv50_pc_optimize.c |  2 ++
 src/gallium/drivers/nv50/nv50_tgsi_to_nc.c  | 48 +++++++++++++++++++++++++++--
 2 files changed, 47 insertions(+), 3 deletions(-)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c
index 8653bc6e63..ea1da6268d 100644
--- a/src/gallium/drivers/nv50/nv50_pc_optimize.c
+++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c
@@ -322,6 +322,8 @@ nv_pass_fold_stores(struct nv_pass *ctx, struct nv_basic_block *b)
          continue;
       assert(nvi->def[0] == sti->src[0]->value);
 
+      if (nvi->opcode == NV_OP_SELECT)
+         continue;
       if (nvi->def[0]->refc > 1)
          continue;
 
diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
index 978bba4d57..b4f5a884c4 100644
--- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
+++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
@@ -214,6 +214,7 @@ static INLINE void
 bld_warn_uninitialized(struct bld_context *bld, int kind,
                        struct bld_value_stack *stk, struct nv_basic_block *b)
 {
+#ifdef NV50_TGSI2NC_DEBUG
    long i = (stk - &bld->tvs[0][0]) / 4;
    long c = (stk - &bld->tvs[0][0]) & 3;
 
@@ -222,6 +223,7 @@ bld_warn_uninitialized(struct bld_context *bld, int kind,
 
    debug_printf("WARNING: TEMP[%li].%c %s used uninitialized in BB:%i\n",
                 i, (int)('x' + c), kind ? "may be" : "is", b->id);
+#endif
 }
 
 static INLINE struct nv_value *
@@ -646,7 +648,10 @@ bld_pow(struct bld_context *bld, struct nv_value *x, struct nv_value *e)
 static INLINE struct nv_value *
 bld_load_imm_f32(struct bld_context *bld, float f)
 {
-   return bld_insn_1(bld, NV_OP_MOV, bld_imm_f32(bld, f));
+   struct nv_value *imm = bld_insn_1(bld, NV_OP_MOV, bld_imm_f32(bld, f));
+
+   SET_TYPE(imm, NV_TYPE_F32);
+   return imm;
 }
 
 static INLINE struct nv_value *
@@ -944,6 +949,8 @@ emit_store(struct bld_context *bld, const struct tgsi_full_instruction *inst,
 
    switch (reg->Register.File) {
    case TGSI_FILE_OUTPUT:
+      if (!value->insn && (bld->ti->output_file == NV_FILE_OUT))
+         value = bld_insn_1(bld, NV_OP_MOV, value);
       value = bld_insn_1(bld, NV_OP_MOV, value);
       value->reg.file = bld->ti->output_file;
 
@@ -956,9 +963,9 @@ emit_store(struct bld_context *bld, const struct tgsi_full_instruction *inst,
       break;
    case TGSI_FILE_TEMPORARY:
       assert(reg->Register.Index < BLD_MAX_TEMPS);
-      value->reg.file = NV_FILE_GPR;
-      if (value->insn->bb != bld->pc->current_block)
+      if (!value->insn || (value->insn->bb != bld->pc->current_block))
          value = bld_insn_1(bld, NV_OP_MOV, value);
+      value->reg.file = NV_FILE_GPR;
 
       if (bld->ti->store_to_memory)
          bld_lmem_store(bld, ptr, reg->Register.Index * 4 + chan, value);
@@ -1616,6 +1623,23 @@ bld_instruction(struct bld_context *bld,
       if (insn->Dst[0].Register.WriteMask & 8)
          dst0[3] = emit_fetch(bld, insn, 1, 3);
       break;
+   case TGSI_OPCODE_EXP:
+      src0 = emit_fetch(bld, insn, 0, 0);
+      temp = bld_insn_1(bld, NV_OP_FLOOR, src0);
+
+      if (insn->Dst[0].Register.WriteMask & 2)
+         dst0[1] = bld_insn_2(bld, NV_OP_SUB, src0, temp);
+      if (insn->Dst[0].Register.WriteMask & 1) {
+         temp = bld_insn_1(bld, NV_OP_PREEX2, temp);
+         dst0[0] = bld_insn_1(bld, NV_OP_EX2, temp);
+      }
+      if (insn->Dst[0].Register.WriteMask & 4) {
+         temp = bld_insn_1(bld, NV_OP_PREEX2, src0);
+         dst0[2] = bld_insn_1(bld, NV_OP_EX2, temp);
+      }
+      if (insn->Dst[0].Register.WriteMask & 8)
+         dst0[3] = bld_imm_f32(bld, 1.0f);
+      break;
    case TGSI_OPCODE_EX2:
       src0 = emit_fetch(bld, insn, 0, 0);
       temp = bld_insn_1(bld, NV_OP_PREEX2, src0);
@@ -1798,6 +1822,24 @@ bld_instruction(struct bld_context *bld,
       FOR_EACH_DST0_ENABLED_CHANNEL(c, insn)
          dst0[c] = temp;
       break;
+   case TGSI_OPCODE_LOG:
+      src0 = emit_fetch(bld, insn, 0, 0);
+      src0 = bld_insn_1(bld, NV_OP_ABS, src0);
+      temp = bld_insn_1(bld, NV_OP_LG2, src0);
+      dst0[2] = temp;
+      if (insn->Dst[0].Register.WriteMask & 3) {
+         temp = bld_insn_1(bld, NV_OP_FLOOR, temp);
+         dst0[0] = temp;
+      }
+      if (insn->Dst[0].Register.WriteMask & 2) {
+         temp = bld_insn_1(bld, NV_OP_PREEX2, temp);
+         temp = bld_insn_1(bld, NV_OP_EX2, temp);
+         temp = bld_insn_1(bld, NV_OP_RCP, temp);
+         dst0[1] = bld_insn_2(bld, NV_OP_MUL, src0, temp);
+      }
+      if (insn->Dst[0].Register.WriteMask & 8)
+         dst0[3] = bld_imm_f32(bld, 1.0f);
+      break;
    case TGSI_OPCODE_RCP:
    case TGSI_OPCODE_LG2:
       src0 = emit_fetch(bld, insn, 0, 0);
-- 
cgit v1.2.3


From cca3906a9b1d994c431ceeccccbde0ce87a2f6b4 Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Sun, 12 Sep 2010 19:43:22 +0200
Subject: nv50: check for immediates when turning MUL ADD into MAD

---
 src/gallium/drivers/nv50/nv50_pc_optimize.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c
index ea1da6268d..fba60984ac 100644
--- a/src/gallium/drivers/nv50/nv50_pc_optimize.c
+++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c
@@ -562,6 +562,11 @@ constant_expression(struct nv_pc *pc, struct nv_instruction *nvi,
       nvi->src[0] = nvi->src[2];
       nvi->src[2] = NULL;
       nvi->opcode = NV_OP_ADD;
+
+      if (val->reg.imm.u32 == 0) {
+         nvi->src[1] = NULL;
+         nvi->opcode = NV_OP_MOV;
+      }
    }
 }
 
@@ -703,6 +708,10 @@ nv_pass_lower_arith(struct nv_pass *ctx, struct nv_basic_block *b)
       else
          continue;
 
+      /* a source could be an immediate from the constant_* code above */
+      if (src0->reg.file != NV_FILE_GPR || src1->reg.file != NV_FILE_GPR)
+         continue;
+
       nvi->opcode = NV_OP_MAD;
       mod = nvi->src[(src == src0) ? 0 : 1]->mod;
       nv_reference(ctx->pc, &nvi->src[(src == src0) ? 0 : 1], NULL);
-- 
cgit v1.2.3


From 1f1411f2ccc7f808d181c09f925b0780306a05ca Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Sun, 12 Sep 2010 20:55:09 +0200
Subject: nv50: interp cannot write flags reg

---
 src/gallium/drivers/nv50/nv50_pc.c         | 21 +++++++++++++++++++++
 src/gallium/drivers/nv50/nv50_pc.h         |  1 +
 src/gallium/drivers/nv50/nv50_tgsi_to_nc.c | 10 +++-------
 3 files changed, 25 insertions(+), 7 deletions(-)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_pc.c b/src/gallium/drivers/nv50/nv50_pc.c
index c934450d42..78aca8fd56 100644
--- a/src/gallium/drivers/nv50/nv50_pc.c
+++ b/src/gallium/drivers/nv50/nv50_pc.c
@@ -170,6 +170,27 @@ nv50_supported_src_mods(uint opcode, int s)
    }
 }
 
+/* Tell whether @opcode may write the flags reg; may want an opcode table. */
+boolean
+nv50_op_can_write_flags(uint opcode)
+{
+   if (nv_is_vector_op(opcode))
+      return FALSE; /* vector ops are treated as unable to write flags */
+   switch (opcode) { /* obvious ones like KIL, CALL, etc. not included */
+   case NV_OP_PHI:
+   case NV_OP_MOV:
+   case NV_OP_LINTERP:
+   case NV_OP_PINTERP:
+   case NV_OP_LDA:
+      return FALSE;
+   default:
+      break;
+   }
+   if (opcode >= NV_OP_RCP && opcode <= NV_OP_PREEX2)
+      return FALSE; /* rejects every opcode value from RCP up to PREEX2 */
+   return TRUE;
+}
+
 int
 nv_nvi_refcount(struct nv_instruction *nvi)
 {
diff --git a/src/gallium/drivers/nv50/nv50_pc.h b/src/gallium/drivers/nv50/nv50_pc.h
index e8d9942307..8f15a82026 100644
--- a/src/gallium/drivers/nv50/nv50_pc.h
+++ b/src/gallium/drivers/nv50/nv50_pc.h
@@ -486,6 +486,7 @@ int nv50_indirect_opnd(struct nv_instruction *);
 boolean nv50_nvi_can_use_imm(struct nv_instruction *, int s);
 boolean nv50_nvi_can_predicate(struct nv_instruction *);
 boolean nv50_nvi_can_load(struct nv_instruction *, int s, struct nv_value *);
+boolean nv50_op_can_write_flags(uint opcode);
 ubyte nv50_supported_src_mods(uint opcode, int s);
 int nv_nvi_refcount(struct nv_instruction *);
 void nv_nvi_delete(struct nv_instruction *);
diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
index b4f5a884c4..8ad0b18c79 100644
--- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
+++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
@@ -700,19 +700,15 @@ bld_predicate(struct bld_context *bld, struct nv_value *src, boolean bool_only)
       while (nvi->opcode == NV_OP_ABS || nvi->opcode == NV_OP_NEG ||
              nvi->opcode == NV_OP_CVT) {
          s0i = nvi->src[0]->value->insn;
-         if (!s0i ||
-             s0i->opcode == NV_OP_LDA ||
-             s0i->opcode == NV_OP_MOV ||
-             s0i->opcode == NV_OP_PHI)
+         if (!s0i || !nv50_op_can_write_flags(s0i->opcode))
             break;
          nvi = s0i;
          assert(!nvi->flags_src);
       }
    }
 
-   if (nvi->opcode == NV_OP_LDA ||
-       nvi->opcode == NV_OP_MOV ||
-       nvi->opcode == NV_OP_PHI || nvi->bb != bld->pc->current_block) {
+   if (!nv50_op_can_write_flags(nvi->opcode) ||
+       nvi->bb != bld->pc->current_block) {
       nvi = new_instruction(bld->pc, NV_OP_CVT);
       nv_reference(bld->pc, &nvi->src[0], src);
    }
-- 
cgit v1.2.3


From 3b3c20744f2ea90f6aaae33b337bdc5e135f3198 Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Sun, 12 Sep 2010 23:11:30 +0200
Subject: nv50: MOV TEMP[0], -CONST[0] must be float32 negation

---
 src/gallium/drivers/nv50/nv50_tgsi_to_nc.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
index 8ad0b18c79..54d6fb960f 100644
--- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
+++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
@@ -1075,7 +1075,7 @@ emit_fetch(struct bld_context *bld, const struct tgsi_full_instruction *insn,
    const struct tgsi_full_src_register *src = &insn->Src[s];
    struct nv_value *res;
    struct nv_value *ptr = NULL;
-   unsigned idx, swz, dim_idx, ind_idx, ind_swz;
+   unsigned idx, swz, dim_idx, ind_idx, ind_swz, sgn;
    ubyte type = infer_src_type(insn->Instruction.Opcode);
 
    idx = src->Register.Index;
@@ -1157,10 +1157,15 @@ emit_fetch(struct bld_context *bld, const struct tgsi_full_instruction *insn,
    if (!res)
       return bld_undef(bld, NV_FILE_GPR);
 
+   sgn = tgsi_util_get_full_src_register_sign_mode(src, chan);
+
    if (insn->Instruction.Opcode != TGSI_OPCODE_MOV)
       res->reg.as_type = type;
+   else
+   if (sgn != TGSI_UTIL_SIGN_KEEP) /* apparently "MOV A, -B" assumes float */
+      res->reg.as_type = NV_TYPE_F32;
 
-   switch (tgsi_util_get_full_src_register_sign_mode(src, chan)) {
+   switch (sgn) {
    case TGSI_UTIL_SIGN_KEEP:
       break;
    case TGSI_UTIL_SIGN_CLEAR:
-- 
cgit v1.2.3


From 0b8170103c8eaff46b75e89608198b3eb564bc52 Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Mon, 13 Sep 2010 00:59:38 +0200
Subject: nv50: fix indirect CONST access with large or negative offsets

---
 src/gallium/drivers/nv50/nv50_pc_emit.c    | 6 ++++--
 src/gallium/drivers/nv50/nv50_tgsi_to_nc.c | 9 ++++++++-
 2 files changed, 12 insertions(+), 3 deletions(-)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_pc_emit.c b/src/gallium/drivers/nv50/nv50_pc_emit.c
index 8c64b19875..1eb44741f1 100644
--- a/src/gallium/drivers/nv50/nv50_pc_emit.c
+++ b/src/gallium/drivers/nv50/nv50_pc_emit.c
@@ -696,7 +696,9 @@ emit_add_b32(struct nv_pc *pc, struct nv_instruction *i)
 static void
 emit_add_a16(struct nv_pc *pc, struct nv_instruction *i)
 {
-   pc->emit[0] = 0xd0000001 | (get_immd_u32(i->src[0]) << 9);
+   int s = (i->opcode == NV_OP_MOV) ? 0 : 1;
+
+   pc->emit[0] = 0xd0000001 | ((uint16_t)get_immd_u32(i->src[s]) << 9);
    pc->emit[1] = 0x20000000;
 
    pc->emit[0] |= (DREG(i->def[0])->id + 1) << 2;
@@ -704,7 +706,7 @@ emit_add_a16(struct nv_pc *pc, struct nv_instruction *i)
    set_pred(pc, i);
 
    if (i->src[1])
-      set_a16_bits(pc, SREG(i->src[1])->id);
+      set_a16_bits(pc, SREG(i->src[1])->id + 1);
 }
 
 static void
diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
index 54d6fb960f..a2b6901c81 100644
--- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
+++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
@@ -665,6 +665,7 @@ bld_get_address(struct bld_context *bld, int id, struct nv_value *indirect)
 {
    int i;
    struct nv_instruction *nvi;
+   struct nv_value *val;
 
    for (i = 0; i < 4; ++i) {
       if (!bld->saved_addr[i][0])
@@ -677,7 +678,13 @@ bld_get_address(struct bld_context *bld, int id, struct nv_value *indirect)
    }
    i &= 3;
 
-   bld->saved_addr[i][0] = bld_load_imm_u32(bld, id);
+   val = bld_imm_u32(bld, id);
+   if (indirect)
+      val = bld_insn_2(bld, NV_OP_ADD, indirect, val);
+   else
+      val = bld_insn_1(bld, NV_OP_MOV, val);
+
+   bld->saved_addr[i][0] = val;
    bld->saved_addr[i][0]->reg.file = NV_FILE_ADDR;
    bld->saved_addr[i][0]->reg.type = NV_TYPE_U16;
    bld->saved_addr[i][1] = indirect;
-- 
cgit v1.2.3


From 60f34e9f60c288a67132d91a82ec66378eb318ad Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Mon, 13 Sep 2010 17:04:48 +0200
Subject: nv50: fix TXP depth comparison value

---
 src/gallium/drivers/nv50/nv50_tgsi_to_nc.c | 38 +++++++++++++++++-------------
 1 file changed, 22 insertions(+), 16 deletions(-)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
index a2b6901c81..90d81d3e17 100644
--- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
+++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
@@ -1269,10 +1269,14 @@ get_tex_dim(const struct tgsi_full_instruction *insn, int *dim, int *arg)
 
 static void
 load_proj_tex_coords(struct bld_context *bld,
-                     struct nv_value *t[4], int dim,
+                     struct nv_value *t[4], int dim, int arg,
                      const struct tgsi_full_instruction *insn)
 {
-   int c, mask = 0;
+   int c, mask;
+
+   mask = (1 << dim) - 1;
+   if (arg != dim)
+      mask |= 4; /* depth comparison value */
 
    t[3] = emit_fetch(bld, insn, 0, 3);
 
@@ -1284,17 +1288,19 @@ load_proj_tex_coords(struct bld_context *bld,
 
    t[3] = bld_insn_1(bld, NV_OP_RCP, t[3]);
 
-   for (c = 0; c < dim; ++c) {
+   for (c = 0; c < 4; ++c) {
+      if (!(mask & (1 << c)))
+         continue;
       t[c] = emit_fetch(bld, insn, 0, c);
 
-      if (t[c]->insn->opcode == NV_OP_LINTERP ||
-          t[c]->insn->opcode == NV_OP_PINTERP) {
-         t[c] = bld_duplicate_insn(bld, t[c]->insn);
-         t[c]->insn->opcode = NV_OP_PINTERP;
-         nv_reference(bld->pc, &t[c]->insn->src[1], t[3]);
-      } else {
-         mask |= 1 << c;
-      }
+      if (t[c]->insn->opcode != NV_OP_LINTERP &&
+          t[c]->insn->opcode != NV_OP_PINTERP)
+         continue;
+      t[c] = bld_duplicate_insn(bld, t[c]->insn);
+      t[c]->insn->opcode = NV_OP_PINTERP;
+      nv_reference(bld->pc, &t[c]->insn->src[1], t[3]);
+
+      mask &= ~(1 << c);
    }
 
    for (c = 0; mask; ++c, mask >>= 1) {
@@ -1467,10 +1473,13 @@ bld_tex(struct bld_context *bld, struct nv_value *dst0[4],
    get_tex_dim(insn, &dim, &arg);
 
    if (!cube && insn->Instruction.Opcode == TGSI_OPCODE_TXP)
-      load_proj_tex_coords(bld, t, dim, insn);
-   else
+      load_proj_tex_coords(bld, t, dim, arg, insn);
+   else {
       for (c = 0; c < dim; ++c)
          t[c] = emit_fetch(bld, insn, 0, c);
+      if (arg != dim)
+         t[dim] = emit_fetch(bld, insn, 0, 2);
+   }
 
    if (cube) {
       assert(dim >= 3);
@@ -1485,9 +1494,6 @@ bld_tex(struct bld_context *bld, struct nv_value *dst0[4],
          t[c] = bld_insn_2(bld, NV_OP_MUL, t[c], s[0]);
    }
 
-   if (arg != dim)
-      t[dim] = emit_fetch(bld, insn, 0, 2);
-
    if (opcode == NV_OP_TXB || opcode == NV_OP_TXL) {
       t[arg++] = emit_fetch(bld, insn, 0, 3);
 
-- 
cgit v1.2.3


From 16d8f5fee51a4a86f5f0c15228b48d5668ab2be2 Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Mon, 13 Sep 2010 21:13:36 +0200
Subject: nv50: consider address register in reload elimination

---
 src/gallium/drivers/nv50/nv50_pc_optimize.c | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c
index fba60984ac..3ff6db7dd2 100644
--- a/src/gallium/drivers/nv50/nv50_pc_optimize.c
+++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c
@@ -732,7 +732,7 @@ nv_pass_lower_arith(struct nv_pass *ctx, struct nv_basic_block *b)
 
 struct load_record {
    struct load_record *next;
-   uint64_t data;
+   uint64_t data[2];
    struct nv_value *value;
 };
 
@@ -757,7 +757,7 @@ nv_pass_reload_elim(struct nv_pass_reld_elim *ctx, struct nv_basic_block *b)
 {
    struct load_record **rec, *it;
    struct nv_instruction *ld, *next;
-   uint64_t data;
+   uint64_t data[2];
    struct nv_value *val;
    int j;
 
@@ -769,11 +769,13 @@ nv_pass_reload_elim(struct nv_pass_reld_elim *ctx, struct nv_basic_block *b)
       rec = NULL;
 
       if (ld->opcode == NV_OP_LINTERP || ld->opcode == NV_OP_PINTERP) {
-         data = val->reg.id;
+         data[0] = val->reg.id;
+         data[1] = 0;
          rec = &ctx->mem_v;
       } else
       if (ld->opcode == NV_OP_LDA) {
-         data = val->reg.id;
+         data[0] = val->reg.id;
+         data[1] = ld->src[4] ? ld->src[4]->value->n : ~0ULL;
          if (val->reg.file >= NV_FILE_MEM_C(0) &&
              val->reg.file <= NV_FILE_MEM_C(15))
             rec = &ctx->mem_c[val->reg.file - NV_FILE_MEM_C(0)];
@@ -785,7 +787,8 @@ nv_pass_reload_elim(struct nv_pass_reld_elim *ctx, struct nv_basic_block *b)
             rec = &ctx->mem_l;
       } else
       if ((ld->opcode == NV_OP_MOV) && (val->reg.file == NV_FILE_IMM)) {
-         data = val->reg.imm.u32;
+         data[0] = val->reg.imm.u32;
+         data[1] = 0;
          rec = &ctx->imm;
       }
 
@@ -793,7 +796,7 @@ nv_pass_reload_elim(struct nv_pass_reld_elim *ctx, struct nv_basic_block *b)
          continue;
 
       for (it = *rec; it; it = it->next)
-         if (it->data == data)
+         if (it->data[0] == data[0] && it->data[1] == data[1])
             break;
 
       if (it) {
@@ -807,7 +810,8 @@ nv_pass_reload_elim(struct nv_pass_reld_elim *ctx, struct nv_basic_block *b)
             continue;
          it = &ctx->pool[ctx->alloc++];
          it->next = *rec;
-         it->data = data;
+         it->data[0] = data[0];
+         it->data[1] = data[1];
          it->value = ld->def[0];
          *rec = it;
       }
-- 
cgit v1.2.3


From c46e7a05e501e02b10dbc06772c0ef01308f60d5 Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Wed, 15 Sep 2010 13:59:09 +0200
Subject: nv50: improve and fix modifier folding optimization

Execute before folding loads, because we don't check if it's legal
in lower_mods.
Ensure that a value's insn pointer is updated when transferring it
to a different instruction.
---
 src/gallium/drivers/nv50/nv50_pc.c          |  1 +
 src/gallium/drivers/nv50/nv50_pc.h          |  1 -
 src/gallium/drivers/nv50/nv50_pc_emit.c     |  5 +++
 src/gallium/drivers/nv50/nv50_pc_optimize.c | 65 ++++++++++++++++++-----------
 4 files changed, 46 insertions(+), 26 deletions(-)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_pc.c b/src/gallium/drivers/nv50/nv50_pc.c
index 78aca8fd56..2706d88779 100644
--- a/src/gallium/drivers/nv50/nv50_pc.c
+++ b/src/gallium/drivers/nv50/nv50_pc.c
@@ -104,6 +104,7 @@ nv50_nvi_can_load(struct nv_instruction *nvi, int s, struct nv_value *value)
    case NV_OP_FLOOR:
    case NV_OP_TRUNC:
    case NV_OP_CVT:
+   case NV_OP_NEG:
    case NV_OP_MAD:
    case NV_OP_MUL:
    case NV_OP_SAT:
diff --git a/src/gallium/drivers/nv50/nv50_pc.h b/src/gallium/drivers/nv50/nv50_pc.h
index 8f15a82026..92c6be5f6e 100644
--- a/src/gallium/drivers/nv50/nv50_pc.h
+++ b/src/gallium/drivers/nv50/nv50_pc.h
@@ -220,7 +220,6 @@ struct nv_value {
 
 struct nv_ref {
    struct nv_value *value;
-   struct nv_instruction *insn;
    ubyte mod;
    ubyte typecast;
    ubyte flags; /* not used yet */
diff --git a/src/gallium/drivers/nv50/nv50_pc_emit.c b/src/gallium/drivers/nv50/nv50_pc_emit.c
index 1eb44741f1..137a531dd6 100644
--- a/src/gallium/drivers/nv50/nv50_pc_emit.c
+++ b/src/gallium/drivers/nv50/nv50_pc_emit.c
@@ -654,6 +654,8 @@ emit_add_f32(struct nv_pc *pc, struct nv_instruction *i)
 {
    pc->emit[0] = 0xb0000000;
 
+   assert(!((i->src[0]->mod | i->src[1]->mod) & NV_MOD_ABS));
+
    if (SFILE(i, 1) == NV_FILE_IMM) {
       emit_form_IMM(pc, i, 0);
 
@@ -665,6 +667,9 @@ emit_add_f32(struct nv_pc *pc, struct nv_instruction *i)
 
       if (i->src[0]->mod & NV_MOD_NEG) pc->emit[1] |= 1 << 26;
       if (i->src[1]->mod & NV_MOD_NEG) pc->emit[1] |= 1 << 27;
+
+      if (i->saturate)
+         pc->emit[1] |= 0x20000000;
    } else {
       emit_form_MUL(pc, i);
 
diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c
index 3ff6db7dd2..921ed15691 100644
--- a/src/gallium/drivers/nv50/nv50_pc_optimize.c
+++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c
@@ -336,6 +336,7 @@ nv_pass_fold_stores(struct nv_pass *ctx, struct nv_basic_block *b)
          continue;
 
       nvi->def[0] = sti->def[0];
+      nvi->def[0]->insn = nvi;
       nvi->fixed = sti->fixed;
 
       nv_nvi_delete(sti);
@@ -374,7 +375,7 @@ nv_pass_fold_loads(struct nv_pass *ctx, struct nv_basic_block *b)
          if (j == 0 && ld->src[4]) /* can't load shared mem */
             continue;
 
-         /* fold it ! */ /* XXX: ref->insn */
+         /* fold it ! */
          nv_reference(ctx->pc, &nvi->src[j], ld->src[0]->value);
          if (ld->src[4])
             nv_reference(ctx->pc, &nvi->src[4], ld->src[4]->value);
@@ -388,6 +389,7 @@ nv_pass_fold_loads(struct nv_pass *ctx, struct nv_basic_block *b)
    return 0;
 }
 
+/* NOTE: Assumes loads have not yet been folded. */
 static int
 nv_pass_lower_mods(struct nv_pass *ctx, struct nv_basic_block *b)
 {
@@ -402,14 +404,7 @@ nv_pass_lower_mods(struct nv_pass *ctx, struct nv_basic_block *b)
          nvi->src[1]->mod ^= NV_MOD_NEG;
       }
 
-      /* should not put any modifiers on NEG and ABS */
-      assert(nvi->opcode != NV_MOD_NEG || !nvi->src[0]->mod);
-      assert(nvi->opcode != NV_MOD_ABS || !nvi->src[0]->mod);
-
-      for (j = 0; j < 4; ++j) {
-         if (!nvi->src[j])
-            break;
-
+      for (j = 0; j < 4 && nvi->src[j]; ++j) {
          mi = nvi->src[j]->value->insn;
          if (!mi)
             continue;
@@ -421,16 +416,32 @@ nv_pass_lower_mods(struct nv_pass *ctx, struct nv_basic_block *b)
          if (mi->opcode == NV_OP_ABS) mod = NV_MOD_ABS;
          else
             continue;
+         assert(!(mod & mi->src[0]->mod & NV_MOD_NEG));
+
+         mod |= mi->src[0]->mod;
+
+         if (mi->flags_def || mi->flags_src)
+            continue;
 
-         if (nvi->opcode == NV_OP_ABS)
+         if ((nvi->opcode == NV_OP_ABS) || (nvi->src[j]->mod & NV_MOD_ABS)) {
+            /* abs neg [abs] = abs */
             mod &= ~(NV_MOD_NEG | NV_MOD_ABS);
-         else
-         if (nvi->opcode == NV_OP_NEG && mod == NV_MOD_NEG) {
-            nvi->opcode = NV_OP_MOV;
+         } else
+         if ((nvi->opcode == NV_OP_NEG) && (mod & NV_MOD_NEG)) {
+            /* neg as opcode and modifier on same insn cannot occur */
+            /* neg neg abs = abs, neg neg = identity */
+            assert(j == 0);
+            if (mod & NV_MOD_ABS)
+               nvi->opcode = NV_OP_ABS;
+            else
+            if (nvi->flags_def)
+               nvi->opcode = NV_OP_CVT;
+            else
+               nvi->opcode = NV_OP_MOV;
             mod = 0;
          }
 
-         if (!(nv50_supported_src_mods(nvi->opcode, j) & mod))
+         if ((nv50_supported_src_mods(nvi->opcode, j) & mod) != mod)
             continue;
 
          nv_reference(ctx->pc, &nvi->src[j], mi->src[0]->value);
@@ -441,11 +452,15 @@ nv_pass_lower_mods(struct nv_pass *ctx, struct nv_basic_block *b)
       if (nvi->opcode == NV_OP_SAT) {
          mi = nvi->src[0]->value->insn;
 
-         if ((mi->opcode == NV_OP_MAD) && !mi->flags_def) {
-            mi->saturate = 1;
-            mi->def[0] = nvi->def[0];
-            nv_nvi_delete(nvi);
-         }
+         if (mi->opcode != NV_OP_ADD && mi->opcode != NV_OP_MAD)
+            continue; /* only fold SAT into an ADD or MAD source */
+         if (mi->flags_def || mi->def[0]->refc > 1)
+            continue; /* mi defines flags or its result has other users */
+
+         mi->saturate = 1;
+         mi->def[0] = nvi->def[0];
+         mi->def[0]->insn = mi; /* keep the value's insn pointer in sync */
+         nv_nvi_delete(nvi);
       }
    }
    DESCEND_ARBITRARY(j, nv_pass_lower_mods);
@@ -956,7 +971,7 @@ nv_pass_flatten(struct nv_pass *ctx, struct nv_basic_block *b)
          for (n1 = 0, nvi = b->out[1]->entry; nvi; nvi = nvi->next, ++n1)
             if (!nv50_nvi_can_predicate(nvi))
                break;
-#ifdef NV50_PC_DEBUG
+#ifdef NV50PC_DEBUG
          if (nvi) {
             debug_printf("cannot predicate: "); nv_print_instruction(nvi);
          }
@@ -1081,6 +1096,11 @@ nv_pc_pass0(struct nv_pc *pc, struct nv_basic_block *root)
    if (ret)
       return ret;
 
+   pc->pass_seq++;
+   ret = nv_pass_lower_mods(&pass, root);
+   if (ret)
+      return ret;
+
    pc->pass_seq++;
    ret = nv_pass_fold_loads(&pass, root);
    if (ret)
@@ -1106,11 +1126,6 @@ nv_pc_pass0(struct nv_pc *pc, struct nv_basic_block *root)
    if (ret)
       return ret;
 
-   pc->pass_seq++;
-   ret = nv_pass_lower_mods(&pass, root);
-   if (ret)
-      return ret;
-
    dce.pc = pc;
    do {
       dce.removed = 0;
-- 
cgit v1.2.3


From 84d170bbcef8e26017ac8e2f3bacbaeb20f889d3 Mon Sep 17 00:00:00 2001
From: Christoph Bumiller <e0425955@student.tuwien.ac.at>
Date: Wed, 15 Sep 2010 15:21:41 +0200
Subject: nv50: put low limit on REG_ALLOC_TEMP and FP_RESULT_COUNT

---
 src/gallium/drivers/nv50/nv50_pc.c      | 4 ++--
 src/gallium/drivers/nv50/nv50_program.c | 3 +++
 2 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'src')

diff --git a/src/gallium/drivers/nv50/nv50_pc.c b/src/gallium/drivers/nv50/nv50_pc.c
index 2706d88779..bb464ec4c9 100644
--- a/src/gallium/drivers/nv50/nv50_pc.c
+++ b/src/gallium/drivers/nv50/nv50_pc.c
@@ -539,8 +539,8 @@ nv50_generate_code(struct nv50_translation_info *ti)
    ti->p->immd_size = pc->immd_count * 4;
    ti->p->immd = pc->immd_buf;
 
-   /* highest 16 bit reg to num of 32 bit regs */
-   ti->p->max_gpr = (pc->max_reg[NV_FILE_GPR] >> 1) + 1;
+   /* highest 16 bit reg to num of 32 bit regs, limit to >= 4 */
+   ti->p->max_gpr = MAX2(4, (pc->max_reg[NV_FILE_GPR] >> 1) + 1);
 
    ti->p->fixups = pc->fixups;
    ti->p->num_fixups = pc->num_fixups;
diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c
index 24952f70f1..b3600f7ba7 100644
--- a/src/gallium/drivers/nv50/nv50_program.c
+++ b/src/gallium/drivers/nv50/nv50_program.c
@@ -514,6 +514,9 @@ nv50_fragprog_prepare(struct nv50_translation_info *ti)
    if (depr < p->out_nr) {
       p->out[depr].mask = 0x4;
       p->out[depr].hw = ti->output_map[depr][2] = p->max_out++;
+   } else {
+      /* allowed values are 1, 4, 5, 8, 9, ... */
+      p->max_out = MAX2(4, p->max_out);
    }
 
    return 0;
-- 
cgit v1.2.3