diff options
| -rw-r--r-- | src/gallium/drivers/nv50/Makefile | 11 | ||||
| -rw-r--r-- | src/gallium/drivers/nv50/nv50_pc.c | 433 | ||||
| -rw-r--r-- | src/gallium/drivers/nv50/nv50_pc.h | 431 | ||||
| -rw-r--r-- | src/gallium/drivers/nv50/nv50_pc_emit.c | 1139 | ||||
| -rw-r--r-- | src/gallium/drivers/nv50/nv50_pc_optimize.c | 717 | ||||
| -rw-r--r-- | src/gallium/drivers/nv50/nv50_pc_print.c | 287 | ||||
| -rw-r--r-- | src/gallium/drivers/nv50/nv50_pc_regalloc.c | 973 | ||||
| -rw-r--r-- | src/gallium/drivers/nv50/nv50_program.c | 4973 | ||||
| -rw-r--r-- | src/gallium/drivers/nv50/nv50_program.h | 149 | ||||
| -rw-r--r-- | src/gallium/drivers/nv50/nv50_push.c | 2 | ||||
| -rw-r--r-- | src/gallium/drivers/nv50/nv50_shader_state.c | 619 | ||||
| -rw-r--r-- | src/gallium/drivers/nv50/nv50_state.c | 3 | ||||
| -rw-r--r-- | src/gallium/drivers/nv50/nv50_state_validate.c | 9 | ||||
| -rw-r--r-- | src/gallium/drivers/nv50/nv50_tgsi_to_nc.c | 1266 | ||||
| -rw-r--r-- | src/gallium/drivers/nv50/nv50_vbo.c | 4 | 
15 files changed, 6407 insertions, 4609 deletions
diff --git a/src/gallium/drivers/nv50/Makefile b/src/gallium/drivers/nv50/Makefile index e31e6f8662..3943a9e257 100644 --- a/src/gallium/drivers/nv50/Makefile +++ b/src/gallium/drivers/nv50/Makefile @@ -10,7 +10,6 @@ C_SOURCES = \  	nv50_draw.c \  	nv50_miptree.c \  	nv50_query.c \ -	nv50_program.c \  	nv50_resource.c \  	nv50_screen.c \  	nv50_state.c \ @@ -19,6 +18,14 @@ C_SOURCES = \  	nv50_tex.c \  	nv50_transfer.c \  	nv50_vbo.c \ -	nv50_push.c +	nv50_push.c \ +	nv50_program.c \ +	nv50_shader_state.c \ +	nv50_pc.c \ +	nv50_pc_print.c \ +	nv50_pc_emit.c \ +	nv50_tgsi_to_nc.c \ +	nv50_pc_optimize.c \ +	nv50_pc_regalloc.c  include ../../Makefile.template diff --git a/src/gallium/drivers/nv50/nv50_pc.c b/src/gallium/drivers/nv50/nv50_pc.c new file mode 100644 index 0000000000..8aba0a32b7 --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_pc.c @@ -0,0 +1,433 @@ + +#include "nv50_pc.h" +#include "nv50_program.h" + +#include <stdio.h> + +/* returns TRUE if operands 0 and 1 can be swapped */ +boolean +nv_op_commutative(uint opcode) +{ +   switch (opcode) { +   case NV_OP_ADD: +   case NV_OP_MUL: +   case NV_OP_MAD: +   case NV_OP_AND: +   case NV_OP_OR: +   case NV_OP_XOR: +   case NV_OP_MIN: +   case NV_OP_MAX: +   case NV_OP_SAD: +     return TRUE; +   default: +     return FALSE; +   } +} + +/* return operand to which the address register applies */ +int +nv50_indirect_opnd(struct nv_instruction *i) +{ +   if (!i->src[4]) +      return -1; + +   switch (i->opcode) { +   case NV_OP_MOV: +   case NV_OP_LDA: +      return 0; +   default: +      return 1; +   } +} + +boolean +nv50_nvi_can_use_imm(struct nv_instruction *nvi, int s) +{ +   if (nvi->flags_src || nvi->flags_def) +      return FALSE; + +   switch (nvi->opcode) { +   case NV_OP_ADD: +   case NV_OP_MUL: +   case NV_OP_AND: +   case NV_OP_OR: +   case NV_OP_XOR: +   case NV_OP_SHL: +   case NV_OP_SHR: +      return (s == 1) && (nvi->def[0]->reg.file == NV_FILE_GPR); +   case NV_OP_MOV: +      assert(s == 0); +  
    return (nvi->def[0]->reg.file == NV_FILE_GPR); +   default: +      return FALSE; +   } +} + +boolean +nv50_nvi_can_load(struct nv_instruction *nvi, int s, struct nv_value *value) +{ +   switch (nvi->opcode) { +   case NV_OP_ABS: +   case NV_OP_ADD: +   case NV_OP_CEIL: +   case NV_OP_FLOOR: +   case NV_OP_TRUNC: +   case NV_OP_CVT: +   case NV_OP_MAD: +   case NV_OP_MUL: +   case NV_OP_SAT: +   case NV_OP_SUB: +   case NV_OP_MAX: +   case NV_OP_MIN: +      if (s == 0 && (value->reg.file == NV_FILE_MEM_S || +                     value->reg.file == NV_FILE_MEM_P)) +         return TRUE; +      if (s == 1 && +          value->reg.file >= NV_FILE_MEM_C(0) && +          value->reg.file <= NV_FILE_MEM_C(15)) +         return TRUE; +      if (s == 2 && nvi->src[1]->value->reg.file == NV_FILE_GPR) +         return TRUE; +      return FALSE; +   case NV_OP_MOV: +      assert(s == 0); +      return TRUE; +   default: +      return FALSE; +   } +} + +ubyte +nv50_supported_src_mods(uint opcode, int s) +{ +   switch (opcode) { +   case NV_OP_ABS: +      return NV_MOD_NEG | NV_MOD_ABS; /* obviously */ +   case NV_OP_ADD: +   case NV_OP_MUL: +   case NV_OP_MAD: +      return NV_MOD_NEG; +   case NV_OP_DFDX: +   case NV_OP_DFDY: +      assert(s == 0); +      return NV_MOD_NEG; +   case NV_OP_MAX: +   case NV_OP_MIN: +      return NV_MOD_ABS; +   case NV_OP_CVT: +   case NV_OP_LG2: +   case NV_OP_NEG: +   case NV_OP_PREEX2: +   case NV_OP_PRESIN: +   case NV_OP_RCP: +   case NV_OP_RSQ: +      return NV_MOD_ABS | NV_MOD_NEG; +   default: +      return 0; +   } +} + +int +nv_nvi_refcount(struct nv_instruction *nvi) +{ +   int i, rc; + +   rc = nvi->flags_def ? 
nvi->flags_def->refc : 0; + +   for (i = 0; i < 4; ++i) { +      if (!nvi->def[i]) +         return rc; +      rc += nvi->def[i]->refc; +   } +   return rc; +} + +/* Release all nv_ref storage owned by pc: new_ref() CALLOCs refs in chunks of 64 + * (chunk base stored at every 64th slot) and REALLOCs the pc->refs pointer table, + * so free each chunk and then the table itself. */ +static void +nv_pc_free_refs(struct nv_pc *pc) +{ +   int i; +   for (i = 0; i < pc->num_refs; i += 64) +      FREE(pc->refs[i]); +   FREE(pc->refs); /* the pointer table was leaked before */ +} + +void +nv_print_program(struct nv_basic_block *b) +{ +   struct nv_instruction *i = b->phi; + +   b->priv = 0; + +   debug_printf("=== BB %i ", b->id); +   if (b->out[0]) +      debug_printf("(--0> %i) ", b->out[0]->id); +   if (b->out[1]) +      debug_printf("(--1> %i) ", b->out[1]->id); +   debug_printf("===\n"); + +   if (!i) +      i = b->entry; +   for (; i; i = i->next) +      nv_print_instruction(i); + +   if (!b->out[0]) { +      debug_printf("END\n\n"); +      return; +   } +   if (!b->out[1] && ++(b->out[0]->priv) != b->out[0]->num_in) +      return; + +   if (b->out[0] != b) +      nv_print_program(b->out[0]); + +   if (b->out[1] && b->out[1] != b) +      nv_print_program(b->out[1]); +} + +static INLINE void +nvcg_show_bincode(struct nv_pc *pc) +{ +   int i; + +   for (i = 0; i < pc->bin_size / 4; ++i) +      debug_printf("0x%08x ", pc->emit[i]); +   debug_printf("\n"); +} + +static int +nv50_emit_program(struct nv_pc *pc) +{ +   uint32_t *code = pc->emit; +   int n; + +   debug_printf("emitting program: size = %u\n", pc->bin_size); + +   for (n = 0; n < pc->num_blocks; ++n) { +      struct nv_instruction *i; +      struct nv_basic_block *b = pc->bb_list[n]; + +      for (i = b->entry; i; i = i->next) { +         nv50_emit_instruction(pc, i); + +         pc->bin_pos += 1 + (pc->emit[0] & 1); +         pc->emit += 1 + (pc->emit[0] & 1); +      } +   } +   assert(pc->emit == &code[pc->bin_size / 4]); + +   /* XXX: we can do better than this ... 
*/ +   if ((pc->emit[-1] & 3) == 3) { +      pc->emit[0] = 0xf0000001; +      pc->emit[1] = 0xe0000000; +      pc->bin_size += 8; +   } + +   pc->emit = code; +   code[pc->bin_size / 4 - 1] |= 1; + +   nvcg_show_bincode(pc); + +   return 0; +} + +int +nv50_generate_code(struct nv50_translation_info *ti) +{ +   struct nv_pc *pc; +   int ret; + +   pc = CALLOC_STRUCT(nv_pc); +   if (!pc) +      return 1; + +   ret = nv50_tgsi_to_nc(pc, ti); +   if (ret) +      goto out; + +   /* optimization */ +   ret = nv_pc_exec_pass0(pc); +   if (ret) +      goto out; + +   /* register allocation */ +   ret = nv_pc_exec_pass1(pc); +   if (ret) +      goto out; + +   /* prepare for emission */ +   ret = nv_pc_exec_pass2(pc); +   if (ret) +      goto out; + +   pc->emit = CALLOC(pc->bin_size / 4 + 2, 4); +   if (!pc->emit) { +      ret = 3; +      goto out; +   } +   ret = nv50_emit_program(pc); +   if (ret) +      goto out; + +   ti->p->code_size = pc->bin_size; +   ti->p->code = pc->emit; + +   ti->p->immd_size = pc->immd_count * 4; +   ti->p->immd = pc->immd_buf; + +   ti->p->max_gpr = (pc->max_reg[NV_FILE_GPR] + 1) >> 1; +   ti->p->max_gpr++; + +   ti->p->fixups = pc->fixups; +   ti->p->num_fixups = pc->num_fixups; + +   debug_printf("SHADER TRANSLATION - %s\n", ret ? 
"failure" : "success"); + +out: +   nv_pc_free_refs(pc); +   if (ret) { +      if (pc->emit) +         free(pc->emit); +      if (pc->immd_buf) +         free(pc->immd_buf); +      if (pc->fixups) +         free(pc->fixups); +   } +   free(pc); + +   return ret; +} + +static void +nvbb_insert_phi(struct nv_basic_block *b, struct nv_instruction *i) +{ +   if (!b->phi) { +      i->prev = NULL; +      b->phi = i; +      i->next = b->entry; +      if (b->entry) { +         assert(!b->entry->prev && b->exit); +         b->entry->prev = i; +      } else { +         b->entry = i; +	 b->exit = i; +      } +   } else { +      assert(b->entry); +      if (b->entry->opcode == NV_OP_PHI) { /* insert after entry */ +	 assert(b->entry == b->exit); +         b->entry->next = i; +         i->prev = b->entry; +         b->entry = i; +	 b->exit = i; +      } else { /* insert before entry */ +         assert(b->entry->prev && b->exit); +         i->next = b->entry; +         i->prev = b->entry->prev; +         b->entry->prev = i; +         i->prev->next = i; +      } +   } +} + +void +nvbb_insert_tail(struct nv_basic_block *b, struct nv_instruction *i) +{ +   if (i->opcode == NV_OP_PHI) { +      nvbb_insert_phi(b, i); +   } else { +      i->prev = b->exit; +      if (b->exit) +         b->exit->next = i; +      b->exit = i; +      if (!b->entry) +         b->entry = i; +      else +      if (i->prev && i->prev->opcode == NV_OP_PHI) +         b->entry = i; +   } + +   i->bb = b; +   b->num_instructions++; +} + +void +nv_nvi_delete(struct nv_instruction *nvi) +{ +   struct nv_basic_block *b = nvi->bb; +   int j; + +   debug_printf("REM: "); nv_print_instruction(nvi); + +   for (j = 0; j < 4; ++j) { +      if (!nvi->src[j]) +         break; +      --(nvi->src[j]->value->refc); +      nvi->src[j] = NULL; +   }	        + +   if (nvi->next) +      nvi->next->prev = nvi->prev; +   else { +      assert(nvi == b->exit); +      b->exit = nvi->prev; +   } + +   if (nvi->prev) +      
nvi->prev->next = nvi->next; + +   if (nvi == b->entry) { +      assert(nvi->opcode != NV_OP_PHI || !nvi->next); + +      if (!nvi->next || (nvi->opcode == NV_OP_PHI)) +         b->entry = nvi->prev; +      else +         b->entry = nvi->next; +   } + +   if (nvi == b->phi) { +      assert(!nvi->prev); +      if (nvi->opcode != NV_OP_PHI) +         debug_printf("WARN: b->phi points to non-PHI instruction\n"); + +      if (!nvi->next || nvi->next->opcode != NV_OP_PHI) +         b->phi = NULL; +      else +         b->phi = nvi->next; +   } +} + +void +nv_nvi_permute(struct nv_instruction *i1, struct nv_instruction *i2) +{ +   struct nv_basic_block *b = i1->bb; + +   assert(i1->opcode != NV_OP_PHI && +          i2->opcode != NV_OP_PHI); +   assert(i1->next == i2); + +   if (b->exit == i2) +      b->exit = i1; + +   if (b->entry == i1) +      b->entry = i2; + +   i2->prev = i1->prev; +   i1->next = i2->next; +   i2->next = i1; +   i1->prev = i2; + +   if (i2->prev) +      i2->prev->next = i2; +   if (i1->next) +      i1->next->prev = i1; +} + +void nvbb_attach_block(struct nv_basic_block *parent, struct nv_basic_block *b) +{ +   if (parent->out[0]) { +      assert(!parent->out[1]); +      parent->out[1] = b; +   } else +      parent->out[0] = b; + +   b->in[b->num_in++] = parent; +} diff --git a/src/gallium/drivers/nv50/nv50_pc.h b/src/gallium/drivers/nv50/nv50_pc.h new file mode 100644 index 0000000000..3ab48d0afd --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_pc.h @@ -0,0 +1,431 @@ +/*************************************************************************/ +/* Copyright (C) 2010 I                                                  */ +/*                                                                       */ +/* This program is free software: you can redistribute it and/or modify  */ +/* it under the terms of the GNU General Public License as published by  */ +/* the Free Software Foundation, either version 3 of the License, or     */ +/* (at your option) any 
later version.                                   */ +/*                                                                       */ +/* This program is distributed in the hope that it will be useful,       */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of        */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         */ +/* GNU General Public License for more details.                          */ +/*                                                                       */ +/* You should have received a copy of the GNU General Public License     */ +/* along with this program.  If not, see <http://www.gnu.org/licenses/>. */ +/*************************************************************************/ + +#ifndef __NV50_COMPILER_H__ +#define __NV50_COMPILER_H__ + +#include "pipe/p_defines.h" +#include "util/u_inlines.h" +#include "util/u_memory.h" + +#define NV_OP_PHI       0 +#define NV_OP_EXTRACT   1 +#define NV_OP_COMBINE   2 +#define NV_OP_LDA       3 +#define NV_OP_STA       4 +#define NV_OP_MOV       5 +#define NV_OP_ADD       6 +#define NV_OP_SUB       7 +#define NV_OP_NEG       8 +#define NV_OP_MUL       9 +#define NV_OP_MAD       10 +#define NV_OP_CVT       11 +#define NV_OP_SAT       12 +#define NV_OP_NOT       13 +#define NV_OP_AND       14 +#define NV_OP_OR        15 +#define NV_OP_XOR       16 +#define NV_OP_SHL       17 +#define NV_OP_SHR       18 +#define NV_OP_RCP       19 +/* gap */ +#define NV_OP_RSQ       21 +#define NV_OP_LG2       22 +#define NV_OP_SIN       23 +#define NV_OP_COS       24 +#define NV_OP_EX2       25 +#define NV_OP_PRESIN    26 +#define NV_OP_PREEX2    27 +#define NV_OP_MIN       28 +#define NV_OP_MAX       29 +#define NV_OP_SET       30 +#define NV_OP_SAD       31 +#define NV_OP_KIL       32 +#define NV_OP_BRA       33 +#define NV_OP_CALL      34 +#define NV_OP_RET       35 +#define NV_OP_BREAK     36 +#define NV_OP_BREAKADDR 37 +#define NV_OP_JOINAT    38 +#define NV_OP_TEX       39 +#define 
NV_OP_TXB       40 +#define NV_OP_TXL       41 +#define NV_OP_TXF       42 +#define NV_OP_TXQ       43 +#define NV_OP_DFDX      44 +#define NV_OP_DFDY      45 +#define NV_OP_QUADOP    46 +#define NV_OP_LINTERP   47 +#define NV_OP_PINTERP   48 +#define NV_OP_ABS       49 +#define NV_OP_CEIL      50 +#define NV_OP_FLOOR     51 +#define NV_OP_TRUNC     52 +#define NV_OP_NOP       53 +#define NV_OP_SELECT    54 +#define NV_OP_EXPORT    55 +#define NV_OP_COUNT     56 + +#define NV_FILE_GPR      0 +#define NV_FILE_OUT      1 +#define NV_FILE_ADDR     2 +#define NV_FILE_FLAGS    3 +#define NV_FILE_IMM      16 +#define NV_FILE_MEM_S    32 +#define NV_FILE_MEM_P    33 +#define NV_FILE_MEM_V    34 +#define NV_FILE_MEM_L    48 +#define NV_FILE_MEM_G(i) (64 + i) +#define NV_FILE_MEM_C(i) (80 + i) + +#define NV_MOD_NEG 1 +#define NV_MOD_ABS 2 +#define NV_MOD_NOT 4 +#define NV_MOD_SAT 8 + +#define NV_TYPE_U8  0x00 +#define NV_TYPE_S8  0x01 +#define NV_TYPE_U16 0x02 +#define NV_TYPE_S16 0x03 +#define NV_TYPE_U32 0x04 +#define NV_TYPE_S32 0x05 +#define NV_TYPE_P32 0x07 +#define NV_TYPE_F32 0x09 +#define NV_TYPE_F64 0x0b +#define NV_TYPE_VEC(x, n) (NV_TYPE_##x | (n << 4)) +#define NV_TYPE_LO  0x00 +#define NV_TYPE_HI  0x80 +#define NV_TYPE_ANY 0xff + +#define NV_TYPE_ISINT(t) ((t) <= 5) +#define NV_TYPE_ISFLT(t) ((t) & 0x08) + +#define NV_CC_FL 0x0 +#define NV_CC_LT 0x1 +#define NV_CC_EQ 0x2 +#define NV_CC_LE 0x3 +#define NV_CC_GT 0x4 +#define NV_CC_NE 0x5 +#define NV_CC_GE 0x6 +#define NV_CC_U  0x8 +#define NV_CC_TR 0xf + +#define NV_PC_MAX_INSTRUCTIONS 2048 +#define NV_PC_MAX_VALUES (NV_PC_MAX_INSTRUCTIONS * 4) + +static INLINE boolean +nv_is_vector_op(uint opcode) +{ +   return (opcode >= NV_OP_TEX) && (opcode <= NV_OP_TXQ); +} + +static INLINE uint +nv_type_order(ubyte type) +{ +   switch (type & 0xf) { +   case NV_TYPE_U8: +   case NV_TYPE_S8: +      return 0; +   case NV_TYPE_U16: +   case NV_TYPE_S16: +      return 1; +   case NV_TYPE_U32: +   case NV_TYPE_F32: +   case 
NV_TYPE_S32: +   case NV_TYPE_P32: +      return 2; +   case NV_TYPE_F64: +      return 3; +   } +   assert(0); +} + +static INLINE uint +nv_type_sizeof(ubyte type) +{ +   if (type & 0xf0) +      return (1 << nv_type_order(type)) * (type >> 4); +   return 1 << nv_type_order(type); +} + +static INLINE uint +nv_type_sizeof_base(ubyte type) +{ +   return 1 << nv_type_order(type); +} + +struct nv_reg { +   int id; +   ubyte file; +   ubyte type; /* type of generating instruction's result */ +   union { +      float f32; +      double f64; +      int32_t s32; +      uint32_t u32; +   } imm; +}; + +struct nv_range { +   struct nv_range *next; +   int bgn; +   int end; +}; + +struct nv_value { +   struct nv_reg reg;  +   struct nv_instruction *insn; +   struct nv_value *join; +   int n; +   struct nv_range *livei; +   int refc; + +   struct nv_value *next; +   struct nv_value *prev; +}; + +struct nv_ref { +   struct nv_value *value; +   struct nv_instruction *insn; +   ubyte mod; +   ubyte typecast; +   ubyte flags; /* not used yet */ +}; + +struct nv_basic_block; + +struct nv_instruction { +   struct nv_instruction *next; +   struct nv_instruction *prev; +   uint opcode; +   int serial; +   struct nv_value *def[4]; +   struct nv_value *flags_def; +   struct nv_ref *src[5]; +   struct nv_ref *flags_src; +   struct nv_basic_block *bb; +   struct nv_basic_block *target; /* target block of control flow insn */ +   ubyte cc; +   ubyte set_cond      : 4; +   ubyte fixed         : 1; /* don't optimize away */ +   ubyte is_terminator : 1; +   ubyte is_join       : 1; +   ubyte is_long       : 1; /* for emission */ +   /* */ +   ubyte saturate : 1; +   ubyte centroid : 1; +   ubyte flat     : 1; +   ubyte padding  : 4; +   ubyte tex_live : 1; +   /* */ +   ubyte tex_t; /* TIC binding */ +   ubyte tex_s; /* TSC binding */ +   ubyte tex_argc : 3; +   ubyte tex_cube : 1; +   ubyte tex_mask : 4; +   /* */ +   ubyte quadop; +}; + +struct nv_basic_block { +   struct nv_instruction 
*entry; /* first non-phi instruction */ +   struct nv_instruction *exit; +   struct nv_instruction *phi; /* very first instruction */ +   int num_instructions; + +   struct nv_basic_block *out[2]; /* no indirect branches -> 2 */ +   struct nv_basic_block **in; +   uint num_in; + +   int id; +   struct nv_basic_block *last_visitor; +   uint priv; +   uint pass_seq; + +   uint32_t bin_pos; /* position, size in emitted code */ +   uint32_t bin_size; + +   uint32_t live_set[NV_PC_MAX_VALUES / 32]; +}; + +#define NV_FIXUP_CFLOW_RELOC 0 +#define NV_FIXUP_PARAM_RELOC 1 + +struct nv_fixup { +   ubyte type; +   ubyte shift; +   uint32_t mask; +   uint32_t data; +   uint32_t offset; +}; + +static INLINE void +nv_fixup_apply(uint32_t *bin, struct nv_fixup *fixup, uint32_t data) +{ +   uint32_t val; + +   val = bin[fixup->offset / 4] & ~fixup->mask; +   data = (fixup->shift < 0) ? (data >> fixup->shift) : (data << fixup->shift); +   val |= (fixup->data + data) & fixup->mask; +   bin[fixup->offset / 4] = val; +} + +struct nv_pc { +   struct nv50_translation_info *ti; + +   struct nv_basic_block *root; +   struct nv_basic_block *current_block; +   struct nv_basic_block *parent_block; + +   int loop_nesting_bound; +   uint pass_seq; + +   struct nv_value values[NV_PC_MAX_VALUES]; +   struct nv_instruction instructions[NV_PC_MAX_INSTRUCTIONS]; +   struct nv_ref **refs; +   struct nv_basic_block **bb_list; +   int num_values; +   int num_instructions; +   int num_refs; +   int num_blocks; + +   int max_reg[4]; + +   uint32_t *immd_buf; /* populated on emit */ +   unsigned immd_count; + +   uint32_t *emit; +   unsigned bin_size; +   unsigned bin_pos; + +   struct nv_fixup *fixups; +   int num_fixups; +}; + +void nvbb_insert_tail(struct nv_basic_block *, struct nv_instruction *); + +static INLINE struct nv_instruction * +new_instruction(struct nv_pc *pc, uint opcode) +{ +   struct nv_instruction *insn; + +   insn = &pc->instructions[pc->num_instructions++]; +   
assert(pc->num_instructions < NV_PC_MAX_INSTRUCTIONS); + +   insn->cc = NV_CC_TR; +   insn->opcode = opcode; + +   nvbb_insert_tail(pc->current_block, insn); +   return insn; +} + +static INLINE struct nv_value * +new_value(struct nv_pc *pc, ubyte file, ubyte type) +{ +   struct nv_value *value = &pc->values[pc->num_values]; + +   assert(pc->num_values < NV_PC_MAX_VALUES - 1); + +   value->n = pc->num_values++; +   value->join = value; +   value->reg.id = -1; +   value->reg.file = file; +   value->reg.type = type; +   return value; +} + +static INLINE struct nv_ref * +new_ref(struct nv_pc *pc, struct nv_value *val) +{ +   int i; +   struct nv_ref *ref; + +   if ((pc->num_refs % 64) == 0) { +      const unsigned old_size = pc->num_refs * sizeof(struct nv_ref *); +      const unsigned new_size = (pc->num_refs + 64) * sizeof(struct nv_ref *); + +	   pc->refs = REALLOC(pc->refs, old_size, new_size); + +	   ref = CALLOC(64, sizeof(struct nv_ref)); +	   for (i = 0; i < 64; ++i) +		   pc->refs[pc->num_refs + i] = &ref[i]; +   } + +   ref = pc->refs[pc->num_refs++]; +   ref->value = val; +   ref->typecast = val->reg.type; + +   ++val->refc; +   return ref; +} + +static INLINE struct nv_basic_block * +new_basic_block(struct nv_pc *pc) +{ +   struct nv_basic_block *bb = CALLOC_STRUCT(nv_basic_block); + +   bb->in = CALLOC(sizeof(struct nv_basic_block *), 4); +   bb->id = pc->num_blocks++; +   return bb; +} + +static INLINE void +nv_reference(struct nv_pc *pc, struct nv_ref **d, struct nv_value *s) +{ +   if (*d) +      --(*d)->value->refc; + +   if (s) { +      if (!*d) +         *d = new_ref(pc, s); +      else { +         (*d)->value = s; +         ++(s->refc); +      } +   } else { +      assert(*d); +      *d = NULL; +   } +} + +/* nv50_emit.c */ +void nv50_emit_instruction(struct nv_pc *, struct nv_instruction *); + +/* nv50_print.c */ +const char *nv_opcode_name(uint opcode); +void nv_print_instruction(struct nv_instruction *); + +/* nv50_pc.c */ +void 
nv_print_program(struct nv_basic_block *b); + +boolean nv_op_commutative(uint opcode); +int nv50_indirect_opnd(struct nv_instruction *); +boolean nv50_nvi_can_use_imm(struct nv_instruction *, int s); +boolean nv50_nvi_can_load(struct nv_instruction *, int s, struct nv_value *); +ubyte nv50_supported_src_mods(uint opcode, int s); +int nv_nvi_refcount(struct nv_instruction *); +void nv_nvi_delete(struct nv_instruction *); +void nv_nvi_permute(struct nv_instruction *, struct nv_instruction *); +void nvbb_attach_block(struct nv_basic_block *parent, struct nv_basic_block *); + +int nv_pc_exec_pass0(struct nv_pc *pc); +int nv_pc_exec_pass1(struct nv_pc *pc); +int nv_pc_exec_pass2(struct nv_pc *pc); + +int nv50_tgsi_to_nc(struct nv_pc *, struct nv50_translation_info *); + +#endif // NV50_COMPILER_H diff --git a/src/gallium/drivers/nv50/nv50_pc_emit.c b/src/gallium/drivers/nv50/nv50_pc_emit.c new file mode 100644 index 0000000000..b917d23232 --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_pc_emit.c @@ -0,0 +1,1139 @@ +/*************************************************************************/ +/* Copyright (C) 2009                                                    */ +/*                                                                       */ +/* This program is free software: you can redistribute it and/or modify  */ +/* it under the terms of the GNU General Public License as published by  */ +/* the Free Software Foundation, either version 3 of the License, or     */ +/* (at your option) any later version.                                   */ +/*                                                                       */ +/* This program is distributed in the hope that it will be useful,       */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of        */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         */ +/* GNU General Public License for more details.                          
*/ +/*                                                                       */ +/* You should have received a copy of the GNU General Public License     */ +/* along with this program.  If not, see <http://www.gnu.org/licenses/>. */ +/*************************************************************************/ + +#include "nv50_context.h" +#include "nv50_pc.h" + +// Definitions + +#define FLAGS_CC_SHIFT    7 +#define FLAGS_ID_SHIFT    12 +#define FLAGS_WR_ID_SHIFT 4 +#define FLAGS_CC_MASK     (0x1f << FLAGS_CC_SHIFT) +#define FLAGS_ID_MASK     (0x03 << FLAGS_ID_SHIFT) +#define FLAGS_WR_EN       (1 << 6) +#define FLAGS_WR_ID_MASK  (0x3 << FLAGS_WR_ID_SHIFT) + +const ubyte nv50_inst_min_size_tab[NV_OP_COUNT] = +{ +   0, 0, 0, 8, 8, 4, 4, 4, 8, 4, 4, 8, 8, 8, 8, 8, /* 15 */ +   8, 8, 8, 4, 0, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, /* 31 */ +   8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, /* 47 */ +   4, 8, 8, 8, 8, 8, 0, 0 +}; + +/* XXX: silence, you ! */ +unsigned +nv50_inst_min_size(struct nv_instruction *i); + +unsigned +nv50_inst_min_size(struct nv_instruction *i) +{ +   int n; + +   if (nv50_inst_min_size_tab[i->opcode] > 4) +      return 8; + +   if (i->def[0] && i->def[0]->reg.file != NV_FILE_GPR) +      return 8; +   if (i->def[0]->join->reg.id > 63) +      return 8; + +   for (n = 0; n < 3; ++n) { +      if (!i->src[n]) +         break; +      if (i->src[n]->value->reg.file != NV_FILE_GPR && +          i->src[n]->value->reg.file != NV_FILE_MEM_V) +         return 8; +      if (i->src[n]->value->reg.id > 63) +         return 8; +   } + +   if (i->flags_def || i->flags_src || i->src[4]) +      return 8; + +   if (i->src[2]) { +      if (i->saturate || i->src[2]->mod) +         return 8; +      if (i->src[0]->mod ^ i->src[1]->mod) +         return 8; +      if ((i->src[0]->mod | i->src[1]->mod) & NV_MOD_ABS) +         return 8; +      if (i->def[0]->join->reg.id < 0 || +          i->def[0]->join->reg.id != i->src[2]->value->join->reg.id) +         return 8; +   } + 
+   return nv50_inst_min_size_tab[i->opcode]; +} + +static INLINE ubyte +STYPE(struct nv_instruction *nvi, int s) +{ +   return nvi->src[s]->typecast; +} + +static INLINE ubyte +DTYPE(struct nv_instruction *nvi, int d) +{ +   return nvi->def[d]->reg.type; +} + +static INLINE struct nv_reg * +SREG(struct nv_ref *ref) +{ +   return &ref->value->join->reg; +} + +static INLINE struct nv_reg * +DREG(struct nv_value *val) +{ +   return &val->join->reg; +} + +static INLINE ubyte +SFILE(struct nv_instruction *nvi, int s) +{ +   return nvi->src[s]->value->reg.file; +} + +/* Register file of the d-th definition of nvi (counterpart of SFILE for + * sources); nvi->def[d] must be non-NULL. */ +static INLINE ubyte +DFILE(struct nv_instruction *nvi, int d) +{ +   return nvi->def[d]->reg.file; +} + +static INLINE void +SID(struct nv_pc *pc, struct nv_ref *ref, int pos) +{ +   pc->emit[pos / 32] |= SREG(ref)->id << (pos % 32); +} + +static INLINE void +DID(struct nv_pc *pc, struct nv_value *val, int pos) +{ +   pc->emit[pos / 32] |= DREG(val)->id << (pos % 32); +} + +static INLINE uint32_t +get_immd_u32(struct nv_ref *ref) +{ +   assert(ref->value->reg.file == NV_FILE_IMM); +   return ref->value->reg.imm.u32; +} + +static INLINE void +set_immd_u32(struct nv_pc *pc, uint32_t u32) +{ +   pc->emit[1] |= 3; +   pc->emit[0] |= (u32 & 0x3f) << 16; +   pc->emit[1] |= (u32 >> 6) << 2; +} + +static INLINE void +set_immd(struct nv_pc *pc, struct nv_ref *ref) +{ +   assert(ref->value->reg.file == NV_FILE_IMM); +   set_immd_u32(pc, get_immd_u32(ref)); +} + +static void +new_fixup(struct nv_pc *pc, unsigned type, uint32_t data, uint32_t m, int s) +{ +   const unsigned size = sizeof(struct nv_fixup); +   const unsigned n = pc->num_fixups; +   return; + +   if (!(n % 8)) +      pc->fixups = REALLOC(pc->fixups, n * size, (n + 8) * size); + +   pc->fixups[n].offset = pc->bin_pos + (s / 32); +   pc->fixups[n].type = type; +   pc->fixups[n].data = data; +   pc->fixups[n].mask = m << (s % 32); +   pc->fixups[n].shift = s % 32; + +   ++pc->num_fixups; + +   assert(((data << (s % 32)) & pc->fixups[n].mask) == (data << (s 
% 32))); +} + +static void +nv_pc_alloc_immd(struct nv_pc *pc, struct nv_ref *ref) +{ +   uint32_t i, val = get_immd_u32(ref); + +   for (i = 0; i < pc->immd_count; ++i) +      if (pc->immd_buf[i] == val) +         break; + +   if (i == pc->immd_count) { +      if (!(pc->immd_count % 8)) +         pc->immd_buf = REALLOC(pc->immd_buf, +				pc->immd_count * 4, (pc->immd_count + 8) * 4); +      pc->immd_buf[pc->immd_count++] = val; +   } + +   SREG(ref)->id = i; +} + +static INLINE void +set_pred(struct nv_pc *pc, struct nv_instruction *i) +{ +   assert(!(pc->emit[1] & 0x00003f80)); + +   pc->emit[1] |= i->cc << 7; +   if (i->flags_src) +      pc->emit[1] |= SREG(i->flags_src)->id << 12; +} + +static INLINE void +set_pred_wr(struct nv_pc *pc, struct nv_instruction *i) +{ +   assert(!(pc->emit[1] & 0x00000070)); + +   if (i->flags_def) +      pc->emit[1] |= (DREG(i->flags_def)->id << 4) | 0x40; +} + +static INLINE void +set_a16_bits(struct nv_pc *pc, uint id) +{ +   ++id; /* $a0 is always 0 */ +   pc->emit[0] |= (id & 3) << 26; +   pc->emit[1] |= id & 4; +} + +static INLINE void +set_addr(struct nv_pc *pc, struct nv_instruction *i) +{ +   if (i->src[4]) +      set_a16_bits(pc, SREG(i->src[4])->id); +} + +static void +set_dst(struct nv_pc *pc, struct nv_value *value) +{ +   struct nv_reg *reg = &value->join->reg; + +   if (reg->id < 0) { +      debug_printf("WARNING: unused dst, hope we can bucket it !\n"); +      pc->emit[0] |= 127 << 2; +      pc->emit[1] |= 0x8; +      return; +   } + +   if (reg->file == NV_FILE_OUT) +      pc->emit[1] |= 0x8; +   else +   if (reg->file == NV_FILE_ADDR) +	   assert(0); + +   pc->emit[0] |= reg->id << 2; +} + +static void +set_src_0(struct nv_pc *pc, struct nv_ref *ref) +{ +   struct nv_reg *reg = SREG(ref); + +   if (reg->file == NV_FILE_MEM_S) +      pc->emit[1] |= 0x00200000; +   else +   if (reg->file == NV_FILE_MEM_P) +      pc->emit[0] |= 0x01800000; +   else +   if (reg->file != NV_FILE_GPR) +      NOUVEAU_ERR("invalid src0 
register file: %d\n", reg->file); + +   assert(reg->id < 128); +   pc->emit[0] |= reg->id << 9; +} + +static void +set_src_1(struct nv_pc *pc, struct nv_ref *ref) +{ +   struct nv_reg *reg = SREG(ref); + +   if (reg->file >= NV_FILE_MEM_C(0) && +       reg->file <= NV_FILE_MEM_C(15)) { +      assert(!(pc->emit[1] & 0x01800000)); + +      pc->emit[0] |= 0x00800000; +      pc->emit[1] |= (reg->file - NV_FILE_MEM_C(0)) << 22; +   } else +   if (reg->file != NV_FILE_GPR) +      NOUVEAU_ERR("invalid src1 register file: %d\n", reg->file); + +   assert(reg->id < 128); +   pc->emit[0] |= reg->id << 16; +} + +static void +set_src_2(struct nv_pc *pc, struct nv_ref *ref) +{ +   struct nv_reg *reg = SREG(ref); + +   if (reg->file >= NV_FILE_MEM_C(0) && +       reg->file <= NV_FILE_MEM_C(15)) { +      assert(!(pc->emit[1] & 0x01800000)); + +      pc->emit[0] |= 0x01000000; +      pc->emit[1] |= (reg->file - NV_FILE_MEM_C(0)) << 22; +   } else +   if (reg->file != NV_FILE_GPR) +      NOUVEAU_ERR("invalid src2 register file: %d\n", reg->file); + +   assert(reg->id < 128); +   pc->emit[1] |= reg->id << 14; +} + +/* the default form: + * - long instruction + * - 1 to 3 sources in slots 0, 1, 2 + * - address & flags + */ +static void +emit_form_MAD(struct nv_pc *pc, struct nv_instruction *i) +{ +   pc->emit[0] |= 1; + +   set_pred(pc, i); +   set_pred_wr(pc, i); + +   if (i->def[0]) +      set_dst(pc, i->def[0]); +   else { +      pc->emit[0] |= 0x01fc; +      pc->emit[1] |= 0x0008; +   } + +   if (i->src[0]) +      set_src_0(pc, i->src[0]); + +   if (i->src[1]) +      set_src_1(pc, i->src[1]); + +   if (i->src[2]) +      set_src_2(pc, i->src[2]); + +   set_addr(pc, i); +} + +/* like default form, but 2nd source in slot 2, no 3rd source */ +static void +emit_form_ADD(struct nv_pc *pc, struct nv_instruction *i) +{ +   pc->emit[0] |= 1; + +   if (i->def[0]) +      set_dst(pc, i->def[0]); +   else { +      pc->emit[0] |= 0x01fc; +      pc->emit[1] |= 0x0008; +   } + +   set_pred(pc, 
i); +   set_pred_wr(pc, i); + +   if (i->src[0]) +      set_src_0(pc, i->src[0]); + +   if (i->src[1]) +      set_src_2(pc, i->src[1]); + +   set_addr(pc, i); +} + +/* short mul */ +static void +emit_form_MUL(struct nv_pc *pc, struct nv_instruction *i) +{ +   assert(!i->is_long && !(pc->emit[0] & 1)); + +   assert(i->def[0]); +   set_dst(pc, i->def[0]); + +   if (i->src[0]) +      set_src_0(pc, i->src[0]); + +   if (i->src[1]) +      set_src_1(pc, i->src[1]); +} + +/* default immediate form + * - 1 to 3 sources where last is immediate + * - no address or predicate possible + */ +static void +emit_form_IMM(struct nv_pc *pc, struct nv_instruction *i, ubyte mod_mask) +{ +   pc->emit[0] |= 1; + +   assert(i->def[0]); +   assert(i->src[0]); +   set_dst(pc, i->def[0]); + +   assert(!i->src[4] && !i->flags_src && !i->flags_def); + +   if (i->src[2]) { +      set_immd(pc, i->src[2]); +      set_src_0(pc, i->src[1]); +      set_src_1(pc, i->src[0]); +   } else +   if (i->src[1]) { +      set_immd(pc, i->src[1]); +      set_src_0(pc, i->src[0]); +   } else +      set_immd(pc, i->src[0]); + +   assert(!mod_mask); +} + +static void +set_ld_st_size(struct nv_pc *pc, ubyte type) +{ +   switch (type) { +   case NV_TYPE_F64: +      pc->emit[1] |= 0x8000; +      break; +   case NV_TYPE_F32: +   case NV_TYPE_S32: +   case NV_TYPE_U32: +      pc->emit[1] |= 0xc000; +      break; +   case NV_TYPE_S16: +      pc->emit[1] |= 0x6000; +      break; +   case NV_TYPE_U16: +      pc->emit[1] |= 0x4000; +      break; +   case NV_TYPE_S8: +      pc->emit[1] |= 0x2000; +      break; +   default: +      break; +   } +} + +static void +emit_ld(struct nv_pc *pc, struct nv_instruction *i) +{ +   ubyte sf = SFILE(i, 0); + +   if (sf == NV_FILE_IMM) { +      sf = NV_FILE_MEM_C(0); +      nv_pc_alloc_immd(pc, i->src[0]); + +      new_fixup(pc, NV_FIXUP_PARAM_RELOC, SREG(i->src[0])->id, 0xffff, 9); +   } + +   if (sf == NV_FILE_MEM_S || +       sf == NV_FILE_MEM_P) { +      pc->emit[0] = 0x10000001; +  
    pc->emit[1] = 0x04200000 | (0x3c << 12); +      if (sf == NV_FILE_MEM_P) +         pc->emit[0] |= 0x01800000; +   } else +   if (sf >= NV_FILE_MEM_C(0) && +       sf <= NV_FILE_MEM_C(15)) { +      pc->emit[0] = 0x10000001; +      pc->emit[1] = 0x24000000; +      pc->emit[1] |= (sf - NV_FILE_MEM_C(0)) << 22; +   } else +   if (sf >= NV_FILE_MEM_G(0) && +       sf <= NV_FILE_MEM_G(15)) { +      pc->emit[0] = 0xd0000001 | ((sf - NV_FILE_MEM_G(0)) << 16); +      pc->emit[1] = 0xa0000000; + +      assert(i->src[4] && SREG(i->src[4])->file == NV_FILE_GPR); +      SID(pc, i->src[4], 9); +   } else +   if (sf == NV_FILE_MEM_L) { +      pc->emit[0] = 0xd0000001; +      pc->emit[1] = 0x40000000; +   } else { +      NOUVEAU_ERR("invalid ld source file\n"); +      abort(); +   } + +   set_ld_st_size(pc, STYPE(i, 0)); + +   set_dst(pc, i->def[0]); +   set_pred_wr(pc, i); + +   set_pred(pc, i); + +   if (sf < NV_FILE_MEM_G(0) || +       sf > NV_FILE_MEM_G(15)) { +      SID(pc, i->src[0], 9); +      set_addr(pc, i); +   } +} + +static void +emit_st(struct nv_pc *pc, struct nv_instruction *i) +{ + +} + +static int +verify_mov(struct nv_instruction *i) +{ +   ubyte sf = SFILE(i, 0); +   ubyte df = DFILE(i, 0); + +   if (df == NV_FILE_GPR) +      return 0; + +   if (df != NV_FILE_OUT && +       df != NV_FILE_FLAGS && +       df != NV_FILE_ADDR) +      return 1; + +   if (sf == NV_FILE_FLAGS) +      return 2; +   if (sf == NV_FILE_ADDR) +      return 3; +   if (sf == NV_FILE_IMM && df != NV_FILE_OUT) +      return 4; + +   return 0; +} + +static void +emit_mov(struct nv_pc *pc, struct nv_instruction *i) +{ +   assert(!verify_mov(i)); + +   if (SFILE(i, 0) >= NV_FILE_MEM_S) +      emit_ld(pc, i); +   else +   if (SFILE(i, 0) == NV_FILE_FLAGS) { +      pc->emit[0] = 0x00000001 | (DREG(i->def[0])->id << 2); +      pc->emit[1] = 0x20000780 | (SREG(i->src[0])->id << 12); +   } else +   if (SFILE(i, 0) == NV_FILE_ADDR) { +      pc->emit[0] = 0x00000001 | (DREG(i->def[0])->id << 2); +   
   pc->emit[1] = 0x40000780; +      set_a16_bits(pc, SREG(i->src[0])->id); +   } else +   if (DFILE(i, 0) == NV_FILE_FLAGS) { +      pc->emit[0] = 0x000001fd; +      pc->emit[1] = 0xa0000788 | (1 << 6); +      pc->emit[0] |= SREG(i->src[0])->id << 9; +      pc->emit[1] |= DREG(i->def[0])->id << 4; +   } else +   if (SFILE(i, 0) == NV_FILE_IMM) { +      if (i->opcode == NV_OP_LDA) +         emit_ld(pc, i); +      else { +         pc->emit[0] = 0x10008001; +         pc->emit[1] = 0x00000003; + +	 emit_form_IMM(pc, i, 0); +      } +   } else { +      pc->emit[0] = 0x10000000; +      pc->emit[0] |= DREG(i->def[0])->id << 2; +      pc->emit[0] |= SREG(i->src[0])->id << 9; + +      if (!i->is_long) +         pc->emit[0] |= 0x8000; +      else { +         pc->emit[0] |= 0x00000001; +         pc->emit[1] = 0x0403c000; + +	 set_pred(pc, i); +      } +   } + +   if (DFILE(i, 0) == NV_FILE_OUT) +      pc->emit[1] |= 0x8; +} + +static void +emit_interp(struct nv_pc *pc, struct nv_instruction *i) +{ +   pc->emit[0] = 0x80000000; + +   assert(DFILE(i, 0) == NV_FILE_GPR); +   assert(SFILE(i, 0) == NV_FILE_MEM_V); + +   DID(pc, i->def[0], 2); +   SID(pc, i->src[0], 16); + +   if (i->flat) +      pc->emit[0] |= 1 << 8; +   else +   if (i->opcode == NV_OP_PINTERP) { +      pc->emit[0] |= 1 << 25; +      pc->emit[0] |= SREG(i->src[1])->id << 9; +   } + +   if (i->centroid) +      pc->emit[0] |= 1 << 24; + +   if (i->is_long) { +      pc->emit[1] |= 0x0780 | +	      (pc->emit[0] & (3 << 24)) >> (24 - 16) | +	      (pc->emit[0] & (1 <<  8)) >> (18 -  8); + +      pc->emit[0] |= 1; +      pc->emit[0] &= ~0x03000100; +   } +} + +static void +emit_minmax(struct nv_pc *pc, struct nv_instruction *i) +{ +   pc->emit[0] = 0x30000000; +   pc->emit[1] = (i->opcode == NV_OP_MIN) ? 
(2 << 28) : 0; + +   switch (DTYPE(i, 0)) { +   case NV_TYPE_F32: +      pc->emit[0] |= 0x80000000; +      pc->emit[1] |= 0x80000000; +      break; +   case NV_TYPE_S32: +      pc->emit[1] |= 0x8c000000; +      break; +   case NV_TYPE_U32: +      pc->emit[1] |= 0x84000000; +      break; +   } +	 +   emit_form_MAD(pc, i); + +   if (i->src[0]->mod & NV_MOD_ABS) pc->emit[1] |= 0x00100000; +   if (i->src[1]->mod & NV_MOD_ABS) pc->emit[1] |= 0x00080000; +} + +static void +emit_add_f32(struct nv_pc *pc, struct nv_instruction *i) +{ +   pc->emit[0] = 0xb0000000; + +   if (SFILE(i, 1) == NV_FILE_IMM) { +      emit_form_IMM(pc, i, 0); + +      if (i->src[0]->mod & NV_MOD_NEG) pc->emit[0] |= 0x8000; +      if (i->src[1]->mod & NV_MOD_NEG) pc->emit[0] |= 1 << 22; +   } else +   if (i->is_long) { +      emit_form_ADD(pc, i); + +      if (i->src[0]->mod & NV_MOD_NEG) pc->emit[1] |= 1 << 26; +      if (i->src[1]->mod & NV_MOD_NEG) pc->emit[1] |= 1 << 27; +   } else { +      emit_form_MUL(pc, i); + +      if (i->src[0]->mod & NV_MOD_NEG) pc->emit[0] |= 0x8000; +      if (i->src[1]->mod & NV_MOD_NEG) pc->emit[0] |= 1 << 22; +   } +} + +static void +emit_add_b32(struct nv_pc *pc, struct nv_instruction *i) +{ +   pc->emit[0] = 0x20008000; + +   if (SFILE(i, 1) == NV_FILE_IMM) { +      emit_form_IMM(pc, i, 0); +   } else +   if (i->is_long) { +      pc->emit[0] = 0x20000000; +      pc->emit[1] = 0x04000000; +      emit_form_ADD(pc, i); +   } else { +      emit_form_MUL(pc, i); +   } + +   if (i->src[0]->mod & NV_MOD_NEG) pc->emit[0] |= 1 << 28; +   if (i->src[1]->mod & NV_MOD_NEG) pc->emit[0] |= 1 << 22; +} + +static void +emit_add_a16(struct nv_pc *pc, struct nv_instruction *i) +{ +   pc->emit[0] = 0xd0000001 | (get_immd_u32(i->src[0]) << 9); +   pc->emit[1] = 0x20000000; + +   pc->emit[0] |= (DREG(i->def[0])->id + 1) << 2; + +   set_pred(pc, i); + +   if (i->src[1]) +      set_a16_bits(pc, SREG(i->src[1])->id); +} + +static void +emit_flow(struct nv_pc *pc, struct nv_instruction 
*i, ubyte flow_op) +{ +   pc->emit[0] = 0x00000003 | (flow_op << 28); +   pc->emit[1] = 0x00000000; + +   set_pred(pc, i); + +   if (i->target) { +      new_fixup(pc, NV_FIXUP_CFLOW_RELOC, i->target->bin_pos, 0x7ff800, 11); +      pc->emit[0] |= (i->target->bin_pos / 4) << 11; +   } +} + +static INLINE void +emit_add(struct nv_pc *pc, struct nv_instruction *i) +{ +   if (DFILE(i, 0) == NV_FILE_ADDR) +      emit_add_a16(pc, i); +   else { +      switch (DTYPE(i, 0)) { +      case NV_TYPE_F32: +         emit_add_f32(pc, i); +         break; +      case NV_TYPE_U32: +      case NV_TYPE_S32: +         emit_add_b32(pc, i); +         break; +      } +   } +} + +static void +emit_bitop2(struct nv_pc *pc, struct nv_instruction *i) +{ +   pc->emit[0] = 0xd0000000; + +   if (SFILE(i, 0) == NV_FILE_IMM) { +      emit_form_IMM(pc, i, 0); + +      if (i->opcode == NV_OP_OR) +         pc->emit[0] |= 0x0100; +      else +      if (i->opcode == NV_OP_XOR) +         pc->emit[0] |= 0x8000; +   } else { +      emit_form_MAD(pc, i); + +      pc->emit[1] |= 0x04000000; + +      if (i->opcode == NV_OP_OR) +         pc->emit[1] |= 0x4000; +      else +      if (i->opcode == NV_OP_XOR) +         pc->emit[1] |= 0x8000; +   } +} + +static void +emit_shift(struct nv_pc *pc, struct nv_instruction *i) +{ +   pc->emit[0] = 0x30000001; +   pc->emit[1] = 0xc4000000; + +   if (i->opcode == NV_OP_SHR) +      pc->emit[1] |= 1 << 29; + +   if (SFILE(i, 1) == NV_FILE_IMM) { +      pc->emit[1] |= 1 << 20; +      pc->emit[0] |= (get_immd_u32(i->src[1]) & 0x7f) << 16; + +      set_pred(pc, i); +   } else +      emit_form_MAD(pc, i); + +   if (STYPE(i, 0) == NV_TYPE_S32) +      pc->emit[1] |= 1 << 27; +} + +static void +emit_flop(struct nv_pc *pc, struct nv_instruction *i) +{ +   struct nv_ref *src0 = i->src[0]; + +   pc->emit[0] = 0x90000000; + +   assert(SREG(src0)->type == NV_TYPE_F32); +   assert(SREG(src0)->file == NV_FILE_GPR); + +   if (!i->is_long) { +      emit_form_MUL(pc, i); +      
assert(i->opcode == NV_OP_RCP && !src0->mod); +      return; +   } + +   pc->emit[1] = (i->opcode - NV_OP_RCP) << 29; + +   emit_form_MAD(pc, i); + +   if (src0->mod & NV_MOD_NEG) pc->emit[1] |= 0x04000000; +   if (src0->mod & NV_MOD_ABS) pc->emit[1] |= 0x00100000; +} + +static void +emit_mad_f32(struct nv_pc *pc, struct nv_instruction *i) +{ +   const boolean neg_mul = (i->src[0]->mod ^ i->src[1]->mod) & NV_MOD_NEG; +   const boolean neg_add = (i->src[2]->mod & NV_MOD_NEG); + +   pc->emit[0] = 0xe0000000; + +   if (!i->is_long) { +      emit_form_MUL(pc, i); +      assert(!neg_mul && !neg_add); +      return; +   } + +   emit_form_MAD(pc, i); + +   if (neg_mul) pc->emit[1] |= 0x04000000; +   if (neg_add) pc->emit[1] |= 0x08000000; + +   if (i->saturate) +      pc->emit[1] |= 0x20000000; +} + +static INLINE void +emit_mad(struct nv_pc *pc, struct nv_instruction *i) +{ +   emit_mad_f32(pc, i); +} + +static void +emit_mul_f32(struct nv_pc *pc, struct nv_instruction *i) +{ +   boolean neg = (i->src[0]->mod ^ i->src[1]->mod) & NV_MOD_NEG; + +   pc->emit[0] = 0xc0000000; + +   if (SFILE(i, 1) == NV_FILE_IMM) { +      emit_form_IMM(pc, i, 0); + +      if (neg) +         pc->emit[0] |= 0x8000; +   } else +   if (i->is_long) { +      emit_form_MAD(pc, i); + +      if (neg) +         pc->emit[1] |= 0x08 << 24; +   } else { +      emit_form_MUL(pc, i); + +      if (neg) +         pc->emit[0] |= 0x8000; +   } +} + +static void +emit_set(struct nv_pc *pc, struct nv_instruction *nvi) +{ +   assert(nvi->is_long); + +   pc->emit[0] = 0x30000000; +   pc->emit[1] = 0x60000000; + +   pc->emit[1] |= nvi->set_cond << 14; + +   switch (STYPE(nvi, 0)) { +   case NV_TYPE_U32: pc->emit[1] |= 0x04000000; break; +   case NV_TYPE_S32: pc->emit[1] |= 0x0c000000; break; +   case NV_TYPE_F32: pc->emit[0] |= 0x80000000; break; +   default: +      assert(0); +      break; +   } + +   emit_form_MAD(pc, nvi); +} + +#define CVT_RN    (0x00 << 16) +#define CVT_FLOOR (0x02 << 16) +#define CVT_CEIL  
(0x04 << 16) +#define CVT_TRUNC (0x06 << 16) +#define CVT_SAT   (0x08 << 16) +#define CVT_ABS   (0x10 << 16) + +#define CVT_X32_X32 0x04004000 +#define CVT_X32_S32 0x04014000 +#define CVT_F32_F32 ((0xc0 << 24) | CVT_X32_X32) +#define CVT_S32_F32 ((0x88 << 24) | CVT_X32_X32) +#define CVT_U32_F32 ((0x80 << 24) | CVT_X32_X32) +#define CVT_F32_S32 ((0x40 << 24) | CVT_X32_S32) +#define CVT_F32_U32 ((0x40 << 24) | CVT_X32_X32) +#define CVT_S32_S32 ((0x08 << 24) | CVT_X32_S32) +#define CVT_S32_U32 ((0x08 << 24) | CVT_X32_X32) +#define CVT_U32_S32 ((0x00 << 24) | CVT_X32_S32) +#define CVT_U32_U32 ((0x00 << 24) | CVT_X32_X32) + +#define CVT_NEG 0x20000000 +#define CVT_RI  0x08000000 + +static void +emit_cvt(struct nv_pc *pc, struct nv_instruction *nvi) +{ +   ubyte dst_type = nvi->def[0] ? DTYPE(nvi, 0) : STYPE(nvi, 0); + +   pc->emit[0] = 0xa0000000; + +   switch (dst_type) { +   case NV_TYPE_F32: +      switch (STYPE(nvi, 0)) { +      case NV_TYPE_F32: pc->emit[1] = CVT_F32_F32; break; +      case NV_TYPE_S32: pc->emit[1] = CVT_F32_S32; break; +      case NV_TYPE_U32: pc->emit[1] = CVT_F32_U32; break; +      } +      break; +   case NV_TYPE_S32: +      switch (STYPE(nvi, 0)) { +      case NV_TYPE_F32: pc->emit[1] = CVT_S32_F32; break; +      case NV_TYPE_S32: pc->emit[1] = CVT_S32_S32; break; +      case NV_TYPE_U32: pc->emit[1] = CVT_S32_U32; break; +      } +      break; +   case NV_TYPE_U32: +      switch (STYPE(nvi, 0)) { +      case NV_TYPE_F32: pc->emit[1] = CVT_U32_F32; break; +      case NV_TYPE_S32: pc->emit[1] = CVT_U32_S32; break; +      case NV_TYPE_U32: pc->emit[1] = CVT_U32_U32; break; +      } +      break; +   } +   if (pc->emit[1] == CVT_F32_F32 && +       (nvi->opcode == NV_OP_CEIL || nvi->opcode == NV_OP_FLOOR || +	nvi->opcode == NV_OP_TRUNC)) +       pc->emit[1] |= CVT_RI; + +   switch (nvi->opcode) { +   case NV_OP_CEIL:  pc->emit[1] |= CVT_CEIL; break; +   case NV_OP_FLOOR: pc->emit[1] |= CVT_FLOOR; break; +   case NV_OP_TRUNC: pc->emit[1] |= 
CVT_TRUNC; break; + +   case NV_OP_ABS: pc->emit[1] |= CVT_ABS; break; +   case NV_OP_SAT: pc->emit[1] |= CVT_SAT; break; +   case NV_OP_NEG: pc->emit[1] |= CVT_NEG; break; +   default: +      assert(nvi->opcode == NV_OP_CVT); +      break; +   } +   assert(nvi->opcode != NV_OP_ABS || !(nvi->src[0]->mod & NV_MOD_NEG)); + +   if (nvi->src[0]->mod & NV_MOD_NEG) pc->emit[1] ^= CVT_NEG; +   if (nvi->src[0]->mod & NV_MOD_ABS) pc->emit[1] |= CVT_ABS; + +   emit_form_MAD(pc, nvi); +} + +static void +emit_tex(struct nv_pc *pc, struct nv_instruction *i) +{ +   pc->emit[0] = 0xf0000001; +   pc->emit[1] = 0x00000000; + +   DID(pc, i->def[0], 2); + +   set_pred(pc, i); + +   pc->emit[0] |= i->tex_t << 9; +   pc->emit[0] |= i->tex_s << 17; + +   pc->emit[0] |= i->tex_argc << 22; + +   pc->emit[0] |= (i->tex_mask & 0x3) << 25; +   pc->emit[1] |= (i->tex_mask & 0xc) << 12; + +   if (i->tex_live) +      pc->emit[1] |= 4; + +   if (i->tex_cube) +      pc->emit[0] |= 0x08000000; + +   if (i->opcode == NV_OP_TXB) +      pc->emit[1] |= 0x20000000; +   else +   if (i->opcode == NV_OP_TXL) +      pc->emit[1] |= 0x40000000; +   else +      pc->emit[0] -= 1 << 22; +} + +static void +emit_cvt2fixed(struct nv_pc *pc, struct nv_instruction *i) +{ +   ubyte mod = i->src[0]->mod; + +   pc->emit[0] = 0xb0000000; +   pc->emit[1] = 0xc0000000; + +   if (i->opcode == NV_OP_PREEX2) +      pc->emit[1] |= 0x4000; + +   emit_form_MAD(pc, i); + +   if (mod & NV_MOD_NEG) pc->emit[1] |= 0x04000000; +   if (mod & NV_MOD_ABS) pc->emit[1] |= 0x00100000; +} + +static void +emit_ddx(struct nv_pc *pc, struct nv_instruction *i) +{ +   assert(i->is_long && SFILE(i, 0) == NV_FILE_GPR); + +   pc->emit[0] = (i->src[0]->mod & NV_MOD_NEG) ? 0xc0240001 : 0xc0140001; +   pc->emit[1] = (i->src[0]->mod & NV_MOD_NEG) ? 
0x86400000 : 0x89800000; + +   DID(pc, i->def[0], 2); +   SID(pc, i->src[0], 9); +   SID(pc, i->src[0], 32 + 14); + +   set_pred(pc, i); +   set_pred_wr(pc, i); +} + +static void +emit_ddy(struct nv_pc *pc, struct nv_instruction *i) +{ +   assert(i->is_long && SFILE(i, 0) == NV_FILE_GPR); + +   pc->emit[0] = (i->src[0]->mod & NV_MOD_NEG) ? 0xc0250001 : 0xc0150001; +   pc->emit[1] = (i->src[0]->mod & NV_MOD_NEG) ? 0x85800000 : 0x8a400000; + +   DID(pc, i->def[0], 2); +   SID(pc, i->src[0], 9); +   SID(pc, i->src[0], 32 + 14); + +   set_pred(pc, i); +   set_pred_wr(pc, i); +} + +void +nv50_emit_instruction(struct nv_pc *pc, struct nv_instruction *i) +{ +   // nv_print_instruction(i); + +   switch (i->opcode) { +   case NV_OP_MOV: +      if (DFILE(i, 0) == NV_FILE_ADDR) +         emit_add_a16(pc, i); +      else +         emit_mov(pc, i); +      break; +   case NV_OP_LDA: +      emit_mov(pc, i); +      break; +   case NV_OP_STA: +      emit_st(pc, i); +      break; +   case NV_OP_LINTERP: +   case NV_OP_PINTERP: +      emit_interp(pc, i); +      break; +   case NV_OP_ADD: +      emit_add(pc, i); +      break; +   case NV_OP_AND: +   case NV_OP_OR: +   case NV_OP_XOR: +      emit_bitop2(pc, i); +      break; +   case NV_OP_CVT: +   case NV_OP_ABS: +   case NV_OP_NEG: +   case NV_OP_SAT: +   case NV_OP_CEIL: +   case NV_OP_FLOOR: +   case NV_OP_TRUNC: +      emit_cvt(pc, i); +      break; +   case NV_OP_DFDX: +      emit_ddx(pc, i); +      break; +   case NV_OP_DFDY: +      emit_ddy(pc, i); +      break; +   case NV_OP_RCP: +   case NV_OP_RSQ: +   case NV_OP_LG2: +   case NV_OP_SIN: +   case NV_OP_COS: +   case NV_OP_EX2: +      emit_flop(pc, i); +      break; +   case NV_OP_PRESIN: +   case NV_OP_PREEX2: +      emit_cvt2fixed(pc, i); +      break; +   case NV_OP_MAD: +      emit_mad(pc, i); +      break; +   case NV_OP_MAX: +   case NV_OP_MIN: +      emit_minmax(pc, i); +      break; +   case NV_OP_MUL: +      emit_mul_f32(pc, i); +      break; +   case NV_OP_SET: +    
  emit_set(pc, i); +      break; +   case NV_OP_SHL: +   case NV_OP_SHR: +      emit_shift(pc, i); +      break; +   case NV_OP_TEX: +   case NV_OP_TXB: +   case NV_OP_TXL: +      emit_tex(pc, i); +      break; +   case NV_OP_KIL: +      emit_flow(pc, i, 0x0); +      break; +   case NV_OP_BRA: +      emit_flow(pc, i, 0x1); +      break; +   case NV_OP_CALL: +      emit_flow(pc, i, 0x2); +      break; +   case NV_OP_RET: +      emit_flow(pc, i, 0x3); +      break; +   case NV_OP_BREAKADDR: +      emit_flow(pc, i, 0x4); +      break; +   case NV_OP_BREAK: +      emit_flow(pc, i, 0x5); +      break; +   case NV_OP_JOINAT: +      emit_flow(pc, i, 0xa); +      break; +   case NV_OP_NOP: +      pc->emit[0] = 0xf0000001; +      pc->emit[1] = 0xe0000000; +      break; +   case NV_OP_PHI: +   case NV_OP_SUB: +      NOUVEAU_ERR("operation \"%s\" should have been eliminated\n", +		  nv_opcode_name(i->opcode)); +      break; +   default: +      NOUVEAU_ERR("unhandled NV_OP: %d\n", i->opcode); +      abort(); +      break; +   } + +   assert((pc->emit[0] & 1) == i->is_long); +} diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c new file mode 100644 index 0000000000..0811420e42 --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c @@ -0,0 +1,717 @@ + +#include "nv50_pc.h" + +#define DESCEND_ARBITRARY(j, f)                                 \ +do {                                                            \ +   b->pass_seq = ctx->pc->pass_seq;                             \ +                                                                \ +   for (j = 0; j < 2; ++j)                                      \ +      if (b->out[j] && b->out[j]->pass_seq < ctx->pc->pass_seq) \ +         f(ctx, b->out[j]);	                                  \ +} while (0) + +extern unsigned nv50_inst_min_size(struct nv_instruction *); + +struct nv_pc_pass { +   struct nv_pc *pc; +}; + +static INLINE boolean +values_equal(struct nv_value *a, 
struct nv_value *b) +{ +   /* XXX: sizes */ +   return (a->reg.file == b->reg.file && a->join->reg.id == b->join->reg.id); +} + +static INLINE boolean +inst_commutation_check(struct nv_instruction *a, +                       struct nv_instruction *b) +{ +   int si, di; + +   for (di = 0; di < 4; ++di) { +      if (!a->def[di]) +         break; +      for (si = 0; si < 5; ++si) { +         if (!b->src[si]) +            continue; +         if (values_equal(a->def[di], b->src[si]->value)) +            return FALSE; +      } +   } + +   if (b->flags_src && b->flags_src->value == a->flags_def) +      return FALSE; + +   return TRUE; +} + +/* Check whether we can swap the order of the instructions, + * where a & b may be either the earlier or the later one. + */ +static boolean +inst_commutation_legal(struct nv_instruction *a, +		       struct nv_instruction *b) +{ +   return inst_commutation_check(a, b) && inst_commutation_check(b, a); +} + +static INLINE boolean +inst_cullable(struct nv_instruction *nvi) +{ +   return (!(nvi->is_terminator || +             nvi->target || +             nvi->fixed || +             nv_nvi_refcount(nvi))); +} + +static INLINE boolean +nvi_isnop(struct nv_instruction *nvi) +{ +   if (nvi->opcode == NV_OP_EXPORT) +      return TRUE; + +   if (nvi->fixed || +       nvi->is_terminator || +       nvi->flags_src || +       nvi->flags_def) +      return FALSE; + +   if (nvi->def[0]->join->reg.id < 0) +      return TRUE; + +   if (nvi->opcode != NV_OP_MOV && nvi->opcode != NV_OP_SELECT) +      return FALSE; + +   if (nvi->def[0]->reg.file != nvi->src[0]->value->reg.file) +      return FALSE; + +   if (nvi->src[0]->value->join->reg.id < 0) { +      debug_printf("nvi_isnop: orphaned value detected\n"); +      return TRUE; +   } + +   if (nvi->opcode == NV_OP_SELECT) +      if (!values_equal(nvi->def[0], nvi->src[1]->value)) +         return FALSE; + +   return values_equal(nvi->def[0], nvi->src[0]->value); +} + +static void 
+nv_pc_pass_pre_emission(struct nv_pc *pc, struct nv_basic_block *b) +{ +   struct nv_instruction *nvi, *next; +   int j; +   uint size, n32 = 0; + +   b->priv = 0; + +   if (pc->num_blocks) +      b->bin_pos = pc->bb_list[pc->num_blocks - 1]->bin_pos + +                   pc->bb_list[pc->num_blocks - 1]->bin_size; + +   pc->bb_list[pc->num_blocks++] = b; + +   /* visit node */ + +   for (nvi = b->entry; nvi; nvi = next) { +      next = nvi->next; +      if (nvi_isnop(nvi)) +         nv_nvi_delete(nvi); +   } + +   for (nvi = b->entry; nvi; nvi = next) { +      next = nvi->next; + +      size = nv50_inst_min_size(nvi); +      if (nvi->next && size < 8) +         ++n32; +      else +      if ((n32 & 1) && nvi->next && +          nv50_inst_min_size(nvi->next) == 4 && +          inst_commutation_legal(nvi, nvi->next)) { +         ++n32; +         debug_printf("permuting: "); +         nv_print_instruction(nvi); +         nv_print_instruction(nvi->next); +         nv_nvi_permute(nvi, nvi->next); +         next = nvi; +      } else { +         nvi->is_long = 1; + +         b->bin_size += n32 & 1; +         if (n32 & 1) +            nvi->prev->is_long = 1; +         n32 = 0; +      } +      b->bin_size += 1 + nvi->is_long; +   } + +   if (!b->entry) { +      debug_printf("block %p is now empty\n", b); +   } else +   if (!b->exit->is_long) { +      assert(n32); +      b->exit->is_long = 1; +      b->bin_size += 1; + +      /* might have del'd a hole tail of instructions */ +      if (!b->exit->prev->is_long && !(n32 & 1)) { +         b->bin_size += 1; +         b->exit->prev->is_long = 1; +      } +   } +   assert(!b->exit || b->exit->is_long); + +   pc->bin_size += b->bin_size *= 4; + +   /* descend CFG */ + +   if (!b->out[0]) +      return; +   if (!b->out[1] && ++(b->out[0]->priv) != b->out[0]->num_in) +      return; + +#if 0 +   /* delete ELSE branch */ +   if (b->entry && +       b->entry->opcode == NV_OP_BRA && b->entry->target == b->out[0]) { +      
nv_nvi_delete(b->entry); +      b->bin_size -= 2; +      pc->bin_size -= 8; +   } +#endif +   for (j = 0; j < 2; ++j) +      if (b->out[j] && b->out[j] != b) +         nv_pc_pass_pre_emission(pc, b->out[j]); +} + +int +nv_pc_exec_pass2(struct nv_pc *pc) +{ +   debug_printf("preparing %u blocks for emission\n", pc->num_blocks); + +   pc->bb_list = CALLOC(pc->num_blocks, sizeof(struct nv_basic_block *)); +   +   pc->num_blocks = 0; +   nv_pc_pass_pre_emission(pc, pc->root); + +   return 0; +} + +static INLINE boolean +is_cmem_load(struct nv_instruction *nvi) +{ +   return (nvi->opcode == NV_OP_LDA && +	   nvi->src[0]->value->reg.file >= NV_FILE_MEM_C(0) && +	   nvi->src[0]->value->reg.file <= NV_FILE_MEM_C(15)); +} + +static INLINE boolean +is_smem_load(struct nv_instruction *nvi) +{ +   return (nvi->opcode == NV_OP_LDA && +	   (nvi->src[0]->value->reg.file == NV_FILE_MEM_S || +	    nvi->src[0]->value->reg.file <= NV_FILE_MEM_P)); +} + +static INLINE boolean +is_immd_move(struct nv_instruction *nvi) +{ +   return (nvi->opcode == NV_OP_MOV && +	   nvi->src[0]->value->reg.file == NV_FILE_IMM); +} + +static INLINE void +check_swap_src_0_1(struct nv_instruction *nvi) +{ +   static const ubyte cc_swapped[8] = { 0, 4, 2, 6, 1, 5, 3, 7 }; + +   struct nv_ref *src0 = nvi->src[0], *src1 = nvi->src[1]; + +   if (!nv_op_commutative(nvi->opcode)) +      return; +   assert(src0 && src1); + +   if (is_cmem_load(src0->value->insn)) { +      if (!is_cmem_load(src1->value->insn)) { +         nvi->src[0] = src1; +	 nvi->src[1] = src0; +	 /* debug_printf("swapping cmem load to 1\n"); */ +      } +   } else +   if (is_smem_load(src1->value->insn)) { +      if (!is_smem_load(src0->value->insn)) { +         nvi->src[0] = src1; +	 nvi->src[1] = src0; +	 /* debug_printf("swapping smem load to 0\n"); */ +      } +   } + +   if (nvi->opcode == NV_OP_SET && nvi->src[0] != src0) +      nvi->set_cond = cc_swapped[nvi->set_cond]; +} + +struct nv_pass { +   struct nv_pc *pc; +   int n; +   void 
*priv; +}; + +static int +nv_pass_fold_stores(struct nv_pass *ctx, struct nv_basic_block *b) +{ +   struct nv_instruction *nvi, *sti; +   int j; + +   for (sti = b->entry; sti; sti = sti->next) { +      if (!sti->def[0]) +         continue; + +      if (sti->def[0]->reg.file != NV_FILE_OUT) +         continue; +      if (sti->opcode != NV_OP_MOV && sti->opcode != NV_OP_STA) +         continue; + +      nvi = sti->src[0]->value->insn; +      if (!nvi || nvi->opcode == NV_OP_PHI) +         continue; +      assert(nvi->def[0] == sti->src[0]->value); + +      if (nvi->def[0]->refc > 1) +         continue; + +      nvi->def[0] = sti->def[0]; +      nvi->fixed = 1; +      sti->fixed = 0; +   } +   DESCEND_ARBITRARY(j, nv_pass_fold_stores); + +   return 0; +} + +static int +nv_pass_fold_loads(struct nv_pass *ctx, struct nv_basic_block *b) +{ +   struct nv_instruction *nvi, *ld; +   int j; + +   for (nvi = b->entry; nvi; nvi = nvi->next) { +      check_swap_src_0_1(nvi); + +      for (j = 0; j < 3; ++j) { +         if (!nvi->src[j]) +            break; +         ld = nvi->src[j]->value->insn; +         if (!ld) +            continue; + +         if (is_immd_move(ld) && nv50_nvi_can_use_imm(nvi, j)) { +            nv_reference(ctx->pc, &nvi->src[j], ld->src[0]->value); +            debug_printf("folded immediate %i\n", ld->def[0]->n); +            continue; +         } + +         if (ld->opcode != NV_OP_LDA) +            continue; +         if (!nv50_nvi_can_load(nvi, j, ld->src[0]->value)) +            continue; + +         if (j == 0 && ld->src[4]) /* can't load shared mem */ +            continue; + +         /* fold it ! 
*/ /* XXX: ref->insn */ +         nv_reference(ctx->pc, &nvi->src[j], ld->src[0]->value); +         if (ld->src[4]) +            nv_reference(ctx->pc, &nvi->src[4], ld->src[4]->value); +      } +   } +   DESCEND_ARBITRARY(j, nv_pass_fold_loads); + +   return 0; +} + +static int +nv_pass_lower_mods(struct nv_pass *ctx, struct nv_basic_block *b) +{ +   int j; +   struct nv_instruction *nvi, *mi, *next; +   ubyte mod; + +   for (nvi = b->entry; nvi; nvi = next) { +      next = nvi->next; +      if (nvi->opcode == NV_OP_SUB) { +         nvi->opcode = NV_OP_ADD; +         nvi->src[1]->mod ^= NV_MOD_NEG; +      } + +      /* should not put any modifiers on NEG and ABS */ +      assert(nvi->opcode != NV_MOD_NEG || !nvi->src[0]->mod); +      assert(nvi->opcode != NV_MOD_ABS || !nvi->src[0]->mod); + +      for (j = 0; j < 4; ++j) { +         if (!nvi->src[j]) +            break; + +         mi = nvi->src[j]->value->insn; +         if (!mi) +            continue; +         if (mi->def[0]->refc > 1) +            continue; + +         if (mi->opcode == NV_OP_NEG) mod = NV_MOD_NEG; +         else +         if (mi->opcode == NV_OP_ABS) mod = NV_MOD_ABS; +         else +            continue; + +         if (nvi->opcode == NV_OP_ABS) +            mod &= ~(NV_MOD_NEG | NV_MOD_ABS); +         else +         if (nvi->opcode == NV_OP_NEG && mod == NV_MOD_NEG) { +            nvi->opcode = NV_OP_MOV; +            mod = 0; +         } + +         if (!(nv50_supported_src_mods(nvi->opcode, j) & mod)) +            continue; + +         nv_reference(ctx->pc, &nvi->src[j], mi->src[0]->value); + +         nvi->src[j]->mod ^= mod; +      } + +      if (nvi->opcode == NV_OP_SAT) { +         mi = nvi->src[0]->value->insn; + +         if ((mi->opcode == NV_OP_MAD) && !mi->flags_def) { +            mi->saturate = 1; +            mi->def[0] = nvi->def[0]; +            nv_nvi_delete(nvi); +         } +      } +   } +   DESCEND_ARBITRARY(j, nv_pass_lower_mods); + +   return 0; +} + +#define 
SRC_IS_MUL(s) ((s)->insn && (s)->insn->opcode == NV_OP_MUL) + +static struct nv_value * +find_immediate(struct nv_ref *ref) +{ +   struct nv_value *src; + +   if (!ref) +      return NULL; + +   src = ref->value; +   while (src->insn && src->insn->opcode == NV_OP_MOV) { +      assert(!src->insn->src[0]->mod); +      src = src->insn->src[0]->value; +   } +   return (src->reg.file == NV_FILE_IMM) ? src : NULL; +} + +static void +constant_operand(struct nv_pc *pc, +                 struct nv_instruction *nvi, struct nv_value *val, int s) +{ +   int t = s ? 0 : 1; +   ubyte type; + +   if (!nvi->def[0]) +      return; +   type = nvi->def[0]->reg.type; + +   switch (nvi->opcode) { +   case NV_OP_MUL: +      if ((type == NV_TYPE_F32 && val->reg.imm.f32 == 1.0f) || +          (NV_TYPE_ISINT(type) && val->reg.imm.u32 == 1)) { +         nvi->opcode = NV_OP_MOV; +         nv_reference(pc, &nvi->src[s], NULL); +         if (!s) { +            nvi->src[0] = nvi->src[1]; +            nvi->src[1] = NULL; +         } +      } else +      if ((type == NV_TYPE_F32 && val->reg.imm.f32 == 2.0f) || +          (NV_TYPE_ISINT(type) && val->reg.imm.u32 == 2)) { +         nvi->opcode = NV_OP_ADD; +         nv_reference(pc, &nvi->src[s], NULL); +         if (!s) { +            nvi->src[0] = nvi->src[1]; +            nvi->src[1] = NULL; +         } +      } else +      if (type == NV_TYPE_F32 && val->reg.imm.f32 == -1.0f) { +         nvi->opcode = NV_OP_NEG; +         nv_reference(pc, &nvi->src[s], NULL); +         nvi->src[0] = nvi->src[t]; +         nvi->src[1] = NULL; +      } else +      if (type == NV_TYPE_F32 && val->reg.imm.f32 == -2.0f) { +         nvi->opcode = NV_OP_ADD; +         assert(!nvi->src[s]->mod); +         nv_reference(pc, &nvi->src[s], nvi->src[t]->value); +         nvi->src[t]->mod ^= NV_MOD_NEG; +         nvi->src[s]->mod |= NV_MOD_NEG; +      } else +      if (val->reg.imm.u32 == 0) { +         nvi->opcode = NV_OP_MOV; +         nv_reference(pc, &nvi->src[t], NULL); 
+         if (s) { +            nvi->src[0] = nvi->src[1]; +            nvi->src[1] = NULL; +         } +      } +      break; +   case NV_OP_ADD: +      if (val->reg.imm.u32 == 0) { +         nvi->opcode = NV_OP_MOV; +         nv_reference(pc, &nvi->src[s], NULL); +         nvi->src[0] = nvi->src[t]; +         nvi->src[1] = NULL; +      } +      break; +   default: +      break; +   } +} + +static int +nv_pass_lower_arith(struct nv_pass *ctx, struct nv_basic_block *b) +{ +   struct nv_instruction *nvi, *next; +   int j; + +   for (nvi = b->entry; nvi; nvi = next) { +      struct nv_value *src0, *src1, *src; +      int mod; + +      next = nvi->next; + +      if ((src = find_immediate(nvi->src[0])) != NULL) +         constant_operand(ctx->pc, nvi, src, 0); +      else +      if ((src = find_immediate(nvi->src[1])) != NULL) +         constant_operand(ctx->pc, nvi, src, 1); + +      /* try to combine MUL, ADD into MAD */ +      if (nvi->opcode != NV_OP_ADD) +         continue; + +      src0 = nvi->src[0]->value; +      src1 = nvi->src[1]->value; + +      if (SRC_IS_MUL(src0) && src0->refc == 1) +         src = src0; +      else +      if (SRC_IS_MUL(src1) && src1->refc == 1) +         src = src1; +      else +         continue; + +      nvi->opcode = NV_OP_MAD; +      mod = nvi->src[(src == src0) ? 0 : 1]->mod; +      nv_reference(ctx->pc, &nvi->src[(src == src0) ? 0 : 1], NULL); +      nvi->src[2] = nvi->src[(src == src0) ? 
1 : 0]; + +      assert(!(mod & ~NV_MOD_NEG)); +      nvi->src[0] = new_ref(ctx->pc, src->insn->src[0]->value); +      nvi->src[1] = new_ref(ctx->pc, src->insn->src[1]->value); +      nvi->src[0]->mod = src->insn->src[0]->mod ^ mod; +      nvi->src[1]->mod = src->insn->src[1]->mod; +   } +   DESCEND_ARBITRARY(j, nv_pass_lower_arith); + +   return 0; +} + +/* +set $r2 g f32 $r2 $r3 +cvt abs rn f32 $r2 s32 $r2 +cvt f32 $c0 # f32 $r2 +e $c0 bra 0x80 +*/ +#if 0 +static int +nv_pass_lower_cond(struct nv_pass *ctx, struct nv_basic_block *b) +{ +   /* XXX: easier in IR builder for now */ +   return 0; +} +#endif + +/* TODO: reload elimination, redundant store elimination */ + +struct nv_pass_reldelim { +   struct nv_pc *pc; +}; + +static int +nv_pass_reload_elim(struct nv_pass_reldelim *ctx, struct nv_basic_block *b) +{ +   int j; +   struct nv_instruction *ld, *next; + +   for (ld = b->entry; ld; ld = next) { +      next = ld->next; + +      if (ld->opcode == NV_OP_LINTERP || ld->opcode == NV_OP_PINTERP) { + +      } else +      if (ld->opcode == NV_OP_LDA) { +          +      } else +      if (ld->opcode == NV_OP_MOV) { +          +      } +   } +   DESCEND_ARBITRARY(j, nv_pass_reload_elim); + +   return 0; +} + +static int +nv_pass_tex_mask(struct nv_pass *ctx, struct nv_basic_block *b) +{ +   int i, c, j; + +   for (i = 0; i < ctx->pc->num_instructions; ++i) { +      struct nv_instruction *nvi = &ctx->pc->instructions[i]; +      struct nv_value *def[4]; + +      if (!nv_is_vector_op(nvi->opcode)) +         continue; +      nvi->tex_mask = 0; + +      for (c = 0; c < 4; ++c) { +         if (nvi->def[c]->refc) +            nvi->tex_mask |= 1 << c; +         def[c] = nvi->def[c]; +      } + +      j = 0; +      for (c = 0; c < 4; ++c) +         if (nvi->tex_mask & (1 << c)) +            nvi->def[j++] = def[c]; +      for (c = 0; c < 4; ++c) +         if (!(nvi->tex_mask & (1 << c))) +           nvi->def[j++] = def[c]; +      assert(j == 4); +   } +   return 0; +} + 
+struct nv_pass_dce { +   struct nv_pc *pc; +   uint removed; +}; + +static int +nv_pass_dce(struct nv_pass_dce *ctx, struct nv_basic_block *b) +{ +   int j; +   struct nv_instruction *nvi, *next; + +   for (nvi = b->entry; nvi; nvi = next) { +      next = nvi->next; + +      if (inst_cullable(nvi)) { +         nv_nvi_delete(nvi); + +         ++ctx->removed; +      } +   } +   DESCEND_ARBITRARY(j, nv_pass_dce); + +   return 0; +} + +static INLINE boolean +bb_simple_if_endif(struct nv_basic_block *bb) +{ +   return (bb->out[0] && bb->out[1] && +           bb->out[0]->out[0] == bb->out[1] && +           !bb->out[0]->out[1]); +} + +static int +nv_pass_flatten(struct nv_pass *ctx, struct nv_basic_block *b) +{ +   int j; + +   if (bb_simple_if_endif(b)) { +      ++ctx->n; +      debug_printf("nv_pass_flatten: total IF/ENDIF constructs: %i\n", ctx->n); +   } +   DESCEND_ARBITRARY(j, nv_pass_flatten); + +   return 0; +} + +int +nv_pc_exec_pass0(struct nv_pc *pc) +{ +   struct nv_pass_reldelim *reldelim; +   struct nv_pass pass; +   struct nv_pass_dce dce; +   int ret; + +   reldelim = CALLOC_STRUCT(nv_pass_reldelim); +   reldelim->pc = pc; + +   ret = nv_pass_reload_elim(reldelim, pc->root); + +   FREE(reldelim); +   if (ret) +      return ret; + +   pass.pc = pc; + +   pc->pass_seq++; +   ret = nv_pass_flatten(&pass, pc->root); +   if (ret) +      return ret; + +   /* Do this first, so we don't have to pay attention +    * to whether sources are supported memory loads. 
+    */ +   pc->pass_seq++; +   ret = nv_pass_lower_arith(&pass, pc->root); +   if (ret) +      return ret; + +   pc->pass_seq++; +   ret = nv_pass_fold_loads(&pass, pc->root); +   if (ret) +      return ret; + +   pc->pass_seq++; +   ret = nv_pass_fold_stores(&pass, pc->root); +   if (ret) +      return ret; + +   pc->pass_seq++; +   ret = nv_pass_lower_mods(&pass, pc->root); +   if (ret) +      return ret; + +   dce.pc = pc; +   do { +      dce.removed = 0; +      pc->pass_seq++; +      ret = nv_pass_dce(&dce, pc->root); +      if (ret) +         return ret; +   } while (dce.removed); + +   ret = nv_pass_tex_mask(&pass, pc->root); +   if (ret) +      return ret; + +   return ret; +} diff --git a/src/gallium/drivers/nv50/nv50_pc_print.c b/src/gallium/drivers/nv50/nv50_pc_print.c new file mode 100644 index 0000000000..09512ffb88 --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_pc_print.c @@ -0,0 +1,287 @@ + +#include "nv50_context.h" +#include "nv50_pc.h" + +#define NVXX_DEBUG 0 + +#define PRINT(args...) 
/* Printable names for source modifier combinations, indexed by the modifier
 * value passed to nv_modifier_string().  Entries 0-7 cover every combination
 * of neg/abs/not, entry 8 is "sat" and entry 9 is the catch-all for
 * out-of-range values.
 *
 * Every entry needs its own trailing comma: adjacent string literals are
 * concatenated by the compiler, which silently shortens the table and shifts
 * all following names.
 */
static const char *nv_modifier_strings[] =
{
   "",
   "neg",
   "abs",
   "neg abs",
   "not",
   "not neg",
   "not abs",
   "not neg abs",
   "sat",
   "BAD_MOD"
};
+nv_cond_name(ubyte cc) +{ +   return nv_cond_names[MIN2(cc, 15)]; +} + +static INLINE const char * +nv_modifier_string(ubyte mod) +{ +   return nv_modifier_strings[MIN2(mod, 9)]; +} + +static INLINE int +nv_value_id(struct nv_value *value) +{ +   if (value->join->reg.id >= 0) +      return value->join->reg.id; +   return value->n; +} + +static INLINE boolean +nv_value_allocated(struct nv_value *value) +{ +   return (value->reg.id >= 0) ? TRUE : FALSE; +} + +static INLINE void +nv_print_address(const char c, int buf, struct nv_value *a, int offset) +{ +   if (buf >= 0) +      PRINT(" %s%c%i[", cyan, c, buf); +   else +      PRINT(" %s%c[", cyan, c); +   if (a) +      PRINT("%s$a%i%s+", mgta, nv_value_id(a), cyan); +   PRINT("%s0x%x%s]", orng, offset, cyan); +} + +static INLINE void +nv_print_cond(struct nv_instruction *nvi) +{ +   PRINT("%s%s%s$c%i ", +         gree, nv_cond_name(nvi->cc), +         mgta, nv_value_id(nvi->flags_src->value)); +} + +static INLINE void +nv_print_value(struct nv_value *value, struct nv_value *ind, ubyte type) +{ +   char reg_pfx = '$'; + +   if (type == NV_TYPE_ANY) +      type = value->reg.type; + +   if (value->reg.file != NV_FILE_FLAGS) +      PRINT(" %s%s", gree, nv_type_name(type)); + +   if (!nv_value_allocated(value)) +      reg_pfx = '%'; + +   switch (value->reg.file) { +   case NV_FILE_GPR: +      PRINT(" %s%cr%i", blue, reg_pfx, nv_value_id(value)); +      break; +   case NV_FILE_OUT: +      PRINT(" %s%co%i", mgta, reg_pfx, nv_value_id(value)); +      break; +   case NV_FILE_ADDR: +      PRINT(" %s%ca%i", mgta, reg_pfx, nv_value_id(value)); +      break; +   case NV_FILE_FLAGS: +      PRINT(" %s%cc%i", mgta, reg_pfx, nv_value_id(value)); +      break; +   case NV_FILE_MEM_S: +      nv_print_address('s', -1, ind, 4 * nv_value_id(value)); +      break; +   case NV_FILE_MEM_P: +      nv_print_address('p', -1, ind, 4 * nv_value_id(value)); +      break; +   case NV_FILE_MEM_V: +      nv_print_address('v', -1, ind, 4 * 
nv_value_id(value)); +      break; +   case NV_FILE_IMM: +      switch (type) { +      case NV_TYPE_U16: +      case NV_TYPE_S16: +         PRINT(" %s0x%04x", orng, value->reg.imm.u32); +         break; +      case NV_TYPE_F32: +         PRINT(" %s%f", orng, value->reg.imm.f32); +         break; +      case NV_TYPE_F64: +         PRINT(" %s%f", orng, value->reg.imm.f64); +         break; +      case NV_TYPE_U32: +      case NV_TYPE_S32: +      case NV_TYPE_P32: +         PRINT(" %s0x%08x", orng, value->reg.imm.u32); +         break; +      } +      break; +   default: +      if (value->reg.file >= NV_FILE_MEM_G(0) && +          value->reg.file <= NV_FILE_MEM_G(15)) +         nv_print_address('g', value->reg.file - NV_FILE_MEM_G(0), ind, +                          nv_value_id(value) * 4); +      else +      if (value->reg.file >= NV_FILE_MEM_C(0) && +          value->reg.file <= NV_FILE_MEM_C(15)) +         nv_print_address('c', value->reg.file - NV_FILE_MEM_C(0), ind, +                          nv_value_id(value) * 4); +      else +         NOUVEAU_ERR(" BAD_FILE[%i]", nv_value_id(value)); +      break; +   } +} + +static INLINE void +nv_print_ref(struct nv_ref *ref, struct nv_value *ind) +{ +   nv_print_value(ref->value, ind, ref->typecast); +} + +void +nv_print_instruction(struct nv_instruction *i) +{ +   int j; + +   if (i->flags_src) +      nv_print_cond(i); + +   PRINT("%s", gree); +   if (i->opcode == NV_OP_SET) +      PRINT("set %s", nv_cond_name(i->set_cond)); +   else +   if (i->saturate) +      PRINT("sat %s", nv_opcode_name(i->opcode)); +   else +      PRINT("%s", nv_opcode_name(i->opcode)); + +   if (i->flags_def) +      nv_print_value(i->flags_def, NULL, NV_TYPE_ANY); + +   /* Only STORE & STA can write to MEM, and they do not def +    * anything, so the address is thus part of the source. 
+    */ +   if (i->def[0]) +      nv_print_value(i->def[0], NULL, NV_TYPE_ANY); +   else +      PRINT(" #"); + +   for (j = 0; j < 4; ++j) { +      if (!i->src[j]) +         continue; + +      if (i->src[j]->mod) +         PRINT(" %s", nv_modifier_string(i->src[j]->mod)); + +      nv_print_ref(i->src[j], +                   (j == nv50_indirect_opnd(i)) ? +                   i->src[4]->value : NULL); +   } +   if (!i->is_long) +      PRINT(" %ss", norm); +   PRINT("\n"); +} diff --git a/src/gallium/drivers/nv50/nv50_pc_regalloc.c b/src/gallium/drivers/nv50/nv50_pc_regalloc.c new file mode 100644 index 0000000000..eb446d641a --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_pc_regalloc.c @@ -0,0 +1,973 @@ +/* + * XXX: phi function live intervals start at first ordinary instruction, + *      add_range should be taking care of that already ... + * + * XXX: TEX must choose TEX's def as representative + * + * XXX: Aieee! Must materialize MOVs if source is in other basic block! + *       -- absolutely, or we cannot execute the MOV conditionally at all + * XXX: Aieee! Must include PHIs in LVA so we pull through liveness if + *      PHI source is e.g. in dominator block. 
/* Add the interval [bgn, end) to the sorted live range list of 'val'.
 *
 * If the interval overlaps or touches an existing range, that range is
 * widened in place (and merged with its successors through
 * ranges_coalesce) and TRUE is returned; 'new_range' is NOT consumed in
 * that case and remains owned by the caller.
 *
 * Otherwise a new list node is linked in at the sorted position and FALSE is
 * returned.  The node is 'new_range' when the caller supplied one (ownership
 * passes to the list) or a freshly allocated nv_range when it is NULL.
 *
 * NOTE(review): the CALLOC_STRUCT result is dereferenced without a NULL
 * check.
 */
static boolean
add_range_ex(struct nv_value *val, int bgn, int end, struct nv_range *new_range)
{
   /* nextp tracks the link that must point at a newly inserted node */
   struct nv_range *range, **nextp = &val->livei;

   for (range = val->livei; range; range = range->next) {
      if (end < range->bgn)
         break; /* insert before */

      if (bgn > range->end) {
         nextp = &range->next;
         continue; /* insert after */
      }

      /* overlap */
      if (bgn < range->bgn) {
         /* extends to the left, and possibly to the right as well */
         range->bgn = bgn;
         if (end > range->end)
            range->end = end;
         ranges_coalesce(range);
         return TRUE;
      }
      if (end > range->end) {
         /* extends to the right only; may now reach following ranges */
         range->end = end;
         ranges_coalesce(range);
         return TRUE;
      }
      /* fully contained in an existing range: nothing to change */
      assert(bgn >= range->bgn);
      assert(end <= range->end);
      return TRUE;
   }

   if (!new_range)
      new_range = CALLOC_STRUCT(nv_range);

   /* 'range' is the first node starting after 'end', or NULL at list end */
   new_range->bgn = bgn;
   new_range->end = end;
   new_range->next = range;
   *(nextp) = new_range;
   return FALSE;
}
b->exit->serial); +   assert(b->phi->serial <= end); +   assert(b->exit->serial + 1 >= end); + +   bgn = val->insn->serial; +   if (bgn < b->entry->serial || bgn > b->exit->serial) +      bgn = b->entry->serial; +   // debug_printf("add_range(value %i): [%i, %i)\n", val->n, bgn, end); + +   if (bgn > end) { +      debug_printf("Aieee! BLOCK [%i, %i], RANGE [%i, %i)\n", +                   b->entry->serial, b->exit->serial, bgn, end); +   } +   assert(bgn <= end); + +   if (bgn < val->insn->serial) +      debug_printf("WARNING: leaking value %i ?\n", val->n); + +   add_range_ex(val, bgn, end, NULL); +} + +#ifdef NV50_RA_DEBUG_JOIN +static void +livei_print(struct nv_value *a) +{ +   struct nv_range *r = a->livei; + +   debug_printf("livei %i: ", a->n); +   while (r) { +      debug_printf("[%i, %i) ", r->bgn, r->end); +      r = r->next; +   } +   debug_printf("\n"); +} +#endif + +static void +livei_unify(struct nv_value *dst, struct nv_value *src) +{ +   struct nv_range *range, *next; + +   for (range = src->livei; range; range = next) { +      next = range->next; +      if (add_range_ex(dst, range->bgn, range->end, range)) +         FREE(range); +   } +   src->livei = NULL; +} + +static void +livei_release(struct nv_value *val) +{ +   struct nv_range *range, *next; + +   for (range = val->livei; range; range = next) { +      next = range->next; +      FREE(range); +   } +} + +static boolean +livei_have_overlap(struct nv_value *a, struct nv_value *b) +{ +   struct nv_range *r_a, *r_b; + +   for (r_a = a->livei; r_a; r_a = r_a->next) { +      for (r_b = b->livei; r_b; r_b = r_b->next) { +         if (r_b->bgn < r_a->end && +             r_b->end > r_a->bgn) +            return TRUE; +      } +   } +   return FALSE; +} + +static int +livei_end(struct nv_value *a) +{ +   struct nv_range *r = a->livei; + +   assert(r); +   while (r->next) +      r = r->next; +   return r->end; +} + +static boolean +livei_contains(struct nv_value *a, int pos) +{ +   struct nv_range *r; + 
+   for (r = a->livei; r && r->bgn <= pos; r = r->next) +      if (r->end > pos) +         return TRUE; +   return FALSE; +} + +static boolean +reg_assign(struct register_set *set, struct nv_value **def, int n) +{ +   int i, id, s; +   uint m; +   int f = def[0]->reg.file; + +   s = n << (nv_type_order(def[0]->reg.type) - 1); +   m = (1 << s) - 1; + +   id = set->last[f]; + +   for (i = 0; i * 32 < set->last[f]; ++i) { +      if (set->bits[f][i] == 0xffffffff) +         continue; + +      for (id = 0; id < 32; id += s) +         if (!(set->bits[f][i] & (m << id))) +            break; +      if (id < 32) +         break; +   } +   if (i * 32 + id > set->last[f]) +      return FALSE; + +   set->bits[f][i] |= m << id; + +   id += i * 32; + +   set->pc->max_reg[f] = MAX2(set->pc->max_reg[f], id + s - 1); + +   id >>= nv_type_order(def[0]->reg.type) - 1; + +   for (i = 0; i < n; ++i) +      if (def[i]->livei) +         def[i]->reg.id = id++; + +   return TRUE; +} + +static INLINE void +reg_occupy(struct register_set *set, struct nv_value *val) +{ +   int s, id = val->reg.id, f = val->reg.file; +   uint m; + +   if (id < 0) +      return; +   s = nv_type_order(val->reg.type) - 1; +   id <<= s; +   m = (1 << (1 << s)) - 1; + +   set->bits[f][id / 32] |= m << (id % 32); + +   if (set->pc->max_reg[f] < id) +      set->pc->max_reg[f] = id; +} + +static INLINE void +reg_release(struct register_set *set, struct nv_value *val) +{ +   int s, id = val->reg.id, f = val->reg.file; +   uint m; + +   if (id < 0) +      return; + +   s = nv_type_order(val->reg.type) - 1; +   id <<= s; +   m = (1 << (1 << s)) - 1; + +   set->bits[f][id / 32] &= ~(m << (id % 32)); +} + +static INLINE boolean +join_allowed(struct nv_pc_pass *ctx, struct nv_value *a, struct nv_value *b) +{ +   int i; +   struct nv_value *val; + +   if (a->reg.file != b->reg.file || +       nv_type_sizeof(a->reg.type) != nv_type_sizeof(b->reg.type)) +      return FALSE; + +   if (a->join->reg.id == b->join->reg.id) +      
return TRUE; + +#if 1 +   /* either a or b or both have been assigned */ + +   if (a->join->reg.id >= 0 && b->join->reg.id >= 0) +      return FALSE; +   else +   if (b->join->reg.id >= 0) { +      if (a->join->reg.id >= 0) +         return FALSE; +      val = a; +      a = b; +      b = val; +   } + +   for (i = 0; i < ctx->pc->num_values; ++i) { +      val = &ctx->pc->values[i]; + +      if (val->join->reg.id != a->join->reg.id) +         continue; +      if (val->join != a->join && livei_have_overlap(val->join, b->join)) +         return FALSE; +   } +   return TRUE; +#endif +   return FALSE; +} + +static INLINE void +do_join_values(struct nv_pc_pass *ctx, struct nv_value *a, struct nv_value *b) +{ +   int j; +   struct nv_value *bjoin = b->join; + +   if (b->join->reg.id >= 0) +      a->join->reg.id = b->join->reg.id; + +   livei_unify(a->join, b->join); + +#ifdef NV50_RA_DEBUG_JOIN +   debug_printf("joining %i to %i\n", b->n, a->n); +#endif +    +   /* make a->join the new representative */ +   for (j = 0; j < ctx->pc->num_values; ++j)  +      if (ctx->pc->values[j].join == bjoin) +         ctx->pc->values[j].join = a->join; + +   assert(b->join == a->join); +} + +static INLINE void +try_join_values(struct nv_pc_pass *ctx, struct nv_value *a, struct nv_value *b) +{ +   if (!join_allowed(ctx, a, b)) { +#ifdef NV50_RA_DEBUG_JOIN +      debug_printf("cannot join %i to %i: not allowed\n", b->n, a->n); +#endif +      return; +   } +   if (livei_have_overlap(a->join, b->join)) { +#ifdef NV50_RA_DEBUG_JOIN +      debug_printf("cannot join %i to %i: livei overlap\n", b->n, a->n); +      livei_print(a); +      livei_print(b); +#endif +      return; +   } + +   do_join_values(ctx, a, b); +} + +/* For each operand of each phi in b, generate a new value by inserting a MOV + * at the end of the block it is coming from and replace the operand with it. + * This eliminates liveness conflicts. 
/* Recursively visit the CFG starting at b.  For every predecessor p of b,
 * each PHI of b that has an operand defined in p gets that operand replaced
 * by the result of a newly created MOV.  Blocks are marked with the current
 * pass_seq so each is processed once.  Always returns 0.
 */
static int
pass_generate_phi_movs(struct nv_pc_pass *ctx, struct nv_basic_block *b)
{
   struct nv_instruction *i, *i2;
   struct nv_basic_block *p, *pn;
   struct nv_value *val;
   int n, j;

   b->pass_seq = ctx->pc->pass_seq;

   for (n = 0; n < b->num_in; ++n) {
      p = b->in[n];
      assert(p);

      /* p has two successors and b has several predecessors: split the
       * edge p -> b with a new block pn so the MOVs get a block of their own
       */
      if (b->num_in > 1 && p->out[0] && p->out[1]) { /* if without else */
         pn = new_basic_block(ctx->pc);

         if (p->out[0] == b)
            p->out[0] = pn;
         else
            p->out[1] = pn;

         if (p->exit->target == b) /* target to new else-block */
            p->exit->target = pn;

         /* replace p by pn in b's predecessor list */
         for (j = 0; j < b->num_in; ++j) {
            if (b->in[j] == p) {
               b->in[j] = pn;
               break;
            }
         }

         pn->out[0] = b;
         pn->in[0] = p;
         pn->num_in = 1;
      } else
         pn = p;

      /* presumably new_instruction() appends to current_block -- the helper
       * is not visible here, TODO confirm
       */
      ctx->pc->current_block = pn;

      /* every block with PHIs will also have other operations */
      for (i = b->phi; i && i->opcode == NV_OP_PHI; i = i->next) {
         /* find the PHI operand whose defining instruction lives in p */
         for (j = 0; j < 4; ++j) {
            if (!i->src[j])
               j = 3; /* no more operands: ++j makes j == 4, i.e. not found */
            else
            if (i->src[j]->value->insn->bb == p)
               break;
         }
         if (j >= 4)
            continue;
         assert(i->src[j]);
         val = i->src[j]->value;

         /* XXX: should probably not insert this after terminator */
         i2 = new_instruction(ctx->pc, NV_OP_MOV);

         /* the MOV defines a fresh value of the same file and type */
         i2->def[0] = new_value(ctx->pc, val->reg.file, val->reg.type);
         i2->src[0] = new_ref  (ctx->pc, val);
         i2->def[0]->insn = i2;

         /* the PHI now reads the copy instead of the original value */
         nv_reference(ctx->pc, &i->src[j], i2->def[0]);
      }
      if (pn != p && pn->exit) {
         /* XXX: this branch should probably be eliminated */
         /* NOTE(review): the BRA is created with the *other* predecessor of
          * b (index 0 or 1) as current block -- confirm this is intended
          */
         ctx->pc->current_block = b->in[n ? 0 : 1];
         i2 = new_instruction(ctx->pc, NV_OP_BRA);
         i2->target = b;
         i2->is_terminator = 1;
      }
   }

   /* descend into successors not yet visited in this pass */
   if (b->out[0] && b->out[0]->pass_seq < ctx->pc->pass_seq) {
      pass_generate_phi_movs(ctx, b->out[0]);
   }

   if (b->out[1] && b->out[1]->pass_seq < ctx->pc->pass_seq) {
      pass_generate_phi_movs(ctx, b->out[1]);
   }

   return 0;
}
ctx->num_insns; +      ctx->insns[ctx->num_insns++] = i; +   } + +   b->pass_seq = ctx->pc->pass_seq; + +   if (!b->out[0]) +      return 0; +   if (!b->out[1] && ++(b->out[0]->priv) != b->out[0]->num_in) +      return 0; + +   if (b->out[0] != b) +      pass_order_instructions(ctx, b->out[0]); +   if (b->out[1] && b->out[1] != b) +      pass_order_instructions(ctx, b->out[1]); + +   return 0; +} + +static void +bb_live_set_print(struct nv_pc *pc, struct nv_basic_block *b) +{ +#ifdef NV50_RA_DEBUG_LIVE_SETS +   int j; +   struct nv_value *val; + +   debug_printf("live_set of %p: ", b); + +   for (j = 0; j < pc->num_values; ++j) { +      if (!(b->live_set[j / 32] & (1 << (j % 32)))) +         continue; +      val = &pc->values[j]; +      if (!val->insn) +         continue; +      debug_printf("%i ", val->n); +   } +   debug_printf("\n"); +#endif +} + +static INLINE void +live_set_add(struct nv_basic_block *b, struct nv_value *val) +{ +   if (!val->insn) /* don't add non-def values */ +      return; +   /* debug_printf("live[%p] <- %i\n", b, val->n); */ + +   b->live_set[val->n / 32] |= 1 << (val->n % 32); +} + +static INLINE void +live_set_rem(struct nv_basic_block *b, struct nv_value *val) +{ +   /* if (val->insn) +      debug_printf("live[%p] -> %i\n", b, val->n); */ +   b->live_set[val->n / 32] &= ~(1 << (val->n % 32)); +} + +static INLINE boolean +live_set_test(struct nv_basic_block *b, struct nv_ref *ref) +{ +   int n = ref->value->n; +   return b->live_set[n / 32] & (1 << (n % 32)); +} + +/* check if bf (future) can be reached from bp (past) */ +static boolean +bb_reachable_by(struct nv_basic_block *bf, struct nv_basic_block *bp, +		struct nv_basic_block *bt) +{ +   if (bf == bp) +      return TRUE; +   if (bp == bt) +      return FALSE; + +   if (bp->out[0] && bp->out[0] != bp && +       bb_reachable_by(bf, bp->out[0], bt)) +      return TRUE; +   if (bp->out[1] && bp->out[1] != bp && +       bb_reachable_by(bf, bp->out[1], bt)) +      return TRUE; +   return 
FALSE; +} + +/* The live set of a block contains those values that are live immediately + * before the beginning of the block. + */ +static int +pass_build_live_sets(struct nv_pc_pass *ctx, struct nv_basic_block *b) +{ +   struct nv_instruction *i; +   int j, n, ret = 0; + +   /* slight hack for undecidedness: set phi = entry if it's undefined */ +   if (!b->phi) +      b->phi = b->entry; + +   for (n = 0; n < 2; ++n) { +      if (!b->out[n] || b->out[n] == b) +         continue; +      ret = pass_build_live_sets(ctx, b->out[n]); +      if (ret) +         return ret; + +      if (n == 0) { +         for (j = 0; j < (ctx->pc->num_values + 31) / 32; ++j) +            b->live_set[j] = b->out[n]->live_set[j]; +      } else { +         for (j = 0; j < (ctx->pc->num_values + 31) / 32; ++j) +            b->live_set[j] |= b->out[n]->live_set[j]; +      } + +      /* Kick values out of our live set that are created in incoming +       * blocks of our successors that are not us. +       */ +      for (i = b->out[n]->phi; i && i->opcode == NV_OP_PHI; i = i->next) { +         for (j = 0; j < 4; ++j) { +            if (!i->src[j]) +               break; +            assert(i->src[j]->value->insn); + +            if (bb_reachable_by(b, i->src[j]->value->insn->bb, b->out[n])) { +               live_set_add(b, i->src[j]->value); +               debug_printf("%p: live set + %i\n", b, i->src[j]->value->n); +            } else { +               live_set_rem(b, i->src[j]->value); +               debug_printf("%p: live set - %i\n", b, i->src[j]->value->n); +            } +         } +      } +   } + +   if (b->pass_seq >= ctx->pc->pass_seq) +      return 0; +   b->pass_seq = ctx->pc->pass_seq; + +   debug_printf("%s: visiting block %p\n", __FUNCTION__, b); + +   if (!b->entry) +      return 0; +   bb_live_set_print(ctx->pc, b); + +   for (i = b->exit; i; i = i->prev) { +      for (j = 0; j < 4; j++) { +         if (!i->def[j]) +            break; +         live_set_rem(b, i->def[j]); + 
     } +      for (j = 0; j < 4; j++) { +         if (!i->src[j]) +            break; +         live_set_add(b, i->src[j]->value); +      } +      if (i->src[4]) +         live_set_add(b, i->src[4]->value); +      if (i->flags_def) +         live_set_rem(b, i->flags_def); +      if (i->flags_src) +         live_set_add(b, i->flags_src->value); +   } +   bb_live_set_print(ctx->pc, b); + +   return 0; +} + +static void collect_live_values(struct nv_basic_block *b, const int n) +{ +   int i; + +   if (b->out[0]) { +      if (b->out[1]) { /* what to do about back-edges ? */ +         for (i = 0; i < n; ++i) +            b->live_set[i] = b->out[0]->live_set[i] | b->out[1]->live_set[i]; +      } else { +         memcpy(b->live_set, b->out[0]->live_set, n * sizeof(uint32_t)); +      } +   } else +   if (b->out[1]) { +      memcpy(b->live_set, b->out[1]->live_set, n * sizeof(uint32_t)); +   } else { +      memset(b->live_set, 0, n * sizeof(uint32_t)); +   } +} + +/* NOTE: the live intervals of phi functions start the the first non-phi instruction */ +static int +pass_build_intervals(struct nv_pc_pass *ctx, struct nv_basic_block *b) +{ +   struct nv_instruction *i, *i_stop; +   int j, s; +   const int n = (ctx->pc->num_values + 31) / 32; + +   debug_printf("building intervals for BB %i\n", b->id); + +   /* verify that first block does not have live-in values */ +   if (b->num_in == 0) +      for (j = 0; j < n; ++j) +         assert(b->live_set[j] == 0); + +   collect_live_values(b, n); + +   /* remove live-outs def'd in a parallel block, hopefully they're all phi'd */ +   for (j = 0; j < 2; ++j) { +      if (!b->out[j] || !b->out[j]->phi) +         continue; +      for (i = b->out[j]->phi; i->opcode == NV_OP_PHI; i = i->next) { +         live_set_rem(b, i->def[0]); + +         for (s = 0; s < 4; ++s) { +            if (!i->src[s]) +               break; +            assert(i->src[s]->value->insn); +            if (bb_reachable_by(b, i->src[s]->value->insn->bb, b->out[j])) + 
              live_set_add(b, i->src[s]->value); +            else +               live_set_rem(b, i->src[s]->value); +         } +      } +   } + +   /* remaining live-outs are live until the end */ +   for (j = 0; j < ctx->pc->num_values; ++j) { +      if (!(b->live_set[j / 32] & (1 << (j % 32)))) +         continue; +#ifdef NV50_RA_DEBUG_LIVEI +      debug_printf("adding range for live value %i\n", j); +#endif +      add_range(&ctx->pc->values[j], b, b->exit->serial + 1); +   } +   debug_printf("%s: looping through instructions now\n", __func__); + +   i_stop = b->entry ? b->entry->prev : NULL; + +   /* don't have to include phi functions here (will have 0 live range) */ +   for (i = b->exit; i != i_stop; i = i->prev) { +      assert(i->serial >= b->phi->serial && i->serial <= b->exit->serial); +      for (j = 0; j < 4; ++j) { +         if (i->def[j]) +            live_set_rem(b, i->def[j]); +      } +      if (i->flags_def) +         live_set_rem(b, i->flags_def); + +      for (j = 0; j < 5; ++j) { +         if (i->src[j] && !live_set_test(b, i->src[j])) { +            live_set_add(b, i->src[j]->value); +#ifdef NV50_RA_DEBUG_LIVEI +            debug_printf("adding range for source that ends living: %i\n", +                         i->src[j]->value->n); +#endif +            add_range(i->src[j]->value, b, i->serial); +         } +      } +      if (i->flags_src && !live_set_test(b, i->flags_src)) { +         live_set_add(b, i->flags_src->value); +#ifdef NV50_RA_DEBUG_LIVEI +         debug_printf("adding range for source that ends living: %i\n", +                      i->flags_src->value->n); +#endif +         add_range(i->flags_src->value, b, i->serial); +      } +   } + +   b->pass_seq = ctx->pc->pass_seq; + +   if (b->out[0] && b->out[0]->pass_seq < ctx->pc->pass_seq) +      pass_build_intervals(ctx, b->out[0]); + +   if (b->out[1] && b->out[1]->pass_seq < ctx->pc->pass_seq) +      pass_build_intervals(ctx, b->out[1]); + +   debug_printf("built intervals for 
block %p\n", b);
+
+   return 0;
+}
+
+/* Reset a register_set: zero it, fill in the per-file last[] values and
+ * remember the owning program context.
+ * NOTE(review): last[] presumably holds the highest usable register index
+ * of each file - confirm against the users of struct register_set.
+ */
+static INLINE void
+nv50_ctor_register_set(struct nv_pc *pc, struct register_set *set)
+{
+   memset(set, 0, sizeof(*set));
+
+   set->last[NV_FILE_GPR] = 255;
+   set->last[NV_FILE_OUT] = 127;
+   set->last[NV_FILE_FLAGS] = 4;
+   set->last[NV_FILE_ADDR] = 4;
+
+   set->pc = pc;
+}
+
+/* Insert nval into the circular list headed by list, searching backwards
+ * from the tail for the first element whose live interval does not begin
+ * after nval's. If the list was sorted by ascending livei->bgn before the
+ * call, it still is afterwards.
+ */
+static void
+insert_ordered_tail(struct nv_value *list, struct nv_value *nval)
+{
+   struct nv_value *elem = list->prev;
+
+   // debug_printf("inserting value %i\n", nval->n);
+
+   for (elem = list->prev;
+	elem != list && elem->livei->bgn > nval->livei->bgn;
+	elem = elem->prev);
+   /* now elem begins before or at the same time as val */
+
+   nval->prev = elem;
+   nval->next = elem->next;
+   elem->next->prev = nval;
+   elem->next = nval;
+}
+
+/* Linear scan over all def'd values that own a live interval.
+ *
+ * Values are collected into 'unhandled' (ordered by interval begin) and
+ * then processed one by one; 'active', 'inactive' and 'handled' track the
+ * state of the values already seen relative to the begin of the current
+ * one. 'free' mirrors the registers held by the active values, 'f' is a
+ * per-value copy that additionally blocks the registers of overlapping
+ * inactive values and of overlapping unhandled values that already have a
+ * register id.
+ *
+ * iter: results of vector ops get registers on every call (reg_assign is
+ * asked for 4, presumably consecutive registers - confirm), all other
+ * values only when iter is non-zero.
+ *
+ * Always returns 0; running out of registers aborts the process.
+ */
+static int
+pass_linear_scan(struct nv_pc_pass *ctx, int iter)
+{
+   struct nv_instruction *i;
+   struct register_set f, free;
+   int k, n;
+   struct nv_value *cur, *val, *tmp[2];
+   struct nv_value active, inactive, handled, unhandled;
+
+   make_empty_list(&active);
+   make_empty_list(&inactive);
+   make_empty_list(&handled);
+   make_empty_list(&unhandled);
+
+   nv50_ctor_register_set(ctx->pc, &free);
+
+   /* joined values should have range = NULL and thus not be added;
+    * also, fixed memory values won't be added because they're not
+    * def'd, just used
+    */
+   for (n = 0; n < ctx->num_insns; ++n) {
+      i = ctx->insns[n];
+
+      for (k = 0; k < 4; ++k) {
+         if (i->def[k] && i->def[k]->livei)
+            insert_ordered_tail(&unhandled, i->def[k]);
+         else
+         if (0 && i->def[k])
+            debug_printf("skipping def'd value %i: no livei\n", i->def[k]->n);
+      }
+      if (i->flags_def && i->flags_def->livei)
+         insert_ordered_tail(&unhandled, i->flags_def);
+   }
+
+   /* sanity check (debug builds only): list is sorted, no joined values */
+   for (val = unhandled.next; val != unhandled.prev; val = val->next) {
+      assert(val->join == val);
+      assert(val->livei->bgn <= val->next->livei->bgn);
+   }
+
+   foreach_s(cur, tmp[0], &unhandled) {
+      remove_from_list(cur);
+
+      /* debug_printf("handling value %i\n", cur->n); */
+
+      /* retire active values that ended or do not cover cur's begin */
+      foreach_s(val, tmp[1], &active) {
+         if (livei_end(val) <= cur->livei->bgn) {
+            reg_release(&free, val);
+            move_to_head(&handled, val);
+         } else
+         if (!livei_contains(val, cur->livei->bgn)) {
+            reg_release(&free, val);
+            move_to_head(&inactive, val);
+         }
+      }
+
+      /* retire or re-activate inactive values */
+      foreach_s(val, tmp[1], &inactive) {
+         if (livei_end(val) <= cur->livei->bgn)
+            move_to_head(&handled, val);
+         else
+         if (livei_contains(val, cur->livei->bgn)) {
+            reg_occupy(&free, val);
+            move_to_head(&active, val);
+         }
+      }
+
+      f = free;
+
+      foreach(val, &inactive)
+         if (livei_have_overlap(val, cur))
+            reg_occupy(&f, val);
+
+      foreach(val, &unhandled)
+         if (val->reg.id >= 0 && livei_have_overlap(val, cur))
+            reg_occupy(&f, val);
+
+      if (cur->reg.id < 0) {
+         boolean mem = FALSE;
+
+         if (nv_is_vector_op(cur->insn->opcode))
+            mem = !reg_assign(&f, &cur->insn->def[0], 4);
+         else
+         if (iter)
+            mem = !reg_assign(&f, &cur, 1);
+
+         if (mem) {
+            NOUVEAU_ERR("out of registers\n");
+            abort();
+         }
+      }
+      insert_at_head(&active, cur);
+      reg_occupy(&free, cur);
+   }
+
+   return 0;
+}
+
+/* Placeholder pass: does nothing yet and always returns 0. */
+static int
+pass_eliminate_moves(struct nv_pc_pass *ctx)
+{
+   return 0;
+}
+
+/* Register allocation entry point.
+ *
+ * Runs, in order: phi move generation, live set construction (once per
+ * loop_nesting_bound), instruction ordering, live interval construction,
+ * then two rounds of value joining + linear scan, and finally releases the
+ * live intervals of all values.
+ *
+ * Returns 0 on success, -1 if the pass context cannot be allocated, or the
+ * non-zero result of the first pass that failed.
+ */
+int
+nv_pc_exec_pass1(struct nv_pc *pc)
+{
+   struct nv_pc_pass *ctx;
+   int i, ret;
+
+   debug_printf("REGISTER ALLOCATION - entering\n");
+
+   ctx = CALLOC_STRUCT(nv_pc_pass);
+   if (!ctx)
+      return -1;
+   ctx->pc = pc;
+
+   nv_print_program(ctx->pc->root);
+
+   /* NOTE(review): sized from the instruction count *before*
+    * pass_generate_phi_movs runs; if that pass adds instructions, confirm
+    * the array is still large enough for pass_order_instructions.
+    */
+   ctx->insns = CALLOC(pc->num_instructions, sizeof(struct nv_instruction *));
+   if (!ctx->insns) {
+      FREE(ctx);
+      return -1;
+   }
+
+   pc->pass_seq++;
+   ret = pass_generate_phi_movs(ctx, pc->root);
+   assert(!ret);
+   if (ret) /* asserts vanish in release builds, so check explicitly */
+      goto out;
+
+   nv_print_program(ctx->pc->root);
+
+   for (i = 0; i < pc->loop_nesting_bound; ++i) {
+      pc->pass_seq++;
+      ret = pass_build_live_sets(ctx, pc->root);
+      assert(!ret && "live sets");
+      if (ret) {
+         NOUVEAU_ERR("failed to build live sets (iteration %d)\n", i);
+         goto out;
+      }
+   }
+
+   pc->pass_seq++;
+   ret = pass_order_instructions(ctx, pc->root);
+   assert(!ret && "order instructions");
+   if (ret)
+      goto out;
+
+   pc->pass_seq++;
+   ret = pass_build_intervals(ctx, pc->root);
+   assert(!ret && "build intervals");
+   if (ret) {
+      NOUVEAU_ERR("failed to build live intervals\n");
+      goto out;
+   }
+
+   for (i = 0; i < 2; ++i) {
+      ret = pass_join_values(ctx, i);
+      if (ret)
+         goto out;
+      ret = pass_linear_scan(ctx, i);
+      if (ret)
+         goto out;
+   }
+   assert(!ret && "joining");
+
+   ret = pass_eliminate_moves(ctx);
+
+   for (i = 0; i < pc->num_values; ++i)
+      livei_release(&pc->values[i]);
+
+   debug_printf("REGISTER ALLOCATION - leaving\n");
+   nv_print_program(ctx->pc->root);
+
+out:
+   /* the instruction array is owned by the pass context */
+   FREE(ctx->insns);
+   FREE(ctx);
+   return ret;
+}
diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c
index 8cb1639013..26d1be8db8 100644
--- a/src/gallium/drivers/nv50/nv50_program.c
+++ b/src/gallium/drivers/nv50/nv50_program.c
@@ -1,5 +1,5 @@
 /*
- * Copyright 2008 Ben Skeggs
+ * Copyright 2010 Christoph Bumiller
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -20,4674 +20,553 @@
  * SOFTWARE.
*/ -#include "pipe/p_context.h" -#include "pipe/p_defines.h" -#include "pipe/p_state.h" -#include "util/u_inlines.h" +#include "nv50_program.h" +#include "nv50_pc.h" +#include "nv50_context.h"  #include "pipe/p_shader_tokens.h"  #include "tgsi/tgsi_parse.h"  #include "tgsi/tgsi_util.h" -#include "nv50_context.h" -#include "nv50_transfer.h" - -#define NV50_SU_MAX_TEMP 127 -#define NV50_SU_MAX_ADDR 4 -//#define NV50_PROGRAM_DUMP - -/* $a5 and $a6 always seem to be 0, and using $a7 gives you noise */ - -/* ARL - gallium craps itself on progs/vp/arl.txt - * - * MSB - Like MAD, but MUL+SUB - * 	- Fuck it off, introduce a way to negate args for ops that - * 	  support it. - * - * Look into inlining IMMD for ops other than MOV (make it general?) - * 	- Maybe even relax restrictions a bit, can't do P_RESULT + P_IMMD, - * 	  but can emit to P_TEMP first - then MOV later. NVIDIA does this - * - * In ops such as ADD it's possible to construct a bad opcode in the !is_long() - * case, if the emit_src() causes the inst to suddenly become long. - * - * Verify half-insns work where expected - and force disable them where they - * don't work - MUL has it forcibly disabled atm as it fixes POW.. - * - * FUCK! watch dst==src vectors, can overwrite components that are needed. - * 	ie. SUB R0, R0.yzxw, R0 - * - * Things to check with renouveau: - * 	FP attr/result assignment - how? 
- * 		attrib - * 			- 0x16bc maps vp output onto fp hpos - * 			- 0x16c0 maps vp output onto fp col0 - * 		result - * 			- colr always 0-3 - * 			- depr always 4 - * 0x16bc->0x16e8 --> some binding between vp/fp regs - * 0x16b8 --> VP output count - * - * 0x1298 --> "MOV rcol.x, fcol.y" "MOV depr, fcol.y" = 0x00000005 - * 	      "MOV rcol.x, fcol.y" = 0x00000004 - * 0x19a8 --> as above but 0x00000100 and 0x00000000 - * 	- 0x00100000 used when KIL used - * 0x196c --> as above but 0x00000011 and 0x00000000 - * - * 0x1988 --> 0xXXNNNNNN - * 	- XX == FP high something - */ -struct nv50_reg { -	enum { -		P_TEMP, -		P_ATTR, -		P_RESULT, -		P_CONST, -		P_IMMD, -		P_ADDR -	} type; -	int index; - -	int hw; -	int mod; - -	int rhw; /* result hw for FP outputs, or interpolant index */ -	int acc; /* instruction where this reg is last read (first insn == 1) */ - -	int vtx; /* vertex index, for GP inputs (TGSI Dimension.Index) */ -	int indirect[2]; /* index into pc->addr, or -1 */ - -	ubyte buf_index; /* c{0 .. 15}[] or g{0 .. 15}[] */ -}; - -#define NV50_MOD_NEG 1 -#define NV50_MOD_ABS 2 -#define NV50_MOD_NEG_ABS (NV50_MOD_NEG | NV50_MOD_ABS) -#define NV50_MOD_SAT 4 -#define NV50_MOD_I32 8 - -/* NV50_MOD_I32 is used to indicate integer mode for neg/abs */ - -/* STACK: Conditionals and loops have to use the (per warp) stack. - * Stack entries consist of an entry type (divergent path, join at), - * a mask indicating the active threads of the warp, and an address. - * MPs can store 12 stack entries internally, if we need more (and - * we probably do), we have to create a stack buffer in VRAM. 
- */ -/* impose low limits for now */ -#define NV50_MAX_COND_NESTING 4 -#define NV50_MAX_LOOP_NESTING 3 - -#define JOIN_ON(e) e; pc->p->exec_tail->inst[1] |= 2 - -struct nv50_pc { -	struct nv50_program *p; - -	/* hw resources */ -	struct nv50_reg *r_temp[NV50_SU_MAX_TEMP]; -	struct nv50_reg r_addr[NV50_SU_MAX_ADDR]; - -	/* tgsi resources */ -	struct nv50_reg *temp; -	int temp_nr; -	struct nv50_reg *attr; -	int attr_nr; -	struct nv50_reg *result; -	int result_nr; -	struct nv50_reg *param; -	int param_nr; -	struct nv50_reg *immd; -	uint32_t *immd_buf; -	int immd_nr; -	struct nv50_reg **addr; -	int addr_nr; -	struct nv50_reg *sysval; -	int sysval_nr; - -	struct nv50_reg *temp_temp[16]; -	struct nv50_program_exec *temp_temp_exec[16]; -	unsigned temp_temp_nr; - -	/* broadcast and destination replacement regs */ -	struct nv50_reg *r_brdc; -	struct nv50_reg *r_dst[4]; - -	struct nv50_reg reg_instances[16]; -	unsigned reg_instance_nr; - -	unsigned interp_mode[32]; -	/* perspective interpolation registers */ -	struct nv50_reg *iv_p; -	struct nv50_reg *iv_c; - -	struct nv50_program_exec *if_insn[NV50_MAX_COND_NESTING]; -	struct nv50_program_exec *if_join[NV50_MAX_COND_NESTING]; -	struct nv50_program_exec *loop_brka[NV50_MAX_LOOP_NESTING]; -	int if_lvl, loop_lvl; -	unsigned loop_pos[NV50_MAX_LOOP_NESTING]; - -	unsigned *insn_pos; /* actual program offset of each TGSI insn */ -	boolean in_subroutine; - -	/* current instruction and total number of insns */ -	unsigned insn_cur; -	unsigned insn_nr; - -	boolean allow32; - -	uint8_t edgeflag_out; -}; - -static struct nv50_reg *get_address_reg(struct nv50_pc *, struct nv50_reg *); - -static INLINE void -ctor_reg(struct nv50_reg *reg, unsigned type, int index, int hw) -{ -	reg->type = type; -	reg->index = index; -	reg->hw = hw; -	reg->mod = 0; -	reg->rhw = -1; -	reg->vtx = -1; -	reg->acc = 0; -	reg->indirect[0] = reg->indirect[1] = -1; -	reg->buf_index = (type == P_CONST) ? 
1 : 0; -} -  static INLINE unsigned -popcnt4(uint32_t val) -{ -	static const unsigned cnt[16] -	= { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 }; -	return cnt[val & 0xf]; -} - -static void -terminate_mbb(struct nv50_pc *pc) -{ -	int i; - -	/* remove records of temporary address register values */ -	for (i = 0; i < NV50_SU_MAX_ADDR; ++i) -		if (pc->r_addr[i].index < 0) -			pc->r_addr[i].acc = 0; -} - -static void -alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg) -{ -	int i = 0; - -	if (reg->type == P_RESULT) { -		if (pc->p->cfg.high_result < (reg->hw + 1)) -			pc->p->cfg.high_result = reg->hw + 1; -	} - -	if (reg->type != P_TEMP) -		return; - -	if (reg->hw >= 0) { -		/*XXX: do this here too to catch FP temp-as-attr usage.. -		 *     not clean, but works */ -		if (pc->p->cfg.high_temp < (reg->hw + 1)) -			pc->p->cfg.high_temp = reg->hw + 1; -		return; -	} - -	if (reg->rhw != -1) { -		/* try to allocate temporary with index rhw first */ -		if (!(pc->r_temp[reg->rhw])) { -			pc->r_temp[reg->rhw] = reg; -			reg->hw = reg->rhw; -			if (pc->p->cfg.high_temp < (reg->rhw + 1)) -				pc->p->cfg.high_temp = reg->rhw + 1; -			return; -		} -		/* make sure we don't get things like $r0 needs to go -		 * in $r1 and $r1 in $r0 -		 */ -		i = pc->result_nr * 4; -	} - -	for (; i < NV50_SU_MAX_TEMP; i++) { -		if (!(pc->r_temp[i])) { -			pc->r_temp[i] = reg; -			reg->hw = i; -			if (pc->p->cfg.high_temp < (i + 1)) -				pc->p->cfg.high_temp = i + 1; -			return; -		} -	} - -	NOUVEAU_ERR("out of registers\n"); -	abort(); -} - -static INLINE struct nv50_reg * -reg_instance(struct nv50_pc *pc, struct nv50_reg *reg) -{ -	struct nv50_reg *ri; - -	assert(pc->reg_instance_nr < 16); -	ri = &pc->reg_instances[pc->reg_instance_nr++]; -	if (reg) { -		alloc_reg(pc, reg); -		*ri = *reg; -		reg->indirect[0] = reg->indirect[1] = -1; -		reg->mod = 0; -	} -	return ri; -} - -/* XXX: For shaders that aren't executed linearly (e.g. 
shaders that - * contain loops), we need to assign all hw regs to TGSI TEMPs early, - * lest we risk temp_temps overwriting regs alloc'd "later". - */ -static struct nv50_reg * -alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst) -{ -	struct nv50_reg *r; -	int i; - -	if (dst && dst->type == P_TEMP && dst->hw == -1) -		return dst; - -	for (i = 0; i < NV50_SU_MAX_TEMP; i++) { -		if (!pc->r_temp[i]) { -			r = MALLOC_STRUCT(nv50_reg); -			ctor_reg(r, P_TEMP, -1, i); -			pc->r_temp[i] = r; -			return r; -		} -	} - -	NOUVEAU_ERR("out of registers\n"); -	abort(); -	return NULL; -} - -/* release the hardware resource held by r */ -static void -release_hw(struct nv50_pc *pc, struct nv50_reg *r) -{ -	assert(r->type == P_TEMP); -	if (r->hw == -1) -		return; - -	assert(pc->r_temp[r->hw] == r); -	pc->r_temp[r->hw] = NULL; - -	r->acc = 0; -	if (r->index == -1) -		FREE(r); -} - -static void -free_temp(struct nv50_pc *pc, struct nv50_reg *r) -{ -	if (r->index == -1) { -		unsigned hw = r->hw; - -		FREE(pc->r_temp[hw]); -		pc->r_temp[hw] = NULL; -	} -} - -static int -alloc_temp4(struct nv50_pc *pc, struct nv50_reg *dst[4], int idx) -{ -	int i; - -	if ((idx + 4) >= NV50_SU_MAX_TEMP) -		return 1; - -	if (pc->r_temp[idx] || pc->r_temp[idx + 1] || -	    pc->r_temp[idx + 2] || pc->r_temp[idx + 3]) -		return alloc_temp4(pc, dst, idx + 4); - -	for (i = 0; i < 4; i++) { -		dst[i] = MALLOC_STRUCT(nv50_reg); -		ctor_reg(dst[i], P_TEMP, -1, idx + i); -		pc->r_temp[idx + i] = dst[i]; -	} - -	return 0; -} - -static void -free_temp4(struct nv50_pc *pc, struct nv50_reg *reg[4]) -{ -	int i; - -	for (i = 0; i < 4; i++) -		free_temp(pc, reg[i]); -} - -static struct nv50_reg * -temp_temp(struct nv50_pc *pc, struct nv50_program_exec *e) -{ -	if (pc->temp_temp_nr >= 16) -		assert(0); - -	pc->temp_temp[pc->temp_temp_nr] = alloc_temp(pc, NULL); -	pc->temp_temp_exec[pc->temp_temp_nr] = e; -	return pc->temp_temp[pc->temp_temp_nr++]; -} - -/* This *must* be called for all nv50_program_exec that have been - 
* given as argument to temp_temp, or the temps will be leaked ! - */ -static void -kill_temp_temp(struct nv50_pc *pc, struct nv50_program_exec *e) -{ -	int i; - -	for (i = 0; i < pc->temp_temp_nr; i++) -		if (pc->temp_temp_exec[i] == e) -			free_temp(pc, pc->temp_temp[i]); -	if (!e) -		pc->temp_temp_nr = 0; -} - -static int -ctor_immd_4u32(struct nv50_pc *pc, -	       uint32_t x, uint32_t y, uint32_t z, uint32_t w) -{ -	unsigned size = pc->immd_nr * 4 * sizeof(uint32_t); - -	pc->immd_buf = REALLOC(pc->immd_buf, size, size + 4 * sizeof(uint32_t)); - -	pc->immd_buf[(pc->immd_nr * 4) + 0] = x; -	pc->immd_buf[(pc->immd_nr * 4) + 1] = y; -	pc->immd_buf[(pc->immd_nr * 4) + 2] = z; -	pc->immd_buf[(pc->immd_nr * 4) + 3] = w; - -	return pc->immd_nr++; -} - -static INLINE int -ctor_immd_4f32(struct nv50_pc *pc, float x, float y, float z, float w) -{ -	return ctor_immd_4u32(pc, fui(x), fui(y), fui(z), fui(w)); -} - -static struct nv50_reg * -alloc_immd(struct nv50_pc *pc, float f) -{ -	struct nv50_reg *r = MALLOC_STRUCT(nv50_reg); -	unsigned hw; - -	for (hw = 0; hw < pc->immd_nr * 4; hw++) -		if (pc->immd_buf[hw] == fui(f)) -			break; - -	if (hw == pc->immd_nr * 4) -		hw = ctor_immd_4f32(pc, f, -f, 0.5 * f, 0) * 4; - -	ctor_reg(r, P_IMMD, -1, hw); -	return r; -} - -static struct nv50_program_exec * -exec(struct nv50_pc *pc) -{ -	struct nv50_program_exec *e = CALLOC_STRUCT(nv50_program_exec); - -	e->param.index = -1; -	return e; -} - -static void -emit(struct nv50_pc *pc, struct nv50_program_exec *e) -{ -	struct nv50_program *p = pc->p; - -	if (p->exec_tail) -		p->exec_tail->next = e; -	if (!p->exec_head) -		p->exec_head = e; -	p->exec_tail = e; -	p->exec_size += (e->inst[0] & 1) ? 
2 : 1; - -	kill_temp_temp(pc, e); -} - -static INLINE void set_long(struct nv50_pc *, struct nv50_program_exec *); - -static boolean -is_long(struct nv50_program_exec *e) -{ -	if (e->inst[0] & 1) -		return TRUE; -	return FALSE; -} - -static boolean -is_immd(struct nv50_program_exec *e) -{ -	if (is_long(e) && (e->inst[1] & 3) == 3) -		return TRUE; -	return FALSE; -} - -static boolean -is_join(struct nv50_program_exec *e) -{ -	if (is_long(e) && (e->inst[1] & 3) == 2) -		return TRUE; -	return FALSE; -} - -static INLINE boolean -is_control_flow(struct nv50_program_exec *e) -{ -	return (e->inst[0] & 2); -} - -static INLINE void -set_pred(struct nv50_pc *pc, unsigned pred, unsigned idx, -	 struct nv50_program_exec *e) -{ -	assert(!is_immd(e)); -	set_long(pc, e); -	e->inst[1] &= ~((0x1f << 7) | (0x3 << 12)); -	e->inst[1] |= (pred << 7) | (idx << 12); -} - -static INLINE void -set_pred_wr(struct nv50_pc *pc, unsigned on, unsigned idx, -	    struct nv50_program_exec *e) -{ -	set_long(pc, e); -	e->inst[1] &= ~((0x3 << 4) | (1 << 6)); -	e->inst[1] |= (idx << 4) | (on << 6); -} - -static INLINE void -set_long(struct nv50_pc *pc, struct nv50_program_exec *e) -{ -	if (is_long(e)) -		return; - -	e->inst[0] |= 1; -	set_pred(pc, 0xf, 0, e); -	set_pred_wr(pc, 0, 0, e); -} - -static INLINE void -set_dst(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_program_exec *e) -{ -	if (dst->type == P_RESULT) { -		set_long(pc, e); -		e->inst[1] |= 0x00000008; -	} - -	alloc_reg(pc, dst); -	if (dst->hw > 63) -		set_long(pc, e); -	e->inst[0] |= (dst->hw << 2); -} - -static INLINE void -set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e) -{ -	set_long(pc, e); -	/* XXX: can't be predicated - bits overlap; cases where both -	 * are required should be avoided by using pc->allow32 */ -	set_pred(pc, 0, 0, e); -	set_pred_wr(pc, 0, 0, e); - -	e->inst[1] |= 0x00000002 | 0x00000001; -	e->inst[0] |= (pc->immd_buf[imm->hw] & 0x3f) << 16; -	e->inst[1] |= (pc->immd_buf[imm->hw] 
>> 6) << 2; -} - -static INLINE void -set_addr(struct nv50_program_exec *e, struct nv50_reg *a) -{ -	assert(a->type == P_ADDR); - -	assert(!(e->inst[0] & 0x0c000000)); -	assert(!(e->inst[1] & 0x00000004)); - -	e->inst[0] |= (a->hw & 3) << 26; -	e->inst[1] |= a->hw & 4; -} - -static void -emit_arl(struct nv50_pc *, struct nv50_reg *, struct nv50_reg *, uint8_t); - -static void -emit_shl_imm(struct nv50_pc *, struct nv50_reg *, struct nv50_reg *, int); - -static void -emit_mov_from_addr(struct nv50_pc *pc, struct nv50_reg *dst, -		   struct nv50_reg *src) -{ -	struct nv50_program_exec *e = exec(pc); - -	e->inst[1] = 0x40000000; -	set_long(pc, e); -	set_dst(pc, dst, e); -	set_addr(e, src); - -	emit(pc, e); -} - -static void -emit_add_addr_imm(struct nv50_pc *pc, struct nv50_reg *dst, -		  struct nv50_reg *src0, uint16_t src1_val) -{ -	struct nv50_program_exec *e = exec(pc); - -	e->inst[0] = 0xd0000000 | (src1_val << 9); -	e->inst[1] = 0x20000000; -	set_long(pc, e); -	e->inst[0] |= dst->hw << 2; -	if (src0) /* otherwise will add to $a0, which is always 0 */ -		set_addr(e, src0); - -	emit(pc, e); -} - -#define INTERP_LINEAR		0 -#define INTERP_FLAT		1 -#define INTERP_PERSPECTIVE	2 -#define INTERP_CENTROID		4 - -/* interpolant index has been stored in dst->rhw */ -static void -emit_interp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *iv, -		unsigned mode) -{ -	struct nv50_program_exec *e = exec(pc); -	assert(dst->rhw != -1); - -	e->inst[0] |= 0x80000000; -	set_dst(pc, dst, e); -	e->inst[0] |= (dst->rhw << 16); - -	if (mode & INTERP_FLAT) { -		e->inst[0] |= (1 << 8); -	} else { -		if (mode & INTERP_PERSPECTIVE) { -			e->inst[0] |= (1 << 25); -			alloc_reg(pc, iv); -			e->inst[0] |= (iv->hw << 9); -		} - -		if (mode & INTERP_CENTROID) -			e->inst[0] |= (1 << 24); -	} - -	emit(pc, e); -} - -static void -set_data(struct nv50_pc *pc, struct nv50_reg *src, unsigned m, unsigned s, -	 struct nv50_program_exec *e) -{ -	set_long(pc, e); - -	e->param.index = src->hw & 
127; -	e->param.shift = s; -	e->param.mask = m << (s % 32); - -	if (src->hw < 0 || src->hw > 127) /* need (additional) address reg */ -		set_addr(e, get_address_reg(pc, src)); -	else -	if (src->acc < 0) { -		assert(src->type == P_CONST); -		set_addr(e, pc->addr[src->indirect[0]]); -	} - -	e->inst[1] |= (src->buf_index << 22); -} - -/* Never apply nv50_reg::mod in emit_mov, or carefully check the code !!! */ -static void -emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) -{ -	struct nv50_program_exec *e = exec(pc); - -	e->inst[0] = 0x10000000; -	if (!pc->allow32) -		set_long(pc, e); - -	set_dst(pc, dst, e); - -	if (!is_long(e) && src->type == P_IMMD) { -		set_immd(pc, src, e); -		/*XXX: 32-bit, but steals part of "half" reg space - need to -		 *     catch and handle this case if/when we do half-regs -		 */ -	} else -	if (src->type == P_IMMD || src->type == P_CONST) { -		set_long(pc, e); -		set_data(pc, src, 0x7f, 9, e); -		e->inst[1] |= 0x20000000; /* mov from c[] */ -	} else { -		if (src->type == P_ATTR) { -			set_long(pc, e); -			e->inst[1] |= 0x00200000; - -			if (src->vtx >= 0) { -				/* indirect (vertex base + c) load from p[] */ -				e->inst[0] |= 0x01800000; -				set_addr(e, get_address_reg(pc, src)); -			} -		} - -		alloc_reg(pc, src); -		if (src->hw > 63) -			set_long(pc, e); -		e->inst[0] |= (src->hw << 9); -	} - -	if (is_long(e) && !is_immd(e)) { -		e->inst[1] |= 0x04000000; /* 32-bit */ -		e->inst[1] |= 0x0000c000; /* 32-bit c[] load / lane mask 0:1 */ -		if (!(e->inst[1] & 0x20000000)) -			e->inst[1] |= 0x00030000; /* lane mask 2:3 */ -	} else -		e->inst[0] |= 0x00008000; - -	emit(pc, e); -} - -static INLINE void -emit_mov_immdval(struct nv50_pc *pc, struct nv50_reg *dst, float f) -{ -	struct nv50_reg *imm = alloc_immd(pc, f); -	emit_mov(pc, dst, imm); -	FREE(imm); -} - -/* Assign the hw of the discarded temporary register src - * to the tgsi register dst and free src. 
- */ -static void -assimilate_temp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) -{ -	assert(src->index == -1 && src->hw != -1); - -	if (pc->if_lvl || pc->loop_lvl || -	    (dst->type != P_TEMP) || -	    (src->hw < pc->result_nr * 4 && -	     pc->p->type == PIPE_SHADER_FRAGMENT) || -	    pc->p->info.opcode_count[TGSI_OPCODE_CAL] || -	    pc->p->info.opcode_count[TGSI_OPCODE_BRA]) { - -		emit_mov(pc, dst, src); -		free_temp(pc, src); -		return; -	} - -	if (dst->hw != -1) -		pc->r_temp[dst->hw] = NULL; -	pc->r_temp[src->hw] = dst; -	dst->hw = src->hw; - -	FREE(src); -} - -static void -emit_nop(struct nv50_pc *pc) -{ -	struct nv50_program_exec *e = exec(pc); - -	e->inst[0] = 0xf0000000; -	set_long(pc, e); -	e->inst[1] = 0xe0000000; -	emit(pc, e); -} - -static boolean -check_swap_src_0_1(struct nv50_pc *pc, -		   struct nv50_reg **s0, struct nv50_reg **s1) -{ -	struct nv50_reg *src0 = *s0, *src1 = *s1; - -	if (src0->type == P_CONST) { -		if (src1->type != P_CONST) { -			*s0 = src1; -			*s1 = src0; -			return TRUE; -		} -	} else -	if (src1->type == P_ATTR) { -		if (src0->type != P_ATTR) { -			*s0 = src1; -			*s1 = src0; -			return TRUE; -		} -	} - -	return FALSE; -} - -static void -set_src_0_restricted(struct nv50_pc *pc, struct nv50_reg *src, -		     struct nv50_program_exec *e) -{ -	struct nv50_reg *temp; - -	if (src->type != P_TEMP) { -		temp = temp_temp(pc, e); -		emit_mov(pc, temp, src); -		src = temp; -	} - -	alloc_reg(pc, src); -	if (src->hw > 63) -		set_long(pc, e); -	e->inst[0] |= (src->hw << 9); -} - -static void -set_src_0(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e) -{ -	if (src->type == P_ATTR) { -		set_long(pc, e); -		e->inst[1] |= 0x00200000; - -		if (src->vtx >= 0) { -			e->inst[0] |= 0x01800000; /* src from p[] */ -			set_addr(e, get_address_reg(pc, src)); -		} -	} else -	if (src->type == P_CONST || src->type == P_IMMD) { -		struct nv50_reg *temp = temp_temp(pc, e); - -		emit_mov(pc, temp, src); -		src = temp; 
-	} - -	alloc_reg(pc, src); -	if (src->hw > 63) -		set_long(pc, e); -	e->inst[0] |= (src->hw << 9); -} - -static void -set_src_1(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e) -{ -	if (src->type == P_ATTR) { -		struct nv50_reg *temp = temp_temp(pc, e); - -		emit_mov(pc, temp, src); -		src = temp; -	} else -	if (src->type == P_CONST || src->type == P_IMMD) { -		if (e->inst[0] & 0x01800000) { -			struct nv50_reg *temp = temp_temp(pc, e); - -			emit_mov(pc, temp, src); -			src = temp; -		} else { -			assert(!(e->inst[0] & 0x00800000)); -			set_data(pc, src, 0x7f, 16, e); -			e->inst[0] |= 0x00800000; -		} -	} - -	alloc_reg(pc, src); -	if (src->hw > 63) -		set_long(pc, e); -	e->inst[0] |= ((src->hw & 127) << 16); -} - -static void -set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e) -{ -	set_long(pc, e); - -	if (src->type == P_ATTR) { -		struct nv50_reg *temp = temp_temp(pc, e); - -		emit_mov(pc, temp, src); -		src = temp; -	} else -	if (src->type == P_CONST || src->type == P_IMMD) { -		if (e->inst[0] & 0x01800000) { -			struct nv50_reg *temp = temp_temp(pc, e); - -			emit_mov(pc, temp, src); -			src = temp; -		} else { -			assert(!(e->inst[0] & 0x01000000)); -			set_data(pc, src, 0x7f, 32+14, e); -			e->inst[0] |= 0x01000000; -		} -	} - -	alloc_reg(pc, src); -	e->inst[1] |= ((src->hw & 127) << 14); -} - -static void -set_half_src(struct nv50_pc *pc, struct nv50_reg *src, int lh, -	     struct nv50_program_exec *e, int pos) -{ -	struct nv50_reg *r = src; - -	alloc_reg(pc, r); -	if (r->type != P_TEMP) { -		r = temp_temp(pc, e); -		emit_mov(pc, r, src); -	} - -	if (r->hw > (NV50_SU_MAX_TEMP / 2)) { -		NOUVEAU_ERR("out of low GPRs\n"); -		abort(); -	} - -	e->inst[pos / 32] |= ((src->hw * 2) + lh) << (pos % 32); -} - -static void -emit_mov_from_pred(struct nv50_pc *pc, struct nv50_reg *dst, int pred) -{ -	struct nv50_program_exec *e = exec(pc); - -	assert(dst->type == P_TEMP); -	e->inst[1] = 0x20000000 | (pred << 12); -	
set_long(pc, e); -	set_dst(pc, dst, e); - -	emit(pc, e); -} - -static void -emit_mov_to_pred(struct nv50_pc *pc, int pred, struct nv50_reg *src) -{ -	struct nv50_program_exec *e = exec(pc); - -	e->inst[0] = 0x000001fc; -	e->inst[1] = 0xa0000008; -	set_long(pc, e); -	set_pred_wr(pc, 1, pred, e); -	set_src_0_restricted(pc, src, e); - -	emit(pc, e); -} - -static void -emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, -	 struct nv50_reg *src1) -{ -	struct nv50_program_exec *e = exec(pc); - -	e->inst[0] |= 0xc0000000; - -	if (!pc->allow32) -		set_long(pc, e); - -	check_swap_src_0_1(pc, &src0, &src1); -	set_dst(pc, dst, e); -	set_src_0(pc, src0, e); -	if (src1->type == P_IMMD && !is_long(e)) { -		if (src0->mod ^ src1->mod) -			e->inst[0] |= 0x00008000; -		set_immd(pc, src1, e); -	} else { -		set_src_1(pc, src1, e); -		if ((src0->mod ^ src1->mod) & NV50_MOD_NEG) { -			if (is_long(e)) -				e->inst[1] |= 0x08000000; -			else -				e->inst[0] |= 0x00008000; -		} -	} - -	emit(pc, e); -} - -static void -emit_add(struct nv50_pc *pc, struct nv50_reg *dst, -	 struct nv50_reg *src0, struct nv50_reg *src1) -{ -	struct nv50_program_exec *e = exec(pc); - -	e->inst[0] = 0xb0000000; - -	alloc_reg(pc, src1); -	check_swap_src_0_1(pc, &src0, &src1); - -	if (!pc->allow32 || (src0->mod | src1->mod) || src1->hw > 63) { -		set_long(pc, e); -		e->inst[1] |= ((src0->mod & NV50_MOD_NEG) << 26) | -			      ((src1->mod & NV50_MOD_NEG) << 27); -	} - -	set_dst(pc, dst, e); -	set_src_0(pc, src0, e); -	if (src1->type == P_CONST || src1->type == P_ATTR || is_long(e)) -		set_src_2(pc, src1, e); -	else -	if (src1->type == P_IMMD) -		set_immd(pc, src1, e); -	else -		set_src_1(pc, src1, e); - -	emit(pc, e); -} - -static void -emit_arl(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src, -	 uint8_t s) -{ -	struct nv50_program_exec *e = exec(pc); - -	set_long(pc, e); -	e->inst[1] |= 0xc0000000; - -	e->inst[0] |= dst->hw << 2; -	e->inst[0] |= s << 16; /* shift left */ -	
set_src_0(pc, src, e); - -	emit(pc, e); -} - -static boolean -address_reg_suitable(struct nv50_reg *a, struct nv50_reg *r) -{ -	if (!r) -		return FALSE; - -	if (r->vtx != a->vtx) -		return FALSE; -	if (r->vtx >= 0) -		return (r->indirect[1] == a->indirect[1]); - -	if (r->hw < a->rhw || (r->hw - a->rhw) >= 128) -		return FALSE; - -	if (a->index >= 0) -		return (a->index == r->indirect[0]); -	return (a->indirect[0] == r->indirect[0]); -} - -static void -load_vertex_base(struct nv50_pc *pc, struct nv50_reg *dst, -		 struct nv50_reg *a, int shift) -{ -	struct nv50_reg mem, *temp; - -	ctor_reg(&mem, P_ATTR, -1, dst->vtx); - -	assert(dst->type == P_ADDR); -	if (!a) { -		emit_arl(pc, dst, &mem, 0); -		return; -	} -	temp = alloc_temp(pc, NULL); - -	if (shift) { -		emit_mov_from_addr(pc, temp, a); -		if (shift < 0) -			emit_shl_imm(pc, temp, temp, shift); -		emit_arl(pc, dst, temp, MAX2(shift, 0)); -	} -	emit_mov(pc, temp, &mem); -	set_addr(pc->p->exec_tail, dst); - -	emit_arl(pc, dst, temp, 0); -	free_temp(pc, temp); -} - -/* case (ref == NULL): allocate address register for TGSI_FILE_ADDRESS - * case (vtx >= 0, acc >= 0): load vertex base from a[vtx * 4] to $aX - * case (vtx >= 0, acc < 0): load vertex base from s[$aY + vtx * 4] to $aX - * case (vtx < 0, acc >= 0): memory address too high to encode - * case (vtx < 0, acc < 0): get source register for TGSI_FILE_ADDRESS - */ -static struct nv50_reg * -get_address_reg(struct nv50_pc *pc, struct nv50_reg *ref) -{ -	int i; -	struct nv50_reg *a_ref, *a = NULL; - -	for (i = 0; i < NV50_SU_MAX_ADDR; ++i) { -		if (pc->r_addr[i].acc == 0) -			a = &pc->r_addr[i]; /* an unused address reg */ -		else -		if (address_reg_suitable(&pc->r_addr[i], ref)) { -			pc->r_addr[i].acc = pc->insn_cur; -			return &pc->r_addr[i]; -		} else -		if (!a && pc->r_addr[i].index < 0 && -		    pc->r_addr[i].acc < pc->insn_cur) -			a = &pc->r_addr[i]; -	} -	if (!a) { -		/* We'll be able to spill address regs when this -		 * mess is replaced with a proper 
compiler ... -		 */ -		NOUVEAU_ERR("out of address regs\n"); -		abort(); -		return NULL; -	} - -	/* initialize and reserve for this TGSI instruction */ -	a->rhw = 0; -	a->index = a->indirect[0] = a->indirect[1] = -1; -	a->acc = pc->insn_cur; - -	if (!ref) { -		a->vtx = -1; -		return a; -	} -	a->vtx = ref->vtx; - -	/* now put in the correct value ... */ - -	if (ref->vtx >= 0) { -		a->indirect[1] = ref->indirect[1]; - -		/* For an indirect vertex index, we need to shift address right -		 * by 2, the address register will contain vtx * 16, we need to -		 * load from a[vtx * 4]. -		 */ -		load_vertex_base(pc, a, (ref->acc < 0) ? -				 pc->addr[ref->indirect[1]] : NULL, -2); -	} else { -		assert(ref->acc < 0 || ref->indirect[0] < 0); - -		a->rhw = ref->hw & ~0x7f; -		a->indirect[0] = ref->indirect[0]; -		a_ref = (ref->acc < 0) ? pc->addr[ref->indirect[0]] : NULL; - -		emit_add_addr_imm(pc, a, a_ref, a->rhw * 4); -	} -	return a; -} - -#define NV50_MAX_F32 0x880 -#define NV50_MAX_S32 0x08c -#define NV50_MAX_U32 0x084 -#define NV50_MIN_F32 0x8a0 -#define NV50_MIN_S32 0x0ac -#define NV50_MIN_U32 0x0a4 - -static void -emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst, -	    struct nv50_reg *src0, struct nv50_reg *src1) -{ -	struct nv50_program_exec *e = exec(pc); - -	set_long(pc, e); -	e->inst[0] |= 0x30000000 | ((sub & 0x800) << 20); -	e->inst[1] |= (sub << 24); - -	check_swap_src_0_1(pc, &src0, &src1); -	set_dst(pc, dst, e); -	set_src_0(pc, src0, e); -	set_src_1(pc, src1, e); - -	if (src0->mod & NV50_MOD_ABS) -		e->inst[1] |= 0x00100000; -	if (src1->mod & NV50_MOD_ABS) -		e->inst[1] |= 0x00080000; - -	emit(pc, e); -} - -static INLINE void -emit_sub(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, -	 struct nv50_reg *src1) -{ -	src1->mod ^= NV50_MOD_NEG; -	emit_add(pc, dst, src0, src1); -	src1->mod ^= NV50_MOD_NEG; -} - -static void -emit_bitop2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, -	    struct nv50_reg *src1, 
unsigned op) -{ -	struct nv50_program_exec *e = exec(pc); - -	e->inst[0] = 0xd0000000; -	set_long(pc, e); - -	check_swap_src_0_1(pc, &src0, &src1); -	set_dst(pc, dst, e); -	set_src_0(pc, src0, e); - -	if (op != TGSI_OPCODE_AND && op != TGSI_OPCODE_OR && -	    op != TGSI_OPCODE_XOR) -		assert(!"invalid bit op"); - -	assert(!(src0->mod | src1->mod)); - -	if (src1->type == P_IMMD && src0->type == P_TEMP && pc->allow32) { -		set_immd(pc, src1, e); -		if (op == TGSI_OPCODE_OR) -			e->inst[0] |= 0x0100; -		else -		if (op == TGSI_OPCODE_XOR) -			e->inst[0] |= 0x8000; -	} else { -		set_src_1(pc, src1, e); -		e->inst[1] |= 0x04000000; /* 32 bit */ -		if (op == TGSI_OPCODE_OR) -			e->inst[1] |= 0x4000; -		else -		if (op == TGSI_OPCODE_XOR) -			e->inst[1] |= 0x8000; -	} - -	emit(pc, e); -} - -static void -emit_not(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) -{ -	struct nv50_program_exec *e = exec(pc); - -	e->inst[0] = 0xd0000000; -	e->inst[1] = 0x0402c000; -	set_long(pc, e); -	set_dst(pc, dst, e); -	set_src_1(pc, src, e); - -	emit(pc, e); -} - -static void -emit_shift(struct nv50_pc *pc, struct nv50_reg *dst, -	   struct nv50_reg *src0, struct nv50_reg *src1, unsigned dir) -{ -	struct nv50_program_exec *e = exec(pc); - -	e->inst[0] = 0x30000000; -	e->inst[1] = 0xc4000000; - -	set_long(pc, e); -	set_dst(pc, dst, e); -	set_src_0(pc, src0, e); - -	if (src1->type == P_IMMD) { -		e->inst[1] |= (1 << 20); -		e->inst[0] |= (pc->immd_buf[src1->hw] & 0x7f) << 16; -	} else -		set_src_1(pc, src1, e); - -	if (dir != TGSI_OPCODE_SHL) -		e->inst[1] |= (1 << 29); - -	if (dir == TGSI_OPCODE_ISHR) -		e->inst[1] |= (1 << 27); - -	emit(pc, e); -} - -static void -emit_shl_imm(struct nv50_pc *pc, struct nv50_reg *dst, -	     struct nv50_reg *src, int s) -{ -	struct nv50_program_exec *e = exec(pc); - -	e->inst[0] = 0x30000000; -	e->inst[1] = 0xc4100000; -	if (s < 0) { -		e->inst[1] |= 1 << 29; -		s = -s; -	} -	e->inst[1] |= ((s & 0x7f) << 16); - -	set_long(pc, e); -	set_dst(pc, 
dst, e); -	set_src_0(pc, src, e); - -	emit(pc, e); -} - -static void -emit_mad(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, -	 struct nv50_reg *src1, struct nv50_reg *src2) -{ -	struct nv50_program_exec *e = exec(pc); - -	e->inst[0] |= 0xe0000000; - -	check_swap_src_0_1(pc, &src0, &src1); -	set_dst(pc, dst, e); -	set_src_0(pc, src0, e); -	set_src_1(pc, src1, e); -	set_src_2(pc, src2, e); - -	if ((src0->mod ^ src1->mod) & NV50_MOD_NEG) -		e->inst[1] |= 0x04000000; -	if (src2->mod & NV50_MOD_NEG) -		e->inst[1] |= 0x08000000; - -	emit(pc, e); -} - -static INLINE void -emit_msb(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, -	 struct nv50_reg *src1, struct nv50_reg *src2) -{ -	src2->mod ^= NV50_MOD_NEG; -	emit_mad(pc, dst, src0, src1, src2); -	src2->mod ^= NV50_MOD_NEG; -} - -#define NV50_FLOP_RCP 0 -#define NV50_FLOP_RSQ 2 -#define NV50_FLOP_LG2 3 -#define NV50_FLOP_SIN 4 -#define NV50_FLOP_COS 5 -#define NV50_FLOP_EX2 6 - -/* rcp, rsqrt, lg2 support neg and abs */ -static void -emit_flop(struct nv50_pc *pc, unsigned sub, -	  struct nv50_reg *dst, struct nv50_reg *src) -{ -	struct nv50_program_exec *e = exec(pc); - -	e->inst[0] |= 0x90000000; -	if (sub || src->mod) { -		set_long(pc, e); -		e->inst[1] |= (sub << 29); -	} - -	set_dst(pc, dst, e); -	set_src_0_restricted(pc, src, e); - -	assert(!src->mod || sub < 4); - -	if (src->mod & NV50_MOD_NEG) -		e->inst[1] |= 0x04000000; -	if (src->mod & NV50_MOD_ABS) -		e->inst[1] |= 0x00100000; - -	emit(pc, e); -} - -static void -emit_preex2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) -{ -	struct nv50_program_exec *e = exec(pc); - -	e->inst[0] |= 0xb0000000; - -	set_dst(pc, dst, e); -	set_src_0(pc, src, e); -	set_long(pc, e); -	e->inst[1] |= (6 << 29) | 0x00004000; - -	if (src->mod & NV50_MOD_NEG) -		e->inst[1] |= 0x04000000; -	if (src->mod & NV50_MOD_ABS) -		e->inst[1] |= 0x00100000; - -	emit(pc, e); -} - -static void -emit_precossin(struct nv50_pc *pc, struct nv50_reg 
*dst, struct nv50_reg *src) -{ -	struct nv50_program_exec *e = exec(pc); - -	e->inst[0] |= 0xb0000000; - -	set_dst(pc, dst, e); -	set_src_0(pc, src, e); -	set_long(pc, e); -	e->inst[1] |= (6 << 29); - -	if (src->mod & NV50_MOD_NEG) -		e->inst[1] |= 0x04000000; -	if (src->mod & NV50_MOD_ABS) -		e->inst[1] |= 0x00100000; - -	emit(pc, e); -} - -#define CVT_RN    (0x00 << 16) -#define CVT_FLOOR (0x02 << 16) -#define CVT_CEIL  (0x04 << 16) -#define CVT_TRUNC (0x06 << 16) -#define CVT_SAT   (0x08 << 16) -#define CVT_ABS   (0x10 << 16) - -#define CVT_X32_X32 0x04004000 -#define CVT_X32_S32 0x04014000 -#define CVT_F32_F32 ((0xc0 << 24) | CVT_X32_X32) -#define CVT_S32_F32 ((0x88 << 24) | CVT_X32_X32) -#define CVT_U32_F32 ((0x80 << 24) | CVT_X32_X32) -#define CVT_F32_S32 ((0x40 << 24) | CVT_X32_S32) -#define CVT_F32_U32 ((0x40 << 24) | CVT_X32_X32) -#define CVT_S32_S32 ((0x08 << 24) | CVT_X32_S32) -#define CVT_S32_U32 ((0x08 << 24) | CVT_X32_X32) -#define CVT_U32_S32 ((0x00 << 24) | CVT_X32_S32) - -#define CVT_NEG 0x20000000 -#define CVT_RI  0x08000000 - -static void -emit_cvt(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src, -	 int wp, uint32_t cvn) -{ -	struct nv50_program_exec *e; - -	e = exec(pc); - -	if (src->mod & NV50_MOD_NEG) cvn |= CVT_NEG; -	if (src->mod & NV50_MOD_ABS) cvn |= CVT_ABS; - -	e->inst[0] = 0xa0000000; -	e->inst[1] = cvn; -	set_long(pc, e); -	set_src_0(pc, src, e); - -	if (wp >= 0) -		set_pred_wr(pc, 1, wp, e); - -	if (dst) -		set_dst(pc, dst, e); -	else { -		e->inst[0] |= 0x000001fc; -		e->inst[1] |= 0x00000008; -	} - -	emit(pc, e); -} - -/* nv50 Condition codes: - *  0x1 = LT - *  0x2 = EQ - *  0x3 = LE - *  0x4 = GT - *  0x5 = NE - *  0x6 = GE - *  0x7 = set condition code ? 
(used before bra.lt/le/gt/ge) - *  0x8 = unordered bit (allows NaN) - * - *  mode = 0x04 (u32), 0x0c (s32), 0x80 (f32) - */ -static void -emit_set(struct nv50_pc *pc, unsigned ccode, struct nv50_reg *dst, int wp, -	 struct nv50_reg *src0, struct nv50_reg *src1, uint8_t mode) -{ -	static const unsigned cc_swapped[8] = { 0, 4, 2, 6, 1, 5, 3, 7 }; - -	struct nv50_program_exec *e = exec(pc); -	struct nv50_reg *rdst; - -	assert(ccode < 16); -	if (check_swap_src_0_1(pc, &src0, &src1)) -		ccode = cc_swapped[ccode & 7] | (ccode & 8); - -	rdst = dst; -	if (dst && dst->type != P_TEMP) -		dst = alloc_temp(pc, NULL); - -	set_long(pc, e); -	e->inst[0] |= 0x30000000 | (mode << 24); -	e->inst[1] |= 0x60000000 | (ccode << 14); - -	if (wp >= 0) -		set_pred_wr(pc, 1, wp, e); -	if (dst) -		set_dst(pc, dst, e); -	else { -		e->inst[0] |= 0x000001fc; -		e->inst[1] |= 0x00000008; -	} - -	set_src_0(pc, src0, e); -	set_src_1(pc, src1, e); - -	emit(pc, e); - -	if (rdst && mode == 0x80) /* convert to float ? */ -		emit_cvt(pc, rdst, dst, -1, CVT_ABS | CVT_F32_S32); -	if (rdst && rdst != dst) -		free_temp(pc, dst); -} - -static INLINE void -map_tgsi_setop_hw(unsigned op, uint8_t *cc, uint8_t *ty) -{ -	switch (op) { -	case TGSI_OPCODE_SLT: *cc = 0x1; *ty = 0x80; break; -	case TGSI_OPCODE_SGE: *cc = 0x6; *ty = 0x80; break; -	case TGSI_OPCODE_SEQ: *cc = 0x2; *ty = 0x80; break; -	case TGSI_OPCODE_SGT: *cc = 0x4; *ty = 0x80; break; -	case TGSI_OPCODE_SLE: *cc = 0x3; *ty = 0x80; break; -	case TGSI_OPCODE_SNE: *cc = 0xd; *ty = 0x80; break; - -	case TGSI_OPCODE_ISLT: *cc = 0x1; *ty = 0x0c; break; -	case TGSI_OPCODE_ISGE: *cc = 0x6; *ty = 0x0c; break; -	case TGSI_OPCODE_USEQ: *cc = 0x2; *ty = 0x04; break; -	case TGSI_OPCODE_USGE: *cc = 0x6; *ty = 0x04; break; -	case TGSI_OPCODE_USLT: *cc = 0x1; *ty = 0x04; break; -	case TGSI_OPCODE_USNE: *cc = 0x5; *ty = 0x04; break; -	default: -		assert(0); -		return; -	} -} - -static void -emit_add_b32(struct nv50_pc *pc, struct nv50_reg *dst, -	     struct nv50_reg 
*src0, struct nv50_reg *rsrc1) -{ -	struct nv50_program_exec *e = exec(pc); -	struct nv50_reg *src1; - -	e->inst[0] = 0x20000000; - -	alloc_reg(pc, rsrc1); -	check_swap_src_0_1(pc, &src0, &rsrc1); - -	src1 = rsrc1; -	if (src0->mod & rsrc1->mod & NV50_MOD_NEG) { -		src1 = temp_temp(pc, e); -		emit_cvt(pc, src1, rsrc1, -1, CVT_S32_S32); -	} - -	if (!pc->allow32 || src1->hw > 63 || -	    (src1->type != P_TEMP && src1->type != P_IMMD)) -		set_long(pc, e); - -	set_dst(pc, dst, e); -	set_src_0(pc, src0, e); - -	if (is_long(e)) { -		e->inst[1] |= 1 << 26; -		set_src_2(pc, src1, e); -	} else { -		e->inst[0] |= 0x8000; -		if (src1->type == P_IMMD) -			set_immd(pc, src1, e); -		else -			set_src_1(pc, src1, e); -	} - -	if (src0->mod & NV50_MOD_NEG) -		e->inst[0] |= 1 << 28; -	else -	if (src1->mod & NV50_MOD_NEG) -		e->inst[0] |= 1 << 22; - -	emit(pc, e); -} - -static void -emit_mad_u16(struct nv50_pc *pc, struct nv50_reg *dst, -	     struct nv50_reg *src0, int lh_0, struct nv50_reg *src1, int lh_1, -	     struct nv50_reg *src2) -{ -	struct nv50_program_exec *e = exec(pc); - -	e->inst[0] = 0x60000000; -	if (!pc->allow32) -		set_long(pc, e); -	set_dst(pc, dst, e); - -	set_half_src(pc, src0, lh_0, e, 9); -	set_half_src(pc, src1, lh_1, e, 16); -	alloc_reg(pc, src2); -	if (is_long(e) || (src2->type != P_TEMP) || (src2->hw != dst->hw)) -		set_src_2(pc, src2, e); - -	emit(pc, e); -} - -static void -emit_mul_u16(struct nv50_pc *pc, struct nv50_reg *dst, -	     struct nv50_reg *src0, int lh_0, struct nv50_reg *src1, int lh_1) -{ -	struct nv50_program_exec *e = exec(pc); - -	e->inst[0] = 0x40000000; -	set_long(pc, e); -	set_dst(pc, dst, e); - -	set_half_src(pc, src0, lh_0, e, 9); -	set_half_src(pc, src1, lh_1, e, 16); - -	emit(pc, e); -} - -static void -emit_sad(struct nv50_pc *pc, struct nv50_reg *dst, -	 struct nv50_reg *src0, struct nv50_reg *src1, struct nv50_reg *src2) -{ -	struct nv50_program_exec *e = exec(pc); - -	e->inst[0] = 0x50000000; -	if (!pc->allow32) -		set_long(pc, e); 
-	check_swap_src_0_1(pc, &src0, &src1); -	set_dst(pc, dst, e); -	set_src_0(pc, src0, e); -	set_src_1(pc, src1, e); -	alloc_reg(pc, src2); -	if (is_long(e) || (src2->type != dst->type) || (src2->hw != dst->hw)) -		set_src_2(pc, src2, e); - -	if (is_long(e)) -		e->inst[1] |= 0x0c << 24; -	else -		e->inst[0] |= 0x81 << 8; - -	emit(pc, e); -} - -static INLINE void -emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) -{ -	emit_cvt(pc, dst, src, -1, CVT_FLOOR | CVT_F32_F32 | CVT_RI); -} - -static void -emit_pow(struct nv50_pc *pc, struct nv50_reg *dst, -	 struct nv50_reg *v, struct nv50_reg *e) -{ -	struct nv50_reg *temp = alloc_temp(pc, NULL); - -	emit_flop(pc, NV50_FLOP_LG2, temp, v); -	emit_mul(pc, temp, temp, e); -	emit_preex2(pc, temp, temp); -	emit_flop(pc, NV50_FLOP_EX2, dst, temp); - -	free_temp(pc, temp); -} - -static INLINE void -emit_sat(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) -{ -	emit_cvt(pc, dst, src, -1, CVT_SAT | CVT_F32_F32); -} - -static void -emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask, -	 struct nv50_reg **src) -{ -	struct nv50_reg *one = alloc_immd(pc, 1.0); -	struct nv50_reg *zero = alloc_immd(pc, 0.0); -	struct nv50_reg *neg128 = alloc_immd(pc, -127.999999); -	struct nv50_reg *pos128 = alloc_immd(pc,  127.999999); -	struct nv50_reg *tmp[4] = { 0 }; -	boolean allow32 = pc->allow32; - -	pc->allow32 = FALSE; - -	if (mask & (3 << 1)) { -		tmp[0] = alloc_temp(pc, NULL); -		emit_minmax(pc, NV50_MAX_F32, tmp[0], src[0], zero); -	} - -	if (mask & (1 << 2)) { -		set_pred_wr(pc, 1, 0, pc->p->exec_tail); - -		tmp[1] = temp_temp(pc, NULL); -		emit_minmax(pc, NV50_MAX_F32, tmp[1], src[1], zero); - -		tmp[3] = temp_temp(pc, NULL); -		emit_minmax(pc, NV50_MAX_F32, tmp[3], src[3], neg128); -		emit_minmax(pc, NV50_MIN_F32, tmp[3], tmp[3], pos128); - -		emit_pow(pc, dst[2], tmp[1], tmp[3]); -		emit_mov(pc, dst[2], zero); -		set_pred(pc, 3, 0, pc->p->exec_tail); -	} - -	if (mask & (1 << 1)) -		
assimilate_temp(pc, dst[1], tmp[0]); -	else -	if (mask & (1 << 2)) -		free_temp(pc, tmp[0]); - -	pc->allow32 = allow32; - -	/* do this last, in case src[i,j] == dst[0,3] */ -	if (mask & (1 << 0)) -		emit_mov(pc, dst[0], one); - -	if (mask & (1 << 3)) -		emit_mov(pc, dst[3], one); - -	FREE(pos128); -	FREE(neg128); -	FREE(zero); -	FREE(one); -} - -static void -emit_kil(struct nv50_pc *pc, struct nv50_reg *src) -{ -	struct nv50_program_exec *e; -	const int r_pred = 1; - -	e = exec(pc); -	e->inst[0] = 0x00000002; /* discard */ -	set_long(pc, e); /* sets cond code to ALWAYS */ - -	if (src) { -		set_pred(pc, 0x1 /* cc = LT */, r_pred, e); -		/* write to predicate reg */ -		emit_cvt(pc, NULL, src, r_pred, CVT_F32_F32); -	} - -	emit(pc, e); -} - -static struct nv50_program_exec * -emit_control_flow(struct nv50_pc *pc, unsigned op, int pred, unsigned cc) -{ -	struct nv50_program_exec *e = exec(pc); - -	e->inst[0] = (op << 28) | 2; -	set_long(pc, e); -	if (pred >= 0) -		set_pred(pc, cc, pred, e); - -	emit(pc, e); -	return e; -} - -static INLINE struct nv50_program_exec * -emit_breakaddr(struct nv50_pc *pc) -{ -	return emit_control_flow(pc, 0x4, -1, 0); -} - -static INLINE void -emit_break(struct nv50_pc *pc, int pred, unsigned cc) -{ -	emit_control_flow(pc, 0x5, pred, cc); -} - -static INLINE struct nv50_program_exec * -emit_joinat(struct nv50_pc *pc) -{ -	return emit_control_flow(pc, 0xa, -1, 0); -} - -static INLINE struct nv50_program_exec * -emit_branch(struct nv50_pc *pc, int pred, unsigned cc) -{ -	return emit_control_flow(pc, 0x1, pred, cc); -} - -static INLINE struct nv50_program_exec * -emit_call(struct nv50_pc *pc, int pred, unsigned cc) -{ -	return emit_control_flow(pc, 0x2, pred, cc); -} - -static INLINE void -emit_ret(struct nv50_pc *pc, int pred, unsigned cc) -{ -	emit_control_flow(pc, 0x3, pred, cc); -} - -static void -emit_prim_cmd(struct nv50_pc *pc, unsigned cmd) -{ -	struct nv50_program_exec *e = exec(pc); - -	e->inst[0] = 0xf0000000 | (cmd << 9); -	
e->inst[1] = 0xc0000000; -	set_long(pc, e); - -	emit(pc, e); -} - -#define QOP_ADD 0 -#define QOP_SUBR 1 -#define QOP_SUB 2 -#define QOP_MOV_SRC1 3 - -/* For a quad of threads / top left, top right, bottom left, bottom right - * pixels, do a different operation, and take src0 from a specific thread. - */ -static void -emit_quadop(struct nv50_pc *pc, struct nv50_reg *dst, int wp, int lane_src0, -	    struct nv50_reg *src0, struct nv50_reg *src1, ubyte qop) -{ -       struct nv50_program_exec *e = exec(pc); - -       e->inst[0] = 0xc0000000; -       e->inst[1] = 0x80000000; -       set_long(pc, e); -       e->inst[0] |= lane_src0 << 16; -       set_src_0(pc, src0, e); -       set_src_2(pc, src1, e); - -       if (wp >= 0) -	       set_pred_wr(pc, 1, wp, e); - -       if (dst) -	       set_dst(pc, dst, e); -       else { -	       e->inst[0] |= 0x000001fc; -	       e->inst[1] |= 0x00000008; -       } - -       e->inst[0] |= (qop & 3) << 20; -       e->inst[1] |= (qop >> 2) << 22; - -       emit(pc, e); -} - -static void -load_cube_tex_coords(struct nv50_pc *pc, struct nv50_reg *t[4], -		     struct nv50_reg **src, unsigned arg, boolean proj) +bitcount4(const uint32_t val)  { -	int mod[3] = { src[0]->mod, src[1]->mod, src[2]->mod }; - -	src[0]->mod |= NV50_MOD_ABS; -	src[1]->mod |= NV50_MOD_ABS; -	src[2]->mod |= NV50_MOD_ABS; - -	emit_minmax(pc, NV50_MAX_F32, t[2], src[0], src[1]); -	emit_minmax(pc, NV50_MAX_F32, t[2], src[2], t[2]); - -	src[0]->mod = mod[0]; -	src[1]->mod = mod[1]; -	src[2]->mod = mod[2]; - -	if (proj && 0 /* looks more correct without this */) -		emit_mul(pc, t[2], t[2], src[3]); -	else -	if (arg == 4) /* there is no textureProj(samplerCubeShadow) */ -		emit_mov(pc, t[3], src[3]); - -	emit_flop(pc, NV50_FLOP_RCP, t[2], t[2]); - -	emit_mul(pc, t[0], src[0], t[2]); -	emit_mul(pc, t[1], src[1], t[2]); -	emit_mul(pc, t[2], src[2], t[2]); +   static const unsigned cnt[16] +   = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 }; +   return cnt[val & 0xf];  
} -static void -load_proj_tex_coords(struct nv50_pc *pc, struct nv50_reg *t[4], -		     struct nv50_reg **src, unsigned dim, unsigned arg) -{ -	unsigned c, mode; - -	if (src[0]->type == P_TEMP && src[0]->rhw != -1) { -		mode = pc->interp_mode[src[0]->index] | INTERP_PERSPECTIVE; - -		t[3]->rhw = src[3]->rhw; -		emit_interp(pc, t[3], NULL, (mode & INTERP_CENTROID)); -		emit_flop(pc, NV50_FLOP_RCP, t[3], t[3]); - -		for (c = 0; c < dim; ++c) { -			t[c]->rhw = src[c]->rhw; -			emit_interp(pc, t[c], t[3], mode); -		} -		if (arg != dim) { /* depth reference value */ -			t[dim]->rhw = src[2]->rhw; -			emit_interp(pc, t[dim], t[3], mode); -		} -	} else { -		/* XXX: for some reason the blob sometimes uses MAD -		 * (mad f32 $rX $rY $rZ neg $r63) -		 */ -		emit_flop(pc, NV50_FLOP_RCP, t[3], src[3]); -		for (c = 0; c < dim; ++c) -			emit_mul(pc, t[c], src[c], t[3]); -		if (arg != dim) /* depth reference value */ -			emit_mul(pc, t[dim], src[2], t[3]); -	} -} - -static INLINE void -get_tex_dim(unsigned type, unsigned *dim, unsigned *arg) -{ -	switch (type) { -	case TGSI_TEXTURE_1D: -		*arg = *dim = 1; -		break; -	case TGSI_TEXTURE_SHADOW1D: -		*dim = 1; -		*arg = 2; -		break; -	case TGSI_TEXTURE_UNKNOWN: -	case TGSI_TEXTURE_2D: -	case TGSI_TEXTURE_RECT: -		*arg = *dim = 2; -		break; -	case TGSI_TEXTURE_SHADOW2D: -	case TGSI_TEXTURE_SHADOWRECT: -		*dim = 2; -		*arg = 3; -		break; -	case TGSI_TEXTURE_3D: -	case TGSI_TEXTURE_CUBE: -		*dim = *arg = 3; -		break; -	default: -		assert(0); -		break; -	} -} - -/* We shouldn't execute TEXLOD if any of the pixels in a quad have - * different LOD values, so branch off groups of equal LOD. 
- */ -static void -emit_texlod_sequence(struct nv50_pc *pc, struct nv50_reg *tlod, -		     struct nv50_reg *src, struct nv50_program_exec *tex) -{ -	struct nv50_program_exec *join_at; -	unsigned i, target = pc->p->exec_size + 9 * 2; - -	if (pc->p->type != PIPE_SHADER_FRAGMENT) { -		emit(pc, tex); -		return; -	} -	pc->allow32 = FALSE; - -	/* Subtract lod of each pixel from lod of top left pixel, jump -	 * texlod insn if result is 0, then repeat for 2 other pixels. -	 */ -	join_at = emit_joinat(pc); -	emit_quadop(pc, NULL, 0, 0, tlod, tlod, 0x55); -	emit_branch(pc, 0, 2)->param.index = target; - -	for (i = 1; i < 4; ++i) { -		emit_quadop(pc, NULL, 0, i, tlod, tlod, 0x55); -		emit_branch(pc, 0, 2)->param.index = target; -	} - -	emit_mov(pc, tlod, src); /* target */ -	emit(pc, tex); /* texlod */ - -	join_at->param.index = target + 2 * 2; -	JOIN_ON(emit_nop(pc)); /* join _after_ tex */ -} - -static void -emit_texbias_sequence(struct nv50_pc *pc, struct nv50_reg *t[4], unsigned arg, -		      struct nv50_program_exec *tex) -{ -	struct nv50_program_exec *e; -	struct nv50_reg imm_1248, *t123[4][4], *r_bits = alloc_temp(pc, NULL); -	int r_pred = 0; -	unsigned n, c, i, cc[4] = { 0x0a, 0x13, 0x11, 0x10 }; - -	pc->allow32 = FALSE; -	ctor_reg(&imm_1248, P_IMMD, -1, ctor_immd_4u32(pc, 1, 2, 4, 8) * 4); - -	/* Subtract bias value of thread i from bias values of each thread, -	 * store result in r_pred, and set bit i in r_bits if result was 0. -	 */ -	assert(arg < 4); -	for (i = 0; i < 4; ++i, ++imm_1248.hw) { -		emit_quadop(pc, NULL, r_pred, i, t[arg], t[arg], 0x55); -		emit_mov(pc, r_bits, &imm_1248); -		set_pred(pc, 2, r_pred, pc->p->exec_tail); -	} -	emit_mov_to_pred(pc, r_pred, r_bits); - -	/* The lanes of a quad are now grouped by the bit in r_pred they have -	 * set. Put the input values for TEX into a new register set for each -	 * group and execute TEX only for a specific group. 
-	 * We cannot use the same register set for each group because we need -	 * the derivatives, which are implicitly calculated, to be correct. -	 */ -	for (i = 1; i < 4; ++i) { -		alloc_temp4(pc, t123[i], 0); - -		for (c = 0; c <= arg; ++c) -			emit_mov(pc, t123[i][c], t[c]); - -		*(e = exec(pc)) = *(tex); -		e->inst[0] &= ~0x01fc; -		set_dst(pc, t123[i][0], e); -		set_pred(pc, cc[i], r_pred, e); -		emit(pc, e); -	} -	/* finally TEX on the original regs (where we kept the input) */ -	set_pred(pc, cc[0], r_pred, tex); -	emit(pc, tex); - -	/* put the 3 * n other results into regs for lane 0 */ -	n = popcnt4(((e->inst[0] >> 25) & 0x3) | ((e->inst[1] >> 12) & 0xc)); -	for (i = 1; i < 4; ++i) { -		for (c = 0; c < n; ++c) { -			emit_mov(pc, t[c], t123[i][c]); -			set_pred(pc, cc[i], r_pred, pc->p->exec_tail); -		} -		free_temp4(pc, t123[i]); -	} - -	emit_nop(pc); -	free_temp(pc, r_bits); -} - -static void -emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask, -	 struct nv50_reg **src, unsigned unit, unsigned type, -	 boolean proj, int bias_lod) -{ -	struct nv50_reg *t[4]; -	struct nv50_program_exec *e; -	unsigned c, dim, arg; - -	/* t[i] must be within a single 128 bit super-reg */ -	alloc_temp4(pc, t, 0); - -	e = exec(pc); -	e->inst[0] = 0xf0000000; -	set_long(pc, e); -	set_dst(pc, t[0], e); - -	/* TIC and TSC binding indices (TSC is ignored as TSC_LINKED = TRUE): */ -	e->inst[0] |= (unit << 9) /* | (unit << 17) */; - -	/* live flag (don't set if TEX results affect input to another TEX): */ -	/* e->inst[0] |= 0x00000004; */ - -	get_tex_dim(type, &dim, &arg); - -	if (type == TGSI_TEXTURE_CUBE) { -		e->inst[0] |= 0x08000000; -		load_cube_tex_coords(pc, t, src, arg, proj); -	} else -	if (proj) -		load_proj_tex_coords(pc, t, src, dim, arg); -	else { -		for (c = 0; c < dim; c++) -			emit_mov(pc, t[c], src[c]); -		if (arg != dim) /* depth reference value (always src.z here) */ -			emit_mov(pc, t[dim], src[2]); -	} - -	e->inst[0] |= (mask & 0x3) << 25; -	e->inst[1] 
|= (mask & 0xc) << 12; - -	if (!bias_lod) { -		e->inst[0] |= (arg - 1) << 22; -		emit(pc, e); -	} else -	if (bias_lod < 0) { -		assert(pc->p->type == PIPE_SHADER_FRAGMENT); -		e->inst[0] |= arg << 22; -		e->inst[1] |= 0x20000000; /* texbias */ -		emit_mov(pc, t[arg], src[3]); -		emit_texbias_sequence(pc, t, arg, e); -	} else { -		e->inst[0] |= arg << 22; -		e->inst[1] |= 0x40000000; /* texlod */ -		emit_mov(pc, t[arg], src[3]); -		emit_texlod_sequence(pc, t[arg], src[3], e); -	} - -#if 1 -	c = 0; -	if (mask & 1) emit_mov(pc, dst[0], t[c++]); -	if (mask & 2) emit_mov(pc, dst[1], t[c++]); -	if (mask & 4) emit_mov(pc, dst[2], t[c++]); -	if (mask & 8) emit_mov(pc, dst[3], t[c]); - -	free_temp4(pc, t); -#else -	/* XXX: if p.e. MUL is used directly after TEX, it would still use -	 * the texture coordinates, not the fetched values: latency ? */ - -	for (c = 0; c < 4; c++) { -		if (mask & (1 << c)) -			assimilate_temp(pc, dst[c], t[c]); -		else -			free_temp(pc, t[c]); -	} -#endif -} - -static void -emit_ddx(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) -{ -	struct nv50_program_exec *e = exec(pc); - -	assert(src->type == P_TEMP); - -	e->inst[0] = (src->mod & NV50_MOD_NEG) ? 0xc0240000 : 0xc0140000; -	e->inst[1] = (src->mod & NV50_MOD_NEG) ? 0x86400000 : 0x89800000; -	set_long(pc, e); -	set_dst(pc, dst, e); -	set_src_0(pc, src, e); -	set_src_2(pc, src, e); - -	emit(pc, e); -} - -static void -emit_ddy(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) -{ -	struct nv50_program_exec *e = exec(pc); - -	assert(src->type == P_TEMP); - -	e->inst[0] = (src->mod & NV50_MOD_NEG) ? 0xc0250000 : 0xc0150000; -	e->inst[1] = (src->mod & NV50_MOD_NEG) ? 
0x85800000 : 0x8a400000; -	set_long(pc, e); -	set_dst(pc, dst, e); -	set_src_0(pc, src, e); -	set_src_2(pc, src, e); - -	emit(pc, e); -} - -static void -convert_to_long(struct nv50_pc *pc, struct nv50_program_exec *e) -{ -	unsigned q = 0, m = ~0; - -	assert(!is_long(e)); - -	switch (e->inst[0] >> 28) { -	case 0x1: -		/* MOV */ -		q = 0x0403c000; -		m = 0xffff7fff; -		break; -	case 0x2: -	case 0x3: -		/* ADD, SUB, SUBR b32 */ -		m = ~(0x8000 | (127 << 16)); -		q = ((e->inst[0] & (~m)) >> 2) | (1 << 26); -		break; -	case 0x5: -		/* SAD */ -		m = ~(0x81 << 8); -		q = (0x0c << 24) | ((e->inst[0] & (0x7f << 2)) << 12); -		break; -	case 0x6: -		/* MAD u16 */ -		q = (e->inst[0] & (0x7f << 2)) << 12; -		break; -	case 0x8: -		/* INTERP (move centroid, perspective and flat bits) */ -		m = ~0x03000100; -		q = (e->inst[0] & (3 << 24)) >> (24 - 16); -		q |= (e->inst[0] & (1 << 8)) << (18 - 8); -		break; -	case 0x9: -		/* RCP */ -		break; -	case 0xB: -		/* ADD */ -		m = ~(127 << 16); -		q = ((e->inst[0] & (~m)) >> 2); -		break; -	case 0xC: -		/* MUL */ -		m = ~0x00008000; -		q = ((e->inst[0] & (~m)) << 12); -		break; -	case 0xE: -		/* MAD (if src2 == dst) */ -		q = ((e->inst[0] & 0x1fc) << 12); -		break; -	default: -		assert(0); -		break; -	} - -	set_long(pc, e); -	pc->p->exec_size++; - -	e->inst[0] &= m; -	e->inst[1] |= q; -} - -/* Some operations support an optional negation flag. 
*/ -static int -get_supported_mods(const struct tgsi_full_instruction *insn, int i) -{ -	switch (insn->Instruction.Opcode) { -	case TGSI_OPCODE_ADD: -	case TGSI_OPCODE_COS: -	case TGSI_OPCODE_DDX: -	case TGSI_OPCODE_DDY: -	case TGSI_OPCODE_DP3: -	case TGSI_OPCODE_DP4: -	case TGSI_OPCODE_EX2: -	case TGSI_OPCODE_KIL: -	case TGSI_OPCODE_LG2: -	case TGSI_OPCODE_MAD: -	case TGSI_OPCODE_MUL: -	case TGSI_OPCODE_POW: -	case TGSI_OPCODE_RCP: -	case TGSI_OPCODE_RSQ: /* ignored, RSQ = rsqrt(abs(src.x)) */ -	case TGSI_OPCODE_SCS: -	case TGSI_OPCODE_SIN: -	case TGSI_OPCODE_SUB: -		return NV50_MOD_NEG; -	case TGSI_OPCODE_MAX: -	case TGSI_OPCODE_MIN: -	case TGSI_OPCODE_INEG: /* tgsi src sign toggle/set would be stupid */ -		return NV50_MOD_ABS; -	case TGSI_OPCODE_CEIL: -	case TGSI_OPCODE_FLR: -	case TGSI_OPCODE_TRUNC: -		return NV50_MOD_NEG | NV50_MOD_ABS; -	case TGSI_OPCODE_F2I: -	case TGSI_OPCODE_F2U: -	case TGSI_OPCODE_I2F: -	case TGSI_OPCODE_U2F: -		return NV50_MOD_NEG | NV50_MOD_ABS | NV50_MOD_I32; -	case TGSI_OPCODE_UADD: -		return NV50_MOD_NEG | NV50_MOD_I32; -	case TGSI_OPCODE_SAD: -	case TGSI_OPCODE_SHL: -	case TGSI_OPCODE_IMAX: -	case TGSI_OPCODE_IMIN: -	case TGSI_OPCODE_ISHR: -	case TGSI_OPCODE_NOT: -	case TGSI_OPCODE_UMAD: -	case TGSI_OPCODE_UMAX: -	case TGSI_OPCODE_UMIN: -	case TGSI_OPCODE_UMUL: -	case TGSI_OPCODE_USHR: -		return NV50_MOD_I32; -	default: -		return 0; -	} -} - -/* Return a read mask for source registers deduced from opcode & write mask. */  static unsigned -nv50_tgsi_src_mask(const struct tgsi_full_instruction *insn, int c) +nv50_tgsi_src_mask(const struct tgsi_full_instruction *inst, int c)  { -	unsigned x, mask = insn->Dst[0].Register.WriteMask; - -	switch (insn->Instruction.Opcode) { -	case TGSI_OPCODE_COS: -	case TGSI_OPCODE_SIN: -		return (mask & 0x8) | ((mask & 0x7) ? 
0x1 : 0x0); -	case TGSI_OPCODE_DP3: -		return 0x7; -	case TGSI_OPCODE_DP4: -	case TGSI_OPCODE_DPH: -	case TGSI_OPCODE_KIL: /* WriteMask ignored */ -		return 0xf; -	case TGSI_OPCODE_DST: -		return mask & (c ? 0xa : 0x6); -	case TGSI_OPCODE_EX2: -	case TGSI_OPCODE_EXP: -	case TGSI_OPCODE_LG2: -	case TGSI_OPCODE_LOG: -	case TGSI_OPCODE_POW: -	case TGSI_OPCODE_RCP: -	case TGSI_OPCODE_RSQ: -	case TGSI_OPCODE_SCS: -		return 0x1; -	case TGSI_OPCODE_IF: -		return 0x1; -	case TGSI_OPCODE_LIT: -		return 0xb; -	case TGSI_OPCODE_TEX: -	case TGSI_OPCODE_TXB: -	case TGSI_OPCODE_TXL: -	case TGSI_OPCODE_TXP: -	{ -		const struct tgsi_instruction_texture *tex; - -		assert(insn->Instruction.Texture); -		tex = &insn->Texture; +   unsigned mask = inst->Dst[0].Register.WriteMask; -		mask = 0x7; -		if (insn->Instruction.Opcode != TGSI_OPCODE_TEX && -		    insn->Instruction.Opcode != TGSI_OPCODE_TXD) -			mask |= 0x8; /* bias, lod or proj */ +   switch (inst->Instruction.Opcode) { +   case TGSI_OPCODE_COS: +   case TGSI_OPCODE_SIN: +      return (mask & 0x8) | ((mask & 0x7) ? 0x1 : 0x0); +   case TGSI_OPCODE_DP3: +      return 0x7; +   case TGSI_OPCODE_DP4: +   case TGSI_OPCODE_DPH: +   case TGSI_OPCODE_KIL: /* WriteMask ignored */ +      return 0xf; +   case TGSI_OPCODE_DST: +      return mask & (c ? 
0xa : 0x6); +   case TGSI_OPCODE_EX2: +   case TGSI_OPCODE_EXP: +   case TGSI_OPCODE_LG2: +   case TGSI_OPCODE_LOG: +   case TGSI_OPCODE_POW: +   case TGSI_OPCODE_RCP: +   case TGSI_OPCODE_RSQ: +   case TGSI_OPCODE_SCS: +      return 0x1; +   case TGSI_OPCODE_IF: +      return 0x1; +   case TGSI_OPCODE_LIT: +      return 0xb; +   case TGSI_OPCODE_TEX: +   case TGSI_OPCODE_TXB: +   case TGSI_OPCODE_TXL: +   case TGSI_OPCODE_TXP: +   { +      const struct tgsi_instruction_texture *tex; -		switch (tex->Texture) { -		case TGSI_TEXTURE_1D: -			mask &= 0x9; -			break; -		case TGSI_TEXTURE_SHADOW1D: -			mask &= 0x5; -			break; -		case TGSI_TEXTURE_2D: -			mask &= 0xb; -			break; -		default: -			break; -		} -	} -		return mask; -	case TGSI_OPCODE_XPD: -		x = 0; -		if (mask & 1) x |= 0x6; -		if (mask & 2) x |= 0x5; -		if (mask & 4) x |= 0x3; -		return x; -	default: -		break; -	} +      assert(inst->Instruction.Texture); +      tex = &inst->Texture; -	return mask; -} +      mask = 0x7; +      if (inst->Instruction.Opcode != TGSI_OPCODE_TEX && +          inst->Instruction.Opcode != TGSI_OPCODE_TXD) +         mask |= 0x8; /* bias, lod or proj */ -static struct nv50_reg * -tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst) -{ -	switch (dst->Register.File) { -	case TGSI_FILE_TEMPORARY: -		return &pc->temp[dst->Register.Index * 4 + c]; -	case TGSI_FILE_OUTPUT: -		return &pc->result[dst->Register.Index * 4 + c]; -	case TGSI_FILE_ADDRESS: -	{ -		struct nv50_reg *r = pc->addr[dst->Register.Index * 4 + c]; -		if (!r) { -			r = get_address_reg(pc, NULL); -			r->index = dst->Register.Index * 4 + c; -			pc->addr[r->index] = r; -		} -		assert(r); -		return r; -	} -	case TGSI_FILE_NULL: -		return NULL; -	case TGSI_FILE_SYSTEM_VALUE: -		assert(pc->sysval[dst->Register.Index].type == P_RESULT); -		assert(c == 0); -		return &pc->sysval[dst->Register.Index]; -	default: -		break; -	} +      switch (tex->Texture) { +      case TGSI_TEXTURE_1D: +         mask &= 0x9; +  
       break; +      case TGSI_TEXTURE_SHADOW1D: +         mask &= 0x5; +         break; +      case TGSI_TEXTURE_2D: +         mask &= 0xb; +         break; +      default: +         break; +      } +   } +  	   return mask; +   case TGSI_OPCODE_XPD: +   { +      unsigned x = 0; +      if (mask & 1) x |= 0x6; +      if (mask & 2) x |= 0x5; +      if (mask & 4) x |= 0x3; +      return x; +   } +   default: +      break; +   } -	return NULL; -} - -static struct nv50_reg * -tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src, -	 int mod) -{ -	struct nv50_reg *r = NULL; -	struct nv50_reg *temp = NULL; -	unsigned sgn, c, swz, cvn; - -	if (src->Register.File != TGSI_FILE_CONSTANT) -		assert(!src->Register.Indirect); - -	sgn = tgsi_util_get_full_src_register_sign_mode(src, chan); - -	c = tgsi_util_get_full_src_register_swizzle(src, chan); -	switch (c) { -	case TGSI_SWIZZLE_X: -	case TGSI_SWIZZLE_Y: -	case TGSI_SWIZZLE_Z: -	case TGSI_SWIZZLE_W: -		switch (src->Register.File) { -		case TGSI_FILE_INPUT: -			r = &pc->attr[src->Register.Index * 4 + c]; - -			if (!src->Dimension.Dimension) -				break; -			r = reg_instance(pc, r); -			r->vtx = src->Dimension.Index; - -			if (!src->Dimension.Indirect) -				break; -			swz = tgsi_util_get_src_register_swizzle( -				&src->DimIndirect, 0); -			r->acc = -1; -			r->indirect[1] = src->DimIndirect.Index * 4 + swz; -			break; -		case TGSI_FILE_TEMPORARY: -			r = &pc->temp[src->Register.Index * 4 + c]; -			break; -		case TGSI_FILE_CONSTANT: -			if (!src->Register.Indirect) { -				r = &pc->param[src->Register.Index * 4 + c]; -				break; -			} -			/* Indicate indirection by setting r->acc < 0 and -			 * use the index field to select the address reg. 
-			 */ -			r = reg_instance(pc, NULL); -			ctor_reg(r, P_CONST, -1, src->Register.Index * 4 + c); - -			swz = tgsi_util_get_src_register_swizzle( -				&src->Indirect, 0); -			r->acc = -1; -			r->indirect[0] = src->Indirect.Index * 4 + swz; -			break; -		case TGSI_FILE_IMMEDIATE: -			r = &pc->immd[src->Register.Index * 4 + c]; -			break; -		case TGSI_FILE_SAMPLER: -			return NULL; -		case TGSI_FILE_ADDRESS: -			r = pc->addr[src->Register.Index * 4 + c]; -			assert(r); -			break; -		case TGSI_FILE_SYSTEM_VALUE: -			assert(c == 0); -			r = &pc->sysval[src->Register.Index]; -			break; -		default: -			assert(0); -			break; -		} -		break; -	default: -		assert(0); -		break; -	} - -	cvn = (mod & NV50_MOD_I32) ? CVT_S32_S32 : CVT_F32_F32; - -	switch (sgn) { -	case TGSI_UTIL_SIGN_CLEAR: -		r->mod = NV50_MOD_ABS; -		break; -	case TGSI_UTIL_SIGN_SET: -		r->mod = NV50_MOD_NEG_ABS; -		break; -	case TGSI_UTIL_SIGN_TOGGLE: -		r->mod = NV50_MOD_NEG; -		break; -	default: -		assert(!r->mod && sgn == TGSI_UTIL_SIGN_KEEP); -		break; -	} - -	if ((r->mod & mod) != r->mod) { -		temp = temp_temp(pc, NULL); -		emit_cvt(pc, temp, r, -1, cvn); -		r->mod = 0; -		r = temp; -	} else -		r->mod |= mod & NV50_MOD_I32; - -	assert(r); -	if (r->acc >= 0 && r->vtx < 0 && r != temp) -		return reg_instance(pc, r); /* will clear r->mod */ -	return r; -} - -/* return TRUE for ops that produce only a single result */ -static boolean -is_scalar_op(unsigned op) -{ -	switch (op) { -	case TGSI_OPCODE_COS: -	case TGSI_OPCODE_DP2: -	case TGSI_OPCODE_DP3: -	case TGSI_OPCODE_DP4: -	case TGSI_OPCODE_DPH: -	case TGSI_OPCODE_EX2: -	case TGSI_OPCODE_LG2: -	case TGSI_OPCODE_POW: -	case TGSI_OPCODE_RCP: -	case TGSI_OPCODE_RSQ: -	case TGSI_OPCODE_SIN: -		/* -	case TGSI_OPCODE_KIL: -	case TGSI_OPCODE_LIT: -	case TGSI_OPCODE_SCS: -		*/ -		return TRUE; -	default: -		return FALSE; -	} -} - -/* Returns a bitmask indicating which dst components depend - * on source s, component c (reverse of nv50_tgsi_src_mask). 
- */ -static unsigned -nv50_tgsi_dst_revdep(unsigned op, int s, int c) -{ -	if (is_scalar_op(op)) -		return 0x1; - -	switch (op) { -	case TGSI_OPCODE_DST: -		return (1 << c) & (s ? 0xa : 0x6); -	case TGSI_OPCODE_XPD: -		switch (c) { -		case 0: return 0x6; -		case 1: return 0x5; -		case 2: return 0x3; -		case 3: return 0x0; -		default: -			assert(0); -			return 0x0; -		} -	case TGSI_OPCODE_EXP: -	case TGSI_OPCODE_LOG: -	case TGSI_OPCODE_LIT: -	case TGSI_OPCODE_SCS: -	case TGSI_OPCODE_TEX: -	case TGSI_OPCODE_TXB: -	case TGSI_OPCODE_TXL: -	case TGSI_OPCODE_TXP: -		/* these take care of dangerous swizzles themselves */ -		return 0x0; -	case TGSI_OPCODE_IF: -	case TGSI_OPCODE_KIL: -		/* don't call this function for these ops */ -		assert(0); -		return 0; -	default: -		/* linear vector instruction */ -		return (1 << c); -	} -} - -static INLINE boolean -has_pred(struct nv50_program_exec *e, unsigned cc) -{ -	if (!is_long(e) || is_immd(e)) -		return FALSE; -	return ((e->inst[1] & 0x780) == (cc << 7)); -} - -/* on ENDIF see if we can do "@p0.neu single_op" instead of: - *        join_at ENDIF - *        @p0.eq bra ENDIF - *        single_op - * ENDIF: nop.join - */ -static boolean -nv50_kill_branch(struct nv50_pc *pc) -{ -	int lvl = pc->if_lvl; - -	if (pc->if_insn[lvl]->next != pc->p->exec_tail) -		return FALSE; -	if (is_immd(pc->p->exec_tail)) -		return FALSE; - -	/* if ccode == 'true', the BRA is from an ELSE and the predicate -	 * reg may no longer be valid, since we currently always use $p0 -	 */ -	if (has_pred(pc->if_insn[lvl], 0xf)) -		return FALSE; -	assert(pc->if_insn[lvl] && pc->if_join[lvl]); - -	/* We'll use the exec allocated for JOIN_AT (we can't easily -	 * access nv50_program_exec's prev). 
-	 */ -	pc->p->exec_size -= 4; /* remove JOIN_AT and BRA */ - -	*pc->if_join[lvl] = *pc->p->exec_tail; - -	FREE(pc->if_insn[lvl]); -	FREE(pc->p->exec_tail); - -	pc->p->exec_tail = pc->if_join[lvl]; -	pc->p->exec_tail->next = NULL; -	set_pred(pc, 0xd, 0, pc->p->exec_tail); - -	return TRUE; +   return mask;  }  static void -nv50_fp_move_results(struct nv50_pc *pc) +nv50_indirect_inputs(struct nv50_translation_info *ti, int id)  { -	struct nv50_reg reg; -	unsigned i; +   int i, c; -	ctor_reg(®, P_TEMP, -1, -1); +   for (i = 0; i < PIPE_MAX_SHADER_INPUTS; ++i) +      for (c = 0; c < 4; ++c) +         ti->input_access[i][c] = id; -	for (i = 0; i < pc->result_nr * 4; ++i) { -		if (pc->result[i].rhw < 0 || pc->result[i].hw < 0) -			continue; -		if (pc->result[i].rhw != pc->result[i].hw) { -			reg.hw = pc->result[i].rhw; -			emit_mov(pc, ®, &pc->result[i]); -		} -	} -} - -static boolean -nv50_program_tx_insn(struct nv50_pc *pc, -		     const struct tgsi_full_instruction *inst) -{ -	struct nv50_reg *rdst[4], *dst[4], *brdc, *src[3][4], *temp; -	unsigned mask, sat, unit = 0; -	int i, c; - -	mask = inst->Dst[0].Register.WriteMask; -	sat = inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE; - -	memset(src, 0, sizeof(src)); - -	for (c = 0; c < 4; c++) { -		if ((mask & (1 << c)) && !pc->r_dst[c]) -			dst[c] = tgsi_dst(pc, c, &inst->Dst[0]); -		else -			dst[c] = pc->r_dst[c]; -		rdst[c] = dst[c]; -	} - -	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) { -		const struct tgsi_full_src_register *fs = &inst->Src[i]; -		unsigned src_mask; -		int mod_supp; - -		src_mask = nv50_tgsi_src_mask(inst, i); -		mod_supp = get_supported_mods(inst, i); - -		if (fs->Register.File == TGSI_FILE_SAMPLER) -			unit = fs->Register.Index; - -		for (c = 0; c < 4; c++) -			if (src_mask & (1 << c)) -				src[i][c] = tgsi_src(pc, c, fs, mod_supp); -	} - -	brdc = temp = pc->r_brdc; -	if (brdc && brdc->type != P_TEMP) { -		temp = temp_temp(pc, NULL); -		if (sat) -			brdc = temp; -	} else -	if (sat) { -		for (c 
= 0; c < 4; c++) { -			if (!(mask & (1 << c)) || dst[c]->type == P_TEMP) -				continue; -			/* rdst[c] = dst[c]; */ /* done above */ -			dst[c] = temp_temp(pc, NULL); -		} -	} - -	assert(brdc || !is_scalar_op(inst->Instruction.Opcode)); - -	switch (inst->Instruction.Opcode) { -	case TGSI_OPCODE_ABS: -		for (c = 0; c < 4; c++) { -			if (!(mask & (1 << c))) -				continue; -			emit_cvt(pc, dst[c], src[0][c], -1, -				 CVT_ABS | CVT_F32_F32); -		} -		break; -	case TGSI_OPCODE_ADD: -		for (c = 0; c < 4; c++) { -			if (!(mask & (1 << c))) -				continue; -			emit_add(pc, dst[c], src[0][c], src[1][c]); -		} -		break; -	case TGSI_OPCODE_AND: -	case TGSI_OPCODE_XOR: -	case TGSI_OPCODE_OR: -		for (c = 0; c < 4; c++) { -			if (!(mask & (1 << c))) -				continue; -			emit_bitop2(pc, dst[c], src[0][c], src[1][c], -				    inst->Instruction.Opcode); -		} -		break; -	case TGSI_OPCODE_ARL: -		temp = temp_temp(pc, NULL); -		for (c = 0; c < 4; c++) { -			if (!(mask & (1 << c))) -				continue; -			emit_cvt(pc, temp, src[0][c], -1, -				 CVT_FLOOR | CVT_S32_F32); -			emit_arl(pc, dst[c], temp, 4); -		} -		break; -	case TGSI_OPCODE_BGNLOOP: -		pc->loop_brka[pc->loop_lvl] = emit_breakaddr(pc); -		pc->loop_pos[pc->loop_lvl++] = pc->p->exec_size; -		terminate_mbb(pc); -		break; -	case TGSI_OPCODE_BGNSUB: -		assert(!pc->in_subroutine); -		pc->in_subroutine = TRUE; -		/* probably not necessary, but align to 8 byte boundary */ -		if (!is_long(pc->p->exec_tail)) -			convert_to_long(pc, pc->p->exec_tail); -		break; -	case TGSI_OPCODE_BRK: -		assert(pc->loop_lvl > 0); -		emit_break(pc, -1, 0); -		break; -	case TGSI_OPCODE_CAL: -		assert(inst->Label.Label < pc->insn_nr); -		emit_call(pc, -1, 0)->param.index = inst->Label.Label; -		/* replaced by actual offset in nv50_program_fixup_insns */ -		break; -	case TGSI_OPCODE_CEIL: -		for (c = 0; c < 4; c++) { -			if (!(mask & (1 << c))) -				continue; -			emit_cvt(pc, dst[c], src[0][c], -1, -				 CVT_CEIL | CVT_F32_F32 | CVT_RI); -		} -		break; -	case 
TGSI_OPCODE_CMP: -		pc->allow32 = FALSE; -		for (c = 0; c < 4; c++) { -			if (!(mask & (1 << c))) -				continue; -			emit_cvt(pc, NULL, src[0][c], 1, CVT_F32_F32); -			emit_mov(pc, dst[c], src[1][c]); -			set_pred(pc, 0x1, 1, pc->p->exec_tail); /* @SF */ -			emit_mov(pc, dst[c], src[2][c]); -			set_pred(pc, 0x6, 1, pc->p->exec_tail); /* @NSF */ -		} -		break; -	case TGSI_OPCODE_CONT: -		assert(pc->loop_lvl > 0); -		emit_branch(pc, -1, 0)->param.index = -			pc->loop_pos[pc->loop_lvl - 1]; -		break; -	case TGSI_OPCODE_COS: -		if (mask & 8) { -			emit_precossin(pc, temp, src[0][3]); -			emit_flop(pc, NV50_FLOP_COS, dst[3], temp); -			if (!(mask &= 7)) -				break; -			if (temp == dst[3]) -				temp = brdc = temp_temp(pc, NULL); -		} -		emit_precossin(pc, temp, src[0][0]); -		emit_flop(pc, NV50_FLOP_COS, brdc, temp); -		break; -	case TGSI_OPCODE_DDX: -		for (c = 0; c < 4; c++) { -			if (!(mask & (1 << c))) -				continue; -			emit_ddx(pc, dst[c], src[0][c]); -		} -		break; -	case TGSI_OPCODE_DDY: -		for (c = 0; c < 4; c++) { -			if (!(mask & (1 << c))) -				continue; -			emit_ddy(pc, dst[c], src[0][c]); -		} -		break; -	case TGSI_OPCODE_DP3: -		emit_mul(pc, temp, src[0][0], src[1][0]); -		emit_mad(pc, temp, src[0][1], src[1][1], temp); -		emit_mad(pc, brdc, src[0][2], src[1][2], temp); -		break; -	case TGSI_OPCODE_DP4: -		emit_mul(pc, temp, src[0][0], src[1][0]); -		emit_mad(pc, temp, src[0][1], src[1][1], temp); -		emit_mad(pc, temp, src[0][2], src[1][2], temp); -		emit_mad(pc, brdc, src[0][3], src[1][3], temp); -		break; -	case TGSI_OPCODE_DPH: -		emit_mul(pc, temp, src[0][0], src[1][0]); -		emit_mad(pc, temp, src[0][1], src[1][1], temp); -		emit_mad(pc, temp, src[0][2], src[1][2], temp); -		emit_add(pc, brdc, src[1][3], temp); -		break; -	case TGSI_OPCODE_DST: -		if (mask & (1 << 1)) -			emit_mul(pc, dst[1], src[0][1], src[1][1]); -		if (mask & (1 << 2)) -			emit_mov(pc, dst[2], src[0][2]); -		if (mask & (1 << 3)) -			emit_mov(pc, dst[3], src[1][3]); -		if (mask & (1 
<< 0)) -			emit_mov_immdval(pc, dst[0], 1.0f); -		break; -	case TGSI_OPCODE_ELSE: -		emit_branch(pc, -1, 0); -		pc->if_insn[--pc->if_lvl]->param.index = pc->p->exec_size; -		pc->if_insn[pc->if_lvl++] = pc->p->exec_tail; -		terminate_mbb(pc); -		break; -	case TGSI_OPCODE_EMIT: -		emit_prim_cmd(pc, 1); -		break; -	case TGSI_OPCODE_ENDIF: -		pc->if_insn[--pc->if_lvl]->param.index = pc->p->exec_size; - -		/* try to replace branch over 1 insn with a predicated insn */ -		if (nv50_kill_branch(pc) == TRUE) -			break; - -		if (pc->if_join[pc->if_lvl]) { -			pc->if_join[pc->if_lvl]->param.index = pc->p->exec_size; -			pc->if_join[pc->if_lvl] = NULL; -		} -		terminate_mbb(pc); -		/* emit a NOP as join point, we could set it on the next -		 * one, but would have to make sure it is long and !immd -		 */ -		JOIN_ON(emit_nop(pc)); -		break; -	case TGSI_OPCODE_ENDLOOP: -		emit_branch(pc, -1, 0)->param.index = -			pc->loop_pos[--pc->loop_lvl]; -		pc->loop_brka[pc->loop_lvl]->param.index = pc->p->exec_size; -		terminate_mbb(pc); -		break; -	case TGSI_OPCODE_ENDPRIM: -		emit_prim_cmd(pc, 2); -		break; -	case TGSI_OPCODE_ENDSUB: -		assert(pc->in_subroutine); -		terminate_mbb(pc); -		pc->in_subroutine = FALSE; -		break; -	case TGSI_OPCODE_EX2: -		emit_preex2(pc, temp, src[0][0]); -		emit_flop(pc, NV50_FLOP_EX2, brdc, temp); -		break; -	case TGSI_OPCODE_EXP: -	{ -		struct nv50_reg *t[2]; - -		assert(!temp); -		t[0] = temp_temp(pc, NULL); -		t[1] = temp_temp(pc, NULL); - -		if (mask & 0x6) -			emit_mov(pc, t[0], src[0][0]); -		if (mask & 0x3) -			emit_flr(pc, t[1], src[0][0]); - -		if (mask & (1 << 1)) -			emit_sub(pc, dst[1], t[0], t[1]); -		if (mask & (1 << 0)) { -			emit_preex2(pc, t[1], t[1]); -			emit_flop(pc, NV50_FLOP_EX2, dst[0], t[1]); -		} -		if (mask & (1 << 2)) { -			emit_preex2(pc, t[0], t[0]); -			emit_flop(pc, NV50_FLOP_EX2, dst[2], t[0]); -		} -		if (mask & (1 << 3)) -			emit_mov_immdval(pc, dst[3], 1.0f); -	} -		break; -	case TGSI_OPCODE_F2I: -		for (c = 0; c < 4; c++) 
{ -			if (!(mask & (1 << c))) -				continue; -			emit_cvt(pc, dst[c], src[0][c], -1, -				 CVT_TRUNC | CVT_S32_F32); -		} -		break; -	case TGSI_OPCODE_F2U: -		for (c = 0; c < 4; c++) { -			if (!(mask & (1 << c))) -				continue; -			emit_cvt(pc, dst[c], src[0][c], -1, -				 CVT_TRUNC | CVT_U32_F32); -		} -		break; -	case TGSI_OPCODE_FLR: -		for (c = 0; c < 4; c++) { -			if (!(mask & (1 << c))) -				continue; -			emit_flr(pc, dst[c], src[0][c]); -		} -		break; -	case TGSI_OPCODE_FRC: -		temp = temp_temp(pc, NULL); -		for (c = 0; c < 4; c++) { -			if (!(mask & (1 << c))) -				continue; -			emit_flr(pc, temp, src[0][c]); -			emit_sub(pc, dst[c], src[0][c], temp); -		} -		break; -	case TGSI_OPCODE_I2F: -		for (c = 0; c < 4; c++) { -			if (!(mask & (1 << c))) -				continue; -			emit_cvt(pc, dst[c], src[0][c], -1, CVT_F32_S32); -		} -		break; -	case TGSI_OPCODE_IF: -		assert(pc->if_lvl < NV50_MAX_COND_NESTING); -		emit_cvt(pc, NULL, src[0][0], 0, CVT_ABS | CVT_F32_F32); -		pc->if_join[pc->if_lvl] = emit_joinat(pc); -		pc->if_insn[pc->if_lvl++] = emit_branch(pc, 0, 2);; -		terminate_mbb(pc); -		break; -	case TGSI_OPCODE_IMAX: -		for (c = 0; c < 4; c++) { -			if (!(mask & (1 << c))) -				continue; -			emit_minmax(pc, 0x08c, dst[c], src[0][c], src[1][c]); -		} -		break; -	case TGSI_OPCODE_IMIN: -		for (c = 0; c < 4; c++) { -			if (!(mask & (1 << c))) -				continue; -			emit_minmax(pc, 0x0ac, dst[c], src[0][c], src[1][c]); -		} -		break; -	case TGSI_OPCODE_INEG: -		for (c = 0; c < 4; c++) { -			if (!(mask & (1 << c))) -				continue; -			emit_cvt(pc, dst[c], src[0][c], -1, -				 CVT_S32_S32 | CVT_NEG); -		} -		break; -	case TGSI_OPCODE_KIL: -		assert(src[0][0] && src[0][1] && src[0][2] && src[0][3]); -		emit_kil(pc, src[0][0]); -		emit_kil(pc, src[0][1]); -		emit_kil(pc, src[0][2]); -		emit_kil(pc, src[0][3]); -		break; -	case TGSI_OPCODE_KILP: -		emit_kil(pc, NULL); -		break; -	case TGSI_OPCODE_LIT: -		emit_lit(pc, &dst[0], mask, &src[0][0]); -		break; -	case TGSI_OPCODE_LG2: 
-		emit_flop(pc, NV50_FLOP_LG2, brdc, src[0][0]); -		break; -	case TGSI_OPCODE_LOG: -	{ -		struct nv50_reg *t[2]; - -		t[0] = temp_temp(pc, NULL); -		if (mask & (1 << 1)) -			t[1] = temp_temp(pc, NULL); -		else -			t[1] = t[0]; - -		emit_cvt(pc, t[0], src[0][0], -1, CVT_ABS | CVT_F32_F32); -		emit_flop(pc, NV50_FLOP_LG2, t[1], t[0]); -		if (mask & (1 << 2)) -			emit_mov(pc, dst[2], t[1]); -		emit_flr(pc, t[1], t[1]); -		if (mask & (1 << 0)) -			emit_mov(pc, dst[0], t[1]); -		if (mask & (1 << 1)) { -			t[1]->mod = NV50_MOD_NEG; -			emit_preex2(pc, t[1], t[1]); -			t[1]->mod = 0; -			emit_flop(pc, NV50_FLOP_EX2, t[1], t[1]); -			emit_mul(pc, dst[1], t[0], t[1]); -		} -		if (mask & (1 << 3)) -			emit_mov_immdval(pc, dst[3], 1.0f); -	} -		break; -	case TGSI_OPCODE_LRP: -		temp = temp_temp(pc, NULL); -		for (c = 0; c < 4; c++) { -			if (!(mask & (1 << c))) -				continue; -			emit_sub(pc, temp, src[1][c], src[2][c]); -			emit_mad(pc, dst[c], temp, src[0][c], src[2][c]); -		} -		break; -	case TGSI_OPCODE_MAD: -		for (c = 0; c < 4; c++) { -			if (!(mask & (1 << c))) -				continue; -			emit_mad(pc, dst[c], src[0][c], src[1][c], src[2][c]); -		} -		break; -	case TGSI_OPCODE_MAX: -		for (c = 0; c < 4; c++) { -			if (!(mask & (1 << c))) -				continue; -			emit_minmax(pc, 0x880, dst[c], src[0][c], src[1][c]); -		} -		break; -	case TGSI_OPCODE_MIN: -		for (c = 0; c < 4; c++) { -			if (!(mask & (1 << c))) -				continue; -			emit_minmax(pc, 0x8a0, dst[c], src[0][c], src[1][c]); -		} -		break; -	case TGSI_OPCODE_MOV: -		for (c = 0; c < 4; c++) { -			if (!(mask & (1 << c))) -				continue; -			emit_mov(pc, dst[c], src[0][c]); -		} -		break; -	case TGSI_OPCODE_MUL: -		for (c = 0; c < 4; c++) { -			if (!(mask & (1 << c))) -				continue; -			emit_mul(pc, dst[c], src[0][c], src[1][c]); -		} -		break; -	case TGSI_OPCODE_NOT: -		for (c = 0; c < 4; c++) { -			if (!(mask & (1 << c))) -				continue; -			emit_not(pc, dst[c], src[0][c]); -		} -		break; -	case TGSI_OPCODE_POW: -		emit_pow(pc, 
brdc, src[0][0], src[1][0]); -		break; -	case TGSI_OPCODE_RCP: -		if (!sat && popcnt4(mask) == 1) -			brdc = dst[ffs(mask) - 1]; -		emit_flop(pc, NV50_FLOP_RCP, brdc, src[0][0]); -		break; -	case TGSI_OPCODE_RET: -		if (pc->p->type == PIPE_SHADER_FRAGMENT && !pc->in_subroutine) -			nv50_fp_move_results(pc); -		emit_ret(pc, -1, 0); -		break; -	case TGSI_OPCODE_RSQ: -		if (!sat && popcnt4(mask) == 1) -			brdc = dst[ffs(mask) - 1]; -		src[0][0]->mod |= NV50_MOD_ABS; -		emit_flop(pc, NV50_FLOP_RSQ, brdc, src[0][0]); -		break; -	case TGSI_OPCODE_SAD: -		for (c = 0; c < 4; c++) { -			if (!(mask & (1 << c))) -				continue; -			emit_sad(pc, dst[c], src[0][c], src[1][c], src[2][c]); -		} -		break; -	case TGSI_OPCODE_SCS: -		temp = temp_temp(pc, NULL); -		if (mask & 3) -			emit_precossin(pc, temp, src[0][0]); -		if (mask & (1 << 0)) -			emit_flop(pc, NV50_FLOP_COS, dst[0], temp); -		if (mask & (1 << 1)) -			emit_flop(pc, NV50_FLOP_SIN, dst[1], temp); -		if (mask & (1 << 2)) -			emit_mov_immdval(pc, dst[2], 0.0); -		if (mask & (1 << 3)) -			emit_mov_immdval(pc, dst[3], 1.0); -		break; -	case TGSI_OPCODE_SHL: -	case TGSI_OPCODE_ISHR: -	case TGSI_OPCODE_USHR: -		for (c = 0; c < 4; c++) { -			if (!(mask & (1 << c))) -				continue; -			emit_shift(pc, dst[c], src[0][c], src[1][c], -				   inst->Instruction.Opcode); -		} -		break; -	case TGSI_OPCODE_SIN: -		if (mask & 8) { -			emit_precossin(pc, temp, src[0][3]); -			emit_flop(pc, NV50_FLOP_SIN, dst[3], temp); -			if (!(mask &= 7)) -				break; -			if (temp == dst[3]) -				temp = brdc = temp_temp(pc, NULL); -		} -		emit_precossin(pc, temp, src[0][0]); -		emit_flop(pc, NV50_FLOP_SIN, brdc, temp); -		break; -	case TGSI_OPCODE_SLT: -	case TGSI_OPCODE_SGE: -	case TGSI_OPCODE_SEQ: -	case TGSI_OPCODE_SGT: -	case TGSI_OPCODE_SLE: -	case TGSI_OPCODE_SNE: -	case TGSI_OPCODE_ISLT: -	case TGSI_OPCODE_ISGE: -	case TGSI_OPCODE_USEQ: -	case TGSI_OPCODE_USGE: -	case TGSI_OPCODE_USLT: -	case TGSI_OPCODE_USNE: -	{ -		uint8_t cc, ty; - -		
map_tgsi_setop_hw(inst->Instruction.Opcode, &cc, &ty); - -		for (c = 0; c < 4; c++) { -			if (!(mask & (1 << c))) -				continue; -			emit_set(pc, cc, dst[c], -1, src[0][c], src[1][c], ty); -		} -	} -		break; -	case TGSI_OPCODE_SUB: -		for (c = 0; c < 4; c++) { -			if (!(mask & (1 << c))) -				continue; -			emit_sub(pc, dst[c], src[0][c], src[1][c]); -		} -		break; -	case TGSI_OPCODE_TEX: -		emit_tex(pc, dst, mask, src[0], unit, -			 inst->Texture.Texture, FALSE, 0); -		break; -	case TGSI_OPCODE_TXB: -		emit_tex(pc, dst, mask, src[0], unit, -			 inst->Texture.Texture, FALSE, -1); -		break; -	case TGSI_OPCODE_TXL: -		emit_tex(pc, dst, mask, src[0], unit, -			 inst->Texture.Texture, FALSE, 1); -		break; -	case TGSI_OPCODE_TXP: -		emit_tex(pc, dst, mask, src[0], unit, -			 inst->Texture.Texture, TRUE, 0); -		break; -	case TGSI_OPCODE_TRUNC: -		for (c = 0; c < 4; c++) { -			if (!(mask & (1 << c))) -				continue; -			emit_cvt(pc, dst[c], src[0][c], -1, -				 CVT_TRUNC | CVT_F32_F32 | CVT_RI); -		} -		break; -	case TGSI_OPCODE_U2F: -		for (c = 0; c < 4; c++) { -			if (!(mask & (1 << c))) -				continue; -			emit_cvt(pc, dst[c], src[0][c], -1, CVT_F32_U32); -		} -		break; -	case TGSI_OPCODE_UADD: -		for (c = 0; c < 4; c++) { -			if (!(mask & (1 << c))) -				continue; -			emit_add_b32(pc, dst[c], src[0][c], src[1][c]); -		} -		break; -	case TGSI_OPCODE_UMAX: -		for (c = 0; c < 4; c++) { -			if (!(mask & (1 << c))) -				continue; -			emit_minmax(pc, 0x084, dst[c], src[0][c], src[1][c]); -		} -		break; -	case TGSI_OPCODE_UMIN: -		for (c = 0; c < 4; c++) { -			if (!(mask & (1 << c))) -				continue; -			emit_minmax(pc, 0x0a4, dst[c], src[0][c], src[1][c]); -		} -		break; -	case TGSI_OPCODE_UMAD: -	{ -		assert(!temp); -		temp = temp_temp(pc, NULL); -		for (c = 0; c < 4; c++) { -			if (!(mask & (1 << c))) -				continue; -			emit_mul_u16(pc, temp, src[0][c], 0, src[1][c], 1); -			emit_mad_u16(pc, temp, src[0][c], 1, src[1][c], 0, -				     temp); -			emit_shl_imm(pc, temp, temp, 
16); -			emit_mad_u16(pc, temp, src[0][c], 0, src[1][c], 0, -				     temp); -			emit_add_b32(pc, dst[c], temp, src[2][c]); -		} -	} -		break; -	case TGSI_OPCODE_UMUL: -	{ -		assert(!temp); -		temp = temp_temp(pc, NULL); -		for (c = 0; c < 4; c++) { -			if (!(mask & (1 << c))) -				continue; -			emit_mul_u16(pc, temp, src[0][c], 0, src[1][c], 1); -			emit_mad_u16(pc, temp, src[0][c], 1, src[1][c], 0, -				     temp); -			emit_shl_imm(pc, temp, temp, 16); -			emit_mad_u16(pc, dst[c], src[0][c], 0, src[1][c], 0, -				     temp); -		} -	} -		break; -	case TGSI_OPCODE_XPD: -		temp = temp_temp(pc, NULL); -		if (mask & (1 << 0)) { -			emit_mul(pc, temp, src[0][2], src[1][1]); -			emit_msb(pc, dst[0], src[0][1], src[1][2], temp); -		} -		if (mask & (1 << 1)) { -			emit_mul(pc, temp, src[0][0], src[1][2]); -			emit_msb(pc, dst[1], src[0][2], src[1][0], temp); -		} -		if (mask & (1 << 2)) { -			emit_mul(pc, temp, src[0][1], src[1][0]); -			emit_msb(pc, dst[2], src[0][0], src[1][1], temp); -		} -		if (mask & (1 << 3)) -			emit_mov_immdval(pc, dst[3], 1.0); -		break; -	case TGSI_OPCODE_END: -		if (pc->p->type == PIPE_SHADER_FRAGMENT) -			nv50_fp_move_results(pc); - -		if (!pc->p->exec_tail || -		    is_immd(pc->p->exec_tail) || -		    is_join(pc->p->exec_tail) || -		    is_control_flow(pc->p->exec_tail)) -			emit_nop(pc); - -		/* last insn must be long so it can have the exit bit set */ -		if (!is_long(pc->p->exec_tail)) -			convert_to_long(pc, pc->p->exec_tail); - -		pc->p->exec_tail->inst[1] |= 1; /* set exit bit */ - -		terminate_mbb(pc); -		break; -	default: -		NOUVEAU_ERR("invalid opcode %d\n", inst->Instruction.Opcode); -		return FALSE; -	} - -	if (brdc) { -		if (sat) -			emit_sat(pc, brdc, brdc); -		for (c = 0; c < 4; c++) -			if ((mask & (1 << c)) && dst[c] != brdc) -				emit_mov(pc, dst[c], brdc); -	} else -	if (sat) { -		for (c = 0; c < 4; c++) { -			if (!(mask & (1 << c))) -				continue; -			/* In this case we saturate later, and dst[c] won't -			 * be another 
temp_temp (and thus lost), since rdst -			 * already is TEMP (see above). */ -			if (rdst[c]->type == P_TEMP && rdst[c]->index < 0) -				continue; -			emit_sat(pc, rdst[c], dst[c]); -		} -	} - -	kill_temp_temp(pc, NULL); -	pc->reg_instance_nr = 0; - -	return TRUE; +   ti->indirect_inputs = TRUE;  }  static void -prep_inspect_insn(struct nv50_pc *pc, const struct tgsi_full_instruction *insn) -{ -	struct nv50_reg *r, *reg = NULL; -	const struct tgsi_full_src_register *src; -	const struct tgsi_dst_register *dst; -	unsigned i, c, k, mask; - -	dst = &insn->Dst[0].Register; -	mask = dst->WriteMask; - -        if (dst->File == TGSI_FILE_TEMPORARY) -		reg = pc->temp; -        else -	if (dst->File == TGSI_FILE_OUTPUT) { -		reg = pc->result; - -		if (insn->Instruction.Opcode == TGSI_OPCODE_MOV && -		    dst->Index == pc->edgeflag_out && -		    insn->Src[0].Register.File == TGSI_FILE_INPUT) -			pc->p->cfg.edgeflag_in = insn->Src[0].Register.Index; -	} - -	if (reg) { -		for (c = 0; c < 4; c++) { -			if (!(mask & (1 << c))) -				continue; -			reg[dst->Index * 4 + c].acc = pc->insn_nr; -		} -	} - -	for (i = 0; i < insn->Instruction.NumSrcRegs; i++) { -		src = &insn->Src[i]; - -		if (src->Register.File == TGSI_FILE_TEMPORARY) -			reg = pc->temp; -		else -		if (src->Register.File == TGSI_FILE_INPUT) -			reg = pc->attr; -		else -			continue; - -		mask = nv50_tgsi_src_mask(insn, i); - -		for (c = 0; c < 4; c++) { -			if (!(mask & (1 << c))) -				continue; -			k = tgsi_util_get_full_src_register_swizzle(src, c); - -			r = ®[src->Register.Index * 4 + k]; - -			/* If used before written, pre-allocate the reg, -			 * lest we overwrite results from a subroutine. -			 */ -			if (!r->acc && r->type == P_TEMP) -				alloc_reg(pc, r); - -			r->acc = pc->insn_nr; -		} -	} -} - -/* Returns a bitmask indicating which dst components need to be - * written to temporaries first to avoid 'corrupting' sources. 
- * - * m[i]   (out) indicate component to write in the i-th position - * rdep[c] (in) bitmasks of dst[i] that require dst[c] as source - */ -static unsigned -nv50_revdep_reorder(unsigned m[4], unsigned rdep[4]) -{ -	unsigned i, c, x, unsafe = 0; - -	for (c = 0; c < 4; c++) -		m[c] = c; - -	/* Swap as long as a dst component written earlier is depended on -	 * by one written later, but the next one isn't depended on by it. -	 */ -	for (c = 0; c < 3; c++) { -		if (rdep[m[c + 1]] & (1 << m[c])) -			continue; /* if next one is depended on by us */ -		for (i = c + 1; i < 4; i++) -			/* if we are depended on by a later one */ -			if (rdep[m[c]] & (1 << m[i])) -				break; -		if (i == 4) -			continue; -		/* now, swap */ -		x = m[c]; -		m[c] = m[c + 1]; -		m[c + 1] = x; - -		/* restart */ -		c = 0; -	} - -	/* mark dependencies that could not be resolved by reordering */ -	for (i = 0; i < 3; ++i) -		for (c = i + 1; c < 4; ++c) -			if (rdep[m[i]] & (1 << m[c])) -				unsafe |= (1 << i); - -	/* NOTE: $unsafe is with respect to order, not component */ -	return unsafe; -} - -/* Select a suitable dst register for broadcasting scalar results, - * or return NULL if we have to allocate an extra TEMP. - * - * If e.g. only 1 component is written, we may also emit the final - * result to a write-only register. - */ -static struct nv50_reg * -tgsi_broadcast_dst(struct nv50_pc *pc, -		   const struct tgsi_full_dst_register *fd, unsigned mask) -{ -	if (fd->Register.File == TGSI_FILE_TEMPORARY) { -		int c = ffs(~mask & fd->Register.WriteMask); -		if (c) -			return tgsi_dst(pc, c - 1, fd); -	} else { -		int c = ffs(fd->Register.WriteMask) - 1; -		if ((1 << c) == fd->Register.WriteMask) -			return tgsi_dst(pc, c, fd); -	} - -	return NULL; -} - -/* Scan source swizzles and return a bitmask indicating dst regs that - * also occur among the src regs, and fill rdep for nv50_revdep_reoder. 
- */ -static unsigned -nv50_tgsi_scan_swizzle(const struct tgsi_full_instruction *insn, -		       unsigned rdep[4]) +nv50_indirect_outputs(struct nv50_translation_info *ti, int id)  { -	const struct tgsi_full_dst_register *fd = &insn->Dst[0]; -	const struct tgsi_full_src_register *fs; -	unsigned i, deqs = 0; - -	for (i = 0; i < 4; ++i) -		rdep[i] = 0; - -	for (i = 0; i < insn->Instruction.NumSrcRegs; i++) { -		unsigned chn, mask = nv50_tgsi_src_mask(insn, i); -		int ms = get_supported_mods(insn, i); - -		fs = &insn->Src[i]; -		if (fs->Register.File != fd->Register.File || -		    fs->Register.Index != fd->Register.Index) -			continue; - -		for (chn = 0; chn < 4; ++chn) { -			unsigned s, c; - -			if (!(mask & (1 << chn))) /* src is not read */ -				continue; -			c = tgsi_util_get_full_src_register_swizzle(fs, chn); -			s = tgsi_util_get_full_src_register_sign_mode(fs, chn); - -			if (!(fd->Register.WriteMask & (1 << c))) -				continue; - -			if (s == TGSI_UTIL_SIGN_TOGGLE && !(ms & NV50_MOD_NEG)) -					continue; -			if (s == TGSI_UTIL_SIGN_CLEAR && !(ms & NV50_MOD_ABS)) -					continue; -			if ((s == TGSI_UTIL_SIGN_SET) && ((ms & 3) != 3)) -					continue; - -			rdep[c] |= nv50_tgsi_dst_revdep( -				insn->Instruction.Opcode, i, chn); -			deqs |= (1 << c); -		} -	} - -	return deqs; -} - -static boolean -nv50_tgsi_insn(struct nv50_pc *pc, const union tgsi_full_token *tok) -{ -	struct tgsi_full_instruction insn = tok->FullInstruction; -	const struct tgsi_full_dst_register *fd; -	unsigned i, deqs, rdep[4], m[4]; - -	fd = &tok->FullInstruction.Dst[0]; -	deqs = nv50_tgsi_scan_swizzle(&insn, rdep); - -	if (is_scalar_op(insn.Instruction.Opcode)) { -		pc->r_brdc = tgsi_broadcast_dst(pc, fd, deqs); -		if (!pc->r_brdc) -			pc->r_brdc = temp_temp(pc, NULL); -		return nv50_program_tx_insn(pc, &insn); -	} -	pc->r_brdc = NULL; - -	if (!deqs || (!rdep[0] && !rdep[1] && !rdep[2] && !rdep[3])) -		return nv50_program_tx_insn(pc, &insn); - -	deqs = nv50_revdep_reorder(m, rdep); - -	for (i 
= 0; i < 4; ++i) { -		assert(pc->r_dst[m[i]] == NULL); - -		insn.Dst[0].Register.WriteMask = -			fd->Register.WriteMask & (1 << m[i]); - -		if (!insn.Dst[0].Register.WriteMask) -			continue; - -		if (deqs & (1 << i)) -			pc->r_dst[m[i]] = alloc_temp(pc, NULL); - -		if (!nv50_program_tx_insn(pc, &insn)) -			return FALSE; -	} - -	for (i = 0; i < 4; i++) { -		struct nv50_reg *reg = pc->r_dst[i]; -		if (!reg) -			continue; -		pc->r_dst[i] = NULL; - -		if (insn.Instruction.Saturate == TGSI_SAT_ZERO_ONE) -			emit_sat(pc, tgsi_dst(pc, i, fd), reg); -		else -			emit_mov(pc, tgsi_dst(pc, i, fd), reg); -		free_temp(pc, reg); -	} - -	return TRUE; -} - -static void -load_interpolant(struct nv50_pc *pc, struct nv50_reg *reg) -{ -	struct nv50_reg *iv, **ppiv; -	unsigned mode = pc->interp_mode[reg->index]; - -	ppiv = (mode & INTERP_CENTROID) ? &pc->iv_c : &pc->iv_p; -	iv = *ppiv; - -	if ((mode & INTERP_PERSPECTIVE) && !iv) { -		iv = *ppiv = alloc_temp(pc, NULL); -		iv->rhw = popcnt4(pc->p->cfg.regs[1] >> 24) - 1; - -		emit_interp(pc, iv, NULL, mode & INTERP_CENTROID); -		emit_flop(pc, NV50_FLOP_RCP, iv, iv); - -		/* XXX: when loading interpolants dynamically, move these -		 * to the program head, or make sure it can't be skipped. -		 */ -	} - -	emit_interp(pc, reg, iv, mode); -} - -/* The face input is always at v[255] (varying space), with a - * value of 0 for back-facing, and 0xffffffff for front-facing. 
- */ -static void -load_frontfacing(struct nv50_pc *pc, struct nv50_reg *sv) -{ -	struct nv50_reg *temp = alloc_temp(pc, NULL); -	int r_pred = 0; - -	temp->rhw = 255; -	emit_interp(pc, temp, NULL, INTERP_FLAT); - -	emit_cvt(pc, sv, temp, r_pred, CVT_ABS | CVT_F32_S32); - -	emit_not(pc, temp, temp); -	set_pred(pc, 0x2, r_pred, pc->p->exec_tail); -	emit_cvt(pc, sv, temp, -1, CVT_F32_S32); -	set_pred(pc, 0x2, r_pred, pc->p->exec_tail); - -	free_temp(pc, temp); -} - -static void -load_instance_id(struct nv50_pc *pc, unsigned index) -{ -	struct nv50_reg reg, mem; - -	ctor_reg(®, P_TEMP, -1, -1); -	ctor_reg(&mem, P_CONST, -1, 24); /* startInstance */ -	mem.buf_index = 2; - -	emit_add_b32(pc, ®, &pc->sysval[index], &mem); -	pc->sysval[index] = reg; -} - -static void -copy_semantic_info(struct nv50_program *p) -{ -	unsigned i, id; - -	for (i = 0; i < p->cfg.in_nr; ++i) { -		id = p->cfg.in[i].id; -		p->cfg.in[i].sn = p->info.input_semantic_name[id]; -		p->cfg.in[i].si = p->info.input_semantic_index[id]; -	} - -	for (i = 0; i < p->cfg.out_nr; ++i) { -		id = p->cfg.out[i].id; -		p->cfg.out[i].sn = p->info.output_semantic_name[id]; -		p->cfg.out[i].si = p->info.output_semantic_index[id]; -	} -} - -static boolean -nv50_program_tx_prep(struct nv50_pc *pc) -{ -	struct tgsi_parse_context tp; -	struct nv50_program *p = pc->p; -	boolean ret = FALSE; -	unsigned i, c, instance_id = 0, vertex_id = 0, flat_nr = 0; - -	tgsi_parse_init(&tp, pc->p->pipe.tokens); -	while (!tgsi_parse_end_of_tokens(&tp)) { -		const union tgsi_full_token *tok = &tp.FullToken; - -		tgsi_parse_token(&tp); -		switch (tok->Token.Type) { -		case TGSI_TOKEN_TYPE_IMMEDIATE: -		{ -			const struct tgsi_full_immediate *imm = -				&tp.FullToken.FullImmediate; - -			ctor_immd_4f32(pc, imm->u[0].Float, -				       imm->u[1].Float, -				       imm->u[2].Float, -				       imm->u[3].Float); -		} -			break; -		case TGSI_TOKEN_TYPE_DECLARATION: -		{ -			const struct tgsi_full_declaration *d; -			unsigned si, last, first, 
mode; - -			d = &tp.FullToken.FullDeclaration; -			first = d->Range.First; -			last = d->Range.Last; - -			switch (d->Declaration.File) { -			case TGSI_FILE_TEMPORARY: -				break; -			case TGSI_FILE_OUTPUT: -				if (!d->Declaration.Semantic || -				    p->type == PIPE_SHADER_FRAGMENT) -					break; - -				si = d->Semantic.Index; -				switch (d->Semantic.Name) { -				case TGSI_SEMANTIC_BCOLOR: -					p->cfg.two_side[si].hw = first; -					if (p->cfg.out_nr > first) -						p->cfg.out_nr = first; -					break; -				case TGSI_SEMANTIC_PSIZE: -					p->cfg.psiz = first; -					if (p->cfg.out_nr > first) -						p->cfg.out_nr = first; -					break; -				case TGSI_SEMANTIC_EDGEFLAG: -					pc->edgeflag_out = first; -					break; -					/* -				case TGSI_SEMANTIC_CLIP_DISTANCE: -					p->cfg.clpd = MIN2(p->cfg.clpd, first); -					break; -					*/ -				default: -					break; -				} -				break; -			case TGSI_FILE_INPUT: -			{ -				if (p->type != PIPE_SHADER_FRAGMENT) -					break; - -				switch (d->Declaration.Interpolate) { -				case TGSI_INTERPOLATE_CONSTANT: -					mode = INTERP_FLAT; -					flat_nr++; -					break; -				case TGSI_INTERPOLATE_PERSPECTIVE: -					mode = INTERP_PERSPECTIVE; -					p->cfg.regs[1] |= 0x08 << 24; -					break; -				default: -					mode = INTERP_LINEAR; -					break; -				} -				if (d->Declaration.Centroid) -					mode |= INTERP_CENTROID; - -				assert(last < 32); -				for (i = first; i <= last; i++) -					pc->interp_mode[i] = mode; -			} -				break; -			case TGSI_FILE_SYSTEM_VALUE: -				assert(d->Declaration.Semantic); -				switch (d->Semantic.Name) { -				case TGSI_SEMANTIC_FACE: -					assert(p->type == PIPE_SHADER_FRAGMENT); -					load_frontfacing(pc, -							 &pc->sysval[first]); -					break; -				case TGSI_SEMANTIC_INSTANCEID: -					assert(p->type == PIPE_SHADER_VERTEX); -					instance_id = first; -					p->cfg.regs[0] |= (1 << 4); -					break; -				case TGSI_SEMANTIC_PRIMID: -					assert(p->type != PIPE_SHADER_VERTEX); -					p->cfg.prim_id = first; -					break; -	
				/* -				case TGSI_SEMANTIC_PRIMIDIN: -					assert(p->type == PIPE_SHADER_GEOMETRY); -					pc->sysval[first].hw = 6; -					p->cfg.regs[0] |= (1 << 8); -					break; -				case TGSI_SEMANTIC_VERTEXID: -					assert(p->type == PIPE_SHADER_VERTEX); -					vertex_id = first; -					p->cfg.regs[0] |= (1 << 12) | (1 << 0); -					break; -					*/ -				} -				break; -			case TGSI_FILE_ADDRESS: -			case TGSI_FILE_CONSTANT: -			case TGSI_FILE_SAMPLER: -				break; -			default: -				NOUVEAU_ERR("bad decl file %d\n", -					    d->Declaration.File); -				goto out_err; -			} -		} -			break; -		case TGSI_TOKEN_TYPE_INSTRUCTION: -			pc->insn_nr++; -			prep_inspect_insn(pc, &tok->FullInstruction); -			break; -		default: -			break; -		} -	} - -	if (p->type == PIPE_SHADER_VERTEX || p->type == PIPE_SHADER_GEOMETRY) { -		int rid = 0; - -		if (p->type == PIPE_SHADER_GEOMETRY) { -			for (i = 0; i < pc->attr_nr; ++i) { -				p->cfg.in[i].hw = rid; -				p->cfg.in[i].id = i; - -				for (c = 0; c < 4; ++c) { -					int n = i * 4 + c; -					if (!pc->attr[n].acc) -						continue; -					pc->attr[n].hw = rid++; -					p->cfg.in[i].mask |= 1 << c; -				} -			} -		} else { -			for (i = 0; i < pc->attr_nr * 4; ++i) { -				if (pc->attr[i].acc) { -					pc->attr[i].hw = rid++; -					p->cfg.attr[i / 32] |= 1 << (i % 32); -				} -			} -			if (p->cfg.regs[0] & (1 << 0)) -				pc->sysval[vertex_id].hw = rid++; -			if (p->cfg.regs[0] & (1 << 4)) { -				pc->sysval[instance_id].hw = rid++; -				load_instance_id(pc, instance_id); -			} -		} - -		for (i = 0, rid = 0; i < pc->result_nr; ++i) { -			p->cfg.out[i].hw = rid; -			p->cfg.out[i].id = i; - -			for (c = 0; c < 4; ++c) { -				int n = i * 4 + c; -				if (!pc->result[n].acc) -					continue; -				pc->result[n].hw = rid++; -				p->cfg.out[i].mask |= 1 << c; -			} -		} -		if (p->cfg.prim_id < 0x40) { -			/* GP has to write to PrimitiveID */ -			ctor_reg(&pc->sysval[p->cfg.prim_id], -				 P_RESULT, p->cfg.prim_id, rid); -			p->cfg.prim_id = rid++; -		} - -		for (c = 
0; c < 2; ++c) -			if (p->cfg.two_side[c].hw < 0x40) -				p->cfg.two_side[c] = p->cfg.out[ -					p->cfg.two_side[c].hw]; - -		if (p->cfg.psiz < 0x40) -			p->cfg.psiz = p->cfg.out[p->cfg.psiz].hw; - -		copy_semantic_info(p); -	} else -	if (p->type == PIPE_SHADER_FRAGMENT) { -		int rid = 0, aid; -		unsigned n = 0, m = pc->attr_nr - flat_nr; - -		pc->allow32 = TRUE; - -		/* do we read FragCoord ? */ -		if (pc->attr_nr && -		    p->info.input_semantic_name[0] == TGSI_SEMANTIC_POSITION) { -			/* select FCRD components we want accessible */ -			for (c = 0; c < 4; ++c) -				if (pc->attr[c].acc) -					p->cfg.regs[1] |= 1 << (24 + c); -			aid = 0; -		} else /* offset by 1 if FCRD.w is needed for pinterp */ -			aid = popcnt4(p->cfg.regs[1] >> 24); - -		/* non-flat interpolants have to be mapped to -		 * the lower hardware IDs, so sort them: -		 */ -		for (i = 0; i < pc->attr_nr; i++) { -			if (pc->interp_mode[i] == INTERP_FLAT) -				p->cfg.in[m++].id = i; -			else { -				if (!(pc->interp_mode[i] & INTERP_PERSPECTIVE)) -					p->cfg.in[n].linear = TRUE; -				p->cfg.in[n++].id = i; -			} -		} -		copy_semantic_info(p); - -		for (n = 0; n < pc->attr_nr; ++n) { -			p->cfg.in[n].hw = rid = aid; -			i = p->cfg.in[n].id; - -			if (p->info.input_semantic_name[i] == -			    TGSI_SEMANTIC_FACE) { -				load_frontfacing(pc, &pc->attr[i * 4]); -				continue; -			} - -			for (c = 0; c < 4; ++c) { -				if (!pc->attr[i * 4 + c].acc) -					continue; -				pc->attr[i * 4 + c].rhw = rid++; -				p->cfg.in[n].mask |= 1 << c; +   int i, c; -				load_interpolant(pc, &pc->attr[i * 4 + c]); -			} -			aid += popcnt4(p->cfg.in[n].mask); -		} +   for (i = 0; i < PIPE_MAX_SHADER_OUTPUTS; ++i) +      for (c = 0; c < 4; ++c) +         ti->output_access[i][c] = id; -		m = popcnt4(p->cfg.regs[1] >> 24); - -		/* set count of non-position inputs and of non-flat -		 * non-position inputs for FP_INTERPOLANT_CTRL -		 */ -		p->cfg.regs[1] |= aid - m; - -		if (flat_nr) { -			i = p->cfg.in[pc->attr_nr - flat_nr].hw; -	
		p->cfg.regs[1] |= (i - m) << 16; -		} else -			p->cfg.regs[1] |= p->cfg.regs[1] << 16; - -		/* mark color semantic for light-twoside */ -		n = 0x80; -		for (i = 0; i < p->cfg.in_nr; i++) { -			if (p->cfg.in[i].sn == TGSI_SEMANTIC_COLOR) { -				n = MIN2(n, p->cfg.in[i].hw - m); -				p->cfg.two_side[p->cfg.in[i].si] = p->cfg.in[i]; - -				p->cfg.regs[0] += /* increase colour count */ -					popcnt4(p->cfg.in[i].mask) << 16; -			} -		} -		if (n < 0x80) -			p->cfg.regs[0] += n; - -		if (p->cfg.prim_id < 0x40) { -			pc->sysval[p->cfg.prim_id].rhw = rid++; -			emit_interp(pc, &pc->sysval[p->cfg.prim_id], NULL, -				    INTERP_FLAT); -			/* increase FP_INTERPOLANT_CTRL_COUNT */ -			p->cfg.regs[1] += 1; -		} - -		/* Initialize FP results: -		 * FragDepth is always first TGSI and last hw output -		 */ -		i = p->info.writes_z ? 4 : 0; -		for (rid = 0; i < pc->result_nr * 4; i++) -			pc->result[i].rhw = rid++; -		if (p->info.writes_z) -			pc->result[2].rhw = rid++; - -		p->cfg.high_result = rid; - -		/* separate/different colour results for MRTs ? */ -		if (pc->result_nr - (p->info.writes_z ? 
1 : 0) > 1) -			p->cfg.regs[2] |= 1; -	} - -	if (pc->immd_nr) { -		int rid = 0; - -		pc->immd = MALLOC(pc->immd_nr * 4 * sizeof(struct nv50_reg)); -		if (!pc->immd) -			goto out_err; - -		for (i = 0; i < pc->immd_nr; i++) { -			for (c = 0; c < 4; c++, rid++) -				ctor_reg(&pc->immd[rid], P_IMMD, i, rid); -		} -	} - -	ret = TRUE; -out_err: -	if (pc->iv_p) -		free_temp(pc, pc->iv_p); -	if (pc->iv_c) -		free_temp(pc, pc->iv_c); - -	tgsi_parse_free(&tp); -	return ret; +   ti->indirect_outputs = TRUE;  }  static void -free_nv50_pc(struct nv50_pc *pc) +prog_inst(struct nv50_translation_info *ti, +          const struct tgsi_full_instruction *inst, int id)  { -	if (pc->immd) -		FREE(pc->immd); -	if (pc->param) -		FREE(pc->param); -	if (pc->result) -		FREE(pc->result); -	if (pc->attr) -		FREE(pc->attr); -	if (pc->temp) -		FREE(pc->temp); -	if (pc->sysval) -		FREE(pc->sysval); -	if (pc->insn_pos) -		FREE(pc->insn_pos); - -	FREE(pc); -} - -static INLINE uint32_t -nv50_map_gs_output_prim(unsigned pprim) -{ -	switch (pprim) { -	case PIPE_PRIM_POINTS: -		return NV50TCL_GP_OUTPUT_PRIMITIVE_TYPE_POINTS; -	case PIPE_PRIM_LINE_STRIP: -		return NV50TCL_GP_OUTPUT_PRIMITIVE_TYPE_LINE_STRIP; -	case PIPE_PRIM_TRIANGLE_STRIP: -		return NV50TCL_GP_OUTPUT_PRIMITIVE_TYPE_TRIANGLE_STRIP; -	default: -		NOUVEAU_ERR("invalid GS_OUTPUT_PRIMITIVE: %u\n", pprim); -		abort(); -		return 0; -	} -} - -static boolean -ctor_nv50_pc(struct nv50_pc *pc, struct nv50_program *p) -{ -	int i, c; -	unsigned rtype[2] = { P_ATTR, P_RESULT }; - -	pc->p = p; -	pc->temp_nr = p->info.file_max[TGSI_FILE_TEMPORARY] + 1; -	pc->attr_nr = p->info.file_max[TGSI_FILE_INPUT] + 1; -	pc->result_nr = p->info.file_max[TGSI_FILE_OUTPUT] + 1; -	pc->param_nr = p->info.file_max[TGSI_FILE_CONSTANT] + 1; -	pc->addr_nr = p->info.file_max[TGSI_FILE_ADDRESS] + 1; -	assert(pc->addr_nr <= 2); -	pc->sysval_nr = p->info.file_max[TGSI_FILE_SYSTEM_VALUE] + 1; - -	p->cfg.high_temp = 4; - -	p->cfg.two_side[0].hw = 0x40; -	p->cfg.two_side[1].hw 
= 0x40; -	p->cfg.prim_id = 0x40; - -	p->cfg.edgeflag_in = pc->edgeflag_out = 0xff; - -	for (i = 0; i < p->info.num_properties; ++i) { -		unsigned *data = &p->info.properties[i].data[0]; - -		switch (p->info.properties[i].name) { -		case TGSI_PROPERTY_GS_OUTPUT_PRIM: -			p->cfg.prim_type = nv50_map_gs_output_prim(data[0]); -			break; -		case TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES: -			p->cfg.vert_count = data[0]; -			break; -		default: -			break; -		} -	} - -	switch (p->type) { -	case PIPE_SHADER_VERTEX: -		p->cfg.psiz = 0x40; -		p->cfg.clpd = 0x40; -		p->cfg.out_nr = pc->result_nr; -		break; -	case PIPE_SHADER_GEOMETRY: -		assert(p->cfg.prim_type); -		assert(p->cfg.vert_count); - -		p->cfg.psiz = 0x80; -		p->cfg.clpd = 0x80; -		p->cfg.prim_id = 0x80; -		p->cfg.out_nr = pc->result_nr; -		p->cfg.in_nr = pc->attr_nr; - -		p->cfg.two_side[0].hw = 0x80; -		p->cfg.two_side[1].hw = 0x80; -		break; -	case PIPE_SHADER_FRAGMENT: -		rtype[0] = rtype[1] = P_TEMP; +   const struct tgsi_dst_register *dst; +   const struct tgsi_src_register *src; +   int s, c, k; +   unsigned mask; -		p->cfg.regs[0] = 0x01000004; -		p->cfg.in_nr = pc->attr_nr; +   if (inst->Dst[0].Register.File == TGSI_FILE_OUTPUT) { +      for (c = 0; c < 4; ++c) { +         dst = &inst->Dst[0].Register; +         if (inst->Dst[0].Register.Indirect) +            nv50_indirect_outputs(ti, id); +         if (!(dst->WriteMask & (1 << c))) +            continue; +         ti->output_access[dst->Index][c] = id; +      } -		if (p->info.writes_z) { -			p->cfg.regs[2] |= 0x00000100; -			p->cfg.regs[3] |= 0x00000011; -		} -		if (p->info.uses_kill) -			p->cfg.regs[2] |= 0x00100000; -		break; -	} +      if (inst->Instruction.Opcode == TGSI_OPCODE_MOV && +          inst->Src[0].Register.File == TGSI_FILE_INPUT && +          dst->Index == ti->edgeflag_out) +         ti->p->vp.edgeflag = inst->Src[0].Register.Index; +   } -	if (pc->temp_nr) { -		pc->temp = MALLOC(pc->temp_nr * 4 * sizeof(struct nv50_reg)); -		if (!pc->temp) -			
return FALSE; +   for (s = 0; s < inst->Instruction.NumSrcRegs; ++s) { +      src = &inst->Src[s].Register; +      if (src->File != TGSI_FILE_INPUT) +         continue; +      mask = nv50_tgsi_src_mask(inst, s); -		for (i = 0; i < pc->temp_nr * 4; ++i) -			ctor_reg(&pc->temp[i], P_TEMP, i / 4, -1); -	} +      if (inst->Src[s].Register.Indirect) +         nv50_indirect_inputs(ti, id); -	if (pc->attr_nr) { -		pc->attr = MALLOC(pc->attr_nr * 4 * sizeof(struct nv50_reg)); -		if (!pc->attr) -			return FALSE; - -		for (i = 0; i < pc->attr_nr * 4; ++i) -			ctor_reg(&pc->attr[i], rtype[0], i / 4, -1); -	} - -	if (pc->result_nr) { -		unsigned nr = pc->result_nr * 4; - -		pc->result = MALLOC(nr * sizeof(struct nv50_reg)); -		if (!pc->result) -			return FALSE; - -		for (i = 0; i < nr; ++i) -			ctor_reg(&pc->result[i], rtype[1], i / 4, -1); -	} - -	if (pc->param_nr) { -		int rid = 0; - -		pc->param = MALLOC(pc->param_nr * 4 * sizeof(struct nv50_reg)); -		if (!pc->param) -			return FALSE; - -		for (i = 0; i < pc->param_nr; ++i) -			for (c = 0; c < 4; ++c, ++rid) -				ctor_reg(&pc->param[rid], P_CONST, i, rid); -	} - -	if (pc->addr_nr) { -		pc->addr = CALLOC(pc->addr_nr * 4, sizeof(struct nv50_reg *)); -		if (!pc->addr) -			return FALSE; -	} -	for (i = 0; i < NV50_SU_MAX_ADDR; ++i) -		ctor_reg(&pc->r_addr[i], P_ADDR, -1, i + 1); - -	if (pc->sysval_nr) { -		pc->sysval = CALLOC(pc->sysval_nr, sizeof(struct nv50_reg *)); -		if (!pc->sysval) -			return FALSE; -		/* will only ever use SYSTEM_VALUE[i].x (hopefully) */ -		for (i = 0; i < pc->sysval_nr; ++i) -			ctor_reg(&pc->sysval[i], rtype[0], i, -1); -	} - -	return TRUE; +      for (c = 0; c < 4; ++c) { +         if (!(mask & (1 << c))) +            continue; +         k = tgsi_util_get_full_src_register_swizzle(&inst->Src[s], c); +         if (k <= TGSI_SWIZZLE_W) +            ti->input_access[src->Index][k] = id; +      } +   }  }  static void -nv50_program_fixup_insns(struct nv50_pc *pc) +prog_immediate(struct 
nv50_translation_info *ti, +               const struct tgsi_full_immediate *imm)  { -	struct nv50_program_exec *e, **bra_list; -	unsigned i, n, pos; - -	bra_list = CALLOC(pc->p->exec_size, sizeof(struct nv50_program_exec *)); - -	/* Collect branch instructions, we need to adjust their offsets -	 * when converting 32 bit instructions to 64 bit ones -	 */ -	for (n = 0, e = pc->p->exec_head; e; e = e->next) -		if (e->param.index >= 0 && !e->param.mask) -			bra_list[n++] = e; - -	/* Make sure we don't have any single 32 bit instructions. */ -	for (e = pc->p->exec_head, pos = 0; e; e = e->next) { -		pos += is_long(e) ? 2 : 1; - -		if ((pos & 1) && (!e->next || is_long(e->next))) { -			for (i = 0; i < n; ++i) -				if (bra_list[i]->param.index >= pos) -					bra_list[i]->param.index += 1; -			for (i = 0; i < pc->insn_nr; ++i) -				if (pc->insn_pos[i] >= pos) -					pc->insn_pos[i] += 1; -			convert_to_long(pc, e); -			++pos; -		} -	} - -	FREE(bra_list); +   int c; +   unsigned n = ++ti->immd32_nr; -	if (!pc->p->info.opcode_count[TGSI_OPCODE_CAL]) -		return; +   if (n == (1 << (ffs(n) - 1))) +      ti->immd32 = REALLOC(ti->immd32, (n / 2) * 16, (n * 2) * 16); -	/* fill in CALL offsets */ -	for (e = pc->p->exec_head; e; e = e->next) { -		if ((e->inst[0] & 2) && (e->inst[0] >> 28) == 0x2) -			e->param.index = pc->insn_pos[e->param.index]; -	} +   for (c = 0; c < 4; ++c) +      ti->immd32[(n - 1) * 4 + c] = imm->u[c].Uint;  } -static boolean -nv50_program_tx(struct nv50_program *p) +static INLINE unsigned +translate_interpolate(const struct tgsi_full_declaration *decl)  { -	struct tgsi_parse_context parse; -	struct nv50_pc *pc; -	boolean ret; - -	pc = CALLOC_STRUCT(nv50_pc); -	if (!pc) -		return FALSE; - -	ret = ctor_nv50_pc(pc, p); -	if (ret == FALSE) -		goto out_cleanup; - -	ret = nv50_program_tx_prep(pc); -	if (ret == FALSE) -		goto out_cleanup; - -	pc->insn_pos = MALLOC(pc->insn_nr * sizeof(unsigned)); - -	tgsi_parse_init(&parse, pc->p->pipe.tokens); -	while 
(!tgsi_parse_end_of_tokens(&parse)) { -		const union tgsi_full_token *tok = &parse.FullToken; +   unsigned mode; -		/* previously allow32 was FALSE for first & last instruction */ -		pc->allow32 = TRUE; +   if (decl->Declaration.Interpolate == TGSI_INTERPOLATE_CONSTANT) +      mode = NV50_INTERP_FLAT; +   else +   if (decl->Declaration.Interpolate == TGSI_INTERPOLATE_PERSPECTIVE) +      mode = 0; +   else +      mode = NV50_INTERP_LINEAR; -		tgsi_parse_token(&parse); +   if (decl->Declaration.Centroid) +      mode |= NV50_INTERP_CENTROID; -		switch (tok->Token.Type) { -		case TGSI_TOKEN_TYPE_INSTRUCTION: -			pc->insn_pos[pc->insn_cur] = pc->p->exec_size; -			++pc->insn_cur; -			ret = nv50_tgsi_insn(pc, tok); -			if (ret == FALSE) -				goto out_err; -			break; -		default: -			break; -		} -	} - -	nv50_program_fixup_insns(pc); - -	p->param_nr = pc->param_nr * 4; -	p->immd_nr = pc->immd_nr * 4; -	p->immd = pc->immd_buf; - -out_err: -	tgsi_parse_free(&parse); - -out_cleanup: -	free_nv50_pc(pc); -	return ret; +   return mode;  }  static void -nv50_program_validate(struct nv50_context *nv50, struct nv50_program *p) +prog_decl(struct nv50_translation_info *ti, +          const struct tgsi_full_declaration *decl)  { -	if (nv50_program_tx(p) == FALSE) -		assert(0); -	p->translated = TRUE; -} +   unsigned i, first, last, sn = 0, si = 0; -static void -nv50_program_upload_data(struct nv50_context *nv50, uint32_t *map, -			unsigned start, unsigned count, unsigned cbuf) -{ -	struct nouveau_channel *chan = nv50->screen->base.channel; -	struct nouveau_grobj *tesla = nv50->screen->tesla; +   first = decl->Range.First; +   last = decl->Range.Last; -	while (count) { -		unsigned nr = count > 2047 ? 
2047 : count; +   if (decl->Declaration.Semantic) { +      sn = decl->Semantic.Name; +      si = decl->Semantic.Index; +   } +   tgsi_dump_declaration(decl); -		BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1); -		OUT_RING  (chan, (cbuf << 0) | (start << 8)); -		BEGIN_RING(chan, tesla, NV50TCL_CB_DATA(0) | 0x40000000, nr); -		OUT_RINGp (chan, map, nr); +   switch (decl->Declaration.File) { +   case TGSI_FILE_INPUT: +      for (i = first; i <= last; ++i) +         ti->interp_mode[i] = translate_interpolate(decl); -		map += nr; -		start += nr; -		count -= nr; -	} -} - -static void -nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p) -{ -	struct pipe_context *pipe = &nv50->pipe; -	struct pipe_transfer *transfer; +      if (!decl->Declaration.Semantic) +         break; -	if (!p->data[0] && p->immd_nr) { -		struct nouveau_resource *heap = nv50->screen->immd_heap; +      for (i = first; i <= last; ++i) { +         ti->p->in[i].sn = sn; +         ti->p->in[i].si = si; +      } -		if (nouveau_resource_alloc(heap, p->immd_nr, p, &p->data[0])) { -			while (heap->next && heap->size < p->immd_nr) { -				struct nv50_program *evict = heap->next->priv; -				nouveau_resource_free(&evict->data[0]); -			} +      switch (sn) { +      case TGSI_SEMANTIC_FACE: +         break; +      case TGSI_SEMANTIC_COLOR: +         if (ti->p->type == PIPE_SHADER_FRAGMENT) +            ti->p->vp.bfc[si] = first; +         break; +      } +      break; +   case TGSI_FILE_OUTPUT: +      if (!decl->Declaration.Semantic) +         break; -			if (nouveau_resource_alloc(heap, p->immd_nr, p, -						   &p->data[0])) -				assert(0); -		} +      for (i = first; i <= last; ++i) { +         ti->p->out[i].sn = sn; +         ti->p->out[i].si = si; +      } -		/* immediates only need to be uploaded again when freed */ -		nv50_program_upload_data(nv50, p->immd, p->data[0]->start, -					 p->immd_nr, NV50_CB_PMISC); -	} - -	assert(p->param_nr <= 16384); - -	if (p->param_nr) { -		unsigned cb; -		
uint32_t *map = pipe_buffer_map(pipe, -						nv50->constbuf[p->type], -						PIPE_TRANSFER_READ, -						&transfer); -		switch (p->type) { -		case PIPE_SHADER_GEOMETRY: cb = NV50_CB_PGP; break; -		case PIPE_SHADER_FRAGMENT: cb = NV50_CB_PFP; break; -		default: -			cb = NV50_CB_PVP; -			assert(p->type == PIPE_SHADER_VERTEX); -			break; -		} - -		nv50_program_upload_data(nv50, map, 0, p->param_nr, cb); -		pipe_buffer_unmap(pipe, nv50->constbuf[p->type], -				  transfer); -	} +      switch (sn) { +      case TGSI_SEMANTIC_BCOLOR: +         ti->p->vp.bfc[si] = first; +         break; +      case TGSI_SEMANTIC_PSIZE: +         ti->p->vp.psiz = first; +         break; +      case TGSI_SEMANTIC_EDGEFLAG: +         ti->edgeflag_out = first; +         break; +      default: +         break; +      } +      break; +   case TGSI_FILE_SYSTEM_VALUE: +      switch (decl->Semantic.Name) { +      case TGSI_SEMANTIC_FACE: +         break; +      case TGSI_SEMANTIC_INSTANCEID: +         break; +      case TGSI_SEMANTIC_PRIMID: +         break; +         /* +      case TGSI_SEMANTIC_PRIMIDIN: +         break; +      case TGSI_SEMANTIC_VERTEXID: +         break; +         */ +      default: +         break; +      } +      break; +   case TGSI_FILE_CONSTANT: +      ti->p->parm_size = MAX2(ti->p->parm_size, (last + 1) * 16); +      break; +   case TGSI_FILE_ADDRESS: +   case TGSI_FILE_SAMPLER: +   case TGSI_FILE_TEMPORARY: +      break; +   default: +      assert(0); +      break; +   }  } -static void -nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p) +static int +nv50_vertprog_prepare(struct nv50_translation_info *ti)  { -	struct nouveau_channel *chan = nv50->screen->base.channel; -	struct nouveau_grobj *tesla = nv50->screen->tesla; -	struct nv50_program_exec *e; -	uint32_t *up, i; -	boolean upload = FALSE; -	unsigned offset; -	int width; - -	if (!p->bo) { -		nouveau_bo_new(chan->device, NOUVEAU_BO_VRAM, 0x100, -			       p->exec_size * 4, &p->bo); -		
upload = TRUE; -	} - -	if (p->data[0] && p->data[0]->start != p->data_start[0]) -		upload = TRUE; - -	if (!upload) -		return; - -	up = MALLOC(p->exec_size * 4); - -	for (i = 0, e = p->exec_head; e; e = e->next) { -		unsigned ei, ci, bs; - -		if (e->param.index >= 0 && e->param.mask) { -			bs = (e->inst[1] >> 22) & 0x07; -			assert(bs < 2); -			ei = e->param.shift >> 5; -			ci = e->param.index; -			if (bs == 0) -				ci += p->data[bs]->start; +   struct nv50_program *p = ti->p; +   int i, c; +   unsigned num_inputs = 0; -			e->inst[ei] &= ~e->param.mask; -			e->inst[ei] |= (ci << e->param.shift); -		} else -		if (e->param.index >= 0) { -			/* zero mask means param is a jump/branch offset */ -			assert(!(e->param.index & 1)); -			/* seem to be 8 byte steps */ -			ei = (e->param.index >> 1) + 0 /* START_ID */; +   ti->input_file = NV_FILE_MEM_S; +   ti->output_file = NV_FILE_OUT; -			e->inst[0] &= 0xf0000fff; -			e->inst[0] |= ei << 12; -		} +   for (i = 0; i <= ti->scan.file_max[TGSI_FILE_INPUT]; ++i) { +      p->in[i].id = i; +      p->in[i].hw = num_inputs; -		up[i++] = e->inst[0]; -		if (is_long(e)) -			up[i++] = e->inst[1]; -	} -	assert(i == p->exec_size); +      for (c = 0; c < 4; ++c) { +         if (!ti->input_access[i][c]) +            continue; +         ti->input_map[i][c] = num_inputs++; +         p->vp.attrs[(4 * i + c) / 32] |= 1 << ((i * 4 + c) % 32); +      } +   } -	if (p->data[0]) -		p->data_start[0] = p->data[0]->start; +   for (i = 0; i <= ti->scan.file_max[TGSI_FILE_OUTPUT]; ++i) { +      p->out[i].id = i; +      p->out[i].hw = p->max_out; -#ifdef NV50_PROGRAM_DUMP -	NOUVEAU_ERR("-------\n"); -	for (e = p->exec_head; e; e = e->next) { -		NOUVEAU_ERR("0x%08x\n", e->inst[0]); -		if (is_long(e)) -			NOUVEAU_ERR("0x%08x\n", e->inst[1]); -	} -#endif +      for (c = 0; c < 4; ++c) { +         if (!ti->output_access[i][c]) +            continue; +         ti->output_map[i][c] = p->max_out++; +         p->out[i].mask |= 1 << c; +      } +   } -	/* 
SIFC_HEIGHT/SIFC_WIDTH of 65536 do not work, and are not reported -	 * as data error either. hw bug ? */ -#define SIFC_MAX_WIDTH (65536 - 256) -	offset = 0; -	width = p->exec_size * 4; -	while (width > 0) { -		nv50_upload_sifc(nv50, p->bo, offset, NOUVEAU_BO_VRAM, -				 NV50_2D_DST_FORMAT_R8_UNORM, 65536, 1, 262144, -				 &up[offset / 4], NV50_2D_SIFC_FORMAT_R8_UNORM, -				 0, 0, 0, MIN2(SIFC_MAX_WIDTH, width), 1, 1); -		width -= SIFC_MAX_WIDTH; -		offset += SIFC_MAX_WIDTH; -	} -	BEGIN_RING(chan, tesla, NV50TCL_CODE_CB_FLUSH, 1); -	OUT_RING  (chan, 0); +   if (p->vp.psiz < 0x40) +      p->vp.psiz = p->out[p->vp.psiz].hw; -	FREE(up); +   return 0;  } -struct nouveau_stateobj * -nv50_vertprog_validate(struct nv50_context *nv50) +static int +nv50_fragprog_prepare(struct nv50_translation_info *ti)  { -	struct nouveau_grobj *tesla = nv50->screen->tesla; -	struct nv50_program *p = nv50->vertprog; -	struct nouveau_stateobj *so; - -	if (!p->translated) { -		nv50_program_validate(nv50, p); -		if (!p->translated) -			assert(0); -	} - -	nv50_program_validate_data(nv50, p); -	nv50_program_validate_code(nv50, p); - -	if (!(nv50->dirty & NV50_NEW_VERTPROG)) -		return NULL; +   struct nv50_program *p = ti->p; +   int i, j, c; +   unsigned nvary, nintp, depr; +   unsigned n = 0, m = 0, skip = 0; +   ubyte sn[16], si[16]; -	so = so_new(5, 7, 2); -	so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2); -	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | -		  NOUVEAU_BO_HIGH, 0, 0); -	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | -		  NOUVEAU_BO_LOW, 0, 0); -	so_method(so, tesla, NV50TCL_VP_ATTR_EN_0, 2); -	so_data  (so, p->cfg.attr[0]); -	so_data  (so, p->cfg.attr[1]); -	so_method(so, tesla, NV50TCL_VP_REG_ALLOC_RESULT, 1); -	so_data  (so, p->cfg.high_result); -	so_method(so, tesla, NV50TCL_VP_REG_ALLOC_TEMP, 1); -	so_data  (so, p->cfg.high_temp); -	so_method(so, tesla, NV50TCL_VP_START_ID, 1); -	so_data  (so, 0); /* program start offset */ -	return so; -} - -struct 
nouveau_stateobj * -nv50_fragprog_validate(struct nv50_context *nv50) -{ -	struct nouveau_grobj *tesla = nv50->screen->tesla; -	struct nv50_program *p = nv50->fragprog; -	struct nouveau_stateobj *so; +   /* FP flags */ -	if (!p->translated) { -		nv50_program_validate(nv50, p); -		if (!p->translated) -			assert(0); -	} +   if (ti->scan.writes_z) { +      p->fp.flags[1] = 0x11; +      p->fp.flags[0] |= NV50TCL_FP_CONTROL_EXPORTS_Z; +   } -	nv50_program_validate_data(nv50, p); -	nv50_program_validate_code(nv50, p); +   if (ti->scan.uses_kill) +      p->fp.flags[0] |= NV50TCL_FP_CONTROL_USES_KIL; -	if (!(nv50->dirty & NV50_NEW_FRAGPROG)) -		return NULL; +   /* FP inputs */ -	so = so_new(6, 7, 2); -	so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2); -	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | -		      NOUVEAU_BO_HIGH, 0, 0); -	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | -		      NOUVEAU_BO_LOW, 0, 0); -	so_method(so, tesla, NV50TCL_FP_REG_ALLOC_TEMP, 1); -	so_data  (so, p->cfg.high_temp); -	so_method(so, tesla, NV50TCL_FP_RESULT_COUNT, 1); -	so_data  (so, p->cfg.high_result); -	so_method(so, tesla, NV50TCL_FP_CONTROL, 1); -	so_data  (so, p->cfg.regs[2]); -	so_method(so, tesla, NV50TCL_FP_CTRL_UNK196C, 1); -	so_data  (so, p->cfg.regs[3]); -	so_method(so, tesla, NV50TCL_FP_START_ID, 1); -	so_data  (so, 0); /* program start offset */ -	return so; -} +   ti->input_file = NV_FILE_MEM_V; +   ti->output_file = NV_FILE_GPR; -struct nouveau_stateobj * -nv50_geomprog_validate(struct nv50_context *nv50) -{ -	struct nouveau_grobj *tesla = nv50->screen->tesla; -	struct nv50_program *p = nv50->geomprog; -	struct nouveau_stateobj *so; +   /* count non-flat inputs, save semantic info */ +   for (i = 0; i < p->in_nr; ++i) { +      m += (ti->interp_mode[i] & NV50_INTERP_FLAT) ? 
0 : 1; +      sn[i] = p->in[i].sn; +      si[i] = p->in[i].si; +   } -	if (!p->translated) { -		nv50_program_validate(nv50, p); -		if (!p->translated) -			assert(0); -	} +   /* reorder p->in[] so that non-flat inputs are first and +    * kick out special inputs that don't use VP/GP_RESULT_MAP +    */ +   nintp = 0; +   for (i = 0; i < p->in_nr; ++i) { +      if (sn[i] == TGSI_SEMANTIC_POSITION) { +         for (c = 0; c < 4; ++c) { +            ti->input_map[i][c] = nintp; +            if (ti->input_access[i][c]) { +               p->fp.interp |= 1 << (24 + c); +               ++nintp; +            } +         } +         skip++; +         continue; +      } else +      if (sn[i] == TGSI_SEMANTIC_FACE) { +         ti->input_map[i][0] = 255; +         skip++; +         continue; +      } -	nv50_program_validate_data(nv50, p); -	nv50_program_validate_code(nv50, p); +      j = (ti->interp_mode[i] & NV50_INTERP_FLAT) ? m++ : n++; -	if (!(nv50->dirty & NV50_NEW_GEOMPROG)) -		return NULL; +      if (sn[i] == TGSI_SEMANTIC_COLOR) +         p->vp.bfc[si[i]] = j; +	    +      p->in[j].linear = (ti->interp_mode[i] & NV50_INTERP_LINEAR) ? 
1 : 0; +      p->in[j].id = i; +      p->in[j].sn = sn[i]; +      p->in[j].si = si[i]; +   } +   assert(n <= m); +   p->in_nr -= skip; -	so = so_new(6, 7, 2); -	so_method(so, tesla, NV50TCL_GP_ADDRESS_HIGH, 2); -	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | -		  NOUVEAU_BO_HIGH, 0, 0); -	so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | -		  NOUVEAU_BO_LOW, 0, 0); -	so_method(so, tesla, NV50TCL_GP_REG_ALLOC_TEMP, 1); -	so_data  (so, p->cfg.high_temp); -	so_method(so, tesla, NV50TCL_GP_REG_ALLOC_RESULT, 1); -	so_data  (so, p->cfg.high_result); -	so_method(so, tesla, NV50TCL_GP_OUTPUT_PRIMITIVE_TYPE, 1); -	so_data  (so, p->cfg.prim_type); -	so_method(so, tesla, NV50TCL_GP_VERTEX_OUTPUT_COUNT, 1); -	so_data  (so, p->cfg.vert_count); -	so_method(so, tesla, NV50TCL_GP_START_ID, 1); -	so_data  (so, 0); -	return so; -} +   if (!(p->fp.interp & (8 << 24))) { +      p->fp.interp |= (8 << 24); +      ++nintp; +   } -static uint32_t -nv50_pntc_replace(struct nv50_context *nv50, uint32_t pntc[8], unsigned base) -{ -	struct nv50_program *vp; -	struct nv50_program *fp = nv50->fragprog; -	unsigned i, c, m = base; -	uint32_t origin = 0x00000010; +   p->fp.colors = (1 << 24) | 4; /* CLAMP, FFC0_ID = 4 */ -	vp = nv50->geomprog ? nv50->geomprog : nv50->vertprog; +   for (i = 0; i < p->in_nr; ++i) { +      int j = p->in[i].id; +      p->in[i].hw = nintp; -	/* XXX: this might not work correctly in all cases yet - we'll -	 * just assume that an FP generic input that is not written in -	 * the VP is PointCoord. 
-	 */ -	memset(pntc, 0, 8 * sizeof(uint32_t)); +      for (c = 0; c < 4; ++c) { +         if (!ti->input_access[j][c]) +            continue; +         p->in[i].mask |= 1 << c; +         ti->input_map[j][c] = nintp++; +      } +      /* count color inputs */ +      if (i == p->vp.bfc[0] || i == p->vp.bfc[1]) +         p->fp.colors += bitcount4(p->in[i].mask) << 16; +   } +   nintp -= bitcount4(p->fp.interp >> 24); /* subtract position inputs */ +   nvary = nintp; +   if (n < m) +      nvary -= p->in[n].hw; -	for (i = 0; i < fp->cfg.in_nr; i++) { -		unsigned j, n = popcnt4(fp->cfg.in[i].mask); +   p->fp.interp |= nvary << NV50TCL_FP_INTERPOLANT_CTRL_COUNT_NONFLAT_SHIFT; +   p->fp.interp |= nintp << NV50TCL_FP_INTERPOLANT_CTRL_COUNT_SHIFT; -		if (fp->cfg.in[i].sn != TGSI_SEMANTIC_GENERIC) { -			m += n; -			continue; -		} +   /* FP outputs */ -		for (j = 0; j < vp->cfg.out_nr; ++j) -			if (vp->cfg.out[j].sn ==  fp->cfg.in[i].sn && -			    vp->cfg.out[j].si == fp->cfg.in[i].si) -				break; +   if (p->out_nr > (1 + (ti->scan.writes_z ? 
1 : 0))) +      p->fp.flags[0] |= NV50TCL_FP_CONTROL_MULTIPLE_RESULTS; -		if (j < vp->info.num_outputs) { -			ubyte enable = -				 (nv50->rasterizer->pipe.sprite_coord_enable >> vp->cfg.out[j].si) & 1; +   depr = p->out_nr; +   for (i = 0; i < p->out_nr; ++i) { +      p->out[i].id = i; +      if (p->out[i].sn == TGSI_SEMANTIC_POSITION) { +         depr = i; +         continue; +      } +      p->out[i].hw = p->max_out; +      p->out[i].mask = 0xf; -			if (enable == 0) { -				m += n; -				continue; -			} -		} +      for (c = 0; c < 4; ++c) +         ti->output_map[i][c] = p->max_out++; +   } +   if (depr < p->out_nr) { +      p->out[depr].mask = 0x4; +      p->out[depr].hw = p->max_out++; +   } -		/* this is either PointCoord or replaced by sprite coords */ -		for (c = 0; c < 4; c++) { -			if (!(fp->cfg.in[i].mask & (1 << c))) -				continue; -			pntc[m / 8] |= (c + 1) << ((m % 8) * 4); -			++m; -		} -	} -	return (nv50->rasterizer->pipe.sprite_coord_mode == PIPE_SPRITE_COORD_LOWER_LEFT ? 0 : origin); +   return 0;  }  static int -nv50_vec4_map(uint32_t *map32, int mid, uint8_t zval, uint32_t lin[4], -	      struct nv50_sreg4 *fpi, struct nv50_sreg4 *vpo) +nv50_geomprog_prepare(struct nv50_translation_info *ti)  { -	int c; -	uint8_t mv = vpo->mask, mf = fpi->mask, oid = vpo->hw; -	uint8_t *map = (uint8_t *)map32; +   ti->input_file = NV_FILE_MEM_S; +   ti->output_file = NV_FILE_OUT; -	for (c = 0; c < 4; ++c) { -		if (mf & 1) { -			if (fpi->linear == TRUE) -				lin[mid / 32] |= 1 << (mid % 32); -			if (mv & 1) -				map[mid] = oid; -			else -				map[mid] = (c == 3) ? 
(zval + 1) : zval; -			++mid; -		} - -		oid += mv & 1; -		mf >>= 1; -		mv >>= 1; -	} - -	return mid; +   assert(0); +   return 1;  } -struct nouveau_stateobj * -nv50_fp_linkage_validate(struct nv50_context *nv50) +static int +nv50_prog_scan(struct nv50_translation_info *ti)  { -	struct nouveau_grobj *tesla = nv50->screen->tesla; -	struct nv50_program *vp = nv50->vertprog; -	struct nv50_program *fp = nv50->fragprog; -	struct nouveau_stateobj *so; -	struct nv50_sreg4 dummy; -	int i, n, c, m = 0; -	uint32_t map[16], lin[4], reg[6], pcrd[8]; -	uint8_t zval = 0x40; +   struct nv50_program *p = ti->p; +   struct tgsi_parse_context parse; +   int ret; -	if (nv50->geomprog) { -		vp = nv50->geomprog; -		zval = 0x80; -	} -	memset(map, 0, sizeof(map)); -	memset(lin, 0, sizeof(lin)); +   p->vp.psiz = 0x40; +   p->vp.bfc[0] = 0x40; +   p->vp.bfc[1] = 0x40; +   p->gp.primid = 0x80; -	reg[1] = 0x00000004; /* low and high clip distance map ids */ -	reg[2] = 0x00000000; /* layer index map id (disabled, GP only) */ -	reg[3] = 0x00000000; /* point size map id & enable */ -	reg[5] = 0x00000000; /* primitive ID map slot */ -	reg[0] = fp->cfg.regs[0]; /* colour semantic reg */ -	reg[4] = fp->cfg.regs[1]; /* interpolant info */ +   tgsi_scan_shader(p->pipe.tokens, &ti->scan); -	dummy.linear = FALSE; -	dummy.mask = 0xf; /* map all components of HPOS */ -	m = nv50_vec4_map(map, m, zval, lin, &dummy, &vp->cfg.out[0]); +   tgsi_parse_init(&parse, p->pipe.tokens); +   while (!tgsi_parse_end_of_tokens(&parse)) { +      tgsi_parse_token(&parse); -	dummy.mask = 0x0; +      switch (parse.FullToken.Token.Type) { +      case TGSI_TOKEN_TYPE_IMMEDIATE: +         prog_immediate(ti, &parse.FullToken.FullImmediate); +         break; +      case TGSI_TOKEN_TYPE_DECLARATION: +         prog_decl(ti, &parse.FullToken.FullDeclaration); +         break; +      case TGSI_TOKEN_TYPE_INSTRUCTION: +         prog_inst(ti, &parse.FullToken.FullInstruction, ++ti->inst_nr); +         break; +      } +   } -	if 
(vp->cfg.clpd < 0x40) { -		for (c = 0; c < vp->cfg.clpd_nr; ++c) { -			map[m / 4] |= (vp->cfg.clpd + c) << ((m % 4) * 8); -			++m; -		} -		reg[1] = (m << 8); -	} +   p->in_nr = ti->scan.file_max[TGSI_FILE_INPUT] + 1; +   p->out_nr = ti->scan.file_max[TGSI_FILE_OUTPUT] + 1; -	reg[0] |= m << 8; /* adjust BFC0 id */ +   switch (p->type) { +   case PIPE_SHADER_VERTEX: +      ret = nv50_vertprog_prepare(ti); +      break; +   case PIPE_SHADER_FRAGMENT: +      ret = nv50_fragprog_prepare(ti); +      break; +   case PIPE_SHADER_GEOMETRY: +      ret = nv50_geomprog_prepare(ti); +      break; +   default: +      assert(!"unsupported program type"); +      ret = -1; +      break; +   } -	/* if light_twoside is active, it seems FFC0_ID == BFC0_ID is bad */ -	if (nv50->rasterizer->pipe.light_twoside) { -		struct nv50_sreg4 *vpo = &vp->cfg.two_side[0]; -		struct nv50_sreg4 *fpi = &fp->cfg.two_side[0]; - -		m = nv50_vec4_map(map, m, zval, lin, &fpi[0], &vpo[0]); -		m = nv50_vec4_map(map, m, zval, lin, &fpi[1], &vpo[1]); -	} - -	reg[0] += m - 4; /* adjust FFC0 id */ -	reg[4] |= m << 8; /* set mid where 'normal' FP inputs start */ - -	for (i = 0; i < fp->cfg.in_nr; i++) { -		/* maybe even remove these from cfg.io */ -		if (fp->cfg.in[i].sn == TGSI_SEMANTIC_POSITION || -		    fp->cfg.in[i].sn == TGSI_SEMANTIC_FACE) -			continue; - -		for (n = 0; n < vp->cfg.out_nr; ++n) -			if (vp->cfg.out[n].sn == fp->cfg.in[i].sn && -			    vp->cfg.out[n].si == fp->cfg.in[i].si) -				break; - -		m = nv50_vec4_map(map, m, zval, lin, &fp->cfg.in[i], -				  (n < vp->cfg.out_nr) ? 
-				  &vp->cfg.out[n] : &dummy); -	} -	/* PrimitiveID either is replaced by the system value, or -	 * written by the geometry shader into an output register -	 */ -	if (fp->cfg.prim_id < 0x40) { -		map[m / 4] |= vp->cfg.prim_id << ((m % 4) * 8); -		reg[5] = m++; -	} - -	if (nv50->rasterizer->pipe.point_size_per_vertex) { -		map[m / 4] |= vp->cfg.psiz << ((m % 4) * 8); -		reg[3] = (m++ << 4) | 1; -	} - -	/* now fill the stateobj (at most 28 so_data)  */ -	so = so_new(10, 54, 0); - -	n = (m + 3) / 4; -	assert(m <= 64); -	if (vp->type == PIPE_SHADER_GEOMETRY) { -		so_method(so, tesla, NV50TCL_GP_RESULT_MAP_SIZE, 1); -		so_data  (so, m); -		so_method(so, tesla, NV50TCL_GP_RESULT_MAP(0), n); -		so_datap (so, map, n); -	} else { -		so_method(so, tesla, NV50TCL_VP_GP_BUILTIN_ATTR_EN, 1); -		so_data  (so, vp->cfg.regs[0]); - -		so_method(so, tesla, NV50TCL_MAP_SEMANTIC_4, 1); -		so_data  (so, reg[5]); - -		so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 1); -		so_data  (so, m); -		so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), n); -		so_datap (so, map, n); -	} - -	so_method(so, tesla, NV50TCL_MAP_SEMANTIC_0, 4); -	so_datap (so, reg, 4); - -	so_method(so, tesla, NV50TCL_FP_INTERPOLANT_CTRL, 1); -	so_data  (so, reg[4]); - -	so_method(so, tesla, NV50TCL_NOPERSPECTIVE_BITMAP(0), 4); -	so_datap (so, lin, 4); - -	if (nv50->rasterizer->pipe.sprite_coord_enable) { -		so_method(so, tesla, NV50TCL_POINT_SPRITE_CTRL, 1); -		so_data  (so, -			  nv50_pntc_replace(nv50, pcrd, (reg[4] >> 8) & 0xff)); - -		so_method(so, tesla, NV50TCL_POINT_COORD_REPLACE_MAP(0), 8); -		so_datap (so, pcrd, 8); -	} - -	so_method(so, tesla, NV50TCL_GP_ENABLE, 1); -	so_data  (so, (vp->type == PIPE_SHADER_GEOMETRY) ? 
1 : 0); - -	return so; -} - -static int -construct_vp_gp_mapping(uint32_t *map32, int m, -			struct nv50_program *vp, struct nv50_program *gp) -{ -	uint8_t *map = (uint8_t *)map32; -	int i, j, c; - -        for (i = 0; i < gp->cfg.in_nr; ++i) { -                uint8_t oid = 0, mv = 0, mg = gp->cfg.in[i].mask; - -                for (j = 0; j < vp->cfg.out_nr; ++j) { -                        if (vp->cfg.out[j].sn == gp->cfg.in[i].sn && -                            vp->cfg.out[j].si == gp->cfg.in[i].si) { -				mv = vp->cfg.out[j].mask; -				oid = vp->cfg.out[j].hw; -                                break; -			} -		} - -                for (c = 0; c < 4; ++c, mv >>= 1, mg >>= 1) { -			if (mg & mv & 1) -				map[m++] = oid; -			else -			if (mg & 1) -				map[m++] = (c == 3) ? 0x41 : 0x40; -                        oid += mv & 1; -                } -        } -	return m; +   assert(!ret); +   return ret;  } -struct nouveau_stateobj * -nv50_gp_linkage_validate(struct nv50_context *nv50) +boolean +nv50_program_tx(struct nv50_program *p)  { -	struct nouveau_grobj *tesla = nv50->screen->tesla; -	struct nouveau_stateobj *so; -	struct nv50_program *vp = nv50->vertprog; -	struct nv50_program *gp = nv50->geomprog; -	uint32_t map[16]; -	int m = 0; +   struct nv50_translation_info *ti; +   int ret; -	if (!gp) -		return NULL; -	memset(map, 0, sizeof(map)); +   ti = CALLOC_STRUCT(nv50_translation_info); +   ti->p = p; -	m = construct_vp_gp_mapping(map, m, vp, gp); +   ti->edgeflag_out = PIPE_MAX_SHADER_OUTPUTS; -	so = so_new(3, 24 - 3, 0); +   ret = nv50_prog_scan(ti); +   if (ret) { +      NOUVEAU_ERR("unsupported shader program\n"); +      goto out; +   } -	so_method(so, tesla, NV50TCL_VP_GP_BUILTIN_ATTR_EN, 1); -	so_data  (so, vp->cfg.regs[0] | gp->cfg.regs[0]); +   ret = nv50_generate_code(ti); +   if (ret) { +      NOUVEAU_ERR("error during shader translation\n"); +      goto out; +   } -	assert(m <= 32); -	so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 1); -	so_data  (so, m); 
- -	m = (m + 3) / 4; -	so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), m); -	so_datap (so, map, m); - -	return so; +out: +   if (ti->immd32) +      FREE(ti->immd32); +   FREE(ti); +   return ret ? FALSE : TRUE;  }  void  nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)  { -	while (p->exec_head) { -		struct nv50_program_exec *e = p->exec_head; - -		p->exec_head = e->next; -		FREE(e); -	} -	p->exec_tail = NULL; -	p->exec_size = 0; +   nouveau_bo_ref(NULL, &p->bo); -	nouveau_bo_ref(NULL, &p->bo); +   so_ref(NULL, &p->so); -	FREE(p->immd); -	nouveau_resource_free(&p->data[0]); +   if (p->code) +      FREE(p->code); -	p->translated = 0; +   p->translated = FALSE;  } diff --git a/src/gallium/drivers/nv50/nv50_program.h b/src/gallium/drivers/nv50/nv50_program.h index 1e3ad6bff0..654bce59f3 100644 --- a/src/gallium/drivers/nv50/nv50_program.h +++ b/src/gallium/drivers/nv50/nv50_program.h @@ -1,75 +1,116 @@ -#ifndef __NV50_PROGRAM_H__ -#define __NV50_PROGRAM_H__ +/* + * Copyright 2010 Ben Skeggs + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  
IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __NV50_PROG_H__ +#define __NV50_PROG_H__  #include "pipe/p_state.h"  #include "tgsi/tgsi_scan.h" +#include "nouveau/nouveau_class.h" -struct nv50_program_exec { -	struct nv50_program_exec *next; - -	unsigned inst[2]; -	struct { -		int index; -		unsigned mask; -		unsigned shift; -	} param; -}; - -struct nv50_sreg4 { -	uint8_t hw; /* hw index, nv50 wants flat FP inputs last */ -	uint8_t id; /* tgsi index */ +struct nv50_varying { +   uint8_t id; /* tgsi index */ +   uint8_t hw; /* hw index, nv50 wants flat FP inputs last */ -	uint8_t mask; -	boolean linear; +   uint8_t mask   : 4; +   uint8_t linear : 1; +   uint8_t pad    : 3; -	ubyte sn, si; /* semantic name & index */ +   ubyte sn; /* semantic name */ +   ubyte si; /* semantic index */  };  struct nv50_program { -	struct pipe_shader_state pipe; -	struct tgsi_shader_info info; -	boolean translated; +   struct pipe_shader_state pipe; -	unsigned type; -	struct nv50_program_exec *exec_head; -	struct nv50_program_exec *exec_tail; -	unsigned exec_size; -	struct nouveau_resource *data[1]; -	unsigned data_start[1]; +   ubyte type; +   boolean translated; -	struct nouveau_bo *bo; +   struct nouveau_bo *bo; +   struct nouveau_stateobj *so; -	uint32_t *immd; -	unsigned immd_nr; -	unsigned param_nr; +   uint32_t *code; +   unsigned code_size; +   unsigned code_start; /* offset inside bo */ +   uint32_t *immd; +   unsigned immd_size; +   unsigned parm_size; /* size limit of uniform buffer */ -	struct { -		unsigned high_temp; -		unsigned high_result; +   ubyte max_gpr; /* REG_ALLOC_TEMP */ +   ubyte max_out; /* REG_ALLOC_RESULT or FP_RESULT_COUNT */ -		uint32_t attr[2]; -		uint32_t regs[4]; +   ubyte in_nr; +   ubyte out_nr; +   struct nv50_varying in[16]; +   
struct nv50_varying out[16]; -		/* for VPs, io_nr doesn't count 'private' results (PSIZ etc.) */ -		unsigned in_nr, out_nr; -		struct nv50_sreg4 in[PIPE_MAX_SHADER_INPUTS]; -		struct nv50_sreg4 out[PIPE_MAX_SHADER_OUTPUTS]; +   struct { +      uint32_t attrs[3]; /* VP_ATTR_EN_0,1 and VP_GP_BUILTIN_ATTR_EN */ +      ubyte psiz; +      ubyte bfc[2]; +      ubyte edgeflag; +      ubyte clpd; +      ubyte clpd_nr; +   } vp; -		/* FP colour inputs, VP/GP back colour outputs */ -		struct nv50_sreg4 two_side[2]; +   struct { +      uint32_t flags[2]; /* 0x19a8, 196c */ +      uint32_t interp; /* 0x1988 */ +      uint32_t colors; /* 0x1904 */ +   } fp; -		/* GP only */ -		unsigned vert_count; -		uint8_t prim_type; +   struct { +      ubyte primid; /* primitive id output register */ +      uint8_t vert_count; +      uint8_t prim_type; /* point, line strip or tri strip */ +   } gp; -		/* VP & GP only */ -		uint8_t clpd, clpd_nr; -		uint8_t psiz; -		uint8_t edgeflag_in; +   void *fixups; +   unsigned num_fixups; +}; + +#define NV50_INTERP_LINEAR   (1 << 0) +#define NV50_INTERP_FLAT     (1 << 1) +#define NV50_INTERP_CENTROID (1 << 2) -		/* FP & GP only */ -		uint8_t prim_id; -	} cfg; +struct nv50_translation_info { +   struct nv50_program *p; +   unsigned inst_nr; +   ubyte input_file; +   ubyte output_file; +   ubyte input_map[PIPE_MAX_SHADER_INPUTS][4]; +   ubyte output_map[PIPE_MAX_SHADER_OUTPUTS][4]; +   ubyte interp_mode[PIPE_MAX_SHADER_INPUTS]; +   int input_access[PIPE_MAX_SHADER_INPUTS][4]; +   int output_access[PIPE_MAX_SHADER_OUTPUTS][4]; +   boolean indirect_inputs; +   boolean indirect_outputs; +   struct tgsi_shader_info scan; +   uint32_t *immd32; +   unsigned immd32_nr; +   ubyte edgeflag_out;  }; -#endif +int nv50_generate_code(struct nv50_translation_info *ti); +boolean nv50_program_tx(struct nv50_program *p); + +#endif /* __NV50_PROG_H__ */ diff --git a/src/gallium/drivers/nv50/nv50_push.c b/src/gallium/drivers/nv50/nv50_push.c index c3ac804146..481182dd8d 
100644 --- a/src/gallium/drivers/nv50/nv50_push.c +++ b/src/gallium/drivers/nv50/nv50_push.c @@ -227,7 +227,7 @@ nv50_push_elements_instanced(struct pipe_context *pipe,     ctx.idxbuf = NULL;     ctx.vtx_size = 0;     ctx.edgeflag = 0.5f; -   ctx.edgeflag_attr = nv50->vertprog->cfg.edgeflag_in; +   ctx.edgeflag_attr = nv50->vertprog->vp.edgeflag;     /* map vertex buffers, determine vertex size */     for (i = 0; i < nv50->vtxelt->num_elements; i++) { diff --git a/src/gallium/drivers/nv50/nv50_shader_state.c b/src/gallium/drivers/nv50/nv50_shader_state.c new file mode 100644 index 0000000000..f7e6355286 --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_shader_state.c @@ -0,0 +1,619 @@ +/* + * Copyright 2008 Ben Skeggs + * Copyright 2010 Christoph Bumiller + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" +#include "util/u_inlines.h" + +#include "nv50_context.h" +#include "nv50_transfer.h" + +static void +nv50_transfer_constbuf(struct nv50_context *nv50, +                       struct pipe_resource *buf, unsigned size, unsigned cbi) +{ +   struct pipe_context *pipe = &nv50->pipe; +   struct pipe_transfer *transfer; +   struct nouveau_channel *chan = nv50->screen->base.channel; +   struct nouveau_grobj *tesla = nv50->screen->tesla; +   uint32_t *map; +   unsigned count, start; + +   map = pipe_buffer_map(pipe, buf, PIPE_TRANSFER_READ, &transfer); +   if (!map) +      return; + +   count = MIN2(buf->width0, size); +   start = 0; + +   while (count) { +      unsigned nr = count; +      nr = MIN2(nr, 2047); + +      /* FIXME: emit relocs for unsuiTed MM */ +      BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1); +      OUT_RING  (chan, (start << 8) | cbi); +      BEGIN_RING_NI(chan, tesla, NV50TCL_CB_DATA(0), nr); +      OUT_RINGp (chan, map, nr); + +      count -= nr; +      start += nr; +      map += nr; +   } + +   pipe_buffer_unmap(pipe, buf, transfer); +} + +static void +nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p) +{ +   struct nouveau_channel *chan = nv50->screen->base.channel; +   struct nouveau_grobj *tesla = nv50->screen->tesla; +   unsigned cbi; + +   if (p->immd_size) { +      uint32_t *data = p->immd; +      unsigned count = p->immd_size / 4; +      unsigned start = 0; + +      while (count) { +         unsigned nr = count; +         nr = MIN2(nr, 2047); + +         BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1); +         OUT_RING  (chan, (start << 8) | NV50_CB_PMISC); +         BEGIN_RING_NI(chan, tesla, NV50TCL_CB_DATA(0), nr); +         OUT_RINGp (chan, data, nr); + +         count -= nr; +         start += nr; +         data += nr; +      } +   } + +   if (p->parm_size == 0) +      return; + +   switch (p->type) { +   case 
PIPE_SHADER_VERTEX: +      cbi = NV50_CB_PVP; +      break; +   case PIPE_SHADER_FRAGMENT: +      cbi = NV50_CB_PFP; +      break; +   case PIPE_SHADER_GEOMETRY: +      cbi = NV50_CB_PGP; +      break; +   default: +      assert(0); +      break; +   } + +   nv50_transfer_constbuf(nv50, nv50->constbuf[p->type], p->parm_size, cbi); +} + +static void +nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p) +{ +   struct nouveau_channel *chan = nv50->screen->base.channel; +   struct nouveau_grobj *tesla = nv50->screen->tesla; +   struct nouveau_grobj *eng2d = nv50->screen->eng2d; +   int ret; +   unsigned offset; +   unsigned size = p->code_size; +   uint32_t *data = p->code; + +   assert(p->translated); + +   /* TODO: use a single bo (for each type) for shader code */ +   if (p->bo) +      return; +   ret = nouveau_bo_new(chan->device, NOUVEAU_BO_VRAM, 0x100, size, &p->bo); +   assert(!ret); + +   offset = p->code_start = 0; + +   BEGIN_RING(chan, eng2d, NV50_2D_DST_FORMAT, 2); +   OUT_RING  (chan, NV50_2D_DST_FORMAT_R8_UNORM); +   OUT_RING  (chan, 1); +   BEGIN_RING(chan, eng2d, NV50_2D_DST_PITCH, 1); +   OUT_RING  (chan, 0x40000); +   BEGIN_RING(chan, eng2d, NV50_2D_DST_WIDTH, 2); +   OUT_RING  (chan, 0x10000); +   OUT_RING  (chan, 1); + +   while (size) { +      unsigned nr = size / 4; + +      if (AVAIL_RING(chan) < 32) +         FIRE_RING(chan); + +      nr = MIN2(nr, AVAIL_RING(chan) - 18); +      nr = MIN2(nr, 1792); +      if (nr < (size / 4)) +         nr &= ~0x3f; +      assert(!(size & 3)); + +      BEGIN_RING(chan, eng2d, NV50_2D_DST_ADDRESS_HIGH, 2); +      OUT_RELOCh(chan, p->bo, offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); +      OUT_RELOCl(chan, p->bo, offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); +      BEGIN_RING(chan, eng2d, NV50_2D_SIFC_BITMAP_ENABLE, 2); +      OUT_RING  (chan, 0); +      OUT_RING  (chan, NV50_2D_SIFC_FORMAT_R8_UNORM); +      BEGIN_RING(chan, eng2d, NV50_2D_SIFC_WIDTH, 10); +      OUT_RING  (chan, nr * 4); +      
OUT_RING  (chan, 1); +      OUT_RING  (chan, 0); +      OUT_RING  (chan, 1); +      OUT_RING  (chan, 0); +      OUT_RING  (chan, 1); +      OUT_RING  (chan, 0); +      OUT_RING  (chan, 0); +      OUT_RING  (chan, 0); +      OUT_RING  (chan, 0); + +      BEGIN_RING_NI(chan, eng2d, NV50_2D_SIFC_DATA, nr); +      OUT_RINGp (chan, data, nr); + +      data += nr; +      offset += nr * 4; +      size -= nr * 4; +   } + +   BEGIN_RING(chan, tesla, NV50TCL_CODE_CB_FLUSH, 1); +   OUT_RING  (chan, 0); +} + +static void +nv50_vp_update_stateobj(struct nv50_context *nv50, struct nv50_program *p) +{ +   struct nouveau_grobj *tesla = nv50->screen->tesla; +   struct nouveau_stateobj *so = so_new(5, 7, 2); + +   nv50_program_validate_code(nv50, p); + +   so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2); +   so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | +             NOUVEAU_BO_HIGH, 0, 0); +   so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | +             NOUVEAU_BO_LOW, 0, 0); +   so_method(so, tesla, NV50TCL_VP_ATTR_EN_0, 2); +   so_data  (so, p->vp.attrs[0]); +   so_data  (so, p->vp.attrs[1]); +   so_method(so, tesla, NV50TCL_VP_REG_ALLOC_RESULT, 1); +   so_data  (so, p->max_out); +   so_method(so, tesla, NV50TCL_VP_REG_ALLOC_TEMP, 1); +   so_data  (so, p->max_gpr); +   so_method(so, tesla, NV50TCL_VP_START_ID, 1); +   so_data  (so, p->code_start); + +   so_ref(so, &p->so); +   so_ref(NULL, &so); +} + +static void +nv50_fp_update_stateobj(struct nv50_context *nv50, struct nv50_program *p) +{ +   struct nouveau_grobj *tesla = nv50->screen->tesla; +	struct nouveau_stateobj *so = so_new(6, 7, 2); + +   nv50_program_validate_code(nv50, p); + +   so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2); +   so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | +             NOUVEAU_BO_HIGH, 0, 0); +   so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | +             NOUVEAU_BO_LOW, 0, 0); +   so_method(so, tesla, NV50TCL_FP_REG_ALLOC_TEMP, 1); +   so_data  
(so, p->max_gpr); +   so_method(so, tesla, NV50TCL_FP_RESULT_COUNT, 1); +   so_data  (so, p->max_out); +   so_method(so, tesla, NV50TCL_FP_CONTROL, 1); +   so_data  (so, p->fp.flags[0]); +   so_method(so, tesla, NV50TCL_FP_CTRL_UNK196C, 1); +   so_data  (so, p->fp.flags[1]); +   so_method(so, tesla, NV50TCL_FP_START_ID, 1); +   so_data  (so, p->code_start); + +   so_ref(so, &p->so); +   so_ref(NULL, &so); +} + +static void +nv50_gp_update_stateobj(struct nv50_context *nv50, struct nv50_program *p) +{ +   struct nouveau_grobj *tesla = nv50->screen->tesla; +	struct nouveau_stateobj *so = so_new(6, 7, 2); + +   nv50_program_validate_code(nv50, p); + +   so_method(so, tesla, NV50TCL_GP_ADDRESS_HIGH, 2); +   so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | +             NOUVEAU_BO_HIGH, 0, 0); +   so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | +             NOUVEAU_BO_LOW, 0, 0); +   so_method(so, tesla, NV50TCL_GP_REG_ALLOC_TEMP, 1); +   so_data  (so, p->max_gpr); +   so_method(so, tesla, NV50TCL_GP_REG_ALLOC_RESULT, 1); +   so_data  (so, p->max_out); +   so_method(so, tesla, NV50TCL_GP_OUTPUT_PRIMITIVE_TYPE, 1); +   so_data  (so, p->gp.prim_type); +   so_method(so, tesla, NV50TCL_GP_VERTEX_OUTPUT_COUNT, 1); +   so_data  (so, p->gp.vert_count); +   so_method(so, tesla, NV50TCL_GP_START_ID, 1); +   so_data  (so, p->code_start); + +   so_ref(so, &p->so); +   so_ref(NULL, &so); +} + +static boolean +nv50_program_validate(struct nv50_program *p) +{ +   p->translated = nv50_program_tx(p); +   assert(p->translated); +   return p->translated; +} + +struct nouveau_stateobj * +nv50_vertprog_validate(struct nv50_context *nv50) +{ +   struct nv50_program *p = nv50->vertprog; +   struct nouveau_stateobj *so = NULL; + +   if (!p->translated) { +      if (nv50_program_validate(p)) +         nv50_vp_update_stateobj(nv50, p); +      else +         return NULL; +   } + +   if (nv50->dirty & NV50_NEW_VERTPROG_CB) +      nv50_program_validate_data(nv50, p); + +   if 
(!(nv50->dirty & NV50_NEW_VERTPROG)) +      return NULL; + +   nv50_program_validate_code(nv50, p); + +   so_ref(p->so, &so); +   return so; +} + +struct nouveau_stateobj * +nv50_fragprog_validate(struct nv50_context *nv50) +{ +   struct nv50_program *p = nv50->fragprog; +   struct nouveau_stateobj *so = NULL; + +   if (!p->translated) { +      if (nv50_program_validate(p)) +         nv50_fp_update_stateobj(nv50, p); +      else +         return NULL; +   } + +   if (nv50->dirty & NV50_NEW_FRAGPROG_CB) +      nv50_program_validate_data(nv50, p); + +   if (!(nv50->dirty & NV50_NEW_FRAGPROG)) +      return NULL; + +   nv50_program_validate_code(nv50, p); + +   so_ref(p->so, &so); +   return so; +} + +struct nouveau_stateobj * +nv50_geomprog_validate(struct nv50_context *nv50) +{ +   struct nv50_program *p = nv50->geomprog; +   struct nouveau_stateobj *so = NULL; + +   if (!p->translated) { +      if (nv50_program_validate(p)) +         nv50_gp_update_stateobj(nv50, p); +      else +         return NULL; +   } + +   if (nv50->dirty & NV50_NEW_GEOMPROG_CB) +      nv50_program_validate_data(nv50, p); + +   if (!(nv50->dirty & NV50_NEW_GEOMPROG)) +      return NULL; + +   nv50_program_validate_code(nv50, p); + +   so_ref(p->so, &so); +   return so; +} + +/* XXX: this might not work correctly in all cases yet: we assume that + * an FP generic input that is not written in the VP is gl_PointCoord. 
+ */ +static uint32_t +nv50_pntc_replace(struct nv50_context *nv50, uint32_t pntc[8], unsigned m) +{ +   struct nv50_program *vp = nv50->vertprog; +   struct nv50_program *fp = nv50->fragprog; +   unsigned i, c; + +   memset(pntc, 0, 8 * sizeof(uint32_t)); + +   if (nv50->geomprog) +      vp = nv50->geomprog; + +   for (i = 0; i < fp->in_nr; i++) { +      unsigned j, n = util_bitcount(fp->in[i].mask); + +      if (fp->in[i].sn != TGSI_SEMANTIC_GENERIC) { +         m += n; +         continue; +      } + +      for (j = 0; j < vp->out_nr; ++j) +         if (vp->out[j].sn == fp->in[i].sn && vp->out[j].si == fp->in[i].si) +            break; + +      if (j < vp->out_nr) { +         ubyte en = nv50->rasterizer->pipe.sprite_coord_enable; + +         if (!(en & (1 << vp->out[j].si))) { +            m += n; +            continue; +         } +      } + +      /* this is either PointCoord or replaced by sprite coords */ +      for (c = 0; c < 4; c++) { +         if (!(fp->in[i].mask & (1 << c))) +            continue; +         pntc[m / 8] |= (c + 1) << ((m % 8) * 4); +         ++m; +      } +   } +   if (nv50->rasterizer->pipe.sprite_coord_mode == PIPE_SPRITE_COORD_LOWER_LEFT) +      return 0; +   return (1 << 4); +} + +static int +nv50_vec4_map(uint32_t *map32, int mid, uint32_t lin[4], +              struct nv50_varying *in, struct nv50_varying *out) +{ +   int c; +   uint8_t mv = out->mask, mf = in->mask, oid = out->hw; +   uint8_t *map = (uint8_t *)map32; + +   for (c = 0; c < 4; ++c) { +      if (mf & 1) { +         if (in->linear) +            lin[mid / 32] |= 1 << (mid % 32); +         if (mv & 1) +            map[mid] = oid; +         else +         if (c == 3) +            map[mid] |= 1; +         ++mid; +      } + +      oid += mv & 1; +      mf >>= 1; +      mv >>= 1; +   } + +   return mid; +} + +struct nouveau_stateobj * +nv50_fp_linkage_validate(struct nv50_context *nv50) +{ +   struct nouveau_grobj *tesla = nv50->screen->tesla; +   struct nv50_program *vp; + 
  struct nv50_program *fp = nv50->fragprog; +   struct nouveau_stateobj *so; +   struct nv50_varying dummy; +   int i, n, c, m; + +   uint32_t map[16], lin[4], pntc[8]; + +   uint32_t interp = fp->fp.interp; +   uint32_t colors = fp->fp.colors; +   uint32_t clip = 0x04; +   uint32_t psiz = 0x000; +   uint32_t primid = 0; +   uint32_t sysval = 0; + +   if (nv50->geomprog) { +      vp = nv50->geomprog; +      memset(map, 0x80, sizeof(map)); +   } else { +      vp = nv50->vertprog; +      memset(map, 0x40, sizeof(map)); +   } +   memset(lin, 0, sizeof(lin)); + +   dummy.linear = 0; +   dummy.mask = 0xf; /* map all components of HPOS */ +   m = nv50_vec4_map(map, 0, lin, &dummy, &vp->out[0]); + +   if (vp->vp.clpd < 0x40) { +      for (c = 0; c < vp->vp.clpd_nr; ++c) { +         map[m / 4] |= (vp->vp.clpd + c) << ((m % 4) * 8); +         ++m; +      } +      clip |= vp->vp.clpd_nr << 8; +   } + +   colors |= m << 8; /* adjust BFC0 id */ + +   /* if light_twoside is active, it seems FFC0_ID == BFC0_ID is bad */ +   if (nv50->rasterizer->pipe.light_twoside) { +      for (i = 0; i < 2; ++i) +         m = nv50_vec4_map(map, m, lin, +                           &fp->in[fp->vp.bfc[i]], +                           &vp->out[vp->vp.bfc[i]]); +   } + +   colors += m - 4; /* adjust FFC0 id */ +   interp |= m << 8; /* set mid where 'normal' FP inputs start */ + +   dummy.mask = 0x0; +   for (i = 0; i < fp->in_nr; i++) { +      for (n = 0; n < vp->out_nr; ++n) +         if (vp->out[n].sn == fp->in[i].sn && +             vp->out[n].si == fp->in[i].si) +            break; + +      m = nv50_vec4_map(map, m, lin, +                        &fp->in[i], (n < vp->out_nr) ? 
&vp->out[n] : &dummy); +	} +   /* PrimitiveID either is replaced by the system value, or +    * written by the geometry shader into an output register +    */ +   if (fp->gp.primid < 0x40) { +      map[m / 4] |= vp->gp.primid << ((m % 4) * 8); +      primid = m++; +   } + +   if (nv50->rasterizer->pipe.point_size_per_vertex) { +      map[m / 4] |= vp->vp.psiz << ((m % 4) * 8); +      psiz = (m++ << 4) | 1; +   } + +   /* now fill the stateobj (at most 28 so_data)  */ +   so = so_new(10, 54, 0); + +   n = (m + 3) / 4; +   assert(m <= 64); +   if (vp->type == PIPE_SHADER_GEOMETRY) { +      so_method(so, tesla, NV50TCL_GP_RESULT_MAP_SIZE, 1); +      so_data  (so, m); +      so_method(so, tesla, NV50TCL_GP_RESULT_MAP(0), n); +      so_datap (so, map, n); +   } else { +      so_method(so, tesla, NV50TCL_VP_GP_BUILTIN_ATTR_EN, 1); +      so_data  (so, vp->vp.attrs[2]); + +      so_method(so, tesla, NV50TCL_MAP_SEMANTIC_4, 1); +      so_data  (so, primid); + +      so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 1); +      so_data  (so, m); +      so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), n); +      so_datap (so, map, n); +   } + +   //colors = 0x01000404; +   so_method(so, tesla, NV50TCL_MAP_SEMANTIC_0, 4); +   so_data  (so, colors); +   so_data  (so, clip); +   so_data  (so, sysval); +   so_data  (so, psiz); + +   so_method(so, tesla, NV50TCL_FP_INTERPOLANT_CTRL, 1); +   so_data  (so, interp); + +   so_method(so, tesla, NV50TCL_NOPERSPECTIVE_BITMAP(0), 4); +   so_datap (so, lin, 4); + +   if (nv50->rasterizer->pipe.sprite_coord_enable) { +      so_method(so, tesla, NV50TCL_POINT_SPRITE_CTRL, 1); +      so_data  (so, +                nv50_pntc_replace(nv50, pntc, (interp >> 8) & 0xff)); + +      so_method(so, tesla, NV50TCL_POINT_COORD_REPLACE_MAP(0), 8); +      so_datap (so, pntc, 8); +   } + +   so_method(so, tesla, NV50TCL_GP_ENABLE, 1); +   so_data  (so, (vp->type == PIPE_SHADER_GEOMETRY) ? 
1 : 0); + +   return so; +} + +static int +nv50_vp_gp_mapping(uint32_t *map32, int m, +                   struct nv50_program *vp, struct nv50_program *gp) +{ +   uint8_t *map = (uint8_t *)map32; +   int i, j, c; + +   for (i = 0; i < gp->in_nr; ++i) { +      uint8_t oid = 0, mv = 0, mg = gp->in[i].mask; + +      for (j = 0; j < vp->out_nr; ++j) { +         if (vp->out[j].sn == gp->in[i].sn && +             vp->out[j].si == gp->in[i].si) { +            mv = vp->out[j].mask; +            oid = vp->out[j].hw; +            break; +         } +      } + +      for (c = 0; c < 4; ++c, mv >>= 1, mg >>= 1) { +         if (mg & mv & 1) +            map[m++] = oid; +         else +         if (mg & 1) +            map[m++] = (c == 3) ? 0x41 : 0x40; +         oid += mv & 1; +      } +   } +   return m; +} + +struct nouveau_stateobj * +nv50_gp_linkage_validate(struct nv50_context *nv50) +{ +   struct nouveau_grobj *tesla = nv50->screen->tesla; +   struct nouveau_stateobj *so; +   struct nv50_program *vp = nv50->vertprog; +   struct nv50_program *gp = nv50->geomprog; +   uint32_t map[16]; +   int m = 0; + +   if (!gp) +      return NULL; +   memset(map, 0, sizeof(map)); + +   m = nv50_vp_gp_mapping(map, m, vp, gp); + +   so = so_new(3, 24 - 3, 0); + +   so_method(so, tesla, NV50TCL_VP_GP_BUILTIN_ATTR_EN, 1); +   so_data  (so, vp->vp.attrs[2] | gp->vp.attrs[2]); + +   assert(m <= 32); +   so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 1); +   so_data  (so, m); + +   m = (m + 3) / 4; +   so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), m); +   so_datap (so, map, m); + +   return so; +} diff --git a/src/gallium/drivers/nv50/nv50_state.c b/src/gallium/drivers/nv50/nv50_state.c index 42c5a58318..0d744ab788 100644 --- a/src/gallium/drivers/nv50/nv50_state.c +++ b/src/gallium/drivers/nv50/nv50_state.c @@ -546,7 +546,6 @@ nv50_vp_state_create(struct pipe_context *pipe,  	p->pipe.tokens = tgsi_dup_tokens(cso->tokens);  	p->type = PIPE_SHADER_VERTEX; -	tgsi_scan_shader(p->pipe.tokens, 
&p->info);  	return (void *)p;  } @@ -578,7 +577,6 @@ nv50_fp_state_create(struct pipe_context *pipe,  	p->pipe.tokens = tgsi_dup_tokens(cso->tokens);  	p->type = PIPE_SHADER_FRAGMENT; -	tgsi_scan_shader(p->pipe.tokens, &p->info);  	return (void *)p;  } @@ -610,7 +608,6 @@ nv50_gp_state_create(struct pipe_context *pipe,  	p->pipe.tokens = tgsi_dup_tokens(cso->tokens);  	p->type = PIPE_SHADER_GEOMETRY; -	tgsi_scan_shader(p->pipe.tokens, &p->info);  	return (void *)p;  } diff --git a/src/gallium/drivers/nv50/nv50_state_validate.c b/src/gallium/drivers/nv50/nv50_state_validate.c index 524696f35d..8d662d8f60 100644 --- a/src/gallium/drivers/nv50/nv50_state_validate.c +++ b/src/gallium/drivers/nv50/nv50_state_validate.c @@ -81,6 +81,9 @@ validate_fb(struct nv50_context *nv50)  		case PIPE_FORMAT_R16G16B16A16_UNORM:  			so_data(so, NV50TCL_RT_FORMAT_R16G16B16A16_UNORM);  			break; +		case PIPE_FORMAT_R16G16B16A16_FLOAT: +			so_data(so, NV50TCL_RT_FORMAT_R16G16B16A16_FLOAT); +			break;  		case PIPE_FORMAT_R32G32B32A32_FLOAT:  			so_data(so, NV50TCL_RT_FORMAT_R32G32B32A32_FLOAT);  			break; @@ -135,6 +138,12 @@ validate_fb(struct nv50_context *nv50)  		case PIPE_FORMAT_Z32_FLOAT:  			so_data(so, NV50TCL_ZETA_FORMAT_Z32_FLOAT);  			break; +		case PIPE_FORMAT_Z32_FLOAT_S8X24_USCALED: +			so_data(so, NV50TCL_ZETA_FORMAT_Z32_FLOAT_X24S8_UNORM); +			break; +		case PIPE_FORMAT_Z16_UNORM: +			so_data(so, NV50TCL_ZETA_FORMAT_Z16_UNORM); +			break;  		default:  			NOUVEAU_ERR("AIIII unknown format %s\n",  			            util_format_name(fb->zsbuf->format)); diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c new file mode 100644 index 0000000000..aa15917774 --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c @@ -0,0 +1,1266 @@ + +#include <unistd.h> + +#include "nv50_context.h" +#include "nv50_pc.h" + +#include "pipe/p_shader_tokens.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_util.h" + +#include 
"util/u_simple_list.h" +#include "tgsi/tgsi_dump.h" + +#define BLD_MAX_TEMPS 64 +#define BLD_MAX_ADDRS 4 +#define BLD_MAX_PREDS 4 +#define BLD_MAX_IMMDS 128 + +#define BLD_MAX_COND_NESTING 4 +#define BLD_MAX_LOOP_NESTING 4 +#define BLD_MAX_CALL_NESTING 2 + +/* collects all values assigned to the same TGSI register */ +struct bld_value_stack { +   struct nv_value *top; +   struct nv_value **body; +   unsigned size; +}; + +static INLINE void +bld_push_value(struct bld_value_stack *stk) +{ +   assert(!stk->size || (stk->body[stk->size - 1] != stk->top)); + +   if (!(stk->size % 8)) { +      unsigned old_sz = (stk->size + 0) * sizeof(struct nv_value *); +      unsigned new_sz = (stk->size + 8) * sizeof(struct nv_value *); +      stk->body = (struct nv_value **)REALLOC(stk->body, old_sz, new_sz); +   } +   stk->body[stk->size++] = stk->top; +   stk->top = NULL; +} + +static INLINE void +bld_push_values(struct bld_value_stack *stacks, int n) +{ +   int i, c; + +   for (i = 0; i < n; ++i) +      for (c = 0; c < 4; ++c) +         if (stacks[i * 4 + c].top) +            bld_push_value(&stacks[i * 4 + c]); +} + +#define FETCH_TEMP(i, c)    (bld->tvs[i][c].top) +#define STORE_TEMP(i, c, v) (bld->tvs[i][c].top = (v)) +#define FETCH_ADDR(i, c)    (bld->avs[i][c].top) +#define STORE_ADDR(i, c, v) (bld->avs[i][c].top = (v)) +#define FETCH_PRED(i, c)    (bld->pvs[i][c].top) +#define STORE_PRED(i, c, v) (bld->pvs[i][c].top = (v)) +#define FETCH_OUTR(i, c)    (bld->ovs[i][c].top) +#define STORE_OUTR(i, c, v)                                         \ +   do {                                                             \ +      bld->ovs[i][c].top = (v);                                     \ +      bld->outputs_written[(i) / 8] |= 1 << (((i) * 4 + (c)) % 32); \ +   } while (0) + +struct bld_context { +   struct nv50_translation_info *ti; + +   struct nv_pc *pc; +   struct nv_basic_block *b; + +   struct tgsi_parse_context parse[BLD_MAX_CALL_NESTING]; +   int call_lvl; + +   struct 
nv_basic_block *cond_bb[BLD_MAX_COND_NESTING]; +   struct nv_basic_block *join_bb[BLD_MAX_COND_NESTING]; +   struct nv_basic_block *else_bb[BLD_MAX_COND_NESTING]; +   int cond_lvl; +   struct nv_basic_block *loop_bb[BLD_MAX_LOOP_NESTING]; +   int loop_lvl; + +   struct bld_value_stack tvs[BLD_MAX_TEMPS][4]; /* TGSI_FILE_TEMPORARY */ +   struct bld_value_stack avs[BLD_MAX_ADDRS][4]; /* TGSI_FILE_ADDRESS */ +   struct bld_value_stack pvs[BLD_MAX_PREDS][4]; /* TGSI_FILE_PREDICATE */ +   struct bld_value_stack ovs[PIPE_MAX_SHADER_OUTPUTS][4]; + +   uint32_t outputs_written[PIPE_MAX_SHADER_OUTPUTS / 32]; + +   struct nv_value *frgcrd[4]; +   struct nv_value *sysval[4]; + +   /* wipe on new BB */ +   struct nv_value *saved_addr[4][2]; +   struct nv_value *saved_inputs[128]; +   struct nv_value *saved_immd[BLD_MAX_IMMDS]; +   uint num_immds; +}; + +static INLINE struct nv_value * +bld_def(struct nv_instruction *i, int c, struct nv_value *value) +{ +   i->def[c] = value; +   value->insn = i; +   return value; +} + +static INLINE struct nv_value * +find_by_bb(struct bld_value_stack *stack, struct nv_basic_block *b) +{ +   int i; + +   if (stack->top && stack->top->insn->bb == b) +      return stack->top; + +   for (i = stack->size - 1; i >= 0; --i) +      if (stack->body[i]->insn->bb == b) +         return stack->body[i]; +   return NULL; +} + +/* fetch value from stack that was defined in the specified basic block, + * or search for first definitions in all of its predecessors + */ +static void +fetch_by_bb(struct bld_value_stack *stack, +            struct nv_value **vals, int *n, +            struct nv_basic_block *b) +{ +   int i; +   struct nv_value *val; + +   assert(*n < 16); /* MAX_COND_NESTING */ + +   val = find_by_bb(stack, b); +   if (val) { +      for (i = 0; i < *n; ++i) +         if (vals[i] == val) +            return; +      vals[(*n)++] = val; +      return; +   } +   for (i = 0; i < b->num_in; ++i) +      fetch_by_bb(stack, vals, n, b->in[i]); +} + 
+static struct nv_value * +bld_fetch_global(struct bld_context *bld, struct bld_value_stack *stack) +{ +   struct nv_value *vals[16], *phi = NULL; +   int j, i = 0, n = 0; + +   fetch_by_bb(stack, vals, &n, bld->pc->current_block); + +   assert(n); +   if (n == 1) +      return vals[0]; + +   debug_printf("phi required: %i candidates\n", n); + +   while (i < n) { +      struct nv_instruction *insn = new_instruction(bld->pc, NV_OP_PHI); + +      j = phi ? 1 : 0; +      if (phi) +         insn->src[0] = new_ref(bld->pc, phi); + +      phi = new_value(bld->pc, vals[0]->reg.file, vals[0]->reg.type); + +      bld_def(insn, 0, phi); + +      for (; j < 4; ++j) { +         insn->src[j] = new_ref(bld->pc, vals[i++]); +         if (i == n) +            break; +      } +      debug_printf("new phi: %i, %i in\n", phi->n, j); +   } + +   /* insert_at_head(list, phi) is done at end of block */ +   return phi; +} + +static INLINE struct nv_value * +bld_imm_u32(struct bld_context *bld, uint32_t u) +{ +   int i; +   unsigned n = bld->num_immds; + +   debug_printf("bld_imm_u32: 0x%08x\n", u); + +   for (i = 0; i < n; ++i) +      if (bld->saved_immd[i]->reg.imm.u32 == u) +         return bld->saved_immd[i]; +   assert(n < BLD_MAX_IMMDS); + +   debug_printf("need new one\n"); + +   bld->num_immds++; + +   bld->saved_immd[n] = new_value(bld->pc, NV_FILE_IMM, NV_TYPE_U32); +   bld->saved_immd[n]->reg.imm.u32 = u; +   return bld->saved_immd[n]; +} + +static INLINE struct nv_value * +bld_imm_f32(struct bld_context *bld, float f) +{ +   return bld_imm_u32(bld, fui(f)); +} + +#define SET_TYPE(v, t) ((v)->reg.type = NV_TYPE_##t) + +static struct nv_value * +bld_insn_1(struct bld_context *bld, uint opcode, struct nv_value *src0) +{ +   struct nv_instruction *insn = new_instruction(bld->pc, opcode); +   assert(insn); + +   nv_reference(bld->pc, &insn->src[0], src0); /* NOTE: new_ref would suffice */ +    +   return bld_def(insn, 0, new_value(bld->pc, NV_FILE_GPR, src0->reg.type)); +} + 
+static struct nv_value * +bld_insn_2(struct bld_context *bld, uint opcode, +	      struct nv_value *src0, struct nv_value *src1) +{ +   struct nv_instruction *insn = new_instruction(bld->pc, opcode); + +   nv_reference(bld->pc, &insn->src[0], src0); +   nv_reference(bld->pc, &insn->src[1], src1); + +   return bld_def(insn, 0, new_value(bld->pc, NV_FILE_GPR, src0->reg.type)); +} + +static struct nv_value * +bld_insn_3(struct bld_context *bld, uint opcode, +              struct nv_value *src0, struct nv_value *src1, +              struct nv_value *src2) +{ +   struct nv_instruction *insn = new_instruction(bld->pc, opcode); + +   nv_reference(bld->pc, &insn->src[0], src0); +   nv_reference(bld->pc, &insn->src[1], src1); +   nv_reference(bld->pc, &insn->src[2], src2); + +   return bld_def(insn, 0, new_value(bld->pc, NV_FILE_GPR, src0->reg.type)); +} + +#define BLD_INSN_1_EX(d, op, dt, s0, s0t)           \ +   do {                                             \ +      (d) = bld_insn_1(bld, (NV_OP_##op), (s0));    \ +      (d)->reg.type = NV_TYPE_##dt;                 \ +      (d)->insn->src[0]->typecast = NV_TYPE_##s0t;  \ +   } while(0) + +static struct nv_value * +bld_pow(struct bld_context *bld, struct nv_value *x, struct nv_value *e) +{ +   struct nv_value *val; + +   BLD_INSN_1_EX(val, LG2, F32, x, F32); +   BLD_INSN_1_EX(val, MUL, F32, e, F32); +   val = bld_insn_1(bld, NV_OP_PREEX2, val); +   val = bld_insn_1(bld, NV_OP_EX2, val); + +   return val; +} + +static INLINE struct nv_value * +bld_load_imm_f32(struct bld_context *bld, float f) +{ +   return bld_insn_1(bld, NV_OP_MOV, bld_imm_f32(bld, f)); +} + +static INLINE struct nv_value * +bld_load_imm_u32(struct bld_context *bld, uint32_t u) +{ +   return bld_insn_1(bld, NV_OP_MOV, bld_imm_u32(bld, u)); +} + +static struct nv_value * +bld_get_address(struct bld_context *bld, int id, struct nv_value *indirect) +{ +   int i; +   struct nv_instruction *nvi; + +   for (i = 0; i < 4; ++i) { +      if 
(!bld->saved_addr[i][0]) +         break; +      if (bld->saved_addr[i][1] == indirect) { +         nvi = bld->saved_addr[i][0]->insn; +         if (nvi->src[0]->value->reg.imm.u32 == id) +            return bld->saved_addr[i][0]; +      } +   } +   i &= 3; + +   bld->saved_addr[i][0] = bld_load_imm_u32(bld, id); +   bld->saved_addr[i][0]->reg.file = NV_FILE_ADDR; +   bld->saved_addr[i][1] = indirect; +   return bld->saved_addr[i][0]; +} + + +static struct nv_value * +bld_predicate(struct bld_context *bld, struct nv_value *src) +{ +   struct nv_instruction *nvi = src->insn; + +   if (nvi->opcode == NV_OP_LDA || +       nvi->opcode == NV_OP_PHI || +       nvi->bb != bld->pc->current_block) { +      nvi = new_instruction(bld->pc, NV_OP_CVT); +      nv_reference(bld->pc, &nvi->src[0], src); +   } + +   if (!nvi->flags_def) { +      nvi->flags_def = new_value(bld->pc, NV_FILE_FLAGS, NV_TYPE_U16); +      nvi->flags_def->insn = nvi; +   } +   return nvi->flags_def; +} + +static void +bld_kil(struct bld_context *bld, struct nv_value *src) +{ +   struct nv_instruction *nvi; + +   src = bld_predicate(bld, src); +   nvi = new_instruction(bld->pc, NV_OP_KIL); +   nvi->fixed = 1; +   nvi->flags_src = new_ref(bld->pc, src); +   nvi->cc = NV_CC_LT; +} + +static void +bld_flow(struct bld_context *bld, uint opcode, ubyte cc, +         struct nv_value *src, boolean plan_reconverge) +{ +   struct nv_instruction *nvi; + +   if (plan_reconverge) +      new_instruction(bld->pc, NV_OP_JOINAT)->fixed = 1; + +   nvi = new_instruction(bld->pc, opcode); +   nvi->is_terminator = 1; +   nvi->cc = cc; +   nvi->flags_src = new_ref(bld->pc, src); +} + +static ubyte +translate_setcc(unsigned opcode) +{ +   switch (opcode) { +   case TGSI_OPCODE_SLT: return NV_CC_LT; +   case TGSI_OPCODE_SGE: return NV_CC_GE; +   case TGSI_OPCODE_SEQ: return NV_CC_EQ; +   case TGSI_OPCODE_SGT: return NV_CC_GT; +   case TGSI_OPCODE_SLE: return NV_CC_LE; +   case TGSI_OPCODE_SNE: return NV_CC_NE | NV_CC_U; +   case 
TGSI_OPCODE_STR: return NV_CC_TR; +   case TGSI_OPCODE_SFL: return NV_CC_FL; + +   case TGSI_OPCODE_ISLT: return NV_CC_LT; +   case TGSI_OPCODE_ISGE: return NV_CC_GE; +   case TGSI_OPCODE_USEQ: return NV_CC_EQ; +   case TGSI_OPCODE_USGE: return NV_CC_GE; +   case TGSI_OPCODE_USLT: return NV_CC_LT; +   case TGSI_OPCODE_USNE: return NV_CC_NE; +   default: +      assert(0); +      return NV_CC_FL; +   } +} + +static uint +translate_opcode(uint opcode) +{ +   switch (opcode) { +   case TGSI_OPCODE_ABS: return NV_OP_ABS; +   case TGSI_OPCODE_ADD: +   case TGSI_OPCODE_SUB: +   case TGSI_OPCODE_UADD: return NV_OP_ADD; +   case TGSI_OPCODE_AND: return NV_OP_AND; +   case TGSI_OPCODE_EX2: return NV_OP_EX2; +   case TGSI_OPCODE_CEIL: return NV_OP_CEIL; +   case TGSI_OPCODE_FLR: return NV_OP_FLOOR; +   case TGSI_OPCODE_TRUNC: return NV_OP_TRUNC; +   case TGSI_OPCODE_DDX: return NV_OP_DFDX; +   case TGSI_OPCODE_DDY: return NV_OP_DFDY; +   case TGSI_OPCODE_F2I: +   case TGSI_OPCODE_F2U: +   case TGSI_OPCODE_I2F: +   case TGSI_OPCODE_U2F: return NV_OP_CVT; +   case TGSI_OPCODE_INEG: return NV_OP_NEG; +   case TGSI_OPCODE_LG2: return NV_OP_LG2; +   case TGSI_OPCODE_ISHR: +   case TGSI_OPCODE_USHR: return NV_OP_SHR; +   case TGSI_OPCODE_MAD: +   case TGSI_OPCODE_UMAD: return NV_OP_MAD; +   case TGSI_OPCODE_MAX: +   case TGSI_OPCODE_IMAX: +   case TGSI_OPCODE_UMAX: return NV_OP_MAX; +   case TGSI_OPCODE_MIN: +   case TGSI_OPCODE_IMIN: +   case TGSI_OPCODE_UMIN: return NV_OP_MIN; +   case TGSI_OPCODE_MUL: +   case TGSI_OPCODE_UMUL: return NV_OP_MUL; +   case TGSI_OPCODE_OR: return NV_OP_OR; +   case TGSI_OPCODE_RCP: return NV_OP_RCP; +   case TGSI_OPCODE_RSQ: return NV_OP_RSQ; +   case TGSI_OPCODE_SAD: return NV_OP_SAD; +   case TGSI_OPCODE_SHL: return NV_OP_SHL; +   case TGSI_OPCODE_SLT: +   case TGSI_OPCODE_SGE: +   case TGSI_OPCODE_SEQ: +   case TGSI_OPCODE_SGT: +   case TGSI_OPCODE_SLE: +   case TGSI_OPCODE_SNE: +   case TGSI_OPCODE_ISLT: +   case TGSI_OPCODE_ISGE: +   case 
TGSI_OPCODE_USEQ: +   case TGSI_OPCODE_USGE: +   case TGSI_OPCODE_USLT: +   case TGSI_OPCODE_USNE: return NV_OP_SET; +   case TGSI_OPCODE_TEX: return NV_OP_TEX; +   case TGSI_OPCODE_TXP: return NV_OP_TEX; +   case TGSI_OPCODE_TXB: return NV_OP_TXB; +   case TGSI_OPCODE_TXL: return NV_OP_TXL; +   case TGSI_OPCODE_XOR: return NV_OP_XOR; +   default: +      return NV_OP_NOP; +   } +} + +static ubyte +infer_src_type(unsigned opcode) +{ +   switch (opcode) { +   case TGSI_OPCODE_MOV: +   case TGSI_OPCODE_AND: +   case TGSI_OPCODE_OR: +   case TGSI_OPCODE_XOR: +   case TGSI_OPCODE_SAD: +   case TGSI_OPCODE_U2F: +   case TGSI_OPCODE_UADD: +   case TGSI_OPCODE_UDIV: +   case TGSI_OPCODE_UMOD: +   case TGSI_OPCODE_UMAD: +   case TGSI_OPCODE_UMUL: +   case TGSI_OPCODE_UMAX: +   case TGSI_OPCODE_UMIN: +   case TGSI_OPCODE_USEQ: +   case TGSI_OPCODE_USGE: +   case TGSI_OPCODE_USLT: +   case TGSI_OPCODE_USNE: +   case TGSI_OPCODE_USHR: +      return NV_TYPE_U32; +   case TGSI_OPCODE_I2F: +   case TGSI_OPCODE_IDIV: +   case TGSI_OPCODE_IMAX: +   case TGSI_OPCODE_IMIN: +   case TGSI_OPCODE_INEG: +   case TGSI_OPCODE_ISGE: +   case TGSI_OPCODE_ISHR: +   case TGSI_OPCODE_ISLT: +      return NV_TYPE_S32; +   default: +      return NV_TYPE_F32; +   } +} + +static ubyte +infer_dst_type(unsigned opcode) +{ +   switch (opcode) { +   case TGSI_OPCODE_MOV: +   case TGSI_OPCODE_F2U: +   case TGSI_OPCODE_AND: +   case TGSI_OPCODE_OR: +   case TGSI_OPCODE_XOR: +   case TGSI_OPCODE_SAD: +   case TGSI_OPCODE_UADD: +   case TGSI_OPCODE_UDIV: +   case TGSI_OPCODE_UMOD: +   case TGSI_OPCODE_UMAD: +   case TGSI_OPCODE_UMUL: +   case TGSI_OPCODE_UMAX: +   case TGSI_OPCODE_UMIN: +   case TGSI_OPCODE_USEQ: +   case TGSI_OPCODE_USGE: +   case TGSI_OPCODE_USLT: +   case TGSI_OPCODE_USNE: +   case TGSI_OPCODE_USHR: +      return NV_TYPE_U32; +   case TGSI_OPCODE_F2I: +   case TGSI_OPCODE_IDIV: +   case TGSI_OPCODE_IMAX: +   case TGSI_OPCODE_IMIN: +   case TGSI_OPCODE_INEG: +   case TGSI_OPCODE_ISGE: +   
case TGSI_OPCODE_ISHR: +   case TGSI_OPCODE_ISLT: +      return NV_TYPE_S32; +   default: +      return NV_TYPE_F32; +   } +} + +static void +emit_store(struct bld_context *bld, const struct tgsi_full_instruction *inst, +	   unsigned chan, struct nv_value *value) +{ +   const struct tgsi_full_dst_register *reg = &inst->Dst[0]; + +   assert(chan < 4); + +   if (inst->Instruction.Opcode != TGSI_OPCODE_MOV) +      value->reg.type = infer_dst_type(inst->Instruction.Opcode); + +   switch (inst->Instruction.Saturate) { +   case TGSI_SAT_NONE: +      break; +   case TGSI_SAT_ZERO_ONE: +      BLD_INSN_1_EX(value, SAT, F32, value, F32); +      break; +   case TGSI_SAT_MINUS_PLUS_ONE: +      value = bld_insn_2(bld, NV_OP_MAX, value, bld_load_imm_f32(bld, -1.0f)); +      value = bld_insn_2(bld, NV_OP_MIN, value, bld_load_imm_f32(bld, 1.0f)); +      value->reg.type = NV_TYPE_F32; +      break; +   } + +   switch (reg->Register.File) { +   case TGSI_FILE_OUTPUT: +      value = bld_insn_1(bld, NV_OP_MOV, value); +      value->reg.file = bld->ti->output_file; + +      if (bld->ti->p->type == PIPE_SHADER_FRAGMENT) { +         STORE_OUTR(reg->Register.Index, chan, value); +      } else { +         value->insn->fixed = 1; +         value->reg.id = bld->ti->output_map[reg->Register.Index][chan]; +      } +      break; +   case TGSI_FILE_TEMPORARY: +      assert(reg->Register.Index < BLD_MAX_TEMPS); +      value->reg.file = NV_FILE_GPR; +      if (value->insn->bb != bld->pc->current_block) +         value = bld_insn_1(bld, NV_OP_MOV, value); +      STORE_TEMP(reg->Register.Index, chan, value); +      break; +   case TGSI_FILE_ADDRESS: +      assert(reg->Register.Index < BLD_MAX_ADDRS); +      value->reg.file = NV_FILE_ADDR; +      STORE_ADDR(reg->Register.Index, chan, value); +      break; +   } +} + +static INLINE uint32_t +bld_is_output_written(struct bld_context *bld, int i, int c) +{ +   if (c < 0) +      return bld->outputs_written[i / 8] & (0xf << ((i * 4) % 32)); +   return 
bld->outputs_written[i / 8] & (1 << ((i * 4 + c) % 32)); +} + +static void +bld_export_outputs(struct bld_context *bld) +{ +   struct nv_value *vals[4]; +   struct nv_instruction *nvi; +   int i, c, n; + +   bld_push_values(&bld->ovs[0][0], PIPE_MAX_SHADER_OUTPUTS); + +   for (i = 0; i < PIPE_MAX_SHADER_OUTPUTS; ++i) { +      if (!bld_is_output_written(bld, i, -1)) +         continue; +      for (n = 0, c = 0; c < 4; ++c) { +         if (!bld_is_output_written(bld, i, c)) +            continue; +         vals[n] = bld_fetch_global(bld, &bld->ovs[i][c]); +         vals[n] = bld_insn_1(bld, NV_OP_MOV, vals[n]); +         vals[n++]->reg.id = bld->ti->output_map[i][c]; +      } +      assert(n); + +      (nvi = new_instruction(bld->pc, NV_OP_EXPORT))->fixed = 1; + +      for (c = 0; c < n; ++c) +         nvi->src[c] = new_ref(bld->pc, vals[c]); +   } +} + +static void +bld_new_block(struct bld_context *bld, struct nv_basic_block *b) +{ +   int i; + +   bld_push_values(&bld->tvs[0][0], BLD_MAX_TEMPS); +   bld_push_values(&bld->avs[0][0], BLD_MAX_ADDRS); +   bld_push_values(&bld->pvs[0][0], BLD_MAX_PREDS); +   bld_push_values(&bld->ovs[0][0], PIPE_MAX_SHADER_OUTPUTS); + +   bld->pc->current_block = b; + +   for (i = 0; i < 4; ++i) +      bld->saved_addr[i][0] = NULL; +} + +static struct nv_value * +bld_saved_input(struct bld_context *bld, unsigned i, unsigned c) +{ +   unsigned idx = bld->ti->input_map[i][c]; + +   if (bld->ti->p->type != PIPE_SHADER_FRAGMENT) +      return NULL; +   if (bld->saved_inputs[idx]) +      return bld->saved_inputs[idx]; +   return NULL; +} + +static struct nv_value * +bld_interpolate(struct bld_context *bld, unsigned mode, struct nv_value *val) +{ +   if (mode & (NV50_INTERP_LINEAR | NV50_INTERP_FLAT)) +      val = bld_insn_1(bld, NV_OP_LINTERP, val); +   else +      val = bld_insn_2(bld, NV_OP_PINTERP, val, bld->frgcrd[3]); + +   val->insn->flat = (mode & NV50_INTERP_FLAT) ? 1 : 0; +   val->insn->centroid = (mode & NV50_INTERP_CENTROID) ? 
1 : 0; +   return val; +} + +static struct nv_value * +emit_fetch(struct bld_context *bld, const struct tgsi_full_instruction *insn, +           const unsigned s, const unsigned chan) +{ +   const struct tgsi_full_src_register *src = &insn->Src[s]; +   struct nv_value *res; +   unsigned idx, swz, dim_idx, ind_idx, ind_swz; +   ubyte type = infer_src_type(insn->Instruction.Opcode); + +   idx = src->Register.Index; +   swz = tgsi_util_get_full_src_register_swizzle(src, chan); +   dim_idx = -1; +   ind_idx = -1; +   ind_swz = 0; + +   if (src->Register.Indirect) { +      ind_idx = src->Indirect.Index; +      ind_swz = tgsi_util_get_src_register_swizzle(&src->Indirect, 0); +   } + +   switch (src->Register.File) { +   case TGSI_FILE_CONSTANT: +      dim_idx = src->Dimension.Index ? src->Dimension.Index + 2 : 1; +      assert(dim_idx < 14); +      assert(dim_idx == 1); /* for now */ + +      res = new_value(bld->pc, NV_FILE_MEM_C(dim_idx), type); +      res->reg.type = type; +      res->reg.id = (idx * 4 + swz) & 127; +      res = bld_insn_1(bld, NV_OP_LDA, res); + +      if (src->Register.Indirect) +         res->insn->src[4] = new_ref(bld->pc, FETCH_ADDR(ind_idx, ind_swz)); +      if (idx >= (128 / 4)) +         res->insn->src[4] = +            new_ref(bld->pc, bld_get_address(bld, (idx * 16) & ~0x1ff, NULL)); +      break; +   case TGSI_FILE_IMMEDIATE: +      assert(idx < bld->ti->immd32_nr); +      res = bld_load_imm_u32(bld, bld->ti->immd32[idx * 4 + swz]); +      res->reg.type = type; +      break; +   case TGSI_FILE_INPUT: +      res = bld_saved_input(bld, idx, swz); +      if (res && (insn->Instruction.Opcode != TGSI_OPCODE_TXP)) +         return res; + +      res = new_value(bld->pc, bld->ti->input_file, type); +      res->reg.id = bld->ti->input_map[idx][swz]; + +      if (res->reg.file == NV_FILE_MEM_V) { +         res = bld_interpolate(bld, bld->ti->interp_mode[idx], res); +      } else { +         assert(src->Dimension.Dimension == 0); +         res = 
bld_insn_1(bld, NV_OP_LDA, res); +      } +      assert(res->reg.type == type); + +      bld->saved_inputs[bld->ti->input_map[idx][swz]] = res; +      break; +   case TGSI_FILE_TEMPORARY: +      /* this should be load from l[], with reload elimination later on */ +      res = bld_fetch_global(bld, &bld->tvs[idx][swz]); +      break; +   case TGSI_FILE_ADDRESS: +      res = bld_fetch_global(bld, &bld->avs[idx][swz]); +      break; +   case TGSI_FILE_PREDICATE: +      res = bld_fetch_global(bld, &bld->pvs[idx][swz]); +      break; +   default: +      NOUVEAU_ERR("illegal/unhandled src reg file: %d\n", src->Register.File); +      abort(); +      break;	    +   } + +   switch (tgsi_util_get_full_src_register_sign_mode(src, chan)) { +   case TGSI_UTIL_SIGN_KEEP: +      break; +   case TGSI_UTIL_SIGN_CLEAR: +      res = bld_insn_1(bld, NV_OP_ABS, res); +      break; +   case TGSI_UTIL_SIGN_TOGGLE: +      res = bld_insn_1(bld, NV_OP_NEG, res); +      break; +   case TGSI_UTIL_SIGN_SET: +      res = bld_insn_1(bld, NV_OP_ABS, res); +      res = bld_insn_1(bld, NV_OP_NEG, res); +      break; +   default: +      NOUVEAU_ERR("illegal/unhandled src reg sign mode\n"); +      abort(); +      break; +   } + +   return res; +} + +static void +bld_lit(struct bld_context *bld, struct nv_value *dst0[4], +        const struct tgsi_full_instruction *insn) +{ +   struct nv_value *val0, *zero; +   unsigned mask = insn->Dst[0].Register.WriteMask; + +   if (mask & ((1 << 0) | (1 << 3))) +      dst0[3] = dst0[0] = bld_load_imm_f32(bld, 1.0f); + +   if (mask & (3 << 1)) { +      zero = bld_load_imm_f32(bld, 0.0f); +      val0 = bld_insn_2(bld, NV_OP_MAX, emit_fetch(bld, insn, 0, 0), zero); + +      if (mask & (1 << 1)) +         dst0[1] = val0; +   } + +   if (mask & (1 << 2)) { +      struct nv_value *val1, *val3, *src1, *src3; +      struct nv_value *pos128 = bld_load_imm_f32(bld, 127.999999f); +      struct nv_value *neg128 = bld_load_imm_f32(bld, -127.999999f); + +      src1 = 
emit_fetch(bld, insn, 0, 1); +      src3 = emit_fetch(bld, insn, 0, 3); + +      val0->insn->flags_def = new_value(bld->pc, NV_FILE_FLAGS, NV_TYPE_U16); +      val0->insn->flags_def->insn = val0->insn; + +      val1 = bld_insn_2(bld, NV_OP_MAX, src1, zero); +      val3 = bld_insn_2(bld, NV_OP_MAX, src3, neg128); +      val3 = bld_insn_2(bld, NV_OP_MIN, val3, pos128); +      val3 = bld_pow(bld, val1, val3); + +      dst0[2] = bld_insn_1(bld, NV_OP_MOV, zero); +      dst0[2]->insn->cc = NV_CC_LE; +      dst0[2]->insn->flags_src = new_ref(bld->pc, val0->insn->flags_def); + +      dst0[2] = bld_insn_2(bld, NV_OP_SELECT, val3, dst0[2]); +   } +} + +static INLINE void +get_tex_dim(const struct tgsi_full_instruction *insn, int *dim, int *arg) +{ +   switch (insn->Texture.Texture) { +   case TGSI_TEXTURE_1D: +      *arg = *dim = 1; +      break; +   case TGSI_TEXTURE_SHADOW1D: +      *dim = 1; +      *arg = 2; +      break; +   case TGSI_TEXTURE_UNKNOWN: +   case TGSI_TEXTURE_2D: +   case TGSI_TEXTURE_RECT: +      *arg = *dim = 2; +      break; +   case TGSI_TEXTURE_SHADOW2D: +   case TGSI_TEXTURE_SHADOWRECT: +      *dim = 2; +      *arg = 3; +      break; +   case TGSI_TEXTURE_3D: +   case TGSI_TEXTURE_CUBE: +      *dim = *arg = 3; +      break; +   default: +      assert(0); +      break; +   } +} + +static void +load_proj_tex_coords(struct bld_context *bld, +		     struct nv_value *t[4], int dim, +		     const struct tgsi_full_instruction *insn) +{ +   int c, mask = 0; + +   t[3] = emit_fetch(bld, insn, 0, 3); + +   if (t[3]->insn->opcode == NV_OP_PINTERP) { +      t[3]->insn->opcode = NV_OP_LINTERP; +      nv_reference(bld->pc, &t[3]->insn->src[1], NULL); +   } + +   t[3] = bld_insn_1(bld, NV_OP_RCP, t[3]); + +   for (c = 0; c < dim; ++c) { +      t[c] = emit_fetch(bld, insn, 0, c); +      if (t[c]->insn->opcode == NV_OP_LINTERP) +         t[c]->insn->opcode = NV_OP_PINTERP; + +      if (t[c]->insn->opcode == NV_OP_PINTERP) +         nv_reference(bld->pc, 
&t[c]->insn->src[1], t[3]); +      else +         mask |= 1 << c; +   } + +   for (c = 0; mask; ++c, mask >>= 1) { +      if (!(mask & 1)) +         continue; +      t[c] = bld_insn_2(bld, NV_OP_MUL, t[c], t[3]); +   } +} + +static void +bld_tex(struct bld_context *bld, struct nv_value *dst0[4], +        const struct tgsi_full_instruction *insn) +{ +   struct nv_value *t[4]; +   struct nv_instruction *nvi; +   uint opcode = translate_opcode(insn->Instruction.Opcode); +   int arg, dim, c; + +   get_tex_dim(insn, &dim, &arg); + +   if (insn->Texture.Texture == TGSI_TEXTURE_CUBE) { +   } +   // else +   if (insn->Instruction.Opcode == TGSI_OPCODE_TXP) { +      load_proj_tex_coords(bld, t, dim, insn); +   } else +      for (c = 0; c < dim; ++c) +         t[c] = emit_fetch(bld, insn, 0, c); + +   if (arg != dim) +      t[dim] = emit_fetch(bld, insn, 0, 2); + +   if (insn->Instruction.Opcode == TGSI_OPCODE_TXB || +       insn->Instruction.Opcode == TGSI_OPCODE_TXL) { +      t[arg++] = emit_fetch(bld, insn, 0, 3); +   } + +   for (c = 0; c < arg; ++c) { +      t[c] = bld_insn_1(bld, NV_OP_MOV, t[c]); +      t[c]->reg.type = NV_TYPE_F32; +   } + +   nvi = new_instruction(bld->pc, opcode); + +   for (c = 0; c < 4; ++c) { +      nvi->def[c] = dst0[c] = new_value(bld->pc, NV_FILE_GPR, NV_TYPE_F32); +      nvi->def[c]->insn = nvi; +   } +   for (c = 0; c < arg; ++c) +      nvi->src[c] = new_ref(bld->pc, t[c]); + +   nvi->tex_t = insn->Src[1].Register.Index; +   nvi->tex_s = 0; +   nvi->tex_mask = 0xf; +   nvi->tex_cube = (insn->Texture.Texture == TGSI_TEXTURE_CUBE) ? 
1 : 0; +   nvi->tex_live = 0; +   nvi->tex_argc = arg; +} + +#define FOR_EACH_DST0_ENABLED_CHANNEL(chan, inst) \ +   for (chan = 0; chan < 4; ++chan)               \ +      if ((inst)->Dst[0].Register.WriteMask & (1 << chan)) + +static void +bld_instruction(struct bld_context *bld, +                const struct tgsi_full_instruction *insn) +{ +   struct nv_value *src0; +   struct nv_value *src1; +   struct nv_value *src2; +   struct nv_value *dst0[4]; +   struct nv_value *temp; +   int c; +   uint opcode = translate_opcode(insn->Instruction.Opcode); + +   tgsi_dump_instruction(insn, 1); +	 +   switch (insn->Instruction.Opcode) { +   case TGSI_OPCODE_ADD: +   case TGSI_OPCODE_MAX: +   case TGSI_OPCODE_MIN: +   case TGSI_OPCODE_MUL: +      FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) { +         src0 = emit_fetch(bld, insn, 0, c); +         src1 = emit_fetch(bld, insn, 1, c); +         dst0[c] = bld_insn_2(bld, opcode, src0, src1); +      } +      break; +   case TGSI_OPCODE_CMP: +      FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) { +         src0 = emit_fetch(bld, insn, 0, c); +         src1 = emit_fetch(bld, insn, 1, c); +         src2 = emit_fetch(bld, insn, 2, c); +         src0 = bld_predicate(bld, src0); + +         src1 = bld_insn_1(bld, NV_OP_MOV, src1); +         src1->insn->flags_src = new_ref(bld->pc, src0); +         src1->insn->cc = NV_CC_LT; + +         src2 = bld_insn_1(bld, NV_OP_MOV, src2); +         src2->insn->flags_src = new_ref(bld->pc, src0); +         src2->insn->cc = NV_CC_GE; + +         dst0[c] = bld_insn_2(bld, NV_OP_SELECT, src1, src2); +      } +      break; +   case TGSI_OPCODE_COS: +      src0 = emit_fetch(bld, insn, 0, 0); +      temp = bld_insn_1(bld, NV_OP_PRESIN, src0); +      if (insn->Dst[0].Register.WriteMask & 7) +         temp = bld_insn_1(bld, NV_OP_COS, temp); +      for (c = 0; c < 3; ++c) +         if (insn->Dst[0].Register.WriteMask & (1 << c)) +            dst0[c] = temp; +      if (!(insn->Dst[0].Register.WriteMask & (1 << 3))) +    
     break; +      /* XXX: if src0.x is src0.w, don't emit new insns */ +      src0 = emit_fetch(bld, insn, 0, 3); +      temp = bld_insn_1(bld, NV_OP_PRESIN, src0); +      dst0[3] = bld_insn_1(bld, NV_OP_COS, temp); +      break; +   case TGSI_OPCODE_DP3: +      src0 = emit_fetch(bld, insn, 0, 0); +      src1 = emit_fetch(bld, insn, 1, 0); +      temp = bld_insn_2(bld, NV_OP_MUL, src0, src1); +      for (c = 1; c < 3; ++c) { +         src0 = emit_fetch(bld, insn, 0, c); +         src1 = emit_fetch(bld, insn, 1, c); +         temp = bld_insn_3(bld, NV_OP_MAD, src0, src1, temp); +      } +      FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) +         dst0[c] = temp; +      break; +   case TGSI_OPCODE_DP4: +      src0 = emit_fetch(bld, insn, 0, 0); +      src1 = emit_fetch(bld, insn, 1, 0); +      temp = bld_insn_2(bld, NV_OP_MUL, src0, src1); +      for (c = 1; c < 4; ++c) { +         src0 = emit_fetch(bld, insn, 0, c); +         src1 = emit_fetch(bld, insn, 1, c); +         temp = bld_insn_3(bld, NV_OP_MAD, src0, src1, temp); +      } +      FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) +         dst0[c] = temp; +      break; +   case TGSI_OPCODE_EX2: +      src0 = emit_fetch(bld, insn, 0, 0); +      temp = bld_insn_1(bld, NV_OP_PREEX2, src0); +      temp = bld_insn_1(bld, NV_OP_EX2, temp); +      FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) +         dst0[c] = temp; +      break; +   case TGSI_OPCODE_FRC: +      FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) { +         src0 = emit_fetch(bld, insn, 0, c); +         dst0[c] = bld_insn_1(bld, NV_OP_FLOOR, src0); +         dst0[c] = bld_insn_2(bld, NV_OP_SUB, src0, dst0[c]); +      } +      break; +   case TGSI_OPCODE_KIL: +      for (c = 0; c < 4; ++c) { +         src0 = emit_fetch(bld, insn, 0, c); +         bld_kil(bld, src0); +      } +      break; +   case TGSI_OPCODE_IF: +   { +      struct nv_basic_block *b = new_basic_block(bld->pc); + +      nvbb_attach_block(bld->pc->current_block, b); + +      bld->join_bb[bld->cond_lvl] = 
bld->pc->current_block; +      bld->cond_bb[bld->cond_lvl] = bld->pc->current_block; + +      src1 = bld_predicate(bld, emit_fetch(bld, insn, 0, 0)); + +      bld_flow(bld, NV_OP_BRA, NV_CC_EQ, src1, FALSE); + +      ++bld->cond_lvl; +      bld_new_block(bld, b); +   } +      break; +   case TGSI_OPCODE_ELSE: +   { +      struct nv_basic_block *b = new_basic_block(bld->pc); + +      --bld->cond_lvl; +      nvbb_attach_block(bld->join_bb[bld->cond_lvl], b); + +      bld->cond_bb[bld->cond_lvl]->exit->target = b; +      bld->cond_bb[bld->cond_lvl] = bld->pc->current_block; + +      new_instruction(bld->pc, NV_OP_BRA)->is_terminator = 1; + +      ++bld->cond_lvl; +      bld_new_block(bld, b); +   } +      break; +   case TGSI_OPCODE_ENDIF: /* XXX: deal with ENDIF; ENDIF; */ +   { +      struct nv_basic_block *b = new_basic_block(bld->pc); + +      --bld->cond_lvl; +      nvbb_attach_block(bld->pc->current_block, b); +      nvbb_attach_block(bld->cond_bb[bld->cond_lvl], b); + +      bld->cond_bb[bld->cond_lvl]->exit->target = b; + +      if (0 && bld->join_bb[bld->cond_lvl]) { +         bld->join_bb[bld->cond_lvl]->exit->prev->target = b; + +         new_instruction(bld->pc, NV_OP_NOP)->is_join = TRUE; +      } + +      bld_new_block(bld, b); +   } +      break; +   case TGSI_OPCODE_BGNLOOP: +      assert(0); +      break; +   case TGSI_OPCODE_BRK: +      assert(0); +      break; +   case TGSI_OPCODE_CONT: +      assert(0); +      break; +   case TGSI_OPCODE_ENDLOOP: +      assert(0); +      break; +   case TGSI_OPCODE_ABS: +   case TGSI_OPCODE_CEIL: +   case TGSI_OPCODE_FLR: +   case TGSI_OPCODE_TRUNC: +   case TGSI_OPCODE_DDX: +   case TGSI_OPCODE_DDY: +      FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) { +         src0 = emit_fetch(bld, insn, 0, c); +         dst0[c] = bld_insn_1(bld, opcode, src0); +      }	    +      break; +   case TGSI_OPCODE_LIT: +      bld_lit(bld, dst0, insn); +      break; +   case TGSI_OPCODE_LRP: +      FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) { + 
        src0 = emit_fetch(bld, insn, 0, c); +         src1 = emit_fetch(bld, insn, 1, c); +         src2 = emit_fetch(bld, insn, 2, c); +         dst0[c] = bld_insn_2(bld, NV_OP_SUB, src1, src2); +         dst0[c] = bld_insn_3(bld, NV_OP_MAD, dst0[c], src0, src2); +      } +      break; +   case TGSI_OPCODE_MOV: +      FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) +         dst0[c] = emit_fetch(bld, insn, 0, c); +      break; +   case TGSI_OPCODE_MAD: +      FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) { +         src0 = emit_fetch(bld, insn, 0, c); +         src1 = emit_fetch(bld, insn, 1, c); +         src2 = emit_fetch(bld, insn, 2, c); +         dst0[c] = bld_insn_3(bld, opcode, src0, src1, src2); +      } +      break; +   case TGSI_OPCODE_POW: +      src0 = emit_fetch(bld, insn, 0, 0); +      src1 = emit_fetch(bld, insn, 1, 0); +      temp = bld_pow(bld, src0, src1); +      FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) +         dst0[c] = temp; +      break; +   case TGSI_OPCODE_RCP: +   case TGSI_OPCODE_LG2: +      src0 = emit_fetch(bld, insn, 0, 0); +      temp = bld_insn_1(bld, opcode, src0); +      FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) +         dst0[c] = temp; +      break; +   case TGSI_OPCODE_RSQ: +      src0 = emit_fetch(bld, insn, 0, 0); +      temp = bld_insn_1(bld, NV_OP_ABS, src0); +      temp = bld_insn_1(bld, NV_OP_RSQ, temp); +      FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) +         dst0[c] = temp; +      break; +   case TGSI_OPCODE_SLT: +   case TGSI_OPCODE_SGE: +   case TGSI_OPCODE_SEQ: +   case TGSI_OPCODE_SGT: +   case TGSI_OPCODE_SLE: +   case TGSI_OPCODE_SNE: +   case TGSI_OPCODE_ISLT: +   case TGSI_OPCODE_ISGE: +   case TGSI_OPCODE_USEQ: +   case TGSI_OPCODE_USGE: +   case TGSI_OPCODE_USLT: +   case TGSI_OPCODE_USNE: +      FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) { +         src0 = emit_fetch(bld, insn, 0, c); +         src1 = emit_fetch(bld, insn, 1, c); +         dst0[c] = bld_insn_2(bld, NV_OP_SET, src0, src1); +         dst0[c]->insn->set_cond = 
translate_setcc(insn->Instruction.Opcode); +         dst0[c]->reg.type = infer_dst_type(insn->Instruction.Opcode); + +         if (dst0[c]->reg.type != NV_TYPE_F32) +            break; +         dst0[c] = bld_insn_1(bld, NV_OP_ABS, dst0[c]); +         dst0[c]->insn->src[0]->typecast = NV_TYPE_S32; +         dst0[c]->reg.type = NV_TYPE_S32; +         dst0[c] = bld_insn_1(bld, NV_OP_CVT, dst0[c]); +         dst0[c]->reg.type = NV_TYPE_F32; +      } +      break; +   case TGSI_OPCODE_SUB: +      FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) { +         src0 = emit_fetch(bld, insn, 0, c); +         src1 = emit_fetch(bld, insn, 1, c); +         dst0[c] = bld_insn_2(bld, NV_OP_ADD, src0, src1); +         dst0[c]->insn->src[1]->mod ^= NV_MOD_NEG; +      } +      break; +   case TGSI_OPCODE_TEX: +   case TGSI_OPCODE_TXB: +   case TGSI_OPCODE_TXL: +   case TGSI_OPCODE_TXP: +      bld_tex(bld, dst0, insn); +      break; +   case TGSI_OPCODE_XPD: +      FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) { +         if (c == 3) { +            dst0[3] = bld_imm_f32(bld, 1.0f); +            break; +         } +         src0 = emit_fetch(bld, insn, 0, (c + 1) % 3); +         src1 = emit_fetch(bld, insn, 1, (c + 2) % 3); +         dst0[c] = bld_insn_2(bld, NV_OP_MUL, src0, src1); + +         src0 = emit_fetch(bld, insn, 0, (c + 2) % 3); +         src1 = emit_fetch(bld, insn, 1, (c + 1) % 3); +         dst0[c] = bld_insn_3(bld, NV_OP_MAD, src0, src1, dst0[c]); + +         dst0[c]->insn->src[2]->mod ^= NV_MOD_NEG; +      } +      break; +   case TGSI_OPCODE_END: +      if (bld->ti->p->type == PIPE_SHADER_FRAGMENT) +         bld_export_outputs(bld); +      break; +   default: +      NOUVEAU_ERR("nv_bld: unhandled opcode %u\n", insn->Instruction.Opcode); +      abort(); +      break; +   } + +   FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) +      emit_store(bld, insn, c, dst0[c]); +} + +int +nv50_tgsi_to_nc(struct nv_pc *pc, struct nv50_translation_info *ti) +{ +   struct bld_context *bld = 
CALLOC_STRUCT(bld_context); +   int c; + +   pc->root = pc->current_block = new_basic_block(pc); + +   bld->pc = pc; +   bld->ti = ti; + +   pc->loop_nesting_bound = 1; /* XXX: should work with 0 */ + +   c = util_bitcount(bld->ti->p->fp.interp >> 24); +   if (c && ti->p->type == PIPE_SHADER_FRAGMENT) { +      bld->frgcrd[3] = new_value(pc, NV_FILE_MEM_V, NV_TYPE_F32); +      bld->frgcrd[3]->reg.id = c - 1; +      bld->frgcrd[3] = bld_insn_1(bld, NV_OP_LINTERP, bld->frgcrd[3]); +      bld->frgcrd[3] = bld_insn_1(bld, NV_OP_RCP, bld->frgcrd[3]); +   } + +   tgsi_parse_init(&bld->parse[0], ti->p->pipe.tokens); + +   while (!tgsi_parse_end_of_tokens(&bld->parse[bld->call_lvl])) { +      const union tgsi_full_token *tok = &bld->parse[bld->call_lvl].FullToken; + +      tgsi_parse_token(&bld->parse[bld->call_lvl]); + +      switch (tok->Token.Type) { +      case TGSI_TOKEN_TYPE_INSTRUCTION: +         bld_instruction(bld, &tok->FullInstruction); +         break; +      default: +         break; +      } +   } + +   FREE(bld); +   return 0; +} + +#if 0 +/* If a variable is assigned in a loop, replace all references to the value + * from outside the loop with a phi value. 
+ */ +static void +bld_adjust_nv_refs(struct nv_pc *pc, struct nv_basic_block *b, +                   struct nv_value *old_val, +                   struct nv_value *new_val) +{ +   struct nv_instruction *nvi; + +   for (nvi = b->entry; nvi; nvi = nvi->next) { +      int s; +      for (s = 0; s < 5; ++s) { +         if (!nvi->src[s]) +            continue; +         if (nvi->src[s]->value == old_val) +            nv_reference(pc, &nvi->src[s], new_val); +      } +      if (nvi->flags_src && nvi->flags_src->value == old_val) +         nv_reference(pc, &nvi->flags_src, new_val); +   } +   b->pass_seq = pc->pass_seq; + +   if (b->out[0] && b->out[0]->pass_seq < pc->pass_seq) +      bld_adjust_nv_refs(pc, b, old_val, new_val); + +   if (b->out[1] && b->out[1]->pass_seq < pc->pass_seq) +      bld_adjust_nv_refs(pc, b, old_val, new_val); +} +#endif diff --git a/src/gallium/drivers/nv50/nv50_vbo.c b/src/gallium/drivers/nv50/nv50_vbo.c index 864cb09352..6bd52884b5 100644 --- a/src/gallium/drivers/nv50/nv50_vbo.c +++ b/src/gallium/drivers/nv50/nv50_vbo.c @@ -519,7 +519,7 @@ nv50_vbo_static_attrib(struct nv50_context *nv50, unsigned attrib,  		so_data  (so, fui(v[1]));  		break;  	case 1: -		if (attrib == nv50->vertprog->cfg.edgeflag_in) { +		if (attrib == nv50->vertprog->vp.edgeflag) {  			so_method(so, tesla, NV50TCL_EDGEFLAG_ENABLE, 1);  			so_data  (so, v[0] ? 1 : 0);  		} @@ -560,7 +560,7 @@ nv50_vbo_validate(struct nv50_context *nv50)  	nv50->vbo_fifo = 0;  	if (nv50->screen->force_push || -	    nv50->vertprog->cfg.edgeflag_in < 16) +	    nv50->vertprog->vp.edgeflag < 16)  		nv50->vbo_fifo = 0xffff;  	for (i = 0; i < nv50->vtxbuf_nr; i++) {  | 
