diff options
-rw-r--r-- | src/gallium/drivers/nv50/Makefile | 11 | ||||
-rw-r--r-- | src/gallium/drivers/nv50/nv50_pc.c | 433 | ||||
-rw-r--r-- | src/gallium/drivers/nv50/nv50_pc.h | 431 | ||||
-rw-r--r-- | src/gallium/drivers/nv50/nv50_pc_emit.c | 1139 | ||||
-rw-r--r-- | src/gallium/drivers/nv50/nv50_pc_optimize.c | 717 | ||||
-rw-r--r-- | src/gallium/drivers/nv50/nv50_pc_print.c | 287 | ||||
-rw-r--r-- | src/gallium/drivers/nv50/nv50_pc_regalloc.c | 973 | ||||
-rw-r--r-- | src/gallium/drivers/nv50/nv50_program.c | 5109 | ||||
-rw-r--r-- | src/gallium/drivers/nv50/nv50_program.h | 169 | ||||
-rw-r--r-- | src/gallium/drivers/nv50/nv50_push.c | 2 | ||||
-rw-r--r-- | src/gallium/drivers/nv50/nv50_shader_state.c | 619 | ||||
-rw-r--r-- | src/gallium/drivers/nv50/nv50_state.c | 3 | ||||
-rw-r--r-- | src/gallium/drivers/nv50/nv50_state_validate.c | 9 | ||||
-rw-r--r-- | src/gallium/drivers/nv50/nv50_tgsi_to_nc.c | 1266 | ||||
-rw-r--r-- | src/gallium/drivers/nv50/nv50_vbo.c | 4 |
15 files changed, 6485 insertions, 4687 deletions
diff --git a/src/gallium/drivers/nv50/Makefile b/src/gallium/drivers/nv50/Makefile index e31e6f8662..3943a9e257 100644 --- a/src/gallium/drivers/nv50/Makefile +++ b/src/gallium/drivers/nv50/Makefile @@ -10,7 +10,6 @@ C_SOURCES = \ nv50_draw.c \ nv50_miptree.c \ nv50_query.c \ - nv50_program.c \ nv50_resource.c \ nv50_screen.c \ nv50_state.c \ @@ -19,6 +18,14 @@ C_SOURCES = \ nv50_tex.c \ nv50_transfer.c \ nv50_vbo.c \ - nv50_push.c + nv50_push.c \ + nv50_program.c \ + nv50_shader_state.c \ + nv50_pc.c \ + nv50_pc_print.c \ + nv50_pc_emit.c \ + nv50_tgsi_to_nc.c \ + nv50_pc_optimize.c \ + nv50_pc_regalloc.c include ../../Makefile.template diff --git a/src/gallium/drivers/nv50/nv50_pc.c b/src/gallium/drivers/nv50/nv50_pc.c new file mode 100644 index 0000000000..8aba0a32b7 --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_pc.c @@ -0,0 +1,433 @@ + +#include "nv50_pc.h" +#include "nv50_program.h" + +#include <stdio.h> + +/* returns TRUE if operands 0 and 1 can be swapped */ +boolean +nv_op_commutative(uint opcode) +{ + switch (opcode) { + case NV_OP_ADD: + case NV_OP_MUL: + case NV_OP_MAD: + case NV_OP_AND: + case NV_OP_OR: + case NV_OP_XOR: + case NV_OP_MIN: + case NV_OP_MAX: + case NV_OP_SAD: + return TRUE; + default: + return FALSE; + } +} + +/* return operand to which the address register applies */ +int +nv50_indirect_opnd(struct nv_instruction *i) +{ + if (!i->src[4]) + return -1; + + switch (i->opcode) { + case NV_OP_MOV: + case NV_OP_LDA: + return 0; + default: + return 1; + } +} + +boolean +nv50_nvi_can_use_imm(struct nv_instruction *nvi, int s) +{ + if (nvi->flags_src || nvi->flags_def) + return FALSE; + + switch (nvi->opcode) { + case NV_OP_ADD: + case NV_OP_MUL: + case NV_OP_AND: + case NV_OP_OR: + case NV_OP_XOR: + case NV_OP_SHL: + case NV_OP_SHR: + return (s == 1) && (nvi->def[0]->reg.file == NV_FILE_GPR); + case NV_OP_MOV: + assert(s == 0); + return (nvi->def[0]->reg.file == NV_FILE_GPR); + default: + return FALSE; + } +} + +boolean 
+nv50_nvi_can_load(struct nv_instruction *nvi, int s, struct nv_value *value) +{ + switch (nvi->opcode) { + case NV_OP_ABS: + case NV_OP_ADD: + case NV_OP_CEIL: + case NV_OP_FLOOR: + case NV_OP_TRUNC: + case NV_OP_CVT: + case NV_OP_MAD: + case NV_OP_MUL: + case NV_OP_SAT: + case NV_OP_SUB: + case NV_OP_MAX: + case NV_OP_MIN: + if (s == 0 && (value->reg.file == NV_FILE_MEM_S || + value->reg.file == NV_FILE_MEM_P)) + return TRUE; + if (s == 1 && + value->reg.file >= NV_FILE_MEM_C(0) && + value->reg.file <= NV_FILE_MEM_C(15)) + return TRUE; + if (s == 2 && nvi->src[1]->value->reg.file == NV_FILE_GPR) + return TRUE; + return FALSE; + case NV_OP_MOV: + assert(s == 0); + return TRUE; + default: + return FALSE; + } +} + +ubyte +nv50_supported_src_mods(uint opcode, int s) +{ + switch (opcode) { + case NV_OP_ABS: + return NV_MOD_NEG | NV_MOD_ABS; /* obviously */ + case NV_OP_ADD: + case NV_OP_MUL: + case NV_OP_MAD: + return NV_MOD_NEG; + case NV_OP_DFDX: + case NV_OP_DFDY: + assert(s == 0); + return NV_MOD_NEG; + case NV_OP_MAX: + case NV_OP_MIN: + return NV_MOD_ABS; + case NV_OP_CVT: + case NV_OP_LG2: + case NV_OP_NEG: + case NV_OP_PREEX2: + case NV_OP_PRESIN: + case NV_OP_RCP: + case NV_OP_RSQ: + return NV_MOD_ABS | NV_MOD_NEG; + default: + return 0; + } +} + +int +nv_nvi_refcount(struct nv_instruction *nvi) +{ + int i, rc; + + rc = nvi->flags_def ? 
nvi->flags_def->refc : 0; + + for (i = 0; i < 4; ++i) { + if (!nvi->def[i]) + return rc; + rc += nvi->def[i]->refc; + } + return rc; +} + +static void +nv_pc_free_refs(struct nv_pc *pc) +{ + int i; + for (i = 0; i < pc->num_refs; i += 64) + FREE(pc->refs[i]); +} + +void +nv_print_program(struct nv_basic_block *b) +{ + struct nv_instruction *i = b->phi; + + b->priv = 0; + + debug_printf("=== BB %i ", b->id); + if (b->out[0]) + debug_printf("(--0> %i) ", b->out[0]->id); + if (b->out[1]) + debug_printf("(--1> %i) ", b->out[1]->id); + debug_printf("===\n"); + + if (!i) + i = b->entry; + for (; i; i = i->next) + nv_print_instruction(i); + + if (!b->out[0]) { + debug_printf("END\n\n"); + return; + } + if (!b->out[1] && ++(b->out[0]->priv) != b->out[0]->num_in) + return; + + if (b->out[0] != b) + nv_print_program(b->out[0]); + + if (b->out[1] && b->out[1] != b) + nv_print_program(b->out[1]); +} + +static INLINE void +nvcg_show_bincode(struct nv_pc *pc) +{ + int i; + + for (i = 0; i < pc->bin_size / 4; ++i) + debug_printf("0x%08x ", pc->emit[i]); + debug_printf("\n"); +} + +static int +nv50_emit_program(struct nv_pc *pc) +{ + uint32_t *code = pc->emit; + int n; + + debug_printf("emitting program: size = %u\n", pc->bin_size); + + for (n = 0; n < pc->num_blocks; ++n) { + struct nv_instruction *i; + struct nv_basic_block *b = pc->bb_list[n]; + + for (i = b->entry; i; i = i->next) { + nv50_emit_instruction(pc, i); + + pc->bin_pos += 1 + (pc->emit[0] & 1); + pc->emit += 1 + (pc->emit[0] & 1); + } + } + assert(pc->emit == &code[pc->bin_size / 4]); + + /* XXX: we can do better than this ... 
*/ + if ((pc->emit[-1] & 3) == 3) { + pc->emit[0] = 0xf0000001; + pc->emit[1] = 0xe0000000; + pc->bin_size += 8; + } + + pc->emit = code; + code[pc->bin_size / 4 - 1] |= 1; + + nvcg_show_bincode(pc); + + return 0; +} + +int +nv50_generate_code(struct nv50_translation_info *ti) +{ + struct nv_pc *pc; + int ret; + + pc = CALLOC_STRUCT(nv_pc); + if (!pc) + return 1; + + ret = nv50_tgsi_to_nc(pc, ti); + if (ret) + goto out; + + /* optimization */ + ret = nv_pc_exec_pass0(pc); + if (ret) + goto out; + + /* register allocation */ + ret = nv_pc_exec_pass1(pc); + if (ret) + goto out; + + /* prepare for emission */ + ret = nv_pc_exec_pass2(pc); + if (ret) + goto out; + + pc->emit = CALLOC(pc->bin_size / 4 + 2, 4); + if (!pc->emit) { + ret = 3; + goto out; + } + ret = nv50_emit_program(pc); + if (ret) + goto out; + + ti->p->code_size = pc->bin_size; + ti->p->code = pc->emit; + + ti->p->immd_size = pc->immd_count * 4; + ti->p->immd = pc->immd_buf; + + ti->p->max_gpr = (pc->max_reg[NV_FILE_GPR] + 1) >> 1; + ti->p->max_gpr++; + + ti->p->fixups = pc->fixups; + ti->p->num_fixups = pc->num_fixups; + + debug_printf("SHADER TRANSLATION - %s\n", ret ? 
"failure" : "success"); + +out: + nv_pc_free_refs(pc); + if (ret) { + if (pc->emit) + free(pc->emit); + if (pc->immd_buf) + free(pc->immd_buf); + if (pc->fixups) + free(pc->fixups); + } + free(pc); + + return ret; +} + +static void +nvbb_insert_phi(struct nv_basic_block *b, struct nv_instruction *i) +{ + if (!b->phi) { + i->prev = NULL; + b->phi = i; + i->next = b->entry; + if (b->entry) { + assert(!b->entry->prev && b->exit); + b->entry->prev = i; + } else { + b->entry = i; + b->exit = i; + } + } else { + assert(b->entry); + if (b->entry->opcode == NV_OP_PHI) { /* insert after entry */ + assert(b->entry == b->exit); + b->entry->next = i; + i->prev = b->entry; + b->entry = i; + b->exit = i; + } else { /* insert before entry */ + assert(b->entry->prev && b->exit); + i->next = b->entry; + i->prev = b->entry->prev; + b->entry->prev = i; + i->prev->next = i; + } + } +} + +void +nvbb_insert_tail(struct nv_basic_block *b, struct nv_instruction *i) +{ + if (i->opcode == NV_OP_PHI) { + nvbb_insert_phi(b, i); + } else { + i->prev = b->exit; + if (b->exit) + b->exit->next = i; + b->exit = i; + if (!b->entry) + b->entry = i; + else + if (i->prev && i->prev->opcode == NV_OP_PHI) + b->entry = i; + } + + i->bb = b; + b->num_instructions++; +} + +void +nv_nvi_delete(struct nv_instruction *nvi) +{ + struct nv_basic_block *b = nvi->bb; + int j; + + debug_printf("REM: "); nv_print_instruction(nvi); + + for (j = 0; j < 4; ++j) { + if (!nvi->src[j]) + break; + --(nvi->src[j]->value->refc); + nvi->src[j] = NULL; + } + + if (nvi->next) + nvi->next->prev = nvi->prev; + else { + assert(nvi == b->exit); + b->exit = nvi->prev; + } + + if (nvi->prev) + nvi->prev->next = nvi->next; + + if (nvi == b->entry) { + assert(nvi->opcode != NV_OP_PHI || !nvi->next); + + if (!nvi->next || (nvi->opcode == NV_OP_PHI)) + b->entry = nvi->prev; + else + b->entry = nvi->next; + } + + if (nvi == b->phi) { + assert(!nvi->prev); + if (nvi->opcode != NV_OP_PHI) + debug_printf("WARN: b->phi points to non-PHI 
instruction\n"); + + if (!nvi->next || nvi->next->opcode != NV_OP_PHI) + b->phi = NULL; + else + b->phi = nvi->next; + } +} + +void +nv_nvi_permute(struct nv_instruction *i1, struct nv_instruction *i2) +{ + struct nv_basic_block *b = i1->bb; + + assert(i1->opcode != NV_OP_PHI && + i2->opcode != NV_OP_PHI); + assert(i1->next == i2); + + if (b->exit == i2) + b->exit = i1; + + if (b->entry == i1) + b->entry = i2; + + i2->prev = i1->prev; + i1->next = i2->next; + i2->next = i1; + i1->prev = i2; + + if (i2->prev) + i2->prev->next = i2; + if (i1->next) + i1->next->prev = i1; +} + +void nvbb_attach_block(struct nv_basic_block *parent, struct nv_basic_block *b) +{ + if (parent->out[0]) { + assert(!parent->out[1]); + parent->out[1] = b; + } else + parent->out[0] = b; + + b->in[b->num_in++] = parent; +} diff --git a/src/gallium/drivers/nv50/nv50_pc.h b/src/gallium/drivers/nv50/nv50_pc.h new file mode 100644 index 0000000000..3ab48d0afd --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_pc.h @@ -0,0 +1,431 @@ +/*************************************************************************/ +/* Copyright (C) 2010 I */ +/* */ +/* This program is free software: you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation, either version 3 of the License, or */ +/* (at your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. */ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ +/*************************************************************************/ + +#ifndef __NV50_COMPILER_H__ +#define __NV50_COMPILER_H__ + +#include "pipe/p_defines.h" +#include "util/u_inlines.h" +#include "util/u_memory.h" + +#define NV_OP_PHI 0 +#define NV_OP_EXTRACT 1 +#define NV_OP_COMBINE 2 +#define NV_OP_LDA 3 +#define NV_OP_STA 4 +#define NV_OP_MOV 5 +#define NV_OP_ADD 6 +#define NV_OP_SUB 7 +#define NV_OP_NEG 8 +#define NV_OP_MUL 9 +#define NV_OP_MAD 10 +#define NV_OP_CVT 11 +#define NV_OP_SAT 12 +#define NV_OP_NOT 13 +#define NV_OP_AND 14 +#define NV_OP_OR 15 +#define NV_OP_XOR 16 +#define NV_OP_SHL 17 +#define NV_OP_SHR 18 +#define NV_OP_RCP 19 +/* gap */ +#define NV_OP_RSQ 21 +#define NV_OP_LG2 22 +#define NV_OP_SIN 23 +#define NV_OP_COS 24 +#define NV_OP_EX2 25 +#define NV_OP_PRESIN 26 +#define NV_OP_PREEX2 27 +#define NV_OP_MIN 28 +#define NV_OP_MAX 29 +#define NV_OP_SET 30 +#define NV_OP_SAD 31 +#define NV_OP_KIL 32 +#define NV_OP_BRA 33 +#define NV_OP_CALL 34 +#define NV_OP_RET 35 +#define NV_OP_BREAK 36 +#define NV_OP_BREAKADDR 37 +#define NV_OP_JOINAT 38 +#define NV_OP_TEX 39 +#define NV_OP_TXB 40 +#define NV_OP_TXL 41 +#define NV_OP_TXF 42 +#define NV_OP_TXQ 43 +#define NV_OP_DFDX 44 +#define NV_OP_DFDY 45 +#define NV_OP_QUADOP 46 +#define NV_OP_LINTERP 47 +#define NV_OP_PINTERP 48 +#define NV_OP_ABS 49 +#define NV_OP_CEIL 50 +#define NV_OP_FLOOR 51 +#define NV_OP_TRUNC 52 +#define NV_OP_NOP 53 +#define NV_OP_SELECT 54 +#define NV_OP_EXPORT 55 +#define NV_OP_COUNT 56 + +#define NV_FILE_GPR 0 +#define NV_FILE_OUT 1 +#define NV_FILE_ADDR 2 +#define NV_FILE_FLAGS 3 +#define NV_FILE_IMM 16 +#define NV_FILE_MEM_S 32 +#define NV_FILE_MEM_P 33 +#define NV_FILE_MEM_V 34 +#define NV_FILE_MEM_L 48 +#define NV_FILE_MEM_G(i) (64 + i) +#define NV_FILE_MEM_C(i) (80 + i) + +#define NV_MOD_NEG 1 +#define NV_MOD_ABS 2 +#define NV_MOD_NOT 4 +#define NV_MOD_SAT 8 + +#define NV_TYPE_U8 0x00 +#define NV_TYPE_S8 0x01 +#define NV_TYPE_U16 0x02 +#define NV_TYPE_S16 
0x03 +#define NV_TYPE_U32 0x04 +#define NV_TYPE_S32 0x05 +#define NV_TYPE_P32 0x07 +#define NV_TYPE_F32 0x09 +#define NV_TYPE_F64 0x0b +#define NV_TYPE_VEC(x, n) (NV_TYPE_##x | (n << 4)) +#define NV_TYPE_LO 0x00 +#define NV_TYPE_HI 0x80 +#define NV_TYPE_ANY 0xff + +#define NV_TYPE_ISINT(t) ((t) <= 5) +#define NV_TYPE_ISFLT(t) ((t) & 0x08) + +#define NV_CC_FL 0x0 +#define NV_CC_LT 0x1 +#define NV_CC_EQ 0x2 +#define NV_CC_LE 0x3 +#define NV_CC_GT 0x4 +#define NV_CC_NE 0x5 +#define NV_CC_GE 0x6 +#define NV_CC_U 0x8 +#define NV_CC_TR 0xf + +#define NV_PC_MAX_INSTRUCTIONS 2048 +#define NV_PC_MAX_VALUES (NV_PC_MAX_INSTRUCTIONS * 4) + +static INLINE boolean +nv_is_vector_op(uint opcode) +{ + return (opcode >= NV_OP_TEX) && (opcode <= NV_OP_TXQ); +} + +static INLINE uint +nv_type_order(ubyte type) +{ + switch (type & 0xf) { + case NV_TYPE_U8: + case NV_TYPE_S8: + return 0; + case NV_TYPE_U16: + case NV_TYPE_S16: + return 1; + case NV_TYPE_U32: + case NV_TYPE_F32: + case NV_TYPE_S32: + case NV_TYPE_P32: + return 2; + case NV_TYPE_F64: + return 3; + } + assert(0); +} + +static INLINE uint +nv_type_sizeof(ubyte type) +{ + if (type & 0xf0) + return (1 << nv_type_order(type)) * (type >> 4); + return 1 << nv_type_order(type); +} + +static INLINE uint +nv_type_sizeof_base(ubyte type) +{ + return 1 << nv_type_order(type); +} + +struct nv_reg { + int id; + ubyte file; + ubyte type; /* type of generating instruction's result */ + union { + float f32; + double f64; + int32_t s32; + uint32_t u32; + } imm; +}; + +struct nv_range { + struct nv_range *next; + int bgn; + int end; +}; + +struct nv_value { + struct nv_reg reg; + struct nv_instruction *insn; + struct nv_value *join; + int n; + struct nv_range *livei; + int refc; + + struct nv_value *next; + struct nv_value *prev; +}; + +struct nv_ref { + struct nv_value *value; + struct nv_instruction *insn; + ubyte mod; + ubyte typecast; + ubyte flags; /* not used yet */ +}; + +struct nv_basic_block; + +struct nv_instruction { + struct 
nv_instruction *next; + struct nv_instruction *prev; + uint opcode; + int serial; + struct nv_value *def[4]; + struct nv_value *flags_def; + struct nv_ref *src[5]; + struct nv_ref *flags_src; + struct nv_basic_block *bb; + struct nv_basic_block *target; /* target block of control flow insn */ + ubyte cc; + ubyte set_cond : 4; + ubyte fixed : 1; /* don't optimize away */ + ubyte is_terminator : 1; + ubyte is_join : 1; + ubyte is_long : 1; /* for emission */ + /* */ + ubyte saturate : 1; + ubyte centroid : 1; + ubyte flat : 1; + ubyte padding : 4; + ubyte tex_live : 1; + /* */ + ubyte tex_t; /* TIC binding */ + ubyte tex_s; /* TSC binding */ + ubyte tex_argc : 3; + ubyte tex_cube : 1; + ubyte tex_mask : 4; + /* */ + ubyte quadop; +}; + +struct nv_basic_block { + struct nv_instruction *entry; /* first non-phi instruction */ + struct nv_instruction *exit; + struct nv_instruction *phi; /* very first instruction */ + int num_instructions; + + struct nv_basic_block *out[2]; /* no indirect branches -> 2 */ + struct nv_basic_block **in; + uint num_in; + + int id; + struct nv_basic_block *last_visitor; + uint priv; + uint pass_seq; + + uint32_t bin_pos; /* position, size in emitted code */ + uint32_t bin_size; + + uint32_t live_set[NV_PC_MAX_VALUES / 32]; +}; + +#define NV_FIXUP_CFLOW_RELOC 0 +#define NV_FIXUP_PARAM_RELOC 1 + +struct nv_fixup { + ubyte type; + ubyte shift; + uint32_t mask; + uint32_t data; + uint32_t offset; +}; + +static INLINE void +nv_fixup_apply(uint32_t *bin, struct nv_fixup *fixup, uint32_t data) +{ + uint32_t val; + + val = bin[fixup->offset / 4] & ~fixup->mask; + data = (fixup->shift < 0) ? 
(data >> fixup->shift) : (data << fixup->shift); + val |= (fixup->data + data) & fixup->mask; + bin[fixup->offset / 4] = val; +} + +struct nv_pc { + struct nv50_translation_info *ti; + + struct nv_basic_block *root; + struct nv_basic_block *current_block; + struct nv_basic_block *parent_block; + + int loop_nesting_bound; + uint pass_seq; + + struct nv_value values[NV_PC_MAX_VALUES]; + struct nv_instruction instructions[NV_PC_MAX_INSTRUCTIONS]; + struct nv_ref **refs; + struct nv_basic_block **bb_list; + int num_values; + int num_instructions; + int num_refs; + int num_blocks; + + int max_reg[4]; + + uint32_t *immd_buf; /* populated on emit */ + unsigned immd_count; + + uint32_t *emit; + unsigned bin_size; + unsigned bin_pos; + + struct nv_fixup *fixups; + int num_fixups; +}; + +void nvbb_insert_tail(struct nv_basic_block *, struct nv_instruction *); + +static INLINE struct nv_instruction * +new_instruction(struct nv_pc *pc, uint opcode) +{ + struct nv_instruction *insn; + + insn = &pc->instructions[pc->num_instructions++]; + assert(pc->num_instructions < NV_PC_MAX_INSTRUCTIONS); + + insn->cc = NV_CC_TR; + insn->opcode = opcode; + + nvbb_insert_tail(pc->current_block, insn); + return insn; +} + +static INLINE struct nv_value * +new_value(struct nv_pc *pc, ubyte file, ubyte type) +{ + struct nv_value *value = &pc->values[pc->num_values]; + + assert(pc->num_values < NV_PC_MAX_VALUES - 1); + + value->n = pc->num_values++; + value->join = value; + value->reg.id = -1; + value->reg.file = file; + value->reg.type = type; + return value; +} + +static INLINE struct nv_ref * +new_ref(struct nv_pc *pc, struct nv_value *val) +{ + int i; + struct nv_ref *ref; + + if ((pc->num_refs % 64) == 0) { + const unsigned old_size = pc->num_refs * sizeof(struct nv_ref *); + const unsigned new_size = (pc->num_refs + 64) * sizeof(struct nv_ref *); + + pc->refs = REALLOC(pc->refs, old_size, new_size); + + ref = CALLOC(64, sizeof(struct nv_ref)); + for (i = 0; i < 64; ++i) + 
pc->refs[pc->num_refs + i] = &ref[i]; + } + + ref = pc->refs[pc->num_refs++]; + ref->value = val; + ref->typecast = val->reg.type; + + ++val->refc; + return ref; +} + +static INLINE struct nv_basic_block * +new_basic_block(struct nv_pc *pc) +{ + struct nv_basic_block *bb = CALLOC_STRUCT(nv_basic_block); + + bb->in = CALLOC(sizeof(struct nv_basic_block *), 4); + bb->id = pc->num_blocks++; + return bb; +} + +static INLINE void +nv_reference(struct nv_pc *pc, struct nv_ref **d, struct nv_value *s) +{ + if (*d) + --(*d)->value->refc; + + if (s) { + if (!*d) + *d = new_ref(pc, s); + else { + (*d)->value = s; + ++(s->refc); + } + } else { + assert(*d); + *d = NULL; + } +} + +/* nv50_emit.c */ +void nv50_emit_instruction(struct nv_pc *, struct nv_instruction *); + +/* nv50_print.c */ +const char *nv_opcode_name(uint opcode); +void nv_print_instruction(struct nv_instruction *); + +/* nv50_pc.c */ +void nv_print_program(struct nv_basic_block *b); + +boolean nv_op_commutative(uint opcode); +int nv50_indirect_opnd(struct nv_instruction *); +boolean nv50_nvi_can_use_imm(struct nv_instruction *, int s); +boolean nv50_nvi_can_load(struct nv_instruction *, int s, struct nv_value *); +ubyte nv50_supported_src_mods(uint opcode, int s); +int nv_nvi_refcount(struct nv_instruction *); +void nv_nvi_delete(struct nv_instruction *); +void nv_nvi_permute(struct nv_instruction *, struct nv_instruction *); +void nvbb_attach_block(struct nv_basic_block *parent, struct nv_basic_block *); + +int nv_pc_exec_pass0(struct nv_pc *pc); +int nv_pc_exec_pass1(struct nv_pc *pc); +int nv_pc_exec_pass2(struct nv_pc *pc); + +int nv50_tgsi_to_nc(struct nv_pc *, struct nv50_translation_info *); + +#endif // NV50_COMPILER_H diff --git a/src/gallium/drivers/nv50/nv50_pc_emit.c b/src/gallium/drivers/nv50/nv50_pc_emit.c new file mode 100644 index 0000000000..b917d23232 --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_pc_emit.c @@ -0,0 +1,1139 @@ 
+/*************************************************************************/ +/* Copyright (C) 2009 */ +/* */ +/* This program is free software: you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation, either version 3 of the License, or */ +/* (at your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. */ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program. If not, see <http://www.gnu.org/licenses/>. */ +/*************************************************************************/ + +#include "nv50_context.h" +#include "nv50_pc.h" + +// Definitions + +#define FLAGS_CC_SHIFT 7 +#define FLAGS_ID_SHIFT 12 +#define FLAGS_WR_ID_SHIFT 4 +#define FLAGS_CC_MASK (0x1f << FLAGS_CC_SHIFT) +#define FLAGS_ID_MASK (0x03 << FLAGS_ID_SHIFT) +#define FLAGS_WR_EN (1 << 6) +#define FLAGS_WR_ID_MASK (0x3 << FLAGS_WR_ID_SHIFT) + +const ubyte nv50_inst_min_size_tab[NV_OP_COUNT] = +{ + 0, 0, 0, 8, 8, 4, 4, 4, 8, 4, 4, 8, 8, 8, 8, 8, /* 15 */ + 8, 8, 8, 4, 0, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, /* 31 */ + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, /* 47 */ + 4, 8, 8, 8, 8, 8, 0, 0 +}; + +/* XXX: silence, you ! 
*/ +unsigned +nv50_inst_min_size(struct nv_instruction *i); + +unsigned +nv50_inst_min_size(struct nv_instruction *i) +{ + int n; + + if (nv50_inst_min_size_tab[i->opcode] > 4) + return 8; + + if (i->def[0] && i->def[0]->reg.file != NV_FILE_GPR) + return 8; + if (i->def[0]->join->reg.id > 63) + return 8; + + for (n = 0; n < 3; ++n) { + if (!i->src[n]) + break; + if (i->src[n]->value->reg.file != NV_FILE_GPR && + i->src[n]->value->reg.file != NV_FILE_MEM_V) + return 8; + if (i->src[n]->value->reg.id > 63) + return 8; + } + + if (i->flags_def || i->flags_src || i->src[4]) + return 8; + + if (i->src[2]) { + if (i->saturate || i->src[2]->mod) + return 8; + if (i->src[0]->mod ^ i->src[1]->mod) + return 8; + if ((i->src[0]->mod | i->src[1]->mod) & NV_MOD_ABS) + return 8; + if (i->def[0]->join->reg.id < 0 || + i->def[0]->join->reg.id != i->src[2]->value->join->reg.id) + return 8; + } + + return nv50_inst_min_size_tab[i->opcode]; +} + +static INLINE ubyte +STYPE(struct nv_instruction *nvi, int s) +{ + return nvi->src[s]->typecast; +} + +static INLINE ubyte +DTYPE(struct nv_instruction *nvi, int d) +{ + return nvi->def[d]->reg.type; +} + +static INLINE struct nv_reg * +SREG(struct nv_ref *ref) +{ + return &ref->value->join->reg; +} + +static INLINE struct nv_reg * +DREG(struct nv_value *val) +{ + return &val->join->reg; +} + +static INLINE ubyte +SFILE(struct nv_instruction *nvi, int s) +{ + return nvi->src[s]->value->reg.file; +} + +static INLINE ubyte +DFILE(struct nv_instruction *nvi, int d) +{ + return nvi->def[0]->reg.file; +} + +static INLINE void +SID(struct nv_pc *pc, struct nv_ref *ref, int pos) +{ + pc->emit[pos / 32] |= SREG(ref)->id << (pos % 32); +} + +static INLINE void +DID(struct nv_pc *pc, struct nv_value *val, int pos) +{ + pc->emit[pos / 32] |= DREG(val)->id << (pos % 32); +} + +static INLINE uint32_t +get_immd_u32(struct nv_ref *ref) +{ + assert(ref->value->reg.file == NV_FILE_IMM); + return ref->value->reg.imm.u32; +} + +static INLINE void 
+set_immd_u32(struct nv_pc *pc, uint32_t u32) +{ + pc->emit[1] |= 3; + pc->emit[0] |= (u32 & 0x3f) << 16; + pc->emit[1] |= (u32 >> 6) << 2; +} + +static INLINE void +set_immd(struct nv_pc *pc, struct nv_ref *ref) +{ + assert(ref->value->reg.file == NV_FILE_IMM); + set_immd_u32(pc, get_immd_u32(ref)); +} + +static void +new_fixup(struct nv_pc *pc, unsigned type, uint32_t data, uint32_t m, int s) +{ + const unsigned size = sizeof(struct nv_fixup); + const unsigned n = pc->num_fixups; + return; + + if (!(n % 8)) + pc->fixups = REALLOC(pc->fixups, n * size, (n + 8) * size); + + pc->fixups[n].offset = pc->bin_pos + (s / 32); + pc->fixups[n].type = type; + pc->fixups[n].data = data; + pc->fixups[n].mask = m << (s % 32); + pc->fixups[n].shift = s % 32; + + ++pc->num_fixups; + + assert(((data << (s % 32)) & pc->fixups[n].mask) == (data << (s % 32))); +} + +static void +nv_pc_alloc_immd(struct nv_pc *pc, struct nv_ref *ref) +{ + uint32_t i, val = get_immd_u32(ref); + + for (i = 0; i < pc->immd_count; ++i) + if (pc->immd_buf[i] == val) + break; + + if (i == pc->immd_count) { + if (!(pc->immd_count % 8)) + pc->immd_buf = REALLOC(pc->immd_buf, + pc->immd_count * 4, (pc->immd_count + 8) * 4); + pc->immd_buf[pc->immd_count++] = val; + } + + SREG(ref)->id = i; +} + +static INLINE void +set_pred(struct nv_pc *pc, struct nv_instruction *i) +{ + assert(!(pc->emit[1] & 0x00003f80)); + + pc->emit[1] |= i->cc << 7; + if (i->flags_src) + pc->emit[1] |= SREG(i->flags_src)->id << 12; +} + +static INLINE void +set_pred_wr(struct nv_pc *pc, struct nv_instruction *i) +{ + assert(!(pc->emit[1] & 0x00000070)); + + if (i->flags_def) + pc->emit[1] |= (DREG(i->flags_def)->id << 4) | 0x40; +} + +static INLINE void +set_a16_bits(struct nv_pc *pc, uint id) +{ + ++id; /* $a0 is always 0 */ + pc->emit[0] |= (id & 3) << 26; + pc->emit[1] |= id & 4; +} + +static INLINE void +set_addr(struct nv_pc *pc, struct nv_instruction *i) +{ + if (i->src[4]) + set_a16_bits(pc, SREG(i->src[4])->id); +} + +static 
void +set_dst(struct nv_pc *pc, struct nv_value *value) +{ + struct nv_reg *reg = &value->join->reg; + + if (reg->id < 0) { + debug_printf("WARNING: unused dst, hope we can bucket it !\n"); + pc->emit[0] |= 127 << 2; + pc->emit[1] |= 0x8; + return; + } + + if (reg->file == NV_FILE_OUT) + pc->emit[1] |= 0x8; + else + if (reg->file == NV_FILE_ADDR) + assert(0); + + pc->emit[0] |= reg->id << 2; +} + +static void +set_src_0(struct nv_pc *pc, struct nv_ref *ref) +{ + struct nv_reg *reg = SREG(ref); + + if (reg->file == NV_FILE_MEM_S) + pc->emit[1] |= 0x00200000; + else + if (reg->file == NV_FILE_MEM_P) + pc->emit[0] |= 0x01800000; + else + if (reg->file != NV_FILE_GPR) + NOUVEAU_ERR("invalid src0 register file: %d\n", reg->file); + + assert(reg->id < 128); + pc->emit[0] |= reg->id << 9; +} + +static void +set_src_1(struct nv_pc *pc, struct nv_ref *ref) +{ + struct nv_reg *reg = SREG(ref); + + if (reg->file >= NV_FILE_MEM_C(0) && + reg->file <= NV_FILE_MEM_C(15)) { + assert(!(pc->emit[1] & 0x01800000)); + + pc->emit[0] |= 0x00800000; + pc->emit[1] |= (reg->file - NV_FILE_MEM_C(0)) << 22; + } else + if (reg->file != NV_FILE_GPR) + NOUVEAU_ERR("invalid src1 register file: %d\n", reg->file); + + assert(reg->id < 128); + pc->emit[0] |= reg->id << 16; +} + +static void +set_src_2(struct nv_pc *pc, struct nv_ref *ref) +{ + struct nv_reg *reg = SREG(ref); + + if (reg->file >= NV_FILE_MEM_C(0) && + reg->file <= NV_FILE_MEM_C(15)) { + assert(!(pc->emit[1] & 0x01800000)); + + pc->emit[0] |= 0x01000000; + pc->emit[1] |= (reg->file - NV_FILE_MEM_C(0)) << 22; + } else + if (reg->file != NV_FILE_GPR) + NOUVEAU_ERR("invalid src2 register file: %d\n", reg->file); + + assert(reg->id < 128); + pc->emit[1] |= reg->id << 14; +} + +/* the default form: + * - long instruction + * - 1 to 3 sources in slots 0, 1, 2 + * - address & flags + */ +static void +emit_form_MAD(struct nv_pc *pc, struct nv_instruction *i) +{ + pc->emit[0] |= 1; + + set_pred(pc, i); + set_pred_wr(pc, i); + + if 
(i->def[0]) + set_dst(pc, i->def[0]); + else { + pc->emit[0] |= 0x01fc; + pc->emit[1] |= 0x0008; + } + + if (i->src[0]) + set_src_0(pc, i->src[0]); + + if (i->src[1]) + set_src_1(pc, i->src[1]); + + if (i->src[2]) + set_src_2(pc, i->src[2]); + + set_addr(pc, i); +} + +/* like default form, but 2nd source in slot 2, no 3rd source */ +static void +emit_form_ADD(struct nv_pc *pc, struct nv_instruction *i) +{ + pc->emit[0] |= 1; + + if (i->def[0]) + set_dst(pc, i->def[0]); + else { + pc->emit[0] |= 0x01fc; + pc->emit[1] |= 0x0008; + } + + set_pred(pc, i); + set_pred_wr(pc, i); + + if (i->src[0]) + set_src_0(pc, i->src[0]); + + if (i->src[1]) + set_src_2(pc, i->src[1]); + + set_addr(pc, i); +} + +/* short mul */ +static void +emit_form_MUL(struct nv_pc *pc, struct nv_instruction *i) +{ + assert(!i->is_long && !(pc->emit[0] & 1)); + + assert(i->def[0]); + set_dst(pc, i->def[0]); + + if (i->src[0]) + set_src_0(pc, i->src[0]); + + if (i->src[1]) + set_src_1(pc, i->src[1]); +} + +/* default immediate form + * - 1 to 3 sources where last is immediate + * - no address or predicate possible + */ +static void +emit_form_IMM(struct nv_pc *pc, struct nv_instruction *i, ubyte mod_mask) +{ + pc->emit[0] |= 1; + + assert(i->def[0]); + assert(i->src[0]); + set_dst(pc, i->def[0]); + + assert(!i->src[4] && !i->flags_src && !i->flags_def); + + if (i->src[2]) { + set_immd(pc, i->src[2]); + set_src_0(pc, i->src[1]); + set_src_1(pc, i->src[0]); + } else + if (i->src[1]) { + set_immd(pc, i->src[1]); + set_src_0(pc, i->src[0]); + } else + set_immd(pc, i->src[0]); + + assert(!mod_mask); +} + +static void +set_ld_st_size(struct nv_pc *pc, ubyte type) +{ + switch (type) { + case NV_TYPE_F64: + pc->emit[1] |= 0x8000; + break; + case NV_TYPE_F32: + case NV_TYPE_S32: + case NV_TYPE_U32: + pc->emit[1] |= 0xc000; + break; + case NV_TYPE_S16: + pc->emit[1] |= 0x6000; + break; + case NV_TYPE_U16: + pc->emit[1] |= 0x4000; + break; + case NV_TYPE_S8: + pc->emit[1] |= 0x2000; + break; + default: + break; 
+ } +} + +static void +emit_ld(struct nv_pc *pc, struct nv_instruction *i) +{ + ubyte sf = SFILE(i, 0); + + if (sf == NV_FILE_IMM) { + sf = NV_FILE_MEM_C(0); + nv_pc_alloc_immd(pc, i->src[0]); + + new_fixup(pc, NV_FIXUP_PARAM_RELOC, SREG(i->src[0])->id, 0xffff, 9); + } + + if (sf == NV_FILE_MEM_S || + sf == NV_FILE_MEM_P) { + pc->emit[0] = 0x10000001; + pc->emit[1] = 0x04200000 | (0x3c << 12); + if (sf == NV_FILE_MEM_P) + pc->emit[0] |= 0x01800000; + } else + if (sf >= NV_FILE_MEM_C(0) && + sf <= NV_FILE_MEM_C(15)) { + pc->emit[0] = 0x10000001; + pc->emit[1] = 0x24000000; + pc->emit[1] |= (sf - NV_FILE_MEM_C(0)) << 22; + } else + if (sf >= NV_FILE_MEM_G(0) && + sf <= NV_FILE_MEM_G(15)) { + pc->emit[0] = 0xd0000001 | ((sf - NV_FILE_MEM_G(0)) << 16); + pc->emit[1] = 0xa0000000; + + assert(i->src[4] && SREG(i->src[4])->file == NV_FILE_GPR); + SID(pc, i->src[4], 9); + } else + if (sf == NV_FILE_MEM_L) { + pc->emit[0] = 0xd0000001; + pc->emit[1] = 0x40000000; + } else { + NOUVEAU_ERR("invalid ld source file\n"); + abort(); + } + + set_ld_st_size(pc, STYPE(i, 0)); + + set_dst(pc, i->def[0]); + set_pred_wr(pc, i); + + set_pred(pc, i); + + if (sf < NV_FILE_MEM_G(0) || + sf > NV_FILE_MEM_G(15)) { + SID(pc, i->src[0], 9); + set_addr(pc, i); + } +} + +static void +emit_st(struct nv_pc *pc, struct nv_instruction *i) +{ + +} + +static int +verify_mov(struct nv_instruction *i) +{ + ubyte sf = SFILE(i, 0); + ubyte df = DFILE(i, 0); + + if (df == NV_FILE_GPR) + return 0; + + if (df != NV_FILE_OUT && + df != NV_FILE_FLAGS && + df != NV_FILE_ADDR) + return 1; + + if (sf == NV_FILE_FLAGS) + return 2; + if (sf == NV_FILE_ADDR) + return 3; + if (sf == NV_FILE_IMM && df != NV_FILE_OUT) + return 4; + + return 0; +} + +static void +emit_mov(struct nv_pc *pc, struct nv_instruction *i) +{ + assert(!verify_mov(i)); + + if (SFILE(i, 0) >= NV_FILE_MEM_S) + emit_ld(pc, i); + else + if (SFILE(i, 0) == NV_FILE_FLAGS) { + pc->emit[0] = 0x00000001 | (DREG(i->def[0])->id << 2); + pc->emit[1] = 
0x20000780 | (SREG(i->src[0])->id << 12); + } else + if (SFILE(i, 0) == NV_FILE_ADDR) { + pc->emit[0] = 0x00000001 | (DREG(i->def[0])->id << 2); + pc->emit[1] = 0x40000780; + set_a16_bits(pc, SREG(i->src[0])->id); + } else + if (DFILE(i, 0) == NV_FILE_FLAGS) { + pc->emit[0] = 0x000001fd; + pc->emit[1] = 0xa0000788 | (1 << 6); + pc->emit[0] |= SREG(i->src[0])->id << 9; + pc->emit[1] |= DREG(i->def[0])->id << 4; + } else + if (SFILE(i, 0) == NV_FILE_IMM) { + if (i->opcode == NV_OP_LDA) + emit_ld(pc, i); + else { + pc->emit[0] = 0x10008001; + pc->emit[1] = 0x00000003; + + emit_form_IMM(pc, i, 0); + } + } else { + pc->emit[0] = 0x10000000; + pc->emit[0] |= DREG(i->def[0])->id << 2; + pc->emit[0] |= SREG(i->src[0])->id << 9; + + if (!i->is_long) + pc->emit[0] |= 0x8000; + else { + pc->emit[0] |= 0x00000001; + pc->emit[1] = 0x0403c000; + + set_pred(pc, i); + } + } + + if (DFILE(i, 0) == NV_FILE_OUT) + pc->emit[1] |= 0x8; +} + +static void +emit_interp(struct nv_pc *pc, struct nv_instruction *i) +{ + pc->emit[0] = 0x80000000; + + assert(DFILE(i, 0) == NV_FILE_GPR); + assert(SFILE(i, 0) == NV_FILE_MEM_V); + + DID(pc, i->def[0], 2); + SID(pc, i->src[0], 16); + + if (i->flat) + pc->emit[0] |= 1 << 8; + else + if (i->opcode == NV_OP_PINTERP) { + pc->emit[0] |= 1 << 25; + pc->emit[0] |= SREG(i->src[1])->id << 9; + } + + if (i->centroid) + pc->emit[0] |= 1 << 24; + + if (i->is_long) { + pc->emit[1] |= 0x0780 | + (pc->emit[0] & (3 << 24)) >> (24 - 16) | + (pc->emit[0] & (1 << 8)) >> (18 - 8); + + pc->emit[0] |= 1; + pc->emit[0] &= ~0x03000100; + } +} + +static void +emit_minmax(struct nv_pc *pc, struct nv_instruction *i) +{ + pc->emit[0] = 0x30000000; + pc->emit[1] = (i->opcode == NV_OP_MIN) ? 
(2 << 28) : 0; + + switch (DTYPE(i, 0)) { + case NV_TYPE_F32: + pc->emit[0] |= 0x80000000; + pc->emit[1] |= 0x80000000; + break; + case NV_TYPE_S32: + pc->emit[1] |= 0x8c000000; + break; + case NV_TYPE_U32: + pc->emit[1] |= 0x84000000; + break; + } + + emit_form_MAD(pc, i); + + if (i->src[0]->mod & NV_MOD_ABS) pc->emit[1] |= 0x00100000; + if (i->src[1]->mod & NV_MOD_ABS) pc->emit[1] |= 0x00080000; +} + +static void +emit_add_f32(struct nv_pc *pc, struct nv_instruction *i) +{ + pc->emit[0] = 0xb0000000; + + if (SFILE(i, 1) == NV_FILE_IMM) { + emit_form_IMM(pc, i, 0); + + if (i->src[0]->mod & NV_MOD_NEG) pc->emit[0] |= 0x8000; + if (i->src[1]->mod & NV_MOD_NEG) pc->emit[0] |= 1 << 22; + } else + if (i->is_long) { + emit_form_ADD(pc, i); + + if (i->src[0]->mod & NV_MOD_NEG) pc->emit[1] |= 1 << 26; + if (i->src[1]->mod & NV_MOD_NEG) pc->emit[1] |= 1 << 27; + } else { + emit_form_MUL(pc, i); + + if (i->src[0]->mod & NV_MOD_NEG) pc->emit[0] |= 0x8000; + if (i->src[1]->mod & NV_MOD_NEG) pc->emit[0] |= 1 << 22; + } +} + +static void +emit_add_b32(struct nv_pc *pc, struct nv_instruction *i) +{ + pc->emit[0] = 0x20008000; + + if (SFILE(i, 1) == NV_FILE_IMM) { + emit_form_IMM(pc, i, 0); + } else + if (i->is_long) { + pc->emit[0] = 0x20000000; + pc->emit[1] = 0x04000000; + emit_form_ADD(pc, i); + } else { + emit_form_MUL(pc, i); + } + + if (i->src[0]->mod & NV_MOD_NEG) pc->emit[0] |= 1 << 28; + if (i->src[1]->mod & NV_MOD_NEG) pc->emit[0] |= 1 << 22; +} + +static void +emit_add_a16(struct nv_pc *pc, struct nv_instruction *i) +{ + pc->emit[0] = 0xd0000001 | (get_immd_u32(i->src[0]) << 9); + pc->emit[1] = 0x20000000; + + pc->emit[0] |= (DREG(i->def[0])->id + 1) << 2; + + set_pred(pc, i); + + if (i->src[1]) + set_a16_bits(pc, SREG(i->src[1])->id); +} + +static void +emit_flow(struct nv_pc *pc, struct nv_instruction *i, ubyte flow_op) +{ + pc->emit[0] = 0x00000003 | (flow_op << 28); + pc->emit[1] = 0x00000000; + + set_pred(pc, i); + + if (i->target) { + new_fixup(pc, 
NV_FIXUP_CFLOW_RELOC, i->target->bin_pos, 0x7ff800, 11); + pc->emit[0] |= (i->target->bin_pos / 4) << 11; + } +} + +static INLINE void +emit_add(struct nv_pc *pc, struct nv_instruction *i) +{ + if (DFILE(i, 0) == NV_FILE_ADDR) + emit_add_a16(pc, i); + else { + switch (DTYPE(i, 0)) { + case NV_TYPE_F32: + emit_add_f32(pc, i); + break; + case NV_TYPE_U32: + case NV_TYPE_S32: + emit_add_b32(pc, i); + break; + } + } +} + +static void +emit_bitop2(struct nv_pc *pc, struct nv_instruction *i) +{ + pc->emit[0] = 0xd0000000; + + if (SFILE(i, 0) == NV_FILE_IMM) { + emit_form_IMM(pc, i, 0); + + if (i->opcode == NV_OP_OR) + pc->emit[0] |= 0x0100; + else + if (i->opcode == NV_OP_XOR) + pc->emit[0] |= 0x8000; + } else { + emit_form_MAD(pc, i); + + pc->emit[1] |= 0x04000000; + + if (i->opcode == NV_OP_OR) + pc->emit[1] |= 0x4000; + else + if (i->opcode == NV_OP_XOR) + pc->emit[1] |= 0x8000; + } +} + +static void +emit_shift(struct nv_pc *pc, struct nv_instruction *i) +{ + pc->emit[0] = 0x30000001; + pc->emit[1] = 0xc4000000; + + if (i->opcode == NV_OP_SHR) + pc->emit[1] |= 1 << 29; + + if (SFILE(i, 1) == NV_FILE_IMM) { + pc->emit[1] |= 1 << 20; + pc->emit[0] |= (get_immd_u32(i->src[1]) & 0x7f) << 16; + + set_pred(pc, i); + } else + emit_form_MAD(pc, i); + + if (STYPE(i, 0) == NV_TYPE_S32) + pc->emit[1] |= 1 << 27; +} + +static void +emit_flop(struct nv_pc *pc, struct nv_instruction *i) +{ + struct nv_ref *src0 = i->src[0]; + + pc->emit[0] = 0x90000000; + + assert(SREG(src0)->type == NV_TYPE_F32); + assert(SREG(src0)->file == NV_FILE_GPR); + + if (!i->is_long) { + emit_form_MUL(pc, i); + assert(i->opcode == NV_OP_RCP && !src0->mod); + return; + } + + pc->emit[1] = (i->opcode - NV_OP_RCP) << 29; + + emit_form_MAD(pc, i); + + if (src0->mod & NV_MOD_NEG) pc->emit[1] |= 0x04000000; + if (src0->mod & NV_MOD_ABS) pc->emit[1] |= 0x00100000; +} + +static void +emit_mad_f32(struct nv_pc *pc, struct nv_instruction *i) +{ + const boolean neg_mul = (i->src[0]->mod ^ i->src[1]->mod) & 
NV_MOD_NEG; + const boolean neg_add = (i->src[2]->mod & NV_MOD_NEG); + + pc->emit[0] = 0xe0000000; + + if (!i->is_long) { + emit_form_MUL(pc, i); + assert(!neg_mul && !neg_add); + return; + } + + emit_form_MAD(pc, i); + + if (neg_mul) pc->emit[1] |= 0x04000000; + if (neg_add) pc->emit[1] |= 0x08000000; + + if (i->saturate) + pc->emit[1] |= 0x20000000; +} + +static INLINE void +emit_mad(struct nv_pc *pc, struct nv_instruction *i) +{ + emit_mad_f32(pc, i); +} + +static void +emit_mul_f32(struct nv_pc *pc, struct nv_instruction *i) +{ + boolean neg = (i->src[0]->mod ^ i->src[1]->mod) & NV_MOD_NEG; + + pc->emit[0] = 0xc0000000; + + if (SFILE(i, 1) == NV_FILE_IMM) { + emit_form_IMM(pc, i, 0); + + if (neg) + pc->emit[0] |= 0x8000; + } else + if (i->is_long) { + emit_form_MAD(pc, i); + + if (neg) + pc->emit[1] |= 0x08 << 24; + } else { + emit_form_MUL(pc, i); + + if (neg) + pc->emit[0] |= 0x8000; + } +} + +static void +emit_set(struct nv_pc *pc, struct nv_instruction *nvi) +{ + assert(nvi->is_long); + + pc->emit[0] = 0x30000000; + pc->emit[1] = 0x60000000; + + pc->emit[1] |= nvi->set_cond << 14; + + switch (STYPE(nvi, 0)) { + case NV_TYPE_U32: pc->emit[1] |= 0x04000000; break; + case NV_TYPE_S32: pc->emit[1] |= 0x0c000000; break; + case NV_TYPE_F32: pc->emit[0] |= 0x80000000; break; + default: + assert(0); + break; + } + + emit_form_MAD(pc, nvi); +} + +#define CVT_RN (0x00 << 16) +#define CVT_FLOOR (0x02 << 16) +#define CVT_CEIL (0x04 << 16) +#define CVT_TRUNC (0x06 << 16) +#define CVT_SAT (0x08 << 16) +#define CVT_ABS (0x10 << 16) + +#define CVT_X32_X32 0x04004000 +#define CVT_X32_S32 0x04014000 +#define CVT_F32_F32 ((0xc0 << 24) | CVT_X32_X32) +#define CVT_S32_F32 ((0x88 << 24) | CVT_X32_X32) +#define CVT_U32_F32 ((0x80 << 24) | CVT_X32_X32) +#define CVT_F32_S32 ((0x40 << 24) | CVT_X32_S32) +#define CVT_F32_U32 ((0x40 << 24) | CVT_X32_X32) +#define CVT_S32_S32 ((0x08 << 24) | CVT_X32_S32) +#define CVT_S32_U32 ((0x08 << 24) | CVT_X32_X32) +#define CVT_U32_S32 ((0x00 << 
24) | CVT_X32_S32) +#define CVT_U32_U32 ((0x00 << 24) | CVT_X32_X32) + +#define CVT_NEG 0x20000000 +#define CVT_RI 0x08000000 + +static void +emit_cvt(struct nv_pc *pc, struct nv_instruction *nvi) +{ + ubyte dst_type = nvi->def[0] ? DTYPE(nvi, 0) : STYPE(nvi, 0); + + pc->emit[0] = 0xa0000000; + + switch (dst_type) { + case NV_TYPE_F32: + switch (STYPE(nvi, 0)) { + case NV_TYPE_F32: pc->emit[1] = CVT_F32_F32; break; + case NV_TYPE_S32: pc->emit[1] = CVT_F32_S32; break; + case NV_TYPE_U32: pc->emit[1] = CVT_F32_U32; break; + } + break; + case NV_TYPE_S32: + switch (STYPE(nvi, 0)) { + case NV_TYPE_F32: pc->emit[1] = CVT_S32_F32; break; + case NV_TYPE_S32: pc->emit[1] = CVT_S32_S32; break; + case NV_TYPE_U32: pc->emit[1] = CVT_S32_U32; break; + } + break; + case NV_TYPE_U32: + switch (STYPE(nvi, 0)) { + case NV_TYPE_F32: pc->emit[1] = CVT_U32_F32; break; + case NV_TYPE_S32: pc->emit[1] = CVT_U32_S32; break; + case NV_TYPE_U32: pc->emit[1] = CVT_U32_U32; break; + } + break; + } + if (pc->emit[1] == CVT_F32_F32 && + (nvi->opcode == NV_OP_CEIL || nvi->opcode == NV_OP_FLOOR || + nvi->opcode == NV_OP_TRUNC)) + pc->emit[1] |= CVT_RI; + + switch (nvi->opcode) { + case NV_OP_CEIL: pc->emit[1] |= CVT_CEIL; break; + case NV_OP_FLOOR: pc->emit[1] |= CVT_FLOOR; break; + case NV_OP_TRUNC: pc->emit[1] |= CVT_TRUNC; break; + + case NV_OP_ABS: pc->emit[1] |= CVT_ABS; break; + case NV_OP_SAT: pc->emit[1] |= CVT_SAT; break; + case NV_OP_NEG: pc->emit[1] |= CVT_NEG; break; + default: + assert(nvi->opcode == NV_OP_CVT); + break; + } + assert(nvi->opcode != NV_OP_ABS || !(nvi->src[0]->mod & NV_MOD_NEG)); + + if (nvi->src[0]->mod & NV_MOD_NEG) pc->emit[1] ^= CVT_NEG; + if (nvi->src[0]->mod & NV_MOD_ABS) pc->emit[1] |= CVT_ABS; + + emit_form_MAD(pc, nvi); +} + +static void +emit_tex(struct nv_pc *pc, struct nv_instruction *i) +{ + pc->emit[0] = 0xf0000001; + pc->emit[1] = 0x00000000; + + DID(pc, i->def[0], 2); + + set_pred(pc, i); + + pc->emit[0] |= i->tex_t << 9; + pc->emit[0] |= i->tex_s 
<< 17; + + pc->emit[0] |= i->tex_argc << 22; + + pc->emit[0] |= (i->tex_mask & 0x3) << 25; + pc->emit[1] |= (i->tex_mask & 0xc) << 12; + + if (i->tex_live) + pc->emit[1] |= 4; + + if (i->tex_cube) + pc->emit[0] |= 0x08000000; + + if (i->opcode == NV_OP_TXB) + pc->emit[1] |= 0x20000000; + else + if (i->opcode == NV_OP_TXL) + pc->emit[1] |= 0x40000000; + else + pc->emit[0] -= 1 << 22; +} + +static void +emit_cvt2fixed(struct nv_pc *pc, struct nv_instruction *i) +{ + ubyte mod = i->src[0]->mod; + + pc->emit[0] = 0xb0000000; + pc->emit[1] = 0xc0000000; + + if (i->opcode == NV_OP_PREEX2) + pc->emit[1] |= 0x4000; + + emit_form_MAD(pc, i); + + if (mod & NV_MOD_NEG) pc->emit[1] |= 0x04000000; + if (mod & NV_MOD_ABS) pc->emit[1] |= 0x00100000; +} + +static void +emit_ddx(struct nv_pc *pc, struct nv_instruction *i) +{ + assert(i->is_long && SFILE(i, 0) == NV_FILE_GPR); + + pc->emit[0] = (i->src[0]->mod & NV_MOD_NEG) ? 0xc0240001 : 0xc0140001; + pc->emit[1] = (i->src[0]->mod & NV_MOD_NEG) ? 0x86400000 : 0x89800000; + + DID(pc, i->def[0], 2); + SID(pc, i->src[0], 9); + SID(pc, i->src[0], 32 + 14); + + set_pred(pc, i); + set_pred_wr(pc, i); +} + +static void +emit_ddy(struct nv_pc *pc, struct nv_instruction *i) +{ + assert(i->is_long && SFILE(i, 0) == NV_FILE_GPR); + + pc->emit[0] = (i->src[0]->mod & NV_MOD_NEG) ? 0xc0250001 : 0xc0150001; + pc->emit[1] = (i->src[0]->mod & NV_MOD_NEG) ? 
0x85800000 : 0x8a400000; + + DID(pc, i->def[0], 2); + SID(pc, i->src[0], 9); + SID(pc, i->src[0], 32 + 14); + + set_pred(pc, i); + set_pred_wr(pc, i); +} + +void +nv50_emit_instruction(struct nv_pc *pc, struct nv_instruction *i) +{ + // nv_print_instruction(i); + + switch (i->opcode) { + case NV_OP_MOV: + if (DFILE(i, 0) == NV_FILE_ADDR) + emit_add_a16(pc, i); + else + emit_mov(pc, i); + break; + case NV_OP_LDA: + emit_mov(pc, i); + break; + case NV_OP_STA: + emit_st(pc, i); + break; + case NV_OP_LINTERP: + case NV_OP_PINTERP: + emit_interp(pc, i); + break; + case NV_OP_ADD: + emit_add(pc, i); + break; + case NV_OP_AND: + case NV_OP_OR: + case NV_OP_XOR: + emit_bitop2(pc, i); + break; + case NV_OP_CVT: + case NV_OP_ABS: + case NV_OP_NEG: + case NV_OP_SAT: + case NV_OP_CEIL: + case NV_OP_FLOOR: + case NV_OP_TRUNC: + emit_cvt(pc, i); + break; + case NV_OP_DFDX: + emit_ddx(pc, i); + break; + case NV_OP_DFDY: + emit_ddy(pc, i); + break; + case NV_OP_RCP: + case NV_OP_RSQ: + case NV_OP_LG2: + case NV_OP_SIN: + case NV_OP_COS: + case NV_OP_EX2: + emit_flop(pc, i); + break; + case NV_OP_PRESIN: + case NV_OP_PREEX2: + emit_cvt2fixed(pc, i); + break; + case NV_OP_MAD: + emit_mad(pc, i); + break; + case NV_OP_MAX: + case NV_OP_MIN: + emit_minmax(pc, i); + break; + case NV_OP_MUL: + emit_mul_f32(pc, i); + break; + case NV_OP_SET: + emit_set(pc, i); + break; + case NV_OP_SHL: + case NV_OP_SHR: + emit_shift(pc, i); + break; + case NV_OP_TEX: + case NV_OP_TXB: + case NV_OP_TXL: + emit_tex(pc, i); + break; + case NV_OP_KIL: + emit_flow(pc, i, 0x0); + break; + case NV_OP_BRA: + emit_flow(pc, i, 0x1); + break; + case NV_OP_CALL: + emit_flow(pc, i, 0x2); + break; + case NV_OP_RET: + emit_flow(pc, i, 0x3); + break; + case NV_OP_BREAKADDR: + emit_flow(pc, i, 0x4); + break; + case NV_OP_BREAK: + emit_flow(pc, i, 0x5); + break; + case NV_OP_JOINAT: + emit_flow(pc, i, 0xa); + break; + case NV_OP_NOP: + pc->emit[0] = 0xf0000001; + pc->emit[1] = 0xe0000000; + break; + case NV_OP_PHI: + 
case NV_OP_SUB: + NOUVEAU_ERR("operation \"%s\" should have been eliminated\n", + nv_opcode_name(i->opcode)); + break; + default: + NOUVEAU_ERR("unhandled NV_OP: %d\n", i->opcode); + abort(); + break; + } + + assert((pc->emit[0] & 1) == i->is_long); +} diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c new file mode 100644 index 0000000000..0811420e42 --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c @@ -0,0 +1,717 @@ + +#include "nv50_pc.h" + +#define DESCEND_ARBITRARY(j, f) \ +do { \ + b->pass_seq = ctx->pc->pass_seq; \ + \ + for (j = 0; j < 2; ++j) \ + if (b->out[j] && b->out[j]->pass_seq < ctx->pc->pass_seq) \ + f(ctx, b->out[j]); \ +} while (0) + +extern unsigned nv50_inst_min_size(struct nv_instruction *); + +struct nv_pc_pass { + struct nv_pc *pc; +}; + +static INLINE boolean +values_equal(struct nv_value *a, struct nv_value *b) +{ + /* XXX: sizes */ + return (a->reg.file == b->reg.file && a->join->reg.id == b->join->reg.id); +} + +static INLINE boolean +inst_commutation_check(struct nv_instruction *a, + struct nv_instruction *b) +{ + int si, di; + + for (di = 0; di < 4; ++di) { + if (!a->def[di]) + break; + for (si = 0; si < 5; ++si) { + if (!b->src[si]) + continue; + if (values_equal(a->def[di], b->src[si]->value)) + return FALSE; + } + } + + if (b->flags_src && b->flags_src->value == a->flags_def) + return FALSE; + + return TRUE; +} + +/* Check whether we can swap the order of the instructions, + * where a & b may be either the earlier or the later one. 
+ */ +static boolean +inst_commutation_legal(struct nv_instruction *a, + struct nv_instruction *b) +{ + return inst_commutation_check(a, b) && inst_commutation_check(b, a); +} + +static INLINE boolean +inst_cullable(struct nv_instruction *nvi) +{ + return (!(nvi->is_terminator || + nvi->target || + nvi->fixed || + nv_nvi_refcount(nvi))); +} + +static INLINE boolean +nvi_isnop(struct nv_instruction *nvi) +{ + if (nvi->opcode == NV_OP_EXPORT) + return TRUE; + + if (nvi->fixed || + nvi->is_terminator || + nvi->flags_src || + nvi->flags_def) + return FALSE; + + if (nvi->def[0]->join->reg.id < 0) + return TRUE; + + if (nvi->opcode != NV_OP_MOV && nvi->opcode != NV_OP_SELECT) + return FALSE; + + if (nvi->def[0]->reg.file != nvi->src[0]->value->reg.file) + return FALSE; + + if (nvi->src[0]->value->join->reg.id < 0) { + debug_printf("nvi_isnop: orphaned value detected\n"); + return TRUE; + } + + if (nvi->opcode == NV_OP_SELECT) + if (!values_equal(nvi->def[0], nvi->src[1]->value)) + return FALSE; + + return values_equal(nvi->def[0], nvi->src[0]->value); +} + +static void +nv_pc_pass_pre_emission(struct nv_pc *pc, struct nv_basic_block *b) +{ + struct nv_instruction *nvi, *next; + int j; + uint size, n32 = 0; + + b->priv = 0; + + if (pc->num_blocks) + b->bin_pos = pc->bb_list[pc->num_blocks - 1]->bin_pos + + pc->bb_list[pc->num_blocks - 1]->bin_size; + + pc->bb_list[pc->num_blocks++] = b; + + /* visit node */ + + for (nvi = b->entry; nvi; nvi = next) { + next = nvi->next; + if (nvi_isnop(nvi)) + nv_nvi_delete(nvi); + } + + for (nvi = b->entry; nvi; nvi = next) { + next = nvi->next; + + size = nv50_inst_min_size(nvi); + if (nvi->next && size < 8) + ++n32; + else + if ((n32 & 1) && nvi->next && + nv50_inst_min_size(nvi->next) == 4 && + inst_commutation_legal(nvi, nvi->next)) { + ++n32; + debug_printf("permuting: "); + nv_print_instruction(nvi); + nv_print_instruction(nvi->next); + nv_nvi_permute(nvi, nvi->next); + next = nvi; + } else { + nvi->is_long = 1; + + b->bin_size += 
n32 & 1; + if (n32 & 1) + nvi->prev->is_long = 1; + n32 = 0; + } + b->bin_size += 1 + nvi->is_long; + } + + if (!b->entry) { + debug_printf("block %p is now empty\n", b); + } else + if (!b->exit->is_long) { + assert(n32); + b->exit->is_long = 1; + b->bin_size += 1; + + /* might have del'd a hole tail of instructions */ + if (!b->exit->prev->is_long && !(n32 & 1)) { + b->bin_size += 1; + b->exit->prev->is_long = 1; + } + } + assert(!b->exit || b->exit->is_long); + + pc->bin_size += b->bin_size *= 4; + + /* descend CFG */ + + if (!b->out[0]) + return; + if (!b->out[1] && ++(b->out[0]->priv) != b->out[0]->num_in) + return; + +#if 0 + /* delete ELSE branch */ + if (b->entry && + b->entry->opcode == NV_OP_BRA && b->entry->target == b->out[0]) { + nv_nvi_delete(b->entry); + b->bin_size -= 2; + pc->bin_size -= 8; + } +#endif + for (j = 0; j < 2; ++j) + if (b->out[j] && b->out[j] != b) + nv_pc_pass_pre_emission(pc, b->out[j]); +} + +int +nv_pc_exec_pass2(struct nv_pc *pc) +{ + debug_printf("preparing %u blocks for emission\n", pc->num_blocks); + + pc->bb_list = CALLOC(pc->num_blocks, sizeof(struct nv_basic_block *)); + + pc->num_blocks = 0; + nv_pc_pass_pre_emission(pc, pc->root); + + return 0; +} + +static INLINE boolean +is_cmem_load(struct nv_instruction *nvi) +{ + return (nvi->opcode == NV_OP_LDA && + nvi->src[0]->value->reg.file >= NV_FILE_MEM_C(0) && + nvi->src[0]->value->reg.file <= NV_FILE_MEM_C(15)); +} + +static INLINE boolean +is_smem_load(struct nv_instruction *nvi) +{ + return (nvi->opcode == NV_OP_LDA && + (nvi->src[0]->value->reg.file == NV_FILE_MEM_S || + nvi->src[0]->value->reg.file <= NV_FILE_MEM_P)); +} + +static INLINE boolean +is_immd_move(struct nv_instruction *nvi) +{ + return (nvi->opcode == NV_OP_MOV && + nvi->src[0]->value->reg.file == NV_FILE_IMM); +} + +static INLINE void +check_swap_src_0_1(struct nv_instruction *nvi) +{ + static const ubyte cc_swapped[8] = { 0, 4, 2, 6, 1, 5, 3, 7 }; + + struct nv_ref *src0 = nvi->src[0], *src1 = nvi->src[1]; 
+ + if (!nv_op_commutative(nvi->opcode)) + return; + assert(src0 && src1); + + if (is_cmem_load(src0->value->insn)) { + if (!is_cmem_load(src1->value->insn)) { + nvi->src[0] = src1; + nvi->src[1] = src0; + /* debug_printf("swapping cmem load to 1\n"); */ + } + } else + if (is_smem_load(src1->value->insn)) { + if (!is_smem_load(src0->value->insn)) { + nvi->src[0] = src1; + nvi->src[1] = src0; + /* debug_printf("swapping smem load to 0\n"); */ + } + } + + if (nvi->opcode == NV_OP_SET && nvi->src[0] != src0) + nvi->set_cond = cc_swapped[nvi->set_cond]; +} + +struct nv_pass { + struct nv_pc *pc; + int n; + void *priv; +}; + +static int +nv_pass_fold_stores(struct nv_pass *ctx, struct nv_basic_block *b) +{ + struct nv_instruction *nvi, *sti; + int j; + + for (sti = b->entry; sti; sti = sti->next) { + if (!sti->def[0]) + continue; + + if (sti->def[0]->reg.file != NV_FILE_OUT) + continue; + if (sti->opcode != NV_OP_MOV && sti->opcode != NV_OP_STA) + continue; + + nvi = sti->src[0]->value->insn; + if (!nvi || nvi->opcode == NV_OP_PHI) + continue; + assert(nvi->def[0] == sti->src[0]->value); + + if (nvi->def[0]->refc > 1) + continue; + + nvi->def[0] = sti->def[0]; + nvi->fixed = 1; + sti->fixed = 0; + } + DESCEND_ARBITRARY(j, nv_pass_fold_stores); + + return 0; +} + +static int +nv_pass_fold_loads(struct nv_pass *ctx, struct nv_basic_block *b) +{ + struct nv_instruction *nvi, *ld; + int j; + + for (nvi = b->entry; nvi; nvi = nvi->next) { + check_swap_src_0_1(nvi); + + for (j = 0; j < 3; ++j) { + if (!nvi->src[j]) + break; + ld = nvi->src[j]->value->insn; + if (!ld) + continue; + + if (is_immd_move(ld) && nv50_nvi_can_use_imm(nvi, j)) { + nv_reference(ctx->pc, &nvi->src[j], ld->src[0]->value); + debug_printf("folded immediate %i\n", ld->def[0]->n); + continue; + } + + if (ld->opcode != NV_OP_LDA) + continue; + if (!nv50_nvi_can_load(nvi, j, ld->src[0]->value)) + continue; + + if (j == 0 && ld->src[4]) /* can't load shared mem */ + continue; + + /* fold it ! 
*/ /* XXX: ref->insn */ + nv_reference(ctx->pc, &nvi->src[j], ld->src[0]->value); + if (ld->src[4]) + nv_reference(ctx->pc, &nvi->src[4], ld->src[4]->value); + } + } + DESCEND_ARBITRARY(j, nv_pass_fold_loads); + + return 0; +} + +static int +nv_pass_lower_mods(struct nv_pass *ctx, struct nv_basic_block *b) +{ + int j; + struct nv_instruction *nvi, *mi, *next; + ubyte mod; + + for (nvi = b->entry; nvi; nvi = next) { + next = nvi->next; + if (nvi->opcode == NV_OP_SUB) { + nvi->opcode = NV_OP_ADD; + nvi->src[1]->mod ^= NV_MOD_NEG; + } + + /* should not put any modifiers on NEG and ABS */ + assert(nvi->opcode != NV_MOD_NEG || !nvi->src[0]->mod); + assert(nvi->opcode != NV_MOD_ABS || !nvi->src[0]->mod); + + for (j = 0; j < 4; ++j) { + if (!nvi->src[j]) + break; + + mi = nvi->src[j]->value->insn; + if (!mi) + continue; + if (mi->def[0]->refc > 1) + continue; + + if (mi->opcode == NV_OP_NEG) mod = NV_MOD_NEG; + else + if (mi->opcode == NV_OP_ABS) mod = NV_MOD_ABS; + else + continue; + + if (nvi->opcode == NV_OP_ABS) + mod &= ~(NV_MOD_NEG | NV_MOD_ABS); + else + if (nvi->opcode == NV_OP_NEG && mod == NV_MOD_NEG) { + nvi->opcode = NV_OP_MOV; + mod = 0; + } + + if (!(nv50_supported_src_mods(nvi->opcode, j) & mod)) + continue; + + nv_reference(ctx->pc, &nvi->src[j], mi->src[0]->value); + + nvi->src[j]->mod ^= mod; + } + + if (nvi->opcode == NV_OP_SAT) { + mi = nvi->src[0]->value->insn; + + if ((mi->opcode == NV_OP_MAD) && !mi->flags_def) { + mi->saturate = 1; + mi->def[0] = nvi->def[0]; + nv_nvi_delete(nvi); + } + } + } + DESCEND_ARBITRARY(j, nv_pass_lower_mods); + + return 0; +} + +#define SRC_IS_MUL(s) ((s)->insn && (s)->insn->opcode == NV_OP_MUL) + +static struct nv_value * +find_immediate(struct nv_ref *ref) +{ + struct nv_value *src; + + if (!ref) + return NULL; + + src = ref->value; + while (src->insn && src->insn->opcode == NV_OP_MOV) { + assert(!src->insn->src[0]->mod); + src = src->insn->src[0]->value; + } + return (src->reg.file == NV_FILE_IMM) ? 
src : NULL; +} + +static void +constant_operand(struct nv_pc *pc, + struct nv_instruction *nvi, struct nv_value *val, int s) +{ + int t = s ? 0 : 1; + ubyte type; + + if (!nvi->def[0]) + return; + type = nvi->def[0]->reg.type; + + switch (nvi->opcode) { + case NV_OP_MUL: + if ((type == NV_TYPE_F32 && val->reg.imm.f32 == 1.0f) || + (NV_TYPE_ISINT(type) && val->reg.imm.u32 == 1)) { + nvi->opcode = NV_OP_MOV; + nv_reference(pc, &nvi->src[s], NULL); + if (!s) { + nvi->src[0] = nvi->src[1]; + nvi->src[1] = NULL; + } + } else + if ((type == NV_TYPE_F32 && val->reg.imm.f32 == 2.0f) || + (NV_TYPE_ISINT(type) && val->reg.imm.u32 == 2)) { + nvi->opcode = NV_OP_ADD; + nv_reference(pc, &nvi->src[s], NULL); + if (!s) { + nvi->src[0] = nvi->src[1]; + nvi->src[1] = NULL; + } + } else + if (type == NV_TYPE_F32 && val->reg.imm.f32 == -1.0f) { + nvi->opcode = NV_OP_NEG; + nv_reference(pc, &nvi->src[s], NULL); + nvi->src[0] = nvi->src[t]; + nvi->src[1] = NULL; + } else + if (type == NV_TYPE_F32 && val->reg.imm.f32 == -2.0f) { + nvi->opcode = NV_OP_ADD; + assert(!nvi->src[s]->mod); + nv_reference(pc, &nvi->src[s], nvi->src[t]->value); + nvi->src[t]->mod ^= NV_MOD_NEG; + nvi->src[s]->mod |= NV_MOD_NEG; + } else + if (val->reg.imm.u32 == 0) { + nvi->opcode = NV_OP_MOV; + nv_reference(pc, &nvi->src[t], NULL); + if (s) { + nvi->src[0] = nvi->src[1]; + nvi->src[1] = NULL; + } + } + break; + case NV_OP_ADD: + if (val->reg.imm.u32 == 0) { + nvi->opcode = NV_OP_MOV; + nv_reference(pc, &nvi->src[s], NULL); + nvi->src[0] = nvi->src[t]; + nvi->src[1] = NULL; + } + break; + default: + break; + } +} + +static int +nv_pass_lower_arith(struct nv_pass *ctx, struct nv_basic_block *b) +{ + struct nv_instruction *nvi, *next; + int j; + + for (nvi = b->entry; nvi; nvi = next) { + struct nv_value *src0, *src1, *src; + int mod; + + next = nvi->next; + + if ((src = find_immediate(nvi->src[0])) != NULL) + constant_operand(ctx->pc, nvi, src, 0); + else + if ((src = find_immediate(nvi->src[1])) != NULL) + 
constant_operand(ctx->pc, nvi, src, 1); + + /* try to combine MUL, ADD into MAD */ + if (nvi->opcode != NV_OP_ADD) + continue; + + src0 = nvi->src[0]->value; + src1 = nvi->src[1]->value; + + if (SRC_IS_MUL(src0) && src0->refc == 1) + src = src0; + else + if (SRC_IS_MUL(src1) && src1->refc == 1) + src = src1; + else + continue; + + nvi->opcode = NV_OP_MAD; + mod = nvi->src[(src == src0) ? 0 : 1]->mod; + nv_reference(ctx->pc, &nvi->src[(src == src0) ? 0 : 1], NULL); + nvi->src[2] = nvi->src[(src == src0) ? 1 : 0]; + + assert(!(mod & ~NV_MOD_NEG)); + nvi->src[0] = new_ref(ctx->pc, src->insn->src[0]->value); + nvi->src[1] = new_ref(ctx->pc, src->insn->src[1]->value); + nvi->src[0]->mod = src->insn->src[0]->mod ^ mod; + nvi->src[1]->mod = src->insn->src[1]->mod; + } + DESCEND_ARBITRARY(j, nv_pass_lower_arith); + + return 0; +} + +/* +set $r2 g f32 $r2 $r3 +cvt abs rn f32 $r2 s32 $r2 +cvt f32 $c0 # f32 $r2 +e $c0 bra 0x80 +*/ +#if 0 +static int +nv_pass_lower_cond(struct nv_pass *ctx, struct nv_basic_block *b) +{ + /* XXX: easier in IR builder for now */ + return 0; +} +#endif + +/* TODO: reload elimination, redundant store elimination */ + +struct nv_pass_reldelim { + struct nv_pc *pc; +}; + +static int +nv_pass_reload_elim(struct nv_pass_reldelim *ctx, struct nv_basic_block *b) +{ + int j; + struct nv_instruction *ld, *next; + + for (ld = b->entry; ld; ld = next) { + next = ld->next; + + if (ld->opcode == NV_OP_LINTERP || ld->opcode == NV_OP_PINTERP) { + + } else + if (ld->opcode == NV_OP_LDA) { + + } else + if (ld->opcode == NV_OP_MOV) { + + } + } + DESCEND_ARBITRARY(j, nv_pass_reload_elim); + + return 0; +} + +static int +nv_pass_tex_mask(struct nv_pass *ctx, struct nv_basic_block *b) +{ + int i, c, j; + + for (i = 0; i < ctx->pc->num_instructions; ++i) { + struct nv_instruction *nvi = &ctx->pc->instructions[i]; + struct nv_value *def[4]; + + if (!nv_is_vector_op(nvi->opcode)) + continue; + nvi->tex_mask = 0; + + for (c = 0; c < 4; ++c) { + if (nvi->def[c]->refc) + 
nvi->tex_mask |= 1 << c; + def[c] = nvi->def[c]; + } + + j = 0; + for (c = 0; c < 4; ++c) + if (nvi->tex_mask & (1 << c)) + nvi->def[j++] = def[c]; + for (c = 0; c < 4; ++c) + if (!(nvi->tex_mask & (1 << c))) + nvi->def[j++] = def[c]; + assert(j == 4); + } + return 0; +} + +struct nv_pass_dce { + struct nv_pc *pc; + uint removed; +}; + +static int +nv_pass_dce(struct nv_pass_dce *ctx, struct nv_basic_block *b) +{ + int j; + struct nv_instruction *nvi, *next; + + for (nvi = b->entry; nvi; nvi = next) { + next = nvi->next; + + if (inst_cullable(nvi)) { + nv_nvi_delete(nvi); + + ++ctx->removed; + } + } + DESCEND_ARBITRARY(j, nv_pass_dce); + + return 0; +} + +static INLINE boolean +bb_simple_if_endif(struct nv_basic_block *bb) +{ + return (bb->out[0] && bb->out[1] && + bb->out[0]->out[0] == bb->out[1] && + !bb->out[0]->out[1]); +} + +static int +nv_pass_flatten(struct nv_pass *ctx, struct nv_basic_block *b) +{ + int j; + + if (bb_simple_if_endif(b)) { + ++ctx->n; + debug_printf("nv_pass_flatten: total IF/ENDIF constructs: %i\n", ctx->n); + } + DESCEND_ARBITRARY(j, nv_pass_flatten); + + return 0; +} + +int +nv_pc_exec_pass0(struct nv_pc *pc) +{ + struct nv_pass_reldelim *reldelim; + struct nv_pass pass; + struct nv_pass_dce dce; + int ret; + + reldelim = CALLOC_STRUCT(nv_pass_reldelim); + reldelim->pc = pc; + + ret = nv_pass_reload_elim(reldelim, pc->root); + + FREE(reldelim); + if (ret) + return ret; + + pass.pc = pc; + + pc->pass_seq++; + ret = nv_pass_flatten(&pass, pc->root); + if (ret) + return ret; + + /* Do this first, so we don't have to pay attention + * to whether sources are supported memory loads. 
/* Printable names for source modifier values, indexed by the modifier value
 * after nv_modifier_string() clamps it to 9.  Index 9 ("BAD_MOD") is the
 * catch-all for anything larger.
 *
 * The original initializer lacked the comma after "not neg", so the
 * compiler concatenated it with "not abs" into a single string; every
 * following entry was shifted down by one and the table had only 9
 * elements, making the clamped index 9 an out-of-bounds read.
 */
static const char *nv_modifier_strings[] =
{
   "",
   "neg",
   "abs",
   "neg abs",
   "not",
   "not neg",
   "not abs",
   "not neg abs",
   "sat",
   "BAD_MOD"
};
TRUE : FALSE; +} + +static INLINE void +nv_print_address(const char c, int buf, struct nv_value *a, int offset) +{ + if (buf >= 0) + PRINT(" %s%c%i[", cyan, c, buf); + else + PRINT(" %s%c[", cyan, c); + if (a) + PRINT("%s$a%i%s+", mgta, nv_value_id(a), cyan); + PRINT("%s0x%x%s]", orng, offset, cyan); +} + +static INLINE void +nv_print_cond(struct nv_instruction *nvi) +{ + PRINT("%s%s%s$c%i ", + gree, nv_cond_name(nvi->cc), + mgta, nv_value_id(nvi->flags_src->value)); +} + +static INLINE void +nv_print_value(struct nv_value *value, struct nv_value *ind, ubyte type) +{ + char reg_pfx = '$'; + + if (type == NV_TYPE_ANY) + type = value->reg.type; + + if (value->reg.file != NV_FILE_FLAGS) + PRINT(" %s%s", gree, nv_type_name(type)); + + if (!nv_value_allocated(value)) + reg_pfx = '%'; + + switch (value->reg.file) { + case NV_FILE_GPR: + PRINT(" %s%cr%i", blue, reg_pfx, nv_value_id(value)); + break; + case NV_FILE_OUT: + PRINT(" %s%co%i", mgta, reg_pfx, nv_value_id(value)); + break; + case NV_FILE_ADDR: + PRINT(" %s%ca%i", mgta, reg_pfx, nv_value_id(value)); + break; + case NV_FILE_FLAGS: + PRINT(" %s%cc%i", mgta, reg_pfx, nv_value_id(value)); + break; + case NV_FILE_MEM_S: + nv_print_address('s', -1, ind, 4 * nv_value_id(value)); + break; + case NV_FILE_MEM_P: + nv_print_address('p', -1, ind, 4 * nv_value_id(value)); + break; + case NV_FILE_MEM_V: + nv_print_address('v', -1, ind, 4 * nv_value_id(value)); + break; + case NV_FILE_IMM: + switch (type) { + case NV_TYPE_U16: + case NV_TYPE_S16: + PRINT(" %s0x%04x", orng, value->reg.imm.u32); + break; + case NV_TYPE_F32: + PRINT(" %s%f", orng, value->reg.imm.f32); + break; + case NV_TYPE_F64: + PRINT(" %s%f", orng, value->reg.imm.f64); + break; + case NV_TYPE_U32: + case NV_TYPE_S32: + case NV_TYPE_P32: + PRINT(" %s0x%08x", orng, value->reg.imm.u32); + break; + } + break; + default: + if (value->reg.file >= NV_FILE_MEM_G(0) && + value->reg.file <= NV_FILE_MEM_G(15)) + nv_print_address('g', value->reg.file - NV_FILE_MEM_G(0), 
ind, + nv_value_id(value) * 4); + else + if (value->reg.file >= NV_FILE_MEM_C(0) && + value->reg.file <= NV_FILE_MEM_C(15)) + nv_print_address('c', value->reg.file - NV_FILE_MEM_C(0), ind, + nv_value_id(value) * 4); + else + NOUVEAU_ERR(" BAD_FILE[%i]", nv_value_id(value)); + break; + } +} + +static INLINE void +nv_print_ref(struct nv_ref *ref, struct nv_value *ind) +{ + nv_print_value(ref->value, ind, ref->typecast); +} + +void +nv_print_instruction(struct nv_instruction *i) +{ + int j; + + if (i->flags_src) + nv_print_cond(i); + + PRINT("%s", gree); + if (i->opcode == NV_OP_SET) + PRINT("set %s", nv_cond_name(i->set_cond)); + else + if (i->saturate) + PRINT("sat %s", nv_opcode_name(i->opcode)); + else + PRINT("%s", nv_opcode_name(i->opcode)); + + if (i->flags_def) + nv_print_value(i->flags_def, NULL, NV_TYPE_ANY); + + /* Only STORE & STA can write to MEM, and they do not def + * anything, so the address is thus part of the source. + */ + if (i->def[0]) + nv_print_value(i->def[0], NULL, NV_TYPE_ANY); + else + PRINT(" #"); + + for (j = 0; j < 4; ++j) { + if (!i->src[j]) + continue; + + if (i->src[j]->mod) + PRINT(" %s", nv_modifier_string(i->src[j]->mod)); + + nv_print_ref(i->src[j], + (j == nv50_indirect_opnd(i)) ? + i->src[4]->value : NULL); + } + if (!i->is_long) + PRINT(" %ss", norm); + PRINT("\n"); +} diff --git a/src/gallium/drivers/nv50/nv50_pc_regalloc.c b/src/gallium/drivers/nv50/nv50_pc_regalloc.c new file mode 100644 index 0000000000..eb446d641a --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_pc_regalloc.c @@ -0,0 +1,973 @@ +/* + * XXX: phi function live intervals start at first ordinary instruction, + * add_range should be taking care of that already ... + * + * XXX: TEX must choose TEX's def as representative + * + * XXX: Aieee! Must materialize MOVs if source is in other basic block! + * -- absolutely, or we cannot execute the MOV conditionally at all + * XXX: Aieee! Must include PHIs in LVA so we pull through liveness if + * PHI source is e.g. 
in dominator block.
+ *      -- seems we lose liveness somehow, track that
+ */
+
+#include "nv50_context.h"
+#include "nv50_pc.h"
+
+#include "util/u_simple_list.h"
+
+#define NUM_REGISTER_FILES 4
+
+/* Occupancy bitmap of the allocatable register files.
+ * bits[f] holds one bit per allocation unit of file f (set = in use),
+ * last[f] is the highest unit index reg_assign() will hand out.
+ */
+struct register_set {
+   struct nv_pc *pc;
+
+   uint32_t last[NUM_REGISTER_FILES];
+   uint32_t bits[NUM_REGISTER_FILES][8];
+};
+
+/* State shared by the register allocation passes: the program, and the
+ * instructions in the order established by pass_order_instructions().
+ */
+struct nv_pc_pass {
+   struct nv_pc *pc;
+
+   struct nv_instruction **insns;
+   int num_insns;
+
+   uint pass_seq;
+};
+
+/* Merge range with all following ranges it touches or overlaps, freeing
+ * the absorbed list nodes.
+ */
+static void
+ranges_coalesce(struct nv_range *range)
+{
+   while (range->next && range->end >= range->next->bgn) {
+      struct nv_range *rnn = range->next->next;
+      assert(range->bgn <= range->next->bgn);
+      range->end = MAX2(range->end, range->next->end);
+      FREE(range->next);
+      range->next = rnn;
+   }
+}
+
+/* Add [bgn, end) to the sorted live interval list of val.
+ *
+ * If the interval overlaps an existing range, that range is grown and TRUE
+ * is returned; new_range is NOT consumed in that case and stays owned by the
+ * caller. Otherwise a node is linked in at the right position and FALSE is
+ * returned; the node is new_range if given, else a freshly allocated one.
+ */
+static boolean
+add_range_ex(struct nv_value *val, int bgn, int end, struct nv_range *new_range)
+{
+   struct nv_range *range, **nextp = &val->livei;
+
+   for (range = val->livei; range; range = range->next) {
+      if (end < range->bgn)
+         break; /* insert before */
+
+      if (bgn > range->end) {
+         nextp = &range->next;
+         continue; /* insert after */
+      }
+
+      /* overlap */
+      if (bgn < range->bgn) {
+         range->bgn = bgn;
+         if (end > range->end)
+            range->end = end;
+         ranges_coalesce(range);
+         return TRUE;
+      }
+      if (end > range->end) {
+         range->end = end;
+         ranges_coalesce(range);
+         return TRUE;
+      }
+      assert(bgn >= range->bgn);
+      assert(end <= range->end);
+      return TRUE;
+   }
+
+   if (!new_range)
+      new_range = CALLOC_STRUCT(nv_range);
+
+   new_range->bgn = bgn;
+   new_range->end = end;
+   new_range->next = range;
+   *(nextp) = new_range;
+   return FALSE;
+}
+
+/* Record that val is live in block b up to (excluding) serial number end.
+ * The range starts at the defining instruction if that lies inside b,
+ * otherwise at the first instruction of b. Values without a defining
+ * instruction are ignored.
+ */
+static void
+add_range(struct nv_value *val, struct nv_basic_block *b, int end)
+{
+   int bgn;
+
+   if (!val->insn) /* ignore non-def values */
+      return;
+   assert(b->entry->serial <= b->exit->serial);
+   assert(b->phi->serial <= end);
+   assert(b->exit->serial + 1 >= end);
+
+   bgn = val->insn->serial;
+   if (bgn < b->entry->serial || bgn > b->exit->serial)
+      bgn = b->entry->serial;
+   // debug_printf("add_range(value %i): [%i, %i)\n", val->n, bgn, end);
+
+   if (bgn > end) {
+      debug_printf("Aieee! BLOCK [%i, %i], RANGE [%i, %i)\n",
+                   b->entry->serial, b->exit->serial, bgn, end);
+   }
+   assert(bgn <= end);
+
+   if (bgn < val->insn->serial)
+      debug_printf("WARNING: leaking value %i ?\n", val->n);
+
+   add_range_ex(val, bgn, end, NULL);
+}
+
+#ifdef NV50_RA_DEBUG_JOIN
+/* Debug helper: dump all live ranges of a. */
+static void
+livei_print(struct nv_value *a)
+{
+   struct nv_range *r = a->livei;
+
+   debug_printf("livei %i: ", a->n);
+   while (r) {
+      debug_printf("[%i, %i) ", r->bgn, r->end);
+      r = r->next;
+   }
+   debug_printf("\n");
+}
+#endif
+
+/* Move all live ranges of src into dst; src ends up with an empty list.
+ * Nodes that could be linked into dst are reused, nodes that merely grew
+ * an existing range of dst are freed.
+ */
+static void
+livei_unify(struct nv_value *dst, struct nv_value *src)
+{
+   struct nv_range *range, *next;
+
+   /* Unifying a value with itself must be a no-op: every range overlaps
+    * itself, so the loop below would free nodes that are still linked in
+    * dst's list (and then walk them), and finally drop the whole list.
+    * do_join_values() can get here with a->join == b->join.
+    */
+   if (dst == src)
+      return;
+
+   for (range = src->livei; range; range = next) {
+      next = range->next;
+      if (add_range_ex(dst, range->bgn, range->end, range))
+         FREE(range);
+   }
+   src->livei = NULL;
+}
+
+/* Free all live ranges of val and leave it with an empty list. */
+static void
+livei_release(struct nv_value *val)
+{
+   struct nv_range *range, *next;
+
+   for (range = val->livei; range; range = next) {
+      next = range->next;
+      FREE(range);
+   }
+   /* don't leave a dangling pointer behind, the value outlives this pass */
+   val->livei = NULL;
+}
+
+/* Return TRUE if any range of a intersects any range of b
+ * (ranges are half-open, touching ranges do not overlap).
+ */
+static boolean
+livei_have_overlap(struct nv_value *a, struct nv_value *b)
+{
+   struct nv_range *r_a, *r_b;
+
+   for (r_a = a->livei; r_a; r_a = r_a->next) {
+      for (r_b = b->livei; r_b; r_b = r_b->next) {
+         if (r_b->bgn < r_a->end &&
+             r_b->end > r_a->bgn)
+            return TRUE;
+      }
+   }
+   return FALSE;
+}
+
+/* Return the end of the last live range of a; a must have at least one. */
+static int
+livei_end(struct nv_value *a)
+{
+   struct nv_range *r = a->livei;
+
+   assert(r);
+   while (r->next)
+      r = r->next;
+   return r->end;
+}
+
+/* Return TRUE if position pos lies inside one of a's (sorted) ranges. */
+static boolean
+livei_contains(struct nv_value *a, int pos)
+{
+   struct nv_range *r;
+
+   for (r = a->livei; r && r->bgn <= pos; r = r->next)
+      if (r->end > pos)
+         return TRUE;
+   return FALSE;
+}
+
+/* Find a free, aligned run of units for the n values def[0..n-1] in the
+ * register file of def[0], mark it used in set and store the resulting
+ * register ids in every def[i] that has a live interval.
+ * Returns FALSE if no suitable run exists at or below set->last[f].
+ */
+static boolean
+reg_assign(struct register_set *set, struct nv_value **def, int n)
+{
+   int i, id, s;
+   uint m;
+   int f = def[0]->reg.file;
+
+   /* s: number of units needed, m: mask of that many bits */
+   s = n << (nv_type_order(def[0]->reg.type) - 1);
+   m = (1 << s) - 1;
+
+   /* if every word is full, id stays here and the check below fails */
+   id = set->last[f];
+
+   for (i = 0; i * 32 < set->last[f]; ++i) {
+      if (set->bits[f][i] == 0xffffffff)
+         continue;
+
+      for (id = 0; id < 32; id += s)
+         if (!(set->bits[f][i] & (m << id)))
+            break;
+      if (id < 32)
+         break;
+   }
+   if (i * 32 + id > set->last[f])
+      return FALSE;
+
+   set->bits[f][i] |= m << id;
+
+   id += i * 32;
+
+   set->pc->max_reg[f] = MAX2(set->pc->max_reg[f], id + s - 1);
+
+   /* convert the unit index back to a register id of this type */
+   id >>= nv_type_order(def[0]->reg.type) - 1;
+
+   for (i = 0; i < n; ++i)
+      if (def[i]->livei)
+         def[i]->reg.id = id++;
+
+   return TRUE;
+}
+
+/* Mark the units of val's register as used in set; no-op if val has no
+ * register yet (negative id). Also raises pc->max_reg for the file.
+ */
+static INLINE void
+reg_occupy(struct register_set *set, struct nv_value *val)
+{
+   int s, id = val->reg.id, f = val->reg.file;
+   uint m;
+
+   if (id < 0)
+      return;
+   s = nv_type_order(val->reg.type) - 1;
+   id <<= s;
+   m = (1 << (1 << s)) - 1;
+
+   set->bits[f][id / 32] |= m << (id % 32);
+
+   if (set->pc->max_reg[f] < id)
+      set->pc->max_reg[f] = id;
+}
+
+/* Mark the units of val's register as free in set; no-op if val has no
+ * register yet (negative id).
+ */
+static INLINE void
+reg_release(struct register_set *set, struct nv_value *val)
+{
+   int s, id = val->reg.id, f = val->reg.file;
+   uint m;
+
+   if (id < 0)
+      return;
+
+   s = nv_type_order(val->reg.type) - 1;
+   id <<= s;
+   m = (1 << (1 << s)) - 1;
+
+   set->bits[f][id / 32] &= ~(m << (id % 32));
+}
+
+/* Check whether a and b may share a register.
+ * They must be in the same file and have the same type size. If their
+ * representatives already have the same register id they may; if both
+ * have different assigned ids they may not. Otherwise no other group of
+ * values using the assigned register may overlap the unassigned one.
+ */
+static INLINE boolean
+join_allowed(struct nv_pc_pass *ctx, struct nv_value *a, struct nv_value *b)
+{
+   int i;
+   struct nv_value *val;
+
+   if (a->reg.file != b->reg.file ||
+       nv_type_sizeof(a->reg.type) != nv_type_sizeof(b->reg.type))
+      return FALSE;
+
+   if (a->join->reg.id == b->join->reg.id)
+      return TRUE;
+
+#if 1
+   /* either a or b or both have been assigned */
+
+   if (a->join->reg.id >= 0 && b->join->reg.id >= 0)
+      return FALSE;
+   else
+   if (b->join->reg.id >= 0) {
+      if (a->join->reg.id >= 0)
+         return FALSE;
+      /* make a the one that already has a register */
+      val = a;
+      a = b;
+      b = val;
+   }
+
+   for (i = 0; i < ctx->pc->num_values; ++i) {
+      val = &ctx->pc->values[i];
+
+      if (val->join->reg.id != a->join->reg.id)
+         continue;
+      if (val->join != a->join && livei_have_overlap(val->join, b->join))
+         return FALSE;
+   }
+   return TRUE;
+#endif
+   return FALSE;
+}
+
+/* Unconditionally merge the group of b into the group of a: a->join
+ * inherits b's register (if assigned) and live ranges, and every value
+ * that had b's representative now gets a->join.
+ */
+static INLINE void
+do_join_values(struct nv_pc_pass *ctx, struct nv_value *a, struct nv_value *b)
+{
+   int j;
+   struct nv_value *bjoin = b->join;
+
+   if (b->join->reg.id >= 0)
+      a->join->reg.id = b->join->reg.id;
+
+   livei_unify(a->join, b->join);
+
+#ifdef NV50_RA_DEBUG_JOIN
+   debug_printf("joining %i to %i\n", b->n, a->n);
+#endif
+
+   /* make a->join the new representative */
+   for (j = 0; j < ctx->pc->num_values; ++j)
+      if (ctx->pc->values[j].join == bjoin)
+         ctx->pc->values[j].join = a->join;
+
+   assert(b->join == a->join);
+}
+
+/* Merge b into a only if join_allowed() agrees and the live intervals of
+ * the two representatives do not overlap.
+ */
+static INLINE void
+try_join_values(struct nv_pc_pass *ctx, struct nv_value *a, struct nv_value *b)
+{
+   if (!join_allowed(ctx, a, b)) {
+#ifdef NV50_RA_DEBUG_JOIN
+      debug_printf("cannot join %i to %i: not allowed\n", b->n, a->n);
+#endif
+      return;
+   }
+   if (livei_have_overlap(a->join, b->join)) {
+#ifdef NV50_RA_DEBUG_JOIN
+      debug_printf("cannot join %i to %i: livei overlap\n", b->n, a->n);
+      livei_print(a);
+      livei_print(b);
+#endif
+      return;
+   }
+
+   do_join_values(ctx, a, b);
+}
+
+/* For each operand of each phi in b, generate a new value by inserting a MOV
+ * at the end of the block it is coming from and replace the operand with it.
+ * This eliminates liveness conflicts.
+ */
+static int
+pass_generate_phi_movs(struct nv_pc_pass *ctx, struct nv_basic_block *b)
+{
+   struct nv_instruction *i, *i2;
+   struct nv_basic_block *p, *pn;
+   struct nv_value *val;
+   int n, j;
+
+   b->pass_seq = ctx->pc->pass_seq; /* mark b visited for the recursion below */
+
+   for (n = 0; n < b->num_in; ++n) {
+      p = b->in[n];
+      assert(p);
+
+      if (b->num_in > 1 && p->out[0] && p->out[1]) { /* if without else */
+         /* splice a new block pn into the edge p -> b to hold the MOVs */
+         pn = new_basic_block(ctx->pc);
+
+         if (p->out[0] == b)
+            p->out[0] = pn;
+         else
+            p->out[1] = pn;
+
+         if (p->exit->target == b) /* target to new else-block */
+            p->exit->target = pn;
+
+         for (j = 0; j < b->num_in; ++j) {
+            if (b->in[j] == p) {
+               b->in[j] = pn;
+               break;
+            }
+         }
+
+         pn->out[0] = b;
+         pn->in[0] = p;
+         pn->num_in = 1;
+      } else
+         pn = p;
+
+      ctx->pc->current_block = pn; /* presumably where new_instruction() appends -- confirm */
+
+      /* every block with PHIs will also have other operations */
+      for (i = b->phi; i && i->opcode == NV_OP_PHI; i = i->next) {
+         /* find the phi operand that is defined in predecessor p */
+         for (j = 0; j < 4; ++j) {
+            if (!i->src[j])
+               j = 3; /* no more operands: the loop's ++j makes j == 4 */
+            else
+            if (i->src[j]->value->insn->bb == p)
+               break;
+         }
+         if (j >= 4)
+            continue;
+         assert(i->src[j]);
+         val = i->src[j]->value;
+
+         /* XXX: should probably not insert this after terminator */
+         i2 = new_instruction(ctx->pc, NV_OP_MOV);
+
+         i2->def[0] = new_value(ctx->pc, val->reg.file, val->reg.type);
+         i2->src[0] = new_ref (ctx->pc, val);
+         i2->def[0]->insn = i2;
+
+         nv_reference(ctx->pc, &i->src[j], i2->def[0]); /* the phi now reads the copy */
+      }
+      if (pn != p && pn->exit) {
+         /* XXX: this branch should probably be eliminated */
+         ctx->pc->current_block = b->in[n ? 0 : 1];
+         i2 = new_instruction(ctx->pc, NV_OP_BRA);
+         i2->target = b;
+         i2->is_terminator = 1;
+      }
+   }
+
+   if (b->out[0] && b->out[0]->pass_seq < ctx->pc->pass_seq) {
+      pass_generate_phi_movs(ctx, b->out[0]);
+   }
+
+   if (b->out[1] && b->out[1]->pass_seq < ctx->pc->pass_seq) {
+      pass_generate_phi_movs(ctx, b->out[1]);
+   }
+
+   return 0;
+}
+/*
+ * Coalesce values that should share a register, walking ctx->insns.
+ * With iter == 0 only the texture ops are handled: def[c] is joined with
+ * src[c] unconditionally. With iter != 0 PHI and MOV operands are joined
+ * where try_join_values() permits, and SELECT's def is joined with both of
+ * its sources. Always returns 0.
+ */
+static int
+pass_join_values(struct nv_pc_pass *ctx, int iter)
+{
+   int c, n;
+
+   for (n = 0; n < ctx->num_insns; ++n) {
+      struct nv_instruction *i = ctx->insns[n];
+
+      switch (i->opcode) {
+      case NV_OP_PHI:
+         if (!iter)
+            continue;
+         try_join_values(ctx, i->src[0]->value, i->src[1]->value);
+         try_join_values(ctx, i->def[0], i->src[0]->value);
+         break;
+      case NV_OP_MOV:
+         /* only if the source has a defining insn and its group is not
+          * represented by the result of a vector op */
+         if (iter && i->src[0]->value->insn &&
+             !nv_is_vector_op(i->src[0]->value->join->insn->opcode))
+            try_join_values(ctx, i->def[0], i->src[0]->value);
+         break;
+      case NV_OP_SELECT:
+         if (!iter)
+            break;
+         assert(join_allowed(ctx, i->def[0], i->src[0]->value));
+         assert(join_allowed(ctx, i->def[0], i->src[1]->value));
+         do_join_values(ctx, i->def[0], i->src[0]->value);
+         do_join_values(ctx, i->def[0], i->src[1]->value);
+         break;
+      case NV_OP_TEX:
+      case NV_OP_TXB:
+      case NV_OP_TXL:
+      case NV_OP_TXQ:
+         if (iter)
+            break;
+         for (c = 0; c < 4; ++c) {
+            if (!i->src[c])
+               break;
+            do_join_values(ctx, i->def[c], i->src[c]->value);
+         }
+         break;
+      default:
+         break;
+      }
+   }
+   return 0;
+}
+/*
+ * Number the instructions of b (phis included) consecutively via ->serial,
+ * record them in ctx->insns and recurse into the successors.
+ * ctx->insns must have room for every instruction reached; no bound is
+ * checked here. Always returns 0.
+ */
+static int
+pass_order_instructions(struct nv_pc_pass *ctx, struct nv_basic_block *b)
+{
+   struct nv_instruction *i;
+
+   b->priv = 0;
+
+   assert(!b->exit || !b->exit->next);
+   for (i = b->phi; i; i = i->next) {
+      i->serial = ctx->num_insns;
+      ctx->insns[ctx->num_insns++] = i;
+   }
+
+   b->pass_seq = ctx->pc->pass_seq;
+
+   if (!b->out[0])
+      return 0;
+   /* single successor: count this visit in its priv and descend only once
+    * the count equals its number of incoming edges */
+   if (!b->out[1] && ++(b->out[0]->priv) != b->out[0]->num_in)
+      return 0;
+
+   if (b->out[0] != b)
+      pass_order_instructions(ctx, b->out[0]);
+   if (b->out[1] && b->out[1] != b)
+      pass_order_instructions(ctx, b->out[1]);
+
+   return 0;
+}
+
+/* Debug helper: when NV50_RA_DEBUG_LIVE_SETS is defined, print the number of
+ * every value in b's live set that has a defining instruction; otherwise
+ * this is an empty function.
+ */
+static void
+bb_live_set_print(struct nv_pc *pc, struct nv_basic_block *b)
+{
+#ifdef NV50_RA_DEBUG_LIVE_SETS
+   int j;
+   struct nv_value *val;
+
+   debug_printf("live_set of %p: ", b);
+
+   for (j = 0; j < pc->num_values; ++j) {
+      if (!(b->live_set[j / 32] & (1u << (j % 32))))
+         continue;
+      val = &pc->values[j];
+      if (!val->insn)
+         continue;
+      debug_printf("%i ", val->n);
+   }
+   debug_printf("\n");
+#endif
+}
+
+/* Add val to the live set of b (one bit per value, indexed by val->n).
+ * The mask is built from 1u because shifting a signed 1 by 31 is undefined.
+ */
+static INLINE void
+live_set_add(struct nv_basic_block *b, struct nv_value *val)
+{
+   if (!val->insn) /* don't add non-def values */
+      return;
+   /* debug_printf("live[%p] <- %i\n", b, val->n); */
+
+   b->live_set[val->n / 32] |= 1u << (val->n % 32);
+}
+
+/* Remove val from the live set of b. */
+static INLINE void
+live_set_rem(struct nv_basic_block *b, struct nv_value *val)
+{
+   /* if (val->insn)
+      debug_printf("live[%p] -> %i\n", b, val->n); */
+   b->live_set[val->n / 32] &= ~(1u << (val->n % 32));
+}
+
+/* Return TRUE if the value referenced by ref is in the live set of b.
+ *
+ * The masked word is turned into TRUE/FALSE explicitly: returning it
+ * directly would truncate it to the width of boolean, and if boolean is
+ * narrower than 32 bits (Gallium defines it as unsigned char -- confirm
+ * against p_compiler.h) bits 8..31 would all read as "not live".
+ */
+static INLINE boolean
+live_set_test(struct nv_basic_block *b, struct nv_ref *ref)
+{
+   const int n = ref->value->n;
+
+   return (b->live_set[n / 32] & (1u << (n % 32))) ? TRUE : FALSE;
+}
+
+/* check if bf (future) can be reached from bp (past)
+ *
+ * The search follows the out edges of bp and does not continue past bt;
+ * only direct self-loops are skipped.
+ */
+static boolean
+bb_reachable_by(struct nv_basic_block *bf, struct nv_basic_block *bp,
+                struct nv_basic_block *bt)
+{
+   if (bf == bp)
+      return TRUE;
+   if (bp == bt)
+      return FALSE;
+
+   if (bp->out[0] && bp->out[0] != bp &&
+       bb_reachable_by(bf, bp->out[0], bt))
+      return TRUE;
+   if (bp->out[1] && bp->out[1] != bp &&
+       bb_reachable_by(bf, bp->out[1], bt))
+      return TRUE;
+   return FALSE;
+}
+
+/* The live set of a block contains those values that are live immediately
+ * before the beginning of the block.
+ */
+static int
+pass_build_live_sets(struct nv_pc_pass *ctx, struct nv_basic_block *b)
+{
+   struct nv_instruction *i;
+   int j, n, ret = 0;
+
+   /* slight hack for undecidedness: set phi = entry if it's undefined */
+   if (!b->phi)
+      b->phi = b->entry;
+
+   /* start from the union of the successors' live sets */
+   for (n = 0; n < 2; ++n) {
+      if (!b->out[n] || b->out[n] == b)
+         continue;
+      ret = pass_build_live_sets(ctx, b->out[n]);
+      if (ret)
+         return ret;
+
+      if (n == 0) {
+         for (j = 0; j < (ctx->pc->num_values + 31) / 32; ++j)
+            b->live_set[j] = b->out[n]->live_set[j];
+      } else {
+         for (j = 0; j < (ctx->pc->num_values + 31) / 32; ++j)
+            b->live_set[j] |= b->out[n]->live_set[j];
+      }
+
+      /* Kick values out of our live set that are created in incoming
+       * blocks of our successors that are not us.
+       */
+      for (i = b->out[n]->phi; i && i->opcode == NV_OP_PHI; i = i->next) {
+         for (j = 0; j < 4; ++j) {
+            if (!i->src[j])
+               break;
+            assert(i->src[j]->value->insn);
+
+            if (bb_reachable_by(b, i->src[j]->value->insn->bb, b->out[n])) {
+               live_set_add(b, i->src[j]->value);
+               debug_printf("%p: live set + %i\n", b, i->src[j]->value->n);
+            } else {
+               live_set_rem(b, i->src[j]->value);
+               debug_printf("%p: live set - %i\n", b, i->src[j]->value->n);
+            }
+         }
+      }
+   }
+
+   /* the instructions of a block are only processed once per pass */
+   if (b->pass_seq >= ctx->pc->pass_seq)
+      return 0;
+   b->pass_seq = ctx->pc->pass_seq;
+
+   debug_printf("%s: visiting block %p\n", __FUNCTION__, b);
+
+   if (!b->entry)
+      return 0;
+   bb_live_set_print(ctx->pc, b);
+
+   /* walk backwards: a definition kills the value, a use makes it live */
+   for (i = b->exit; i; i = i->prev) {
+      for (j = 0; j < 4; j++) {
+         if (!i->def[j])
+            break;
+         live_set_rem(b, i->def[j]);
+      }
+      for (j = 0; j < 4; j++) {
+         if (!i->src[j])
+            break;
+         live_set_add(b, i->src[j]->value);
+      }
+      if (i->src[4])
+         live_set_add(b, i->src[4]->value);
+      if (i->flags_def)
+         live_set_rem(b, i->flags_def);
+      if (i->flags_src)
+         live_set_add(b, i->flags_src->value);
+   }
+   bb_live_set_print(ctx->pc, b);
+
+   return 0;
+}
+
+/* Set the live set of b (n 32-bit words) to the union of the live sets of
+ * its successors, or to empty if it has none.
+ */
+static void collect_live_values(struct nv_basic_block *b, const int n)
+{
+   int i;
+
+   if (b->out[0]) {
+      if (b->out[1]) { /* what to do about back-edges ? */
+         for (i = 0; i < n; ++i)
+            b->live_set[i] = b->out[0]->live_set[i] | b->out[1]->live_set[i];
+      } else {
+         memcpy(b->live_set, b->out[0]->live_set, n * sizeof(uint32_t));
+      }
+   } else
+   if (b->out[1]) {
+      memcpy(b->live_set, b->out[1]->live_set, n * sizeof(uint32_t));
+   } else {
+      memset(b->live_set, 0, n * sizeof(uint32_t));
+   }
+}
+
+/* Build the live intervals of all values for block b and, recursively, for
+ * its not yet visited successors. Requires the serial numbers assigned by
+ * pass_order_instructions(). Always returns 0.
+ *
+ * NOTE: the live intervals of phi functions start at the first non-phi instruction
+ */
+static int
+pass_build_intervals(struct nv_pc_pass *ctx, struct nv_basic_block *b)
+{
+   struct nv_instruction *i, *i_stop;
+   int j, s;
+   const int n = (ctx->pc->num_values + 31) / 32;
+
+   debug_printf("building intervals for BB %i\n", b->id);
+
+   /* verify that first block does not have live-in values */
+   if (b->num_in == 0)
+      for (j = 0; j < n; ++j)
+         assert(b->live_set[j] == 0);
+
+   collect_live_values(b, n);
+
+   /* remove live-outs def'd in a parallel block, hopefully they're all phi'd */
+   for (j = 0; j < 2; ++j) {
+      if (!b->out[j] || !b->out[j]->phi)
+         continue;
+      /* stop at the end of the list too, like the other phi loops do */
+      for (i = b->out[j]->phi; i && i->opcode == NV_OP_PHI; i = i->next) {
+         live_set_rem(b, i->def[0]);
+
+         for (s = 0; s < 4; ++s) {
+            if (!i->src[s])
+               break;
+            assert(i->src[s]->value->insn);
+            if (bb_reachable_by(b, i->src[s]->value->insn->bb, b->out[j]))
+               live_set_add(b, i->src[s]->value);
+            else
+               live_set_rem(b, i->src[s]->value);
+         }
+      }
+   }
+
+   /* remaining live-outs are live until the end */
+   for (j = 0; j < ctx->pc->num_values; ++j) {
+      if (!(b->live_set[j / 32] & (1u << (j % 32))))
+         continue;
+#ifdef NV50_RA_DEBUG_LIVEI
+      debug_printf("adding range for live value %i\n", j);
+#endif
+      add_range(&ctx->pc->values[j], b, b->exit->serial + 1);
+   }
+   debug_printf("%s: looping through instructions now\n", __func__);
+
+   i_stop = b->entry ? b->entry->prev : NULL;
+
+   /* don't have to include phi functions here (will have 0 live range) */
+   for (i = b->exit; i != i_stop; i = i->prev) {
+      assert(i->serial >= b->phi->serial && i->serial <= b->exit->serial);
+      for (j = 0; j < 4; ++j) {
+         if (i->def[j])
+            live_set_rem(b, i->def[j]);
+      }
+      if (i->flags_def)
+         live_set_rem(b, i->flags_def);
+
+      /* a source that is not live below this point has its last use here */
+      for (j = 0; j < 5; ++j) {
+         if (i->src[j] && !live_set_test(b, i->src[j])) {
+            live_set_add(b, i->src[j]->value);
+#ifdef NV50_RA_DEBUG_LIVEI
+            debug_printf("adding range for source that ends living: %i\n",
+                         i->src[j]->value->n);
+#endif
+            add_range(i->src[j]->value, b, i->serial);
+         }
+      }
+      if (i->flags_src && !live_set_test(b, i->flags_src)) {
+         live_set_add(b, i->flags_src->value);
+#ifdef NV50_RA_DEBUG_LIVEI
+         debug_printf("adding range for source that ends living: %i\n",
+                      i->flags_src->value->n);
+#endif
+         add_range(i->flags_src->value, b, i->serial);
+      }
+   }
+
+   b->pass_seq = ctx->pc->pass_seq;
+
+   if (b->out[0] && b->out[0]->pass_seq < ctx->pc->pass_seq)
+      pass_build_intervals(ctx, b->out[0]);
+
+   if (b->out[1] && b->out[1]->pass_seq < ctx->pc->pass_seq)
+      pass_build_intervals(ctx, b->out[1]);
+
+   debug_printf("built intervals for block %p\n", b);
+
+   return 0;
+}
+
+/* Initialize set as completely free and set the per-file upper limits. */
+static INLINE void
+nv50_ctor_register_set(struct nv_pc *pc, struct register_set *set)
+{
+   memset(set, 0, sizeof(*set));
+
+   set->last[NV_FILE_GPR] = 255;
+   set->last[NV_FILE_OUT] = 127;
+   set->last[NV_FILE_FLAGS] = 4;
+   set->last[NV_FILE_ADDR] = 4;
+
+   set->pc = pc;
+}
+
+/* Insert nval into the circular list headed by list, searching from the
+ * tail, so that the list stays sorted by the start of the first live range.
+ * nval must have a live interval.
+ */
+static void
+insert_ordered_tail(struct nv_value *list, struct nv_value *nval)
+{
+   struct nv_value *elem = list->prev;
+
+   // debug_printf("inserting value %i\n", nval->n);
+
+   for (elem = list->prev;
+        elem != list && elem->livei->bgn > nval->livei->bgn;
+        elem = elem->prev);
+   /* now elem begins before or at the same time as val */
+
+   nval->prev = elem;
+   nval->next = elem->next;
+   elem->next->prev = nval;
+   elem->next = nval;
+}
+
+/* Linear scan register allocation over all defined values that have a live
+ * interval. With iter == 0 only the results of vector ops get registers,
+ * with iter != 0 all remaining values do as well. Aborts the process when
+ * no register can be found; otherwise returns 0.
+ */
+static int
+pass_linear_scan(struct nv_pc_pass *ctx, int iter)
+{
+   struct nv_instruction *i;
+   struct register_set f, free;
+   int k, n;
+   struct nv_value *cur, *val, *tmp[2];
+   struct nv_value active, inactive, handled, unhandled;
+
+   make_empty_list(&active);
+   make_empty_list(&inactive);
+   make_empty_list(&handled);
+   make_empty_list(&unhandled);
+
+   nv50_ctor_register_set(ctx->pc, &free);
+
+   /* joined values should have range = NULL and thus not be added;
+    * also, fixed memory values won't be added because they're not
+    * def'd, just used
+    */
+   for (n = 0; n < ctx->num_insns; ++n) {
+      i = ctx->insns[n];
+
+      for (k = 0; k < 4; ++k) {
+         if (i->def[k] && i->def[k]->livei)
+            insert_ordered_tail(&unhandled, i->def[k]);
+         else
+         if (0 && i->def[k])
+            debug_printf("skipping def'd value %i: no livei\n", i->def[k]->n);
+      }
+      if (i->flags_def && i->flags_def->livei)
+         insert_ordered_tail(&unhandled, i->flags_def);
+   }
+
+   for (val = unhandled.next; val != unhandled.prev; val = val->next) {
+      assert(val->join == val);
+      assert(val->livei->bgn <= val->next->livei->bgn);
+   }
+
+   foreach_s(cur, tmp[0], &unhandled) {
+      remove_from_list(cur);
+
+      /* debug_printf("handling value %i\n", cur->n); */
+
+      /* retire or suspend active values at the start of cur */
+      foreach_s(val, tmp[1], &active) {
+         if (livei_end(val) <= cur->livei->bgn) {
+            reg_release(&free, val);
+            move_to_head(&handled, val);
+         } else
+         if (!livei_contains(val, cur->livei->bgn)) {
+            reg_release(&free, val);
+            move_to_head(&inactive, val);
+         }
+      }
+
+      /* retire or reactivate inactive values */
+      foreach_s(val, tmp[1], &inactive) {
+         if (livei_end(val) <= cur->livei->bgn)
+            move_to_head(&handled, val);
+         else
+         if (livei_contains(val, cur->livei->bgn)) {
+            reg_occupy(&free, val);
+            move_to_head(&active, val);
+         }
+      }
+
+      /* f: registers usable for cur, i.e. free minus everything that
+       * overlaps cur later on */
+      f = free;
+
+      foreach(val, &inactive)
+         if (livei_have_overlap(val, cur))
+            reg_occupy(&f, val);
+
+      foreach(val, &unhandled)
+         if (val->reg.id >= 0 && livei_have_overlap(val, cur))
+            reg_occupy(&f, val);
+
+      if (cur->reg.id < 0) {
+         boolean mem = FALSE;
+
+         if (nv_is_vector_op(cur->insn->opcode))
+            mem = !reg_assign(&f, &cur->insn->def[0], 4);
+         else
+         if (iter)
+            mem = !reg_assign(&f, &cur, 1);
+
+         if (mem) {
+            NOUVEAU_ERR("out of registers\n");
+            abort();
+         }
+      }
+      insert_at_head(&active, cur);
+      reg_occupy(&free, cur);
+   }
+
+   return 0;
+}
+
+/* Placeholder, does nothing yet. */
+static int
+pass_eliminate_moves(struct nv_pc_pass *ctx)
+{
+   return 0;
+}
+
+/* Register allocation entry point: insert phi MOVs, compute live sets and
+ * live intervals, coalesce values and run the linear scan allocator.
+ * Returns 0 on success, -1 if memory could not be allocated, or the error
+ * code of the failing pass.
+ */
+int
+nv_pc_exec_pass1(struct nv_pc *pc)
+{
+   struct nv_pc_pass *ctx;
+   int i, ret;
+
+   debug_printf("REGISTER ALLOCATION - entering\n");
+
+   ctx = CALLOC_STRUCT(nv_pc_pass);
+   if (!ctx)
+      return -1;
+   ctx->pc = pc;
+
+   nv_print_program(ctx->pc->root);
+
+   pc->pass_seq++;
+   ret = pass_generate_phi_movs(ctx, pc->root);
+   assert(!ret);
+
+   /* Size the instruction array only now: pass_generate_phi_movs() adds
+    * MOV/BRA instructions, and pass_order_instructions() stores every
+    * instruction it reaches (presumably new_instruction() keeps
+    * pc->num_instructions up to date -- confirm). None of the passes run
+    * so far touch ctx->insns.
+    */
+   ctx->insns = CALLOC(MAX2(pc->num_instructions, 1),
+                       sizeof(struct nv_instruction *));
+   if (!ctx->insns) {
+      ret = -1;
+      goto out;
+   }
+
+   nv_print_program(ctx->pc->root);
+
+   for (i = 0; i < pc->loop_nesting_bound; ++i) {
+      pc->pass_seq++;
+      ret = pass_build_live_sets(ctx, pc->root);
+      assert(!ret && "live sets");
+      if (ret) {
+         NOUVEAU_ERR("failed to build live sets (iteration %d)\n", i);
+         goto out;
+      }
+   }
+
+   pc->pass_seq++;
+   ret = pass_order_instructions(ctx, pc->root);
+   assert(!ret && "order instructions");
+   if (ret)
+      goto out;
+
+   pc->pass_seq++;
+   ret = pass_build_intervals(ctx, pc->root);
+   assert(!ret && "build intervals");
+   if (ret) {
+      NOUVEAU_ERR("failed to build live intervals\n");
+      goto out;
+   }
+
+   for (i = 0; i < 2; ++i) {
+      ret = pass_join_values(ctx, i);
+      if (ret)
+         goto out;
+      ret = pass_linear_scan(ctx, i);
+      if (ret)
+         goto out;
+   }
+   assert(!ret && "joining");
+
+   ret = pass_eliminate_moves(ctx);
+
+   for (i = 0; i < pc->num_values; ++i)
+      livei_release(&pc->values[i]);
+
+   debug_printf("REGISTER ALLOCATION - leaving\n");
+   nv_print_program(ctx->pc->root);
+
+out:
+   /* the instruction array belongs to this pass, don't leak it */
+   if (ctx->insns)
+      FREE(ctx->insns);
+   FREE(ctx);
+   return ret;
+}
diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c
index 8cb1639013..26d1be8db8 100644
--- a/src/gallium/drivers/nv50/nv50_program.c
+++ b/src/gallium/drivers/nv50/nv50_program.c
@@ -1,5 +1,5 @@
 /*
- * Copyright 2008 Ben
Skeggs + * Copyright 2010 Chrsitoph Bumiller * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -20,4674 +20,553 @@ * SOFTWARE. */ -#include "pipe/p_context.h" -#include "pipe/p_defines.h" -#include "pipe/p_state.h" -#include "util/u_inlines.h" +#include "nv50_program.h" +#include "nv50_pc.h" +#include "nv50_context.h" #include "pipe/p_shader_tokens.h" #include "tgsi/tgsi_parse.h" #include "tgsi/tgsi_util.h" -#include "nv50_context.h" -#include "nv50_transfer.h" - -#define NV50_SU_MAX_TEMP 127 -#define NV50_SU_MAX_ADDR 4 -//#define NV50_PROGRAM_DUMP - -/* $a5 and $a6 always seem to be 0, and using $a7 gives you noise */ - -/* ARL - gallium craps itself on progs/vp/arl.txt - * - * MSB - Like MAD, but MUL+SUB - * - Fuck it off, introduce a way to negate args for ops that - * support it. - * - * Look into inlining IMMD for ops other than MOV (make it general?) - * - Maybe even relax restrictions a bit, can't do P_RESULT + P_IMMD, - * but can emit to P_TEMP first - then MOV later. NVIDIA does this - * - * In ops such as ADD it's possible to construct a bad opcode in the !is_long() - * case, if the emit_src() causes the inst to suddenly become long. - * - * Verify half-insns work where expected - and force disable them where they - * don't work - MUL has it forcibly disabled atm as it fixes POW.. - * - * FUCK! watch dst==src vectors, can overwrite components that are needed. - * ie. SUB R0, R0.yzxw, R0 - * - * Things to check with renouveau: - * FP attr/result assignment - how? 
- * attrib - * - 0x16bc maps vp output onto fp hpos - * - 0x16c0 maps vp output onto fp col0 - * result - * - colr always 0-3 - * - depr always 4 - * 0x16bc->0x16e8 --> some binding between vp/fp regs - * 0x16b8 --> VP output count - * - * 0x1298 --> "MOV rcol.x, fcol.y" "MOV depr, fcol.y" = 0x00000005 - * "MOV rcol.x, fcol.y" = 0x00000004 - * 0x19a8 --> as above but 0x00000100 and 0x00000000 - * - 0x00100000 used when KIL used - * 0x196c --> as above but 0x00000011 and 0x00000000 - * - * 0x1988 --> 0xXXNNNNNN - * - XX == FP high something - */ -struct nv50_reg { - enum { - P_TEMP, - P_ATTR, - P_RESULT, - P_CONST, - P_IMMD, - P_ADDR - } type; - int index; - - int hw; - int mod; - - int rhw; /* result hw for FP outputs, or interpolant index */ - int acc; /* instruction where this reg is last read (first insn == 1) */ - - int vtx; /* vertex index, for GP inputs (TGSI Dimension.Index) */ - int indirect[2]; /* index into pc->addr, or -1 */ - - ubyte buf_index; /* c{0 .. 15}[] or g{0 .. 15}[] */ -}; - -#define NV50_MOD_NEG 1 -#define NV50_MOD_ABS 2 -#define NV50_MOD_NEG_ABS (NV50_MOD_NEG | NV50_MOD_ABS) -#define NV50_MOD_SAT 4 -#define NV50_MOD_I32 8 - -/* NV50_MOD_I32 is used to indicate integer mode for neg/abs */ - -/* STACK: Conditionals and loops have to use the (per warp) stack. - * Stack entries consist of an entry type (divergent path, join at), - * a mask indicating the active threads of the warp, and an address. - * MPs can store 12 stack entries internally, if we need more (and - * we probably do), we have to create a stack buffer in VRAM. 
- */ -/* impose low limits for now */ -#define NV50_MAX_COND_NESTING 4 -#define NV50_MAX_LOOP_NESTING 3 - -#define JOIN_ON(e) e; pc->p->exec_tail->inst[1] |= 2 - -struct nv50_pc { - struct nv50_program *p; - - /* hw resources */ - struct nv50_reg *r_temp[NV50_SU_MAX_TEMP]; - struct nv50_reg r_addr[NV50_SU_MAX_ADDR]; - - /* tgsi resources */ - struct nv50_reg *temp; - int temp_nr; - struct nv50_reg *attr; - int attr_nr; - struct nv50_reg *result; - int result_nr; - struct nv50_reg *param; - int param_nr; - struct nv50_reg *immd; - uint32_t *immd_buf; - int immd_nr; - struct nv50_reg **addr; - int addr_nr; - struct nv50_reg *sysval; - int sysval_nr; - - struct nv50_reg *temp_temp[16]; - struct nv50_program_exec *temp_temp_exec[16]; - unsigned temp_temp_nr; - - /* broadcast and destination replacement regs */ - struct nv50_reg *r_brdc; - struct nv50_reg *r_dst[4]; - - struct nv50_reg reg_instances[16]; - unsigned reg_instance_nr; - - unsigned interp_mode[32]; - /* perspective interpolation registers */ - struct nv50_reg *iv_p; - struct nv50_reg *iv_c; - - struct nv50_program_exec *if_insn[NV50_MAX_COND_NESTING]; - struct nv50_program_exec *if_join[NV50_MAX_COND_NESTING]; - struct nv50_program_exec *loop_brka[NV50_MAX_LOOP_NESTING]; - int if_lvl, loop_lvl; - unsigned loop_pos[NV50_MAX_LOOP_NESTING]; - - unsigned *insn_pos; /* actual program offset of each TGSI insn */ - boolean in_subroutine; - - /* current instruction and total number of insns */ - unsigned insn_cur; - unsigned insn_nr; - - boolean allow32; - - uint8_t edgeflag_out; -}; - -static struct nv50_reg *get_address_reg(struct nv50_pc *, struct nv50_reg *); - -static INLINE void -ctor_reg(struct nv50_reg *reg, unsigned type, int index, int hw) -{ - reg->type = type; - reg->index = index; - reg->hw = hw; - reg->mod = 0; - reg->rhw = -1; - reg->vtx = -1; - reg->acc = 0; - reg->indirect[0] = reg->indirect[1] = -1; - reg->buf_index = (type == P_CONST) ? 
1 : 0; -} - static INLINE unsigned -popcnt4(uint32_t val) -{ - static const unsigned cnt[16] - = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 }; - return cnt[val & 0xf]; -} - -static void -terminate_mbb(struct nv50_pc *pc) -{ - int i; - - /* remove records of temporary address register values */ - for (i = 0; i < NV50_SU_MAX_ADDR; ++i) - if (pc->r_addr[i].index < 0) - pc->r_addr[i].acc = 0; -} - -static void -alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg) -{ - int i = 0; - - if (reg->type == P_RESULT) { - if (pc->p->cfg.high_result < (reg->hw + 1)) - pc->p->cfg.high_result = reg->hw + 1; - } - - if (reg->type != P_TEMP) - return; - - if (reg->hw >= 0) { - /*XXX: do this here too to catch FP temp-as-attr usage.. - * not clean, but works */ - if (pc->p->cfg.high_temp < (reg->hw + 1)) - pc->p->cfg.high_temp = reg->hw + 1; - return; - } - - if (reg->rhw != -1) { - /* try to allocate temporary with index rhw first */ - if (!(pc->r_temp[reg->rhw])) { - pc->r_temp[reg->rhw] = reg; - reg->hw = reg->rhw; - if (pc->p->cfg.high_temp < (reg->rhw + 1)) - pc->p->cfg.high_temp = reg->rhw + 1; - return; - } - /* make sure we don't get things like $r0 needs to go - * in $r1 and $r1 in $r0 - */ - i = pc->result_nr * 4; - } - - for (; i < NV50_SU_MAX_TEMP; i++) { - if (!(pc->r_temp[i])) { - pc->r_temp[i] = reg; - reg->hw = i; - if (pc->p->cfg.high_temp < (i + 1)) - pc->p->cfg.high_temp = i + 1; - return; - } - } - - NOUVEAU_ERR("out of registers\n"); - abort(); -} - -static INLINE struct nv50_reg * -reg_instance(struct nv50_pc *pc, struct nv50_reg *reg) -{ - struct nv50_reg *ri; - - assert(pc->reg_instance_nr < 16); - ri = &pc->reg_instances[pc->reg_instance_nr++]; - if (reg) { - alloc_reg(pc, reg); - *ri = *reg; - reg->indirect[0] = reg->indirect[1] = -1; - reg->mod = 0; - } - return ri; -} - -/* XXX: For shaders that aren't executed linearly (e.g. 
shaders that - * contain loops), we need to assign all hw regs to TGSI TEMPs early, - * lest we risk temp_temps overwriting regs alloc'd "later". - */ -static struct nv50_reg * -alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst) -{ - struct nv50_reg *r; - int i; - - if (dst && dst->type == P_TEMP && dst->hw == -1) - return dst; - - for (i = 0; i < NV50_SU_MAX_TEMP; i++) { - if (!pc->r_temp[i]) { - r = MALLOC_STRUCT(nv50_reg); - ctor_reg(r, P_TEMP, -1, i); - pc->r_temp[i] = r; - return r; - } - } - - NOUVEAU_ERR("out of registers\n"); - abort(); - return NULL; -} - -/* release the hardware resource held by r */ -static void -release_hw(struct nv50_pc *pc, struct nv50_reg *r) +bitcount4(const uint32_t val) { - assert(r->type == P_TEMP); - if (r->hw == -1) - return; - - assert(pc->r_temp[r->hw] == r); - pc->r_temp[r->hw] = NULL; - - r->acc = 0; - if (r->index == -1) - FREE(r); -} - -static void -free_temp(struct nv50_pc *pc, struct nv50_reg *r) -{ - if (r->index == -1) { - unsigned hw = r->hw; - - FREE(pc->r_temp[hw]); - pc->r_temp[hw] = NULL; - } + static const unsigned cnt[16] + = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 }; + return cnt[val & 0xf]; } -static int -alloc_temp4(struct nv50_pc *pc, struct nv50_reg *dst[4], int idx) -{ - int i; - - if ((idx + 4) >= NV50_SU_MAX_TEMP) - return 1; +static unsigned +nv50_tgsi_src_mask(const struct tgsi_full_instruction *inst, int c) +{ + unsigned mask = inst->Dst[0].Register.WriteMask; + + switch (inst->Instruction.Opcode) { + case TGSI_OPCODE_COS: + case TGSI_OPCODE_SIN: + return (mask & 0x8) | ((mask & 0x7) ? 0x1 : 0x0); + case TGSI_OPCODE_DP3: + return 0x7; + case TGSI_OPCODE_DP4: + case TGSI_OPCODE_DPH: + case TGSI_OPCODE_KIL: /* WriteMask ignored */ + return 0xf; + case TGSI_OPCODE_DST: + return mask & (c ? 
0xa : 0x6);
+   case TGSI_OPCODE_EX2:
+   case TGSI_OPCODE_EXP:
+   case TGSI_OPCODE_LG2:
+   case TGSI_OPCODE_LOG:
+   case TGSI_OPCODE_POW:
+   case TGSI_OPCODE_RCP:
+   case TGSI_OPCODE_RSQ:
+   case TGSI_OPCODE_SCS:
+      return 0x1;
+   case TGSI_OPCODE_IF:
+      return 0x1;
+   case TGSI_OPCODE_LIT:
+      return 0xb;
+   case TGSI_OPCODE_TEX:
+   case TGSI_OPCODE_TXB:
+   case TGSI_OPCODE_TXL:
+   case TGSI_OPCODE_TXP:
+   {
+      const struct tgsi_instruction_texture *tex;
+
+      assert(inst->Instruction.Texture);
+      tex = &inst->Texture;
+
+      mask = 0x7;
+      if (inst->Instruction.Opcode != TGSI_OPCODE_TEX &&
+          inst->Instruction.Opcode != TGSI_OPCODE_TXD)
+         mask |= 0x8; /* bias, lod or proj */
+
+      switch (tex->Texture) {
+      case TGSI_TEXTURE_1D:
+         mask &= 0x9;
+         break;
+      case TGSI_TEXTURE_SHADOW1D:
+         mask &= 0x5;
+         break;
+      case TGSI_TEXTURE_2D:
+         mask &= 0xb;
+         break;
+      default:
+         break;
+      }
+   }
+      return mask;
+   case TGSI_OPCODE_XPD:
+   {
+      unsigned x = 0;
+      if (mask & 1) x |= 0x6;
+      if (mask & 2) x |= 0x5;
+      if (mask & 4) x |= 0x3;
+      return x;
+   }
+   default:
+      break;
+   }
+
+   return mask;
+}
+
+/* Store 'id' in the access entry of every component of every input slot
+ * and flag the translation as using indirectly addressed inputs.
+ */
+static void
+nv50_indirect_inputs(struct nv50_translation_info *ti, int id)
+{
+   int i, c;
+
+   for (i = 0; i < PIPE_MAX_SHADER_INPUTS; ++i)
+      for (c = 0; c < 4; ++c)
+         ti->input_access[i][c] = id;
+
+   ti->indirect_inputs = TRUE;
+}
+
+/* Store 'id' in the access entry of every component of every output slot
+ * and flag the translation as using indirectly addressed outputs.
+ */
+static void
+nv50_indirect_outputs(struct nv50_translation_info *ti, int id)
+{
+   int i, c;
+
+   for (i = 0; i < PIPE_MAX_SHADER_OUTPUTS; ++i)
+      for (c = 0; c < 4; ++c)
+         ti->output_access[i][c] = id;
+
+   ti->indirect_outputs = TRUE;
+}
+
+/* Update the input/output access tables for one TGSI instruction.
+ *
+ * ti:   translation state; output_access, input_access and p->vp.edgeflag
+ *       are written here
+ * inst: the instruction to inspect (only Dst[0] is examined for outputs)
+ * id:   value stored in the access tables for every component touched
+ *       (presumably the instruction's index -- confirm against the caller)
+ */
+static void
+prog_inst(struct nv50_translation_info *ti,
+          const struct tgsi_full_instruction *inst, int id)
+{
+   const struct tgsi_dst_register *dst;
+   const struct tgsi_src_register *src;
+   int s, c, k;
+   unsigned mask;
+
+   if (inst->Dst[0].Register.File == TGSI_FILE_OUTPUT) {
+      dst = &inst->Dst[0].Register;
+
+      /* an indirect write can land in any output slot: flag them all,
+       * once, before recording the directly addressed components
+       */
+      if (dst->Indirect)
+         nv50_indirect_outputs(ti, id);
+
+      for (c = 0; c < 4; ++c) {
+         if (!(dst->WriteMask & (1 << c)))
+            continue;
+         ti->output_access[dst->Index][c] = id;
+      }
+
+      /* a MOV from an input into the edge flag output: remember which
+       * input register supplies it
+       */
+      if (inst->Instruction.Opcode == TGSI_OPCODE_MOV &&
+          inst->Src[0].Register.File == TGSI_FILE_INPUT &&
+          dst->Index == ti->edgeflag_out)
+         ti->p->vp.edgeflag = inst->Src[0].Register.Index;
+   }
 
-	if (pc->r_temp[idx] || pc->r_temp[idx + 1] ||
-	    pc->r_temp[idx + 2] || pc->r_temp[idx + 3])
-		return alloc_temp4(pc, dst, idx + 4);
+   for (s = 0; s < inst->Instruction.NumSrcRegs; ++s) {
+      src = &inst->Src[s].Register;
+      if (src->File != TGSI_FILE_INPUT)
+         continue;
+      /* components of source s that this opcode actually reads */
+      mask = nv50_tgsi_src_mask(inst, s);
 
-	for (i = 0; i < 4; i++) {
-		dst[i] = MALLOC_STRUCT(nv50_reg);
-		ctor_reg(dst[i], P_TEMP, -1, idx + i);
-		pc->r_temp[idx + i] = dst[i];
-	}
+      if (inst->Src[s].Register.Indirect)
+         nv50_indirect_inputs(ti, id);
 
-	return 0;
+      for (c = 0; c < 4; ++c) {
+         if (!(mask & (1 << c)))
+            continue;
+         /* translate the read component through the source swizzle;
+          * selectors past W do not name an input component
+          */
+         k = tgsi_util_get_full_src_register_swizzle(&inst->Src[s], c);
+         if (k <= TGSI_SWIZZLE_W)
+            ti->input_access[src->Index][k] = id;
+      }
+   }
 }
 
+/* Append one TGSI immediate (four 32-bit words, 16 bytes) to ti->immd32.
+ *
+ * The buffer is grown whenever the new count n is a power of two, to room
+ * for 2 * n immediates, so it always has space for entry n - 1.
+ *
+ * NOTE(review): the REALLOC result is not checked; an allocation failure
+ * would be dereferenced in the copy loop below.
+ */
 static void
-free_temp4(struct nv50_pc *pc, struct nv50_reg *reg[4])
+prog_immediate(struct nv50_translation_info *ti,
+               const struct tgsi_full_immediate *imm)
 {
-	int i;
-
-	for (i = 0; i < 4; i++)
-		free_temp(pc, reg[i]);
-}
+   int c;
+   unsigned n = ++ti->immd32_nr;
 
-static struct nv50_reg *
-temp_temp(struct nv50_pc *pc, struct nv50_program_exec *e)
-{
-	if (pc->temp_temp_nr >= 16)
-		assert(0);
+   /* n has a single bit set, i.e. is a power of two: time to grow.
+    * The old size passed to REALLOC must cover all n - 1 immediates
+    * stored so far, otherwise an implementation that allocates a new
+    * buffer and copies old_size bytes would drop the most recent ones.
+    */
+   if (n == (1 << (ffs(n) - 1)))
+      ti->immd32 = REALLOC(ti->immd32, (n - 1) * 16, (n * 2) * 16);
 
-	pc->temp_temp[pc->temp_temp_nr] = alloc_temp(pc, NULL);
-	pc->temp_temp_exec[pc->temp_temp_nr] = e;
-	return pc->temp_temp[pc->temp_temp_nr++];
+   for (c = 0; c < 4; ++c)
+      ti->immd32[(n - 1) * 4 + c] = imm->u[c].Uint;
 }
 
-/* This *must* be called for all nv50_program_exec that have been
- * given as argument to temp_temp, or the temps will be leaked !
- */
-static void
-kill_temp_temp(struct nv50_pc *pc, struct nv50_program_exec *e)
-{
-	int i;
-
-	for (i = 0; i < pc->temp_temp_nr; i++)
-		if (pc->temp_temp_exec[i] == e)
-			free_temp(pc, pc->temp_temp[i]);
-	if (!e)
-		pc->temp_temp_nr = 0;
+/* Translate a declaration's TGSI interpolation settings into NV50_INTERP_*
+ * flags: CONSTANT gives NV50_INTERP_FLAT, PERSPECTIVE gives 0, anything
+ * else gives NV50_INTERP_LINEAR; NV50_INTERP_CENTROID is OR'ed in when the
+ * declaration has Centroid set.
+ */
+static INLINE unsigned
+translate_interpolate(const struct tgsi_full_declaration *decl)
+{
+   unsigned mode;
+
+   if (decl->Declaration.Interpolate == TGSI_INTERPOLATE_CONSTANT)
+      mode = NV50_INTERP_FLAT;
+   else
+   if (decl->Declaration.Interpolate == TGSI_INTERPOLATE_PERSPECTIVE)
+      mode = 0;
+   else
+      mode = NV50_INTERP_LINEAR;
+
+   if (decl->Declaration.Centroid)
+      mode |= NV50_INTERP_CENTROID;
+
+   return mode;
+}
+
+/* Record what one TGSI declaration says about the program's registers.
+ *
+ * INPUT:    interpolation mode for each declared slot and, if a semantic
+ *           is present, its name/index in ti->p->in[]
+ * OUTPUT:   semantic name/index in ti->p->out[], plus the first register
+ *           of the BCOLOR, PSIZE and EDGEFLAG outputs
+ * CONSTANT: grows ti->p->parm_size to (last + 1) * 16
+ * SYSTEM_VALUE, ADDRESS, SAMPLER, TEMPORARY: nothing is recorded
+ * Any other file triggers assert(0).
+ */
+static void
+prog_decl(struct nv50_translation_info *ti,
+          const struct tgsi_full_declaration *decl)
+{
+   unsigned i, first, last, sn = 0, si = 0;
+   unsigned interp;
+
+   first = decl->Range.First;
+   last = decl->Range.Last;
+
+   if (decl->Declaration.Semantic) {
+      sn = decl->Semantic.Name;
+      si = decl->Semantic.Index;
+   }
+   /* NOTE(review): dumps every declaration unconditionally; this looks
+    * like a debugging aid -- confirm whether it should stay enabled.
+    */
+   tgsi_dump_declaration(decl);
+
+   switch (decl->Declaration.File) {
+   case TGSI_FILE_INPUT:
+      /* the mode depends only on decl, so compute it once for the range */
+      interp = translate_interpolate(decl);
+      for (i = first; i <= last; ++i)
+         ti->interp_mode[i] = interp;
+
+      if (!decl->Declaration.Semantic)
+         break;
+
+      for (i = first; i <= last; ++i) {
+         ti->p->in[i].sn = sn;
+         ti->p->in[i].si = si;
+      }
+
+      switch (sn) {
+      case TGSI_SEMANTIC_FACE:
+         break;
+      case TGSI_SEMANTIC_COLOR:
+         if (ti->p->type == PIPE_SHADER_FRAGMENT)
+            ti->p->vp.bfc[si] = first;
+         break;
+      default:
+         break;
+      }
+      break;
+   case TGSI_FILE_OUTPUT:
+      if (!decl->Declaration.Semantic)
+         break;
+
+      for (i = first; i <= last; ++i) {
+         ti->p->out[i].sn = sn;
+         ti->p->out[i].si = si;
+      }
+
+      switch (sn) {
+      case TGSI_SEMANTIC_BCOLOR:
+         ti->p->vp.bfc[si] = first;
+         break;
+      case TGSI_SEMANTIC_PSIZE:
+         ti->p->vp.psiz = first;
+         break;
+      case TGSI_SEMANTIC_EDGEFLAG:
+         ti->edgeflag_out = first;
+         break;
+      default:
+         break;
+      }
+      break;
+   case TGSI_FILE_SYSTEM_VALUE:
+      /* no system value is acted on yet; the cases are placeholders */
+      switch (decl->Semantic.Name) {
+      case TGSI_SEMANTIC_FACE:
+         break;
+      case TGSI_SEMANTIC_INSTANCEID:
+         break;
+      case TGSI_SEMANTIC_PRIMID:
+         break;
+         /*
+      case TGSI_SEMANTIC_PRIMIDIN:
+         break;
+      case TGSI_SEMANTIC_VERTEXID:
+         break;
+         */
+      default:
+         break;
+      }
+      break;
+   case TGSI_FILE_CONSTANT:
+      ti->p->parm_size = MAX2(ti->p->parm_size, (last + 1) * 16);
+      break;
+   case TGSI_FILE_ADDRESS:
+   case TGSI_FILE_SAMPLER:
+   case TGSI_FILE_TEMPORARY:
+      break;
+   default:
+      assert(0);
+      break;
+   }
 }
 
 static int
-ctor_immd_4u32(struct nv50_pc *pc,
-	       uint32_t x, uint32_t y, uint32_t z, uint32_t w)
-{
-	unsigned size = pc->immd_nr * 4 * sizeof(uint32_t);
-
-	pc->immd_buf = REALLOC(pc->immd_buf, size, size + 4 * sizeof(uint32_t));
-
-	pc->immd_buf[(pc->immd_nr * 4) + 0] = x;
-	pc->immd_buf[(pc->immd_nr * 4) + 1] = y;
-	pc->immd_buf[(pc->immd_nr * 4) + 2] = z;
-	pc->immd_buf[(pc->immd_nr * 4) + 3] = w;
-
-	return pc->immd_nr++;
-}
-
-static INLINE int
-ctor_immd_4f32(struct nv50_pc *pc, float x, float y, float z, float w)
-{
-	return ctor_immd_4u32(pc, fui(x), fui(y), fui(z), fui(w));
-}
-
-static struct nv50_reg *
-alloc_immd(struct nv50_pc *pc, float f)
-{
-	struct nv50_reg *r = MALLOC_STRUCT(nv50_reg);
-	unsigned hw;
-
-	for (hw = 0; hw < pc->immd_nr * 4; hw++)
-		if (pc->immd_buf[hw] == fui(f))
-			break;
-
-	if (hw == pc->immd_nr * 4)
-		hw = ctor_immd_4f32(pc, f, -f, 0.5 * f, 0) * 4;
-
-	ctor_reg(r, P_IMMD, -1, hw);
-	return r;
-}
-
-static struct nv50_program_exec *
-exec(struct nv50_pc *pc)
-{
-	struct nv50_program_exec *e = CALLOC_STRUCT(nv50_program_exec);
-
-	e->param.index = -1;
-	return e;
-}
-
-static void
-emit(struct nv50_pc *pc, struct nv50_program_exec *e)
-{
-	struct nv50_program *p = pc->p;
-
-	if (p->exec_tail)
-		p->exec_tail->next = e;
-	if (!p->exec_head)
-		p->exec_head = e;
-	p->exec_tail = e;
-	p->exec_size += (e->inst[0] & 1) ?
2 : 1; - - kill_temp_temp(pc, e); -} - -static INLINE void set_long(struct nv50_pc *, struct nv50_program_exec *); - -static boolean -is_long(struct nv50_program_exec *e) -{ - if (e->inst[0] & 1) - return TRUE; - return FALSE; -} - -static boolean -is_immd(struct nv50_program_exec *e) -{ - if (is_long(e) && (e->inst[1] & 3) == 3) - return TRUE; - return FALSE; -} - -static boolean -is_join(struct nv50_program_exec *e) -{ - if (is_long(e) && (e->inst[1] & 3) == 2) - return TRUE; - return FALSE; -} - -static INLINE boolean -is_control_flow(struct nv50_program_exec *e) -{ - return (e->inst[0] & 2); -} - -static INLINE void -set_pred(struct nv50_pc *pc, unsigned pred, unsigned idx, - struct nv50_program_exec *e) -{ - assert(!is_immd(e)); - set_long(pc, e); - e->inst[1] &= ~((0x1f << 7) | (0x3 << 12)); - e->inst[1] |= (pred << 7) | (idx << 12); -} - -static INLINE void -set_pred_wr(struct nv50_pc *pc, unsigned on, unsigned idx, - struct nv50_program_exec *e) -{ - set_long(pc, e); - e->inst[1] &= ~((0x3 << 4) | (1 << 6)); - e->inst[1] |= (idx << 4) | (on << 6); -} - -static INLINE void -set_long(struct nv50_pc *pc, struct nv50_program_exec *e) -{ - if (is_long(e)) - return; - - e->inst[0] |= 1; - set_pred(pc, 0xf, 0, e); - set_pred_wr(pc, 0, 0, e); -} - -static INLINE void -set_dst(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_program_exec *e) -{ - if (dst->type == P_RESULT) { - set_long(pc, e); - e->inst[1] |= 0x00000008; - } - - alloc_reg(pc, dst); - if (dst->hw > 63) - set_long(pc, e); - e->inst[0] |= (dst->hw << 2); -} - -static INLINE void -set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e) -{ - set_long(pc, e); - /* XXX: can't be predicated - bits overlap; cases where both - * are required should be avoided by using pc->allow32 */ - set_pred(pc, 0, 0, e); - set_pred_wr(pc, 0, 0, e); - - e->inst[1] |= 0x00000002 | 0x00000001; - e->inst[0] |= (pc->immd_buf[imm->hw] & 0x3f) << 16; - e->inst[1] |= (pc->immd_buf[imm->hw] >> 6) << 2; 
-} - -static INLINE void -set_addr(struct nv50_program_exec *e, struct nv50_reg *a) -{ - assert(a->type == P_ADDR); - - assert(!(e->inst[0] & 0x0c000000)); - assert(!(e->inst[1] & 0x00000004)); - - e->inst[0] |= (a->hw & 3) << 26; - e->inst[1] |= a->hw & 4; -} - -static void -emit_arl(struct nv50_pc *, struct nv50_reg *, struct nv50_reg *, uint8_t); - -static void -emit_shl_imm(struct nv50_pc *, struct nv50_reg *, struct nv50_reg *, int); - -static void -emit_mov_from_addr(struct nv50_pc *pc, struct nv50_reg *dst, - struct nv50_reg *src) -{ - struct nv50_program_exec *e = exec(pc); - - e->inst[1] = 0x40000000; - set_long(pc, e); - set_dst(pc, dst, e); - set_addr(e, src); - - emit(pc, e); -} - -static void -emit_add_addr_imm(struct nv50_pc *pc, struct nv50_reg *dst, - struct nv50_reg *src0, uint16_t src1_val) -{ - struct nv50_program_exec *e = exec(pc); - - e->inst[0] = 0xd0000000 | (src1_val << 9); - e->inst[1] = 0x20000000; - set_long(pc, e); - e->inst[0] |= dst->hw << 2; - if (src0) /* otherwise will add to $a0, which is always 0 */ - set_addr(e, src0); - - emit(pc, e); -} - -#define INTERP_LINEAR 0 -#define INTERP_FLAT 1 -#define INTERP_PERSPECTIVE 2 -#define INTERP_CENTROID 4 - -/* interpolant index has been stored in dst->rhw */ -static void -emit_interp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *iv, - unsigned mode) -{ - struct nv50_program_exec *e = exec(pc); - assert(dst->rhw != -1); - - e->inst[0] |= 0x80000000; - set_dst(pc, dst, e); - e->inst[0] |= (dst->rhw << 16); - - if (mode & INTERP_FLAT) { - e->inst[0] |= (1 << 8); - } else { - if (mode & INTERP_PERSPECTIVE) { - e->inst[0] |= (1 << 25); - alloc_reg(pc, iv); - e->inst[0] |= (iv->hw << 9); - } - - if (mode & INTERP_CENTROID) - e->inst[0] |= (1 << 24); - } - - emit(pc, e); -} - -static void -set_data(struct nv50_pc *pc, struct nv50_reg *src, unsigned m, unsigned s, - struct nv50_program_exec *e) -{ - set_long(pc, e); - - e->param.index = src->hw & 127; - e->param.shift = s; - 
e->param.mask = m << (s % 32); - - if (src->hw < 0 || src->hw > 127) /* need (additional) address reg */ - set_addr(e, get_address_reg(pc, src)); - else - if (src->acc < 0) { - assert(src->type == P_CONST); - set_addr(e, pc->addr[src->indirect[0]]); - } - - e->inst[1] |= (src->buf_index << 22); -} - -/* Never apply nv50_reg::mod in emit_mov, or carefully check the code !!! */ -static void -emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) -{ - struct nv50_program_exec *e = exec(pc); - - e->inst[0] = 0x10000000; - if (!pc->allow32) - set_long(pc, e); - - set_dst(pc, dst, e); - - if (!is_long(e) && src->type == P_IMMD) { - set_immd(pc, src, e); - /*XXX: 32-bit, but steals part of "half" reg space - need to - * catch and handle this case if/when we do half-regs - */ - } else - if (src->type == P_IMMD || src->type == P_CONST) { - set_long(pc, e); - set_data(pc, src, 0x7f, 9, e); - e->inst[1] |= 0x20000000; /* mov from c[] */ - } else { - if (src->type == P_ATTR) { - set_long(pc, e); - e->inst[1] |= 0x00200000; - - if (src->vtx >= 0) { - /* indirect (vertex base + c) load from p[] */ - e->inst[0] |= 0x01800000; - set_addr(e, get_address_reg(pc, src)); - } - } - - alloc_reg(pc, src); - if (src->hw > 63) - set_long(pc, e); - e->inst[0] |= (src->hw << 9); - } - - if (is_long(e) && !is_immd(e)) { - e->inst[1] |= 0x04000000; /* 32-bit */ - e->inst[1] |= 0x0000c000; /* 32-bit c[] load / lane mask 0:1 */ - if (!(e->inst[1] & 0x20000000)) - e->inst[1] |= 0x00030000; /* lane mask 2:3 */ - } else - e->inst[0] |= 0x00008000; - - emit(pc, e); -} - -static INLINE void -emit_mov_immdval(struct nv50_pc *pc, struct nv50_reg *dst, float f) -{ - struct nv50_reg *imm = alloc_immd(pc, f); - emit_mov(pc, dst, imm); - FREE(imm); -} - -/* Assign the hw of the discarded temporary register src - * to the tgsi register dst and free src. 
- */ -static void -assimilate_temp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) -{ - assert(src->index == -1 && src->hw != -1); - - if (pc->if_lvl || pc->loop_lvl || - (dst->type != P_TEMP) || - (src->hw < pc->result_nr * 4 && - pc->p->type == PIPE_SHADER_FRAGMENT) || - pc->p->info.opcode_count[TGSI_OPCODE_CAL] || - pc->p->info.opcode_count[TGSI_OPCODE_BRA]) { - - emit_mov(pc, dst, src); - free_temp(pc, src); - return; - } - - if (dst->hw != -1) - pc->r_temp[dst->hw] = NULL; - pc->r_temp[src->hw] = dst; - dst->hw = src->hw; - - FREE(src); -} - -static void -emit_nop(struct nv50_pc *pc) -{ - struct nv50_program_exec *e = exec(pc); - - e->inst[0] = 0xf0000000; - set_long(pc, e); - e->inst[1] = 0xe0000000; - emit(pc, e); -} - -static boolean -check_swap_src_0_1(struct nv50_pc *pc, - struct nv50_reg **s0, struct nv50_reg **s1) -{ - struct nv50_reg *src0 = *s0, *src1 = *s1; - - if (src0->type == P_CONST) { - if (src1->type != P_CONST) { - *s0 = src1; - *s1 = src0; - return TRUE; - } - } else - if (src1->type == P_ATTR) { - if (src0->type != P_ATTR) { - *s0 = src1; - *s1 = src0; - return TRUE; - } - } - - return FALSE; -} - -static void -set_src_0_restricted(struct nv50_pc *pc, struct nv50_reg *src, - struct nv50_program_exec *e) -{ - struct nv50_reg *temp; - - if (src->type != P_TEMP) { - temp = temp_temp(pc, e); - emit_mov(pc, temp, src); - src = temp; - } - - alloc_reg(pc, src); - if (src->hw > 63) - set_long(pc, e); - e->inst[0] |= (src->hw << 9); -} - -static void -set_src_0(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e) -{ - if (src->type == P_ATTR) { - set_long(pc, e); - e->inst[1] |= 0x00200000; - - if (src->vtx >= 0) { - e->inst[0] |= 0x01800000; /* src from p[] */ - set_addr(e, get_address_reg(pc, src)); - } - } else - if (src->type == P_CONST || src->type == P_IMMD) { - struct nv50_reg *temp = temp_temp(pc, e); - - emit_mov(pc, temp, src); - src = temp; - } - - alloc_reg(pc, src); - if (src->hw > 63) - set_long(pc, 
e); - e->inst[0] |= (src->hw << 9); -} - -static void -set_src_1(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e) -{ - if (src->type == P_ATTR) { - struct nv50_reg *temp = temp_temp(pc, e); - - emit_mov(pc, temp, src); - src = temp; - } else - if (src->type == P_CONST || src->type == P_IMMD) { - if (e->inst[0] & 0x01800000) { - struct nv50_reg *temp = temp_temp(pc, e); - - emit_mov(pc, temp, src); - src = temp; - } else { - assert(!(e->inst[0] & 0x00800000)); - set_data(pc, src, 0x7f, 16, e); - e->inst[0] |= 0x00800000; - } - } - - alloc_reg(pc, src); - if (src->hw > 63) - set_long(pc, e); - e->inst[0] |= ((src->hw & 127) << 16); -} - -static void -set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e) -{ - set_long(pc, e); - - if (src->type == P_ATTR) { - struct nv50_reg *temp = temp_temp(pc, e); - - emit_mov(pc, temp, src); - src = temp; - } else - if (src->type == P_CONST || src->type == P_IMMD) { - if (e->inst[0] & 0x01800000) { - struct nv50_reg *temp = temp_temp(pc, e); - - emit_mov(pc, temp, src); - src = temp; - } else { - assert(!(e->inst[0] & 0x01000000)); - set_data(pc, src, 0x7f, 32+14, e); - e->inst[0] |= 0x01000000; - } - } - - alloc_reg(pc, src); - e->inst[1] |= ((src->hw & 127) << 14); -} - -static void -set_half_src(struct nv50_pc *pc, struct nv50_reg *src, int lh, - struct nv50_program_exec *e, int pos) -{ - struct nv50_reg *r = src; - - alloc_reg(pc, r); - if (r->type != P_TEMP) { - r = temp_temp(pc, e); - emit_mov(pc, r, src); - } - - if (r->hw > (NV50_SU_MAX_TEMP / 2)) { - NOUVEAU_ERR("out of low GPRs\n"); - abort(); - } - - e->inst[pos / 32] |= ((src->hw * 2) + lh) << (pos % 32); -} - -static void -emit_mov_from_pred(struct nv50_pc *pc, struct nv50_reg *dst, int pred) -{ - struct nv50_program_exec *e = exec(pc); - - assert(dst->type == P_TEMP); - e->inst[1] = 0x20000000 | (pred << 12); - set_long(pc, e); - set_dst(pc, dst, e); - - emit(pc, e); -} - -static void -emit_mov_to_pred(struct nv50_pc 
*pc, int pred, struct nv50_reg *src) -{ - struct nv50_program_exec *e = exec(pc); - - e->inst[0] = 0x000001fc; - e->inst[1] = 0xa0000008; - set_long(pc, e); - set_pred_wr(pc, 1, pred, e); - set_src_0_restricted(pc, src, e); - - emit(pc, e); -} - -static void -emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, - struct nv50_reg *src1) -{ - struct nv50_program_exec *e = exec(pc); - - e->inst[0] |= 0xc0000000; - - if (!pc->allow32) - set_long(pc, e); - - check_swap_src_0_1(pc, &src0, &src1); - set_dst(pc, dst, e); - set_src_0(pc, src0, e); - if (src1->type == P_IMMD && !is_long(e)) { - if (src0->mod ^ src1->mod) - e->inst[0] |= 0x00008000; - set_immd(pc, src1, e); - } else { - set_src_1(pc, src1, e); - if ((src0->mod ^ src1->mod) & NV50_MOD_NEG) { - if (is_long(e)) - e->inst[1] |= 0x08000000; - else - e->inst[0] |= 0x00008000; - } - } - - emit(pc, e); -} - -static void -emit_add(struct nv50_pc *pc, struct nv50_reg *dst, - struct nv50_reg *src0, struct nv50_reg *src1) -{ - struct nv50_program_exec *e = exec(pc); - - e->inst[0] = 0xb0000000; - - alloc_reg(pc, src1); - check_swap_src_0_1(pc, &src0, &src1); - - if (!pc->allow32 || (src0->mod | src1->mod) || src1->hw > 63) { - set_long(pc, e); - e->inst[1] |= ((src0->mod & NV50_MOD_NEG) << 26) | - ((src1->mod & NV50_MOD_NEG) << 27); - } - - set_dst(pc, dst, e); - set_src_0(pc, src0, e); - if (src1->type == P_CONST || src1->type == P_ATTR || is_long(e)) - set_src_2(pc, src1, e); - else - if (src1->type == P_IMMD) - set_immd(pc, src1, e); - else - set_src_1(pc, src1, e); - - emit(pc, e); -} - -static void -emit_arl(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src, - uint8_t s) -{ - struct nv50_program_exec *e = exec(pc); - - set_long(pc, e); - e->inst[1] |= 0xc0000000; - - e->inst[0] |= dst->hw << 2; - e->inst[0] |= s << 16; /* shift left */ - set_src_0(pc, src, e); - - emit(pc, e); -} - -static boolean -address_reg_suitable(struct nv50_reg *a, struct nv50_reg *r) -{ - if (!r) - return 
FALSE; - - if (r->vtx != a->vtx) - return FALSE; - if (r->vtx >= 0) - return (r->indirect[1] == a->indirect[1]); - - if (r->hw < a->rhw || (r->hw - a->rhw) >= 128) - return FALSE; - - if (a->index >= 0) - return (a->index == r->indirect[0]); - return (a->indirect[0] == r->indirect[0]); -} - -static void -load_vertex_base(struct nv50_pc *pc, struct nv50_reg *dst, - struct nv50_reg *a, int shift) -{ - struct nv50_reg mem, *temp; - - ctor_reg(&mem, P_ATTR, -1, dst->vtx); - - assert(dst->type == P_ADDR); - if (!a) { - emit_arl(pc, dst, &mem, 0); - return; - } - temp = alloc_temp(pc, NULL); - - if (shift) { - emit_mov_from_addr(pc, temp, a); - if (shift < 0) - emit_shl_imm(pc, temp, temp, shift); - emit_arl(pc, dst, temp, MAX2(shift, 0)); - } - emit_mov(pc, temp, &mem); - set_addr(pc->p->exec_tail, dst); - - emit_arl(pc, dst, temp, 0); - free_temp(pc, temp); -} - -/* case (ref == NULL): allocate address register for TGSI_FILE_ADDRESS - * case (vtx >= 0, acc >= 0): load vertex base from a[vtx * 4] to $aX - * case (vtx >= 0, acc < 0): load vertex base from s[$aY + vtx * 4] to $aX - * case (vtx < 0, acc >= 0): memory address too high to encode - * case (vtx < 0, acc < 0): get source register for TGSI_FILE_ADDRESS - */ -static struct nv50_reg * -get_address_reg(struct nv50_pc *pc, struct nv50_reg *ref) -{ - int i; - struct nv50_reg *a_ref, *a = NULL; - - for (i = 0; i < NV50_SU_MAX_ADDR; ++i) { - if (pc->r_addr[i].acc == 0) - a = &pc->r_addr[i]; /* an unused address reg */ - else - if (address_reg_suitable(&pc->r_addr[i], ref)) { - pc->r_addr[i].acc = pc->insn_cur; - return &pc->r_addr[i]; - } else - if (!a && pc->r_addr[i].index < 0 && - pc->r_addr[i].acc < pc->insn_cur) - a = &pc->r_addr[i]; - } - if (!a) { - /* We'll be able to spill address regs when this - * mess is replaced with a proper compiler ... 
- */ - NOUVEAU_ERR("out of address regs\n"); - abort(); - return NULL; - } - - /* initialize and reserve for this TGSI instruction */ - a->rhw = 0; - a->index = a->indirect[0] = a->indirect[1] = -1; - a->acc = pc->insn_cur; - - if (!ref) { - a->vtx = -1; - return a; - } - a->vtx = ref->vtx; - - /* now put in the correct value ... */ - - if (ref->vtx >= 0) { - a->indirect[1] = ref->indirect[1]; - - /* For an indirect vertex index, we need to shift address right - * by 2, the address register will contain vtx * 16, we need to - * load from a[vtx * 4]. - */ - load_vertex_base(pc, a, (ref->acc < 0) ? - pc->addr[ref->indirect[1]] : NULL, -2); - } else { - assert(ref->acc < 0 || ref->indirect[0] < 0); - - a->rhw = ref->hw & ~0x7f; - a->indirect[0] = ref->indirect[0]; - a_ref = (ref->acc < 0) ? pc->addr[ref->indirect[0]] : NULL; - - emit_add_addr_imm(pc, a, a_ref, a->rhw * 4); - } - return a; -} - -#define NV50_MAX_F32 0x880 -#define NV50_MAX_S32 0x08c -#define NV50_MAX_U32 0x084 -#define NV50_MIN_F32 0x8a0 -#define NV50_MIN_S32 0x0ac -#define NV50_MIN_U32 0x0a4 - -static void -emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst, - struct nv50_reg *src0, struct nv50_reg *src1) -{ - struct nv50_program_exec *e = exec(pc); - - set_long(pc, e); - e->inst[0] |= 0x30000000 | ((sub & 0x800) << 20); - e->inst[1] |= (sub << 24); - - check_swap_src_0_1(pc, &src0, &src1); - set_dst(pc, dst, e); - set_src_0(pc, src0, e); - set_src_1(pc, src1, e); - - if (src0->mod & NV50_MOD_ABS) - e->inst[1] |= 0x00100000; - if (src1->mod & NV50_MOD_ABS) - e->inst[1] |= 0x00080000; - - emit(pc, e); -} - -static INLINE void -emit_sub(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, - struct nv50_reg *src1) -{ - src1->mod ^= NV50_MOD_NEG; - emit_add(pc, dst, src0, src1); - src1->mod ^= NV50_MOD_NEG; -} - -static void -emit_bitop2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, - struct nv50_reg *src1, unsigned op) -{ - struct nv50_program_exec *e = 
exec(pc); - - e->inst[0] = 0xd0000000; - set_long(pc, e); - - check_swap_src_0_1(pc, &src0, &src1); - set_dst(pc, dst, e); - set_src_0(pc, src0, e); - - if (op != TGSI_OPCODE_AND && op != TGSI_OPCODE_OR && - op != TGSI_OPCODE_XOR) - assert(!"invalid bit op"); - - assert(!(src0->mod | src1->mod)); - - if (src1->type == P_IMMD && src0->type == P_TEMP && pc->allow32) { - set_immd(pc, src1, e); - if (op == TGSI_OPCODE_OR) - e->inst[0] |= 0x0100; - else - if (op == TGSI_OPCODE_XOR) - e->inst[0] |= 0x8000; - } else { - set_src_1(pc, src1, e); - e->inst[1] |= 0x04000000; /* 32 bit */ - if (op == TGSI_OPCODE_OR) - e->inst[1] |= 0x4000; - else - if (op == TGSI_OPCODE_XOR) - e->inst[1] |= 0x8000; - } - - emit(pc, e); -} - -static void -emit_not(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) -{ - struct nv50_program_exec *e = exec(pc); - - e->inst[0] = 0xd0000000; - e->inst[1] = 0x0402c000; - set_long(pc, e); - set_dst(pc, dst, e); - set_src_1(pc, src, e); - - emit(pc, e); -} - -static void -emit_shift(struct nv50_pc *pc, struct nv50_reg *dst, - struct nv50_reg *src0, struct nv50_reg *src1, unsigned dir) -{ - struct nv50_program_exec *e = exec(pc); - - e->inst[0] = 0x30000000; - e->inst[1] = 0xc4000000; - - set_long(pc, e); - set_dst(pc, dst, e); - set_src_0(pc, src0, e); - - if (src1->type == P_IMMD) { - e->inst[1] |= (1 << 20); - e->inst[0] |= (pc->immd_buf[src1->hw] & 0x7f) << 16; - } else - set_src_1(pc, src1, e); - - if (dir != TGSI_OPCODE_SHL) - e->inst[1] |= (1 << 29); - - if (dir == TGSI_OPCODE_ISHR) - e->inst[1] |= (1 << 27); - - emit(pc, e); -} - -static void -emit_shl_imm(struct nv50_pc *pc, struct nv50_reg *dst, - struct nv50_reg *src, int s) -{ - struct nv50_program_exec *e = exec(pc); - - e->inst[0] = 0x30000000; - e->inst[1] = 0xc4100000; - if (s < 0) { - e->inst[1] |= 1 << 29; - s = -s; - } - e->inst[1] |= ((s & 0x7f) << 16); - - set_long(pc, e); - set_dst(pc, dst, e); - set_src_0(pc, src, e); - - emit(pc, e); -} - -static void 
-emit_mad(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, - struct nv50_reg *src1, struct nv50_reg *src2) -{ - struct nv50_program_exec *e = exec(pc); - - e->inst[0] |= 0xe0000000; - - check_swap_src_0_1(pc, &src0, &src1); - set_dst(pc, dst, e); - set_src_0(pc, src0, e); - set_src_1(pc, src1, e); - set_src_2(pc, src2, e); - - if ((src0->mod ^ src1->mod) & NV50_MOD_NEG) - e->inst[1] |= 0x04000000; - if (src2->mod & NV50_MOD_NEG) - e->inst[1] |= 0x08000000; - - emit(pc, e); -} - -static INLINE void -emit_msb(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, - struct nv50_reg *src1, struct nv50_reg *src2) -{ - src2->mod ^= NV50_MOD_NEG; - emit_mad(pc, dst, src0, src1, src2); - src2->mod ^= NV50_MOD_NEG; -} - -#define NV50_FLOP_RCP 0 -#define NV50_FLOP_RSQ 2 -#define NV50_FLOP_LG2 3 -#define NV50_FLOP_SIN 4 -#define NV50_FLOP_COS 5 -#define NV50_FLOP_EX2 6 - -/* rcp, rsqrt, lg2 support neg and abs */ -static void -emit_flop(struct nv50_pc *pc, unsigned sub, - struct nv50_reg *dst, struct nv50_reg *src) -{ - struct nv50_program_exec *e = exec(pc); - - e->inst[0] |= 0x90000000; - if (sub || src->mod) { - set_long(pc, e); - e->inst[1] |= (sub << 29); - } - - set_dst(pc, dst, e); - set_src_0_restricted(pc, src, e); - - assert(!src->mod || sub < 4); - - if (src->mod & NV50_MOD_NEG) - e->inst[1] |= 0x04000000; - if (src->mod & NV50_MOD_ABS) - e->inst[1] |= 0x00100000; - - emit(pc, e); -} - -static void -emit_preex2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) -{ - struct nv50_program_exec *e = exec(pc); - - e->inst[0] |= 0xb0000000; - - set_dst(pc, dst, e); - set_src_0(pc, src, e); - set_long(pc, e); - e->inst[1] |= (6 << 29) | 0x00004000; - - if (src->mod & NV50_MOD_NEG) - e->inst[1] |= 0x04000000; - if (src->mod & NV50_MOD_ABS) - e->inst[1] |= 0x00100000; - - emit(pc, e); -} - -static void -emit_precossin(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) -{ - struct nv50_program_exec *e = exec(pc); - - 
e->inst[0] |= 0xb0000000; - - set_dst(pc, dst, e); - set_src_0(pc, src, e); - set_long(pc, e); - e->inst[1] |= (6 << 29); - - if (src->mod & NV50_MOD_NEG) - e->inst[1] |= 0x04000000; - if (src->mod & NV50_MOD_ABS) - e->inst[1] |= 0x00100000; - - emit(pc, e); -} - -#define CVT_RN (0x00 << 16) -#define CVT_FLOOR (0x02 << 16) -#define CVT_CEIL (0x04 << 16) -#define CVT_TRUNC (0x06 << 16) -#define CVT_SAT (0x08 << 16) -#define CVT_ABS (0x10 << 16) - -#define CVT_X32_X32 0x04004000 -#define CVT_X32_S32 0x04014000 -#define CVT_F32_F32 ((0xc0 << 24) | CVT_X32_X32) -#define CVT_S32_F32 ((0x88 << 24) | CVT_X32_X32) -#define CVT_U32_F32 ((0x80 << 24) | CVT_X32_X32) -#define CVT_F32_S32 ((0x40 << 24) | CVT_X32_S32) -#define CVT_F32_U32 ((0x40 << 24) | CVT_X32_X32) -#define CVT_S32_S32 ((0x08 << 24) | CVT_X32_S32) -#define CVT_S32_U32 ((0x08 << 24) | CVT_X32_X32) -#define CVT_U32_S32 ((0x00 << 24) | CVT_X32_S32) - -#define CVT_NEG 0x20000000 -#define CVT_RI 0x08000000 - -static void -emit_cvt(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src, - int wp, uint32_t cvn) -{ - struct nv50_program_exec *e; - - e = exec(pc); - - if (src->mod & NV50_MOD_NEG) cvn |= CVT_NEG; - if (src->mod & NV50_MOD_ABS) cvn |= CVT_ABS; - - e->inst[0] = 0xa0000000; - e->inst[1] = cvn; - set_long(pc, e); - set_src_0(pc, src, e); - - if (wp >= 0) - set_pred_wr(pc, 1, wp, e); - - if (dst) - set_dst(pc, dst, e); - else { - e->inst[0] |= 0x000001fc; - e->inst[1] |= 0x00000008; - } - - emit(pc, e); -} - -/* nv50 Condition codes: - * 0x1 = LT - * 0x2 = EQ - * 0x3 = LE - * 0x4 = GT - * 0x5 = NE - * 0x6 = GE - * 0x7 = set condition code ? 
(used before bra.lt/le/gt/ge) - * 0x8 = unordered bit (allows NaN) - * - * mode = 0x04 (u32), 0x0c (s32), 0x80 (f32) - */ -static void -emit_set(struct nv50_pc *pc, unsigned ccode, struct nv50_reg *dst, int wp, - struct nv50_reg *src0, struct nv50_reg *src1, uint8_t mode) -{ - static const unsigned cc_swapped[8] = { 0, 4, 2, 6, 1, 5, 3, 7 }; - - struct nv50_program_exec *e = exec(pc); - struct nv50_reg *rdst; - - assert(ccode < 16); - if (check_swap_src_0_1(pc, &src0, &src1)) - ccode = cc_swapped[ccode & 7] | (ccode & 8); - - rdst = dst; - if (dst && dst->type != P_TEMP) - dst = alloc_temp(pc, NULL); - - set_long(pc, e); - e->inst[0] |= 0x30000000 | (mode << 24); - e->inst[1] |= 0x60000000 | (ccode << 14); - - if (wp >= 0) - set_pred_wr(pc, 1, wp, e); - if (dst) - set_dst(pc, dst, e); - else { - e->inst[0] |= 0x000001fc; - e->inst[1] |= 0x00000008; - } - - set_src_0(pc, src0, e); - set_src_1(pc, src1, e); - - emit(pc, e); - - if (rdst && mode == 0x80) /* convert to float ? */ - emit_cvt(pc, rdst, dst, -1, CVT_ABS | CVT_F32_S32); - if (rdst && rdst != dst) - free_temp(pc, dst); -} - -static INLINE void -map_tgsi_setop_hw(unsigned op, uint8_t *cc, uint8_t *ty) -{ - switch (op) { - case TGSI_OPCODE_SLT: *cc = 0x1; *ty = 0x80; break; - case TGSI_OPCODE_SGE: *cc = 0x6; *ty = 0x80; break; - case TGSI_OPCODE_SEQ: *cc = 0x2; *ty = 0x80; break; - case TGSI_OPCODE_SGT: *cc = 0x4; *ty = 0x80; break; - case TGSI_OPCODE_SLE: *cc = 0x3; *ty = 0x80; break; - case TGSI_OPCODE_SNE: *cc = 0xd; *ty = 0x80; break; - - case TGSI_OPCODE_ISLT: *cc = 0x1; *ty = 0x0c; break; - case TGSI_OPCODE_ISGE: *cc = 0x6; *ty = 0x0c; break; - case TGSI_OPCODE_USEQ: *cc = 0x2; *ty = 0x04; break; - case TGSI_OPCODE_USGE: *cc = 0x6; *ty = 0x04; break; - case TGSI_OPCODE_USLT: *cc = 0x1; *ty = 0x04; break; - case TGSI_OPCODE_USNE: *cc = 0x5; *ty = 0x04; break; - default: - assert(0); - return; - } -} - -static void -emit_add_b32(struct nv50_pc *pc, struct nv50_reg *dst, - struct nv50_reg *src0, struct 
nv50_reg *rsrc1) -{ - struct nv50_program_exec *e = exec(pc); - struct nv50_reg *src1; - - e->inst[0] = 0x20000000; - - alloc_reg(pc, rsrc1); - check_swap_src_0_1(pc, &src0, &rsrc1); - - src1 = rsrc1; - if (src0->mod & rsrc1->mod & NV50_MOD_NEG) { - src1 = temp_temp(pc, e); - emit_cvt(pc, src1, rsrc1, -1, CVT_S32_S32); - } - - if (!pc->allow32 || src1->hw > 63 || - (src1->type != P_TEMP && src1->type != P_IMMD)) - set_long(pc, e); - - set_dst(pc, dst, e); - set_src_0(pc, src0, e); - - if (is_long(e)) { - e->inst[1] |= 1 << 26; - set_src_2(pc, src1, e); - } else { - e->inst[0] |= 0x8000; - if (src1->type == P_IMMD) - set_immd(pc, src1, e); - else - set_src_1(pc, src1, e); - } - - if (src0->mod & NV50_MOD_NEG) - e->inst[0] |= 1 << 28; - else - if (src1->mod & NV50_MOD_NEG) - e->inst[0] |= 1 << 22; - - emit(pc, e); -} - -static void -emit_mad_u16(struct nv50_pc *pc, struct nv50_reg *dst, - struct nv50_reg *src0, int lh_0, struct nv50_reg *src1, int lh_1, - struct nv50_reg *src2) -{ - struct nv50_program_exec *e = exec(pc); - - e->inst[0] = 0x60000000; - if (!pc->allow32) - set_long(pc, e); - set_dst(pc, dst, e); - - set_half_src(pc, src0, lh_0, e, 9); - set_half_src(pc, src1, lh_1, e, 16); - alloc_reg(pc, src2); - if (is_long(e) || (src2->type != P_TEMP) || (src2->hw != dst->hw)) - set_src_2(pc, src2, e); - - emit(pc, e); -} - -static void -emit_mul_u16(struct nv50_pc *pc, struct nv50_reg *dst, - struct nv50_reg *src0, int lh_0, struct nv50_reg *src1, int lh_1) -{ - struct nv50_program_exec *e = exec(pc); - - e->inst[0] = 0x40000000; - set_long(pc, e); - set_dst(pc, dst, e); - - set_half_src(pc, src0, lh_0, e, 9); - set_half_src(pc, src1, lh_1, e, 16); - - emit(pc, e); -} - -static void -emit_sad(struct nv50_pc *pc, struct nv50_reg *dst, - struct nv50_reg *src0, struct nv50_reg *src1, struct nv50_reg *src2) -{ - struct nv50_program_exec *e = exec(pc); - - e->inst[0] = 0x50000000; - if (!pc->allow32) - set_long(pc, e); - check_swap_src_0_1(pc, &src0, &src1); - 
set_dst(pc, dst, e); - set_src_0(pc, src0, e); - set_src_1(pc, src1, e); - alloc_reg(pc, src2); - if (is_long(e) || (src2->type != dst->type) || (src2->hw != dst->hw)) - set_src_2(pc, src2, e); - - if (is_long(e)) - e->inst[1] |= 0x0c << 24; - else - e->inst[0] |= 0x81 << 8; - - emit(pc, e); -} - -static INLINE void -emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) -{ - emit_cvt(pc, dst, src, -1, CVT_FLOOR | CVT_F32_F32 | CVT_RI); -} - -static void -emit_pow(struct nv50_pc *pc, struct nv50_reg *dst, - struct nv50_reg *v, struct nv50_reg *e) -{ - struct nv50_reg *temp = alloc_temp(pc, NULL); - - emit_flop(pc, NV50_FLOP_LG2, temp, v); - emit_mul(pc, temp, temp, e); - emit_preex2(pc, temp, temp); - emit_flop(pc, NV50_FLOP_EX2, dst, temp); - - free_temp(pc, temp); -} - -static INLINE void -emit_sat(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) -{ - emit_cvt(pc, dst, src, -1, CVT_SAT | CVT_F32_F32); -} - -static void -emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask, - struct nv50_reg **src) -{ - struct nv50_reg *one = alloc_immd(pc, 1.0); - struct nv50_reg *zero = alloc_immd(pc, 0.0); - struct nv50_reg *neg128 = alloc_immd(pc, -127.999999); - struct nv50_reg *pos128 = alloc_immd(pc, 127.999999); - struct nv50_reg *tmp[4] = { 0 }; - boolean allow32 = pc->allow32; - - pc->allow32 = FALSE; - - if (mask & (3 << 1)) { - tmp[0] = alloc_temp(pc, NULL); - emit_minmax(pc, NV50_MAX_F32, tmp[0], src[0], zero); - } - - if (mask & (1 << 2)) { - set_pred_wr(pc, 1, 0, pc->p->exec_tail); - - tmp[1] = temp_temp(pc, NULL); - emit_minmax(pc, NV50_MAX_F32, tmp[1], src[1], zero); - - tmp[3] = temp_temp(pc, NULL); - emit_minmax(pc, NV50_MAX_F32, tmp[3], src[3], neg128); - emit_minmax(pc, NV50_MIN_F32, tmp[3], tmp[3], pos128); - - emit_pow(pc, dst[2], tmp[1], tmp[3]); - emit_mov(pc, dst[2], zero); - set_pred(pc, 3, 0, pc->p->exec_tail); - } - - if (mask & (1 << 1)) - assimilate_temp(pc, dst[1], tmp[0]); - else - if (mask & (1 << 2)) 
- free_temp(pc, tmp[0]); - - pc->allow32 = allow32; - - /* do this last, in case src[i,j] == dst[0,3] */ - if (mask & (1 << 0)) - emit_mov(pc, dst[0], one); - - if (mask & (1 << 3)) - emit_mov(pc, dst[3], one); - - FREE(pos128); - FREE(neg128); - FREE(zero); - FREE(one); -} - -static void -emit_kil(struct nv50_pc *pc, struct nv50_reg *src) -{ - struct nv50_program_exec *e; - const int r_pred = 1; - - e = exec(pc); - e->inst[0] = 0x00000002; /* discard */ - set_long(pc, e); /* sets cond code to ALWAYS */ - - if (src) { - set_pred(pc, 0x1 /* cc = LT */, r_pred, e); - /* write to predicate reg */ - emit_cvt(pc, NULL, src, r_pred, CVT_F32_F32); - } - - emit(pc, e); -} - -static struct nv50_program_exec * -emit_control_flow(struct nv50_pc *pc, unsigned op, int pred, unsigned cc) -{ - struct nv50_program_exec *e = exec(pc); - - e->inst[0] = (op << 28) | 2; - set_long(pc, e); - if (pred >= 0) - set_pred(pc, cc, pred, e); - - emit(pc, e); - return e; -} - -static INLINE struct nv50_program_exec * -emit_breakaddr(struct nv50_pc *pc) -{ - return emit_control_flow(pc, 0x4, -1, 0); -} - -static INLINE void -emit_break(struct nv50_pc *pc, int pred, unsigned cc) -{ - emit_control_flow(pc, 0x5, pred, cc); -} - -static INLINE struct nv50_program_exec * -emit_joinat(struct nv50_pc *pc) -{ - return emit_control_flow(pc, 0xa, -1, 0); -} - -static INLINE struct nv50_program_exec * -emit_branch(struct nv50_pc *pc, int pred, unsigned cc) -{ - return emit_control_flow(pc, 0x1, pred, cc); -} - -static INLINE struct nv50_program_exec * -emit_call(struct nv50_pc *pc, int pred, unsigned cc) -{ - return emit_control_flow(pc, 0x2, pred, cc); -} - -static INLINE void -emit_ret(struct nv50_pc *pc, int pred, unsigned cc) -{ - emit_control_flow(pc, 0x3, pred, cc); -} - -static void -emit_prim_cmd(struct nv50_pc *pc, unsigned cmd) -{ - struct nv50_program_exec *e = exec(pc); - - e->inst[0] = 0xf0000000 | (cmd << 9); - e->inst[1] = 0xc0000000; - set_long(pc, e); - - emit(pc, e); -} - -#define 
QOP_ADD 0 -#define QOP_SUBR 1 -#define QOP_SUB 2 -#define QOP_MOV_SRC1 3 - -/* For a quad of threads / top left, top right, bottom left, bottom right - * pixels, do a different operation, and take src0 from a specific thread. - */ -static void -emit_quadop(struct nv50_pc *pc, struct nv50_reg *dst, int wp, int lane_src0, - struct nv50_reg *src0, struct nv50_reg *src1, ubyte qop) -{ - struct nv50_program_exec *e = exec(pc); - - e->inst[0] = 0xc0000000; - e->inst[1] = 0x80000000; - set_long(pc, e); - e->inst[0] |= lane_src0 << 16; - set_src_0(pc, src0, e); - set_src_2(pc, src1, e); - - if (wp >= 0) - set_pred_wr(pc, 1, wp, e); - - if (dst) - set_dst(pc, dst, e); - else { - e->inst[0] |= 0x000001fc; - e->inst[1] |= 0x00000008; - } - - e->inst[0] |= (qop & 3) << 20; - e->inst[1] |= (qop >> 2) << 22; - - emit(pc, e); -} - -static void -load_cube_tex_coords(struct nv50_pc *pc, struct nv50_reg *t[4], - struct nv50_reg **src, unsigned arg, boolean proj) -{ - int mod[3] = { src[0]->mod, src[1]->mod, src[2]->mod }; - - src[0]->mod |= NV50_MOD_ABS; - src[1]->mod |= NV50_MOD_ABS; - src[2]->mod |= NV50_MOD_ABS; - - emit_minmax(pc, NV50_MAX_F32, t[2], src[0], src[1]); - emit_minmax(pc, NV50_MAX_F32, t[2], src[2], t[2]); - - src[0]->mod = mod[0]; - src[1]->mod = mod[1]; - src[2]->mod = mod[2]; - - if (proj && 0 /* looks more correct without this */) - emit_mul(pc, t[2], t[2], src[3]); - else - if (arg == 4) /* there is no textureProj(samplerCubeShadow) */ - emit_mov(pc, t[3], src[3]); - - emit_flop(pc, NV50_FLOP_RCP, t[2], t[2]); - - emit_mul(pc, t[0], src[0], t[2]); - emit_mul(pc, t[1], src[1], t[2]); - emit_mul(pc, t[2], src[2], t[2]); -} - -static void -load_proj_tex_coords(struct nv50_pc *pc, struct nv50_reg *t[4], - struct nv50_reg **src, unsigned dim, unsigned arg) -{ - unsigned c, mode; - - if (src[0]->type == P_TEMP && src[0]->rhw != -1) { - mode = pc->interp_mode[src[0]->index] | INTERP_PERSPECTIVE; - - t[3]->rhw = src[3]->rhw; - emit_interp(pc, t[3], NULL, (mode & 
INTERP_CENTROID)); - emit_flop(pc, NV50_FLOP_RCP, t[3], t[3]); - - for (c = 0; c < dim; ++c) { - t[c]->rhw = src[c]->rhw; - emit_interp(pc, t[c], t[3], mode); - } - if (arg != dim) { /* depth reference value */ - t[dim]->rhw = src[2]->rhw; - emit_interp(pc, t[dim], t[3], mode); - } - } else { - /* XXX: for some reason the blob sometimes uses MAD - * (mad f32 $rX $rY $rZ neg $r63) - */ - emit_flop(pc, NV50_FLOP_RCP, t[3], src[3]); - for (c = 0; c < dim; ++c) - emit_mul(pc, t[c], src[c], t[3]); - if (arg != dim) /* depth reference value */ - emit_mul(pc, t[dim], src[2], t[3]); - } -} - -static INLINE void -get_tex_dim(unsigned type, unsigned *dim, unsigned *arg) -{ - switch (type) { - case TGSI_TEXTURE_1D: - *arg = *dim = 1; - break; - case TGSI_TEXTURE_SHADOW1D: - *dim = 1; - *arg = 2; - break; - case TGSI_TEXTURE_UNKNOWN: - case TGSI_TEXTURE_2D: - case TGSI_TEXTURE_RECT: - *arg = *dim = 2; - break; - case TGSI_TEXTURE_SHADOW2D: - case TGSI_TEXTURE_SHADOWRECT: - *dim = 2; - *arg = 3; - break; - case TGSI_TEXTURE_3D: - case TGSI_TEXTURE_CUBE: - *dim = *arg = 3; - break; - default: - assert(0); - break; - } -} - -/* We shouldn't execute TEXLOD if any of the pixels in a quad have - * different LOD values, so branch off groups of equal LOD. - */ -static void -emit_texlod_sequence(struct nv50_pc *pc, struct nv50_reg *tlod, - struct nv50_reg *src, struct nv50_program_exec *tex) -{ - struct nv50_program_exec *join_at; - unsigned i, target = pc->p->exec_size + 9 * 2; - - if (pc->p->type != PIPE_SHADER_FRAGMENT) { - emit(pc, tex); - return; - } - pc->allow32 = FALSE; - - /* Subtract lod of each pixel from lod of top left pixel, jump - * texlod insn if result is 0, then repeat for 2 other pixels. 
- */ - join_at = emit_joinat(pc); - emit_quadop(pc, NULL, 0, 0, tlod, tlod, 0x55); - emit_branch(pc, 0, 2)->param.index = target; - - for (i = 1; i < 4; ++i) { - emit_quadop(pc, NULL, 0, i, tlod, tlod, 0x55); - emit_branch(pc, 0, 2)->param.index = target; - } - - emit_mov(pc, tlod, src); /* target */ - emit(pc, tex); /* texlod */ - - join_at->param.index = target + 2 * 2; - JOIN_ON(emit_nop(pc)); /* join _after_ tex */ -} - -static void -emit_texbias_sequence(struct nv50_pc *pc, struct nv50_reg *t[4], unsigned arg, - struct nv50_program_exec *tex) -{ - struct nv50_program_exec *e; - struct nv50_reg imm_1248, *t123[4][4], *r_bits = alloc_temp(pc, NULL); - int r_pred = 0; - unsigned n, c, i, cc[4] = { 0x0a, 0x13, 0x11, 0x10 }; - - pc->allow32 = FALSE; - ctor_reg(&imm_1248, P_IMMD, -1, ctor_immd_4u32(pc, 1, 2, 4, 8) * 4); - - /* Subtract bias value of thread i from bias values of each thread, - * store result in r_pred, and set bit i in r_bits if result was 0. - */ - assert(arg < 4); - for (i = 0; i < 4; ++i, ++imm_1248.hw) { - emit_quadop(pc, NULL, r_pred, i, t[arg], t[arg], 0x55); - emit_mov(pc, r_bits, &imm_1248); - set_pred(pc, 2, r_pred, pc->p->exec_tail); - } - emit_mov_to_pred(pc, r_pred, r_bits); - - /* The lanes of a quad are now grouped by the bit in r_pred they have - * set. Put the input values for TEX into a new register set for each - * group and execute TEX only for a specific group. - * We cannot use the same register set for each group because we need - * the derivatives, which are implicitly calculated, to be correct. 
- */ - for (i = 1; i < 4; ++i) { - alloc_temp4(pc, t123[i], 0); - - for (c = 0; c <= arg; ++c) - emit_mov(pc, t123[i][c], t[c]); - - *(e = exec(pc)) = *(tex); - e->inst[0] &= ~0x01fc; - set_dst(pc, t123[i][0], e); - set_pred(pc, cc[i], r_pred, e); - emit(pc, e); - } - /* finally TEX on the original regs (where we kept the input) */ - set_pred(pc, cc[0], r_pred, tex); - emit(pc, tex); - - /* put the 3 * n other results into regs for lane 0 */ - n = popcnt4(((e->inst[0] >> 25) & 0x3) | ((e->inst[1] >> 12) & 0xc)); - for (i = 1; i < 4; ++i) { - for (c = 0; c < n; ++c) { - emit_mov(pc, t[c], t123[i][c]); - set_pred(pc, cc[i], r_pred, pc->p->exec_tail); - } - free_temp4(pc, t123[i]); - } - - emit_nop(pc); - free_temp(pc, r_bits); -} - -static void -emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask, - struct nv50_reg **src, unsigned unit, unsigned type, - boolean proj, int bias_lod) -{ - struct nv50_reg *t[4]; - struct nv50_program_exec *e; - unsigned c, dim, arg; - - /* t[i] must be within a single 128 bit super-reg */ - alloc_temp4(pc, t, 0); - - e = exec(pc); - e->inst[0] = 0xf0000000; - set_long(pc, e); - set_dst(pc, t[0], e); - - /* TIC and TSC binding indices (TSC is ignored as TSC_LINKED = TRUE): */ - e->inst[0] |= (unit << 9) /* | (unit << 17) */; - - /* live flag (don't set if TEX results affect input to another TEX): */ - /* e->inst[0] |= 0x00000004; */ - - get_tex_dim(type, &dim, &arg); - - if (type == TGSI_TEXTURE_CUBE) { - e->inst[0] |= 0x08000000; - load_cube_tex_coords(pc, t, src, arg, proj); - } else - if (proj) - load_proj_tex_coords(pc, t, src, dim, arg); - else { - for (c = 0; c < dim; c++) - emit_mov(pc, t[c], src[c]); - if (arg != dim) /* depth reference value (always src.z here) */ - emit_mov(pc, t[dim], src[2]); - } - - e->inst[0] |= (mask & 0x3) << 25; - e->inst[1] |= (mask & 0xc) << 12; - - if (!bias_lod) { - e->inst[0] |= (arg - 1) << 22; - emit(pc, e); - } else - if (bias_lod < 0) { - assert(pc->p->type == PIPE_SHADER_FRAGMENT); 
- e->inst[0] |= arg << 22; - e->inst[1] |= 0x20000000; /* texbias */ - emit_mov(pc, t[arg], src[3]); - emit_texbias_sequence(pc, t, arg, e); - } else { - e->inst[0] |= arg << 22; - e->inst[1] |= 0x40000000; /* texlod */ - emit_mov(pc, t[arg], src[3]); - emit_texlod_sequence(pc, t[arg], src[3], e); - } - -#if 1 - c = 0; - if (mask & 1) emit_mov(pc, dst[0], t[c++]); - if (mask & 2) emit_mov(pc, dst[1], t[c++]); - if (mask & 4) emit_mov(pc, dst[2], t[c++]); - if (mask & 8) emit_mov(pc, dst[3], t[c]); - - free_temp4(pc, t); -#else - /* XXX: if p.e. MUL is used directly after TEX, it would still use - * the texture coordinates, not the fetched values: latency ? */ - - for (c = 0; c < 4; c++) { - if (mask & (1 << c)) - assimilate_temp(pc, dst[c], t[c]); - else - free_temp(pc, t[c]); - } -#endif -} - -static void -emit_ddx(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) -{ - struct nv50_program_exec *e = exec(pc); - - assert(src->type == P_TEMP); - - e->inst[0] = (src->mod & NV50_MOD_NEG) ? 0xc0240000 : 0xc0140000; - e->inst[1] = (src->mod & NV50_MOD_NEG) ? 0x86400000 : 0x89800000; - set_long(pc, e); - set_dst(pc, dst, e); - set_src_0(pc, src, e); - set_src_2(pc, src, e); - - emit(pc, e); -} - -static void -emit_ddy(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) +nv50_vertprog_prepare(struct nv50_translation_info *ti) { - struct nv50_program_exec *e = exec(pc); - - assert(src->type == P_TEMP); + struct nv50_program *p = ti->p; + int i, c; + unsigned num_inputs = 0; - e->inst[0] = (src->mod & NV50_MOD_NEG) ? 0xc0250000 : 0xc0150000; - e->inst[1] = (src->mod & NV50_MOD_NEG) ? 
0x85800000 : 0x8a400000; - set_long(pc, e); - set_dst(pc, dst, e); - set_src_0(pc, src, e); - set_src_2(pc, src, e); + ti->input_file = NV_FILE_MEM_S; + ti->output_file = NV_FILE_OUT; - emit(pc, e); -} + for (i = 0; i <= ti->scan.file_max[TGSI_FILE_INPUT]; ++i) { + p->in[i].id = i; + p->in[i].hw = num_inputs; -static void -convert_to_long(struct nv50_pc *pc, struct nv50_program_exec *e) -{ - unsigned q = 0, m = ~0; + for (c = 0; c < 4; ++c) { + if (!ti->input_access[i][c]) + continue; + ti->input_map[i][c] = num_inputs++; + p->vp.attrs[(4 * i + c) / 32] |= 1 << ((i * 4 + c) % 32); + } + } - assert(!is_long(e)); + for (i = 0; i <= ti->scan.file_max[TGSI_FILE_OUTPUT]; ++i) { + p->out[i].id = i; + p->out[i].hw = p->max_out; - switch (e->inst[0] >> 28) { - case 0x1: - /* MOV */ - q = 0x0403c000; - m = 0xffff7fff; - break; - case 0x2: - case 0x3: - /* ADD, SUB, SUBR b32 */ - m = ~(0x8000 | (127 << 16)); - q = ((e->inst[0] & (~m)) >> 2) | (1 << 26); - break; - case 0x5: - /* SAD */ - m = ~(0x81 << 8); - q = (0x0c << 24) | ((e->inst[0] & (0x7f << 2)) << 12); - break; - case 0x6: - /* MAD u16 */ - q = (e->inst[0] & (0x7f << 2)) << 12; - break; - case 0x8: - /* INTERP (move centroid, perspective and flat bits) */ - m = ~0x03000100; - q = (e->inst[0] & (3 << 24)) >> (24 - 16); - q |= (e->inst[0] & (1 << 8)) << (18 - 8); - break; - case 0x9: - /* RCP */ - break; - case 0xB: - /* ADD */ - m = ~(127 << 16); - q = ((e->inst[0] & (~m)) >> 2); - break; - case 0xC: - /* MUL */ - m = ~0x00008000; - q = ((e->inst[0] & (~m)) << 12); - break; - case 0xE: - /* MAD (if src2 == dst) */ - q = ((e->inst[0] & 0x1fc) << 12); - break; - default: - assert(0); - break; - } + for (c = 0; c < 4; ++c) { + if (!ti->output_access[i][c]) + continue; + ti->output_map[i][c] = p->max_out++; + p->out[i].mask |= 1 << c; + } + } - set_long(pc, e); - pc->p->exec_size++; + if (p->vp.psiz < 0x40) + p->vp.psiz = p->out[p->vp.psiz].hw; - e->inst[0] &= m; - e->inst[1] |= q; + return 0; } -/* Some operations 
support an optional negation flag. */ static int -get_supported_mods(const struct tgsi_full_instruction *insn, int i) -{ - switch (insn->Instruction.Opcode) { - case TGSI_OPCODE_ADD: - case TGSI_OPCODE_COS: - case TGSI_OPCODE_DDX: - case TGSI_OPCODE_DDY: - case TGSI_OPCODE_DP3: - case TGSI_OPCODE_DP4: - case TGSI_OPCODE_EX2: - case TGSI_OPCODE_KIL: - case TGSI_OPCODE_LG2: - case TGSI_OPCODE_MAD: - case TGSI_OPCODE_MUL: - case TGSI_OPCODE_POW: - case TGSI_OPCODE_RCP: - case TGSI_OPCODE_RSQ: /* ignored, RSQ = rsqrt(abs(src.x)) */ - case TGSI_OPCODE_SCS: - case TGSI_OPCODE_SIN: - case TGSI_OPCODE_SUB: - return NV50_MOD_NEG; - case TGSI_OPCODE_MAX: - case TGSI_OPCODE_MIN: - case TGSI_OPCODE_INEG: /* tgsi src sign toggle/set would be stupid */ - return NV50_MOD_ABS; - case TGSI_OPCODE_CEIL: - case TGSI_OPCODE_FLR: - case TGSI_OPCODE_TRUNC: - return NV50_MOD_NEG | NV50_MOD_ABS; - case TGSI_OPCODE_F2I: - case TGSI_OPCODE_F2U: - case TGSI_OPCODE_I2F: - case TGSI_OPCODE_U2F: - return NV50_MOD_NEG | NV50_MOD_ABS | NV50_MOD_I32; - case TGSI_OPCODE_UADD: - return NV50_MOD_NEG | NV50_MOD_I32; - case TGSI_OPCODE_SAD: - case TGSI_OPCODE_SHL: - case TGSI_OPCODE_IMAX: - case TGSI_OPCODE_IMIN: - case TGSI_OPCODE_ISHR: - case TGSI_OPCODE_NOT: - case TGSI_OPCODE_UMAD: - case TGSI_OPCODE_UMAX: - case TGSI_OPCODE_UMIN: - case TGSI_OPCODE_UMUL: - case TGSI_OPCODE_USHR: - return NV50_MOD_I32; - default: - return 0; - } -} - -/* Return a read mask for source registers deduced from opcode & write mask. */ -static unsigned -nv50_tgsi_src_mask(const struct tgsi_full_instruction *insn, int c) -{ - unsigned x, mask = insn->Dst[0].Register.WriteMask; - - switch (insn->Instruction.Opcode) { - case TGSI_OPCODE_COS: - case TGSI_OPCODE_SIN: - return (mask & 0x8) | ((mask & 0x7) ? 0x1 : 0x0); - case TGSI_OPCODE_DP3: - return 0x7; - case TGSI_OPCODE_DP4: - case TGSI_OPCODE_DPH: - case TGSI_OPCODE_KIL: /* WriteMask ignored */ - return 0xf; - case TGSI_OPCODE_DST: - return mask & (c ? 
0xa : 0x6); - case TGSI_OPCODE_EX2: - case TGSI_OPCODE_EXP: - case TGSI_OPCODE_LG2: - case TGSI_OPCODE_LOG: - case TGSI_OPCODE_POW: - case TGSI_OPCODE_RCP: - case TGSI_OPCODE_RSQ: - case TGSI_OPCODE_SCS: - return 0x1; - case TGSI_OPCODE_IF: - return 0x1; - case TGSI_OPCODE_LIT: - return 0xb; - case TGSI_OPCODE_TEX: - case TGSI_OPCODE_TXB: - case TGSI_OPCODE_TXL: - case TGSI_OPCODE_TXP: - { - const struct tgsi_instruction_texture *tex; - - assert(insn->Instruction.Texture); - tex = &insn->Texture; - - mask = 0x7; - if (insn->Instruction.Opcode != TGSI_OPCODE_TEX && - insn->Instruction.Opcode != TGSI_OPCODE_TXD) - mask |= 0x8; /* bias, lod or proj */ - - switch (tex->Texture) { - case TGSI_TEXTURE_1D: - mask &= 0x9; - break; - case TGSI_TEXTURE_SHADOW1D: - mask &= 0x5; - break; - case TGSI_TEXTURE_2D: - mask &= 0xb; - break; - default: - break; - } - } - return mask; - case TGSI_OPCODE_XPD: - x = 0; - if (mask & 1) x |= 0x6; - if (mask & 2) x |= 0x5; - if (mask & 4) x |= 0x3; - return x; - default: - break; - } - - return mask; -} - -static struct nv50_reg * -tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst) -{ - switch (dst->Register.File) { - case TGSI_FILE_TEMPORARY: - return &pc->temp[dst->Register.Index * 4 + c]; - case TGSI_FILE_OUTPUT: - return &pc->result[dst->Register.Index * 4 + c]; - case TGSI_FILE_ADDRESS: - { - struct nv50_reg *r = pc->addr[dst->Register.Index * 4 + c]; - if (!r) { - r = get_address_reg(pc, NULL); - r->index = dst->Register.Index * 4 + c; - pc->addr[r->index] = r; - } - assert(r); - return r; - } - case TGSI_FILE_NULL: - return NULL; - case TGSI_FILE_SYSTEM_VALUE: - assert(pc->sysval[dst->Register.Index].type == P_RESULT); - assert(c == 0); - return &pc->sysval[dst->Register.Index]; - default: - break; - } - - return NULL; -} - -static struct nv50_reg * -tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src, - int mod) -{ - struct nv50_reg *r = NULL; - struct nv50_reg *temp = NULL; - 
unsigned sgn, c, swz, cvn; - - if (src->Register.File != TGSI_FILE_CONSTANT) - assert(!src->Register.Indirect); - - sgn = tgsi_util_get_full_src_register_sign_mode(src, chan); - - c = tgsi_util_get_full_src_register_swizzle(src, chan); - switch (c) { - case TGSI_SWIZZLE_X: - case TGSI_SWIZZLE_Y: - case TGSI_SWIZZLE_Z: - case TGSI_SWIZZLE_W: - switch (src->Register.File) { - case TGSI_FILE_INPUT: - r = &pc->attr[src->Register.Index * 4 + c]; - - if (!src->Dimension.Dimension) - break; - r = reg_instance(pc, r); - r->vtx = src->Dimension.Index; - - if (!src->Dimension.Indirect) - break; - swz = tgsi_util_get_src_register_swizzle( - &src->DimIndirect, 0); - r->acc = -1; - r->indirect[1] = src->DimIndirect.Index * 4 + swz; - break; - case TGSI_FILE_TEMPORARY: - r = &pc->temp[src->Register.Index * 4 + c]; - break; - case TGSI_FILE_CONSTANT: - if (!src->Register.Indirect) { - r = &pc->param[src->Register.Index * 4 + c]; - break; - } - /* Indicate indirection by setting r->acc < 0 and - * use the index field to select the address reg. - */ - r = reg_instance(pc, NULL); - ctor_reg(r, P_CONST, -1, src->Register.Index * 4 + c); - - swz = tgsi_util_get_src_register_swizzle( - &src->Indirect, 0); - r->acc = -1; - r->indirect[0] = src->Indirect.Index * 4 + swz; - break; - case TGSI_FILE_IMMEDIATE: - r = &pc->immd[src->Register.Index * 4 + c]; - break; - case TGSI_FILE_SAMPLER: - return NULL; - case TGSI_FILE_ADDRESS: - r = pc->addr[src->Register.Index * 4 + c]; - assert(r); - break; - case TGSI_FILE_SYSTEM_VALUE: - assert(c == 0); - r = &pc->sysval[src->Register.Index]; - break; - default: - assert(0); - break; - } - break; - default: - assert(0); - break; - } - - cvn = (mod & NV50_MOD_I32) ? 
CVT_S32_S32 : CVT_F32_F32; - - switch (sgn) { - case TGSI_UTIL_SIGN_CLEAR: - r->mod = NV50_MOD_ABS; - break; - case TGSI_UTIL_SIGN_SET: - r->mod = NV50_MOD_NEG_ABS; - break; - case TGSI_UTIL_SIGN_TOGGLE: - r->mod = NV50_MOD_NEG; - break; - default: - assert(!r->mod && sgn == TGSI_UTIL_SIGN_KEEP); - break; - } - - if ((r->mod & mod) != r->mod) { - temp = temp_temp(pc, NULL); - emit_cvt(pc, temp, r, -1, cvn); - r->mod = 0; - r = temp; - } else - r->mod |= mod & NV50_MOD_I32; - - assert(r); - if (r->acc >= 0 && r->vtx < 0 && r != temp) - return reg_instance(pc, r); /* will clear r->mod */ - return r; -} - -/* return TRUE for ops that produce only a single result */ -static boolean -is_scalar_op(unsigned op) -{ - switch (op) { - case TGSI_OPCODE_COS: - case TGSI_OPCODE_DP2: - case TGSI_OPCODE_DP3: - case TGSI_OPCODE_DP4: - case TGSI_OPCODE_DPH: - case TGSI_OPCODE_EX2: - case TGSI_OPCODE_LG2: - case TGSI_OPCODE_POW: - case TGSI_OPCODE_RCP: - case TGSI_OPCODE_RSQ: - case TGSI_OPCODE_SIN: - /* - case TGSI_OPCODE_KIL: - case TGSI_OPCODE_LIT: - case TGSI_OPCODE_SCS: - */ - return TRUE; - default: - return FALSE; - } -} - -/* Returns a bitmask indicating which dst components depend - * on source s, component c (reverse of nv50_tgsi_src_mask). - */ -static unsigned -nv50_tgsi_dst_revdep(unsigned op, int s, int c) -{ - if (is_scalar_op(op)) - return 0x1; - - switch (op) { - case TGSI_OPCODE_DST: - return (1 << c) & (s ? 
0xa : 0x6); - case TGSI_OPCODE_XPD: - switch (c) { - case 0: return 0x6; - case 1: return 0x5; - case 2: return 0x3; - case 3: return 0x0; - default: - assert(0); - return 0x0; - } - case TGSI_OPCODE_EXP: - case TGSI_OPCODE_LOG: - case TGSI_OPCODE_LIT: - case TGSI_OPCODE_SCS: - case TGSI_OPCODE_TEX: - case TGSI_OPCODE_TXB: - case TGSI_OPCODE_TXL: - case TGSI_OPCODE_TXP: - /* these take care of dangerous swizzles themselves */ - return 0x0; - case TGSI_OPCODE_IF: - case TGSI_OPCODE_KIL: - /* don't call this function for these ops */ - assert(0); - return 0; - default: - /* linear vector instruction */ - return (1 << c); - } -} - -static INLINE boolean -has_pred(struct nv50_program_exec *e, unsigned cc) -{ - if (!is_long(e) || is_immd(e)) - return FALSE; - return ((e->inst[1] & 0x780) == (cc << 7)); -} - -/* on ENDIF see if we can do "@p0.neu single_op" instead of: - * join_at ENDIF - * @p0.eq bra ENDIF - * single_op - * ENDIF: nop.join - */ -static boolean -nv50_kill_branch(struct nv50_pc *pc) -{ - int lvl = pc->if_lvl; - - if (pc->if_insn[lvl]->next != pc->p->exec_tail) - return FALSE; - if (is_immd(pc->p->exec_tail)) - return FALSE; - - /* if ccode == 'true', the BRA is from an ELSE and the predicate - * reg may no longer be valid, since we currently always use $p0 - */ - if (has_pred(pc->if_insn[lvl], 0xf)) - return FALSE; - assert(pc->if_insn[lvl] && pc->if_join[lvl]); - - /* We'll use the exec allocated for JOIN_AT (we can't easily - * access nv50_program_exec's prev). 
- */ - pc->p->exec_size -= 4; /* remove JOIN_AT and BRA */ - - *pc->if_join[lvl] = *pc->p->exec_tail; - - FREE(pc->if_insn[lvl]); - FREE(pc->p->exec_tail); - - pc->p->exec_tail = pc->if_join[lvl]; - pc->p->exec_tail->next = NULL; - set_pred(pc, 0xd, 0, pc->p->exec_tail); - - return TRUE; -} - -static void -nv50_fp_move_results(struct nv50_pc *pc) -{ - struct nv50_reg reg; - unsigned i; - - ctor_reg(®, P_TEMP, -1, -1); - - for (i = 0; i < pc->result_nr * 4; ++i) { - if (pc->result[i].rhw < 0 || pc->result[i].hw < 0) - continue; - if (pc->result[i].rhw != pc->result[i].hw) { - reg.hw = pc->result[i].rhw; - emit_mov(pc, ®, &pc->result[i]); - } - } -} - -static boolean -nv50_program_tx_insn(struct nv50_pc *pc, - const struct tgsi_full_instruction *inst) -{ - struct nv50_reg *rdst[4], *dst[4], *brdc, *src[3][4], *temp; - unsigned mask, sat, unit = 0; - int i, c; - - mask = inst->Dst[0].Register.WriteMask; - sat = inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE; - - memset(src, 0, sizeof(src)); - - for (c = 0; c < 4; c++) { - if ((mask & (1 << c)) && !pc->r_dst[c]) - dst[c] = tgsi_dst(pc, c, &inst->Dst[0]); - else - dst[c] = pc->r_dst[c]; - rdst[c] = dst[c]; - } - - for (i = 0; i < inst->Instruction.NumSrcRegs; i++) { - const struct tgsi_full_src_register *fs = &inst->Src[i]; - unsigned src_mask; - int mod_supp; - - src_mask = nv50_tgsi_src_mask(inst, i); - mod_supp = get_supported_mods(inst, i); - - if (fs->Register.File == TGSI_FILE_SAMPLER) - unit = fs->Register.Index; - - for (c = 0; c < 4; c++) - if (src_mask & (1 << c)) - src[i][c] = tgsi_src(pc, c, fs, mod_supp); - } - - brdc = temp = pc->r_brdc; - if (brdc && brdc->type != P_TEMP) { - temp = temp_temp(pc, NULL); - if (sat) - brdc = temp; - } else - if (sat) { - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c)) || dst[c]->type == P_TEMP) - continue; - /* rdst[c] = dst[c]; */ /* done above */ - dst[c] = temp_temp(pc, NULL); - } - } - - assert(brdc || !is_scalar_op(inst->Instruction.Opcode)); - - switch 
(inst->Instruction.Opcode) { - case TGSI_OPCODE_ABS: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_cvt(pc, dst[c], src[0][c], -1, - CVT_ABS | CVT_F32_F32); - } - break; - case TGSI_OPCODE_ADD: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_add(pc, dst[c], src[0][c], src[1][c]); - } - break; - case TGSI_OPCODE_AND: - case TGSI_OPCODE_XOR: - case TGSI_OPCODE_OR: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_bitop2(pc, dst[c], src[0][c], src[1][c], - inst->Instruction.Opcode); - } - break; - case TGSI_OPCODE_ARL: - temp = temp_temp(pc, NULL); - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_cvt(pc, temp, src[0][c], -1, - CVT_FLOOR | CVT_S32_F32); - emit_arl(pc, dst[c], temp, 4); - } - break; - case TGSI_OPCODE_BGNLOOP: - pc->loop_brka[pc->loop_lvl] = emit_breakaddr(pc); - pc->loop_pos[pc->loop_lvl++] = pc->p->exec_size; - terminate_mbb(pc); - break; - case TGSI_OPCODE_BGNSUB: - assert(!pc->in_subroutine); - pc->in_subroutine = TRUE; - /* probably not necessary, but align to 8 byte boundary */ - if (!is_long(pc->p->exec_tail)) - convert_to_long(pc, pc->p->exec_tail); - break; - case TGSI_OPCODE_BRK: - assert(pc->loop_lvl > 0); - emit_break(pc, -1, 0); - break; - case TGSI_OPCODE_CAL: - assert(inst->Label.Label < pc->insn_nr); - emit_call(pc, -1, 0)->param.index = inst->Label.Label; - /* replaced by actual offset in nv50_program_fixup_insns */ - break; - case TGSI_OPCODE_CEIL: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_cvt(pc, dst[c], src[0][c], -1, - CVT_CEIL | CVT_F32_F32 | CVT_RI); - } - break; - case TGSI_OPCODE_CMP: - pc->allow32 = FALSE; - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_cvt(pc, NULL, src[0][c], 1, CVT_F32_F32); - emit_mov(pc, dst[c], src[1][c]); - set_pred(pc, 0x1, 1, pc->p->exec_tail); /* @SF */ - emit_mov(pc, dst[c], src[2][c]); - set_pred(pc, 0x6, 1, pc->p->exec_tail); /* @NSF */ - } - 
break; - case TGSI_OPCODE_CONT: - assert(pc->loop_lvl > 0); - emit_branch(pc, -1, 0)->param.index = - pc->loop_pos[pc->loop_lvl - 1]; - break; - case TGSI_OPCODE_COS: - if (mask & 8) { - emit_precossin(pc, temp, src[0][3]); - emit_flop(pc, NV50_FLOP_COS, dst[3], temp); - if (!(mask &= 7)) - break; - if (temp == dst[3]) - temp = brdc = temp_temp(pc, NULL); - } - emit_precossin(pc, temp, src[0][0]); - emit_flop(pc, NV50_FLOP_COS, brdc, temp); - break; - case TGSI_OPCODE_DDX: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_ddx(pc, dst[c], src[0][c]); - } - break; - case TGSI_OPCODE_DDY: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_ddy(pc, dst[c], src[0][c]); - } - break; - case TGSI_OPCODE_DP3: - emit_mul(pc, temp, src[0][0], src[1][0]); - emit_mad(pc, temp, src[0][1], src[1][1], temp); - emit_mad(pc, brdc, src[0][2], src[1][2], temp); - break; - case TGSI_OPCODE_DP4: - emit_mul(pc, temp, src[0][0], src[1][0]); - emit_mad(pc, temp, src[0][1], src[1][1], temp); - emit_mad(pc, temp, src[0][2], src[1][2], temp); - emit_mad(pc, brdc, src[0][3], src[1][3], temp); - break; - case TGSI_OPCODE_DPH: - emit_mul(pc, temp, src[0][0], src[1][0]); - emit_mad(pc, temp, src[0][1], src[1][1], temp); - emit_mad(pc, temp, src[0][2], src[1][2], temp); - emit_add(pc, brdc, src[1][3], temp); - break; - case TGSI_OPCODE_DST: - if (mask & (1 << 1)) - emit_mul(pc, dst[1], src[0][1], src[1][1]); - if (mask & (1 << 2)) - emit_mov(pc, dst[2], src[0][2]); - if (mask & (1 << 3)) - emit_mov(pc, dst[3], src[1][3]); - if (mask & (1 << 0)) - emit_mov_immdval(pc, dst[0], 1.0f); - break; - case TGSI_OPCODE_ELSE: - emit_branch(pc, -1, 0); - pc->if_insn[--pc->if_lvl]->param.index = pc->p->exec_size; - pc->if_insn[pc->if_lvl++] = pc->p->exec_tail; - terminate_mbb(pc); - break; - case TGSI_OPCODE_EMIT: - emit_prim_cmd(pc, 1); - break; - case TGSI_OPCODE_ENDIF: - pc->if_insn[--pc->if_lvl]->param.index = pc->p->exec_size; - - /* try to replace branch over 1 
insn with a predicated insn */ - if (nv50_kill_branch(pc) == TRUE) - break; - - if (pc->if_join[pc->if_lvl]) { - pc->if_join[pc->if_lvl]->param.index = pc->p->exec_size; - pc->if_join[pc->if_lvl] = NULL; - } - terminate_mbb(pc); - /* emit a NOP as join point, we could set it on the next - * one, but would have to make sure it is long and !immd - */ - JOIN_ON(emit_nop(pc)); - break; - case TGSI_OPCODE_ENDLOOP: - emit_branch(pc, -1, 0)->param.index = - pc->loop_pos[--pc->loop_lvl]; - pc->loop_brka[pc->loop_lvl]->param.index = pc->p->exec_size; - terminate_mbb(pc); - break; - case TGSI_OPCODE_ENDPRIM: - emit_prim_cmd(pc, 2); - break; - case TGSI_OPCODE_ENDSUB: - assert(pc->in_subroutine); - terminate_mbb(pc); - pc->in_subroutine = FALSE; - break; - case TGSI_OPCODE_EX2: - emit_preex2(pc, temp, src[0][0]); - emit_flop(pc, NV50_FLOP_EX2, brdc, temp); - break; - case TGSI_OPCODE_EXP: - { - struct nv50_reg *t[2]; - - assert(!temp); - t[0] = temp_temp(pc, NULL); - t[1] = temp_temp(pc, NULL); - - if (mask & 0x6) - emit_mov(pc, t[0], src[0][0]); - if (mask & 0x3) - emit_flr(pc, t[1], src[0][0]); - - if (mask & (1 << 1)) - emit_sub(pc, dst[1], t[0], t[1]); - if (mask & (1 << 0)) { - emit_preex2(pc, t[1], t[1]); - emit_flop(pc, NV50_FLOP_EX2, dst[0], t[1]); - } - if (mask & (1 << 2)) { - emit_preex2(pc, t[0], t[0]); - emit_flop(pc, NV50_FLOP_EX2, dst[2], t[0]); - } - if (mask & (1 << 3)) - emit_mov_immdval(pc, dst[3], 1.0f); - } - break; - case TGSI_OPCODE_F2I: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_cvt(pc, dst[c], src[0][c], -1, - CVT_TRUNC | CVT_S32_F32); - } - break; - case TGSI_OPCODE_F2U: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_cvt(pc, dst[c], src[0][c], -1, - CVT_TRUNC | CVT_U32_F32); - } - break; - case TGSI_OPCODE_FLR: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_flr(pc, dst[c], src[0][c]); - } - break; - case TGSI_OPCODE_FRC: - temp = temp_temp(pc, NULL); - for (c = 0; c < 
4; c++) { - if (!(mask & (1 << c))) - continue; - emit_flr(pc, temp, src[0][c]); - emit_sub(pc, dst[c], src[0][c], temp); - } - break; - case TGSI_OPCODE_I2F: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_cvt(pc, dst[c], src[0][c], -1, CVT_F32_S32); - } - break; - case TGSI_OPCODE_IF: - assert(pc->if_lvl < NV50_MAX_COND_NESTING); - emit_cvt(pc, NULL, src[0][0], 0, CVT_ABS | CVT_F32_F32); - pc->if_join[pc->if_lvl] = emit_joinat(pc); - pc->if_insn[pc->if_lvl++] = emit_branch(pc, 0, 2);; - terminate_mbb(pc); - break; - case TGSI_OPCODE_IMAX: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_minmax(pc, 0x08c, dst[c], src[0][c], src[1][c]); - } - break; - case TGSI_OPCODE_IMIN: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_minmax(pc, 0x0ac, dst[c], src[0][c], src[1][c]); - } - break; - case TGSI_OPCODE_INEG: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_cvt(pc, dst[c], src[0][c], -1, - CVT_S32_S32 | CVT_NEG); - } - break; - case TGSI_OPCODE_KIL: - assert(src[0][0] && src[0][1] && src[0][2] && src[0][3]); - emit_kil(pc, src[0][0]); - emit_kil(pc, src[0][1]); - emit_kil(pc, src[0][2]); - emit_kil(pc, src[0][3]); - break; - case TGSI_OPCODE_KILP: - emit_kil(pc, NULL); - break; - case TGSI_OPCODE_LIT: - emit_lit(pc, &dst[0], mask, &src[0][0]); - break; - case TGSI_OPCODE_LG2: - emit_flop(pc, NV50_FLOP_LG2, brdc, src[0][0]); - break; - case TGSI_OPCODE_LOG: - { - struct nv50_reg *t[2]; - - t[0] = temp_temp(pc, NULL); - if (mask & (1 << 1)) - t[1] = temp_temp(pc, NULL); - else - t[1] = t[0]; - - emit_cvt(pc, t[0], src[0][0], -1, CVT_ABS | CVT_F32_F32); - emit_flop(pc, NV50_FLOP_LG2, t[1], t[0]); - if (mask & (1 << 2)) - emit_mov(pc, dst[2], t[1]); - emit_flr(pc, t[1], t[1]); - if (mask & (1 << 0)) - emit_mov(pc, dst[0], t[1]); - if (mask & (1 << 1)) { - t[1]->mod = NV50_MOD_NEG; - emit_preex2(pc, t[1], t[1]); - t[1]->mod = 0; - emit_flop(pc, NV50_FLOP_EX2, t[1], t[1]); - 
emit_mul(pc, dst[1], t[0], t[1]); - } - if (mask & (1 << 3)) - emit_mov_immdval(pc, dst[3], 1.0f); - } - break; - case TGSI_OPCODE_LRP: - temp = temp_temp(pc, NULL); - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_sub(pc, temp, src[1][c], src[2][c]); - emit_mad(pc, dst[c], temp, src[0][c], src[2][c]); - } - break; - case TGSI_OPCODE_MAD: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_mad(pc, dst[c], src[0][c], src[1][c], src[2][c]); - } - break; - case TGSI_OPCODE_MAX: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_minmax(pc, 0x880, dst[c], src[0][c], src[1][c]); - } - break; - case TGSI_OPCODE_MIN: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_minmax(pc, 0x8a0, dst[c], src[0][c], src[1][c]); - } - break; - case TGSI_OPCODE_MOV: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_mov(pc, dst[c], src[0][c]); - } - break; - case TGSI_OPCODE_MUL: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_mul(pc, dst[c], src[0][c], src[1][c]); - } - break; - case TGSI_OPCODE_NOT: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_not(pc, dst[c], src[0][c]); - } - break; - case TGSI_OPCODE_POW: - emit_pow(pc, brdc, src[0][0], src[1][0]); - break; - case TGSI_OPCODE_RCP: - if (!sat && popcnt4(mask) == 1) - brdc = dst[ffs(mask) - 1]; - emit_flop(pc, NV50_FLOP_RCP, brdc, src[0][0]); - break; - case TGSI_OPCODE_RET: - if (pc->p->type == PIPE_SHADER_FRAGMENT && !pc->in_subroutine) - nv50_fp_move_results(pc); - emit_ret(pc, -1, 0); - break; - case TGSI_OPCODE_RSQ: - if (!sat && popcnt4(mask) == 1) - brdc = dst[ffs(mask) - 1]; - src[0][0]->mod |= NV50_MOD_ABS; - emit_flop(pc, NV50_FLOP_RSQ, brdc, src[0][0]); - break; - case TGSI_OPCODE_SAD: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_sad(pc, dst[c], src[0][c], src[1][c], src[2][c]); - } - break; - case TGSI_OPCODE_SCS: - temp = 
temp_temp(pc, NULL); - if (mask & 3) - emit_precossin(pc, temp, src[0][0]); - if (mask & (1 << 0)) - emit_flop(pc, NV50_FLOP_COS, dst[0], temp); - if (mask & (1 << 1)) - emit_flop(pc, NV50_FLOP_SIN, dst[1], temp); - if (mask & (1 << 2)) - emit_mov_immdval(pc, dst[2], 0.0); - if (mask & (1 << 3)) - emit_mov_immdval(pc, dst[3], 1.0); - break; - case TGSI_OPCODE_SHL: - case TGSI_OPCODE_ISHR: - case TGSI_OPCODE_USHR: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_shift(pc, dst[c], src[0][c], src[1][c], - inst->Instruction.Opcode); - } - break; - case TGSI_OPCODE_SIN: - if (mask & 8) { - emit_precossin(pc, temp, src[0][3]); - emit_flop(pc, NV50_FLOP_SIN, dst[3], temp); - if (!(mask &= 7)) - break; - if (temp == dst[3]) - temp = brdc = temp_temp(pc, NULL); - } - emit_precossin(pc, temp, src[0][0]); - emit_flop(pc, NV50_FLOP_SIN, brdc, temp); - break; - case TGSI_OPCODE_SLT: - case TGSI_OPCODE_SGE: - case TGSI_OPCODE_SEQ: - case TGSI_OPCODE_SGT: - case TGSI_OPCODE_SLE: - case TGSI_OPCODE_SNE: - case TGSI_OPCODE_ISLT: - case TGSI_OPCODE_ISGE: - case TGSI_OPCODE_USEQ: - case TGSI_OPCODE_USGE: - case TGSI_OPCODE_USLT: - case TGSI_OPCODE_USNE: - { - uint8_t cc, ty; - - map_tgsi_setop_hw(inst->Instruction.Opcode, &cc, &ty); - - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_set(pc, cc, dst[c], -1, src[0][c], src[1][c], ty); - } - } - break; - case TGSI_OPCODE_SUB: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_sub(pc, dst[c], src[0][c], src[1][c]); - } - break; - case TGSI_OPCODE_TEX: - emit_tex(pc, dst, mask, src[0], unit, - inst->Texture.Texture, FALSE, 0); - break; - case TGSI_OPCODE_TXB: - emit_tex(pc, dst, mask, src[0], unit, - inst->Texture.Texture, FALSE, -1); - break; - case TGSI_OPCODE_TXL: - emit_tex(pc, dst, mask, src[0], unit, - inst->Texture.Texture, FALSE, 1); - break; - case TGSI_OPCODE_TXP: - emit_tex(pc, dst, mask, src[0], unit, - inst->Texture.Texture, TRUE, 0); - break; - case 
TGSI_OPCODE_TRUNC: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_cvt(pc, dst[c], src[0][c], -1, - CVT_TRUNC | CVT_F32_F32 | CVT_RI); - } - break; - case TGSI_OPCODE_U2F: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_cvt(pc, dst[c], src[0][c], -1, CVT_F32_U32); - } - break; - case TGSI_OPCODE_UADD: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_add_b32(pc, dst[c], src[0][c], src[1][c]); - } - break; - case TGSI_OPCODE_UMAX: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_minmax(pc, 0x084, dst[c], src[0][c], src[1][c]); - } - break; - case TGSI_OPCODE_UMIN: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_minmax(pc, 0x0a4, dst[c], src[0][c], src[1][c]); - } - break; - case TGSI_OPCODE_UMAD: - { - assert(!temp); - temp = temp_temp(pc, NULL); - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_mul_u16(pc, temp, src[0][c], 0, src[1][c], 1); - emit_mad_u16(pc, temp, src[0][c], 1, src[1][c], 0, - temp); - emit_shl_imm(pc, temp, temp, 16); - emit_mad_u16(pc, temp, src[0][c], 0, src[1][c], 0, - temp); - emit_add_b32(pc, dst[c], temp, src[2][c]); - } - } - break; - case TGSI_OPCODE_UMUL: - { - assert(!temp); - temp = temp_temp(pc, NULL); - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_mul_u16(pc, temp, src[0][c], 0, src[1][c], 1); - emit_mad_u16(pc, temp, src[0][c], 1, src[1][c], 0, - temp); - emit_shl_imm(pc, temp, temp, 16); - emit_mad_u16(pc, dst[c], src[0][c], 0, src[1][c], 0, - temp); - } - } - break; - case TGSI_OPCODE_XPD: - temp = temp_temp(pc, NULL); - if (mask & (1 << 0)) { - emit_mul(pc, temp, src[0][2], src[1][1]); - emit_msb(pc, dst[0], src[0][1], src[1][2], temp); - } - if (mask & (1 << 1)) { - emit_mul(pc, temp, src[0][0], src[1][2]); - emit_msb(pc, dst[1], src[0][2], src[1][0], temp); - } - if (mask & (1 << 2)) { - emit_mul(pc, temp, src[0][1], src[1][0]); - emit_msb(pc, dst[2], 
src[0][0], src[1][1], temp); - } - if (mask & (1 << 3)) - emit_mov_immdval(pc, dst[3], 1.0); - break; - case TGSI_OPCODE_END: - if (pc->p->type == PIPE_SHADER_FRAGMENT) - nv50_fp_move_results(pc); - - if (!pc->p->exec_tail || - is_immd(pc->p->exec_tail) || - is_join(pc->p->exec_tail) || - is_control_flow(pc->p->exec_tail)) - emit_nop(pc); - - /* last insn must be long so it can have the exit bit set */ - if (!is_long(pc->p->exec_tail)) - convert_to_long(pc, pc->p->exec_tail); - - pc->p->exec_tail->inst[1] |= 1; /* set exit bit */ - - terminate_mbb(pc); - break; - default: - NOUVEAU_ERR("invalid opcode %d\n", inst->Instruction.Opcode); - return FALSE; - } - - if (brdc) { - if (sat) - emit_sat(pc, brdc, brdc); - for (c = 0; c < 4; c++) - if ((mask & (1 << c)) && dst[c] != brdc) - emit_mov(pc, dst[c], brdc); - } else - if (sat) { - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - /* In this case we saturate later, and dst[c] won't - * be another temp_temp (and thus lost), since rdst - * already is TEMP (see above). 
*/ - if (rdst[c]->type == P_TEMP && rdst[c]->index < 0) - continue; - emit_sat(pc, rdst[c], dst[c]); - } - } - - kill_temp_temp(pc, NULL); - pc->reg_instance_nr = 0; - - return TRUE; -} - -static void -prep_inspect_insn(struct nv50_pc *pc, const struct tgsi_full_instruction *insn) -{ - struct nv50_reg *r, *reg = NULL; - const struct tgsi_full_src_register *src; - const struct tgsi_dst_register *dst; - unsigned i, c, k, mask; - - dst = &insn->Dst[0].Register; - mask = dst->WriteMask; - - if (dst->File == TGSI_FILE_TEMPORARY) - reg = pc->temp; - else - if (dst->File == TGSI_FILE_OUTPUT) { - reg = pc->result; - - if (insn->Instruction.Opcode == TGSI_OPCODE_MOV && - dst->Index == pc->edgeflag_out && - insn->Src[0].Register.File == TGSI_FILE_INPUT) - pc->p->cfg.edgeflag_in = insn->Src[0].Register.Index; - } - - if (reg) { - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - reg[dst->Index * 4 + c].acc = pc->insn_nr; - } - } - - for (i = 0; i < insn->Instruction.NumSrcRegs; i++) { - src = &insn->Src[i]; - - if (src->Register.File == TGSI_FILE_TEMPORARY) - reg = pc->temp; - else - if (src->Register.File == TGSI_FILE_INPUT) - reg = pc->attr; - else - continue; - - mask = nv50_tgsi_src_mask(insn, i); - - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - k = tgsi_util_get_full_src_register_swizzle(src, c); - - r = ®[src->Register.Index * 4 + k]; - - /* If used before written, pre-allocate the reg, - * lest we overwrite results from a subroutine. - */ - if (!r->acc && r->type == P_TEMP) - alloc_reg(pc, r); - - r->acc = pc->insn_nr; - } - } -} - -/* Returns a bitmask indicating which dst components need to be - * written to temporaries first to avoid 'corrupting' sources. 
- * - * m[i] (out) indicate component to write in the i-th position - * rdep[c] (in) bitmasks of dst[i] that require dst[c] as source - */ -static unsigned -nv50_revdep_reorder(unsigned m[4], unsigned rdep[4]) -{ - unsigned i, c, x, unsafe = 0; - - for (c = 0; c < 4; c++) - m[c] = c; - - /* Swap as long as a dst component written earlier is depended on - * by one written later, but the next one isn't depended on by it. - */ - for (c = 0; c < 3; c++) { - if (rdep[m[c + 1]] & (1 << m[c])) - continue; /* if next one is depended on by us */ - for (i = c + 1; i < 4; i++) - /* if we are depended on by a later one */ - if (rdep[m[c]] & (1 << m[i])) - break; - if (i == 4) - continue; - /* now, swap */ - x = m[c]; - m[c] = m[c + 1]; - m[c + 1] = x; - - /* restart */ - c = 0; - } - - /* mark dependencies that could not be resolved by reordering */ - for (i = 0; i < 3; ++i) - for (c = i + 1; c < 4; ++c) - if (rdep[m[i]] & (1 << m[c])) - unsafe |= (1 << i); - - /* NOTE: $unsafe is with respect to order, not component */ - return unsafe; -} - -/* Select a suitable dst register for broadcasting scalar results, - * or return NULL if we have to allocate an extra TEMP. - * - * If e.g. only 1 component is written, we may also emit the final - * result to a write-only register. - */ -static struct nv50_reg * -tgsi_broadcast_dst(struct nv50_pc *pc, - const struct tgsi_full_dst_register *fd, unsigned mask) -{ - if (fd->Register.File == TGSI_FILE_TEMPORARY) { - int c = ffs(~mask & fd->Register.WriteMask); - if (c) - return tgsi_dst(pc, c - 1, fd); - } else { - int c = ffs(fd->Register.WriteMask) - 1; - if ((1 << c) == fd->Register.WriteMask) - return tgsi_dst(pc, c, fd); - } - - return NULL; -} - -/* Scan source swizzles and return a bitmask indicating dst regs that - * also occur among the src regs, and fill rdep for nv50_revdep_reoder. 
- */ -static unsigned -nv50_tgsi_scan_swizzle(const struct tgsi_full_instruction *insn, - unsigned rdep[4]) -{ - const struct tgsi_full_dst_register *fd = &insn->Dst[0]; - const struct tgsi_full_src_register *fs; - unsigned i, deqs = 0; - - for (i = 0; i < 4; ++i) - rdep[i] = 0; - - for (i = 0; i < insn->Instruction.NumSrcRegs; i++) { - unsigned chn, mask = nv50_tgsi_src_mask(insn, i); - int ms = get_supported_mods(insn, i); - - fs = &insn->Src[i]; - if (fs->Register.File != fd->Register.File || - fs->Register.Index != fd->Register.Index) - continue; - - for (chn = 0; chn < 4; ++chn) { - unsigned s, c; - - if (!(mask & (1 << chn))) /* src is not read */ - continue; - c = tgsi_util_get_full_src_register_swizzle(fs, chn); - s = tgsi_util_get_full_src_register_sign_mode(fs, chn); - - if (!(fd->Register.WriteMask & (1 << c))) - continue; - - if (s == TGSI_UTIL_SIGN_TOGGLE && !(ms & NV50_MOD_NEG)) - continue; - if (s == TGSI_UTIL_SIGN_CLEAR && !(ms & NV50_MOD_ABS)) - continue; - if ((s == TGSI_UTIL_SIGN_SET) && ((ms & 3) != 3)) - continue; - - rdep[c] |= nv50_tgsi_dst_revdep( - insn->Instruction.Opcode, i, chn); - deqs |= (1 << c); - } - } - - return deqs; -} - -static boolean -nv50_tgsi_insn(struct nv50_pc *pc, const union tgsi_full_token *tok) -{ - struct tgsi_full_instruction insn = tok->FullInstruction; - const struct tgsi_full_dst_register *fd; - unsigned i, deqs, rdep[4], m[4]; - - fd = &tok->FullInstruction.Dst[0]; - deqs = nv50_tgsi_scan_swizzle(&insn, rdep); - - if (is_scalar_op(insn.Instruction.Opcode)) { - pc->r_brdc = tgsi_broadcast_dst(pc, fd, deqs); - if (!pc->r_brdc) - pc->r_brdc = temp_temp(pc, NULL); - return nv50_program_tx_insn(pc, &insn); - } - pc->r_brdc = NULL; - - if (!deqs || (!rdep[0] && !rdep[1] && !rdep[2] && !rdep[3])) - return nv50_program_tx_insn(pc, &insn); - - deqs = nv50_revdep_reorder(m, rdep); - - for (i = 0; i < 4; ++i) { - assert(pc->r_dst[m[i]] == NULL); - - insn.Dst[0].Register.WriteMask = - fd->Register.WriteMask & (1 << m[i]); - 
- if (!insn.Dst[0].Register.WriteMask) - continue; - - if (deqs & (1 << i)) - pc->r_dst[m[i]] = alloc_temp(pc, NULL); - - if (!nv50_program_tx_insn(pc, &insn)) - return FALSE; - } - - for (i = 0; i < 4; i++) { - struct nv50_reg *reg = pc->r_dst[i]; - if (!reg) - continue; - pc->r_dst[i] = NULL; - - if (insn.Instruction.Saturate == TGSI_SAT_ZERO_ONE) - emit_sat(pc, tgsi_dst(pc, i, fd), reg); - else - emit_mov(pc, tgsi_dst(pc, i, fd), reg); - free_temp(pc, reg); - } - - return TRUE; -} - -static void -load_interpolant(struct nv50_pc *pc, struct nv50_reg *reg) -{ - struct nv50_reg *iv, **ppiv; - unsigned mode = pc->interp_mode[reg->index]; - - ppiv = (mode & INTERP_CENTROID) ? &pc->iv_c : &pc->iv_p; - iv = *ppiv; - - if ((mode & INTERP_PERSPECTIVE) && !iv) { - iv = *ppiv = alloc_temp(pc, NULL); - iv->rhw = popcnt4(pc->p->cfg.regs[1] >> 24) - 1; - - emit_interp(pc, iv, NULL, mode & INTERP_CENTROID); - emit_flop(pc, NV50_FLOP_RCP, iv, iv); - - /* XXX: when loading interpolants dynamically, move these - * to the program head, or make sure it can't be skipped. - */ - } - - emit_interp(pc, reg, iv, mode); -} - -/* The face input is always at v[255] (varying space), with a - * value of 0 for back-facing, and 0xffffffff for front-facing. 
- */ -static void -load_frontfacing(struct nv50_pc *pc, struct nv50_reg *sv) -{ - struct nv50_reg *temp = alloc_temp(pc, NULL); - int r_pred = 0; - - temp->rhw = 255; - emit_interp(pc, temp, NULL, INTERP_FLAT); - - emit_cvt(pc, sv, temp, r_pred, CVT_ABS | CVT_F32_S32); - - emit_not(pc, temp, temp); - set_pred(pc, 0x2, r_pred, pc->p->exec_tail); - emit_cvt(pc, sv, temp, -1, CVT_F32_S32); - set_pred(pc, 0x2, r_pred, pc->p->exec_tail); - - free_temp(pc, temp); -} - -static void -load_instance_id(struct nv50_pc *pc, unsigned index) -{ - struct nv50_reg reg, mem; - - ctor_reg(®, P_TEMP, -1, -1); - ctor_reg(&mem, P_CONST, -1, 24); /* startInstance */ - mem.buf_index = 2; - - emit_add_b32(pc, ®, &pc->sysval[index], &mem); - pc->sysval[index] = reg; -} - -static void -copy_semantic_info(struct nv50_program *p) -{ - unsigned i, id; - - for (i = 0; i < p->cfg.in_nr; ++i) { - id = p->cfg.in[i].id; - p->cfg.in[i].sn = p->info.input_semantic_name[id]; - p->cfg.in[i].si = p->info.input_semantic_index[id]; - } - - for (i = 0; i < p->cfg.out_nr; ++i) { - id = p->cfg.out[i].id; - p->cfg.out[i].sn = p->info.output_semantic_name[id]; - p->cfg.out[i].si = p->info.output_semantic_index[id]; - } -} - -static boolean -nv50_program_tx_prep(struct nv50_pc *pc) -{ - struct tgsi_parse_context tp; - struct nv50_program *p = pc->p; - boolean ret = FALSE; - unsigned i, c, instance_id = 0, vertex_id = 0, flat_nr = 0; - - tgsi_parse_init(&tp, pc->p->pipe.tokens); - while (!tgsi_parse_end_of_tokens(&tp)) { - const union tgsi_full_token *tok = &tp.FullToken; - - tgsi_parse_token(&tp); - switch (tok->Token.Type) { - case TGSI_TOKEN_TYPE_IMMEDIATE: - { - const struct tgsi_full_immediate *imm = - &tp.FullToken.FullImmediate; - - ctor_immd_4f32(pc, imm->u[0].Float, - imm->u[1].Float, - imm->u[2].Float, - imm->u[3].Float); - } - break; - case TGSI_TOKEN_TYPE_DECLARATION: - { - const struct tgsi_full_declaration *d; - unsigned si, last, first, mode; - - d = &tp.FullToken.FullDeclaration; - first = 
d->Range.First; - last = d->Range.Last; - - switch (d->Declaration.File) { - case TGSI_FILE_TEMPORARY: - break; - case TGSI_FILE_OUTPUT: - if (!d->Declaration.Semantic || - p->type == PIPE_SHADER_FRAGMENT) - break; - - si = d->Semantic.Index; - switch (d->Semantic.Name) { - case TGSI_SEMANTIC_BCOLOR: - p->cfg.two_side[si].hw = first; - if (p->cfg.out_nr > first) - p->cfg.out_nr = first; - break; - case TGSI_SEMANTIC_PSIZE: - p->cfg.psiz = first; - if (p->cfg.out_nr > first) - p->cfg.out_nr = first; - break; - case TGSI_SEMANTIC_EDGEFLAG: - pc->edgeflag_out = first; - break; - /* - case TGSI_SEMANTIC_CLIP_DISTANCE: - p->cfg.clpd = MIN2(p->cfg.clpd, first); - break; - */ - default: - break; - } - break; - case TGSI_FILE_INPUT: - { - if (p->type != PIPE_SHADER_FRAGMENT) - break; - - switch (d->Declaration.Interpolate) { - case TGSI_INTERPOLATE_CONSTANT: - mode = INTERP_FLAT; - flat_nr++; - break; - case TGSI_INTERPOLATE_PERSPECTIVE: - mode = INTERP_PERSPECTIVE; - p->cfg.regs[1] |= 0x08 << 24; - break; - default: - mode = INTERP_LINEAR; - break; - } - if (d->Declaration.Centroid) - mode |= INTERP_CENTROID; - - assert(last < 32); - for (i = first; i <= last; i++) - pc->interp_mode[i] = mode; - } - break; - case TGSI_FILE_SYSTEM_VALUE: - assert(d->Declaration.Semantic); - switch (d->Semantic.Name) { - case TGSI_SEMANTIC_FACE: - assert(p->type == PIPE_SHADER_FRAGMENT); - load_frontfacing(pc, - &pc->sysval[first]); - break; - case TGSI_SEMANTIC_INSTANCEID: - assert(p->type == PIPE_SHADER_VERTEX); - instance_id = first; - p->cfg.regs[0] |= (1 << 4); - break; - case TGSI_SEMANTIC_PRIMID: - assert(p->type != PIPE_SHADER_VERTEX); - p->cfg.prim_id = first; - break; - /* - case TGSI_SEMANTIC_PRIMIDIN: - assert(p->type == PIPE_SHADER_GEOMETRY); - pc->sysval[first].hw = 6; - p->cfg.regs[0] |= (1 << 8); - break; - case TGSI_SEMANTIC_VERTEXID: - assert(p->type == PIPE_SHADER_VERTEX); - vertex_id = first; - p->cfg.regs[0] |= (1 << 12) | (1 << 0); - break; - */ - } - break; - case 
TGSI_FILE_ADDRESS: - case TGSI_FILE_CONSTANT: - case TGSI_FILE_SAMPLER: - break; - default: - NOUVEAU_ERR("bad decl file %d\n", - d->Declaration.File); - goto out_err; - } - } - break; - case TGSI_TOKEN_TYPE_INSTRUCTION: - pc->insn_nr++; - prep_inspect_insn(pc, &tok->FullInstruction); - break; - default: - break; - } - } - - if (p->type == PIPE_SHADER_VERTEX || p->type == PIPE_SHADER_GEOMETRY) { - int rid = 0; - - if (p->type == PIPE_SHADER_GEOMETRY) { - for (i = 0; i < pc->attr_nr; ++i) { - p->cfg.in[i].hw = rid; - p->cfg.in[i].id = i; - - for (c = 0; c < 4; ++c) { - int n = i * 4 + c; - if (!pc->attr[n].acc) - continue; - pc->attr[n].hw = rid++; - p->cfg.in[i].mask |= 1 << c; - } - } - } else { - for (i = 0; i < pc->attr_nr * 4; ++i) { - if (pc->attr[i].acc) { - pc->attr[i].hw = rid++; - p->cfg.attr[i / 32] |= 1 << (i % 32); - } - } - if (p->cfg.regs[0] & (1 << 0)) - pc->sysval[vertex_id].hw = rid++; - if (p->cfg.regs[0] & (1 << 4)) { - pc->sysval[instance_id].hw = rid++; - load_instance_id(pc, instance_id); - } - } - - for (i = 0, rid = 0; i < pc->result_nr; ++i) { - p->cfg.out[i].hw = rid; - p->cfg.out[i].id = i; - - for (c = 0; c < 4; ++c) { - int n = i * 4 + c; - if (!pc->result[n].acc) - continue; - pc->result[n].hw = rid++; - p->cfg.out[i].mask |= 1 << c; - } - } - if (p->cfg.prim_id < 0x40) { - /* GP has to write to PrimitiveID */ - ctor_reg(&pc->sysval[p->cfg.prim_id], - P_RESULT, p->cfg.prim_id, rid); - p->cfg.prim_id = rid++; - } - - for (c = 0; c < 2; ++c) - if (p->cfg.two_side[c].hw < 0x40) - p->cfg.two_side[c] = p->cfg.out[ - p->cfg.two_side[c].hw]; - - if (p->cfg.psiz < 0x40) - p->cfg.psiz = p->cfg.out[p->cfg.psiz].hw; - - copy_semantic_info(p); - } else - if (p->type == PIPE_SHADER_FRAGMENT) { - int rid = 0, aid; - unsigned n = 0, m = pc->attr_nr - flat_nr; - - pc->allow32 = TRUE; - - /* do we read FragCoord ? 
*/ - if (pc->attr_nr && - p->info.input_semantic_name[0] == TGSI_SEMANTIC_POSITION) { - /* select FCRD components we want accessible */ - for (c = 0; c < 4; ++c) - if (pc->attr[c].acc) - p->cfg.regs[1] |= 1 << (24 + c); - aid = 0; - } else /* offset by 1 if FCRD.w is needed for pinterp */ - aid = popcnt4(p->cfg.regs[1] >> 24); - - /* non-flat interpolants have to be mapped to - * the lower hardware IDs, so sort them: - */ - for (i = 0; i < pc->attr_nr; i++) { - if (pc->interp_mode[i] == INTERP_FLAT) - p->cfg.in[m++].id = i; - else { - if (!(pc->interp_mode[i] & INTERP_PERSPECTIVE)) - p->cfg.in[n].linear = TRUE; - p->cfg.in[n++].id = i; - } - } - copy_semantic_info(p); - - for (n = 0; n < pc->attr_nr; ++n) { - p->cfg.in[n].hw = rid = aid; - i = p->cfg.in[n].id; - - if (p->info.input_semantic_name[i] == - TGSI_SEMANTIC_FACE) { - load_frontfacing(pc, &pc->attr[i * 4]); - continue; - } - - for (c = 0; c < 4; ++c) { - if (!pc->attr[i * 4 + c].acc) - continue; - pc->attr[i * 4 + c].rhw = rid++; - p->cfg.in[n].mask |= 1 << c; - - load_interpolant(pc, &pc->attr[i * 4 + c]); - } - aid += popcnt4(p->cfg.in[n].mask); - } - - m = popcnt4(p->cfg.regs[1] >> 24); - - /* set count of non-position inputs and of non-flat - * non-position inputs for FP_INTERPOLANT_CTRL - */ - p->cfg.regs[1] |= aid - m; - - if (flat_nr) { - i = p->cfg.in[pc->attr_nr - flat_nr].hw; - p->cfg.regs[1] |= (i - m) << 16; - } else - p->cfg.regs[1] |= p->cfg.regs[1] << 16; - - /* mark color semantic for light-twoside */ - n = 0x80; - for (i = 0; i < p->cfg.in_nr; i++) { - if (p->cfg.in[i].sn == TGSI_SEMANTIC_COLOR) { - n = MIN2(n, p->cfg.in[i].hw - m); - p->cfg.two_side[p->cfg.in[i].si] = p->cfg.in[i]; - - p->cfg.regs[0] += /* increase colour count */ - popcnt4(p->cfg.in[i].mask) << 16; - } - } - if (n < 0x80) - p->cfg.regs[0] += n; - - if (p->cfg.prim_id < 0x40) { - pc->sysval[p->cfg.prim_id].rhw = rid++; - emit_interp(pc, &pc->sysval[p->cfg.prim_id], NULL, - INTERP_FLAT); - /* increase 
FP_INTERPOLANT_CTRL_COUNT */ - p->cfg.regs[1] += 1; - } - - /* Initialize FP results: - * FragDepth is always first TGSI and last hw output - */ - i = p->info.writes_z ? 4 : 0; - for (rid = 0; i < pc->result_nr * 4; i++) - pc->result[i].rhw = rid++; - if (p->info.writes_z) - pc->result[2].rhw = rid++; - - p->cfg.high_result = rid; - - /* separate/different colour results for MRTs ? */ - if (pc->result_nr - (p->info.writes_z ? 1 : 0) > 1) - p->cfg.regs[2] |= 1; - } - - if (pc->immd_nr) { - int rid = 0; - - pc->immd = MALLOC(pc->immd_nr * 4 * sizeof(struct nv50_reg)); - if (!pc->immd) - goto out_err; - - for (i = 0; i < pc->immd_nr; i++) { - for (c = 0; c < 4; c++, rid++) - ctor_reg(&pc->immd[rid], P_IMMD, i, rid); - } - } - - ret = TRUE; -out_err: - if (pc->iv_p) - free_temp(pc, pc->iv_p); - if (pc->iv_c) - free_temp(pc, pc->iv_c); - - tgsi_parse_free(&tp); - return ret; -} - -static void -free_nv50_pc(struct nv50_pc *pc) -{ - if (pc->immd) - FREE(pc->immd); - if (pc->param) - FREE(pc->param); - if (pc->result) - FREE(pc->result); - if (pc->attr) - FREE(pc->attr); - if (pc->temp) - FREE(pc->temp); - if (pc->sysval) - FREE(pc->sysval); - if (pc->insn_pos) - FREE(pc->insn_pos); - - FREE(pc); -} - -static INLINE uint32_t -nv50_map_gs_output_prim(unsigned pprim) -{ - switch (pprim) { - case PIPE_PRIM_POINTS: - return NV50TCL_GP_OUTPUT_PRIMITIVE_TYPE_POINTS; - case PIPE_PRIM_LINE_STRIP: - return NV50TCL_GP_OUTPUT_PRIMITIVE_TYPE_LINE_STRIP; - case PIPE_PRIM_TRIANGLE_STRIP: - return NV50TCL_GP_OUTPUT_PRIMITIVE_TYPE_TRIANGLE_STRIP; - default: - NOUVEAU_ERR("invalid GS_OUTPUT_PRIMITIVE: %u\n", pprim); - abort(); - return 0; - } -} - -static boolean -ctor_nv50_pc(struct nv50_pc *pc, struct nv50_program *p) -{ - int i, c; - unsigned rtype[2] = { P_ATTR, P_RESULT }; - - pc->p = p; - pc->temp_nr = p->info.file_max[TGSI_FILE_TEMPORARY] + 1; - pc->attr_nr = p->info.file_max[TGSI_FILE_INPUT] + 1; - pc->result_nr = p->info.file_max[TGSI_FILE_OUTPUT] + 1; - pc->param_nr = 
p->info.file_max[TGSI_FILE_CONSTANT] + 1; - pc->addr_nr = p->info.file_max[TGSI_FILE_ADDRESS] + 1; - assert(pc->addr_nr <= 2); - pc->sysval_nr = p->info.file_max[TGSI_FILE_SYSTEM_VALUE] + 1; - - p->cfg.high_temp = 4; - - p->cfg.two_side[0].hw = 0x40; - p->cfg.two_side[1].hw = 0x40; - p->cfg.prim_id = 0x40; - - p->cfg.edgeflag_in = pc->edgeflag_out = 0xff; - - for (i = 0; i < p->info.num_properties; ++i) { - unsigned *data = &p->info.properties[i].data[0]; - - switch (p->info.properties[i].name) { - case TGSI_PROPERTY_GS_OUTPUT_PRIM: - p->cfg.prim_type = nv50_map_gs_output_prim(data[0]); - break; - case TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES: - p->cfg.vert_count = data[0]; - break; - default: - break; - } - } - - switch (p->type) { - case PIPE_SHADER_VERTEX: - p->cfg.psiz = 0x40; - p->cfg.clpd = 0x40; - p->cfg.out_nr = pc->result_nr; - break; - case PIPE_SHADER_GEOMETRY: - assert(p->cfg.prim_type); - assert(p->cfg.vert_count); - - p->cfg.psiz = 0x80; - p->cfg.clpd = 0x80; - p->cfg.prim_id = 0x80; - p->cfg.out_nr = pc->result_nr; - p->cfg.in_nr = pc->attr_nr; - - p->cfg.two_side[0].hw = 0x80; - p->cfg.two_side[1].hw = 0x80; - break; - case PIPE_SHADER_FRAGMENT: - rtype[0] = rtype[1] = P_TEMP; - - p->cfg.regs[0] = 0x01000004; - p->cfg.in_nr = pc->attr_nr; - - if (p->info.writes_z) { - p->cfg.regs[2] |= 0x00000100; - p->cfg.regs[3] |= 0x00000011; - } - if (p->info.uses_kill) - p->cfg.regs[2] |= 0x00100000; - break; - } - - if (pc->temp_nr) { - pc->temp = MALLOC(pc->temp_nr * 4 * sizeof(struct nv50_reg)); - if (!pc->temp) - return FALSE; - - for (i = 0; i < pc->temp_nr * 4; ++i) - ctor_reg(&pc->temp[i], P_TEMP, i / 4, -1); - } - - if (pc->attr_nr) { - pc->attr = MALLOC(pc->attr_nr * 4 * sizeof(struct nv50_reg)); - if (!pc->attr) - return FALSE; - - for (i = 0; i < pc->attr_nr * 4; ++i) - ctor_reg(&pc->attr[i], rtype[0], i / 4, -1); - } - - if (pc->result_nr) { - unsigned nr = pc->result_nr * 4; - - pc->result = MALLOC(nr * sizeof(struct nv50_reg)); - if (!pc->result) - 
return FALSE; - - for (i = 0; i < nr; ++i) - ctor_reg(&pc->result[i], rtype[1], i / 4, -1); - } - - if (pc->param_nr) { - int rid = 0; - - pc->param = MALLOC(pc->param_nr * 4 * sizeof(struct nv50_reg)); - if (!pc->param) - return FALSE; - - for (i = 0; i < pc->param_nr; ++i) - for (c = 0; c < 4; ++c, ++rid) - ctor_reg(&pc->param[rid], P_CONST, i, rid); - } - - if (pc->addr_nr) { - pc->addr = CALLOC(pc->addr_nr * 4, sizeof(struct nv50_reg *)); - if (!pc->addr) - return FALSE; - } - for (i = 0; i < NV50_SU_MAX_ADDR; ++i) - ctor_reg(&pc->r_addr[i], P_ADDR, -1, i + 1); - - if (pc->sysval_nr) { - pc->sysval = CALLOC(pc->sysval_nr, sizeof(struct nv50_reg *)); - if (!pc->sysval) - return FALSE; - /* will only ever use SYSTEM_VALUE[i].x (hopefully) */ - for (i = 0; i < pc->sysval_nr; ++i) - ctor_reg(&pc->sysval[i], rtype[0], i, -1); - } - - return TRUE; -} - -static void -nv50_program_fixup_insns(struct nv50_pc *pc) -{ - struct nv50_program_exec *e, **bra_list; - unsigned i, n, pos; - - bra_list = CALLOC(pc->p->exec_size, sizeof(struct nv50_program_exec *)); - - /* Collect branch instructions, we need to adjust their offsets - * when converting 32 bit instructions to 64 bit ones - */ - for (n = 0, e = pc->p->exec_head; e; e = e->next) - if (e->param.index >= 0 && !e->param.mask) - bra_list[n++] = e; - - /* Make sure we don't have any single 32 bit instructions. */ - for (e = pc->p->exec_head, pos = 0; e; e = e->next) { - pos += is_long(e) ? 
2 : 1; - - if ((pos & 1) && (!e->next || is_long(e->next))) { - for (i = 0; i < n; ++i) - if (bra_list[i]->param.index >= pos) - bra_list[i]->param.index += 1; - for (i = 0; i < pc->insn_nr; ++i) - if (pc->insn_pos[i] >= pos) - pc->insn_pos[i] += 1; - convert_to_long(pc, e); - ++pos; - } - } - - FREE(bra_list); - - if (!pc->p->info.opcode_count[TGSI_OPCODE_CAL]) - return; - - /* fill in CALL offsets */ - for (e = pc->p->exec_head; e; e = e->next) { - if ((e->inst[0] & 2) && (e->inst[0] >> 28) == 0x2) - e->param.index = pc->insn_pos[e->param.index]; - } -} - -static boolean -nv50_program_tx(struct nv50_program *p) -{ - struct tgsi_parse_context parse; - struct nv50_pc *pc; - boolean ret; - - pc = CALLOC_STRUCT(nv50_pc); - if (!pc) - return FALSE; - - ret = ctor_nv50_pc(pc, p); - if (ret == FALSE) - goto out_cleanup; - - ret = nv50_program_tx_prep(pc); - if (ret == FALSE) - goto out_cleanup; - - pc->insn_pos = MALLOC(pc->insn_nr * sizeof(unsigned)); - - tgsi_parse_init(&parse, pc->p->pipe.tokens); - while (!tgsi_parse_end_of_tokens(&parse)) { - const union tgsi_full_token *tok = &parse.FullToken; - - /* previously allow32 was FALSE for first & last instruction */ - pc->allow32 = TRUE; - - tgsi_parse_token(&parse); - - switch (tok->Token.Type) { - case TGSI_TOKEN_TYPE_INSTRUCTION: - pc->insn_pos[pc->insn_cur] = pc->p->exec_size; - ++pc->insn_cur; - ret = nv50_tgsi_insn(pc, tok); - if (ret == FALSE) - goto out_err; - break; - default: - break; - } - } - - nv50_program_fixup_insns(pc); - - p->param_nr = pc->param_nr * 4; - p->immd_nr = pc->immd_nr * 4; - p->immd = pc->immd_buf; - -out_err: - tgsi_parse_free(&parse); - -out_cleanup: - free_nv50_pc(pc); - return ret; -} - -static void -nv50_program_validate(struct nv50_context *nv50, struct nv50_program *p) -{ - if (nv50_program_tx(p) == FALSE) - assert(0); - p->translated = TRUE; -} - -static void -nv50_program_upload_data(struct nv50_context *nv50, uint32_t *map, - unsigned start, unsigned count, unsigned cbuf) -{ - 
struct nouveau_channel *chan = nv50->screen->base.channel; - struct nouveau_grobj *tesla = nv50->screen->tesla; - - while (count) { - unsigned nr = count > 2047 ? 2047 : count; - - BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1); - OUT_RING (chan, (cbuf << 0) | (start << 8)); - BEGIN_RING(chan, tesla, NV50TCL_CB_DATA(0) | 0x40000000, nr); - OUT_RINGp (chan, map, nr); - - map += nr; - start += nr; - count -= nr; - } -} - -static void -nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p) -{ - struct pipe_context *pipe = &nv50->pipe; - struct pipe_transfer *transfer; - - if (!p->data[0] && p->immd_nr) { - struct nouveau_resource *heap = nv50->screen->immd_heap; - - if (nouveau_resource_alloc(heap, p->immd_nr, p, &p->data[0])) { - while (heap->next && heap->size < p->immd_nr) { - struct nv50_program *evict = heap->next->priv; - nouveau_resource_free(&evict->data[0]); - } - - if (nouveau_resource_alloc(heap, p->immd_nr, p, - &p->data[0])) - assert(0); - } - - /* immediates only need to be uploaded again when freed */ - nv50_program_upload_data(nv50, p->immd, p->data[0]->start, - p->immd_nr, NV50_CB_PMISC); - } - - assert(p->param_nr <= 16384); - - if (p->param_nr) { - unsigned cb; - uint32_t *map = pipe_buffer_map(pipe, - nv50->constbuf[p->type], - PIPE_TRANSFER_READ, - &transfer); - switch (p->type) { - case PIPE_SHADER_GEOMETRY: cb = NV50_CB_PGP; break; - case PIPE_SHADER_FRAGMENT: cb = NV50_CB_PFP; break; - default: - cb = NV50_CB_PVP; - assert(p->type == PIPE_SHADER_VERTEX); - break; - } - - nv50_program_upload_data(nv50, map, 0, p->param_nr, cb); - pipe_buffer_unmap(pipe, nv50->constbuf[p->type], - transfer); - } -} - -static void -nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p) -{ - struct nouveau_channel *chan = nv50->screen->base.channel; - struct nouveau_grobj *tesla = nv50->screen->tesla; - struct nv50_program_exec *e; - uint32_t *up, i; - boolean upload = FALSE; - unsigned offset; - int width; - - if (!p->bo) 
{ - nouveau_bo_new(chan->device, NOUVEAU_BO_VRAM, 0x100, - p->exec_size * 4, &p->bo); - upload = TRUE; - } - - if (p->data[0] && p->data[0]->start != p->data_start[0]) - upload = TRUE; - - if (!upload) - return; - - up = MALLOC(p->exec_size * 4); - - for (i = 0, e = p->exec_head; e; e = e->next) { - unsigned ei, ci, bs; - - if (e->param.index >= 0 && e->param.mask) { - bs = (e->inst[1] >> 22) & 0x07; - assert(bs < 2); - ei = e->param.shift >> 5; - ci = e->param.index; - if (bs == 0) - ci += p->data[bs]->start; - - e->inst[ei] &= ~e->param.mask; - e->inst[ei] |= (ci << e->param.shift); - } else - if (e->param.index >= 0) { - /* zero mask means param is a jump/branch offset */ - assert(!(e->param.index & 1)); - /* seem to be 8 byte steps */ - ei = (e->param.index >> 1) + 0 /* START_ID */; - - e->inst[0] &= 0xf0000fff; - e->inst[0] |= ei << 12; - } - - up[i++] = e->inst[0]; - if (is_long(e)) - up[i++] = e->inst[1]; - } - assert(i == p->exec_size); - - if (p->data[0]) - p->data_start[0] = p->data[0]->start; - -#ifdef NV50_PROGRAM_DUMP - NOUVEAU_ERR("-------\n"); - for (e = p->exec_head; e; e = e->next) { - NOUVEAU_ERR("0x%08x\n", e->inst[0]); - if (is_long(e)) - NOUVEAU_ERR("0x%08x\n", e->inst[1]); - } -#endif - - /* SIFC_HEIGHT/SIFC_WIDTH of 65536 do not work, and are not reported - * as data error either. hw bug ? 
*/ -#define SIFC_MAX_WIDTH (65536 - 256) - offset = 0; - width = p->exec_size * 4; - while (width > 0) { - nv50_upload_sifc(nv50, p->bo, offset, NOUVEAU_BO_VRAM, - NV50_2D_DST_FORMAT_R8_UNORM, 65536, 1, 262144, - &up[offset / 4], NV50_2D_SIFC_FORMAT_R8_UNORM, - 0, 0, 0, MIN2(SIFC_MAX_WIDTH, width), 1, 1); - width -= SIFC_MAX_WIDTH; - offset += SIFC_MAX_WIDTH; - } - BEGIN_RING(chan, tesla, NV50TCL_CODE_CB_FLUSH, 1); - OUT_RING (chan, 0); - - FREE(up); -} - -struct nouveau_stateobj * -nv50_vertprog_validate(struct nv50_context *nv50) -{ - struct nouveau_grobj *tesla = nv50->screen->tesla; - struct nv50_program *p = nv50->vertprog; - struct nouveau_stateobj *so; - - if (!p->translated) { - nv50_program_validate(nv50, p); - if (!p->translated) - assert(0); - } - - nv50_program_validate_data(nv50, p); - nv50_program_validate_code(nv50, p); - - if (!(nv50->dirty & NV50_NEW_VERTPROG)) - return NULL; - - so = so_new(5, 7, 2); - so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2); - so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | - NOUVEAU_BO_HIGH, 0, 0); - so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | - NOUVEAU_BO_LOW, 0, 0); - so_method(so, tesla, NV50TCL_VP_ATTR_EN_0, 2); - so_data (so, p->cfg.attr[0]); - so_data (so, p->cfg.attr[1]); - so_method(so, tesla, NV50TCL_VP_REG_ALLOC_RESULT, 1); - so_data (so, p->cfg.high_result); - so_method(so, tesla, NV50TCL_VP_REG_ALLOC_TEMP, 1); - so_data (so, p->cfg.high_temp); - so_method(so, tesla, NV50TCL_VP_START_ID, 1); - so_data (so, 0); /* program start offset */ - return so; -} - -struct nouveau_stateobj * -nv50_fragprog_validate(struct nv50_context *nv50) -{ - struct nouveau_grobj *tesla = nv50->screen->tesla; - struct nv50_program *p = nv50->fragprog; - struct nouveau_stateobj *so; - - if (!p->translated) { - nv50_program_validate(nv50, p); - if (!p->translated) - assert(0); - } - - nv50_program_validate_data(nv50, p); - nv50_program_validate_code(nv50, p); - - if (!(nv50->dirty & NV50_NEW_FRAGPROG)) - return 
NULL; - - so = so_new(6, 7, 2); - so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2); - so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | - NOUVEAU_BO_HIGH, 0, 0); - so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | - NOUVEAU_BO_LOW, 0, 0); - so_method(so, tesla, NV50TCL_FP_REG_ALLOC_TEMP, 1); - so_data (so, p->cfg.high_temp); - so_method(so, tesla, NV50TCL_FP_RESULT_COUNT, 1); - so_data (so, p->cfg.high_result); - so_method(so, tesla, NV50TCL_FP_CONTROL, 1); - so_data (so, p->cfg.regs[2]); - so_method(so, tesla, NV50TCL_FP_CTRL_UNK196C, 1); - so_data (so, p->cfg.regs[3]); - so_method(so, tesla, NV50TCL_FP_START_ID, 1); - so_data (so, 0); /* program start offset */ - return so; -} - -struct nouveau_stateobj * -nv50_geomprog_validate(struct nv50_context *nv50) -{ - struct nouveau_grobj *tesla = nv50->screen->tesla; - struct nv50_program *p = nv50->geomprog; - struct nouveau_stateobj *so; - - if (!p->translated) { - nv50_program_validate(nv50, p); - if (!p->translated) - assert(0); - } - - nv50_program_validate_data(nv50, p); - nv50_program_validate_code(nv50, p); - - if (!(nv50->dirty & NV50_NEW_GEOMPROG)) - return NULL; - - so = so_new(6, 7, 2); - so_method(so, tesla, NV50TCL_GP_ADDRESS_HIGH, 2); - so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | - NOUVEAU_BO_HIGH, 0, 0); - so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | - NOUVEAU_BO_LOW, 0, 0); - so_method(so, tesla, NV50TCL_GP_REG_ALLOC_TEMP, 1); - so_data (so, p->cfg.high_temp); - so_method(so, tesla, NV50TCL_GP_REG_ALLOC_RESULT, 1); - so_data (so, p->cfg.high_result); - so_method(so, tesla, NV50TCL_GP_OUTPUT_PRIMITIVE_TYPE, 1); - so_data (so, p->cfg.prim_type); - so_method(so, tesla, NV50TCL_GP_VERTEX_OUTPUT_COUNT, 1); - so_data (so, p->cfg.vert_count); - so_method(so, tesla, NV50TCL_GP_START_ID, 1); - so_data (so, 0); - return so; -} - -static uint32_t -nv50_pntc_replace(struct nv50_context *nv50, uint32_t pntc[8], unsigned base) -{ - struct nv50_program *vp; - struct 
nv50_program *fp = nv50->fragprog; - unsigned i, c, m = base; - uint32_t origin = 0x00000010; - - vp = nv50->geomprog ? nv50->geomprog : nv50->vertprog; - - /* XXX: this might not work correctly in all cases yet - we'll - * just assume that an FP generic input that is not written in - * the VP is PointCoord. - */ - memset(pntc, 0, 8 * sizeof(uint32_t)); - - for (i = 0; i < fp->cfg.in_nr; i++) { - unsigned j, n = popcnt4(fp->cfg.in[i].mask); - - if (fp->cfg.in[i].sn != TGSI_SEMANTIC_GENERIC) { - m += n; - continue; - } - - for (j = 0; j < vp->cfg.out_nr; ++j) - if (vp->cfg.out[j].sn == fp->cfg.in[i].sn && - vp->cfg.out[j].si == fp->cfg.in[i].si) - break; - - if (j < vp->info.num_outputs) { - ubyte enable = - (nv50->rasterizer->pipe.sprite_coord_enable >> vp->cfg.out[j].si) & 1; - - if (enable == 0) { - m += n; - continue; - } - } - - /* this is either PointCoord or replaced by sprite coords */ - for (c = 0; c < 4; c++) { - if (!(fp->cfg.in[i].mask & (1 << c))) - continue; - pntc[m / 8] |= (c + 1) << ((m % 8) * 4); - ++m; - } - } - return (nv50->rasterizer->pipe.sprite_coord_mode == PIPE_SPRITE_COORD_LOWER_LEFT ? 0 : origin); +nv50_fragprog_prepare(struct nv50_translation_info *ti) +{ + struct nv50_program *p = ti->p; + int i, j, c; + unsigned nvary, nintp, depr; + unsigned n = 0, m = 0, skip = 0; + ubyte sn[16], si[16]; + + /* FP flags */ + + if (ti->scan.writes_z) { + p->fp.flags[1] = 0x11; + p->fp.flags[0] |= NV50TCL_FP_CONTROL_EXPORTS_Z; + } + + if (ti->scan.uses_kill) + p->fp.flags[0] |= NV50TCL_FP_CONTROL_USES_KIL; + + /* FP inputs */ + + ti->input_file = NV_FILE_MEM_V; + ti->output_file = NV_FILE_GPR; + + /* count non-flat inputs, save semantic info */ + for (i = 0; i < p->in_nr; ++i) { + m += (ti->interp_mode[i] & NV50_INTERP_FLAT) ? 
0 : 1; + sn[i] = p->in[i].sn; + si[i] = p->in[i].si; + } + + /* reorder p->in[] so that non-flat inputs are first and + * kick out special inputs that don't use VP/GP_RESULT_MAP + */ + nintp = 0; + for (i = 0; i < p->in_nr; ++i) { + if (sn[i] == TGSI_SEMANTIC_POSITION) { + for (c = 0; c < 4; ++c) { + ti->input_map[i][c] = nintp; + if (ti->input_access[i][c]) { + p->fp.interp |= 1 << (24 + c); + ++nintp; + } + } + skip++; + continue; + } else + if (sn[i] == TGSI_SEMANTIC_FACE) { + ti->input_map[i][0] = 255; + skip++; + continue; + } + + j = (ti->interp_mode[i] & NV50_INTERP_FLAT) ? m++ : n++; + + if (sn[i] == TGSI_SEMANTIC_COLOR) + p->vp.bfc[si[i]] = j; + + p->in[j].linear = (ti->interp_mode[i] & NV50_INTERP_LINEAR) ? 1 : 0; + p->in[j].id = i; + p->in[j].sn = sn[i]; + p->in[j].si = si[i]; + } + assert(n <= m); + p->in_nr -= skip; + + if (!(p->fp.interp & (8 << 24))) { + p->fp.interp |= (8 << 24); + ++nintp; + } + + p->fp.colors = (1 << 24) | 4; /* CLAMP, FFC0_ID = 4 */ + + for (i = 0; i < p->in_nr; ++i) { + int j = p->in[i].id; + p->in[i].hw = nintp; + + for (c = 0; c < 4; ++c) { + if (!ti->input_access[j][c]) + continue; + p->in[i].mask |= 1 << c; + ti->input_map[j][c] = nintp++; + } + /* count color inputs */ + if (i == p->vp.bfc[0] || i == p->vp.bfc[1]) + p->fp.colors += bitcount4(p->in[i].mask) << 16; + } + nintp -= bitcount4(p->fp.interp >> 24); /* subtract position inputs */ + nvary = nintp; + if (n < m) + nvary -= p->in[n].hw; + + p->fp.interp |= nvary << NV50TCL_FP_INTERPOLANT_CTRL_COUNT_NONFLAT_SHIFT; + p->fp.interp |= nintp << NV50TCL_FP_INTERPOLANT_CTRL_COUNT_SHIFT; + + /* FP outputs */ + + if (p->out_nr > (1 + (ti->scan.writes_z ? 
1 : 0))) + p->fp.flags[0] |= NV50TCL_FP_CONTROL_MULTIPLE_RESULTS; + + depr = p->out_nr; + for (i = 0; i < p->out_nr; ++i) { + p->out[i].id = i; + if (p->out[i].sn == TGSI_SEMANTIC_POSITION) { + depr = i; + continue; + } + p->out[i].hw = p->max_out; + p->out[i].mask = 0xf; + + for (c = 0; c < 4; ++c) + ti->output_map[i][c] = p->max_out++; + } + if (depr < p->out_nr) { + p->out[depr].mask = 0x4; + p->out[depr].hw = p->max_out++; + } + + return 0; } static int -nv50_vec4_map(uint32_t *map32, int mid, uint8_t zval, uint32_t lin[4], - struct nv50_sreg4 *fpi, struct nv50_sreg4 *vpo) +nv50_geomprog_prepare(struct nv50_translation_info *ti) { - int c; - uint8_t mv = vpo->mask, mf = fpi->mask, oid = vpo->hw; - uint8_t *map = (uint8_t *)map32; - - for (c = 0; c < 4; ++c) { - if (mf & 1) { - if (fpi->linear == TRUE) - lin[mid / 32] |= 1 << (mid % 32); - if (mv & 1) - map[mid] = oid; - else - map[mid] = (c == 3) ? (zval + 1) : zval; - ++mid; - } - - oid += mv & 1; - mf >>= 1; - mv >>= 1; - } + ti->input_file = NV_FILE_MEM_S; + ti->output_file = NV_FILE_OUT; - return mid; -} - -struct nouveau_stateobj * -nv50_fp_linkage_validate(struct nv50_context *nv50) -{ - struct nouveau_grobj *tesla = nv50->screen->tesla; - struct nv50_program *vp = nv50->vertprog; - struct nv50_program *fp = nv50->fragprog; - struct nouveau_stateobj *so; - struct nv50_sreg4 dummy; - int i, n, c, m = 0; - uint32_t map[16], lin[4], reg[6], pcrd[8]; - uint8_t zval = 0x40; - - if (nv50->geomprog) { - vp = nv50->geomprog; - zval = 0x80; - } - memset(map, 0, sizeof(map)); - memset(lin, 0, sizeof(lin)); - - reg[1] = 0x00000004; /* low and high clip distance map ids */ - reg[2] = 0x00000000; /* layer index map id (disabled, GP only) */ - reg[3] = 0x00000000; /* point size map id & enable */ - reg[5] = 0x00000000; /* primitive ID map slot */ - reg[0] = fp->cfg.regs[0]; /* colour semantic reg */ - reg[4] = fp->cfg.regs[1]; /* interpolant info */ - - dummy.linear = FALSE; - dummy.mask = 0xf; /* map all components of 
HPOS */ - m = nv50_vec4_map(map, m, zval, lin, &dummy, &vp->cfg.out[0]); - - dummy.mask = 0x0; - - if (vp->cfg.clpd < 0x40) { - for (c = 0; c < vp->cfg.clpd_nr; ++c) { - map[m / 4] |= (vp->cfg.clpd + c) << ((m % 4) * 8); - ++m; - } - reg[1] = (m << 8); - } - - reg[0] |= m << 8; /* adjust BFC0 id */ - - /* if light_twoside is active, it seems FFC0_ID == BFC0_ID is bad */ - if (nv50->rasterizer->pipe.light_twoside) { - struct nv50_sreg4 *vpo = &vp->cfg.two_side[0]; - struct nv50_sreg4 *fpi = &fp->cfg.two_side[0]; - - m = nv50_vec4_map(map, m, zval, lin, &fpi[0], &vpo[0]); - m = nv50_vec4_map(map, m, zval, lin, &fpi[1], &vpo[1]); - } - - reg[0] += m - 4; /* adjust FFC0 id */ - reg[4] |= m << 8; /* set mid where 'normal' FP inputs start */ - - for (i = 0; i < fp->cfg.in_nr; i++) { - /* maybe even remove these from cfg.io */ - if (fp->cfg.in[i].sn == TGSI_SEMANTIC_POSITION || - fp->cfg.in[i].sn == TGSI_SEMANTIC_FACE) - continue; - - for (n = 0; n < vp->cfg.out_nr; ++n) - if (vp->cfg.out[n].sn == fp->cfg.in[i].sn && - vp->cfg.out[n].si == fp->cfg.in[i].si) - break; - - m = nv50_vec4_map(map, m, zval, lin, &fp->cfg.in[i], - (n < vp->cfg.out_nr) ? 
- &vp->cfg.out[n] : &dummy); - } - /* PrimitiveID either is replaced by the system value, or - * written by the geometry shader into an output register - */ - if (fp->cfg.prim_id < 0x40) { - map[m / 4] |= vp->cfg.prim_id << ((m % 4) * 8); - reg[5] = m++; - } - - if (nv50->rasterizer->pipe.point_size_per_vertex) { - map[m / 4] |= vp->cfg.psiz << ((m % 4) * 8); - reg[3] = (m++ << 4) | 1; - } - - /* now fill the stateobj (at most 28 so_data) */ - so = so_new(10, 54, 0); - - n = (m + 3) / 4; - assert(m <= 64); - if (vp->type == PIPE_SHADER_GEOMETRY) { - so_method(so, tesla, NV50TCL_GP_RESULT_MAP_SIZE, 1); - so_data (so, m); - so_method(so, tesla, NV50TCL_GP_RESULT_MAP(0), n); - so_datap (so, map, n); - } else { - so_method(so, tesla, NV50TCL_VP_GP_BUILTIN_ATTR_EN, 1); - so_data (so, vp->cfg.regs[0]); - - so_method(so, tesla, NV50TCL_MAP_SEMANTIC_4, 1); - so_data (so, reg[5]); - - so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 1); - so_data (so, m); - so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), n); - so_datap (so, map, n); - } - - so_method(so, tesla, NV50TCL_MAP_SEMANTIC_0, 4); - so_datap (so, reg, 4); - - so_method(so, tesla, NV50TCL_FP_INTERPOLANT_CTRL, 1); - so_data (so, reg[4]); - - so_method(so, tesla, NV50TCL_NOPERSPECTIVE_BITMAP(0), 4); - so_datap (so, lin, 4); - - if (nv50->rasterizer->pipe.sprite_coord_enable) { - so_method(so, tesla, NV50TCL_POINT_SPRITE_CTRL, 1); - so_data (so, - nv50_pntc_replace(nv50, pcrd, (reg[4] >> 8) & 0xff)); - - so_method(so, tesla, NV50TCL_POINT_COORD_REPLACE_MAP(0), 8); - so_datap (so, pcrd, 8); - } - - so_method(so, tesla, NV50TCL_GP_ENABLE, 1); - so_data (so, (vp->type == PIPE_SHADER_GEOMETRY) ? 
1 : 0); - - return so; + assert(0); + return 1; } static int -construct_vp_gp_mapping(uint32_t *map32, int m, - struct nv50_program *vp, struct nv50_program *gp) -{ - uint8_t *map = (uint8_t *)map32; - int i, j, c; - - for (i = 0; i < gp->cfg.in_nr; ++i) { - uint8_t oid = 0, mv = 0, mg = gp->cfg.in[i].mask; - - for (j = 0; j < vp->cfg.out_nr; ++j) { - if (vp->cfg.out[j].sn == gp->cfg.in[i].sn && - vp->cfg.out[j].si == gp->cfg.in[i].si) { - mv = vp->cfg.out[j].mask; - oid = vp->cfg.out[j].hw; - break; - } - } - - for (c = 0; c < 4; ++c, mv >>= 1, mg >>= 1) { - if (mg & mv & 1) - map[m++] = oid; - else - if (mg & 1) - map[m++] = (c == 3) ? 0x41 : 0x40; - oid += mv & 1; - } - } - return m; -} - -struct nouveau_stateobj * -nv50_gp_linkage_validate(struct nv50_context *nv50) +nv50_prog_scan(struct nv50_translation_info *ti) +{ + struct nv50_program *p = ti->p; + struct tgsi_parse_context parse; + int ret; + + p->vp.psiz = 0x40; + p->vp.bfc[0] = 0x40; + p->vp.bfc[1] = 0x40; + p->gp.primid = 0x80; + + tgsi_scan_shader(p->pipe.tokens, &ti->scan); + + tgsi_parse_init(&parse, p->pipe.tokens); + while (!tgsi_parse_end_of_tokens(&parse)) { + tgsi_parse_token(&parse); + + switch (parse.FullToken.Token.Type) { + case TGSI_TOKEN_TYPE_IMMEDIATE: + prog_immediate(ti, &parse.FullToken.FullImmediate); + break; + case TGSI_TOKEN_TYPE_DECLARATION: + prog_decl(ti, &parse.FullToken.FullDeclaration); + break; + case TGSI_TOKEN_TYPE_INSTRUCTION: + prog_inst(ti, &parse.FullToken.FullInstruction, ++ti->inst_nr); + break; + } + } + + p->in_nr = ti->scan.file_max[TGSI_FILE_INPUT] + 1; + p->out_nr = ti->scan.file_max[TGSI_FILE_OUTPUT] + 1; + + switch (p->type) { + case PIPE_SHADER_VERTEX: + ret = nv50_vertprog_prepare(ti); + break; + case PIPE_SHADER_FRAGMENT: + ret = nv50_fragprog_prepare(ti); + break; + case PIPE_SHADER_GEOMETRY: + ret = nv50_geomprog_prepare(ti); + break; + default: + assert(!"unsupported program type"); + ret = -1; + break; + } + + assert(!ret); + return ret; +} + +boolean 
+nv50_program_tx(struct nv50_program *p) { - struct nouveau_grobj *tesla = nv50->screen->tesla; - struct nouveau_stateobj *so; - struct nv50_program *vp = nv50->vertprog; - struct nv50_program *gp = nv50->geomprog; - uint32_t map[16]; - int m = 0; + struct nv50_translation_info *ti; + int ret; - if (!gp) - return NULL; - memset(map, 0, sizeof(map)); + ti = CALLOC_STRUCT(nv50_translation_info); + ti->p = p; - m = construct_vp_gp_mapping(map, m, vp, gp); + ti->edgeflag_out = PIPE_MAX_SHADER_OUTPUTS; - so = so_new(3, 24 - 3, 0); + ret = nv50_prog_scan(ti); + if (ret) { + NOUVEAU_ERR("unsupported shader program\n"); + goto out; + } - so_method(so, tesla, NV50TCL_VP_GP_BUILTIN_ATTR_EN, 1); - so_data (so, vp->cfg.regs[0] | gp->cfg.regs[0]); + ret = nv50_generate_code(ti); + if (ret) { + NOUVEAU_ERR("error during shader translation\n"); + goto out; + } - assert(m <= 32); - so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 1); - so_data (so, m); - - m = (m + 3) / 4; - so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), m); - so_datap (so, map, m); - - return so; +out: + if (ti->immd32) + FREE(ti->immd32); + FREE(ti); + return ret ? 
FALSE : TRUE; } void nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p) { - while (p->exec_head) { - struct nv50_program_exec *e = p->exec_head; - - p->exec_head = e->next; - FREE(e); - } - p->exec_tail = NULL; - p->exec_size = 0; + nouveau_bo_ref(NULL, &p->bo); - nouveau_bo_ref(NULL, &p->bo); + so_ref(NULL, &p->so); - FREE(p->immd); - nouveau_resource_free(&p->data[0]); + if (p->code) + FREE(p->code); - p->translated = 0; + p->translated = FALSE; } diff --git a/src/gallium/drivers/nv50/nv50_program.h b/src/gallium/drivers/nv50/nv50_program.h index 1e3ad6bff0..654bce59f3 100644 --- a/src/gallium/drivers/nv50/nv50_program.h +++ b/src/gallium/drivers/nv50/nv50_program.h @@ -1,75 +1,116 @@ -#ifndef __NV50_PROGRAM_H__ -#define __NV50_PROGRAM_H__ +/* + * Copyright 2010 Ben Skeggs + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef __NV50_PROG_H__ +#define __NV50_PROG_H__ #include "pipe/p_state.h" #include "tgsi/tgsi_scan.h" +#include "nouveau/nouveau_class.h" -struct nv50_program_exec { - struct nv50_program_exec *next; +struct nv50_varying { + uint8_t id; /* tgsi index */ + uint8_t hw; /* hw index, nv50 wants flat FP inputs last */ - unsigned inst[2]; - struct { - int index; - unsigned mask; - unsigned shift; - } param; -}; - -struct nv50_sreg4 { - uint8_t hw; /* hw index, nv50 wants flat FP inputs last */ - uint8_t id; /* tgsi index */ - - uint8_t mask; - boolean linear; + uint8_t mask : 4; + uint8_t linear : 1; + uint8_t pad : 3; - ubyte sn, si; /* semantic name & index */ + ubyte sn; /* semantic name */ + ubyte si; /* semantic index */ }; struct nv50_program { - struct pipe_shader_state pipe; - struct tgsi_shader_info info; - boolean translated; - - unsigned type; - struct nv50_program_exec *exec_head; - struct nv50_program_exec *exec_tail; - unsigned exec_size; - struct nouveau_resource *data[1]; - unsigned data_start[1]; - - struct nouveau_bo *bo; - - uint32_t *immd; - unsigned immd_nr; - unsigned param_nr; - - struct { - unsigned high_temp; - unsigned high_result; - - uint32_t attr[2]; - uint32_t regs[4]; - - /* for VPs, io_nr doesn't count 'private' results (PSIZ etc.) 
*/ - unsigned in_nr, out_nr; - struct nv50_sreg4 in[PIPE_MAX_SHADER_INPUTS]; - struct nv50_sreg4 out[PIPE_MAX_SHADER_OUTPUTS]; - - /* FP colour inputs, VP/GP back colour outputs */ - struct nv50_sreg4 two_side[2]; - - /* GP only */ - unsigned vert_count; - uint8_t prim_type; - - /* VP & GP only */ - uint8_t clpd, clpd_nr; - uint8_t psiz; - uint8_t edgeflag_in; - - /* FP & GP only */ - uint8_t prim_id; - } cfg; + struct pipe_shader_state pipe; + + ubyte type; + boolean translated; + + struct nouveau_bo *bo; + struct nouveau_stateobj *so; + + uint32_t *code; + unsigned code_size; + unsigned code_start; /* offset inside bo */ + uint32_t *immd; + unsigned immd_size; + unsigned parm_size; /* size limit of uniform buffer */ + + ubyte max_gpr; /* REG_ALLOC_TEMP */ + ubyte max_out; /* REG_ALLOC_RESULT or FP_RESULT_COUNT */ + + ubyte in_nr; + ubyte out_nr; + struct nv50_varying in[16]; + struct nv50_varying out[16]; + + struct { + uint32_t attrs[3]; /* VP_ATTR_EN_0,1 and VP_GP_BUILTIN_ATTR_EN */ + ubyte psiz; + ubyte bfc[2]; + ubyte edgeflag; + ubyte clpd; + ubyte clpd_nr; + } vp; + + struct { + uint32_t flags[2]; /* 0x19a8, 196c */ + uint32_t interp; /* 0x1988 */ + uint32_t colors; /* 0x1904 */ + } fp; + + struct { + ubyte primid; /* primitive id output register */ + uint8_t vert_count; + uint8_t prim_type; /* point, line strip or tri strip */ + } gp; + + void *fixups; + unsigned num_fixups; }; -#endif +#define NV50_INTERP_LINEAR (1 << 0) +#define NV50_INTERP_FLAT (1 << 1) +#define NV50_INTERP_CENTROID (1 << 2) + +struct nv50_translation_info { + struct nv50_program *p; + unsigned inst_nr; + ubyte input_file; + ubyte output_file; + ubyte input_map[PIPE_MAX_SHADER_INPUTS][4]; + ubyte output_map[PIPE_MAX_SHADER_OUTPUTS][4]; + ubyte interp_mode[PIPE_MAX_SHADER_INPUTS]; + int input_access[PIPE_MAX_SHADER_INPUTS][4]; + int output_access[PIPE_MAX_SHADER_OUTPUTS][4]; + boolean indirect_inputs; + boolean indirect_outputs; + struct tgsi_shader_info scan; + uint32_t *immd32; + 
unsigned immd32_nr; + ubyte edgeflag_out; +}; + +int nv50_generate_code(struct nv50_translation_info *ti); +boolean nv50_program_tx(struct nv50_program *p); + +#endif /* __NV50_PROG_H__ */ diff --git a/src/gallium/drivers/nv50/nv50_push.c b/src/gallium/drivers/nv50/nv50_push.c index c3ac804146..481182dd8d 100644 --- a/src/gallium/drivers/nv50/nv50_push.c +++ b/src/gallium/drivers/nv50/nv50_push.c @@ -227,7 +227,7 @@ nv50_push_elements_instanced(struct pipe_context *pipe, ctx.idxbuf = NULL; ctx.vtx_size = 0; ctx.edgeflag = 0.5f; - ctx.edgeflag_attr = nv50->vertprog->cfg.edgeflag_in; + ctx.edgeflag_attr = nv50->vertprog->vp.edgeflag; /* map vertex buffers, determine vertex size */ for (i = 0; i < nv50->vtxelt->num_elements; i++) { diff --git a/src/gallium/drivers/nv50/nv50_shader_state.c b/src/gallium/drivers/nv50/nv50_shader_state.c new file mode 100644 index 0000000000..f7e6355286 --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_shader_state.c @@ -0,0 +1,619 @@ +/* + * Copyright 2008 Ben Skeggs + * Copyright 2010 Christoph Bumiller + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" +#include "util/u_inlines.h" + +#include "nv50_context.h" +#include "nv50_transfer.h" + +static void +nv50_transfer_constbuf(struct nv50_context *nv50, + struct pipe_resource *buf, unsigned size, unsigned cbi) +{ + struct pipe_context *pipe = &nv50->pipe; + struct pipe_transfer *transfer; + struct nouveau_channel *chan = nv50->screen->base.channel; + struct nouveau_grobj *tesla = nv50->screen->tesla; + uint32_t *map; + unsigned count, start; + + map = pipe_buffer_map(pipe, buf, PIPE_TRANSFER_READ, &transfer); + if (!map) + return; + + count = MIN2(buf->width0, size); + start = 0; + + while (count) { + unsigned nr = count; + nr = MIN2(nr, 2047); + + /* FIXME: emit relocs for unsuiTed MM */ + BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1); + OUT_RING (chan, (start << 8) | cbi); + BEGIN_RING_NI(chan, tesla, NV50TCL_CB_DATA(0), nr); + OUT_RINGp (chan, map, nr); + + count -= nr; + start += nr; + map += nr; + } + + pipe_buffer_unmap(pipe, buf, transfer); +} + +static void +nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p) +{ + struct nouveau_channel *chan = nv50->screen->base.channel; + struct nouveau_grobj *tesla = nv50->screen->tesla; + unsigned cbi; + + if (p->immd_size) { + uint32_t *data = p->immd; + unsigned count = p->immd_size / 4; + unsigned start = 0; + + while (count) { + unsigned nr = count; + nr = MIN2(nr, 2047); + + BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1); + OUT_RING (chan, (start << 8) | NV50_CB_PMISC); + BEGIN_RING_NI(chan, tesla, NV50TCL_CB_DATA(0), nr); + OUT_RINGp (chan, data, nr); + + count -= nr; + start += nr; + data += nr; + } + } + + if (p->parm_size == 0) + return; + + switch 
(p->type) { + case PIPE_SHADER_VERTEX: + cbi = NV50_CB_PVP; + break; + case PIPE_SHADER_FRAGMENT: + cbi = NV50_CB_PFP; + break; + case PIPE_SHADER_GEOMETRY: + cbi = NV50_CB_PGP; + break; + default: + assert(0); + break; + } + + nv50_transfer_constbuf(nv50, nv50->constbuf[p->type], p->parm_size, cbi); +} + +static void +nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p) +{ + struct nouveau_channel *chan = nv50->screen->base.channel; + struct nouveau_grobj *tesla = nv50->screen->tesla; + struct nouveau_grobj *eng2d = nv50->screen->eng2d; + int ret; + unsigned offset; + unsigned size = p->code_size; + uint32_t *data = p->code; + + assert(p->translated); + + /* TODO: use a single bo (for each type) for shader code */ + if (p->bo) + return; + ret = nouveau_bo_new(chan->device, NOUVEAU_BO_VRAM, 0x100, size, &p->bo); + assert(!ret); + + offset = p->code_start = 0; + + BEGIN_RING(chan, eng2d, NV50_2D_DST_FORMAT, 2); + OUT_RING (chan, NV50_2D_DST_FORMAT_R8_UNORM); + OUT_RING (chan, 1); + BEGIN_RING(chan, eng2d, NV50_2D_DST_PITCH, 1); + OUT_RING (chan, 0x40000); + BEGIN_RING(chan, eng2d, NV50_2D_DST_WIDTH, 2); + OUT_RING (chan, 0x10000); + OUT_RING (chan, 1); + + while (size) { + unsigned nr = size / 4; + + if (AVAIL_RING(chan) < 32) + FIRE_RING(chan); + + nr = MIN2(nr, AVAIL_RING(chan) - 18); + nr = MIN2(nr, 1792); + if (nr < (size / 4)) + nr &= ~0x3f; + assert(!(size & 3)); + + BEGIN_RING(chan, eng2d, NV50_2D_DST_ADDRESS_HIGH, 2); + OUT_RELOCh(chan, p->bo, offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + OUT_RELOCl(chan, p->bo, offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + BEGIN_RING(chan, eng2d, NV50_2D_SIFC_BITMAP_ENABLE, 2); + OUT_RING (chan, 0); + OUT_RING (chan, NV50_2D_SIFC_FORMAT_R8_UNORM); + BEGIN_RING(chan, eng2d, NV50_2D_SIFC_WIDTH, 10); + OUT_RING (chan, nr * 4); + OUT_RING (chan, 1); + OUT_RING (chan, 0); + OUT_RING (chan, 1); + OUT_RING (chan, 0); + OUT_RING (chan, 1); + OUT_RING (chan, 0); + OUT_RING (chan, 0); + OUT_RING (chan, 0); + 
OUT_RING (chan, 0); + + BEGIN_RING_NI(chan, eng2d, NV50_2D_SIFC_DATA, nr); + OUT_RINGp (chan, data, nr); + + data += nr; + offset += nr * 4; + size -= nr * 4; + } + + BEGIN_RING(chan, tesla, NV50TCL_CODE_CB_FLUSH, 1); + OUT_RING (chan, 0); +} + +static void +nv50_vp_update_stateobj(struct nv50_context *nv50, struct nv50_program *p) +{ + struct nouveau_grobj *tesla = nv50->screen->tesla; + struct nouveau_stateobj *so = so_new(5, 7, 2); + + nv50_program_validate_code(nv50, p); + + so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2); + so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | + NOUVEAU_BO_HIGH, 0, 0); + so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | + NOUVEAU_BO_LOW, 0, 0); + so_method(so, tesla, NV50TCL_VP_ATTR_EN_0, 2); + so_data (so, p->vp.attrs[0]); + so_data (so, p->vp.attrs[1]); + so_method(so, tesla, NV50TCL_VP_REG_ALLOC_RESULT, 1); + so_data (so, p->max_out); + so_method(so, tesla, NV50TCL_VP_REG_ALLOC_TEMP, 1); + so_data (so, p->max_gpr); + so_method(so, tesla, NV50TCL_VP_START_ID, 1); + so_data (so, p->code_start); + + so_ref(so, &p->so); + so_ref(NULL, &so); +} + +static void +nv50_fp_update_stateobj(struct nv50_context *nv50, struct nv50_program *p) +{ + struct nouveau_grobj *tesla = nv50->screen->tesla; + struct nouveau_stateobj *so = so_new(6, 7, 2); + + nv50_program_validate_code(nv50, p); + + so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2); + so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | + NOUVEAU_BO_HIGH, 0, 0); + so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | + NOUVEAU_BO_LOW, 0, 0); + so_method(so, tesla, NV50TCL_FP_REG_ALLOC_TEMP, 1); + so_data (so, p->max_gpr); + so_method(so, tesla, NV50TCL_FP_RESULT_COUNT, 1); + so_data (so, p->max_out); + so_method(so, tesla, NV50TCL_FP_CONTROL, 1); + so_data (so, p->fp.flags[0]); + so_method(so, tesla, NV50TCL_FP_CTRL_UNK196C, 1); + so_data (so, p->fp.flags[1]); + so_method(so, tesla, NV50TCL_FP_START_ID, 1); + so_data (so, p->code_start); + + so_ref(so, 
&p->so); + so_ref(NULL, &so); +} + +static void +nv50_gp_update_stateobj(struct nv50_context *nv50, struct nv50_program *p) +{ + struct nouveau_grobj *tesla = nv50->screen->tesla; + struct nouveau_stateobj *so = so_new(6, 7, 2); + + nv50_program_validate_code(nv50, p); + + so_method(so, tesla, NV50TCL_GP_ADDRESS_HIGH, 2); + so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | + NOUVEAU_BO_HIGH, 0, 0); + so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | + NOUVEAU_BO_LOW, 0, 0); + so_method(so, tesla, NV50TCL_GP_REG_ALLOC_TEMP, 1); + so_data (so, p->max_gpr); + so_method(so, tesla, NV50TCL_GP_REG_ALLOC_RESULT, 1); + so_data (so, p->max_out); + so_method(so, tesla, NV50TCL_GP_OUTPUT_PRIMITIVE_TYPE, 1); + so_data (so, p->gp.prim_type); + so_method(so, tesla, NV50TCL_GP_VERTEX_OUTPUT_COUNT, 1); + so_data (so, p->gp.vert_count); + so_method(so, tesla, NV50TCL_GP_START_ID, 1); + so_data (so, p->code_start); + + so_ref(so, &p->so); + so_ref(NULL, &so); +} + +static boolean +nv50_program_validate(struct nv50_program *p) +{ + p->translated = nv50_program_tx(p); + assert(p->translated); + return p->translated; +} + +struct nouveau_stateobj * +nv50_vertprog_validate(struct nv50_context *nv50) +{ + struct nv50_program *p = nv50->vertprog; + struct nouveau_stateobj *so = NULL; + + if (!p->translated) { + if (nv50_program_validate(p)) + nv50_vp_update_stateobj(nv50, p); + else + return NULL; + } + + if (nv50->dirty & NV50_NEW_VERTPROG_CB) + nv50_program_validate_data(nv50, p); + + if (!(nv50->dirty & NV50_NEW_VERTPROG)) + return NULL; + + nv50_program_validate_code(nv50, p); + + so_ref(p->so, &so); + return so; +} + +struct nouveau_stateobj * +nv50_fragprog_validate(struct nv50_context *nv50) +{ + struct nv50_program *p = nv50->fragprog; + struct nouveau_stateobj *so = NULL; + + if (!p->translated) { + if (nv50_program_validate(p)) + nv50_fp_update_stateobj(nv50, p); + else + return NULL; + } + + if (nv50->dirty & NV50_NEW_FRAGPROG_CB) + 
nv50_program_validate_data(nv50, p); + + if (!(nv50->dirty & NV50_NEW_FRAGPROG)) + return NULL; + + nv50_program_validate_code(nv50, p); + + so_ref(p->so, &so); + return so; +} + +struct nouveau_stateobj * +nv50_geomprog_validate(struct nv50_context *nv50) +{ + struct nv50_program *p = nv50->geomprog; + struct nouveau_stateobj *so = NULL; + + if (!p->translated) { + if (nv50_program_validate(p)) + nv50_gp_update_stateobj(nv50, p); + else + return NULL; + } + + if (nv50->dirty & NV50_NEW_GEOMPROG_CB) + nv50_program_validate_data(nv50, p); + + if (!(nv50->dirty & NV50_NEW_GEOMPROG)) + return NULL; + + nv50_program_validate_code(nv50, p); + + so_ref(p->so, &so); + return so; +} + +/* XXX: this might not work correctly in all cases yet: we assume that + * an FP generic input that is not written in the VP is gl_PointCoord. + */ +static uint32_t +nv50_pntc_replace(struct nv50_context *nv50, uint32_t pntc[8], unsigned m) +{ + struct nv50_program *vp = nv50->vertprog; + struct nv50_program *fp = nv50->fragprog; + unsigned i, c; + + memset(pntc, 0, 8 * sizeof(uint32_t)); + + if (nv50->geomprog) + vp = nv50->geomprog; + + for (i = 0; i < fp->in_nr; i++) { + unsigned j, n = util_bitcount(fp->in[i].mask); + + if (fp->in[i].sn != TGSI_SEMANTIC_GENERIC) { + m += n; + continue; + } + + for (j = 0; j < vp->out_nr; ++j) + if (vp->out[j].sn == fp->in[i].sn && vp->out[j].si == fp->in[i].si) + break; + + if (j < vp->out_nr) { + ubyte en = nv50->rasterizer->pipe.sprite_coord_enable; + + if (!(en & (1 << vp->out[j].si))) { + m += n; + continue; + } + } + + /* this is either PointCoord or replaced by sprite coords */ + for (c = 0; c < 4; c++) { + if (!(fp->in[i].mask & (1 << c))) + continue; + pntc[m / 8] |= (c + 1) << ((m % 8) * 4); + ++m; + } + } + if (nv50->rasterizer->pipe.sprite_coord_mode == PIPE_SPRITE_COORD_LOWER_LEFT) + return 0; + return (1 << 4); +} + +static int +nv50_vec4_map(uint32_t *map32, int mid, uint32_t lin[4], + struct nv50_varying *in, struct nv50_varying *out) +{ + 
int c; + uint8_t mv = out->mask, mf = in->mask, oid = out->hw; + uint8_t *map = (uint8_t *)map32; + + for (c = 0; c < 4; ++c) { + if (mf & 1) { + if (in->linear) + lin[mid / 32] |= 1 << (mid % 32); + if (mv & 1) + map[mid] = oid; + else + if (c == 3) + map[mid] |= 1; + ++mid; + } + + oid += mv & 1; + mf >>= 1; + mv >>= 1; + } + + return mid; +} + +struct nouveau_stateobj * +nv50_fp_linkage_validate(struct nv50_context *nv50) +{ + struct nouveau_grobj *tesla = nv50->screen->tesla; + struct nv50_program *vp; + struct nv50_program *fp = nv50->fragprog; + struct nouveau_stateobj *so; + struct nv50_varying dummy; + int i, n, c, m; + + uint32_t map[16], lin[4], pntc[8]; + + uint32_t interp = fp->fp.interp; + uint32_t colors = fp->fp.colors; + uint32_t clip = 0x04; + uint32_t psiz = 0x000; + uint32_t primid = 0; + uint32_t sysval = 0; + + if (nv50->geomprog) { + vp = nv50->geomprog; + memset(map, 0x80, sizeof(map)); + } else { + vp = nv50->vertprog; + memset(map, 0x40, sizeof(map)); + } + memset(lin, 0, sizeof(lin)); + + dummy.linear = 0; + dummy.mask = 0xf; /* map all components of HPOS */ + m = nv50_vec4_map(map, 0, lin, &dummy, &vp->out[0]); + + if (vp->vp.clpd < 0x40) { + for (c = 0; c < vp->vp.clpd_nr; ++c) { + map[m / 4] |= (vp->vp.clpd + c) << ((m % 4) * 8); + ++m; + } + clip |= vp->vp.clpd_nr << 8; + } + + colors |= m << 8; /* adjust BFC0 id */ + + /* if light_twoside is active, it seems FFC0_ID == BFC0_ID is bad */ + if (nv50->rasterizer->pipe.light_twoside) { + for (i = 0; i < 2; ++i) + m = nv50_vec4_map(map, m, lin, + &fp->in[fp->vp.bfc[i]], + &vp->out[vp->vp.bfc[i]]); + } + + colors += m - 4; /* adjust FFC0 id */ + interp |= m << 8; /* set mid where 'normal' FP inputs start */ + + dummy.mask = 0x0; + for (i = 0; i < fp->in_nr; i++) { + for (n = 0; n < vp->out_nr; ++n) + if (vp->out[n].sn == fp->in[i].sn && + vp->out[n].si == fp->in[i].si) + break; + + m = nv50_vec4_map(map, m, lin, + &fp->in[i], (n < vp->out_nr) ? 
&vp->out[n] : &dummy); + } + /* PrimitiveID either is replaced by the system value, or + * written by the geometry shader into an output register + */ + if (fp->gp.primid < 0x40) { + map[m / 4] |= vp->gp.primid << ((m % 4) * 8); + primid = m++; + } + + if (nv50->rasterizer->pipe.point_size_per_vertex) { + map[m / 4] |= vp->vp.psiz << ((m % 4) * 8); + psiz = (m++ << 4) | 1; + } + + /* now fill the stateobj (at most 28 so_data) */ + so = so_new(10, 54, 0); + + n = (m + 3) / 4; + assert(m <= 64); + if (vp->type == PIPE_SHADER_GEOMETRY) { + so_method(so, tesla, NV50TCL_GP_RESULT_MAP_SIZE, 1); + so_data (so, m); + so_method(so, tesla, NV50TCL_GP_RESULT_MAP(0), n); + so_datap (so, map, n); + } else { + so_method(so, tesla, NV50TCL_VP_GP_BUILTIN_ATTR_EN, 1); + so_data (so, vp->vp.attrs[2]); + + so_method(so, tesla, NV50TCL_MAP_SEMANTIC_4, 1); + so_data (so, primid); + + so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 1); + so_data (so, m); + so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), n); + so_datap (so, map, n); + } + + //colors = 0x01000404; + so_method(so, tesla, NV50TCL_MAP_SEMANTIC_0, 4); + so_data (so, colors); + so_data (so, clip); + so_data (so, sysval); + so_data (so, psiz); + + so_method(so, tesla, NV50TCL_FP_INTERPOLANT_CTRL, 1); + so_data (so, interp); + + so_method(so, tesla, NV50TCL_NOPERSPECTIVE_BITMAP(0), 4); + so_datap (so, lin, 4); + + if (nv50->rasterizer->pipe.sprite_coord_enable) { + so_method(so, tesla, NV50TCL_POINT_SPRITE_CTRL, 1); + so_data (so, + nv50_pntc_replace(nv50, pntc, (interp >> 8) & 0xff)); + + so_method(so, tesla, NV50TCL_POINT_COORD_REPLACE_MAP(0), 8); + so_datap (so, pntc, 8); + } + + so_method(so, tesla, NV50TCL_GP_ENABLE, 1); + so_data (so, (vp->type == PIPE_SHADER_GEOMETRY) ? 
1 : 0); + + return so; +} + +static int +nv50_vp_gp_mapping(uint32_t *map32, int m, + struct nv50_program *vp, struct nv50_program *gp) +{ + uint8_t *map = (uint8_t *)map32; + int i, j, c; + + for (i = 0; i < gp->in_nr; ++i) { + uint8_t oid = 0, mv = 0, mg = gp->in[i].mask; + + for (j = 0; j < vp->out_nr; ++j) { + if (vp->out[j].sn == gp->in[i].sn && + vp->out[j].si == gp->in[i].si) { + mv = vp->out[j].mask; + oid = vp->out[j].hw; + break; + } + } + + for (c = 0; c < 4; ++c, mv >>= 1, mg >>= 1) { + if (mg & mv & 1) + map[m++] = oid; + else + if (mg & 1) + map[m++] = (c == 3) ? 0x41 : 0x40; + oid += mv & 1; + } + } + return m; +} + +struct nouveau_stateobj * +nv50_gp_linkage_validate(struct nv50_context *nv50) +{ + struct nouveau_grobj *tesla = nv50->screen->tesla; + struct nouveau_stateobj *so; + struct nv50_program *vp = nv50->vertprog; + struct nv50_program *gp = nv50->geomprog; + uint32_t map[16]; + int m = 0; + + if (!gp) + return NULL; + memset(map, 0, sizeof(map)); + + m = nv50_vp_gp_mapping(map, m, vp, gp); + + so = so_new(3, 24 - 3, 0); + + so_method(so, tesla, NV50TCL_VP_GP_BUILTIN_ATTR_EN, 1); + so_data (so, vp->vp.attrs[2] | gp->vp.attrs[2]); + + assert(m <= 32); + so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 1); + so_data (so, m); + + m = (m + 3) / 4; + so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), m); + so_datap (so, map, m); + + return so; +} diff --git a/src/gallium/drivers/nv50/nv50_state.c b/src/gallium/drivers/nv50/nv50_state.c index 42c5a58318..0d744ab788 100644 --- a/src/gallium/drivers/nv50/nv50_state.c +++ b/src/gallium/drivers/nv50/nv50_state.c @@ -546,7 +546,6 @@ nv50_vp_state_create(struct pipe_context *pipe, p->pipe.tokens = tgsi_dup_tokens(cso->tokens); p->type = PIPE_SHADER_VERTEX; - tgsi_scan_shader(p->pipe.tokens, &p->info); return (void *)p; } @@ -578,7 +577,6 @@ nv50_fp_state_create(struct pipe_context *pipe, p->pipe.tokens = tgsi_dup_tokens(cso->tokens); p->type = PIPE_SHADER_FRAGMENT; - tgsi_scan_shader(p->pipe.tokens, 
&p->info); return (void *)p; } @@ -610,7 +608,6 @@ nv50_gp_state_create(struct pipe_context *pipe, p->pipe.tokens = tgsi_dup_tokens(cso->tokens); p->type = PIPE_SHADER_GEOMETRY; - tgsi_scan_shader(p->pipe.tokens, &p->info); return (void *)p; } diff --git a/src/gallium/drivers/nv50/nv50_state_validate.c b/src/gallium/drivers/nv50/nv50_state_validate.c index 524696f35d..8d662d8f60 100644 --- a/src/gallium/drivers/nv50/nv50_state_validate.c +++ b/src/gallium/drivers/nv50/nv50_state_validate.c @@ -81,6 +81,9 @@ validate_fb(struct nv50_context *nv50) case PIPE_FORMAT_R16G16B16A16_UNORM: so_data(so, NV50TCL_RT_FORMAT_R16G16B16A16_UNORM); break; + case PIPE_FORMAT_R16G16B16A16_FLOAT: + so_data(so, NV50TCL_RT_FORMAT_R16G16B16A16_FLOAT); + break; case PIPE_FORMAT_R32G32B32A32_FLOAT: so_data(so, NV50TCL_RT_FORMAT_R32G32B32A32_FLOAT); break; @@ -135,6 +138,12 @@ validate_fb(struct nv50_context *nv50) case PIPE_FORMAT_Z32_FLOAT: so_data(so, NV50TCL_ZETA_FORMAT_Z32_FLOAT); break; + case PIPE_FORMAT_Z32_FLOAT_S8X24_USCALED: + so_data(so, NV50TCL_ZETA_FORMAT_Z32_FLOAT_X24S8_UNORM); + break; + case PIPE_FORMAT_Z16_UNORM: + so_data(so, NV50TCL_ZETA_FORMAT_Z16_UNORM); + break; default: NOUVEAU_ERR("AIIII unknown format %s\n", util_format_name(fb->zsbuf->format)); diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c new file mode 100644 index 0000000000..aa15917774 --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c @@ -0,0 +1,1266 @@ + +#include <unistd.h> + +#include "nv50_context.h" +#include "nv50_pc.h" + +#include "pipe/p_shader_tokens.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_util.h" + +#include "util/u_simple_list.h" +#include "tgsi/tgsi_dump.h" + +#define BLD_MAX_TEMPS 64 +#define BLD_MAX_ADDRS 4 +#define BLD_MAX_PREDS 4 +#define BLD_MAX_IMMDS 128 + +#define BLD_MAX_COND_NESTING 4 +#define BLD_MAX_LOOP_NESTING 4 +#define BLD_MAX_CALL_NESTING 2 + +/* collects all values assigned to the same TGSI register */ 
+struct bld_value_stack { + struct nv_value *top; + struct nv_value **body; + unsigned size; +}; + +static INLINE void +bld_push_value(struct bld_value_stack *stk) +{ + assert(!stk->size || (stk->body[stk->size - 1] != stk->top)); + + if (!(stk->size % 8)) { + unsigned old_sz = (stk->size + 0) * sizeof(struct nv_value *); + unsigned new_sz = (stk->size + 8) * sizeof(struct nv_value *); + stk->body = (struct nv_value **)REALLOC(stk->body, old_sz, new_sz); + } + stk->body[stk->size++] = stk->top; + stk->top = NULL; +} + +static INLINE void +bld_push_values(struct bld_value_stack *stacks, int n) +{ + int i, c; + + for (i = 0; i < n; ++i) + for (c = 0; c < 4; ++c) + if (stacks[i * 4 + c].top) + bld_push_value(&stacks[i * 4 + c]); +} + +#define FETCH_TEMP(i, c) (bld->tvs[i][c].top) +#define STORE_TEMP(i, c, v) (bld->tvs[i][c].top = (v)) +#define FETCH_ADDR(i, c) (bld->avs[i][c].top) +#define STORE_ADDR(i, c, v) (bld->avs[i][c].top = (v)) +#define FETCH_PRED(i, c) (bld->pvs[i][c].top) +#define STORE_PRED(i, c, v) (bld->pvs[i][c].top = (v)) +#define FETCH_OUTR(i, c) (bld->ovs[i][c].top) +#define STORE_OUTR(i, c, v) \ + do { \ + bld->ovs[i][c].top = (v); \ + bld->outputs_written[(i) / 8] |= 1 << (((i) * 4 + (c)) % 32); \ + } while (0) + +struct bld_context { + struct nv50_translation_info *ti; + + struct nv_pc *pc; + struct nv_basic_block *b; + + struct tgsi_parse_context parse[BLD_MAX_CALL_NESTING]; + int call_lvl; + + struct nv_basic_block *cond_bb[BLD_MAX_COND_NESTING]; + struct nv_basic_block *join_bb[BLD_MAX_COND_NESTING]; + struct nv_basic_block *else_bb[BLD_MAX_COND_NESTING]; + int cond_lvl; + struct nv_basic_block *loop_bb[BLD_MAX_LOOP_NESTING]; + int loop_lvl; + + struct bld_value_stack tvs[BLD_MAX_TEMPS][4]; /* TGSI_FILE_TEMPORARY */ + struct bld_value_stack avs[BLD_MAX_ADDRS][4]; /* TGSI_FILE_ADDRESS */ + struct bld_value_stack pvs[BLD_MAX_PREDS][4]; /* TGSI_FILE_PREDICATE */ + struct bld_value_stack ovs[PIPE_MAX_SHADER_OUTPUTS][4]; + + uint32_t 
outputs_written[PIPE_MAX_SHADER_OUTPUTS / 32]; + + struct nv_value *frgcrd[4]; + struct nv_value *sysval[4]; + + /* wipe on new BB */ + struct nv_value *saved_addr[4][2]; + struct nv_value *saved_inputs[128]; + struct nv_value *saved_immd[BLD_MAX_IMMDS]; + uint num_immds; +}; + +static INLINE struct nv_value * +bld_def(struct nv_instruction *i, int c, struct nv_value *value) +{ + i->def[c] = value; + value->insn = i; + return value; +} + +static INLINE struct nv_value * +find_by_bb(struct bld_value_stack *stack, struct nv_basic_block *b) +{ + int i; + + if (stack->top && stack->top->insn->bb == b) + return stack->top; + + for (i = stack->size - 1; i >= 0; --i) + if (stack->body[i]->insn->bb == b) + return stack->body[i]; + return NULL; +} + +/* fetch value from stack that was defined in the specified basic block, + * or search for first definitions in all of its predecessors + */ +static void +fetch_by_bb(struct bld_value_stack *stack, + struct nv_value **vals, int *n, + struct nv_basic_block *b) +{ + int i; + struct nv_value *val; + + assert(*n < 16); /* MAX_COND_NESTING */ + + val = find_by_bb(stack, b); + if (val) { + for (i = 0; i < *n; ++i) + if (vals[i] == val) + return; + vals[(*n)++] = val; + return; + } + for (i = 0; i < b->num_in; ++i) + fetch_by_bb(stack, vals, n, b->in[i]); +} + +static struct nv_value * +bld_fetch_global(struct bld_context *bld, struct bld_value_stack *stack) +{ + struct nv_value *vals[16], *phi = NULL; + int j, i = 0, n = 0; + + fetch_by_bb(stack, vals, &n, bld->pc->current_block); + + assert(n); + if (n == 1) + return vals[0]; + + debug_printf("phi required: %i candidates\n", n); + + while (i < n) { + struct nv_instruction *insn = new_instruction(bld->pc, NV_OP_PHI); + + j = phi ? 
1 : 0; + if (phi) + insn->src[0] = new_ref(bld->pc, phi); + + phi = new_value(bld->pc, vals[0]->reg.file, vals[0]->reg.type); + + bld_def(insn, 0, phi); + + for (; j < 4; ++j) { + insn->src[j] = new_ref(bld->pc, vals[i++]); + if (i == n) + break; + } + debug_printf("new phi: %i, %i in\n", phi->n, j); + } + + /* insert_at_head(list, phi) is done at end of block */ + return phi; +} + +static INLINE struct nv_value * +bld_imm_u32(struct bld_context *bld, uint32_t u) +{ + int i; + unsigned n = bld->num_immds; + + debug_printf("bld_imm_u32: 0x%08x\n", u); + + for (i = 0; i < n; ++i) + if (bld->saved_immd[i]->reg.imm.u32 == u) + return bld->saved_immd[i]; + assert(n < BLD_MAX_IMMDS); + + debug_printf("need new one\n"); + + bld->num_immds++; + + bld->saved_immd[n] = new_value(bld->pc, NV_FILE_IMM, NV_TYPE_U32); + bld->saved_immd[n]->reg.imm.u32 = u; + return bld->saved_immd[n]; +} + +static INLINE struct nv_value * +bld_imm_f32(struct bld_context *bld, float f) +{ + return bld_imm_u32(bld, fui(f)); +} + +#define SET_TYPE(v, t) ((v)->reg.type = NV_TYPE_##t) + +static struct nv_value * +bld_insn_1(struct bld_context *bld, uint opcode, struct nv_value *src0) +{ + struct nv_instruction *insn = new_instruction(bld->pc, opcode); + assert(insn); + + nv_reference(bld->pc, &insn->src[0], src0); /* NOTE: new_ref would suffice */ + + return bld_def(insn, 0, new_value(bld->pc, NV_FILE_GPR, src0->reg.type)); +} + +static struct nv_value * +bld_insn_2(struct bld_context *bld, uint opcode, + struct nv_value *src0, struct nv_value *src1) +{ + struct nv_instruction *insn = new_instruction(bld->pc, opcode); + + nv_reference(bld->pc, &insn->src[0], src0); + nv_reference(bld->pc, &insn->src[1], src1); + + return bld_def(insn, 0, new_value(bld->pc, NV_FILE_GPR, src0->reg.type)); +} + +static struct nv_value * +bld_insn_3(struct bld_context *bld, uint opcode, + struct nv_value *src0, struct nv_value *src1, + struct nv_value *src2) +{ + struct nv_instruction *insn = new_instruction(bld->pc, 
opcode); + + nv_reference(bld->pc, &insn->src[0], src0); + nv_reference(bld->pc, &insn->src[1], src1); + nv_reference(bld->pc, &insn->src[2], src2); + + return bld_def(insn, 0, new_value(bld->pc, NV_FILE_GPR, src0->reg.type)); +} + +#define BLD_INSN_1_EX(d, op, dt, s0, s0t) \ + do { \ + (d) = bld_insn_1(bld, (NV_OP_##op), (s0)); \ + (d)->reg.type = NV_TYPE_##dt; \ + (d)->insn->src[0]->typecast = NV_TYPE_##s0t; \ + } while(0) + +static struct nv_value * +bld_pow(struct bld_context *bld, struct nv_value *x, struct nv_value *e) +{ + struct nv_value *val; + + BLD_INSN_1_EX(val, LG2, F32, x, F32); + BLD_INSN_1_EX(val, MUL, F32, e, F32); + val = bld_insn_1(bld, NV_OP_PREEX2, val); + val = bld_insn_1(bld, NV_OP_EX2, val); + + return val; +} + +static INLINE struct nv_value * +bld_load_imm_f32(struct bld_context *bld, float f) +{ + return bld_insn_1(bld, NV_OP_MOV, bld_imm_f32(bld, f)); +} + +static INLINE struct nv_value * +bld_load_imm_u32(struct bld_context *bld, uint32_t u) +{ + return bld_insn_1(bld, NV_OP_MOV, bld_imm_u32(bld, u)); +} + +static struct nv_value * +bld_get_address(struct bld_context *bld, int id, struct nv_value *indirect) +{ + int i; + struct nv_instruction *nvi; + + for (i = 0; i < 4; ++i) { + if (!bld->saved_addr[i][0]) + break; + if (bld->saved_addr[i][1] == indirect) { + nvi = bld->saved_addr[i][0]->insn; + if (nvi->src[0]->value->reg.imm.u32 == id) + return bld->saved_addr[i][0]; + } + } + i &= 3; + + bld->saved_addr[i][0] = bld_load_imm_u32(bld, id); + bld->saved_addr[i][0]->reg.file = NV_FILE_ADDR; + bld->saved_addr[i][1] = indirect; + return bld->saved_addr[i][0]; +} + + +static struct nv_value * +bld_predicate(struct bld_context *bld, struct nv_value *src) +{ + struct nv_instruction *nvi = src->insn; + + if (nvi->opcode == NV_OP_LDA || + nvi->opcode == NV_OP_PHI || + nvi->bb != bld->pc->current_block) { + nvi = new_instruction(bld->pc, NV_OP_CVT); + nv_reference(bld->pc, &nvi->src[0], src); + } + + if (!nvi->flags_def) { + nvi->flags_def = 
new_value(bld->pc, NV_FILE_FLAGS, NV_TYPE_U16); + nvi->flags_def->insn = nvi; + } + return nvi->flags_def; +} + +static void +bld_kil(struct bld_context *bld, struct nv_value *src) +{ + struct nv_instruction *nvi; + + src = bld_predicate(bld, src); + nvi = new_instruction(bld->pc, NV_OP_KIL); + nvi->fixed = 1; + nvi->flags_src = new_ref(bld->pc, src); + nvi->cc = NV_CC_LT; +} + +static void +bld_flow(struct bld_context *bld, uint opcode, ubyte cc, + struct nv_value *src, boolean plan_reconverge) +{ + struct nv_instruction *nvi; + + if (plan_reconverge) + new_instruction(bld->pc, NV_OP_JOINAT)->fixed = 1; + + nvi = new_instruction(bld->pc, opcode); + nvi->is_terminator = 1; + nvi->cc = cc; + nvi->flags_src = new_ref(bld->pc, src); +} + +static ubyte +translate_setcc(unsigned opcode) +{ + switch (opcode) { + case TGSI_OPCODE_SLT: return NV_CC_LT; + case TGSI_OPCODE_SGE: return NV_CC_GE; + case TGSI_OPCODE_SEQ: return NV_CC_EQ; + case TGSI_OPCODE_SGT: return NV_CC_GT; + case TGSI_OPCODE_SLE: return NV_CC_LE; + case TGSI_OPCODE_SNE: return NV_CC_NE | NV_CC_U; + case TGSI_OPCODE_STR: return NV_CC_TR; + case TGSI_OPCODE_SFL: return NV_CC_FL; + + case TGSI_OPCODE_ISLT: return NV_CC_LT; + case TGSI_OPCODE_ISGE: return NV_CC_GE; + case TGSI_OPCODE_USEQ: return NV_CC_EQ; + case TGSI_OPCODE_USGE: return NV_CC_GE; + case TGSI_OPCODE_USLT: return NV_CC_LT; + case TGSI_OPCODE_USNE: return NV_CC_NE; + default: + assert(0); + return NV_CC_FL; + } +} + +static uint +translate_opcode(uint opcode) +{ + switch (opcode) { + case TGSI_OPCODE_ABS: return NV_OP_ABS; + case TGSI_OPCODE_ADD: + case TGSI_OPCODE_SUB: + case TGSI_OPCODE_UADD: return NV_OP_ADD; + case TGSI_OPCODE_AND: return NV_OP_AND; + case TGSI_OPCODE_EX2: return NV_OP_EX2; + case TGSI_OPCODE_CEIL: return NV_OP_CEIL; + case TGSI_OPCODE_FLR: return NV_OP_FLOOR; + case TGSI_OPCODE_TRUNC: return NV_OP_TRUNC; + case TGSI_OPCODE_DDX: return NV_OP_DFDX; + case TGSI_OPCODE_DDY: return NV_OP_DFDY; + case TGSI_OPCODE_F2I: + case 
TGSI_OPCODE_F2U: + case TGSI_OPCODE_I2F: + case TGSI_OPCODE_U2F: return NV_OP_CVT; + case TGSI_OPCODE_INEG: return NV_OP_NEG; + case TGSI_OPCODE_LG2: return NV_OP_LG2; + case TGSI_OPCODE_ISHR: + case TGSI_OPCODE_USHR: return NV_OP_SHR; + case TGSI_OPCODE_MAD: + case TGSI_OPCODE_UMAD: return NV_OP_MAD; + case TGSI_OPCODE_MAX: + case TGSI_OPCODE_IMAX: + case TGSI_OPCODE_UMAX: return NV_OP_MAX; + case TGSI_OPCODE_MIN: + case TGSI_OPCODE_IMIN: + case TGSI_OPCODE_UMIN: return NV_OP_MIN; + case TGSI_OPCODE_MUL: + case TGSI_OPCODE_UMUL: return NV_OP_MUL; + case TGSI_OPCODE_OR: return NV_OP_OR; + case TGSI_OPCODE_RCP: return NV_OP_RCP; + case TGSI_OPCODE_RSQ: return NV_OP_RSQ; + case TGSI_OPCODE_SAD: return NV_OP_SAD; + case TGSI_OPCODE_SHL: return NV_OP_SHL; + case TGSI_OPCODE_SLT: + case TGSI_OPCODE_SGE: + case TGSI_OPCODE_SEQ: + case TGSI_OPCODE_SGT: + case TGSI_OPCODE_SLE: + case TGSI_OPCODE_SNE: + case TGSI_OPCODE_ISLT: + case TGSI_OPCODE_ISGE: + case TGSI_OPCODE_USEQ: + case TGSI_OPCODE_USGE: + case TGSI_OPCODE_USLT: + case TGSI_OPCODE_USNE: return NV_OP_SET; + case TGSI_OPCODE_TEX: return NV_OP_TEX; + case TGSI_OPCODE_TXP: return NV_OP_TEX; + case TGSI_OPCODE_TXB: return NV_OP_TXB; + case TGSI_OPCODE_TXL: return NV_OP_TXL; + case TGSI_OPCODE_XOR: return NV_OP_XOR; + default: + return NV_OP_NOP; + } +} + +static ubyte +infer_src_type(unsigned opcode) +{ + switch (opcode) { + case TGSI_OPCODE_MOV: + case TGSI_OPCODE_AND: + case TGSI_OPCODE_OR: + case TGSI_OPCODE_XOR: + case TGSI_OPCODE_SAD: + case TGSI_OPCODE_U2F: + case TGSI_OPCODE_UADD: + case TGSI_OPCODE_UDIV: + case TGSI_OPCODE_UMOD: + case TGSI_OPCODE_UMAD: + case TGSI_OPCODE_UMUL: + case TGSI_OPCODE_UMAX: + case TGSI_OPCODE_UMIN: + case TGSI_OPCODE_USEQ: + case TGSI_OPCODE_USGE: + case TGSI_OPCODE_USLT: + case TGSI_OPCODE_USNE: + case TGSI_OPCODE_USHR: + return NV_TYPE_U32; + case TGSI_OPCODE_I2F: + case TGSI_OPCODE_IDIV: + case TGSI_OPCODE_IMAX: + case TGSI_OPCODE_IMIN: + case TGSI_OPCODE_INEG: + case 
TGSI_OPCODE_ISGE: + case TGSI_OPCODE_ISHR: + case TGSI_OPCODE_ISLT: + return NV_TYPE_S32; + default: + return NV_TYPE_F32; + } +} + +static ubyte +infer_dst_type(unsigned opcode) +{ + switch (opcode) { + case TGSI_OPCODE_MOV: + case TGSI_OPCODE_F2U: + case TGSI_OPCODE_AND: + case TGSI_OPCODE_OR: + case TGSI_OPCODE_XOR: + case TGSI_OPCODE_SAD: + case TGSI_OPCODE_UADD: + case TGSI_OPCODE_UDIV: + case TGSI_OPCODE_UMOD: + case TGSI_OPCODE_UMAD: + case TGSI_OPCODE_UMUL: + case TGSI_OPCODE_UMAX: + case TGSI_OPCODE_UMIN: + case TGSI_OPCODE_USEQ: + case TGSI_OPCODE_USGE: + case TGSI_OPCODE_USLT: + case TGSI_OPCODE_USNE: + case TGSI_OPCODE_USHR: + return NV_TYPE_U32; + case TGSI_OPCODE_F2I: + case TGSI_OPCODE_IDIV: + case TGSI_OPCODE_IMAX: + case TGSI_OPCODE_IMIN: + case TGSI_OPCODE_INEG: + case TGSI_OPCODE_ISGE: + case TGSI_OPCODE_ISHR: + case TGSI_OPCODE_ISLT: + return NV_TYPE_S32; + default: + return NV_TYPE_F32; + } +} + +static void +emit_store(struct bld_context *bld, const struct tgsi_full_instruction *inst, + unsigned chan, struct nv_value *value) +{ + const struct tgsi_full_dst_register *reg = &inst->Dst[0]; + + assert(chan < 4); + + if (inst->Instruction.Opcode != TGSI_OPCODE_MOV) + value->reg.type = infer_dst_type(inst->Instruction.Opcode); + + switch (inst->Instruction.Saturate) { + case TGSI_SAT_NONE: + break; + case TGSI_SAT_ZERO_ONE: + BLD_INSN_1_EX(value, SAT, F32, value, F32); + break; + case TGSI_SAT_MINUS_PLUS_ONE: + value = bld_insn_2(bld, NV_OP_MAX, value, bld_load_imm_f32(bld, -1.0f)); + value = bld_insn_2(bld, NV_OP_MIN, value, bld_load_imm_f32(bld, 1.0f)); + value->reg.type = NV_TYPE_F32; + break; + } + + switch (reg->Register.File) { + case TGSI_FILE_OUTPUT: + value = bld_insn_1(bld, NV_OP_MOV, value); + value->reg.file = bld->ti->output_file; + + if (bld->ti->p->type == PIPE_SHADER_FRAGMENT) { + STORE_OUTR(reg->Register.Index, chan, value); + } else { + value->insn->fixed = 1; + value->reg.id = bld->ti->output_map[reg->Register.Index][chan]; + } + 
break; + case TGSI_FILE_TEMPORARY: + assert(reg->Register.Index < BLD_MAX_TEMPS); + value->reg.file = NV_FILE_GPR; + if (value->insn->bb != bld->pc->current_block) + value = bld_insn_1(bld, NV_OP_MOV, value); + STORE_TEMP(reg->Register.Index, chan, value); + break; + case TGSI_FILE_ADDRESS: + assert(reg->Register.Index < BLD_MAX_ADDRS); + value->reg.file = NV_FILE_ADDR; + STORE_ADDR(reg->Register.Index, chan, value); + break; + } +} + +static INLINE uint32_t +bld_is_output_written(struct bld_context *bld, int i, int c) +{ + if (c < 0) + return bld->outputs_written[i / 8] & (0xf << ((i * 4) % 32)); + return bld->outputs_written[i / 8] & (1 << ((i * 4 + c) % 32)); +} + +static void +bld_export_outputs(struct bld_context *bld) +{ + struct nv_value *vals[4]; + struct nv_instruction *nvi; + int i, c, n; + + bld_push_values(&bld->ovs[0][0], PIPE_MAX_SHADER_OUTPUTS); + + for (i = 0; i < PIPE_MAX_SHADER_OUTPUTS; ++i) { + if (!bld_is_output_written(bld, i, -1)) + continue; + for (n = 0, c = 0; c < 4; ++c) { + if (!bld_is_output_written(bld, i, c)) + continue; + vals[n] = bld_fetch_global(bld, &bld->ovs[i][c]); + vals[n] = bld_insn_1(bld, NV_OP_MOV, vals[n]); + vals[n++]->reg.id = bld->ti->output_map[i][c]; + } + assert(n); + + (nvi = new_instruction(bld->pc, NV_OP_EXPORT))->fixed = 1; + + for (c = 0; c < n; ++c) + nvi->src[c] = new_ref(bld->pc, vals[c]); + } +} + +static void +bld_new_block(struct bld_context *bld, struct nv_basic_block *b) +{ + int i; + + bld_push_values(&bld->tvs[0][0], BLD_MAX_TEMPS); + bld_push_values(&bld->avs[0][0], BLD_MAX_ADDRS); + bld_push_values(&bld->pvs[0][0], BLD_MAX_PREDS); + bld_push_values(&bld->ovs[0][0], PIPE_MAX_SHADER_OUTPUTS); + + bld->pc->current_block = b; + + for (i = 0; i < 4; ++i) + bld->saved_addr[i][0] = NULL; +} + +static struct nv_value * +bld_saved_input(struct bld_context *bld, unsigned i, unsigned c) +{ + unsigned idx = bld->ti->input_map[i][c]; + + if (bld->ti->p->type != PIPE_SHADER_FRAGMENT) + return NULL; + if 
(bld->saved_inputs[idx]) + return bld->saved_inputs[idx]; + return NULL; +} + +static struct nv_value * +bld_interpolate(struct bld_context *bld, unsigned mode, struct nv_value *val) +{ + if (mode & (NV50_INTERP_LINEAR | NV50_INTERP_FLAT)) + val = bld_insn_1(bld, NV_OP_LINTERP, val); + else + val = bld_insn_2(bld, NV_OP_PINTERP, val, bld->frgcrd[3]); + + val->insn->flat = (mode & NV50_INTERP_FLAT) ? 1 : 0; + val->insn->centroid = (mode & NV50_INTERP_CENTROID) ? 1 : 0; + return val; +} + +static struct nv_value * +emit_fetch(struct bld_context *bld, const struct tgsi_full_instruction *insn, + const unsigned s, const unsigned chan) +{ + const struct tgsi_full_src_register *src = &insn->Src[s]; + struct nv_value *res; + unsigned idx, swz, dim_idx, ind_idx, ind_swz; + ubyte type = infer_src_type(insn->Instruction.Opcode); + + idx = src->Register.Index; + swz = tgsi_util_get_full_src_register_swizzle(src, chan); + dim_idx = -1; + ind_idx = -1; + ind_swz = 0; + + if (src->Register.Indirect) { + ind_idx = src->Indirect.Index; + ind_swz = tgsi_util_get_src_register_swizzle(&src->Indirect, 0); + } + + switch (src->Register.File) { + case TGSI_FILE_CONSTANT: + dim_idx = src->Dimension.Index ? 
src->Dimension.Index + 2 : 1; + assert(dim_idx < 14); + assert(dim_idx == 1); /* for now */ + + res = new_value(bld->pc, NV_FILE_MEM_C(dim_idx), type); + res->reg.type = type; + res->reg.id = (idx * 4 + swz) & 127; + res = bld_insn_1(bld, NV_OP_LDA, res); + + if (src->Register.Indirect) + res->insn->src[4] = new_ref(bld->pc, FETCH_ADDR(ind_idx, ind_swz)); + if (idx >= (128 / 4)) + res->insn->src[4] = + new_ref(bld->pc, bld_get_address(bld, (idx * 16) & ~0x1ff, NULL)); + break; + case TGSI_FILE_IMMEDIATE: + assert(idx < bld->ti->immd32_nr); + res = bld_load_imm_u32(bld, bld->ti->immd32[idx * 4 + swz]); + res->reg.type = type; + break; + case TGSI_FILE_INPUT: + res = bld_saved_input(bld, idx, swz); + if (res && (insn->Instruction.Opcode != TGSI_OPCODE_TXP)) + return res; + + res = new_value(bld->pc, bld->ti->input_file, type); + res->reg.id = bld->ti->input_map[idx][swz]; + + if (res->reg.file == NV_FILE_MEM_V) { + res = bld_interpolate(bld, bld->ti->interp_mode[idx], res); + } else { + assert(src->Dimension.Dimension == 0); + res = bld_insn_1(bld, NV_OP_LDA, res); + } + assert(res->reg.type == type); + + bld->saved_inputs[bld->ti->input_map[idx][swz]] = res; + break; + case TGSI_FILE_TEMPORARY: + /* this should be load from l[], with reload elimination later on */ + res = bld_fetch_global(bld, &bld->tvs[idx][swz]); + break; + case TGSI_FILE_ADDRESS: + res = bld_fetch_global(bld, &bld->avs[idx][swz]); + break; + case TGSI_FILE_PREDICATE: + res = bld_fetch_global(bld, &bld->pvs[idx][swz]); + break; + default: + NOUVEAU_ERR("illegal/unhandled src reg file: %d\n", src->Register.File); + abort(); + break; + } + + switch (tgsi_util_get_full_src_register_sign_mode(src, chan)) { + case TGSI_UTIL_SIGN_KEEP: + break; + case TGSI_UTIL_SIGN_CLEAR: + res = bld_insn_1(bld, NV_OP_ABS, res); + break; + case TGSI_UTIL_SIGN_TOGGLE: + res = bld_insn_1(bld, NV_OP_NEG, res); + break; + case TGSI_UTIL_SIGN_SET: + res = bld_insn_1(bld, NV_OP_ABS, res); + res = bld_insn_1(bld, NV_OP_NEG, 
res); + break; + default: + NOUVEAU_ERR("illegal/unhandled src reg sign mode\n"); + abort(); + break; + } + + return res; +} + +static void +bld_lit(struct bld_context *bld, struct nv_value *dst0[4], + const struct tgsi_full_instruction *insn) +{ + struct nv_value *val0, *zero; + unsigned mask = insn->Dst[0].Register.WriteMask; + + if (mask & ((1 << 0) | (1 << 3))) + dst0[3] = dst0[0] = bld_load_imm_f32(bld, 1.0f); + + if (mask & (3 << 1)) { + zero = bld_load_imm_f32(bld, 0.0f); + val0 = bld_insn_2(bld, NV_OP_MAX, emit_fetch(bld, insn, 0, 0), zero); + + if (mask & (1 << 1)) + dst0[1] = val0; + } + + if (mask & (1 << 2)) { + struct nv_value *val1, *val3, *src1, *src3; + struct nv_value *pos128 = bld_load_imm_f32(bld, 127.999999f); + struct nv_value *neg128 = bld_load_imm_f32(bld, -127.999999f); + + src1 = emit_fetch(bld, insn, 0, 1); + src3 = emit_fetch(bld, insn, 0, 3); + + val0->insn->flags_def = new_value(bld->pc, NV_FILE_FLAGS, NV_TYPE_U16); + val0->insn->flags_def->insn = val0->insn; + + val1 = bld_insn_2(bld, NV_OP_MAX, src1, zero); + val3 = bld_insn_2(bld, NV_OP_MAX, src3, neg128); + val3 = bld_insn_2(bld, NV_OP_MIN, val3, pos128); + val3 = bld_pow(bld, val1, val3); + + dst0[2] = bld_insn_1(bld, NV_OP_MOV, zero); + dst0[2]->insn->cc = NV_CC_LE; + dst0[2]->insn->flags_src = new_ref(bld->pc, val0->insn->flags_def); + + dst0[2] = bld_insn_2(bld, NV_OP_SELECT, val3, dst0[2]); + } +} + +static INLINE void +get_tex_dim(const struct tgsi_full_instruction *insn, int *dim, int *arg) +{ + switch (insn->Texture.Texture) { + case TGSI_TEXTURE_1D: + *arg = *dim = 1; + break; + case TGSI_TEXTURE_SHADOW1D: + *dim = 1; + *arg = 2; + break; + case TGSI_TEXTURE_UNKNOWN: + case TGSI_TEXTURE_2D: + case TGSI_TEXTURE_RECT: + *arg = *dim = 2; + break; + case TGSI_TEXTURE_SHADOW2D: + case TGSI_TEXTURE_SHADOWRECT: + *dim = 2; + *arg = 3; + break; + case TGSI_TEXTURE_3D: + case TGSI_TEXTURE_CUBE: + *dim = *arg = 3; + break; + default: + assert(0); + break; + } +} + +static void 
+load_proj_tex_coords(struct bld_context *bld,
+                     struct nv_value *t[4], int dim,
+                     const struct tgsi_full_instruction *insn)
+{
+   int c, mask = 0;
+
+   t[3] = emit_fetch(bld, insn, 0, 3);
+
+   if (t[3]->insn->opcode == NV_OP_PINTERP) {
+      t[3]->insn->opcode = NV_OP_LINTERP;
+      nv_reference(bld->pc, &t[3]->insn->src[1], NULL);
+   }
+
+   t[3] = bld_insn_1(bld, NV_OP_RCP, t[3]);
+
+   for (c = 0; c < dim; ++c) {
+      t[c] = emit_fetch(bld, insn, 0, c);
+      if (t[c]->insn->opcode == NV_OP_LINTERP)
+         t[c]->insn->opcode = NV_OP_PINTERP;
+
+      if (t[c]->insn->opcode == NV_OP_PINTERP)
+         nv_reference(bld->pc, &t[c]->insn->src[1], t[3]);
+      else
+         mask |= 1 << c;
+   }
+
+   for (c = 0; mask; ++c, mask >>= 1) {
+      if (!(mask & 1))
+         continue;
+      t[c] = bld_insn_2(bld, NV_OP_MUL, t[c], t[3]);
+   }
+}
+
+/* Build the texture instruction for TGSI TEX, TXB, TXL and TXP.
+ *
+ * The first `dim` arguments are fetched from source operand 0 (through
+ * load_proj_tex_coords() for TXP). If get_tex_dim() reports arg != dim,
+ * component 2 of the source is appended, and TXB/TXL additionally append
+ * component 3. Every argument is copied through an F32-typed MOV before the
+ * new instruction references it.
+ *
+ * On return all four entries of dst0[] hold fresh GPR values defined by the
+ * texture instruction, regardless of the destination write mask.
+ */
+static void
+bld_tex(struct bld_context *bld, struct nv_value *dst0[4],
+        const struct tgsi_full_instruction *insn)
+{
+   struct nv_value *t[4];
+   struct nv_instruction *nvi;
+   uint opcode = translate_opcode(insn->Instruction.Opcode);
+   int arg, dim, c;
+
+   get_tex_dim(insn, &dim, &arg);
+
+   /* XXX: TGSI_TEXTURE_CUBE coordinates get no special treatment here yet;
+    * only the tex_cube flag below is set for them.
+    */
+   if (insn->Instruction.Opcode == TGSI_OPCODE_TXP) {
+      load_proj_tex_coords(bld, t, dim, insn);
+   } else {
+      for (c = 0; c < dim; ++c)
+         t[c] = emit_fetch(bld, insn, 0, c);
+   }
+
+   /* NOTE(review): presumably the shadow compare reference - confirm
+    * against get_tex_dim().
+    */
+   if (arg != dim)
+      t[dim] = emit_fetch(bld, insn, 0, 2);
+
+   if (insn->Instruction.Opcode == TGSI_OPCODE_TXB ||
+       insn->Instruction.Opcode == TGSI_OPCODE_TXL) {
+      t[arg++] = emit_fetch(bld, insn, 0, 3);
+   }
+
+   for (c = 0; c < arg; ++c) {
+      t[c] = bld_insn_1(bld, NV_OP_MOV, t[c]);
+      t[c]->reg.type = NV_TYPE_F32;
+   }
+
+   nvi = new_instruction(bld->pc, opcode);
+
+   for (c = 0; c < 4; ++c) {
+      nvi->def[c] = dst0[c] = new_value(bld->pc, NV_FILE_GPR, NV_TYPE_F32);
+      nvi->def[c]->insn = nvi;
+   }
+   for (c = 0; c < arg; ++c)
+      nvi->src[c] = new_ref(bld->pc, t[c]);
+
+   nvi->tex_t = insn->Src[1].Register.Index;
+   nvi->tex_s = 0;
+   nvi->tex_mask = 0xf;
+   nvi->tex_cube = (insn->Texture.Texture == TGSI_TEXTURE_CUBE) ? 1 : 0;
+   nvi->tex_live = 0;
+   nvi->tex_argc = arg;
+}
+
+/* Iterate over the channels enabled in the write mask of destination 0.
+ * Expands to a `for` with a nested `if`, so inside the body `continue` moves
+ * on to the next channel while `break` abandons all remaining channels.
+ */
+#define FOR_EACH_DST0_ENABLED_CHANNEL(chan, inst)              \
+   for (chan = 0; chan < 4; ++chan)                            \
+      if ((inst)->Dst[0].Register.WriteMask & (1 << chan))
+
+/* Translate a single TGSI instruction.
+ *
+ * The switch builds one result value per enabled destination channel in
+ * dst0[]; the loop after the switch then hands every enabled channel to
+ * emit_store().
+ *
+ * NOTE(review): opcodes that produce no result here (KIL, IF, ELSE, ENDIF,
+ * END) never write dst0[]; the final store loop relies on their destination
+ * write mask being empty - confirm.
+ *
+ * Unhandled opcodes are reported through NOUVEAU_ERR and abort().
+ */
+static void
+bld_instruction(struct bld_context *bld,
+                const struct tgsi_full_instruction *insn)
+{
+   struct nv_value *src0;
+   struct nv_value *src1;
+   struct nv_value *src2;
+   struct nv_value *dst0[4];
+   struct nv_value *temp;
+   int c;
+   uint opcode = translate_opcode(insn->Instruction.Opcode);
+
+   tgsi_dump_instruction(insn, 1);
+
+   switch (insn->Instruction.Opcode) {
+   case TGSI_OPCODE_ADD:
+   case TGSI_OPCODE_MAX:
+   case TGSI_OPCODE_MIN:
+   case TGSI_OPCODE_MUL:
+      FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) {
+         src0 = emit_fetch(bld, insn, 0, c);
+         src1 = emit_fetch(bld, insn, 1, c);
+         dst0[c] = bld_insn_2(bld, opcode, src0, src1);
+      }
+      break;
+   case TGSI_OPCODE_CMP:
+      /* Two MOVs predicated on opposite conditions (LT / GE) of the same
+       * flags value, merged by a SELECT.
+       */
+      FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) {
+         src0 = emit_fetch(bld, insn, 0, c);
+         src1 = emit_fetch(bld, insn, 1, c);
+         src2 = emit_fetch(bld, insn, 2, c);
+         src0 = bld_predicate(bld, src0);
+
+         src1 = bld_insn_1(bld, NV_OP_MOV, src1);
+         src1->insn->flags_src = new_ref(bld->pc, src0);
+         src1->insn->cc = NV_CC_LT;
+
+         src2 = bld_insn_1(bld, NV_OP_MOV, src2);
+         src2->insn->flags_src = new_ref(bld->pc, src0);
+         src2->insn->cc = NV_CC_GE;
+
+         dst0[c] = bld_insn_2(bld, NV_OP_SELECT, src1, src2);
+      }
+      break;
+   case TGSI_OPCODE_COS:
+      /* Channels x, y, z share one cosine of src0.x; w gets cos(src0.w). */
+      src0 = emit_fetch(bld, insn, 0, 0);
+      temp = bld_insn_1(bld, NV_OP_PRESIN, src0);
+      if (insn->Dst[0].Register.WriteMask & 7)
+         temp = bld_insn_1(bld, NV_OP_COS, temp);
+      for (c = 0; c < 3; ++c)
+         if (insn->Dst[0].Register.WriteMask & (1 << c))
+            dst0[c] = temp;
+      if (!(insn->Dst[0].Register.WriteMask & (1 << 3)))
+         break;
+      /* XXX: if src0.x is src0.w, don't emit new insns */
+      src0 = emit_fetch(bld, insn, 0, 3);
+      temp = bld_insn_1(bld, NV_OP_PRESIN, src0);
+      dst0[3] = bld_insn_1(bld, NV_OP_COS, temp);
+      break;
+   case TGSI_OPCODE_DP3:
+      /* MUL for the first component, then a MAD chain for the rest. */
+      src0 = emit_fetch(bld, insn, 0, 0);
+      src1 = emit_fetch(bld, insn, 1, 0);
+      temp = bld_insn_2(bld, NV_OP_MUL, src0, src1);
+      for (c = 1; c < 3; ++c) {
+         src0 = emit_fetch(bld, insn, 0, c);
+         src1 = emit_fetch(bld, insn, 1, c);
+         temp = bld_insn_3(bld, NV_OP_MAD, src0, src1, temp);
+      }
+      FOR_EACH_DST0_ENABLED_CHANNEL(c, insn)
+         dst0[c] = temp;
+      break;
+   case TGSI_OPCODE_DP4:
+      src0 = emit_fetch(bld, insn, 0, 0);
+      src1 = emit_fetch(bld, insn, 1, 0);
+      temp = bld_insn_2(bld, NV_OP_MUL, src0, src1);
+      for (c = 1; c < 4; ++c) {
+         src0 = emit_fetch(bld, insn, 0, c);
+         src1 = emit_fetch(bld, insn, 1, c);
+         temp = bld_insn_3(bld, NV_OP_MAD, src0, src1, temp);
+      }
+      FOR_EACH_DST0_ENABLED_CHANNEL(c, insn)
+         dst0[c] = temp;
+      break;
+   case TGSI_OPCODE_EX2:
+      src0 = emit_fetch(bld, insn, 0, 0);
+      temp = bld_insn_1(bld, NV_OP_PREEX2, src0);
+      temp = bld_insn_1(bld, NV_OP_EX2, temp);
+      FOR_EACH_DST0_ENABLED_CHANNEL(c, insn)
+         dst0[c] = temp;
+      break;
+   case TGSI_OPCODE_FRC:
+      /* frc(x) = x - floor(x) */
+      FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) {
+         src0 = emit_fetch(bld, insn, 0, c);
+         dst0[c] = bld_insn_1(bld, NV_OP_FLOOR, src0);
+         dst0[c] = bld_insn_2(bld, NV_OP_SUB, src0, dst0[c]);
+      }
+      break;
+   case TGSI_OPCODE_KIL:
+      /* All four source components are tested, independent of any mask. */
+      for (c = 0; c < 4; ++c) {
+         src0 = emit_fetch(bld, insn, 0, c);
+         bld_kil(bld, src0);
+      }
+      break;
+   case TGSI_OPCODE_IF:
+   {
+      struct nv_basic_block *b = new_basic_block(bld->pc);
+
+      nvbb_attach_block(bld->pc->current_block, b);
+
+      bld->join_bb[bld->cond_lvl] = bld->pc->current_block;
+      bld->cond_bb[bld->cond_lvl] = bld->pc->current_block;
+
+      src1 = bld_predicate(bld, emit_fetch(bld, insn, 0, 0));
+
+      bld_flow(bld, NV_OP_BRA, NV_CC_EQ, src1, FALSE);
+
+      ++bld->cond_lvl;
+      bld_new_block(bld, b);
+   }
+      break;
+   case TGSI_OPCODE_ELSE:
+   {
+      struct nv_basic_block *b = new_basic_block(bld->pc);
+
+      --bld->cond_lvl;
+      nvbb_attach_block(bld->join_bb[bld->cond_lvl], b);
+
+      /* The branch emitted for IF now targets the else block; the block
+       * that ends here is remembered so ENDIF can patch its exit.
+       */
+      bld->cond_bb[bld->cond_lvl]->exit->target = b;
+      bld->cond_bb[bld->cond_lvl] = bld->pc->current_block;
+
+      new_instruction(bld->pc, NV_OP_BRA)->is_terminator = 1;
+
+      ++bld->cond_lvl;
+      bld_new_block(bld, b);
+   }
+      break;
+   case TGSI_OPCODE_ENDIF: /* XXX: deal with ENDIF; ENDIF; */
+   {
+      struct nv_basic_block *b = new_basic_block(bld->pc);
+
+      --bld->cond_lvl;
+      nvbb_attach_block(bld->pc->current_block, b);
+      nvbb_attach_block(bld->cond_bb[bld->cond_lvl], b);
+
+      bld->cond_bb[bld->cond_lvl]->exit->target = b;
+
+      /* Join handling is deliberately disabled by the constant 0. */
+      if (0 && bld->join_bb[bld->cond_lvl]) {
+         bld->join_bb[bld->cond_lvl]->exit->prev->target = b;
+
+         new_instruction(bld->pc, NV_OP_NOP)->is_join = TRUE;
+      }
+
+      bld_new_block(bld, b);
+   }
+      break;
+   case TGSI_OPCODE_BGNLOOP:
+      assert(0);
+      break;
+   case TGSI_OPCODE_BRK:
+      assert(0);
+      break;
+   case TGSI_OPCODE_CONT:
+      assert(0);
+      break;
+   case TGSI_OPCODE_ENDLOOP:
+      assert(0);
+      break;
+   case TGSI_OPCODE_ABS:
+   case TGSI_OPCODE_CEIL:
+   case TGSI_OPCODE_FLR:
+   case TGSI_OPCODE_TRUNC:
+   case TGSI_OPCODE_DDX:
+   case TGSI_OPCODE_DDY:
+      FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) {
+         src0 = emit_fetch(bld, insn, 0, c);
+         dst0[c] = bld_insn_1(bld, opcode, src0);
+      }
+      break;
+   case TGSI_OPCODE_LIT:
+      bld_lit(bld, dst0, insn);
+      break;
+   case TGSI_OPCODE_LRP:
+      /* lrp(a, b, c) = (b - c) * a + c */
+      FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) {
+         src0 = emit_fetch(bld, insn, 0, c);
+         src1 = emit_fetch(bld, insn, 1, c);
+         src2 = emit_fetch(bld, insn, 2, c);
+         dst0[c] = bld_insn_2(bld, NV_OP_SUB, src1, src2);
+         dst0[c] = bld_insn_3(bld, NV_OP_MAD, dst0[c], src0, src2);
+      }
+      break;
+   case TGSI_OPCODE_MOV:
+      FOR_EACH_DST0_ENABLED_CHANNEL(c, insn)
+         dst0[c] = emit_fetch(bld, insn, 0, c);
+      break;
+   case TGSI_OPCODE_MAD:
+      FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) {
+         src0 = emit_fetch(bld, insn, 0, c);
+         src1 = emit_fetch(bld, insn, 1, c);
+         src2 = emit_fetch(bld, insn, 2, c);
+         dst0[c] = bld_insn_3(bld, opcode, src0, src1, src2);
+      }
+      break;
+   case TGSI_OPCODE_POW:
+      src0 = emit_fetch(bld, insn, 0, 0);
+      src1 = emit_fetch(bld, insn, 1, 0);
+      temp = bld_pow(bld, src0, src1);
+      FOR_EACH_DST0_ENABLED_CHANNEL(c, insn)
+         dst0[c] = temp;
+      break;
+   case TGSI_OPCODE_RCP:
+   case TGSI_OPCODE_LG2:
+      src0 = emit_fetch(bld, insn, 0, 0);
+      temp = bld_insn_1(bld, opcode, src0);
+      FOR_EACH_DST0_ENABLED_CHANNEL(c, insn)
+         dst0[c] = temp;
+      break;
+   case TGSI_OPCODE_RSQ:
+      /* rsq is taken of |src0.x| */
+      src0 = emit_fetch(bld, insn, 0, 0);
+      temp = bld_insn_1(bld, NV_OP_ABS, src0);
+      temp = bld_insn_1(bld, NV_OP_RSQ, temp);
+      FOR_EACH_DST0_ENABLED_CHANNEL(c, insn)
+         dst0[c] = temp;
+      break;
+   case TGSI_OPCODE_SLT:
+   case TGSI_OPCODE_SGE:
+   case TGSI_OPCODE_SEQ:
+   case TGSI_OPCODE_SGT:
+   case TGSI_OPCODE_SLE:
+   case TGSI_OPCODE_SNE:
+   case TGSI_OPCODE_ISLT:
+   case TGSI_OPCODE_ISGE:
+   case TGSI_OPCODE_USEQ:
+   case TGSI_OPCODE_USGE:
+   case TGSI_OPCODE_USLT:
+   case TGSI_OPCODE_USNE:
+      FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) {
+         src0 = emit_fetch(bld, insn, 0, c);
+         src1 = emit_fetch(bld, insn, 1, c);
+         dst0[c] = bld_insn_2(bld, NV_OP_SET, src0, src1);
+         dst0[c]->insn->set_cond = translate_setcc(insn->Instruction.Opcode);
+         dst0[c]->reg.type = infer_dst_type(insn->Instruction.Opcode);
+
+         /* Non-float results are used as produced by SET. This must be
+          * `continue`: a `break` would leave the channel loop and the
+          * remaining enabled channels would be stored uninitialized.
+          */
+         if (dst0[c]->reg.type != NV_TYPE_F32)
+            continue;
+         /* Float results: ABS with the source typecast to S32, then CVT to
+          * F32. NOTE(review): presumably turns the integer SET result into
+          * 0.0f / 1.0f - confirm.
+          */
+         dst0[c] = bld_insn_1(bld, NV_OP_ABS, dst0[c]);
+         dst0[c]->insn->src[0]->typecast = NV_TYPE_S32;
+         dst0[c]->reg.type = NV_TYPE_S32;
+         dst0[c] = bld_insn_1(bld, NV_OP_CVT, dst0[c]);
+         dst0[c]->reg.type = NV_TYPE_F32;
+      }
+      break;
+   case TGSI_OPCODE_SUB:
+      /* a - b is emitted as ADD with the negate modifier toggled on b */
+      FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) {
+         src0 = emit_fetch(bld, insn, 0, c);
+         src1 = emit_fetch(bld, insn, 1, c);
+         dst0[c] = bld_insn_2(bld, NV_OP_ADD, src0, src1);
+         dst0[c]->insn->src[1]->mod ^= NV_MOD_NEG;
+      }
+      break;
+   case TGSI_OPCODE_TEX:
+   case TGSI_OPCODE_TXB:
+   case TGSI_OPCODE_TXL:
+   case TGSI_OPCODE_TXP:
+      bld_tex(bld, dst0, insn);
+      break;
+   case TGSI_OPCODE_XPD:
+      FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) {
+         if (c == 3) {
+            dst0[3] = bld_imm_f32(bld, 1.0f);
+            break; /* channel 3 is the last one, nothing is skipped */
+         }
+         src0 = emit_fetch(bld, insn, 0, (c + 1) % 3);
+         src1 = emit_fetch(bld, insn, 1, (c + 2) % 3);
+         dst0[c] = bld_insn_2(bld, NV_OP_MUL, src0, src1);
+
+         src0 = emit_fetch(bld, insn, 0, (c + 2) % 3);
+         src1 = emit_fetch(bld, insn, 1, (c + 1) % 3);
+         dst0[c] = bld_insn_3(bld, NV_OP_MAD, src0, src1, dst0[c]);
+
+         /* toggle negation of the first product (the MAD addend) */
+         dst0[c]->insn->src[2]->mod ^= NV_MOD_NEG;
+      }
+      break;
+   case TGSI_OPCODE_END:
+      if (bld->ti->p->type == PIPE_SHADER_FRAGMENT)
+         bld_export_outputs(bld);
+      break;
+   default:
+      NOUVEAU_ERR("nv_bld: unhandled opcode %u\n", insn->Instruction.Opcode);
+      abort();
+      break;
+   }
+
+   FOR_EACH_DST0_ENABLED_CHANNEL(c, insn)
+      emit_store(bld, insn, c, dst0[c]);
+}
+
+/* Translate the TGSI token stream of ti->p into instructions in pc.
+ *
+ * Creates the root basic block, and, for fragment programs with a non-zero
+ * bit count in fp.interp >> 24, sets up bld->frgcrd[3] as RCP(LINTERP(v[c - 1])).
+ * Every instruction token is then passed to bld_instruction(); all other
+ * token types are ignored.
+ *
+ * Returns 0 on success and 1 if the builder context cannot be allocated.
+ */
+int
+nv50_tgsi_to_nc(struct nv_pc *pc, struct nv50_translation_info *ti)
+{
+   struct bld_context *bld = CALLOC_STRUCT(bld_context);
+   int c;
+
+   /* Allocation can fail; do not dereference a NULL context below. */
+   if (!bld)
+      return 1;
+
+   pc->root = pc->current_block = new_basic_block(pc);
+
+   bld->pc = pc;
+   bld->ti = ti;
+
+   pc->loop_nesting_bound = 1; /* XXX: should work with 0 */
+
+   c = util_bitcount(bld->ti->p->fp.interp >> 24);
+   if (c && ti->p->type == PIPE_SHADER_FRAGMENT) {
+      bld->frgcrd[3] = new_value(pc, NV_FILE_MEM_V, NV_TYPE_F32);
+      bld->frgcrd[3]->reg.id = c - 1;
+      bld->frgcrd[3] = bld_insn_1(bld, NV_OP_LINTERP, bld->frgcrd[3]);
+      bld->frgcrd[3] = bld_insn_1(bld, NV_OP_RCP, bld->frgcrd[3]);
+   }
+
+   tgsi_parse_init(&bld->parse[0], ti->p->pipe.tokens);
+
+   while (!tgsi_parse_end_of_tokens(&bld->parse[bld->call_lvl])) {
+      const union tgsi_full_token *tok = &bld->parse[bld->call_lvl].FullToken;
+
+      tgsi_parse_token(&bld->parse[bld->call_lvl]);
+
+      switch (tok->Token.Type) {
+      case TGSI_TOKEN_TYPE_INSTRUCTION:
+         bld_instruction(bld, &tok->FullInstruction);
+         break;
+      default:
+         break;
+      }
+   }
+
+   FREE(bld);
+   return 0;
+}
+
+#if 0
+/* If a variable is assigned in a loop, replace all references to the value
+ * from outside the loop with a phi value.
+ */
+static void
+bld_adjust_nv_refs(struct nv_pc *pc, struct nv_basic_block *b,
+                   struct nv_value *old_val,
+                   struct nv_value *new_val)
+{
+   struct nv_instruction *nvi;
+
+   /* Redirect every source (and the flags source) in this block that still
+    * references old_val to new_val.
+    */
+   for (nvi = b->entry; nvi; nvi = nvi->next) {
+      int s;
+      for (s = 0; s < 5; ++s) {
+         if (!nvi->src[s])
+            continue;
+         if (nvi->src[s]->value == old_val)
+            nv_reference(pc, &nvi->src[s], new_val);
+      }
+      if (nvi->flags_src && nvi->flags_src->value == old_val)
+         nv_reference(pc, &nvi->flags_src, new_val);
+   }
+   /* Mark the block as visited for the current pass. */
+   b->pass_seq = pc->pass_seq;
+
+   /* Continue with the successors that were not visited in this pass.
+    * The recursion must descend into b->out[n]; recursing on `b` itself
+    * would never reach the successors and would never terminate.
+    */
+   if (b->out[0] && b->out[0]->pass_seq < pc->pass_seq)
+      bld_adjust_nv_refs(pc, b->out[0], old_val, new_val);
+
+   if (b->out[1] && b->out[1]->pass_seq < pc->pass_seq)
+      bld_adjust_nv_refs(pc, b->out[1], old_val, new_val);
+}
+#endif
diff --git a/src/gallium/drivers/nv50/nv50_vbo.c b/src/gallium/drivers/nv50/nv50_vbo.c index 864cb09352..6bd52884b5 100644 --- a/src/gallium/drivers/nv50/nv50_vbo.c +++ b/src/gallium/drivers/nv50/nv50_vbo.c @@ -519,7 +519,7 @@ nv50_vbo_static_attrib(struct nv50_context *nv50, unsigned attrib, so_data (so, fui(v[1])); break; case 1: - if (attrib == nv50->vertprog->cfg.edgeflag_in) { + if (attrib == nv50->vertprog->vp.edgeflag) { so_method(so, tesla, NV50TCL_EDGEFLAG_ENABLE, 1); so_data (so, v[0] ? 1 : 0); } @@ -560,7 +560,7 @@ nv50_vbo_validate(struct nv50_context *nv50) nv50->vbo_fifo = 0; if (nv50->screen->force_push || - nv50->vertprog->cfg.edgeflag_in < 16) + nv50->vertprog->vp.edgeflag < 16) nv50->vbo_fifo = 0xffff; for (i = 0; i < nv50->vtxbuf_nr; i++) { |