From 633f5ac6124b1b57152c09becba92d176e905ae9 Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Fri, 23 Jul 2010 21:21:25 +0200 Subject: nv50: import new compiler --- src/gallium/drivers/nv50/Makefile | 11 +- src/gallium/drivers/nv50/nv50_pc.c | 433 ++ src/gallium/drivers/nv50/nv50_pc.h | 431 ++ src/gallium/drivers/nv50/nv50_pc_emit.c | 1139 ++++++ src/gallium/drivers/nv50/nv50_pc_optimize.c | 717 ++++ src/gallium/drivers/nv50/nv50_pc_print.c | 287 ++ src/gallium/drivers/nv50/nv50_pc_regalloc.c | 973 +++++ src/gallium/drivers/nv50/nv50_program.c | 5117 +++--------------------- src/gallium/drivers/nv50/nv50_program.h | 169 +- src/gallium/drivers/nv50/nv50_push.c | 2 +- src/gallium/drivers/nv50/nv50_shader_state.c | 619 +++ src/gallium/drivers/nv50/nv50_state.c | 3 - src/gallium/drivers/nv50/nv50_state_validate.c | 9 + src/gallium/drivers/nv50/nv50_tgsi_to_nc.c | 1266 ++++++ src/gallium/drivers/nv50/nv50_vbo.c | 4 +- 15 files changed, 6489 insertions(+), 4691 deletions(-) create mode 100644 src/gallium/drivers/nv50/nv50_pc.c create mode 100644 src/gallium/drivers/nv50/nv50_pc.h create mode 100644 src/gallium/drivers/nv50/nv50_pc_emit.c create mode 100644 src/gallium/drivers/nv50/nv50_pc_optimize.c create mode 100644 src/gallium/drivers/nv50/nv50_pc_print.c create mode 100644 src/gallium/drivers/nv50/nv50_pc_regalloc.c create mode 100644 src/gallium/drivers/nv50/nv50_shader_state.c create mode 100644 src/gallium/drivers/nv50/nv50_tgsi_to_nc.c (limited to 'src') diff --git a/src/gallium/drivers/nv50/Makefile b/src/gallium/drivers/nv50/Makefile index e31e6f8662..3943a9e257 100644 --- a/src/gallium/drivers/nv50/Makefile +++ b/src/gallium/drivers/nv50/Makefile @@ -10,7 +10,6 @@ C_SOURCES = \ nv50_draw.c \ nv50_miptree.c \ nv50_query.c \ - nv50_program.c \ nv50_resource.c \ nv50_screen.c \ nv50_state.c \ @@ -19,6 +18,14 @@ C_SOURCES = \ nv50_tex.c \ nv50_transfer.c \ nv50_vbo.c \ - nv50_push.c + nv50_push.c \ + nv50_program.c \ + nv50_shader_state.c \ + nv50_pc.c \ + 
nv50_pc_print.c \ + nv50_pc_emit.c \ + nv50_tgsi_to_nc.c \ + nv50_pc_optimize.c \ + nv50_pc_regalloc.c include ../../Makefile.template diff --git a/src/gallium/drivers/nv50/nv50_pc.c b/src/gallium/drivers/nv50/nv50_pc.c new file mode 100644 index 0000000000..8aba0a32b7 --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_pc.c @@ -0,0 +1,433 @@ + +#include "nv50_pc.h" +#include "nv50_program.h" + +#include + +/* returns TRUE if operands 0 and 1 can be swapped */ +boolean +nv_op_commutative(uint opcode) +{ + switch (opcode) { + case NV_OP_ADD: + case NV_OP_MUL: + case NV_OP_MAD: + case NV_OP_AND: + case NV_OP_OR: + case NV_OP_XOR: + case NV_OP_MIN: + case NV_OP_MAX: + case NV_OP_SAD: + return TRUE; + default: + return FALSE; + } +} + +/* return operand to which the address register applies */ +int +nv50_indirect_opnd(struct nv_instruction *i) +{ + if (!i->src[4]) + return -1; + + switch (i->opcode) { + case NV_OP_MOV: + case NV_OP_LDA: + return 0; + default: + return 1; + } +} + +boolean +nv50_nvi_can_use_imm(struct nv_instruction *nvi, int s) +{ + if (nvi->flags_src || nvi->flags_def) + return FALSE; + + switch (nvi->opcode) { + case NV_OP_ADD: + case NV_OP_MUL: + case NV_OP_AND: + case NV_OP_OR: + case NV_OP_XOR: + case NV_OP_SHL: + case NV_OP_SHR: + return (s == 1) && (nvi->def[0]->reg.file == NV_FILE_GPR); + case NV_OP_MOV: + assert(s == 0); + return (nvi->def[0]->reg.file == NV_FILE_GPR); + default: + return FALSE; + } +} + +boolean +nv50_nvi_can_load(struct nv_instruction *nvi, int s, struct nv_value *value) +{ + switch (nvi->opcode) { + case NV_OP_ABS: + case NV_OP_ADD: + case NV_OP_CEIL: + case NV_OP_FLOOR: + case NV_OP_TRUNC: + case NV_OP_CVT: + case NV_OP_MAD: + case NV_OP_MUL: + case NV_OP_SAT: + case NV_OP_SUB: + case NV_OP_MAX: + case NV_OP_MIN: + if (s == 0 && (value->reg.file == NV_FILE_MEM_S || + value->reg.file == NV_FILE_MEM_P)) + return TRUE; + if (s == 1 && + value->reg.file >= NV_FILE_MEM_C(0) && + value->reg.file <= NV_FILE_MEM_C(15)) + return 
TRUE; + if (s == 2 && nvi->src[1]->value->reg.file == NV_FILE_GPR) + return TRUE; + return FALSE; + case NV_OP_MOV: + assert(s == 0); + return TRUE; + default: + return FALSE; + } +} + +ubyte +nv50_supported_src_mods(uint opcode, int s) +{ + switch (opcode) { + case NV_OP_ABS: + return NV_MOD_NEG | NV_MOD_ABS; /* obviously */ + case NV_OP_ADD: + case NV_OP_MUL: + case NV_OP_MAD: + return NV_MOD_NEG; + case NV_OP_DFDX: + case NV_OP_DFDY: + assert(s == 0); + return NV_MOD_NEG; + case NV_OP_MAX: + case NV_OP_MIN: + return NV_MOD_ABS; + case NV_OP_CVT: + case NV_OP_LG2: + case NV_OP_NEG: + case NV_OP_PREEX2: + case NV_OP_PRESIN: + case NV_OP_RCP: + case NV_OP_RSQ: + return NV_MOD_ABS | NV_MOD_NEG; + default: + return 0; + } +} + +int +nv_nvi_refcount(struct nv_instruction *nvi) +{ + int i, rc; + + rc = nvi->flags_def ? nvi->flags_def->refc : 0; + + for (i = 0; i < 4; ++i) { + if (!nvi->def[i]) + return rc; + rc += nvi->def[i]->refc; + } + return rc; +} + +static void +nv_pc_free_refs(struct nv_pc *pc) +{ + int i; + for (i = 0; i < pc->num_refs; i += 64) + FREE(pc->refs[i]); +} + +void +nv_print_program(struct nv_basic_block *b) +{ + struct nv_instruction *i = b->phi; + + b->priv = 0; + + debug_printf("=== BB %i ", b->id); + if (b->out[0]) + debug_printf("(--0> %i) ", b->out[0]->id); + if (b->out[1]) + debug_printf("(--1> %i) ", b->out[1]->id); + debug_printf("===\n"); + + if (!i) + i = b->entry; + for (; i; i = i->next) + nv_print_instruction(i); + + if (!b->out[0]) { + debug_printf("END\n\n"); + return; + } + if (!b->out[1] && ++(b->out[0]->priv) != b->out[0]->num_in) + return; + + if (b->out[0] != b) + nv_print_program(b->out[0]); + + if (b->out[1] && b->out[1] != b) + nv_print_program(b->out[1]); +} + +static INLINE void +nvcg_show_bincode(struct nv_pc *pc) +{ + int i; + + for (i = 0; i < pc->bin_size / 4; ++i) + debug_printf("0x%08x ", pc->emit[i]); + debug_printf("\n"); +} + +static int +nv50_emit_program(struct nv_pc *pc) +{ + uint32_t *code = pc->emit; + int n; 
+ + debug_printf("emitting program: size = %u\n", pc->bin_size); + + for (n = 0; n < pc->num_blocks; ++n) { + struct nv_instruction *i; + struct nv_basic_block *b = pc->bb_list[n]; + + for (i = b->entry; i; i = i->next) { + nv50_emit_instruction(pc, i); + + pc->bin_pos += 1 + (pc->emit[0] & 1); + pc->emit += 1 + (pc->emit[0] & 1); + } + } + assert(pc->emit == &code[pc->bin_size / 4]); + + /* XXX: we can do better than this ... */ + if ((pc->emit[-1] & 3) == 3) { + pc->emit[0] = 0xf0000001; + pc->emit[1] = 0xe0000000; + pc->bin_size += 8; + } + + pc->emit = code; + code[pc->bin_size / 4 - 1] |= 1; + + nvcg_show_bincode(pc); + + return 0; +} + +int +nv50_generate_code(struct nv50_translation_info *ti) +{ + struct nv_pc *pc; + int ret; + + pc = CALLOC_STRUCT(nv_pc); + if (!pc) + return 1; + + ret = nv50_tgsi_to_nc(pc, ti); + if (ret) + goto out; + + /* optimization */ + ret = nv_pc_exec_pass0(pc); + if (ret) + goto out; + + /* register allocation */ + ret = nv_pc_exec_pass1(pc); + if (ret) + goto out; + + /* prepare for emission */ + ret = nv_pc_exec_pass2(pc); + if (ret) + goto out; + + pc->emit = CALLOC(pc->bin_size / 4 + 2, 4); + if (!pc->emit) { + ret = 3; + goto out; + } + ret = nv50_emit_program(pc); + if (ret) + goto out; + + ti->p->code_size = pc->bin_size; + ti->p->code = pc->emit; + + ti->p->immd_size = pc->immd_count * 4; + ti->p->immd = pc->immd_buf; + + ti->p->max_gpr = (pc->max_reg[NV_FILE_GPR] + 1) >> 1; + ti->p->max_gpr++; + + ti->p->fixups = pc->fixups; + ti->p->num_fixups = pc->num_fixups; + + debug_printf("SHADER TRANSLATION - %s\n", ret ? 
"failure" : "success"); + +out: + nv_pc_free_refs(pc); + if (ret) { + if (pc->emit) + free(pc->emit); + if (pc->immd_buf) + free(pc->immd_buf); + if (pc->fixups) + free(pc->fixups); + } + free(pc); + + return ret; +} + +static void +nvbb_insert_phi(struct nv_basic_block *b, struct nv_instruction *i) +{ + if (!b->phi) { + i->prev = NULL; + b->phi = i; + i->next = b->entry; + if (b->entry) { + assert(!b->entry->prev && b->exit); + b->entry->prev = i; + } else { + b->entry = i; + b->exit = i; + } + } else { + assert(b->entry); + if (b->entry->opcode == NV_OP_PHI) { /* insert after entry */ + assert(b->entry == b->exit); + b->entry->next = i; + i->prev = b->entry; + b->entry = i; + b->exit = i; + } else { /* insert before entry */ + assert(b->entry->prev && b->exit); + i->next = b->entry; + i->prev = b->entry->prev; + b->entry->prev = i; + i->prev->next = i; + } + } +} + +void +nvbb_insert_tail(struct nv_basic_block *b, struct nv_instruction *i) +{ + if (i->opcode == NV_OP_PHI) { + nvbb_insert_phi(b, i); + } else { + i->prev = b->exit; + if (b->exit) + b->exit->next = i; + b->exit = i; + if (!b->entry) + b->entry = i; + else + if (i->prev && i->prev->opcode == NV_OP_PHI) + b->entry = i; + } + + i->bb = b; + b->num_instructions++; +} + +void +nv_nvi_delete(struct nv_instruction *nvi) +{ + struct nv_basic_block *b = nvi->bb; + int j; + + debug_printf("REM: "); nv_print_instruction(nvi); + + for (j = 0; j < 4; ++j) { + if (!nvi->src[j]) + break; + --(nvi->src[j]->value->refc); + nvi->src[j] = NULL; + } + + if (nvi->next) + nvi->next->prev = nvi->prev; + else { + assert(nvi == b->exit); + b->exit = nvi->prev; + } + + if (nvi->prev) + nvi->prev->next = nvi->next; + + if (nvi == b->entry) { + assert(nvi->opcode != NV_OP_PHI || !nvi->next); + + if (!nvi->next || (nvi->opcode == NV_OP_PHI)) + b->entry = nvi->prev; + else + b->entry = nvi->next; + } + + if (nvi == b->phi) { + assert(!nvi->prev); + if (nvi->opcode != NV_OP_PHI) + debug_printf("WARN: b->phi points to non-PHI 
instruction\n"); + + if (!nvi->next || nvi->next->opcode != NV_OP_PHI) + b->phi = NULL; + else + b->phi = nvi->next; + } +} + +void +nv_nvi_permute(struct nv_instruction *i1, struct nv_instruction *i2) +{ + struct nv_basic_block *b = i1->bb; + + assert(i1->opcode != NV_OP_PHI && + i2->opcode != NV_OP_PHI); + assert(i1->next == i2); + + if (b->exit == i2) + b->exit = i1; + + if (b->entry == i1) + b->entry = i2; + + i2->prev = i1->prev; + i1->next = i2->next; + i2->next = i1; + i1->prev = i2; + + if (i2->prev) + i2->prev->next = i2; + if (i1->next) + i1->next->prev = i1; +} + +void nvbb_attach_block(struct nv_basic_block *parent, struct nv_basic_block *b) +{ + if (parent->out[0]) { + assert(!parent->out[1]); + parent->out[1] = b; + } else + parent->out[0] = b; + + b->in[b->num_in++] = parent; +} diff --git a/src/gallium/drivers/nv50/nv50_pc.h b/src/gallium/drivers/nv50/nv50_pc.h new file mode 100644 index 0000000000..3ab48d0afd --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_pc.h @@ -0,0 +1,431 @@ +/*************************************************************************/ +/* Copyright (C) 2010 I */ +/* */ +/* This program is free software: you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation, either version 3 of the License, or */ +/* (at your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. */ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program. If not, see . 
*/ +/*************************************************************************/ + +#ifndef __NV50_COMPILER_H__ +#define __NV50_COMPILER_H__ + +#include "pipe/p_defines.h" +#include "util/u_inlines.h" +#include "util/u_memory.h" + +#define NV_OP_PHI 0 +#define NV_OP_EXTRACT 1 +#define NV_OP_COMBINE 2 +#define NV_OP_LDA 3 +#define NV_OP_STA 4 +#define NV_OP_MOV 5 +#define NV_OP_ADD 6 +#define NV_OP_SUB 7 +#define NV_OP_NEG 8 +#define NV_OP_MUL 9 +#define NV_OP_MAD 10 +#define NV_OP_CVT 11 +#define NV_OP_SAT 12 +#define NV_OP_NOT 13 +#define NV_OP_AND 14 +#define NV_OP_OR 15 +#define NV_OP_XOR 16 +#define NV_OP_SHL 17 +#define NV_OP_SHR 18 +#define NV_OP_RCP 19 +/* gap */ +#define NV_OP_RSQ 21 +#define NV_OP_LG2 22 +#define NV_OP_SIN 23 +#define NV_OP_COS 24 +#define NV_OP_EX2 25 +#define NV_OP_PRESIN 26 +#define NV_OP_PREEX2 27 +#define NV_OP_MIN 28 +#define NV_OP_MAX 29 +#define NV_OP_SET 30 +#define NV_OP_SAD 31 +#define NV_OP_KIL 32 +#define NV_OP_BRA 33 +#define NV_OP_CALL 34 +#define NV_OP_RET 35 +#define NV_OP_BREAK 36 +#define NV_OP_BREAKADDR 37 +#define NV_OP_JOINAT 38 +#define NV_OP_TEX 39 +#define NV_OP_TXB 40 +#define NV_OP_TXL 41 +#define NV_OP_TXF 42 +#define NV_OP_TXQ 43 +#define NV_OP_DFDX 44 +#define NV_OP_DFDY 45 +#define NV_OP_QUADOP 46 +#define NV_OP_LINTERP 47 +#define NV_OP_PINTERP 48 +#define NV_OP_ABS 49 +#define NV_OP_CEIL 50 +#define NV_OP_FLOOR 51 +#define NV_OP_TRUNC 52 +#define NV_OP_NOP 53 +#define NV_OP_SELECT 54 +#define NV_OP_EXPORT 55 +#define NV_OP_COUNT 56 + +#define NV_FILE_GPR 0 +#define NV_FILE_OUT 1 +#define NV_FILE_ADDR 2 +#define NV_FILE_FLAGS 3 +#define NV_FILE_IMM 16 +#define NV_FILE_MEM_S 32 +#define NV_FILE_MEM_P 33 +#define NV_FILE_MEM_V 34 +#define NV_FILE_MEM_L 48 +#define NV_FILE_MEM_G(i) (64 + i) +#define NV_FILE_MEM_C(i) (80 + i) + +#define NV_MOD_NEG 1 +#define NV_MOD_ABS 2 +#define NV_MOD_NOT 4 +#define NV_MOD_SAT 8 + +#define NV_TYPE_U8 0x00 +#define NV_TYPE_S8 0x01 +#define NV_TYPE_U16 0x02 +#define NV_TYPE_S16 
0x03 +#define NV_TYPE_U32 0x04 +#define NV_TYPE_S32 0x05 +#define NV_TYPE_P32 0x07 +#define NV_TYPE_F32 0x09 +#define NV_TYPE_F64 0x0b +#define NV_TYPE_VEC(x, n) (NV_TYPE_##x | (n << 4)) +#define NV_TYPE_LO 0x00 +#define NV_TYPE_HI 0x80 +#define NV_TYPE_ANY 0xff + +#define NV_TYPE_ISINT(t) ((t) <= 5) +#define NV_TYPE_ISFLT(t) ((t) & 0x08) + +#define NV_CC_FL 0x0 +#define NV_CC_LT 0x1 +#define NV_CC_EQ 0x2 +#define NV_CC_LE 0x3 +#define NV_CC_GT 0x4 +#define NV_CC_NE 0x5 +#define NV_CC_GE 0x6 +#define NV_CC_U 0x8 +#define NV_CC_TR 0xf + +#define NV_PC_MAX_INSTRUCTIONS 2048 +#define NV_PC_MAX_VALUES (NV_PC_MAX_INSTRUCTIONS * 4) + +static INLINE boolean +nv_is_vector_op(uint opcode) +{ + return (opcode >= NV_OP_TEX) && (opcode <= NV_OP_TXQ); +} + +static INLINE uint +nv_type_order(ubyte type) +{ + switch (type & 0xf) { + case NV_TYPE_U8: + case NV_TYPE_S8: + return 0; + case NV_TYPE_U16: + case NV_TYPE_S16: + return 1; + case NV_TYPE_U32: + case NV_TYPE_F32: + case NV_TYPE_S32: + case NV_TYPE_P32: + return 2; + case NV_TYPE_F64: + return 3; + } + assert(0); +} + +static INLINE uint +nv_type_sizeof(ubyte type) +{ + if (type & 0xf0) + return (1 << nv_type_order(type)) * (type >> 4); + return 1 << nv_type_order(type); +} + +static INLINE uint +nv_type_sizeof_base(ubyte type) +{ + return 1 << nv_type_order(type); +} + +struct nv_reg { + int id; + ubyte file; + ubyte type; /* type of generating instruction's result */ + union { + float f32; + double f64; + int32_t s32; + uint32_t u32; + } imm; +}; + +struct nv_range { + struct nv_range *next; + int bgn; + int end; +}; + +struct nv_value { + struct nv_reg reg; + struct nv_instruction *insn; + struct nv_value *join; + int n; + struct nv_range *livei; + int refc; + + struct nv_value *next; + struct nv_value *prev; +}; + +struct nv_ref { + struct nv_value *value; + struct nv_instruction *insn; + ubyte mod; + ubyte typecast; + ubyte flags; /* not used yet */ +}; + +struct nv_basic_block; + +struct nv_instruction { + struct 
nv_instruction *next; + struct nv_instruction *prev; + uint opcode; + int serial; + struct nv_value *def[4]; + struct nv_value *flags_def; + struct nv_ref *src[5]; + struct nv_ref *flags_src; + struct nv_basic_block *bb; + struct nv_basic_block *target; /* target block of control flow insn */ + ubyte cc; + ubyte set_cond : 4; + ubyte fixed : 1; /* don't optimize away */ + ubyte is_terminator : 1; + ubyte is_join : 1; + ubyte is_long : 1; /* for emission */ + /* */ + ubyte saturate : 1; + ubyte centroid : 1; + ubyte flat : 1; + ubyte padding : 4; + ubyte tex_live : 1; + /* */ + ubyte tex_t; /* TIC binding */ + ubyte tex_s; /* TSC binding */ + ubyte tex_argc : 3; + ubyte tex_cube : 1; + ubyte tex_mask : 4; + /* */ + ubyte quadop; +}; + +struct nv_basic_block { + struct nv_instruction *entry; /* first non-phi instruction */ + struct nv_instruction *exit; + struct nv_instruction *phi; /* very first instruction */ + int num_instructions; + + struct nv_basic_block *out[2]; /* no indirect branches -> 2 */ + struct nv_basic_block **in; + uint num_in; + + int id; + struct nv_basic_block *last_visitor; + uint priv; + uint pass_seq; + + uint32_t bin_pos; /* position, size in emitted code */ + uint32_t bin_size; + + uint32_t live_set[NV_PC_MAX_VALUES / 32]; +}; + +#define NV_FIXUP_CFLOW_RELOC 0 +#define NV_FIXUP_PARAM_RELOC 1 + +struct nv_fixup { + ubyte type; + ubyte shift; + uint32_t mask; + uint32_t data; + uint32_t offset; +}; + +static INLINE void +nv_fixup_apply(uint32_t *bin, struct nv_fixup *fixup, uint32_t data) +{ + uint32_t val; + + val = bin[fixup->offset / 4] & ~fixup->mask; + data = (fixup->shift < 0) ? 
(data >> fixup->shift) : (data << fixup->shift); + val |= (fixup->data + data) & fixup->mask; + bin[fixup->offset / 4] = val; +} + +struct nv_pc { + struct nv50_translation_info *ti; + + struct nv_basic_block *root; + struct nv_basic_block *current_block; + struct nv_basic_block *parent_block; + + int loop_nesting_bound; + uint pass_seq; + + struct nv_value values[NV_PC_MAX_VALUES]; + struct nv_instruction instructions[NV_PC_MAX_INSTRUCTIONS]; + struct nv_ref **refs; + struct nv_basic_block **bb_list; + int num_values; + int num_instructions; + int num_refs; + int num_blocks; + + int max_reg[4]; + + uint32_t *immd_buf; /* populated on emit */ + unsigned immd_count; + + uint32_t *emit; + unsigned bin_size; + unsigned bin_pos; + + struct nv_fixup *fixups; + int num_fixups; +}; + +void nvbb_insert_tail(struct nv_basic_block *, struct nv_instruction *); + +static INLINE struct nv_instruction * +new_instruction(struct nv_pc *pc, uint opcode) +{ + struct nv_instruction *insn; + + insn = &pc->instructions[pc->num_instructions++]; + assert(pc->num_instructions < NV_PC_MAX_INSTRUCTIONS); + + insn->cc = NV_CC_TR; + insn->opcode = opcode; + + nvbb_insert_tail(pc->current_block, insn); + return insn; +} + +static INLINE struct nv_value * +new_value(struct nv_pc *pc, ubyte file, ubyte type) +{ + struct nv_value *value = &pc->values[pc->num_values]; + + assert(pc->num_values < NV_PC_MAX_VALUES - 1); + + value->n = pc->num_values++; + value->join = value; + value->reg.id = -1; + value->reg.file = file; + value->reg.type = type; + return value; +} + +static INLINE struct nv_ref * +new_ref(struct nv_pc *pc, struct nv_value *val) +{ + int i; + struct nv_ref *ref; + + if ((pc->num_refs % 64) == 0) { + const unsigned old_size = pc->num_refs * sizeof(struct nv_ref *); + const unsigned new_size = (pc->num_refs + 64) * sizeof(struct nv_ref *); + + pc->refs = REALLOC(pc->refs, old_size, new_size); + + ref = CALLOC(64, sizeof(struct nv_ref)); + for (i = 0; i < 64; ++i) + 
pc->refs[pc->num_refs + i] = &ref[i]; + } + + ref = pc->refs[pc->num_refs++]; + ref->value = val; + ref->typecast = val->reg.type; + + ++val->refc; + return ref; +} + +static INLINE struct nv_basic_block * +new_basic_block(struct nv_pc *pc) +{ + struct nv_basic_block *bb = CALLOC_STRUCT(nv_basic_block); + + bb->in = CALLOC(sizeof(struct nv_basic_block *), 4); + bb->id = pc->num_blocks++; + return bb; +} + +static INLINE void +nv_reference(struct nv_pc *pc, struct nv_ref **d, struct nv_value *s) +{ + if (*d) + --(*d)->value->refc; + + if (s) { + if (!*d) + *d = new_ref(pc, s); + else { + (*d)->value = s; + ++(s->refc); + } + } else { + assert(*d); + *d = NULL; + } +} + +/* nv50_emit.c */ +void nv50_emit_instruction(struct nv_pc *, struct nv_instruction *); + +/* nv50_print.c */ +const char *nv_opcode_name(uint opcode); +void nv_print_instruction(struct nv_instruction *); + +/* nv50_pc.c */ +void nv_print_program(struct nv_basic_block *b); + +boolean nv_op_commutative(uint opcode); +int nv50_indirect_opnd(struct nv_instruction *); +boolean nv50_nvi_can_use_imm(struct nv_instruction *, int s); +boolean nv50_nvi_can_load(struct nv_instruction *, int s, struct nv_value *); +ubyte nv50_supported_src_mods(uint opcode, int s); +int nv_nvi_refcount(struct nv_instruction *); +void nv_nvi_delete(struct nv_instruction *); +void nv_nvi_permute(struct nv_instruction *, struct nv_instruction *); +void nvbb_attach_block(struct nv_basic_block *parent, struct nv_basic_block *); + +int nv_pc_exec_pass0(struct nv_pc *pc); +int nv_pc_exec_pass1(struct nv_pc *pc); +int nv_pc_exec_pass2(struct nv_pc *pc); + +int nv50_tgsi_to_nc(struct nv_pc *, struct nv50_translation_info *); + +#endif // NV50_COMPILER_H diff --git a/src/gallium/drivers/nv50/nv50_pc_emit.c b/src/gallium/drivers/nv50/nv50_pc_emit.c new file mode 100644 index 0000000000..b917d23232 --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_pc_emit.c @@ -0,0 +1,1139 @@ 
+/*************************************************************************/
+/* Copyright (C) 2009                                                    */
+/*                                                                       */
+/* This program is free software: you can redistribute it and/or modify  */
+/* it under the terms of the GNU General Public License as published by  */
+/* the Free Software Foundation, either version 3 of the License, or     */
+/* (at your option) any later version.                                   */
+/*                                                                       */
+/* This program is distributed in the hope that it will be useful,       */
+/* but WITHOUT ANY WARRANTY; without even the implied warranty of        */
+/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the          */
+/* GNU General Public License for more details.                          */
+/*                                                                       */
+/* You should have received a copy of the GNU General Public License     */
+/* along with this program. If not, see <http://www.gnu.org/licenses/>.  */
+/*************************************************************************/
+
+#include "nv50_context.h"
+#include "nv50_pc.h"
+
+// Definitions
+
+#define FLAGS_CC_SHIFT    7
+#define FLAGS_ID_SHIFT    12
+#define FLAGS_WR_ID_SHIFT 4
+#define FLAGS_CC_MASK     (0x1f << FLAGS_CC_SHIFT)
+#define FLAGS_ID_MASK     (0x03 << FLAGS_ID_SHIFT)
+#define FLAGS_WR_EN       (1 << 6)
+#define FLAGS_WR_ID_MASK  (0x3 << FLAGS_WR_ID_SHIFT)
+
+/* Smallest encoding size per opcode, indexed by NV_OP_* (presumably in
+ * bytes: 4 = short form, 8 = long form; 0 for opcodes that have no entry
+ * of their own, e.g. PHI or the gap at index 20).
+ */
+const ubyte nv50_inst_min_size_tab[NV_OP_COUNT] =
+{
+   0, 0, 0, 8, 8, 4, 4, 4, 8, 4, 4, 8, 8, 8, 8, 8, /* 15 */
+   8, 8, 8, 4, 0, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, /* 31 */
+   8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, /* 47 */
+   4, 8, 8, 8, 8, 8, 0, 0
+};
+
+/* XXX: silence, you ! */
+unsigned
+nv50_inst_min_size(struct nv_instruction *i);
+
+/* Return the smallest encoding size usable for instruction i: the table
+ * value for its opcode if every operand fits the short form, 8 otherwise.
+ *
+ * The short form is rejected (8 is returned) when
+ *  - the opcode's table entry is already larger than 4,
+ *  - there is no destination, or it is not a GPR, or its joined id is > 63,
+ *  - any of the first three sources is neither GPR nor MEM_V, or has an
+ *    id > 63,
+ *  - a flags def/use or an address source (src[4]) is present,
+ *  - a third source is present and saturation, source modifiers or a
+ *    destination different from src[2] are involved.
+ */
+unsigned
+nv50_inst_min_size(struct nv_instruction *i)
+{
+   int n;
+
+   if (nv50_inst_min_size_tab[i->opcode] > 4)
+      return 8;
+
+   /* No destination: only the long emitters (emit_form_MAD/emit_form_ADD)
+    * can flag that, in the second instruction word; the short emitter
+    * emit_form_MUL asserts def[0]. Returning early here also keeps the
+    * def[0] dereferences below safe.
+    */
+   if (!i->def[0])
+      return 8;
+
+   if (i->def[0]->reg.file != NV_FILE_GPR)
+      return 8;
+   if (i->def[0]->join->reg.id > 63)
+      return 8;
+
+   for (n = 0; n < 3; ++n) {
+      if (!i->src[n])
+         break;
+      if (i->src[n]->value->reg.file != NV_FILE_GPR &&
+          i->src[n]->value->reg.file != NV_FILE_MEM_V)
+         return 8;
+      if (i->src[n]->value->reg.id > 63)
+         return 8;
+   }
+
+   if (i->flags_def || i->flags_src || i->src[4])
+      return 8;
+
+   if (i->src[2]) {
+      if (i->saturate || i->src[2]->mod)
+         return 8;
+      /* modifiers on src0 and src1 must be identical ... */
+      if (i->src[0]->mod ^ i->src[1]->mod)
+         return 8;
+      /* ... and must not include ABS */
+      if ((i->src[0]->mod | i->src[1]->mod) & NV_MOD_ABS)
+         return 8;
+      /* destination must be assigned and share its register with src2 */
+      if (i->def[0]->join->reg.id < 0 ||
+          i->def[0]->join->reg.id != i->src[2]->value->join->reg.id)
+         return 8;
+   }
+
+   return nv50_inst_min_size_tab[i->opcode];
+}
+
+/* Type that source s of nvi is read as (the reference's typecast). */
+static INLINE ubyte
+STYPE(struct nv_instruction *nvi, int s)
+{
+   return nvi->src[s]->typecast;
+}
+
+/* Register type of destination d of nvi. */
+static INLINE ubyte
+DTYPE(struct nv_instruction *nvi, int d)
+{
+   return nvi->def[d]->reg.type;
+}
+
+/* Register of the value that ref's value is joined to. */
+static INLINE struct nv_reg *
+SREG(struct nv_ref *ref)
+{
+   return &ref->value->join->reg;
+}
+
+/* Register of the value that val is joined to. */
+static INLINE struct nv_reg *
+DREG(struct nv_value *val)
+{
+   return &val->join->reg;
+}
+
+/* Register file of source s of nvi (the value's own reg, not its join). */
+static INLINE ubyte
+SFILE(struct nv_instruction *nvi, int s)
+{
+   return nvi->src[s]->value->reg.file;
+}
+
+/* Register file of destination d of nvi (the value's own reg, not its
+ * join). Indexes def[] with d, matching DTYPE and SFILE.
+ */
+static INLINE ubyte
+DFILE(struct nv_instruction *nvi, int d)
+{
+   return nvi->def[d]->reg.file;
+}
+
+/* OR the joined register id of ref into the instruction being emitted at
+ * bit position pos: pos / 32 selects the word of pc->emit, pos % 32 the
+ * shift inside it.
+ */
+static INLINE void
+SID(struct nv_pc *pc, struct nv_ref *ref, int pos)
+{
+   pc->emit[pos / 32] |= SREG(ref)->id << (pos % 32);
+}
+
+/* Same as SID, for the joined register id of a destination value. */
+static INLINE void
+DID(struct nv_pc *pc, struct nv_value *val, int pos)
+{
+   pc->emit[pos / 32] |= DREG(val)->id << (pos % 32);
+}
+
+/* Raw 32-bit immediate stored in ref's value; asserts that the value
+ * lives in NV_FILE_IMM.
+ */
+static INLINE uint32_t
+get_immd_u32(struct nv_ref *ref)
+{
+   assert(ref->value->reg.file == NV_FILE_IMM);
+   return ref->value->reg.imm.u32;
+}
+
+static INLINE void
+set_immd_u32(struct nv_pc *pc, uint32_t u32) +{ + pc->emit[1] |= 3; + pc->emit[0] |= (u32 & 0x3f) << 16; + pc->emit[1] |= (u32 >> 6) << 2; +} + +static INLINE void +set_immd(struct nv_pc *pc, struct nv_ref *ref) +{ + assert(ref->value->reg.file == NV_FILE_IMM); + set_immd_u32(pc, get_immd_u32(ref)); +} + +static void +new_fixup(struct nv_pc *pc, unsigned type, uint32_t data, uint32_t m, int s) +{ + const unsigned size = sizeof(struct nv_fixup); + const unsigned n = pc->num_fixups; + return; + + if (!(n % 8)) + pc->fixups = REALLOC(pc->fixups, n * size, (n + 8) * size); + + pc->fixups[n].offset = pc->bin_pos + (s / 32); + pc->fixups[n].type = type; + pc->fixups[n].data = data; + pc->fixups[n].mask = m << (s % 32); + pc->fixups[n].shift = s % 32; + + ++pc->num_fixups; + + assert(((data << (s % 32)) & pc->fixups[n].mask) == (data << (s % 32))); +} + +static void +nv_pc_alloc_immd(struct nv_pc *pc, struct nv_ref *ref) +{ + uint32_t i, val = get_immd_u32(ref); + + for (i = 0; i < pc->immd_count; ++i) + if (pc->immd_buf[i] == val) + break; + + if (i == pc->immd_count) { + if (!(pc->immd_count % 8)) + pc->immd_buf = REALLOC(pc->immd_buf, + pc->immd_count * 4, (pc->immd_count + 8) * 4); + pc->immd_buf[pc->immd_count++] = val; + } + + SREG(ref)->id = i; +} + +static INLINE void +set_pred(struct nv_pc *pc, struct nv_instruction *i) +{ + assert(!(pc->emit[1] & 0x00003f80)); + + pc->emit[1] |= i->cc << 7; + if (i->flags_src) + pc->emit[1] |= SREG(i->flags_src)->id << 12; +} + +static INLINE void +set_pred_wr(struct nv_pc *pc, struct nv_instruction *i) +{ + assert(!(pc->emit[1] & 0x00000070)); + + if (i->flags_def) + pc->emit[1] |= (DREG(i->flags_def)->id << 4) | 0x40; +} + +static INLINE void +set_a16_bits(struct nv_pc *pc, uint id) +{ + ++id; /* $a0 is always 0 */ + pc->emit[0] |= (id & 3) << 26; + pc->emit[1] |= id & 4; +} + +static INLINE void +set_addr(struct nv_pc *pc, struct nv_instruction *i) +{ + if (i->src[4]) + set_a16_bits(pc, SREG(i->src[4])->id); +} + +static 
void +set_dst(struct nv_pc *pc, struct nv_value *value) +{ + struct nv_reg *reg = &value->join->reg; + + if (reg->id < 0) { + debug_printf("WARNING: unused dst, hope we can bucket it !\n"); + pc->emit[0] |= 127 << 2; + pc->emit[1] |= 0x8; + return; + } + + if (reg->file == NV_FILE_OUT) + pc->emit[1] |= 0x8; + else + if (reg->file == NV_FILE_ADDR) + assert(0); + + pc->emit[0] |= reg->id << 2; +} + +static void +set_src_0(struct nv_pc *pc, struct nv_ref *ref) +{ + struct nv_reg *reg = SREG(ref); + + if (reg->file == NV_FILE_MEM_S) + pc->emit[1] |= 0x00200000; + else + if (reg->file == NV_FILE_MEM_P) + pc->emit[0] |= 0x01800000; + else + if (reg->file != NV_FILE_GPR) + NOUVEAU_ERR("invalid src0 register file: %d\n", reg->file); + + assert(reg->id < 128); + pc->emit[0] |= reg->id << 9; +} + +static void +set_src_1(struct nv_pc *pc, struct nv_ref *ref) +{ + struct nv_reg *reg = SREG(ref); + + if (reg->file >= NV_FILE_MEM_C(0) && + reg->file <= NV_FILE_MEM_C(15)) { + assert(!(pc->emit[1] & 0x01800000)); + + pc->emit[0] |= 0x00800000; + pc->emit[1] |= (reg->file - NV_FILE_MEM_C(0)) << 22; + } else + if (reg->file != NV_FILE_GPR) + NOUVEAU_ERR("invalid src1 register file: %d\n", reg->file); + + assert(reg->id < 128); + pc->emit[0] |= reg->id << 16; +} + +static void +set_src_2(struct nv_pc *pc, struct nv_ref *ref) +{ + struct nv_reg *reg = SREG(ref); + + if (reg->file >= NV_FILE_MEM_C(0) && + reg->file <= NV_FILE_MEM_C(15)) { + assert(!(pc->emit[1] & 0x01800000)); + + pc->emit[0] |= 0x01000000; + pc->emit[1] |= (reg->file - NV_FILE_MEM_C(0)) << 22; + } else + if (reg->file != NV_FILE_GPR) + NOUVEAU_ERR("invalid src2 register file: %d\n", reg->file); + + assert(reg->id < 128); + pc->emit[1] |= reg->id << 14; +} + +/* the default form: + * - long instruction + * - 1 to 3 sources in slots 0, 1, 2 + * - address & flags + */ +static void +emit_form_MAD(struct nv_pc *pc, struct nv_instruction *i) +{ + pc->emit[0] |= 1; + + set_pred(pc, i); + set_pred_wr(pc, i); + + if 
(i->def[0]) + set_dst(pc, i->def[0]); + else { + pc->emit[0] |= 0x01fc; + pc->emit[1] |= 0x0008; + } + + if (i->src[0]) + set_src_0(pc, i->src[0]); + + if (i->src[1]) + set_src_1(pc, i->src[1]); + + if (i->src[2]) + set_src_2(pc, i->src[2]); + + set_addr(pc, i); +} + +/* like default form, but 2nd source in slot 2, no 3rd source */ +static void +emit_form_ADD(struct nv_pc *pc, struct nv_instruction *i) +{ + pc->emit[0] |= 1; + + if (i->def[0]) + set_dst(pc, i->def[0]); + else { + pc->emit[0] |= 0x01fc; + pc->emit[1] |= 0x0008; + } + + set_pred(pc, i); + set_pred_wr(pc, i); + + if (i->src[0]) + set_src_0(pc, i->src[0]); + + if (i->src[1]) + set_src_2(pc, i->src[1]); + + set_addr(pc, i); +} + +/* short mul */ +static void +emit_form_MUL(struct nv_pc *pc, struct nv_instruction *i) +{ + assert(!i->is_long && !(pc->emit[0] & 1)); + + assert(i->def[0]); + set_dst(pc, i->def[0]); + + if (i->src[0]) + set_src_0(pc, i->src[0]); + + if (i->src[1]) + set_src_1(pc, i->src[1]); +} + +/* default immediate form + * - 1 to 3 sources where last is immediate + * - no address or predicate possible + */ +static void +emit_form_IMM(struct nv_pc *pc, struct nv_instruction *i, ubyte mod_mask) +{ + pc->emit[0] |= 1; + + assert(i->def[0]); + assert(i->src[0]); + set_dst(pc, i->def[0]); + + assert(!i->src[4] && !i->flags_src && !i->flags_def); + + if (i->src[2]) { + set_immd(pc, i->src[2]); + set_src_0(pc, i->src[1]); + set_src_1(pc, i->src[0]); + } else + if (i->src[1]) { + set_immd(pc, i->src[1]); + set_src_0(pc, i->src[0]); + } else + set_immd(pc, i->src[0]); + + assert(!mod_mask); +} + +static void +set_ld_st_size(struct nv_pc *pc, ubyte type) +{ + switch (type) { + case NV_TYPE_F64: + pc->emit[1] |= 0x8000; + break; + case NV_TYPE_F32: + case NV_TYPE_S32: + case NV_TYPE_U32: + pc->emit[1] |= 0xc000; + break; + case NV_TYPE_S16: + pc->emit[1] |= 0x6000; + break; + case NV_TYPE_U16: + pc->emit[1] |= 0x4000; + break; + case NV_TYPE_S8: + pc->emit[1] |= 0x2000; + break; + default: + break; 
+ } +} + +static void +emit_ld(struct nv_pc *pc, struct nv_instruction *i) +{ + ubyte sf = SFILE(i, 0); + + if (sf == NV_FILE_IMM) { + sf = NV_FILE_MEM_C(0); + nv_pc_alloc_immd(pc, i->src[0]); + + new_fixup(pc, NV_FIXUP_PARAM_RELOC, SREG(i->src[0])->id, 0xffff, 9); + } + + if (sf == NV_FILE_MEM_S || + sf == NV_FILE_MEM_P) { + pc->emit[0] = 0x10000001; + pc->emit[1] = 0x04200000 | (0x3c << 12); + if (sf == NV_FILE_MEM_P) + pc->emit[0] |= 0x01800000; + } else + if (sf >= NV_FILE_MEM_C(0) && + sf <= NV_FILE_MEM_C(15)) { + pc->emit[0] = 0x10000001; + pc->emit[1] = 0x24000000; + pc->emit[1] |= (sf - NV_FILE_MEM_C(0)) << 22; + } else + if (sf >= NV_FILE_MEM_G(0) && + sf <= NV_FILE_MEM_G(15)) { + pc->emit[0] = 0xd0000001 | ((sf - NV_FILE_MEM_G(0)) << 16); + pc->emit[1] = 0xa0000000; + + assert(i->src[4] && SREG(i->src[4])->file == NV_FILE_GPR); + SID(pc, i->src[4], 9); + } else + if (sf == NV_FILE_MEM_L) { + pc->emit[0] = 0xd0000001; + pc->emit[1] = 0x40000000; + } else { + NOUVEAU_ERR("invalid ld source file\n"); + abort(); + } + + set_ld_st_size(pc, STYPE(i, 0)); + + set_dst(pc, i->def[0]); + set_pred_wr(pc, i); + + set_pred(pc, i); + + if (sf < NV_FILE_MEM_G(0) || + sf > NV_FILE_MEM_G(15)) { + SID(pc, i->src[0], 9); + set_addr(pc, i); + } +} + +static void +emit_st(struct nv_pc *pc, struct nv_instruction *i) +{ + +} + +static int +verify_mov(struct nv_instruction *i) +{ + ubyte sf = SFILE(i, 0); + ubyte df = DFILE(i, 0); + + if (df == NV_FILE_GPR) + return 0; + + if (df != NV_FILE_OUT && + df != NV_FILE_FLAGS && + df != NV_FILE_ADDR) + return 1; + + if (sf == NV_FILE_FLAGS) + return 2; + if (sf == NV_FILE_ADDR) + return 3; + if (sf == NV_FILE_IMM && df != NV_FILE_OUT) + return 4; + + return 0; +} + +static void +emit_mov(struct nv_pc *pc, struct nv_instruction *i) +{ + assert(!verify_mov(i)); + + if (SFILE(i, 0) >= NV_FILE_MEM_S) + emit_ld(pc, i); + else + if (SFILE(i, 0) == NV_FILE_FLAGS) { + pc->emit[0] = 0x00000001 | (DREG(i->def[0])->id << 2); + pc->emit[1] = 
0x20000780 | (SREG(i->src[0])->id << 12); + } else + if (SFILE(i, 0) == NV_FILE_ADDR) { + pc->emit[0] = 0x00000001 | (DREG(i->def[0])->id << 2); + pc->emit[1] = 0x40000780; + set_a16_bits(pc, SREG(i->src[0])->id); + } else + if (DFILE(i, 0) == NV_FILE_FLAGS) { + pc->emit[0] = 0x000001fd; + pc->emit[1] = 0xa0000788 | (1 << 6); + pc->emit[0] |= SREG(i->src[0])->id << 9; + pc->emit[1] |= DREG(i->def[0])->id << 4; + } else + if (SFILE(i, 0) == NV_FILE_IMM) { + if (i->opcode == NV_OP_LDA) + emit_ld(pc, i); + else { + pc->emit[0] = 0x10008001; + pc->emit[1] = 0x00000003; + + emit_form_IMM(pc, i, 0); + } + } else { + pc->emit[0] = 0x10000000; + pc->emit[0] |= DREG(i->def[0])->id << 2; + pc->emit[0] |= SREG(i->src[0])->id << 9; + + if (!i->is_long) + pc->emit[0] |= 0x8000; + else { + pc->emit[0] |= 0x00000001; + pc->emit[1] = 0x0403c000; + + set_pred(pc, i); + } + } + + if (DFILE(i, 0) == NV_FILE_OUT) + pc->emit[1] |= 0x8; +} + +static void +emit_interp(struct nv_pc *pc, struct nv_instruction *i) +{ + pc->emit[0] = 0x80000000; + + assert(DFILE(i, 0) == NV_FILE_GPR); + assert(SFILE(i, 0) == NV_FILE_MEM_V); + + DID(pc, i->def[0], 2); + SID(pc, i->src[0], 16); + + if (i->flat) + pc->emit[0] |= 1 << 8; + else + if (i->opcode == NV_OP_PINTERP) { + pc->emit[0] |= 1 << 25; + pc->emit[0] |= SREG(i->src[1])->id << 9; + } + + if (i->centroid) + pc->emit[0] |= 1 << 24; + + if (i->is_long) { + pc->emit[1] |= 0x0780 | + (pc->emit[0] & (3 << 24)) >> (24 - 16) | + (pc->emit[0] & (1 << 8)) >> (18 - 8); + + pc->emit[0] |= 1; + pc->emit[0] &= ~0x03000100; + } +} + +static void +emit_minmax(struct nv_pc *pc, struct nv_instruction *i) +{ + pc->emit[0] = 0x30000000; + pc->emit[1] = (i->opcode == NV_OP_MIN) ? 
(2 << 28) : 0; + + switch (DTYPE(i, 0)) { + case NV_TYPE_F32: + pc->emit[0] |= 0x80000000; + pc->emit[1] |= 0x80000000; + break; + case NV_TYPE_S32: + pc->emit[1] |= 0x8c000000; + break; + case NV_TYPE_U32: + pc->emit[1] |= 0x84000000; + break; + } + + emit_form_MAD(pc, i); + + if (i->src[0]->mod & NV_MOD_ABS) pc->emit[1] |= 0x00100000; + if (i->src[1]->mod & NV_MOD_ABS) pc->emit[1] |= 0x00080000; +} + +static void +emit_add_f32(struct nv_pc *pc, struct nv_instruction *i) +{ + pc->emit[0] = 0xb0000000; + + if (SFILE(i, 1) == NV_FILE_IMM) { + emit_form_IMM(pc, i, 0); + + if (i->src[0]->mod & NV_MOD_NEG) pc->emit[0] |= 0x8000; + if (i->src[1]->mod & NV_MOD_NEG) pc->emit[0] |= 1 << 22; + } else + if (i->is_long) { + emit_form_ADD(pc, i); + + if (i->src[0]->mod & NV_MOD_NEG) pc->emit[1] |= 1 << 26; + if (i->src[1]->mod & NV_MOD_NEG) pc->emit[1] |= 1 << 27; + } else { + emit_form_MUL(pc, i); + + if (i->src[0]->mod & NV_MOD_NEG) pc->emit[0] |= 0x8000; + if (i->src[1]->mod & NV_MOD_NEG) pc->emit[0] |= 1 << 22; + } +} + +static void +emit_add_b32(struct nv_pc *pc, struct nv_instruction *i) +{ + pc->emit[0] = 0x20008000; + + if (SFILE(i, 1) == NV_FILE_IMM) { + emit_form_IMM(pc, i, 0); + } else + if (i->is_long) { + pc->emit[0] = 0x20000000; + pc->emit[1] = 0x04000000; + emit_form_ADD(pc, i); + } else { + emit_form_MUL(pc, i); + } + + if (i->src[0]->mod & NV_MOD_NEG) pc->emit[0] |= 1 << 28; + if (i->src[1]->mod & NV_MOD_NEG) pc->emit[0] |= 1 << 22; +} + +static void +emit_add_a16(struct nv_pc *pc, struct nv_instruction *i) +{ + pc->emit[0] = 0xd0000001 | (get_immd_u32(i->src[0]) << 9); + pc->emit[1] = 0x20000000; + + pc->emit[0] |= (DREG(i->def[0])->id + 1) << 2; + + set_pred(pc, i); + + if (i->src[1]) + set_a16_bits(pc, SREG(i->src[1])->id); +} + +static void +emit_flow(struct nv_pc *pc, struct nv_instruction *i, ubyte flow_op) +{ + pc->emit[0] = 0x00000003 | (flow_op << 28); + pc->emit[1] = 0x00000000; + + set_pred(pc, i); + + if (i->target) { + new_fixup(pc, 
NV_FIXUP_CFLOW_RELOC, i->target->bin_pos, 0x7ff800, 11); + pc->emit[0] |= (i->target->bin_pos / 4) << 11; + } +} + +static INLINE void +emit_add(struct nv_pc *pc, struct nv_instruction *i) +{ + if (DFILE(i, 0) == NV_FILE_ADDR) + emit_add_a16(pc, i); + else { + switch (DTYPE(i, 0)) { + case NV_TYPE_F32: + emit_add_f32(pc, i); + break; + case NV_TYPE_U32: + case NV_TYPE_S32: + emit_add_b32(pc, i); + break; + } + } +} + +static void +emit_bitop2(struct nv_pc *pc, struct nv_instruction *i) +{ + pc->emit[0] = 0xd0000000; + + if (SFILE(i, 0) == NV_FILE_IMM) { + emit_form_IMM(pc, i, 0); + + if (i->opcode == NV_OP_OR) + pc->emit[0] |= 0x0100; + else + if (i->opcode == NV_OP_XOR) + pc->emit[0] |= 0x8000; + } else { + emit_form_MAD(pc, i); + + pc->emit[1] |= 0x04000000; + + if (i->opcode == NV_OP_OR) + pc->emit[1] |= 0x4000; + else + if (i->opcode == NV_OP_XOR) + pc->emit[1] |= 0x8000; + } +} + +static void +emit_shift(struct nv_pc *pc, struct nv_instruction *i) +{ + pc->emit[0] = 0x30000001; + pc->emit[1] = 0xc4000000; + + if (i->opcode == NV_OP_SHR) + pc->emit[1] |= 1 << 29; + + if (SFILE(i, 1) == NV_FILE_IMM) { + pc->emit[1] |= 1 << 20; + pc->emit[0] |= (get_immd_u32(i->src[1]) & 0x7f) << 16; + + set_pred(pc, i); + } else + emit_form_MAD(pc, i); + + if (STYPE(i, 0) == NV_TYPE_S32) + pc->emit[1] |= 1 << 27; +} + +static void +emit_flop(struct nv_pc *pc, struct nv_instruction *i) +{ + struct nv_ref *src0 = i->src[0]; + + pc->emit[0] = 0x90000000; + + assert(SREG(src0)->type == NV_TYPE_F32); + assert(SREG(src0)->file == NV_FILE_GPR); + + if (!i->is_long) { + emit_form_MUL(pc, i); + assert(i->opcode == NV_OP_RCP && !src0->mod); + return; + } + + pc->emit[1] = (i->opcode - NV_OP_RCP) << 29; + + emit_form_MAD(pc, i); + + if (src0->mod & NV_MOD_NEG) pc->emit[1] |= 0x04000000; + if (src0->mod & NV_MOD_ABS) pc->emit[1] |= 0x00100000; +} + +static void +emit_mad_f32(struct nv_pc *pc, struct nv_instruction *i) +{ + const boolean neg_mul = (i->src[0]->mod ^ i->src[1]->mod) & 
NV_MOD_NEG; + const boolean neg_add = (i->src[2]->mod & NV_MOD_NEG); + + pc->emit[0] = 0xe0000000; + + if (!i->is_long) { + emit_form_MUL(pc, i); + assert(!neg_mul && !neg_add); + return; + } + + emit_form_MAD(pc, i); + + if (neg_mul) pc->emit[1] |= 0x04000000; + if (neg_add) pc->emit[1] |= 0x08000000; + + if (i->saturate) + pc->emit[1] |= 0x20000000; +} + +static INLINE void +emit_mad(struct nv_pc *pc, struct nv_instruction *i) +{ + emit_mad_f32(pc, i); +} + +static void +emit_mul_f32(struct nv_pc *pc, struct nv_instruction *i) +{ + boolean neg = (i->src[0]->mod ^ i->src[1]->mod) & NV_MOD_NEG; + + pc->emit[0] = 0xc0000000; + + if (SFILE(i, 1) == NV_FILE_IMM) { + emit_form_IMM(pc, i, 0); + + if (neg) + pc->emit[0] |= 0x8000; + } else + if (i->is_long) { + emit_form_MAD(pc, i); + + if (neg) + pc->emit[1] |= 0x08 << 24; + } else { + emit_form_MUL(pc, i); + + if (neg) + pc->emit[0] |= 0x8000; + } +} + +static void +emit_set(struct nv_pc *pc, struct nv_instruction *nvi) +{ + assert(nvi->is_long); + + pc->emit[0] = 0x30000000; + pc->emit[1] = 0x60000000; + + pc->emit[1] |= nvi->set_cond << 14; + + switch (STYPE(nvi, 0)) { + case NV_TYPE_U32: pc->emit[1] |= 0x04000000; break; + case NV_TYPE_S32: pc->emit[1] |= 0x0c000000; break; + case NV_TYPE_F32: pc->emit[0] |= 0x80000000; break; + default: + assert(0); + break; + } + + emit_form_MAD(pc, nvi); +} + +#define CVT_RN (0x00 << 16) +#define CVT_FLOOR (0x02 << 16) +#define CVT_CEIL (0x04 << 16) +#define CVT_TRUNC (0x06 << 16) +#define CVT_SAT (0x08 << 16) +#define CVT_ABS (0x10 << 16) + +#define CVT_X32_X32 0x04004000 +#define CVT_X32_S32 0x04014000 +#define CVT_F32_F32 ((0xc0 << 24) | CVT_X32_X32) +#define CVT_S32_F32 ((0x88 << 24) | CVT_X32_X32) +#define CVT_U32_F32 ((0x80 << 24) | CVT_X32_X32) +#define CVT_F32_S32 ((0x40 << 24) | CVT_X32_S32) +#define CVT_F32_U32 ((0x40 << 24) | CVT_X32_X32) +#define CVT_S32_S32 ((0x08 << 24) | CVT_X32_S32) +#define CVT_S32_U32 ((0x08 << 24) | CVT_X32_X32) +#define CVT_U32_S32 ((0x00 << 
24) | CVT_X32_S32) +#define CVT_U32_U32 ((0x00 << 24) | CVT_X32_X32) + +#define CVT_NEG 0x20000000 +#define CVT_RI 0x08000000 + +static void +emit_cvt(struct nv_pc *pc, struct nv_instruction *nvi) +{ + ubyte dst_type = nvi->def[0] ? DTYPE(nvi, 0) : STYPE(nvi, 0); + + pc->emit[0] = 0xa0000000; + + switch (dst_type) { + case NV_TYPE_F32: + switch (STYPE(nvi, 0)) { + case NV_TYPE_F32: pc->emit[1] = CVT_F32_F32; break; + case NV_TYPE_S32: pc->emit[1] = CVT_F32_S32; break; + case NV_TYPE_U32: pc->emit[1] = CVT_F32_U32; break; + } + break; + case NV_TYPE_S32: + switch (STYPE(nvi, 0)) { + case NV_TYPE_F32: pc->emit[1] = CVT_S32_F32; break; + case NV_TYPE_S32: pc->emit[1] = CVT_S32_S32; break; + case NV_TYPE_U32: pc->emit[1] = CVT_S32_U32; break; + } + break; + case NV_TYPE_U32: + switch (STYPE(nvi, 0)) { + case NV_TYPE_F32: pc->emit[1] = CVT_U32_F32; break; + case NV_TYPE_S32: pc->emit[1] = CVT_U32_S32; break; + case NV_TYPE_U32: pc->emit[1] = CVT_U32_U32; break; + } + break; + } + if (pc->emit[1] == CVT_F32_F32 && + (nvi->opcode == NV_OP_CEIL || nvi->opcode == NV_OP_FLOOR || + nvi->opcode == NV_OP_TRUNC)) + pc->emit[1] |= CVT_RI; + + switch (nvi->opcode) { + case NV_OP_CEIL: pc->emit[1] |= CVT_CEIL; break; + case NV_OP_FLOOR: pc->emit[1] |= CVT_FLOOR; break; + case NV_OP_TRUNC: pc->emit[1] |= CVT_TRUNC; break; + + case NV_OP_ABS: pc->emit[1] |= CVT_ABS; break; + case NV_OP_SAT: pc->emit[1] |= CVT_SAT; break; + case NV_OP_NEG: pc->emit[1] |= CVT_NEG; break; + default: + assert(nvi->opcode == NV_OP_CVT); + break; + } + assert(nvi->opcode != NV_OP_ABS || !(nvi->src[0]->mod & NV_MOD_NEG)); + + if (nvi->src[0]->mod & NV_MOD_NEG) pc->emit[1] ^= CVT_NEG; + if (nvi->src[0]->mod & NV_MOD_ABS) pc->emit[1] |= CVT_ABS; + + emit_form_MAD(pc, nvi); +} + +static void +emit_tex(struct nv_pc *pc, struct nv_instruction *i) +{ + pc->emit[0] = 0xf0000001; + pc->emit[1] = 0x00000000; + + DID(pc, i->def[0], 2); + + set_pred(pc, i); + + pc->emit[0] |= i->tex_t << 9; + pc->emit[0] |= i->tex_s 
<< 17; + + pc->emit[0] |= i->tex_argc << 22; + + pc->emit[0] |= (i->tex_mask & 0x3) << 25; + pc->emit[1] |= (i->tex_mask & 0xc) << 12; + + if (i->tex_live) + pc->emit[1] |= 4; + + if (i->tex_cube) + pc->emit[0] |= 0x08000000; + + if (i->opcode == NV_OP_TXB) + pc->emit[1] |= 0x20000000; + else + if (i->opcode == NV_OP_TXL) + pc->emit[1] |= 0x40000000; + else + pc->emit[0] -= 1 << 22; +} + +static void +emit_cvt2fixed(struct nv_pc *pc, struct nv_instruction *i) +{ + ubyte mod = i->src[0]->mod; + + pc->emit[0] = 0xb0000000; + pc->emit[1] = 0xc0000000; + + if (i->opcode == NV_OP_PREEX2) + pc->emit[1] |= 0x4000; + + emit_form_MAD(pc, i); + + if (mod & NV_MOD_NEG) pc->emit[1] |= 0x04000000; + if (mod & NV_MOD_ABS) pc->emit[1] |= 0x00100000; +} + +static void +emit_ddx(struct nv_pc *pc, struct nv_instruction *i) +{ + assert(i->is_long && SFILE(i, 0) == NV_FILE_GPR); + + pc->emit[0] = (i->src[0]->mod & NV_MOD_NEG) ? 0xc0240001 : 0xc0140001; + pc->emit[1] = (i->src[0]->mod & NV_MOD_NEG) ? 0x86400000 : 0x89800000; + + DID(pc, i->def[0], 2); + SID(pc, i->src[0], 9); + SID(pc, i->src[0], 32 + 14); + + set_pred(pc, i); + set_pred_wr(pc, i); +} + +static void +emit_ddy(struct nv_pc *pc, struct nv_instruction *i) +{ + assert(i->is_long && SFILE(i, 0) == NV_FILE_GPR); + + pc->emit[0] = (i->src[0]->mod & NV_MOD_NEG) ? 0xc0250001 : 0xc0150001; + pc->emit[1] = (i->src[0]->mod & NV_MOD_NEG) ? 
0x85800000 : 0x8a400000; + + DID(pc, i->def[0], 2); + SID(pc, i->src[0], 9); + SID(pc, i->src[0], 32 + 14); + + set_pred(pc, i); + set_pred_wr(pc, i); +} + +void +nv50_emit_instruction(struct nv_pc *pc, struct nv_instruction *i) +{ + // nv_print_instruction(i); + + switch (i->opcode) { + case NV_OP_MOV: + if (DFILE(i, 0) == NV_FILE_ADDR) + emit_add_a16(pc, i); + else + emit_mov(pc, i); + break; + case NV_OP_LDA: + emit_mov(pc, i); + break; + case NV_OP_STA: + emit_st(pc, i); + break; + case NV_OP_LINTERP: + case NV_OP_PINTERP: + emit_interp(pc, i); + break; + case NV_OP_ADD: + emit_add(pc, i); + break; + case NV_OP_AND: + case NV_OP_OR: + case NV_OP_XOR: + emit_bitop2(pc, i); + break; + case NV_OP_CVT: + case NV_OP_ABS: + case NV_OP_NEG: + case NV_OP_SAT: + case NV_OP_CEIL: + case NV_OP_FLOOR: + case NV_OP_TRUNC: + emit_cvt(pc, i); + break; + case NV_OP_DFDX: + emit_ddx(pc, i); + break; + case NV_OP_DFDY: + emit_ddy(pc, i); + break; + case NV_OP_RCP: + case NV_OP_RSQ: + case NV_OP_LG2: + case NV_OP_SIN: + case NV_OP_COS: + case NV_OP_EX2: + emit_flop(pc, i); + break; + case NV_OP_PRESIN: + case NV_OP_PREEX2: + emit_cvt2fixed(pc, i); + break; + case NV_OP_MAD: + emit_mad(pc, i); + break; + case NV_OP_MAX: + case NV_OP_MIN: + emit_minmax(pc, i); + break; + case NV_OP_MUL: + emit_mul_f32(pc, i); + break; + case NV_OP_SET: + emit_set(pc, i); + break; + case NV_OP_SHL: + case NV_OP_SHR: + emit_shift(pc, i); + break; + case NV_OP_TEX: + case NV_OP_TXB: + case NV_OP_TXL: + emit_tex(pc, i); + break; + case NV_OP_KIL: + emit_flow(pc, i, 0x0); + break; + case NV_OP_BRA: + emit_flow(pc, i, 0x1); + break; + case NV_OP_CALL: + emit_flow(pc, i, 0x2); + break; + case NV_OP_RET: + emit_flow(pc, i, 0x3); + break; + case NV_OP_BREAKADDR: + emit_flow(pc, i, 0x4); + break; + case NV_OP_BREAK: + emit_flow(pc, i, 0x5); + break; + case NV_OP_JOINAT: + emit_flow(pc, i, 0xa); + break; + case NV_OP_NOP: + pc->emit[0] = 0xf0000001; + pc->emit[1] = 0xe0000000; + break; + case NV_OP_PHI: + 
case NV_OP_SUB: + NOUVEAU_ERR("operation \"%s\" should have been eliminated\n", + nv_opcode_name(i->opcode)); + break; + default: + NOUVEAU_ERR("unhandled NV_OP: %d\n", i->opcode); + abort(); + break; + } + + assert((pc->emit[0] & 1) == i->is_long); +} diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c new file mode 100644 index 0000000000..0811420e42 --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c @@ -0,0 +1,717 @@ + +#include "nv50_pc.h" + +#define DESCEND_ARBITRARY(j, f) \ +do { \ + b->pass_seq = ctx->pc->pass_seq; \ + \ + for (j = 0; j < 2; ++j) \ + if (b->out[j] && b->out[j]->pass_seq < ctx->pc->pass_seq) \ + f(ctx, b->out[j]); \ +} while (0) + +extern unsigned nv50_inst_min_size(struct nv_instruction *); + +struct nv_pc_pass { + struct nv_pc *pc; +}; + +static INLINE boolean +values_equal(struct nv_value *a, struct nv_value *b) +{ + /* XXX: sizes */ + return (a->reg.file == b->reg.file && a->join->reg.id == b->join->reg.id); +} + +static INLINE boolean +inst_commutation_check(struct nv_instruction *a, + struct nv_instruction *b) +{ + int si, di; + + for (di = 0; di < 4; ++di) { + if (!a->def[di]) + break; + for (si = 0; si < 5; ++si) { + if (!b->src[si]) + continue; + if (values_equal(a->def[di], b->src[si]->value)) + return FALSE; + } + } + + if (b->flags_src && b->flags_src->value == a->flags_def) + return FALSE; + + return TRUE; +} + +/* Check whether we can swap the order of the instructions, + * where a & b may be either the earlier or the later one. 
+ */ +static boolean +inst_commutation_legal(struct nv_instruction *a, + struct nv_instruction *b) +{ + return inst_commutation_check(a, b) && inst_commutation_check(b, a); +} + +static INLINE boolean +inst_cullable(struct nv_instruction *nvi) +{ + return (!(nvi->is_terminator || + nvi->target || + nvi->fixed || + nv_nvi_refcount(nvi))); +} + +static INLINE boolean +nvi_isnop(struct nv_instruction *nvi) +{ + if (nvi->opcode == NV_OP_EXPORT) + return TRUE; + + if (nvi->fixed || + nvi->is_terminator || + nvi->flags_src || + nvi->flags_def) + return FALSE; + + if (nvi->def[0]->join->reg.id < 0) + return TRUE; + + if (nvi->opcode != NV_OP_MOV && nvi->opcode != NV_OP_SELECT) + return FALSE; + + if (nvi->def[0]->reg.file != nvi->src[0]->value->reg.file) + return FALSE; + + if (nvi->src[0]->value->join->reg.id < 0) { + debug_printf("nvi_isnop: orphaned value detected\n"); + return TRUE; + } + + if (nvi->opcode == NV_OP_SELECT) + if (!values_equal(nvi->def[0], nvi->src[1]->value)) + return FALSE; + + return values_equal(nvi->def[0], nvi->src[0]->value); +} + +static void +nv_pc_pass_pre_emission(struct nv_pc *pc, struct nv_basic_block *b) +{ + struct nv_instruction *nvi, *next; + int j; + uint size, n32 = 0; + + b->priv = 0; + + if (pc->num_blocks) + b->bin_pos = pc->bb_list[pc->num_blocks - 1]->bin_pos + + pc->bb_list[pc->num_blocks - 1]->bin_size; + + pc->bb_list[pc->num_blocks++] = b; + + /* visit node */ + + for (nvi = b->entry; nvi; nvi = next) { + next = nvi->next; + if (nvi_isnop(nvi)) + nv_nvi_delete(nvi); + } + + for (nvi = b->entry; nvi; nvi = next) { + next = nvi->next; + + size = nv50_inst_min_size(nvi); + if (nvi->next && size < 8) + ++n32; + else + if ((n32 & 1) && nvi->next && + nv50_inst_min_size(nvi->next) == 4 && + inst_commutation_legal(nvi, nvi->next)) { + ++n32; + debug_printf("permuting: "); + nv_print_instruction(nvi); + nv_print_instruction(nvi->next); + nv_nvi_permute(nvi, nvi->next); + next = nvi; + } else { + nvi->is_long = 1; + + b->bin_size += 
n32 & 1; + if (n32 & 1) + nvi->prev->is_long = 1; + n32 = 0; + } + b->bin_size += 1 + nvi->is_long; + } + + if (!b->entry) { + debug_printf("block %p is now empty\n", b); + } else + if (!b->exit->is_long) { + assert(n32); + b->exit->is_long = 1; + b->bin_size += 1; + + /* might have del'd a hole tail of instructions */ + if (!b->exit->prev->is_long && !(n32 & 1)) { + b->bin_size += 1; + b->exit->prev->is_long = 1; + } + } + assert(!b->exit || b->exit->is_long); + + pc->bin_size += b->bin_size *= 4; + + /* descend CFG */ + + if (!b->out[0]) + return; + if (!b->out[1] && ++(b->out[0]->priv) != b->out[0]->num_in) + return; + +#if 0 + /* delete ELSE branch */ + if (b->entry && + b->entry->opcode == NV_OP_BRA && b->entry->target == b->out[0]) { + nv_nvi_delete(b->entry); + b->bin_size -= 2; + pc->bin_size -= 8; + } +#endif + for (j = 0; j < 2; ++j) + if (b->out[j] && b->out[j] != b) + nv_pc_pass_pre_emission(pc, b->out[j]); +} + +int +nv_pc_exec_pass2(struct nv_pc *pc) +{ + debug_printf("preparing %u blocks for emission\n", pc->num_blocks); + + pc->bb_list = CALLOC(pc->num_blocks, sizeof(struct nv_basic_block *)); + + pc->num_blocks = 0; + nv_pc_pass_pre_emission(pc, pc->root); + + return 0; +} + +static INLINE boolean +is_cmem_load(struct nv_instruction *nvi) +{ + return (nvi->opcode == NV_OP_LDA && + nvi->src[0]->value->reg.file >= NV_FILE_MEM_C(0) && + nvi->src[0]->value->reg.file <= NV_FILE_MEM_C(15)); +} + +static INLINE boolean +is_smem_load(struct nv_instruction *nvi) +{ + return (nvi->opcode == NV_OP_LDA && + (nvi->src[0]->value->reg.file == NV_FILE_MEM_S || + nvi->src[0]->value->reg.file <= NV_FILE_MEM_P)); +} + +static INLINE boolean +is_immd_move(struct nv_instruction *nvi) +{ + return (nvi->opcode == NV_OP_MOV && + nvi->src[0]->value->reg.file == NV_FILE_IMM); +} + +static INLINE void +check_swap_src_0_1(struct nv_instruction *nvi) +{ + static const ubyte cc_swapped[8] = { 0, 4, 2, 6, 1, 5, 3, 7 }; + + struct nv_ref *src0 = nvi->src[0], *src1 = nvi->src[1]; 
+ + if (!nv_op_commutative(nvi->opcode)) + return; + assert(src0 && src1); + + if (is_cmem_load(src0->value->insn)) { + if (!is_cmem_load(src1->value->insn)) { + nvi->src[0] = src1; + nvi->src[1] = src0; + /* debug_printf("swapping cmem load to 1\n"); */ + } + } else + if (is_smem_load(src1->value->insn)) { + if (!is_smem_load(src0->value->insn)) { + nvi->src[0] = src1; + nvi->src[1] = src0; + /* debug_printf("swapping smem load to 0\n"); */ + } + } + + if (nvi->opcode == NV_OP_SET && nvi->src[0] != src0) + nvi->set_cond = cc_swapped[nvi->set_cond]; +} + +struct nv_pass { + struct nv_pc *pc; + int n; + void *priv; +}; + +static int +nv_pass_fold_stores(struct nv_pass *ctx, struct nv_basic_block *b) +{ + struct nv_instruction *nvi, *sti; + int j; + + for (sti = b->entry; sti; sti = sti->next) { + if (!sti->def[0]) + continue; + + if (sti->def[0]->reg.file != NV_FILE_OUT) + continue; + if (sti->opcode != NV_OP_MOV && sti->opcode != NV_OP_STA) + continue; + + nvi = sti->src[0]->value->insn; + if (!nvi || nvi->opcode == NV_OP_PHI) + continue; + assert(nvi->def[0] == sti->src[0]->value); + + if (nvi->def[0]->refc > 1) + continue; + + nvi->def[0] = sti->def[0]; + nvi->fixed = 1; + sti->fixed = 0; + } + DESCEND_ARBITRARY(j, nv_pass_fold_stores); + + return 0; +} + +static int +nv_pass_fold_loads(struct nv_pass *ctx, struct nv_basic_block *b) +{ + struct nv_instruction *nvi, *ld; + int j; + + for (nvi = b->entry; nvi; nvi = nvi->next) { + check_swap_src_0_1(nvi); + + for (j = 0; j < 3; ++j) { + if (!nvi->src[j]) + break; + ld = nvi->src[j]->value->insn; + if (!ld) + continue; + + if (is_immd_move(ld) && nv50_nvi_can_use_imm(nvi, j)) { + nv_reference(ctx->pc, &nvi->src[j], ld->src[0]->value); + debug_printf("folded immediate %i\n", ld->def[0]->n); + continue; + } + + if (ld->opcode != NV_OP_LDA) + continue; + if (!nv50_nvi_can_load(nvi, j, ld->src[0]->value)) + continue; + + if (j == 0 && ld->src[4]) /* can't load shared mem */ + continue; + + /* fold it ! 
*/ /* XXX: ref->insn */ + nv_reference(ctx->pc, &nvi->src[j], ld->src[0]->value); + if (ld->src[4]) + nv_reference(ctx->pc, &nvi->src[4], ld->src[4]->value); + } + } + DESCEND_ARBITRARY(j, nv_pass_fold_loads); + + return 0; +} + +static int +nv_pass_lower_mods(struct nv_pass *ctx, struct nv_basic_block *b) +{ + int j; + struct nv_instruction *nvi, *mi, *next; + ubyte mod; + + for (nvi = b->entry; nvi; nvi = next) { + next = nvi->next; + if (nvi->opcode == NV_OP_SUB) { + nvi->opcode = NV_OP_ADD; + nvi->src[1]->mod ^= NV_MOD_NEG; + } + + /* should not put any modifiers on NEG and ABS */ + assert(nvi->opcode != NV_MOD_NEG || !nvi->src[0]->mod); + assert(nvi->opcode != NV_MOD_ABS || !nvi->src[0]->mod); + + for (j = 0; j < 4; ++j) { + if (!nvi->src[j]) + break; + + mi = nvi->src[j]->value->insn; + if (!mi) + continue; + if (mi->def[0]->refc > 1) + continue; + + if (mi->opcode == NV_OP_NEG) mod = NV_MOD_NEG; + else + if (mi->opcode == NV_OP_ABS) mod = NV_MOD_ABS; + else + continue; + + if (nvi->opcode == NV_OP_ABS) + mod &= ~(NV_MOD_NEG | NV_MOD_ABS); + else + if (nvi->opcode == NV_OP_NEG && mod == NV_MOD_NEG) { + nvi->opcode = NV_OP_MOV; + mod = 0; + } + + if (!(nv50_supported_src_mods(nvi->opcode, j) & mod)) + continue; + + nv_reference(ctx->pc, &nvi->src[j], mi->src[0]->value); + + nvi->src[j]->mod ^= mod; + } + + if (nvi->opcode == NV_OP_SAT) { + mi = nvi->src[0]->value->insn; + + if ((mi->opcode == NV_OP_MAD) && !mi->flags_def) { + mi->saturate = 1; + mi->def[0] = nvi->def[0]; + nv_nvi_delete(nvi); + } + } + } + DESCEND_ARBITRARY(j, nv_pass_lower_mods); + + return 0; +} + +#define SRC_IS_MUL(s) ((s)->insn && (s)->insn->opcode == NV_OP_MUL) + +static struct nv_value * +find_immediate(struct nv_ref *ref) +{ + struct nv_value *src; + + if (!ref) + return NULL; + + src = ref->value; + while (src->insn && src->insn->opcode == NV_OP_MOV) { + assert(!src->insn->src[0]->mod); + src = src->insn->src[0]->value; + } + return (src->reg.file == NV_FILE_IMM) ? 
src : NULL; +} + +static void +constant_operand(struct nv_pc *pc, + struct nv_instruction *nvi, struct nv_value *val, int s) +{ + int t = s ? 0 : 1; + ubyte type; + + if (!nvi->def[0]) + return; + type = nvi->def[0]->reg.type; + + switch (nvi->opcode) { + case NV_OP_MUL: + if ((type == NV_TYPE_F32 && val->reg.imm.f32 == 1.0f) || + (NV_TYPE_ISINT(type) && val->reg.imm.u32 == 1)) { + nvi->opcode = NV_OP_MOV; + nv_reference(pc, &nvi->src[s], NULL); + if (!s) { + nvi->src[0] = nvi->src[1]; + nvi->src[1] = NULL; + } + } else + if ((type == NV_TYPE_F32 && val->reg.imm.f32 == 2.0f) || + (NV_TYPE_ISINT(type) && val->reg.imm.u32 == 2)) { + nvi->opcode = NV_OP_ADD; + nv_reference(pc, &nvi->src[s], NULL); + if (!s) { + nvi->src[0] = nvi->src[1]; + nvi->src[1] = NULL; + } + } else + if (type == NV_TYPE_F32 && val->reg.imm.f32 == -1.0f) { + nvi->opcode = NV_OP_NEG; + nv_reference(pc, &nvi->src[s], NULL); + nvi->src[0] = nvi->src[t]; + nvi->src[1] = NULL; + } else + if (type == NV_TYPE_F32 && val->reg.imm.f32 == -2.0f) { + nvi->opcode = NV_OP_ADD; + assert(!nvi->src[s]->mod); + nv_reference(pc, &nvi->src[s], nvi->src[t]->value); + nvi->src[t]->mod ^= NV_MOD_NEG; + nvi->src[s]->mod |= NV_MOD_NEG; + } else + if (val->reg.imm.u32 == 0) { + nvi->opcode = NV_OP_MOV; + nv_reference(pc, &nvi->src[t], NULL); + if (s) { + nvi->src[0] = nvi->src[1]; + nvi->src[1] = NULL; + } + } + break; + case NV_OP_ADD: + if (val->reg.imm.u32 == 0) { + nvi->opcode = NV_OP_MOV; + nv_reference(pc, &nvi->src[s], NULL); + nvi->src[0] = nvi->src[t]; + nvi->src[1] = NULL; + } + break; + default: + break; + } +} + +static int +nv_pass_lower_arith(struct nv_pass *ctx, struct nv_basic_block *b) +{ + struct nv_instruction *nvi, *next; + int j; + + for (nvi = b->entry; nvi; nvi = next) { + struct nv_value *src0, *src1, *src; + int mod; + + next = nvi->next; + + if ((src = find_immediate(nvi->src[0])) != NULL) + constant_operand(ctx->pc, nvi, src, 0); + else + if ((src = find_immediate(nvi->src[1])) != NULL) + 
constant_operand(ctx->pc, nvi, src, 1); + + /* try to combine MUL, ADD into MAD */ + if (nvi->opcode != NV_OP_ADD) + continue; + + src0 = nvi->src[0]->value; + src1 = nvi->src[1]->value; + + if (SRC_IS_MUL(src0) && src0->refc == 1) + src = src0; + else + if (SRC_IS_MUL(src1) && src1->refc == 1) + src = src1; + else + continue; + + nvi->opcode = NV_OP_MAD; + mod = nvi->src[(src == src0) ? 0 : 1]->mod; + nv_reference(ctx->pc, &nvi->src[(src == src0) ? 0 : 1], NULL); + nvi->src[2] = nvi->src[(src == src0) ? 1 : 0]; + + assert(!(mod & ~NV_MOD_NEG)); + nvi->src[0] = new_ref(ctx->pc, src->insn->src[0]->value); + nvi->src[1] = new_ref(ctx->pc, src->insn->src[1]->value); + nvi->src[0]->mod = src->insn->src[0]->mod ^ mod; + nvi->src[1]->mod = src->insn->src[1]->mod; + } + DESCEND_ARBITRARY(j, nv_pass_lower_arith); + + return 0; +} + +/* +set $r2 g f32 $r2 $r3 +cvt abs rn f32 $r2 s32 $r2 +cvt f32 $c0 # f32 $r2 +e $c0 bra 0x80 +*/ +#if 0 +static int +nv_pass_lower_cond(struct nv_pass *ctx, struct nv_basic_block *b) +{ + /* XXX: easier in IR builder for now */ + return 0; +} +#endif + +/* TODO: reload elimination, redundant store elimination */ + +struct nv_pass_reldelim { + struct nv_pc *pc; +}; + +static int +nv_pass_reload_elim(struct nv_pass_reldelim *ctx, struct nv_basic_block *b) +{ + int j; + struct nv_instruction *ld, *next; + + for (ld = b->entry; ld; ld = next) { + next = ld->next; + + if (ld->opcode == NV_OP_LINTERP || ld->opcode == NV_OP_PINTERP) { + + } else + if (ld->opcode == NV_OP_LDA) { + + } else + if (ld->opcode == NV_OP_MOV) { + + } + } + DESCEND_ARBITRARY(j, nv_pass_reload_elim); + + return 0; +} + +static int +nv_pass_tex_mask(struct nv_pass *ctx, struct nv_basic_block *b) +{ + int i, c, j; + + for (i = 0; i < ctx->pc->num_instructions; ++i) { + struct nv_instruction *nvi = &ctx->pc->instructions[i]; + struct nv_value *def[4]; + + if (!nv_is_vector_op(nvi->opcode)) + continue; + nvi->tex_mask = 0; + + for (c = 0; c < 4; ++c) { + if (nvi->def[c]->refc) + 
nvi->tex_mask |= 1 << c; + def[c] = nvi->def[c]; + } + + j = 0; + for (c = 0; c < 4; ++c) + if (nvi->tex_mask & (1 << c)) + nvi->def[j++] = def[c]; + for (c = 0; c < 4; ++c) + if (!(nvi->tex_mask & (1 << c))) + nvi->def[j++] = def[c]; + assert(j == 4); + } + return 0; +} + +struct nv_pass_dce { + struct nv_pc *pc; + uint removed; +}; + +static int +nv_pass_dce(struct nv_pass_dce *ctx, struct nv_basic_block *b) +{ + int j; + struct nv_instruction *nvi, *next; + + for (nvi = b->entry; nvi; nvi = next) { + next = nvi->next; + + if (inst_cullable(nvi)) { + nv_nvi_delete(nvi); + + ++ctx->removed; + } + } + DESCEND_ARBITRARY(j, nv_pass_dce); + + return 0; +} + +static INLINE boolean +bb_simple_if_endif(struct nv_basic_block *bb) +{ + return (bb->out[0] && bb->out[1] && + bb->out[0]->out[0] == bb->out[1] && + !bb->out[0]->out[1]); +} + +static int +nv_pass_flatten(struct nv_pass *ctx, struct nv_basic_block *b) +{ + int j; + + if (bb_simple_if_endif(b)) { + ++ctx->n; + debug_printf("nv_pass_flatten: total IF/ENDIF constructs: %i\n", ctx->n); + } + DESCEND_ARBITRARY(j, nv_pass_flatten); + + return 0; +} + +int +nv_pc_exec_pass0(struct nv_pc *pc) +{ + struct nv_pass_reldelim *reldelim; + struct nv_pass pass; + struct nv_pass_dce dce; + int ret; + + reldelim = CALLOC_STRUCT(nv_pass_reldelim); + reldelim->pc = pc; + + ret = nv_pass_reload_elim(reldelim, pc->root); + + FREE(reldelim); + if (ret) + return ret; + + pass.pc = pc; + + pc->pass_seq++; + ret = nv_pass_flatten(&pass, pc->root); + if (ret) + return ret; + + /* Do this first, so we don't have to pay attention + * to whether sources are supported memory loads. 
+ */ + pc->pass_seq++; + ret = nv_pass_lower_arith(&pass, pc->root); + if (ret) + return ret; + + pc->pass_seq++; + ret = nv_pass_fold_loads(&pass, pc->root); + if (ret) + return ret; + + pc->pass_seq++; + ret = nv_pass_fold_stores(&pass, pc->root); + if (ret) + return ret; + + pc->pass_seq++; + ret = nv_pass_lower_mods(&pass, pc->root); + if (ret) + return ret; + + dce.pc = pc; + do { + dce.removed = 0; + pc->pass_seq++; + ret = nv_pass_dce(&dce, pc->root); + if (ret) + return ret; + } while (dce.removed); + + ret = nv_pass_tex_mask(&pass, pc->root); + if (ret) + return ret; + + return ret; +} diff --git a/src/gallium/drivers/nv50/nv50_pc_print.c b/src/gallium/drivers/nv50/nv50_pc_print.c new file mode 100644 index 0000000000..09512ffb88 --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_pc_print.c @@ -0,0 +1,287 @@ + +#include "nv50_context.h" +#include "nv50_pc.h" + +#define NVXX_DEBUG 0 + +#define PRINT(args...) debug_printf(args) + +#ifndef ARRAY_SIZE +#define ARRAY_SIZE(a) (sizeof(a) / sizeof(a[0])) +#endif + +static const char *norm = "\x1b[00m"; +static const char *gree = "\x1b[32m"; +static const char *blue = "\x1b[34m"; +static const char *cyan = "\x1b[36m"; +static const char *orng = "\x1b[33m"; +static const char *mgta = "\x1b[35m"; + +static const char *nv_opcode_names[NV_OP_COUNT + 1] = { + "phi", + "extract", + "combine", + "lda", + "sta", + "mov", + "add", + "sub", + "neg", + "mul", + "mad", + "cvt", + "sat", + "not", + "and", + "or", + "xor", + "shl", + "shr", + "rcp", + "(undefined)", + "rsqrt", + "lg2", + "sin", + "cos", + "ex2", + "presin", + "preex2", + "min", + "max", + "set", + "sad", + "kil", + "bra", + "call", + "ret", + "break", + "breakaddr", + "joinat", + "tex", + "texbias", + "texlod", + "texfetch", + "texsize", + "dfdx", + "dfdy", + "quadop", + "linterp", + "pinterp", + "abs", + "ceil", + "floor", + "trunc", + "nop", + "select", + "export", + "BAD_OP" +}; + +static const char *nv_cond_names[] = +{ + "never", "lt" , "eq" , "le" , "gt" , 
/* Printable names for source modifier masks, indexed by the mask value.
 * nv_modifier_string() clamps its index to 9, so the table must hold
 * exactly ten entries, the last one being the catch-all "BAD_MOD".
 * Every entry needs its own trailing comma: adjacent string literals are
 * concatenated by the compiler, which silently shortens the table.
 */
static const char *nv_modifier_strings[] =
{
   "",
   "neg",
   "abs",
   "neg abs",
   "not",
   "not neg",
   "not abs",
   "not neg abs",
   "sat",
   "BAD_MOD"
};
TRUE : FALSE; +} + +static INLINE void +nv_print_address(const char c, int buf, struct nv_value *a, int offset) +{ + if (buf >= 0) + PRINT(" %s%c%i[", cyan, c, buf); + else + PRINT(" %s%c[", cyan, c); + if (a) + PRINT("%s$a%i%s+", mgta, nv_value_id(a), cyan); + PRINT("%s0x%x%s]", orng, offset, cyan); +} + +static INLINE void +nv_print_cond(struct nv_instruction *nvi) +{ + PRINT("%s%s%s$c%i ", + gree, nv_cond_name(nvi->cc), + mgta, nv_value_id(nvi->flags_src->value)); +} + +static INLINE void +nv_print_value(struct nv_value *value, struct nv_value *ind, ubyte type) +{ + char reg_pfx = '$'; + + if (type == NV_TYPE_ANY) + type = value->reg.type; + + if (value->reg.file != NV_FILE_FLAGS) + PRINT(" %s%s", gree, nv_type_name(type)); + + if (!nv_value_allocated(value)) + reg_pfx = '%'; + + switch (value->reg.file) { + case NV_FILE_GPR: + PRINT(" %s%cr%i", blue, reg_pfx, nv_value_id(value)); + break; + case NV_FILE_OUT: + PRINT(" %s%co%i", mgta, reg_pfx, nv_value_id(value)); + break; + case NV_FILE_ADDR: + PRINT(" %s%ca%i", mgta, reg_pfx, nv_value_id(value)); + break; + case NV_FILE_FLAGS: + PRINT(" %s%cc%i", mgta, reg_pfx, nv_value_id(value)); + break; + case NV_FILE_MEM_S: + nv_print_address('s', -1, ind, 4 * nv_value_id(value)); + break; + case NV_FILE_MEM_P: + nv_print_address('p', -1, ind, 4 * nv_value_id(value)); + break; + case NV_FILE_MEM_V: + nv_print_address('v', -1, ind, 4 * nv_value_id(value)); + break; + case NV_FILE_IMM: + switch (type) { + case NV_TYPE_U16: + case NV_TYPE_S16: + PRINT(" %s0x%04x", orng, value->reg.imm.u32); + break; + case NV_TYPE_F32: + PRINT(" %s%f", orng, value->reg.imm.f32); + break; + case NV_TYPE_F64: + PRINT(" %s%f", orng, value->reg.imm.f64); + break; + case NV_TYPE_U32: + case NV_TYPE_S32: + case NV_TYPE_P32: + PRINT(" %s0x%08x", orng, value->reg.imm.u32); + break; + } + break; + default: + if (value->reg.file >= NV_FILE_MEM_G(0) && + value->reg.file <= NV_FILE_MEM_G(15)) + nv_print_address('g', value->reg.file - NV_FILE_MEM_G(0), 
ind, + nv_value_id(value) * 4); + else + if (value->reg.file >= NV_FILE_MEM_C(0) && + value->reg.file <= NV_FILE_MEM_C(15)) + nv_print_address('c', value->reg.file - NV_FILE_MEM_C(0), ind, + nv_value_id(value) * 4); + else + NOUVEAU_ERR(" BAD_FILE[%i]", nv_value_id(value)); + break; + } +} + +static INLINE void +nv_print_ref(struct nv_ref *ref, struct nv_value *ind) +{ + nv_print_value(ref->value, ind, ref->typecast); +} + +void +nv_print_instruction(struct nv_instruction *i) +{ + int j; + + if (i->flags_src) + nv_print_cond(i); + + PRINT("%s", gree); + if (i->opcode == NV_OP_SET) + PRINT("set %s", nv_cond_name(i->set_cond)); + else + if (i->saturate) + PRINT("sat %s", nv_opcode_name(i->opcode)); + else + PRINT("%s", nv_opcode_name(i->opcode)); + + if (i->flags_def) + nv_print_value(i->flags_def, NULL, NV_TYPE_ANY); + + /* Only STORE & STA can write to MEM, and they do not def + * anything, so the address is thus part of the source. + */ + if (i->def[0]) + nv_print_value(i->def[0], NULL, NV_TYPE_ANY); + else + PRINT(" #"); + + for (j = 0; j < 4; ++j) { + if (!i->src[j]) + continue; + + if (i->src[j]->mod) + PRINT(" %s", nv_modifier_string(i->src[j]->mod)); + + nv_print_ref(i->src[j], + (j == nv50_indirect_opnd(i)) ? + i->src[4]->value : NULL); + } + if (!i->is_long) + PRINT(" %ss", norm); + PRINT("\n"); +} diff --git a/src/gallium/drivers/nv50/nv50_pc_regalloc.c b/src/gallium/drivers/nv50/nv50_pc_regalloc.c new file mode 100644 index 0000000000..eb446d641a --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_pc_regalloc.c @@ -0,0 +1,973 @@ +/* + * XXX: phi function live intervals start at first ordinary instruction, + * add_range should be taking care of that already ... + * + * XXX: TEX must choose TEX's def as representative + * + * XXX: Aieee! Must materialize MOVs if source is in other basic block! + * -- absolutely, or we cannot execute the MOV conditionally at all + * XXX: Aieee! Must include PHIs in LVA so we pull through liveness if + * PHI source is e.g. 
in dominator block. + * -- seems we lose liveness somehow, track that + */ + +#include "nv50_context.h" +#include "nv50_pc.h" + +#include "util/u_simple_list.h" + +#define NUM_REGISTER_FILES 4 + +struct register_set { + struct nv_pc *pc; + + uint32_t last[NUM_REGISTER_FILES]; + uint32_t bits[NUM_REGISTER_FILES][8]; +}; + +struct nv_pc_pass { + struct nv_pc *pc; + + struct nv_instruction **insns; + int num_insns; + + uint pass_seq; +}; + +static void +ranges_coalesce(struct nv_range *range) +{ + while (range->next && range->end >= range->next->bgn) { + struct nv_range *rnn = range->next->next; + assert(range->bgn <= range->next->bgn); + range->end = MAX2(range->end, range->next->end); + FREE(range->next); + range->next = rnn; + } +} + +static boolean +add_range_ex(struct nv_value *val, int bgn, int end, struct nv_range *new_range) +{ + struct nv_range *range, **nextp = &val->livei; + + for (range = val->livei; range; range = range->next) { + if (end < range->bgn) + break; /* insert before */ + + if (bgn > range->end) { + nextp = &range->next; + continue; /* insert after */ + } + + /* overlap */ + if (bgn < range->bgn) { + range->bgn = bgn; + if (end > range->end) + range->end = end; + ranges_coalesce(range); + return TRUE; + } + if (end > range->end) { + range->end = end; + ranges_coalesce(range); + return TRUE; + } + assert(bgn >= range->bgn); + assert(end <= range->end); + return TRUE; + } + + if (!new_range) + new_range = CALLOC_STRUCT(nv_range); + + new_range->bgn = bgn; + new_range->end = end; + new_range->next = range; + *(nextp) = new_range; + return FALSE; +} + +static void +add_range(struct nv_value *val, struct nv_basic_block *b, int end) +{ + int bgn; + + if (!val->insn) /* ignore non-def values */ + return; + assert(b->entry->serial <= b->exit->serial); + assert(b->phi->serial <= end); + assert(b->exit->serial + 1 >= end); + + bgn = val->insn->serial; + if (bgn < b->entry->serial || bgn > b->exit->serial) + bgn = b->entry->serial; + // 
debug_printf("add_range(value %i): [%i, %i)\n", val->n, bgn, end); + + if (bgn > end) { + debug_printf("Aieee! BLOCK [%i, %i], RANGE [%i, %i)\n", + b->entry->serial, b->exit->serial, bgn, end); + } + assert(bgn <= end); + + if (bgn < val->insn->serial) + debug_printf("WARNING: leaking value %i ?\n", val->n); + + add_range_ex(val, bgn, end, NULL); +} + +#ifdef NV50_RA_DEBUG_JOIN +static void +livei_print(struct nv_value *a) +{ + struct nv_range *r = a->livei; + + debug_printf("livei %i: ", a->n); + while (r) { + debug_printf("[%i, %i) ", r->bgn, r->end); + r = r->next; + } + debug_printf("\n"); +} +#endif + +static void +livei_unify(struct nv_value *dst, struct nv_value *src) +{ + struct nv_range *range, *next; + + for (range = src->livei; range; range = next) { + next = range->next; + if (add_range_ex(dst, range->bgn, range->end, range)) + FREE(range); + } + src->livei = NULL; +} + +static void +livei_release(struct nv_value *val) +{ + struct nv_range *range, *next; + + for (range = val->livei; range; range = next) { + next = range->next; + FREE(range); + } +} + +static boolean +livei_have_overlap(struct nv_value *a, struct nv_value *b) +{ + struct nv_range *r_a, *r_b; + + for (r_a = a->livei; r_a; r_a = r_a->next) { + for (r_b = b->livei; r_b; r_b = r_b->next) { + if (r_b->bgn < r_a->end && + r_b->end > r_a->bgn) + return TRUE; + } + } + return FALSE; +} + +static int +livei_end(struct nv_value *a) +{ + struct nv_range *r = a->livei; + + assert(r); + while (r->next) + r = r->next; + return r->end; +} + +static boolean +livei_contains(struct nv_value *a, int pos) +{ + struct nv_range *r; + + for (r = a->livei; r && r->bgn <= pos; r = r->next) + if (r->end > pos) + return TRUE; + return FALSE; +} + +static boolean +reg_assign(struct register_set *set, struct nv_value **def, int n) +{ + int i, id, s; + uint m; + int f = def[0]->reg.file; + + s = n << (nv_type_order(def[0]->reg.type) - 1); + m = (1 << s) - 1; + + id = set->last[f]; + + for (i = 0; i * 32 < 
set->last[f]; ++i) { + if (set->bits[f][i] == 0xffffffff) + continue; + + for (id = 0; id < 32; id += s) + if (!(set->bits[f][i] & (m << id))) + break; + if (id < 32) + break; + } + if (i * 32 + id > set->last[f]) + return FALSE; + + set->bits[f][i] |= m << id; + + id += i * 32; + + set->pc->max_reg[f] = MAX2(set->pc->max_reg[f], id + s - 1); + + id >>= nv_type_order(def[0]->reg.type) - 1; + + for (i = 0; i < n; ++i) + if (def[i]->livei) + def[i]->reg.id = id++; + + return TRUE; +} + +static INLINE void +reg_occupy(struct register_set *set, struct nv_value *val) +{ + int s, id = val->reg.id, f = val->reg.file; + uint m; + + if (id < 0) + return; + s = nv_type_order(val->reg.type) - 1; + id <<= s; + m = (1 << (1 << s)) - 1; + + set->bits[f][id / 32] |= m << (id % 32); + + if (set->pc->max_reg[f] < id) + set->pc->max_reg[f] = id; +} + +static INLINE void +reg_release(struct register_set *set, struct nv_value *val) +{ + int s, id = val->reg.id, f = val->reg.file; + uint m; + + if (id < 0) + return; + + s = nv_type_order(val->reg.type) - 1; + id <<= s; + m = (1 << (1 << s)) - 1; + + set->bits[f][id / 32] &= ~(m << (id % 32)); +} + +static INLINE boolean +join_allowed(struct nv_pc_pass *ctx, struct nv_value *a, struct nv_value *b) +{ + int i; + struct nv_value *val; + + if (a->reg.file != b->reg.file || + nv_type_sizeof(a->reg.type) != nv_type_sizeof(b->reg.type)) + return FALSE; + + if (a->join->reg.id == b->join->reg.id) + return TRUE; + +#if 1 + /* either a or b or both have been assigned */ + + if (a->join->reg.id >= 0 && b->join->reg.id >= 0) + return FALSE; + else + if (b->join->reg.id >= 0) { + if (a->join->reg.id >= 0) + return FALSE; + val = a; + a = b; + b = val; + } + + for (i = 0; i < ctx->pc->num_values; ++i) { + val = &ctx->pc->values[i]; + + if (val->join->reg.id != a->join->reg.id) + continue; + if (val->join != a->join && livei_have_overlap(val->join, b->join)) + return FALSE; + } + return TRUE; +#endif + return FALSE; +} + +static INLINE void 
+do_join_values(struct nv_pc_pass *ctx, struct nv_value *a, struct nv_value *b) +{ + int j; + struct nv_value *bjoin = b->join; + + if (b->join->reg.id >= 0) + a->join->reg.id = b->join->reg.id; + + livei_unify(a->join, b->join); + +#ifdef NV50_RA_DEBUG_JOIN + debug_printf("joining %i to %i\n", b->n, a->n); +#endif + + /* make a->join the new representative */ + for (j = 0; j < ctx->pc->num_values; ++j) + if (ctx->pc->values[j].join == bjoin) + ctx->pc->values[j].join = a->join; + + assert(b->join == a->join); +} + +static INLINE void +try_join_values(struct nv_pc_pass *ctx, struct nv_value *a, struct nv_value *b) +{ + if (!join_allowed(ctx, a, b)) { +#ifdef NV50_RA_DEBUG_JOIN + debug_printf("cannot join %i to %i: not allowed\n", b->n, a->n); +#endif + return; + } + if (livei_have_overlap(a->join, b->join)) { +#ifdef NV50_RA_DEBUG_JOIN + debug_printf("cannot join %i to %i: livei overlap\n", b->n, a->n); + livei_print(a); + livei_print(b); +#endif + return; + } + + do_join_values(ctx, a, b); +} + +/* For each operand of each phi in b, generate a new value by inserting a MOV + * at the end of the block it is coming from and replace the operand with it. + * This eliminates liveness conflicts. 
+ */ +static int +pass_generate_phi_movs(struct nv_pc_pass *ctx, struct nv_basic_block *b) +{ + struct nv_instruction *i, *i2; + struct nv_basic_block *p, *pn; + struct nv_value *val; + int n, j; + + b->pass_seq = ctx->pc->pass_seq; + + for (n = 0; n < b->num_in; ++n) { + p = b->in[n]; + assert(p); + + if (b->num_in > 1 && p->out[0] && p->out[1]) { /* if without else */ + pn = new_basic_block(ctx->pc); + + if (p->out[0] == b) + p->out[0] = pn; + else + p->out[1] = pn; + + if (p->exit->target == b) /* target to new else-block */ + p->exit->target = pn; + + for (j = 0; j < b->num_in; ++j) { + if (b->in[j] == p) { + b->in[j] = pn; + break; + } + } + + pn->out[0] = b; + pn->in[0] = p; + pn->num_in = 1; + } else + pn = p; + + ctx->pc->current_block = pn; + + /* every block with PHIs will also have other operations */ + for (i = b->phi; i && i->opcode == NV_OP_PHI; i = i->next) { + for (j = 0; j < 4; ++j) { + if (!i->src[j]) + j = 3; + else + if (i->src[j]->value->insn->bb == p) + break; + } + if (j >= 4) + continue; + assert(i->src[j]); + val = i->src[j]->value; + + /* XXX: should probably not insert this after terminator */ + i2 = new_instruction(ctx->pc, NV_OP_MOV); + + i2->def[0] = new_value(ctx->pc, val->reg.file, val->reg.type); + i2->src[0] = new_ref (ctx->pc, val); + i2->def[0]->insn = i2; + + nv_reference(ctx->pc, &i->src[j], i2->def[0]); + } + if (pn != p && pn->exit) { + /* XXX: this branch should probably be eliminated */ + ctx->pc->current_block = b->in[n ? 
0 : 1]; + i2 = new_instruction(ctx->pc, NV_OP_BRA); + i2->target = b; + i2->is_terminator = 1; + } + } + + if (b->out[0] && b->out[0]->pass_seq < ctx->pc->pass_seq) { + pass_generate_phi_movs(ctx, b->out[0]); + } + + if (b->out[1] && b->out[1]->pass_seq < ctx->pc->pass_seq) { + pass_generate_phi_movs(ctx, b->out[1]); + } + + return 0; +} + +static int +pass_join_values(struct nv_pc_pass *ctx, int iter) +{ + int c, n; + + for (n = 0; n < ctx->num_insns; ++n) { + struct nv_instruction *i = ctx->insns[n]; + + switch (i->opcode) { + case NV_OP_PHI: + if (!iter) + continue; + try_join_values(ctx, i->src[0]->value, i->src[1]->value); + try_join_values(ctx, i->def[0], i->src[0]->value); + break; + case NV_OP_MOV: + if (iter && i->src[0]->value->insn && + !nv_is_vector_op(i->src[0]->value->join->insn->opcode)) + try_join_values(ctx, i->def[0], i->src[0]->value); + break; + case NV_OP_SELECT: + if (!iter) + break; + assert(join_allowed(ctx, i->def[0], i->src[0]->value)); + assert(join_allowed(ctx, i->def[0], i->src[1]->value)); + do_join_values(ctx, i->def[0], i->src[0]->value); + do_join_values(ctx, i->def[0], i->src[1]->value); + break; + case NV_OP_TEX: + case NV_OP_TXB: + case NV_OP_TXL: + case NV_OP_TXQ: + if (iter) + break; + for (c = 0; c < 4; ++c) { + if (!i->src[c]) + break; + do_join_values(ctx, i->def[c], i->src[c]->value); + } + break; + default: + break; + } + } + return 0; +} + +static int +pass_order_instructions(struct nv_pc_pass *ctx, struct nv_basic_block *b) +{ + struct nv_instruction *i; + + b->priv = 0; + + assert(!b->exit || !b->exit->next); + for (i = b->phi; i; i = i->next) { + i->serial = ctx->num_insns; + ctx->insns[ctx->num_insns++] = i; + } + + b->pass_seq = ctx->pc->pass_seq; + + if (!b->out[0]) + return 0; + if (!b->out[1] && ++(b->out[0]->priv) != b->out[0]->num_in) + return 0; + + if (b->out[0] != b) + pass_order_instructions(ctx, b->out[0]); + if (b->out[1] && b->out[1] != b) + pass_order_instructions(ctx, b->out[1]); + + return 0; +} + 
+static void +bb_live_set_print(struct nv_pc *pc, struct nv_basic_block *b) +{ +#ifdef NV50_RA_DEBUG_LIVE_SETS + int j; + struct nv_value *val; + + debug_printf("live_set of %p: ", b); + + for (j = 0; j < pc->num_values; ++j) { + if (!(b->live_set[j / 32] & (1 << (j % 32)))) + continue; + val = &pc->values[j]; + if (!val->insn) + continue; + debug_printf("%i ", val->n); + } + debug_printf("\n"); +#endif +} + +static INLINE void +live_set_add(struct nv_basic_block *b, struct nv_value *val) +{ + if (!val->insn) /* don't add non-def values */ + return; + /* debug_printf("live[%p] <- %i\n", b, val->n); */ + + b->live_set[val->n / 32] |= 1 << (val->n % 32); +} + +static INLINE void +live_set_rem(struct nv_basic_block *b, struct nv_value *val) +{ + /* if (val->insn) + debug_printf("live[%p] -> %i\n", b, val->n); */ + b->live_set[val->n / 32] &= ~(1 << (val->n % 32)); +} + +static INLINE boolean +live_set_test(struct nv_basic_block *b, struct nv_ref *ref) +{ + int n = ref->value->n; + return b->live_set[n / 32] & (1 << (n % 32)); +} + +/* check if bf (future) can be reached from bp (past) */ +static boolean +bb_reachable_by(struct nv_basic_block *bf, struct nv_basic_block *bp, + struct nv_basic_block *bt) +{ + if (bf == bp) + return TRUE; + if (bp == bt) + return FALSE; + + if (bp->out[0] && bp->out[0] != bp && + bb_reachable_by(bf, bp->out[0], bt)) + return TRUE; + if (bp->out[1] && bp->out[1] != bp && + bb_reachable_by(bf, bp->out[1], bt)) + return TRUE; + return FALSE; +} + +/* The live set of a block contains those values that are live immediately + * before the beginning of the block. 
+ */ +static int +pass_build_live_sets(struct nv_pc_pass *ctx, struct nv_basic_block *b) +{ + struct nv_instruction *i; + int j, n, ret = 0; + + /* slight hack for undecidedness: set phi = entry if it's undefined */ + if (!b->phi) + b->phi = b->entry; + + for (n = 0; n < 2; ++n) { + if (!b->out[n] || b->out[n] == b) + continue; + ret = pass_build_live_sets(ctx, b->out[n]); + if (ret) + return ret; + + if (n == 0) { + for (j = 0; j < (ctx->pc->num_values + 31) / 32; ++j) + b->live_set[j] = b->out[n]->live_set[j]; + } else { + for (j = 0; j < (ctx->pc->num_values + 31) / 32; ++j) + b->live_set[j] |= b->out[n]->live_set[j]; + } + + /* Kick values out of our live set that are created in incoming + * blocks of our successors that are not us. + */ + for (i = b->out[n]->phi; i && i->opcode == NV_OP_PHI; i = i->next) { + for (j = 0; j < 4; ++j) { + if (!i->src[j]) + break; + assert(i->src[j]->value->insn); + + if (bb_reachable_by(b, i->src[j]->value->insn->bb, b->out[n])) { + live_set_add(b, i->src[j]->value); + debug_printf("%p: live set + %i\n", b, i->src[j]->value->n); + } else { + live_set_rem(b, i->src[j]->value); + debug_printf("%p: live set - %i\n", b, i->src[j]->value->n); + } + } + } + } + + if (b->pass_seq >= ctx->pc->pass_seq) + return 0; + b->pass_seq = ctx->pc->pass_seq; + + debug_printf("%s: visiting block %p\n", __FUNCTION__, b); + + if (!b->entry) + return 0; + bb_live_set_print(ctx->pc, b); + + for (i = b->exit; i; i = i->prev) { + for (j = 0; j < 4; j++) { + if (!i->def[j]) + break; + live_set_rem(b, i->def[j]); + } + for (j = 0; j < 4; j++) { + if (!i->src[j]) + break; + live_set_add(b, i->src[j]->value); + } + if (i->src[4]) + live_set_add(b, i->src[4]->value); + if (i->flags_def) + live_set_rem(b, i->flags_def); + if (i->flags_src) + live_set_add(b, i->flags_src->value); + } + bb_live_set_print(ctx->pc, b); + + return 0; +} + +static void collect_live_values(struct nv_basic_block *b, const int n) +{ + int i; + + if (b->out[0]) { + if (b->out[1]) { /* 
what to do about back-edges ? */ + for (i = 0; i < n; ++i) + b->live_set[i] = b->out[0]->live_set[i] | b->out[1]->live_set[i]; + } else { + memcpy(b->live_set, b->out[0]->live_set, n * sizeof(uint32_t)); + } + } else + if (b->out[1]) { + memcpy(b->live_set, b->out[1]->live_set, n * sizeof(uint32_t)); + } else { + memset(b->live_set, 0, n * sizeof(uint32_t)); + } +} + +/* NOTE: the live intervals of phi functions start the the first non-phi instruction */ +static int +pass_build_intervals(struct nv_pc_pass *ctx, struct nv_basic_block *b) +{ + struct nv_instruction *i, *i_stop; + int j, s; + const int n = (ctx->pc->num_values + 31) / 32; + + debug_printf("building intervals for BB %i\n", b->id); + + /* verify that first block does not have live-in values */ + if (b->num_in == 0) + for (j = 0; j < n; ++j) + assert(b->live_set[j] == 0); + + collect_live_values(b, n); + + /* remove live-outs def'd in a parallel block, hopefully they're all phi'd */ + for (j = 0; j < 2; ++j) { + if (!b->out[j] || !b->out[j]->phi) + continue; + for (i = b->out[j]->phi; i->opcode == NV_OP_PHI; i = i->next) { + live_set_rem(b, i->def[0]); + + for (s = 0; s < 4; ++s) { + if (!i->src[s]) + break; + assert(i->src[s]->value->insn); + if (bb_reachable_by(b, i->src[s]->value->insn->bb, b->out[j])) + live_set_add(b, i->src[s]->value); + else + live_set_rem(b, i->src[s]->value); + } + } + } + + /* remaining live-outs are live until the end */ + for (j = 0; j < ctx->pc->num_values; ++j) { + if (!(b->live_set[j / 32] & (1 << (j % 32)))) + continue; +#ifdef NV50_RA_DEBUG_LIVEI + debug_printf("adding range for live value %i\n", j); +#endif + add_range(&ctx->pc->values[j], b, b->exit->serial + 1); + } + debug_printf("%s: looping through instructions now\n", __func__); + + i_stop = b->entry ? 
b->entry->prev : NULL; + + /* don't have to include phi functions here (will have 0 live range) */ + for (i = b->exit; i != i_stop; i = i->prev) { + assert(i->serial >= b->phi->serial && i->serial <= b->exit->serial); + for (j = 0; j < 4; ++j) { + if (i->def[j]) + live_set_rem(b, i->def[j]); + } + if (i->flags_def) + live_set_rem(b, i->flags_def); + + for (j = 0; j < 5; ++j) { + if (i->src[j] && !live_set_test(b, i->src[j])) { + live_set_add(b, i->src[j]->value); +#ifdef NV50_RA_DEBUG_LIVEI + debug_printf("adding range for source that ends living: %i\n", + i->src[j]->value->n); +#endif + add_range(i->src[j]->value, b, i->serial); + } + } + if (i->flags_src && !live_set_test(b, i->flags_src)) { + live_set_add(b, i->flags_src->value); +#ifdef NV50_RA_DEBUG_LIVEI + debug_printf("adding range for source that ends living: %i\n", + i->flags_src->value->n); +#endif + add_range(i->flags_src->value, b, i->serial); + } + } + + b->pass_seq = ctx->pc->pass_seq; + + if (b->out[0] && b->out[0]->pass_seq < ctx->pc->pass_seq) + pass_build_intervals(ctx, b->out[0]); + + if (b->out[1] && b->out[1]->pass_seq < ctx->pc->pass_seq) + pass_build_intervals(ctx, b->out[1]); + + debug_printf("built intervals for block %p\n", b); + + return 0; +} + +static INLINE void +nv50_ctor_register_set(struct nv_pc *pc, struct register_set *set) +{ + memset(set, 0, sizeof(*set)); + + set->last[NV_FILE_GPR] = 255; + set->last[NV_FILE_OUT] = 127; + set->last[NV_FILE_FLAGS] = 4; + set->last[NV_FILE_ADDR] = 4; + + set->pc = pc; +} + +static void +insert_ordered_tail(struct nv_value *list, struct nv_value *nval) +{ + struct nv_value *elem = list->prev; + + // debug_printf("inserting value %i\n", nval->n); + + for (elem = list->prev; + elem != list && elem->livei->bgn > nval->livei->bgn; + elem = elem->prev); + /* now elem begins before or at the same time as val */ + + nval->prev = elem; + nval->next = elem->next; + elem->next->prev = nval; + elem->next = nval; +} + +static int +pass_linear_scan(struct 
nv_pc_pass *ctx, int iter) +{ + struct nv_instruction *i; + struct register_set f, free; + int k, n; + struct nv_value *cur, *val, *tmp[2]; + struct nv_value active, inactive, handled, unhandled; + + make_empty_list(&active); + make_empty_list(&inactive); + make_empty_list(&handled); + make_empty_list(&unhandled); + + nv50_ctor_register_set(ctx->pc, &free); + + /* joined values should have range = NULL and thus not be added; + * also, fixed memory values won't be added because they're not + * def'd, just used + */ + for (n = 0; n < ctx->num_insns; ++n) { + i = ctx->insns[n]; + + for (k = 0; k < 4; ++k) { + if (i->def[k] && i->def[k]->livei) + insert_ordered_tail(&unhandled, i->def[k]); + else + if (0 && i->def[k]) + debug_printf("skipping def'd value %i: no livei\n", i->def[k]->n); + } + if (i->flags_def && i->flags_def->livei) + insert_ordered_tail(&unhandled, i->flags_def); + } + + for (val = unhandled.next; val != unhandled.prev; val = val->next) { + assert(val->join == val); + assert(val->livei->bgn <= val->next->livei->bgn); + } + + foreach_s(cur, tmp[0], &unhandled) { + remove_from_list(cur); + + /* debug_printf("handling value %i\n", cur->n); */ + + foreach_s(val, tmp[1], &active) { + if (livei_end(val) <= cur->livei->bgn) { + reg_release(&free, val); + move_to_head(&handled, val); + } else + if (!livei_contains(val, cur->livei->bgn)) { + reg_release(&free, val); + move_to_head(&inactive, val); + } + } + + foreach_s(val, tmp[1], &inactive) { + if (livei_end(val) <= cur->livei->bgn) + move_to_head(&handled, val); + else + if (livei_contains(val, cur->livei->bgn)) { + reg_occupy(&free, val); + move_to_head(&active, val); + } + } + + f = free; + + foreach(val, &inactive) + if (livei_have_overlap(val, cur)) + reg_occupy(&f, val); + + foreach(val, &unhandled) + if (val->reg.id >= 0 && livei_have_overlap(val, cur)) + reg_occupy(&f, val); + + if (cur->reg.id < 0) { + boolean mem = FALSE; + + if (nv_is_vector_op(cur->insn->opcode)) + mem = !reg_assign(&f, 
&cur->insn->def[0], 4); + else + if (iter) + mem = !reg_assign(&f, &cur, 1); + + if (mem) { + NOUVEAU_ERR("out of registers\n"); + abort(); + } + } + insert_at_head(&active, cur); + reg_occupy(&free, cur); + } + + return 0; +} + +static int +pass_eliminate_moves(struct nv_pc_pass *ctx) +{ + return 0; +} + +int +nv_pc_exec_pass1(struct nv_pc *pc) +{ + struct nv_pc_pass *ctx; + int i, ret; + + debug_printf("REGISTER ALLOCATION - entering\n"); + + ctx = CALLOC_STRUCT(nv_pc_pass); + if (!ctx) + return -1; + ctx->pc = pc; + + nv_print_program(ctx->pc->root); + + ctx->insns = CALLOC(pc->num_instructions, sizeof(struct nv_instruction *)); + + pc->pass_seq++; + ret = pass_generate_phi_movs(ctx, pc->root); + assert(!ret); + + nv_print_program(ctx->pc->root); + + for (i = 0; i < pc->loop_nesting_bound; ++i) { + pc->pass_seq++; + ret = pass_build_live_sets(ctx, pc->root); + assert(!ret && "live sets"); + if (ret) { + NOUVEAU_ERR("failed to build live sets (iteration %d)\n", i); + goto out; + } + } + + pc->pass_seq++; + ret = pass_order_instructions(ctx, pc->root); + assert(!ret && "order instructions"); + if (ret) + goto out; + + pc->pass_seq++; + ret = pass_build_intervals(ctx, pc->root); + assert(!ret && "build intervals"); + if (ret) { + NOUVEAU_ERR("failed to build live intervals\n"); + goto out; + } + + for (i = 0; i < 2; ++i) { + ret = pass_join_values(ctx, i); + if (ret) + goto out; + ret = pass_linear_scan(ctx, i); + if (ret) + goto out; + } + assert(!ret && "joining"); + + ret = pass_eliminate_moves(ctx); + + for (i = 0; i < pc->num_values; ++i) + livei_release(&pc->values[i]); + + debug_printf("REGISTER ALLOCATION - leaving\n"); + nv_print_program(ctx->pc->root); + +out: + FREE(ctx); + return ret; +} diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c index 8cb1639013..26d1be8db8 100644 --- a/src/gallium/drivers/nv50/nv50_program.c +++ b/src/gallium/drivers/nv50/nv50_program.c @@ -1,5 +1,5 @@ /* - * Copyright 2008 Ben 
Skeggs + * Copyright 2010 Christoph Bumiller * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -20,4674 +20,553 @@ * SOFTWARE. */ -#include "pipe/p_context.h" -#include "pipe/p_defines.h" -#include "pipe/p_state.h" -#include "util/u_inlines.h" +#include "nv50_program.h" +#include "nv50_pc.h" +#include "nv50_context.h" #include "pipe/p_shader_tokens.h" #include "tgsi/tgsi_parse.h" #include "tgsi/tgsi_util.h" -#include "nv50_context.h" -#include "nv50_transfer.h" - -#define NV50_SU_MAX_TEMP 127 -#define NV50_SU_MAX_ADDR 4 -//#define NV50_PROGRAM_DUMP - -/* $a5 and $a6 always seem to be 0, and using $a7 gives you noise */ - -/* ARL - gallium craps itself on progs/vp/arl.txt - * - * MSB - Like MAD, but MUL+SUB - * - Fuck it off, introduce a way to negate args for ops that - * support it. - * - * Look into inlining IMMD for ops other than MOV (make it general?) - * - Maybe even relax restrictions a bit, can't do P_RESULT + P_IMMD, - * but can emit to P_TEMP first - then MOV later. NVIDIA does this - * - * In ops such as ADD it's possible to construct a bad opcode in the !is_long() - * case, if the emit_src() causes the inst to suddenly become long. - * - * Verify half-insns work where expected - and force disable them where they - * don't work - MUL has it forcibly disabled atm as it fixes POW.. - * - * FUCK! watch dst==src vectors, can overwrite components that are needed. - * ie. SUB R0, R0.yzxw, R0 - * - * Things to check with renouveau: - * FP attr/result assignment - how? 
- * attrib - * - 0x16bc maps vp output onto fp hpos - * - 0x16c0 maps vp output onto fp col0 - * result - * - colr always 0-3 - * - depr always 4 - * 0x16bc->0x16e8 --> some binding between vp/fp regs - * 0x16b8 --> VP output count - * - * 0x1298 --> "MOV rcol.x, fcol.y" "MOV depr, fcol.y" = 0x00000005 - * "MOV rcol.x, fcol.y" = 0x00000004 - * 0x19a8 --> as above but 0x00000100 and 0x00000000 - * - 0x00100000 used when KIL used - * 0x196c --> as above but 0x00000011 and 0x00000000 - * - * 0x1988 --> 0xXXNNNNNN - * - XX == FP high something - */ -struct nv50_reg { - enum { - P_TEMP, - P_ATTR, - P_RESULT, - P_CONST, - P_IMMD, - P_ADDR - } type; - int index; - - int hw; - int mod; - - int rhw; /* result hw for FP outputs, or interpolant index */ - int acc; /* instruction where this reg is last read (first insn == 1) */ - - int vtx; /* vertex index, for GP inputs (TGSI Dimension.Index) */ - int indirect[2]; /* index into pc->addr, or -1 */ - - ubyte buf_index; /* c{0 .. 15}[] or g{0 .. 15}[] */ -}; - -#define NV50_MOD_NEG 1 -#define NV50_MOD_ABS 2 -#define NV50_MOD_NEG_ABS (NV50_MOD_NEG | NV50_MOD_ABS) -#define NV50_MOD_SAT 4 -#define NV50_MOD_I32 8 - -/* NV50_MOD_I32 is used to indicate integer mode for neg/abs */ - -/* STACK: Conditionals and loops have to use the (per warp) stack. - * Stack entries consist of an entry type (divergent path, join at), - * a mask indicating the active threads of the warp, and an address. - * MPs can store 12 stack entries internally, if we need more (and - * we probably do), we have to create a stack buffer in VRAM. 
- */ -/* impose low limits for now */ -#define NV50_MAX_COND_NESTING 4 -#define NV50_MAX_LOOP_NESTING 3 - -#define JOIN_ON(e) e; pc->p->exec_tail->inst[1] |= 2 - -struct nv50_pc { - struct nv50_program *p; - - /* hw resources */ - struct nv50_reg *r_temp[NV50_SU_MAX_TEMP]; - struct nv50_reg r_addr[NV50_SU_MAX_ADDR]; - - /* tgsi resources */ - struct nv50_reg *temp; - int temp_nr; - struct nv50_reg *attr; - int attr_nr; - struct nv50_reg *result; - int result_nr; - struct nv50_reg *param; - int param_nr; - struct nv50_reg *immd; - uint32_t *immd_buf; - int immd_nr; - struct nv50_reg **addr; - int addr_nr; - struct nv50_reg *sysval; - int sysval_nr; - - struct nv50_reg *temp_temp[16]; - struct nv50_program_exec *temp_temp_exec[16]; - unsigned temp_temp_nr; - - /* broadcast and destination replacement regs */ - struct nv50_reg *r_brdc; - struct nv50_reg *r_dst[4]; - - struct nv50_reg reg_instances[16]; - unsigned reg_instance_nr; - - unsigned interp_mode[32]; - /* perspective interpolation registers */ - struct nv50_reg *iv_p; - struct nv50_reg *iv_c; - - struct nv50_program_exec *if_insn[NV50_MAX_COND_NESTING]; - struct nv50_program_exec *if_join[NV50_MAX_COND_NESTING]; - struct nv50_program_exec *loop_brka[NV50_MAX_LOOP_NESTING]; - int if_lvl, loop_lvl; - unsigned loop_pos[NV50_MAX_LOOP_NESTING]; - - unsigned *insn_pos; /* actual program offset of each TGSI insn */ - boolean in_subroutine; - - /* current instruction and total number of insns */ - unsigned insn_cur; - unsigned insn_nr; - - boolean allow32; - - uint8_t edgeflag_out; -}; - -static struct nv50_reg *get_address_reg(struct nv50_pc *, struct nv50_reg *); - -static INLINE void -ctor_reg(struct nv50_reg *reg, unsigned type, int index, int hw) -{ - reg->type = type; - reg->index = index; - reg->hw = hw; - reg->mod = 0; - reg->rhw = -1; - reg->vtx = -1; - reg->acc = 0; - reg->indirect[0] = reg->indirect[1] = -1; - reg->buf_index = (type == P_CONST) ? 
1 : 0; -} - static INLINE unsigned -popcnt4(uint32_t val) -{ - static const unsigned cnt[16] - = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 }; - return cnt[val & 0xf]; -} - -static void -terminate_mbb(struct nv50_pc *pc) -{ - int i; - - /* remove records of temporary address register values */ - for (i = 0; i < NV50_SU_MAX_ADDR; ++i) - if (pc->r_addr[i].index < 0) - pc->r_addr[i].acc = 0; -} - -static void -alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg) -{ - int i = 0; - - if (reg->type == P_RESULT) { - if (pc->p->cfg.high_result < (reg->hw + 1)) - pc->p->cfg.high_result = reg->hw + 1; - } - - if (reg->type != P_TEMP) - return; - - if (reg->hw >= 0) { - /*XXX: do this here too to catch FP temp-as-attr usage.. - * not clean, but works */ - if (pc->p->cfg.high_temp < (reg->hw + 1)) - pc->p->cfg.high_temp = reg->hw + 1; - return; - } - - if (reg->rhw != -1) { - /* try to allocate temporary with index rhw first */ - if (!(pc->r_temp[reg->rhw])) { - pc->r_temp[reg->rhw] = reg; - reg->hw = reg->rhw; - if (pc->p->cfg.high_temp < (reg->rhw + 1)) - pc->p->cfg.high_temp = reg->rhw + 1; - return; - } - /* make sure we don't get things like $r0 needs to go - * in $r1 and $r1 in $r0 - */ - i = pc->result_nr * 4; - } - - for (; i < NV50_SU_MAX_TEMP; i++) { - if (!(pc->r_temp[i])) { - pc->r_temp[i] = reg; - reg->hw = i; - if (pc->p->cfg.high_temp < (i + 1)) - pc->p->cfg.high_temp = i + 1; - return; - } - } - - NOUVEAU_ERR("out of registers\n"); - abort(); -} - -static INLINE struct nv50_reg * -reg_instance(struct nv50_pc *pc, struct nv50_reg *reg) +bitcount4(const uint32_t val) { - struct nv50_reg *ri; - - assert(pc->reg_instance_nr < 16); - ri = &pc->reg_instances[pc->reg_instance_nr++]; - if (reg) { - alloc_reg(pc, reg); - *ri = *reg; - reg->indirect[0] = reg->indirect[1] = -1; - reg->mod = 0; - } - return ri; + static const unsigned cnt[16] + = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 }; + return cnt[val & 0xf]; } -/* XXX: For shaders that aren't executed 
linearly (e.g. shaders that - * contain loops), we need to assign all hw regs to TGSI TEMPs early, - * lest we risk temp_temps overwriting regs alloc'd "later". - */ -static struct nv50_reg * -alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst) -{ - struct nv50_reg *r; - int i; +static unsigned +nv50_tgsi_src_mask(const struct tgsi_full_instruction *inst, int c) +{ + unsigned mask = inst->Dst[0].Register.WriteMask; + + switch (inst->Instruction.Opcode) { + case TGSI_OPCODE_COS: + case TGSI_OPCODE_SIN: + return (mask & 0x8) | ((mask & 0x7) ? 0x1 : 0x0); + case TGSI_OPCODE_DP3: + return 0x7; + case TGSI_OPCODE_DP4: + case TGSI_OPCODE_DPH: + case TGSI_OPCODE_KIL: /* WriteMask ignored */ + return 0xf; + case TGSI_OPCODE_DST: + return mask & (c ? 0xa : 0x6); + case TGSI_OPCODE_EX2: + case TGSI_OPCODE_EXP: + case TGSI_OPCODE_LG2: + case TGSI_OPCODE_LOG: + case TGSI_OPCODE_POW: + case TGSI_OPCODE_RCP: + case TGSI_OPCODE_RSQ: + case TGSI_OPCODE_SCS: + return 0x1; + case TGSI_OPCODE_IF: + return 0x1; + case TGSI_OPCODE_LIT: + return 0xb; + case TGSI_OPCODE_TEX: + case TGSI_OPCODE_TXB: + case TGSI_OPCODE_TXL: + case TGSI_OPCODE_TXP: + { + const struct tgsi_instruction_texture *tex; + + assert(inst->Instruction.Texture); + tex = &inst->Texture; + + mask = 0x7; + if (inst->Instruction.Opcode != TGSI_OPCODE_TEX && + inst->Instruction.Opcode != TGSI_OPCODE_TXD) + mask |= 0x8; /* bias, lod or proj */ + + switch (tex->Texture) { + case TGSI_TEXTURE_1D: + mask &= 0x9; + break; + case TGSI_TEXTURE_SHADOW1D: + mask &= 0x5; + break; + case TGSI_TEXTURE_2D: + mask &= 0xb; + break; + default: + break; + } + } + return mask; + case TGSI_OPCODE_XPD: + { + unsigned x = 0; + if (mask & 1) x |= 0x6; + if (mask & 2) x |= 0x5; + if (mask & 4) x |= 0x3; + return x; + } + default: + break; + } + + return mask; +} + +static void +nv50_indirect_inputs(struct nv50_translation_info *ti, int id) +{ + int i, c; + + for (i = 0; i < PIPE_MAX_SHADER_INPUTS; ++i) + for (c = 0; c < 4; ++c) + 
ti->input_access[i][c] = id; + + ti->indirect_inputs = TRUE; +} + +static void +nv50_indirect_outputs(struct nv50_translation_info *ti, int id) +{ + int i, c; + + for (i = 0; i < PIPE_MAX_SHADER_OUTPUTS; ++i) + for (c = 0; c < 4; ++c) + ti->output_access[i][c] = id; + + ti->indirect_outputs = TRUE; +} + +static void +prog_inst(struct nv50_translation_info *ti, + const struct tgsi_full_instruction *inst, int id) +{ + const struct tgsi_dst_register *dst; + const struct tgsi_src_register *src; + int s, c, k; + unsigned mask; + + if (inst->Dst[0].Register.File == TGSI_FILE_OUTPUT) { + for (c = 0; c < 4; ++c) { + dst = &inst->Dst[0].Register; + if (inst->Dst[0].Register.Indirect) + nv50_indirect_outputs(ti, id); + if (!(dst->WriteMask & (1 << c))) + continue; + ti->output_access[dst->Index][c] = id; + } + + if (inst->Instruction.Opcode == TGSI_OPCODE_MOV && + inst->Src[0].Register.File == TGSI_FILE_INPUT && + dst->Index == ti->edgeflag_out) + ti->p->vp.edgeflag = inst->Src[0].Register.Index; + } - if (dst && dst->type == P_TEMP && dst->hw == -1) - return dst; + for (s = 0; s < inst->Instruction.NumSrcRegs; ++s) { + src = &inst->Src[s].Register; + if (src->File != TGSI_FILE_INPUT) + continue; + mask = nv50_tgsi_src_mask(inst, s); - for (i = 0; i < NV50_SU_MAX_TEMP; i++) { - if (!pc->r_temp[i]) { - r = MALLOC_STRUCT(nv50_reg); - ctor_reg(r, P_TEMP, -1, i); - pc->r_temp[i] = r; - return r; - } - } + if (inst->Src[s].Register.Indirect) + nv50_indirect_inputs(ti, id); - NOUVEAU_ERR("out of registers\n"); - abort(); - return NULL; + for (c = 0; c < 4; ++c) { + if (!(mask & (1 << c))) + continue; + k = tgsi_util_get_full_src_register_swizzle(&inst->Src[s], c); + if (k <= TGSI_SWIZZLE_W) + ti->input_access[src->Index][k] = id; + } + } } -/* release the hardware resource held by r */ static void -release_hw(struct nv50_pc *pc, struct nv50_reg *r) +prog_immediate(struct nv50_translation_info *ti, + const struct tgsi_full_immediate *imm) { - assert(r->type == P_TEMP); - if (r->hw 
== -1) - return; + int c; + unsigned n = ++ti->immd32_nr; - assert(pc->r_temp[r->hw] == r); - pc->r_temp[r->hw] = NULL; + if (n == (1 << (ffs(n) - 1))) + ti->immd32 = REALLOC(ti->immd32, (n / 2) * 16, (n * 2) * 16); - r->acc = 0; - if (r->index == -1) - FREE(r); + for (c = 0; c < 4; ++c) + ti->immd32[(n - 1) * 4 + c] = imm->u[c].Uint; } -static void -free_temp(struct nv50_pc *pc, struct nv50_reg *r) -{ - if (r->index == -1) { - unsigned hw = r->hw; - - FREE(pc->r_temp[hw]); - pc->r_temp[hw] = NULL; - } +static INLINE unsigned +translate_interpolate(const struct tgsi_full_declaration *decl) +{ + unsigned mode; + + if (decl->Declaration.Interpolate == TGSI_INTERPOLATE_CONSTANT) + mode = NV50_INTERP_FLAT; + else + if (decl->Declaration.Interpolate == TGSI_INTERPOLATE_PERSPECTIVE) + mode = 0; + else + mode = NV50_INTERP_LINEAR; + + if (decl->Declaration.Centroid) + mode |= NV50_INTERP_CENTROID; + + return mode; +} + +static void +prog_decl(struct nv50_translation_info *ti, + const struct tgsi_full_declaration *decl) +{ + unsigned i, first, last, sn = 0, si = 0; + + first = decl->Range.First; + last = decl->Range.Last; + + if (decl->Declaration.Semantic) { + sn = decl->Semantic.Name; + si = decl->Semantic.Index; + } + tgsi_dump_declaration(decl); + + switch (decl->Declaration.File) { + case TGSI_FILE_INPUT: + for (i = first; i <= last; ++i) + ti->interp_mode[i] = translate_interpolate(decl); + + if (!decl->Declaration.Semantic) + break; + + for (i = first; i <= last; ++i) { + ti->p->in[i].sn = sn; + ti->p->in[i].si = si; + } + + switch (sn) { + case TGSI_SEMANTIC_FACE: + break; + case TGSI_SEMANTIC_COLOR: + if (ti->p->type == PIPE_SHADER_FRAGMENT) + ti->p->vp.bfc[si] = first; + break; + } + break; + case TGSI_FILE_OUTPUT: + if (!decl->Declaration.Semantic) + break; + + for (i = first; i <= last; ++i) { + ti->p->out[i].sn = sn; + ti->p->out[i].si = si; + } + + switch (sn) { + case TGSI_SEMANTIC_BCOLOR: + ti->p->vp.bfc[si] = first; + break; + case TGSI_SEMANTIC_PSIZE: + 
ti->p->vp.psiz = first; + break; + case TGSI_SEMANTIC_EDGEFLAG: + ti->edgeflag_out = first; + break; + default: + break; + } + break; + case TGSI_FILE_SYSTEM_VALUE: + switch (decl->Semantic.Name) { + case TGSI_SEMANTIC_FACE: + break; + case TGSI_SEMANTIC_INSTANCEID: + break; + case TGSI_SEMANTIC_PRIMID: + break; + /* + case TGSI_SEMANTIC_PRIMIDIN: + break; + case TGSI_SEMANTIC_VERTEXID: + break; + */ + default: + break; + } + break; + case TGSI_FILE_CONSTANT: + ti->p->parm_size = MAX2(ti->p->parm_size, (last + 1) * 16); + break; + case TGSI_FILE_ADDRESS: + case TGSI_FILE_SAMPLER: + case TGSI_FILE_TEMPORARY: + break; + default: + assert(0); + break; + } } static int -alloc_temp4(struct nv50_pc *pc, struct nv50_reg *dst[4], int idx) +nv50_vertprog_prepare(struct nv50_translation_info *ti) { - int i; - - if ((idx + 4) >= NV50_SU_MAX_TEMP) - return 1; - - if (pc->r_temp[idx] || pc->r_temp[idx + 1] || - pc->r_temp[idx + 2] || pc->r_temp[idx + 3]) - return alloc_temp4(pc, dst, idx + 4); + struct nv50_program *p = ti->p; + int i, c; + unsigned num_inputs = 0; - for (i = 0; i < 4; i++) { - dst[i] = MALLOC_STRUCT(nv50_reg); - ctor_reg(dst[i], P_TEMP, -1, idx + i); - pc->r_temp[idx + i] = dst[i]; - } + ti->input_file = NV_FILE_MEM_S; + ti->output_file = NV_FILE_OUT; - return 0; -} - -static void -free_temp4(struct nv50_pc *pc, struct nv50_reg *reg[4]) -{ - int i; + for (i = 0; i <= ti->scan.file_max[TGSI_FILE_INPUT]; ++i) { + p->in[i].id = i; + p->in[i].hw = num_inputs; - for (i = 0; i < 4; i++) - free_temp(pc, reg[i]); -} + for (c = 0; c < 4; ++c) { + if (!ti->input_access[i][c]) + continue; + ti->input_map[i][c] = num_inputs++; + p->vp.attrs[(4 * i + c) / 32] |= 1 << ((i * 4 + c) % 32); + } + } -static struct nv50_reg * -temp_temp(struct nv50_pc *pc, struct nv50_program_exec *e) -{ - if (pc->temp_temp_nr >= 16) - assert(0); + for (i = 0; i <= ti->scan.file_max[TGSI_FILE_OUTPUT]; ++i) { + p->out[i].id = i; + p->out[i].hw = p->max_out; - pc->temp_temp[pc->temp_temp_nr] = 
alloc_temp(pc, NULL); - pc->temp_temp_exec[pc->temp_temp_nr] = e; - return pc->temp_temp[pc->temp_temp_nr++]; -} + for (c = 0; c < 4; ++c) { + if (!ti->output_access[i][c]) + continue; + ti->output_map[i][c] = p->max_out++; + p->out[i].mask |= 1 << c; + } + } -/* This *must* be called for all nv50_program_exec that have been - * given as argument to temp_temp, or the temps will be leaked ! - */ -static void -kill_temp_temp(struct nv50_pc *pc, struct nv50_program_exec *e) -{ - int i; + if (p->vp.psiz < 0x40) + p->vp.psiz = p->out[p->vp.psiz].hw; - for (i = 0; i < pc->temp_temp_nr; i++) - if (pc->temp_temp_exec[i] == e) - free_temp(pc, pc->temp_temp[i]); - if (!e) - pc->temp_temp_nr = 0; + return 0; } static int -ctor_immd_4u32(struct nv50_pc *pc, - uint32_t x, uint32_t y, uint32_t z, uint32_t w) -{ - unsigned size = pc->immd_nr * 4 * sizeof(uint32_t); - - pc->immd_buf = REALLOC(pc->immd_buf, size, size + 4 * sizeof(uint32_t)); - - pc->immd_buf[(pc->immd_nr * 4) + 0] = x; - pc->immd_buf[(pc->immd_nr * 4) + 1] = y; - pc->immd_buf[(pc->immd_nr * 4) + 2] = z; - pc->immd_buf[(pc->immd_nr * 4) + 3] = w; - - return pc->immd_nr++; -} - -static INLINE int -ctor_immd_4f32(struct nv50_pc *pc, float x, float y, float z, float w) -{ - return ctor_immd_4u32(pc, fui(x), fui(y), fui(z), fui(w)); -} - -static struct nv50_reg * -alloc_immd(struct nv50_pc *pc, float f) -{ - struct nv50_reg *r = MALLOC_STRUCT(nv50_reg); - unsigned hw; - - for (hw = 0; hw < pc->immd_nr * 4; hw++) - if (pc->immd_buf[hw] == fui(f)) - break; - - if (hw == pc->immd_nr * 4) - hw = ctor_immd_4f32(pc, f, -f, 0.5 * f, 0) * 4; - - ctor_reg(r, P_IMMD, -1, hw); - return r; -} - -static struct nv50_program_exec * -exec(struct nv50_pc *pc) -{ - struct nv50_program_exec *e = CALLOC_STRUCT(nv50_program_exec); - - e->param.index = -1; - return e; -} - -static void -emit(struct nv50_pc *pc, struct nv50_program_exec *e) -{ - struct nv50_program *p = pc->p; - - if (p->exec_tail) - p->exec_tail->next = e; - if 
(!p->exec_head) - p->exec_head = e; - p->exec_tail = e; - p->exec_size += (e->inst[0] & 1) ? 2 : 1; - - kill_temp_temp(pc, e); -} - -static INLINE void set_long(struct nv50_pc *, struct nv50_program_exec *); - -static boolean -is_long(struct nv50_program_exec *e) -{ - if (e->inst[0] & 1) - return TRUE; - return FALSE; -} - -static boolean -is_immd(struct nv50_program_exec *e) -{ - if (is_long(e) && (e->inst[1] & 3) == 3) - return TRUE; - return FALSE; -} - -static boolean -is_join(struct nv50_program_exec *e) -{ - if (is_long(e) && (e->inst[1] & 3) == 2) - return TRUE; - return FALSE; -} - -static INLINE boolean -is_control_flow(struct nv50_program_exec *e) -{ - return (e->inst[0] & 2); -} - -static INLINE void -set_pred(struct nv50_pc *pc, unsigned pred, unsigned idx, - struct nv50_program_exec *e) -{ - assert(!is_immd(e)); - set_long(pc, e); - e->inst[1] &= ~((0x1f << 7) | (0x3 << 12)); - e->inst[1] |= (pred << 7) | (idx << 12); -} - -static INLINE void -set_pred_wr(struct nv50_pc *pc, unsigned on, unsigned idx, - struct nv50_program_exec *e) -{ - set_long(pc, e); - e->inst[1] &= ~((0x3 << 4) | (1 << 6)); - e->inst[1] |= (idx << 4) | (on << 6); -} - -static INLINE void -set_long(struct nv50_pc *pc, struct nv50_program_exec *e) -{ - if (is_long(e)) - return; - - e->inst[0] |= 1; - set_pred(pc, 0xf, 0, e); - set_pred_wr(pc, 0, 0, e); -} - -static INLINE void -set_dst(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_program_exec *e) -{ - if (dst->type == P_RESULT) { - set_long(pc, e); - e->inst[1] |= 0x00000008; - } - - alloc_reg(pc, dst); - if (dst->hw > 63) - set_long(pc, e); - e->inst[0] |= (dst->hw << 2); +nv50_fragprog_prepare(struct nv50_translation_info *ti) +{ + struct nv50_program *p = ti->p; + int i, j, c; + unsigned nvary, nintp, depr; + unsigned n = 0, m = 0, skip = 0; + ubyte sn[16], si[16]; + + /* FP flags */ + + if (ti->scan.writes_z) { + p->fp.flags[1] = 0x11; + p->fp.flags[0] |= NV50TCL_FP_CONTROL_EXPORTS_Z; + } + + if (ti->scan.uses_kill) + 
p->fp.flags[0] |= NV50TCL_FP_CONTROL_USES_KIL; + + /* FP inputs */ + + ti->input_file = NV_FILE_MEM_V; + ti->output_file = NV_FILE_GPR; + + /* count non-flat inputs, save semantic info */ + for (i = 0; i < p->in_nr; ++i) { + m += (ti->interp_mode[i] & NV50_INTERP_FLAT) ? 0 : 1; + sn[i] = p->in[i].sn; + si[i] = p->in[i].si; + } + + /* reorder p->in[] so that non-flat inputs are first and + * kick out special inputs that don't use VP/GP_RESULT_MAP + */ + nintp = 0; + for (i = 0; i < p->in_nr; ++i) { + if (sn[i] == TGSI_SEMANTIC_POSITION) { + for (c = 0; c < 4; ++c) { + ti->input_map[i][c] = nintp; + if (ti->input_access[i][c]) { + p->fp.interp |= 1 << (24 + c); + ++nintp; + } + } + skip++; + continue; + } else + if (sn[i] == TGSI_SEMANTIC_FACE) { + ti->input_map[i][0] = 255; + skip++; + continue; + } + + j = (ti->interp_mode[i] & NV50_INTERP_FLAT) ? m++ : n++; + + if (sn[i] == TGSI_SEMANTIC_COLOR) + p->vp.bfc[si[i]] = j; + + p->in[j].linear = (ti->interp_mode[i] & NV50_INTERP_LINEAR) ? 1 : 0; + p->in[j].id = i; + p->in[j].sn = sn[i]; + p->in[j].si = si[i]; + } + assert(n <= m); + p->in_nr -= skip; + + if (!(p->fp.interp & (8 << 24))) { + p->fp.interp |= (8 << 24); + ++nintp; + } + + p->fp.colors = (1 << 24) | 4; /* CLAMP, FFC0_ID = 4 */ + + for (i = 0; i < p->in_nr; ++i) { + int j = p->in[i].id; + p->in[i].hw = nintp; + + for (c = 0; c < 4; ++c) { + if (!ti->input_access[j][c]) + continue; + p->in[i].mask |= 1 << c; + ti->input_map[j][c] = nintp++; + } + /* count color inputs */ + if (i == p->vp.bfc[0] || i == p->vp.bfc[1]) + p->fp.colors += bitcount4(p->in[i].mask) << 16; + } + nintp -= bitcount4(p->fp.interp >> 24); /* subtract position inputs */ + nvary = nintp; + if (n < m) + nvary -= p->in[n].hw; + + p->fp.interp |= nvary << NV50TCL_FP_INTERPOLANT_CTRL_COUNT_NONFLAT_SHIFT; + p->fp.interp |= nintp << NV50TCL_FP_INTERPOLANT_CTRL_COUNT_SHIFT; + + /* FP outputs */ + + if (p->out_nr > (1 + (ti->scan.writes_z ? 
1 : 0))) + p->fp.flags[0] |= NV50TCL_FP_CONTROL_MULTIPLE_RESULTS; + + depr = p->out_nr; + for (i = 0; i < p->out_nr; ++i) { + p->out[i].id = i; + if (p->out[i].sn == TGSI_SEMANTIC_POSITION) { + depr = i; + continue; + } + p->out[i].hw = p->max_out; + p->out[i].mask = 0xf; + + for (c = 0; c < 4; ++c) + ti->output_map[i][c] = p->max_out++; + } + if (depr < p->out_nr) { + p->out[depr].mask = 0x4; + p->out[depr].hw = p->max_out++; + } + + return 0; } -static INLINE void -set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e) -{ - set_long(pc, e); - /* XXX: can't be predicated - bits overlap; cases where both - * are required should be avoided by using pc->allow32 */ - set_pred(pc, 0, 0, e); - set_pred_wr(pc, 0, 0, e); - - e->inst[1] |= 0x00000002 | 0x00000001; - e->inst[0] |= (pc->immd_buf[imm->hw] & 0x3f) << 16; - e->inst[1] |= (pc->immd_buf[imm->hw] >> 6) << 2; -} - -static INLINE void -set_addr(struct nv50_program_exec *e, struct nv50_reg *a) -{ - assert(a->type == P_ADDR); - - assert(!(e->inst[0] & 0x0c000000)); - assert(!(e->inst[1] & 0x00000004)); - - e->inst[0] |= (a->hw & 3) << 26; - e->inst[1] |= a->hw & 4; -} - -static void -emit_arl(struct nv50_pc *, struct nv50_reg *, struct nv50_reg *, uint8_t); - -static void -emit_shl_imm(struct nv50_pc *, struct nv50_reg *, struct nv50_reg *, int); - -static void -emit_mov_from_addr(struct nv50_pc *pc, struct nv50_reg *dst, - struct nv50_reg *src) -{ - struct nv50_program_exec *e = exec(pc); - - e->inst[1] = 0x40000000; - set_long(pc, e); - set_dst(pc, dst, e); - set_addr(e, src); - - emit(pc, e); -} - -static void -emit_add_addr_imm(struct nv50_pc *pc, struct nv50_reg *dst, - struct nv50_reg *src0, uint16_t src1_val) -{ - struct nv50_program_exec *e = exec(pc); - - e->inst[0] = 0xd0000000 | (src1_val << 9); - e->inst[1] = 0x20000000; - set_long(pc, e); - e->inst[0] |= dst->hw << 2; - if (src0) /* otherwise will add to $a0, which is always 0 */ - set_addr(e, src0); - - emit(pc, e); -} - -#define 
INTERP_LINEAR 0 -#define INTERP_FLAT 1 -#define INTERP_PERSPECTIVE 2 -#define INTERP_CENTROID 4 - -/* interpolant index has been stored in dst->rhw */ -static void -emit_interp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *iv, - unsigned mode) -{ - struct nv50_program_exec *e = exec(pc); - assert(dst->rhw != -1); - - e->inst[0] |= 0x80000000; - set_dst(pc, dst, e); - e->inst[0] |= (dst->rhw << 16); - - if (mode & INTERP_FLAT) { - e->inst[0] |= (1 << 8); - } else { - if (mode & INTERP_PERSPECTIVE) { - e->inst[0] |= (1 << 25); - alloc_reg(pc, iv); - e->inst[0] |= (iv->hw << 9); - } - - if (mode & INTERP_CENTROID) - e->inst[0] |= (1 << 24); - } - - emit(pc, e); -} - -static void -set_data(struct nv50_pc *pc, struct nv50_reg *src, unsigned m, unsigned s, - struct nv50_program_exec *e) -{ - set_long(pc, e); - - e->param.index = src->hw & 127; - e->param.shift = s; - e->param.mask = m << (s % 32); - - if (src->hw < 0 || src->hw > 127) /* need (additional) address reg */ - set_addr(e, get_address_reg(pc, src)); - else - if (src->acc < 0) { - assert(src->type == P_CONST); - set_addr(e, pc->addr[src->indirect[0]]); - } - - e->inst[1] |= (src->buf_index << 22); -} - -/* Never apply nv50_reg::mod in emit_mov, or carefully check the code !!! 
*/ -static void -emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) -{ - struct nv50_program_exec *e = exec(pc); - - e->inst[0] = 0x10000000; - if (!pc->allow32) - set_long(pc, e); - - set_dst(pc, dst, e); - - if (!is_long(e) && src->type == P_IMMD) { - set_immd(pc, src, e); - /*XXX: 32-bit, but steals part of "half" reg space - need to - * catch and handle this case if/when we do half-regs - */ - } else - if (src->type == P_IMMD || src->type == P_CONST) { - set_long(pc, e); - set_data(pc, src, 0x7f, 9, e); - e->inst[1] |= 0x20000000; /* mov from c[] */ - } else { - if (src->type == P_ATTR) { - set_long(pc, e); - e->inst[1] |= 0x00200000; - - if (src->vtx >= 0) { - /* indirect (vertex base + c) load from p[] */ - e->inst[0] |= 0x01800000; - set_addr(e, get_address_reg(pc, src)); - } - } - - alloc_reg(pc, src); - if (src->hw > 63) - set_long(pc, e); - e->inst[0] |= (src->hw << 9); - } - - if (is_long(e) && !is_immd(e)) { - e->inst[1] |= 0x04000000; /* 32-bit */ - e->inst[1] |= 0x0000c000; /* 32-bit c[] load / lane mask 0:1 */ - if (!(e->inst[1] & 0x20000000)) - e->inst[1] |= 0x00030000; /* lane mask 2:3 */ - } else - e->inst[0] |= 0x00008000; - - emit(pc, e); -} - -static INLINE void -emit_mov_immdval(struct nv50_pc *pc, struct nv50_reg *dst, float f) -{ - struct nv50_reg *imm = alloc_immd(pc, f); - emit_mov(pc, dst, imm); - FREE(imm); -} - -/* Assign the hw of the discarded temporary register src - * to the tgsi register dst and free src. 
- */ -static void -assimilate_temp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) -{ - assert(src->index == -1 && src->hw != -1); - - if (pc->if_lvl || pc->loop_lvl || - (dst->type != P_TEMP) || - (src->hw < pc->result_nr * 4 && - pc->p->type == PIPE_SHADER_FRAGMENT) || - pc->p->info.opcode_count[TGSI_OPCODE_CAL] || - pc->p->info.opcode_count[TGSI_OPCODE_BRA]) { - - emit_mov(pc, dst, src); - free_temp(pc, src); - return; - } - - if (dst->hw != -1) - pc->r_temp[dst->hw] = NULL; - pc->r_temp[src->hw] = dst; - dst->hw = src->hw; - - FREE(src); -} - -static void -emit_nop(struct nv50_pc *pc) -{ - struct nv50_program_exec *e = exec(pc); - - e->inst[0] = 0xf0000000; - set_long(pc, e); - e->inst[1] = 0xe0000000; - emit(pc, e); -} - -static boolean -check_swap_src_0_1(struct nv50_pc *pc, - struct nv50_reg **s0, struct nv50_reg **s1) -{ - struct nv50_reg *src0 = *s0, *src1 = *s1; - - if (src0->type == P_CONST) { - if (src1->type != P_CONST) { - *s0 = src1; - *s1 = src0; - return TRUE; - } - } else - if (src1->type == P_ATTR) { - if (src0->type != P_ATTR) { - *s0 = src1; - *s1 = src0; - return TRUE; - } - } - - return FALSE; -} - -static void -set_src_0_restricted(struct nv50_pc *pc, struct nv50_reg *src, - struct nv50_program_exec *e) -{ - struct nv50_reg *temp; - - if (src->type != P_TEMP) { - temp = temp_temp(pc, e); - emit_mov(pc, temp, src); - src = temp; - } - - alloc_reg(pc, src); - if (src->hw > 63) - set_long(pc, e); - e->inst[0] |= (src->hw << 9); -} - -static void -set_src_0(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e) -{ - if (src->type == P_ATTR) { - set_long(pc, e); - e->inst[1] |= 0x00200000; - - if (src->vtx >= 0) { - e->inst[0] |= 0x01800000; /* src from p[] */ - set_addr(e, get_address_reg(pc, src)); - } - } else - if (src->type == P_CONST || src->type == P_IMMD) { - struct nv50_reg *temp = temp_temp(pc, e); - - emit_mov(pc, temp, src); - src = temp; - } - - alloc_reg(pc, src); - if (src->hw > 63) - set_long(pc, 
e); - e->inst[0] |= (src->hw << 9); -} - -static void -set_src_1(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e) -{ - if (src->type == P_ATTR) { - struct nv50_reg *temp = temp_temp(pc, e); - - emit_mov(pc, temp, src); - src = temp; - } else - if (src->type == P_CONST || src->type == P_IMMD) { - if (e->inst[0] & 0x01800000) { - struct nv50_reg *temp = temp_temp(pc, e); - - emit_mov(pc, temp, src); - src = temp; - } else { - assert(!(e->inst[0] & 0x00800000)); - set_data(pc, src, 0x7f, 16, e); - e->inst[0] |= 0x00800000; - } - } - - alloc_reg(pc, src); - if (src->hw > 63) - set_long(pc, e); - e->inst[0] |= ((src->hw & 127) << 16); -} - -static void -set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e) -{ - set_long(pc, e); - - if (src->type == P_ATTR) { - struct nv50_reg *temp = temp_temp(pc, e); - - emit_mov(pc, temp, src); - src = temp; - } else - if (src->type == P_CONST || src->type == P_IMMD) { - if (e->inst[0] & 0x01800000) { - struct nv50_reg *temp = temp_temp(pc, e); - - emit_mov(pc, temp, src); - src = temp; - } else { - assert(!(e->inst[0] & 0x01000000)); - set_data(pc, src, 0x7f, 32+14, e); - e->inst[0] |= 0x01000000; - } - } - - alloc_reg(pc, src); - e->inst[1] |= ((src->hw & 127) << 14); -} - -static void -set_half_src(struct nv50_pc *pc, struct nv50_reg *src, int lh, - struct nv50_program_exec *e, int pos) -{ - struct nv50_reg *r = src; - - alloc_reg(pc, r); - if (r->type != P_TEMP) { - r = temp_temp(pc, e); - emit_mov(pc, r, src); - } - - if (r->hw > (NV50_SU_MAX_TEMP / 2)) { - NOUVEAU_ERR("out of low GPRs\n"); - abort(); - } - - e->inst[pos / 32] |= ((src->hw * 2) + lh) << (pos % 32); -} - -static void -emit_mov_from_pred(struct nv50_pc *pc, struct nv50_reg *dst, int pred) +static int +nv50_geomprog_prepare(struct nv50_translation_info *ti) { - struct nv50_program_exec *e = exec(pc); - - assert(dst->type == P_TEMP); - e->inst[1] = 0x20000000 | (pred << 12); - set_long(pc, e); - set_dst(pc, dst, e); + 
ti->input_file = NV_FILE_MEM_S; + ti->output_file = NV_FILE_OUT; - emit(pc, e); + assert(0); + return 1; } -static void -emit_mov_to_pred(struct nv50_pc *pc, int pred, struct nv50_reg *src) +static int +nv50_prog_scan(struct nv50_translation_info *ti) +{ + struct nv50_program *p = ti->p; + struct tgsi_parse_context parse; + int ret; + + p->vp.psiz = 0x40; + p->vp.bfc[0] = 0x40; + p->vp.bfc[1] = 0x40; + p->gp.primid = 0x80; + + tgsi_scan_shader(p->pipe.tokens, &ti->scan); + + tgsi_parse_init(&parse, p->pipe.tokens); + while (!tgsi_parse_end_of_tokens(&parse)) { + tgsi_parse_token(&parse); + + switch (parse.FullToken.Token.Type) { + case TGSI_TOKEN_TYPE_IMMEDIATE: + prog_immediate(ti, &parse.FullToken.FullImmediate); + break; + case TGSI_TOKEN_TYPE_DECLARATION: + prog_decl(ti, &parse.FullToken.FullDeclaration); + break; + case TGSI_TOKEN_TYPE_INSTRUCTION: + prog_inst(ti, &parse.FullToken.FullInstruction, ++ti->inst_nr); + break; + } + } + + p->in_nr = ti->scan.file_max[TGSI_FILE_INPUT] + 1; + p->out_nr = ti->scan.file_max[TGSI_FILE_OUTPUT] + 1; + + switch (p->type) { + case PIPE_SHADER_VERTEX: + ret = nv50_vertprog_prepare(ti); + break; + case PIPE_SHADER_FRAGMENT: + ret = nv50_fragprog_prepare(ti); + break; + case PIPE_SHADER_GEOMETRY: + ret = nv50_geomprog_prepare(ti); + break; + default: + assert(!"unsupported program type"); + ret = -1; + break; + } + + assert(!ret); + return ret; +} + +boolean +nv50_program_tx(struct nv50_program *p) { - struct nv50_program_exec *e = exec(pc); - - e->inst[0] = 0x000001fc; - e->inst[1] = 0xa0000008; - set_long(pc, e); - set_pred_wr(pc, 1, pred, e); - set_src_0_restricted(pc, src, e); + struct nv50_translation_info *ti; + int ret; - emit(pc, e); -} - -static void -emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, - struct nv50_reg *src1) -{ - struct nv50_program_exec *e = exec(pc); + ti = CALLOC_STRUCT(nv50_translation_info); + ti->p = p; - e->inst[0] |= 0xc0000000; + ti->edgeflag_out = 
PIPE_MAX_SHADER_OUTPUTS; - if (!pc->allow32) - set_long(pc, e); + ret = nv50_prog_scan(ti); + if (ret) { + NOUVEAU_ERR("unsupported shader program\n"); + goto out; + } - check_swap_src_0_1(pc, &src0, &src1); - set_dst(pc, dst, e); - set_src_0(pc, src0, e); - if (src1->type == P_IMMD && !is_long(e)) { - if (src0->mod ^ src1->mod) - e->inst[0] |= 0x00008000; - set_immd(pc, src1, e); - } else { - set_src_1(pc, src1, e); - if ((src0->mod ^ src1->mod) & NV50_MOD_NEG) { - if (is_long(e)) - e->inst[1] |= 0x08000000; - else - e->inst[0] |= 0x00008000; - } - } + ret = nv50_generate_code(ti); + if (ret) { + NOUVEAU_ERR("error during shader translation\n"); + goto out; + } - emit(pc, e); +out: + if (ti->immd32) + FREE(ti->immd32); + FREE(ti); + return ret ? FALSE : TRUE; } -static void -emit_add(struct nv50_pc *pc, struct nv50_reg *dst, - struct nv50_reg *src0, struct nv50_reg *src1) +void +nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p) { - struct nv50_program_exec *e = exec(pc); - - e->inst[0] = 0xb0000000; - - alloc_reg(pc, src1); - check_swap_src_0_1(pc, &src0, &src1); - - if (!pc->allow32 || (src0->mod | src1->mod) || src1->hw > 63) { - set_long(pc, e); - e->inst[1] |= ((src0->mod & NV50_MOD_NEG) << 26) | - ((src1->mod & NV50_MOD_NEG) << 27); - } - - set_dst(pc, dst, e); - set_src_0(pc, src0, e); - if (src1->type == P_CONST || src1->type == P_ATTR || is_long(e)) - set_src_2(pc, src1, e); - else - if (src1->type == P_IMMD) - set_immd(pc, src1, e); - else - set_src_1(pc, src1, e); - - emit(pc, e); -} - -static void -emit_arl(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src, - uint8_t s) -{ - struct nv50_program_exec *e = exec(pc); - - set_long(pc, e); - e->inst[1] |= 0xc0000000; - - e->inst[0] |= dst->hw << 2; - e->inst[0] |= s << 16; /* shift left */ - set_src_0(pc, src, e); - - emit(pc, e); -} - -static boolean -address_reg_suitable(struct nv50_reg *a, struct nv50_reg *r) -{ - if (!r) - return FALSE; - - if (r->vtx != a->vtx) - 
return FALSE; - if (r->vtx >= 0) - return (r->indirect[1] == a->indirect[1]); - - if (r->hw < a->rhw || (r->hw - a->rhw) >= 128) - return FALSE; - - if (a->index >= 0) - return (a->index == r->indirect[0]); - return (a->indirect[0] == r->indirect[0]); -} - -static void -load_vertex_base(struct nv50_pc *pc, struct nv50_reg *dst, - struct nv50_reg *a, int shift) -{ - struct nv50_reg mem, *temp; - - ctor_reg(&mem, P_ATTR, -1, dst->vtx); - - assert(dst->type == P_ADDR); - if (!a) { - emit_arl(pc, dst, &mem, 0); - return; - } - temp = alloc_temp(pc, NULL); - - if (shift) { - emit_mov_from_addr(pc, temp, a); - if (shift < 0) - emit_shl_imm(pc, temp, temp, shift); - emit_arl(pc, dst, temp, MAX2(shift, 0)); - } - emit_mov(pc, temp, &mem); - set_addr(pc->p->exec_tail, dst); - - emit_arl(pc, dst, temp, 0); - free_temp(pc, temp); -} - -/* case (ref == NULL): allocate address register for TGSI_FILE_ADDRESS - * case (vtx >= 0, acc >= 0): load vertex base from a[vtx * 4] to $aX - * case (vtx >= 0, acc < 0): load vertex base from s[$aY + vtx * 4] to $aX - * case (vtx < 0, acc >= 0): memory address too high to encode - * case (vtx < 0, acc < 0): get source register for TGSI_FILE_ADDRESS - */ -static struct nv50_reg * -get_address_reg(struct nv50_pc *pc, struct nv50_reg *ref) -{ - int i; - struct nv50_reg *a_ref, *a = NULL; - - for (i = 0; i < NV50_SU_MAX_ADDR; ++i) { - if (pc->r_addr[i].acc == 0) - a = &pc->r_addr[i]; /* an unused address reg */ - else - if (address_reg_suitable(&pc->r_addr[i], ref)) { - pc->r_addr[i].acc = pc->insn_cur; - return &pc->r_addr[i]; - } else - if (!a && pc->r_addr[i].index < 0 && - pc->r_addr[i].acc < pc->insn_cur) - a = &pc->r_addr[i]; - } - if (!a) { - /* We'll be able to spill address regs when this - * mess is replaced with a proper compiler ... 
- */ - NOUVEAU_ERR("out of address regs\n"); - abort(); - return NULL; - } - - /* initialize and reserve for this TGSI instruction */ - a->rhw = 0; - a->index = a->indirect[0] = a->indirect[1] = -1; - a->acc = pc->insn_cur; - - if (!ref) { - a->vtx = -1; - return a; - } - a->vtx = ref->vtx; - - /* now put in the correct value ... */ - - if (ref->vtx >= 0) { - a->indirect[1] = ref->indirect[1]; - - /* For an indirect vertex index, we need to shift address right - * by 2, the address register will contain vtx * 16, we need to - * load from a[vtx * 4]. - */ - load_vertex_base(pc, a, (ref->acc < 0) ? - pc->addr[ref->indirect[1]] : NULL, -2); - } else { - assert(ref->acc < 0 || ref->indirect[0] < 0); - - a->rhw = ref->hw & ~0x7f; - a->indirect[0] = ref->indirect[0]; - a_ref = (ref->acc < 0) ? pc->addr[ref->indirect[0]] : NULL; - - emit_add_addr_imm(pc, a, a_ref, a->rhw * 4); - } - return a; -} - -#define NV50_MAX_F32 0x880 -#define NV50_MAX_S32 0x08c -#define NV50_MAX_U32 0x084 -#define NV50_MIN_F32 0x8a0 -#define NV50_MIN_S32 0x0ac -#define NV50_MIN_U32 0x0a4 - -static void -emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst, - struct nv50_reg *src0, struct nv50_reg *src1) -{ - struct nv50_program_exec *e = exec(pc); - - set_long(pc, e); - e->inst[0] |= 0x30000000 | ((sub & 0x800) << 20); - e->inst[1] |= (sub << 24); - - check_swap_src_0_1(pc, &src0, &src1); - set_dst(pc, dst, e); - set_src_0(pc, src0, e); - set_src_1(pc, src1, e); - - if (src0->mod & NV50_MOD_ABS) - e->inst[1] |= 0x00100000; - if (src1->mod & NV50_MOD_ABS) - e->inst[1] |= 0x00080000; - - emit(pc, e); -} - -static INLINE void -emit_sub(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, - struct nv50_reg *src1) -{ - src1->mod ^= NV50_MOD_NEG; - emit_add(pc, dst, src0, src1); - src1->mod ^= NV50_MOD_NEG; -} - -static void -emit_bitop2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, - struct nv50_reg *src1, unsigned op) -{ - struct nv50_program_exec *e = 
exec(pc); - - e->inst[0] = 0xd0000000; - set_long(pc, e); - - check_swap_src_0_1(pc, &src0, &src1); - set_dst(pc, dst, e); - set_src_0(pc, src0, e); - - if (op != TGSI_OPCODE_AND && op != TGSI_OPCODE_OR && - op != TGSI_OPCODE_XOR) - assert(!"invalid bit op"); - - assert(!(src0->mod | src1->mod)); - - if (src1->type == P_IMMD && src0->type == P_TEMP && pc->allow32) { - set_immd(pc, src1, e); - if (op == TGSI_OPCODE_OR) - e->inst[0] |= 0x0100; - else - if (op == TGSI_OPCODE_XOR) - e->inst[0] |= 0x8000; - } else { - set_src_1(pc, src1, e); - e->inst[1] |= 0x04000000; /* 32 bit */ - if (op == TGSI_OPCODE_OR) - e->inst[1] |= 0x4000; - else - if (op == TGSI_OPCODE_XOR) - e->inst[1] |= 0x8000; - } - - emit(pc, e); -} - -static void -emit_not(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) -{ - struct nv50_program_exec *e = exec(pc); - - e->inst[0] = 0xd0000000; - e->inst[1] = 0x0402c000; - set_long(pc, e); - set_dst(pc, dst, e); - set_src_1(pc, src, e); - - emit(pc, e); -} - -static void -emit_shift(struct nv50_pc *pc, struct nv50_reg *dst, - struct nv50_reg *src0, struct nv50_reg *src1, unsigned dir) -{ - struct nv50_program_exec *e = exec(pc); - - e->inst[0] = 0x30000000; - e->inst[1] = 0xc4000000; - - set_long(pc, e); - set_dst(pc, dst, e); - set_src_0(pc, src0, e); - - if (src1->type == P_IMMD) { - e->inst[1] |= (1 << 20); - e->inst[0] |= (pc->immd_buf[src1->hw] & 0x7f) << 16; - } else - set_src_1(pc, src1, e); - - if (dir != TGSI_OPCODE_SHL) - e->inst[1] |= (1 << 29); - - if (dir == TGSI_OPCODE_ISHR) - e->inst[1] |= (1 << 27); - - emit(pc, e); -} - -static void -emit_shl_imm(struct nv50_pc *pc, struct nv50_reg *dst, - struct nv50_reg *src, int s) -{ - struct nv50_program_exec *e = exec(pc); - - e->inst[0] = 0x30000000; - e->inst[1] = 0xc4100000; - if (s < 0) { - e->inst[1] |= 1 << 29; - s = -s; - } - e->inst[1] |= ((s & 0x7f) << 16); - - set_long(pc, e); - set_dst(pc, dst, e); - set_src_0(pc, src, e); - - emit(pc, e); -} - -static void 
-emit_mad(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, - struct nv50_reg *src1, struct nv50_reg *src2) -{ - struct nv50_program_exec *e = exec(pc); - - e->inst[0] |= 0xe0000000; - - check_swap_src_0_1(pc, &src0, &src1); - set_dst(pc, dst, e); - set_src_0(pc, src0, e); - set_src_1(pc, src1, e); - set_src_2(pc, src2, e); - - if ((src0->mod ^ src1->mod) & NV50_MOD_NEG) - e->inst[1] |= 0x04000000; - if (src2->mod & NV50_MOD_NEG) - e->inst[1] |= 0x08000000; - - emit(pc, e); -} - -static INLINE void -emit_msb(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, - struct nv50_reg *src1, struct nv50_reg *src2) -{ - src2->mod ^= NV50_MOD_NEG; - emit_mad(pc, dst, src0, src1, src2); - src2->mod ^= NV50_MOD_NEG; -} - -#define NV50_FLOP_RCP 0 -#define NV50_FLOP_RSQ 2 -#define NV50_FLOP_LG2 3 -#define NV50_FLOP_SIN 4 -#define NV50_FLOP_COS 5 -#define NV50_FLOP_EX2 6 - -/* rcp, rsqrt, lg2 support neg and abs */ -static void -emit_flop(struct nv50_pc *pc, unsigned sub, - struct nv50_reg *dst, struct nv50_reg *src) -{ - struct nv50_program_exec *e = exec(pc); - - e->inst[0] |= 0x90000000; - if (sub || src->mod) { - set_long(pc, e); - e->inst[1] |= (sub << 29); - } - - set_dst(pc, dst, e); - set_src_0_restricted(pc, src, e); - - assert(!src->mod || sub < 4); - - if (src->mod & NV50_MOD_NEG) - e->inst[1] |= 0x04000000; - if (src->mod & NV50_MOD_ABS) - e->inst[1] |= 0x00100000; - - emit(pc, e); -} - -static void -emit_preex2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) -{ - struct nv50_program_exec *e = exec(pc); - - e->inst[0] |= 0xb0000000; - - set_dst(pc, dst, e); - set_src_0(pc, src, e); - set_long(pc, e); - e->inst[1] |= (6 << 29) | 0x00004000; - - if (src->mod & NV50_MOD_NEG) - e->inst[1] |= 0x04000000; - if (src->mod & NV50_MOD_ABS) - e->inst[1] |= 0x00100000; - - emit(pc, e); -} - -static void -emit_precossin(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) -{ - struct nv50_program_exec *e = exec(pc); - - 
e->inst[0] |= 0xb0000000; - - set_dst(pc, dst, e); - set_src_0(pc, src, e); - set_long(pc, e); - e->inst[1] |= (6 << 29); - - if (src->mod & NV50_MOD_NEG) - e->inst[1] |= 0x04000000; - if (src->mod & NV50_MOD_ABS) - e->inst[1] |= 0x00100000; - - emit(pc, e); -} - -#define CVT_RN (0x00 << 16) -#define CVT_FLOOR (0x02 << 16) -#define CVT_CEIL (0x04 << 16) -#define CVT_TRUNC (0x06 << 16) -#define CVT_SAT (0x08 << 16) -#define CVT_ABS (0x10 << 16) - -#define CVT_X32_X32 0x04004000 -#define CVT_X32_S32 0x04014000 -#define CVT_F32_F32 ((0xc0 << 24) | CVT_X32_X32) -#define CVT_S32_F32 ((0x88 << 24) | CVT_X32_X32) -#define CVT_U32_F32 ((0x80 << 24) | CVT_X32_X32) -#define CVT_F32_S32 ((0x40 << 24) | CVT_X32_S32) -#define CVT_F32_U32 ((0x40 << 24) | CVT_X32_X32) -#define CVT_S32_S32 ((0x08 << 24) | CVT_X32_S32) -#define CVT_S32_U32 ((0x08 << 24) | CVT_X32_X32) -#define CVT_U32_S32 ((0x00 << 24) | CVT_X32_S32) - -#define CVT_NEG 0x20000000 -#define CVT_RI 0x08000000 - -static void -emit_cvt(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src, - int wp, uint32_t cvn) -{ - struct nv50_program_exec *e; - - e = exec(pc); - - if (src->mod & NV50_MOD_NEG) cvn |= CVT_NEG; - if (src->mod & NV50_MOD_ABS) cvn |= CVT_ABS; - - e->inst[0] = 0xa0000000; - e->inst[1] = cvn; - set_long(pc, e); - set_src_0(pc, src, e); - - if (wp >= 0) - set_pred_wr(pc, 1, wp, e); - - if (dst) - set_dst(pc, dst, e); - else { - e->inst[0] |= 0x000001fc; - e->inst[1] |= 0x00000008; - } - - emit(pc, e); -} - -/* nv50 Condition codes: - * 0x1 = LT - * 0x2 = EQ - * 0x3 = LE - * 0x4 = GT - * 0x5 = NE - * 0x6 = GE - * 0x7 = set condition code ? 
(used before bra.lt/le/gt/ge) - * 0x8 = unordered bit (allows NaN) - * - * mode = 0x04 (u32), 0x0c (s32), 0x80 (f32) - */ -static void -emit_set(struct nv50_pc *pc, unsigned ccode, struct nv50_reg *dst, int wp, - struct nv50_reg *src0, struct nv50_reg *src1, uint8_t mode) -{ - static const unsigned cc_swapped[8] = { 0, 4, 2, 6, 1, 5, 3, 7 }; - - struct nv50_program_exec *e = exec(pc); - struct nv50_reg *rdst; - - assert(ccode < 16); - if (check_swap_src_0_1(pc, &src0, &src1)) - ccode = cc_swapped[ccode & 7] | (ccode & 8); - - rdst = dst; - if (dst && dst->type != P_TEMP) - dst = alloc_temp(pc, NULL); - - set_long(pc, e); - e->inst[0] |= 0x30000000 | (mode << 24); - e->inst[1] |= 0x60000000 | (ccode << 14); - - if (wp >= 0) - set_pred_wr(pc, 1, wp, e); - if (dst) - set_dst(pc, dst, e); - else { - e->inst[0] |= 0x000001fc; - e->inst[1] |= 0x00000008; - } - - set_src_0(pc, src0, e); - set_src_1(pc, src1, e); - - emit(pc, e); - - if (rdst && mode == 0x80) /* convert to float ? */ - emit_cvt(pc, rdst, dst, -1, CVT_ABS | CVT_F32_S32); - if (rdst && rdst != dst) - free_temp(pc, dst); -} - -static INLINE void -map_tgsi_setop_hw(unsigned op, uint8_t *cc, uint8_t *ty) -{ - switch (op) { - case TGSI_OPCODE_SLT: *cc = 0x1; *ty = 0x80; break; - case TGSI_OPCODE_SGE: *cc = 0x6; *ty = 0x80; break; - case TGSI_OPCODE_SEQ: *cc = 0x2; *ty = 0x80; break; - case TGSI_OPCODE_SGT: *cc = 0x4; *ty = 0x80; break; - case TGSI_OPCODE_SLE: *cc = 0x3; *ty = 0x80; break; - case TGSI_OPCODE_SNE: *cc = 0xd; *ty = 0x80; break; - - case TGSI_OPCODE_ISLT: *cc = 0x1; *ty = 0x0c; break; - case TGSI_OPCODE_ISGE: *cc = 0x6; *ty = 0x0c; break; - case TGSI_OPCODE_USEQ: *cc = 0x2; *ty = 0x04; break; - case TGSI_OPCODE_USGE: *cc = 0x6; *ty = 0x04; break; - case TGSI_OPCODE_USLT: *cc = 0x1; *ty = 0x04; break; - case TGSI_OPCODE_USNE: *cc = 0x5; *ty = 0x04; break; - default: - assert(0); - return; - } -} - -static void -emit_add_b32(struct nv50_pc *pc, struct nv50_reg *dst, - struct nv50_reg *src0, struct 
nv50_reg *rsrc1) -{ - struct nv50_program_exec *e = exec(pc); - struct nv50_reg *src1; - - e->inst[0] = 0x20000000; - - alloc_reg(pc, rsrc1); - check_swap_src_0_1(pc, &src0, &rsrc1); - - src1 = rsrc1; - if (src0->mod & rsrc1->mod & NV50_MOD_NEG) { - src1 = temp_temp(pc, e); - emit_cvt(pc, src1, rsrc1, -1, CVT_S32_S32); - } - - if (!pc->allow32 || src1->hw > 63 || - (src1->type != P_TEMP && src1->type != P_IMMD)) - set_long(pc, e); - - set_dst(pc, dst, e); - set_src_0(pc, src0, e); - - if (is_long(e)) { - e->inst[1] |= 1 << 26; - set_src_2(pc, src1, e); - } else { - e->inst[0] |= 0x8000; - if (src1->type == P_IMMD) - set_immd(pc, src1, e); - else - set_src_1(pc, src1, e); - } - - if (src0->mod & NV50_MOD_NEG) - e->inst[0] |= 1 << 28; - else - if (src1->mod & NV50_MOD_NEG) - e->inst[0] |= 1 << 22; - - emit(pc, e); -} - -static void -emit_mad_u16(struct nv50_pc *pc, struct nv50_reg *dst, - struct nv50_reg *src0, int lh_0, struct nv50_reg *src1, int lh_1, - struct nv50_reg *src2) -{ - struct nv50_program_exec *e = exec(pc); - - e->inst[0] = 0x60000000; - if (!pc->allow32) - set_long(pc, e); - set_dst(pc, dst, e); - - set_half_src(pc, src0, lh_0, e, 9); - set_half_src(pc, src1, lh_1, e, 16); - alloc_reg(pc, src2); - if (is_long(e) || (src2->type != P_TEMP) || (src2->hw != dst->hw)) - set_src_2(pc, src2, e); - - emit(pc, e); -} - -static void -emit_mul_u16(struct nv50_pc *pc, struct nv50_reg *dst, - struct nv50_reg *src0, int lh_0, struct nv50_reg *src1, int lh_1) -{ - struct nv50_program_exec *e = exec(pc); - - e->inst[0] = 0x40000000; - set_long(pc, e); - set_dst(pc, dst, e); - - set_half_src(pc, src0, lh_0, e, 9); - set_half_src(pc, src1, lh_1, e, 16); - - emit(pc, e); -} - -static void -emit_sad(struct nv50_pc *pc, struct nv50_reg *dst, - struct nv50_reg *src0, struct nv50_reg *src1, struct nv50_reg *src2) -{ - struct nv50_program_exec *e = exec(pc); - - e->inst[0] = 0x50000000; - if (!pc->allow32) - set_long(pc, e); - check_swap_src_0_1(pc, &src0, &src1); - 
set_dst(pc, dst, e); - set_src_0(pc, src0, e); - set_src_1(pc, src1, e); - alloc_reg(pc, src2); - if (is_long(e) || (src2->type != dst->type) || (src2->hw != dst->hw)) - set_src_2(pc, src2, e); - - if (is_long(e)) - e->inst[1] |= 0x0c << 24; - else - e->inst[0] |= 0x81 << 8; - - emit(pc, e); -} - -static INLINE void -emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) -{ - emit_cvt(pc, dst, src, -1, CVT_FLOOR | CVT_F32_F32 | CVT_RI); -} - -static void -emit_pow(struct nv50_pc *pc, struct nv50_reg *dst, - struct nv50_reg *v, struct nv50_reg *e) -{ - struct nv50_reg *temp = alloc_temp(pc, NULL); - - emit_flop(pc, NV50_FLOP_LG2, temp, v); - emit_mul(pc, temp, temp, e); - emit_preex2(pc, temp, temp); - emit_flop(pc, NV50_FLOP_EX2, dst, temp); - - free_temp(pc, temp); -} - -static INLINE void -emit_sat(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) -{ - emit_cvt(pc, dst, src, -1, CVT_SAT | CVT_F32_F32); -} - -static void -emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask, - struct nv50_reg **src) -{ - struct nv50_reg *one = alloc_immd(pc, 1.0); - struct nv50_reg *zero = alloc_immd(pc, 0.0); - struct nv50_reg *neg128 = alloc_immd(pc, -127.999999); - struct nv50_reg *pos128 = alloc_immd(pc, 127.999999); - struct nv50_reg *tmp[4] = { 0 }; - boolean allow32 = pc->allow32; - - pc->allow32 = FALSE; - - if (mask & (3 << 1)) { - tmp[0] = alloc_temp(pc, NULL); - emit_minmax(pc, NV50_MAX_F32, tmp[0], src[0], zero); - } - - if (mask & (1 << 2)) { - set_pred_wr(pc, 1, 0, pc->p->exec_tail); - - tmp[1] = temp_temp(pc, NULL); - emit_minmax(pc, NV50_MAX_F32, tmp[1], src[1], zero); - - tmp[3] = temp_temp(pc, NULL); - emit_minmax(pc, NV50_MAX_F32, tmp[3], src[3], neg128); - emit_minmax(pc, NV50_MIN_F32, tmp[3], tmp[3], pos128); - - emit_pow(pc, dst[2], tmp[1], tmp[3]); - emit_mov(pc, dst[2], zero); - set_pred(pc, 3, 0, pc->p->exec_tail); - } - - if (mask & (1 << 1)) - assimilate_temp(pc, dst[1], tmp[0]); - else - if (mask & (1 << 2)) 
- free_temp(pc, tmp[0]); - - pc->allow32 = allow32; - - /* do this last, in case src[i,j] == dst[0,3] */ - if (mask & (1 << 0)) - emit_mov(pc, dst[0], one); - - if (mask & (1 << 3)) - emit_mov(pc, dst[3], one); - - FREE(pos128); - FREE(neg128); - FREE(zero); - FREE(one); -} - -static void -emit_kil(struct nv50_pc *pc, struct nv50_reg *src) -{ - struct nv50_program_exec *e; - const int r_pred = 1; - - e = exec(pc); - e->inst[0] = 0x00000002; /* discard */ - set_long(pc, e); /* sets cond code to ALWAYS */ - - if (src) { - set_pred(pc, 0x1 /* cc = LT */, r_pred, e); - /* write to predicate reg */ - emit_cvt(pc, NULL, src, r_pred, CVT_F32_F32); - } - - emit(pc, e); -} - -static struct nv50_program_exec * -emit_control_flow(struct nv50_pc *pc, unsigned op, int pred, unsigned cc) -{ - struct nv50_program_exec *e = exec(pc); - - e->inst[0] = (op << 28) | 2; - set_long(pc, e); - if (pred >= 0) - set_pred(pc, cc, pred, e); - - emit(pc, e); - return e; -} - -static INLINE struct nv50_program_exec * -emit_breakaddr(struct nv50_pc *pc) -{ - return emit_control_flow(pc, 0x4, -1, 0); -} - -static INLINE void -emit_break(struct nv50_pc *pc, int pred, unsigned cc) -{ - emit_control_flow(pc, 0x5, pred, cc); -} - -static INLINE struct nv50_program_exec * -emit_joinat(struct nv50_pc *pc) -{ - return emit_control_flow(pc, 0xa, -1, 0); -} - -static INLINE struct nv50_program_exec * -emit_branch(struct nv50_pc *pc, int pred, unsigned cc) -{ - return emit_control_flow(pc, 0x1, pred, cc); -} - -static INLINE struct nv50_program_exec * -emit_call(struct nv50_pc *pc, int pred, unsigned cc) -{ - return emit_control_flow(pc, 0x2, pred, cc); -} - -static INLINE void -emit_ret(struct nv50_pc *pc, int pred, unsigned cc) -{ - emit_control_flow(pc, 0x3, pred, cc); -} - -static void -emit_prim_cmd(struct nv50_pc *pc, unsigned cmd) -{ - struct nv50_program_exec *e = exec(pc); - - e->inst[0] = 0xf0000000 | (cmd << 9); - e->inst[1] = 0xc0000000; - set_long(pc, e); - - emit(pc, e); -} - -#define 
QOP_ADD 0 -#define QOP_SUBR 1 -#define QOP_SUB 2 -#define QOP_MOV_SRC1 3 - -/* For a quad of threads / top left, top right, bottom left, bottom right - * pixels, do a different operation, and take src0 from a specific thread. - */ -static void -emit_quadop(struct nv50_pc *pc, struct nv50_reg *dst, int wp, int lane_src0, - struct nv50_reg *src0, struct nv50_reg *src1, ubyte qop) -{ - struct nv50_program_exec *e = exec(pc); - - e->inst[0] = 0xc0000000; - e->inst[1] = 0x80000000; - set_long(pc, e); - e->inst[0] |= lane_src0 << 16; - set_src_0(pc, src0, e); - set_src_2(pc, src1, e); - - if (wp >= 0) - set_pred_wr(pc, 1, wp, e); - - if (dst) - set_dst(pc, dst, e); - else { - e->inst[0] |= 0x000001fc; - e->inst[1] |= 0x00000008; - } - - e->inst[0] |= (qop & 3) << 20; - e->inst[1] |= (qop >> 2) << 22; - - emit(pc, e); -} - -static void -load_cube_tex_coords(struct nv50_pc *pc, struct nv50_reg *t[4], - struct nv50_reg **src, unsigned arg, boolean proj) -{ - int mod[3] = { src[0]->mod, src[1]->mod, src[2]->mod }; - - src[0]->mod |= NV50_MOD_ABS; - src[1]->mod |= NV50_MOD_ABS; - src[2]->mod |= NV50_MOD_ABS; - - emit_minmax(pc, NV50_MAX_F32, t[2], src[0], src[1]); - emit_minmax(pc, NV50_MAX_F32, t[2], src[2], t[2]); - - src[0]->mod = mod[0]; - src[1]->mod = mod[1]; - src[2]->mod = mod[2]; - - if (proj && 0 /* looks more correct without this */) - emit_mul(pc, t[2], t[2], src[3]); - else - if (arg == 4) /* there is no textureProj(samplerCubeShadow) */ - emit_mov(pc, t[3], src[3]); - - emit_flop(pc, NV50_FLOP_RCP, t[2], t[2]); - - emit_mul(pc, t[0], src[0], t[2]); - emit_mul(pc, t[1], src[1], t[2]); - emit_mul(pc, t[2], src[2], t[2]); -} - -static void -load_proj_tex_coords(struct nv50_pc *pc, struct nv50_reg *t[4], - struct nv50_reg **src, unsigned dim, unsigned arg) -{ - unsigned c, mode; - - if (src[0]->type == P_TEMP && src[0]->rhw != -1) { - mode = pc->interp_mode[src[0]->index] | INTERP_PERSPECTIVE; - - t[3]->rhw = src[3]->rhw; - emit_interp(pc, t[3], NULL, (mode & 
INTERP_CENTROID)); - emit_flop(pc, NV50_FLOP_RCP, t[3], t[3]); - - for (c = 0; c < dim; ++c) { - t[c]->rhw = src[c]->rhw; - emit_interp(pc, t[c], t[3], mode); - } - if (arg != dim) { /* depth reference value */ - t[dim]->rhw = src[2]->rhw; - emit_interp(pc, t[dim], t[3], mode); - } - } else { - /* XXX: for some reason the blob sometimes uses MAD - * (mad f32 $rX $rY $rZ neg $r63) - */ - emit_flop(pc, NV50_FLOP_RCP, t[3], src[3]); - for (c = 0; c < dim; ++c) - emit_mul(pc, t[c], src[c], t[3]); - if (arg != dim) /* depth reference value */ - emit_mul(pc, t[dim], src[2], t[3]); - } -} - -static INLINE void -get_tex_dim(unsigned type, unsigned *dim, unsigned *arg) -{ - switch (type) { - case TGSI_TEXTURE_1D: - *arg = *dim = 1; - break; - case TGSI_TEXTURE_SHADOW1D: - *dim = 1; - *arg = 2; - break; - case TGSI_TEXTURE_UNKNOWN: - case TGSI_TEXTURE_2D: - case TGSI_TEXTURE_RECT: - *arg = *dim = 2; - break; - case TGSI_TEXTURE_SHADOW2D: - case TGSI_TEXTURE_SHADOWRECT: - *dim = 2; - *arg = 3; - break; - case TGSI_TEXTURE_3D: - case TGSI_TEXTURE_CUBE: - *dim = *arg = 3; - break; - default: - assert(0); - break; - } -} - -/* We shouldn't execute TEXLOD if any of the pixels in a quad have - * different LOD values, so branch off groups of equal LOD. - */ -static void -emit_texlod_sequence(struct nv50_pc *pc, struct nv50_reg *tlod, - struct nv50_reg *src, struct nv50_program_exec *tex) -{ - struct nv50_program_exec *join_at; - unsigned i, target = pc->p->exec_size + 9 * 2; - - if (pc->p->type != PIPE_SHADER_FRAGMENT) { - emit(pc, tex); - return; - } - pc->allow32 = FALSE; - - /* Subtract lod of each pixel from lod of top left pixel, jump - * texlod insn if result is 0, then repeat for 2 other pixels. 
- */ - join_at = emit_joinat(pc); - emit_quadop(pc, NULL, 0, 0, tlod, tlod, 0x55); - emit_branch(pc, 0, 2)->param.index = target; - - for (i = 1; i < 4; ++i) { - emit_quadop(pc, NULL, 0, i, tlod, tlod, 0x55); - emit_branch(pc, 0, 2)->param.index = target; - } - - emit_mov(pc, tlod, src); /* target */ - emit(pc, tex); /* texlod */ - - join_at->param.index = target + 2 * 2; - JOIN_ON(emit_nop(pc)); /* join _after_ tex */ -} - -static void -emit_texbias_sequence(struct nv50_pc *pc, struct nv50_reg *t[4], unsigned arg, - struct nv50_program_exec *tex) -{ - struct nv50_program_exec *e; - struct nv50_reg imm_1248, *t123[4][4], *r_bits = alloc_temp(pc, NULL); - int r_pred = 0; - unsigned n, c, i, cc[4] = { 0x0a, 0x13, 0x11, 0x10 }; - - pc->allow32 = FALSE; - ctor_reg(&imm_1248, P_IMMD, -1, ctor_immd_4u32(pc, 1, 2, 4, 8) * 4); - - /* Subtract bias value of thread i from bias values of each thread, - * store result in r_pred, and set bit i in r_bits if result was 0. - */ - assert(arg < 4); - for (i = 0; i < 4; ++i, ++imm_1248.hw) { - emit_quadop(pc, NULL, r_pred, i, t[arg], t[arg], 0x55); - emit_mov(pc, r_bits, &imm_1248); - set_pred(pc, 2, r_pred, pc->p->exec_tail); - } - emit_mov_to_pred(pc, r_pred, r_bits); - - /* The lanes of a quad are now grouped by the bit in r_pred they have - * set. Put the input values for TEX into a new register set for each - * group and execute TEX only for a specific group. - * We cannot use the same register set for each group because we need - * the derivatives, which are implicitly calculated, to be correct. 
- */ - for (i = 1; i < 4; ++i) { - alloc_temp4(pc, t123[i], 0); - - for (c = 0; c <= arg; ++c) - emit_mov(pc, t123[i][c], t[c]); - - *(e = exec(pc)) = *(tex); - e->inst[0] &= ~0x01fc; - set_dst(pc, t123[i][0], e); - set_pred(pc, cc[i], r_pred, e); - emit(pc, e); - } - /* finally TEX on the original regs (where we kept the input) */ - set_pred(pc, cc[0], r_pred, tex); - emit(pc, tex); - - /* put the 3 * n other results into regs for lane 0 */ - n = popcnt4(((e->inst[0] >> 25) & 0x3) | ((e->inst[1] >> 12) & 0xc)); - for (i = 1; i < 4; ++i) { - for (c = 0; c < n; ++c) { - emit_mov(pc, t[c], t123[i][c]); - set_pred(pc, cc[i], r_pred, pc->p->exec_tail); - } - free_temp4(pc, t123[i]); - } - - emit_nop(pc); - free_temp(pc, r_bits); -} - -static void -emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask, - struct nv50_reg **src, unsigned unit, unsigned type, - boolean proj, int bias_lod) -{ - struct nv50_reg *t[4]; - struct nv50_program_exec *e; - unsigned c, dim, arg; - - /* t[i] must be within a single 128 bit super-reg */ - alloc_temp4(pc, t, 0); - - e = exec(pc); - e->inst[0] = 0xf0000000; - set_long(pc, e); - set_dst(pc, t[0], e); - - /* TIC and TSC binding indices (TSC is ignored as TSC_LINKED = TRUE): */ - e->inst[0] |= (unit << 9) /* | (unit << 17) */; - - /* live flag (don't set if TEX results affect input to another TEX): */ - /* e->inst[0] |= 0x00000004; */ - - get_tex_dim(type, &dim, &arg); - - if (type == TGSI_TEXTURE_CUBE) { - e->inst[0] |= 0x08000000; - load_cube_tex_coords(pc, t, src, arg, proj); - } else - if (proj) - load_proj_tex_coords(pc, t, src, dim, arg); - else { - for (c = 0; c < dim; c++) - emit_mov(pc, t[c], src[c]); - if (arg != dim) /* depth reference value (always src.z here) */ - emit_mov(pc, t[dim], src[2]); - } - - e->inst[0] |= (mask & 0x3) << 25; - e->inst[1] |= (mask & 0xc) << 12; - - if (!bias_lod) { - e->inst[0] |= (arg - 1) << 22; - emit(pc, e); - } else - if (bias_lod < 0) { - assert(pc->p->type == PIPE_SHADER_FRAGMENT); 
- e->inst[0] |= arg << 22; - e->inst[1] |= 0x20000000; /* texbias */ - emit_mov(pc, t[arg], src[3]); - emit_texbias_sequence(pc, t, arg, e); - } else { - e->inst[0] |= arg << 22; - e->inst[1] |= 0x40000000; /* texlod */ - emit_mov(pc, t[arg], src[3]); - emit_texlod_sequence(pc, t[arg], src[3], e); - } - -#if 1 - c = 0; - if (mask & 1) emit_mov(pc, dst[0], t[c++]); - if (mask & 2) emit_mov(pc, dst[1], t[c++]); - if (mask & 4) emit_mov(pc, dst[2], t[c++]); - if (mask & 8) emit_mov(pc, dst[3], t[c]); - - free_temp4(pc, t); -#else - /* XXX: if p.e. MUL is used directly after TEX, it would still use - * the texture coordinates, not the fetched values: latency ? */ - - for (c = 0; c < 4; c++) { - if (mask & (1 << c)) - assimilate_temp(pc, dst[c], t[c]); - else - free_temp(pc, t[c]); - } -#endif -} - -static void -emit_ddx(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) -{ - struct nv50_program_exec *e = exec(pc); - - assert(src->type == P_TEMP); - - e->inst[0] = (src->mod & NV50_MOD_NEG) ? 0xc0240000 : 0xc0140000; - e->inst[1] = (src->mod & NV50_MOD_NEG) ? 0x86400000 : 0x89800000; - set_long(pc, e); - set_dst(pc, dst, e); - set_src_0(pc, src, e); - set_src_2(pc, src, e); - - emit(pc, e); -} - -static void -emit_ddy(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) -{ - struct nv50_program_exec *e = exec(pc); - - assert(src->type == P_TEMP); - - e->inst[0] = (src->mod & NV50_MOD_NEG) ? 0xc0250000 : 0xc0150000; - e->inst[1] = (src->mod & NV50_MOD_NEG) ? 
0x85800000 : 0x8a400000; - set_long(pc, e); - set_dst(pc, dst, e); - set_src_0(pc, src, e); - set_src_2(pc, src, e); - - emit(pc, e); -} - -static void -convert_to_long(struct nv50_pc *pc, struct nv50_program_exec *e) -{ - unsigned q = 0, m = ~0; - - assert(!is_long(e)); - - switch (e->inst[0] >> 28) { - case 0x1: - /* MOV */ - q = 0x0403c000; - m = 0xffff7fff; - break; - case 0x2: - case 0x3: - /* ADD, SUB, SUBR b32 */ - m = ~(0x8000 | (127 << 16)); - q = ((e->inst[0] & (~m)) >> 2) | (1 << 26); - break; - case 0x5: - /* SAD */ - m = ~(0x81 << 8); - q = (0x0c << 24) | ((e->inst[0] & (0x7f << 2)) << 12); - break; - case 0x6: - /* MAD u16 */ - q = (e->inst[0] & (0x7f << 2)) << 12; - break; - case 0x8: - /* INTERP (move centroid, perspective and flat bits) */ - m = ~0x03000100; - q = (e->inst[0] & (3 << 24)) >> (24 - 16); - q |= (e->inst[0] & (1 << 8)) << (18 - 8); - break; - case 0x9: - /* RCP */ - break; - case 0xB: - /* ADD */ - m = ~(127 << 16); - q = ((e->inst[0] & (~m)) >> 2); - break; - case 0xC: - /* MUL */ - m = ~0x00008000; - q = ((e->inst[0] & (~m)) << 12); - break; - case 0xE: - /* MAD (if src2 == dst) */ - q = ((e->inst[0] & 0x1fc) << 12); - break; - default: - assert(0); - break; - } - - set_long(pc, e); - pc->p->exec_size++; - - e->inst[0] &= m; - e->inst[1] |= q; -} - -/* Some operations support an optional negation flag. 
*/ -static int -get_supported_mods(const struct tgsi_full_instruction *insn, int i) -{ - switch (insn->Instruction.Opcode) { - case TGSI_OPCODE_ADD: - case TGSI_OPCODE_COS: - case TGSI_OPCODE_DDX: - case TGSI_OPCODE_DDY: - case TGSI_OPCODE_DP3: - case TGSI_OPCODE_DP4: - case TGSI_OPCODE_EX2: - case TGSI_OPCODE_KIL: - case TGSI_OPCODE_LG2: - case TGSI_OPCODE_MAD: - case TGSI_OPCODE_MUL: - case TGSI_OPCODE_POW: - case TGSI_OPCODE_RCP: - case TGSI_OPCODE_RSQ: /* ignored, RSQ = rsqrt(abs(src.x)) */ - case TGSI_OPCODE_SCS: - case TGSI_OPCODE_SIN: - case TGSI_OPCODE_SUB: - return NV50_MOD_NEG; - case TGSI_OPCODE_MAX: - case TGSI_OPCODE_MIN: - case TGSI_OPCODE_INEG: /* tgsi src sign toggle/set would be stupid */ - return NV50_MOD_ABS; - case TGSI_OPCODE_CEIL: - case TGSI_OPCODE_FLR: - case TGSI_OPCODE_TRUNC: - return NV50_MOD_NEG | NV50_MOD_ABS; - case TGSI_OPCODE_F2I: - case TGSI_OPCODE_F2U: - case TGSI_OPCODE_I2F: - case TGSI_OPCODE_U2F: - return NV50_MOD_NEG | NV50_MOD_ABS | NV50_MOD_I32; - case TGSI_OPCODE_UADD: - return NV50_MOD_NEG | NV50_MOD_I32; - case TGSI_OPCODE_SAD: - case TGSI_OPCODE_SHL: - case TGSI_OPCODE_IMAX: - case TGSI_OPCODE_IMIN: - case TGSI_OPCODE_ISHR: - case TGSI_OPCODE_NOT: - case TGSI_OPCODE_UMAD: - case TGSI_OPCODE_UMAX: - case TGSI_OPCODE_UMIN: - case TGSI_OPCODE_UMUL: - case TGSI_OPCODE_USHR: - return NV50_MOD_I32; - default: - return 0; - } -} - -/* Return a read mask for source registers deduced from opcode & write mask. */ -static unsigned -nv50_tgsi_src_mask(const struct tgsi_full_instruction *insn, int c) -{ - unsigned x, mask = insn->Dst[0].Register.WriteMask; - - switch (insn->Instruction.Opcode) { - case TGSI_OPCODE_COS: - case TGSI_OPCODE_SIN: - return (mask & 0x8) | ((mask & 0x7) ? 0x1 : 0x0); - case TGSI_OPCODE_DP3: - return 0x7; - case TGSI_OPCODE_DP4: - case TGSI_OPCODE_DPH: - case TGSI_OPCODE_KIL: /* WriteMask ignored */ - return 0xf; - case TGSI_OPCODE_DST: - return mask & (c ? 
0xa : 0x6); - case TGSI_OPCODE_EX2: - case TGSI_OPCODE_EXP: - case TGSI_OPCODE_LG2: - case TGSI_OPCODE_LOG: - case TGSI_OPCODE_POW: - case TGSI_OPCODE_RCP: - case TGSI_OPCODE_RSQ: - case TGSI_OPCODE_SCS: - return 0x1; - case TGSI_OPCODE_IF: - return 0x1; - case TGSI_OPCODE_LIT: - return 0xb; - case TGSI_OPCODE_TEX: - case TGSI_OPCODE_TXB: - case TGSI_OPCODE_TXL: - case TGSI_OPCODE_TXP: - { - const struct tgsi_instruction_texture *tex; - - assert(insn->Instruction.Texture); - tex = &insn->Texture; - - mask = 0x7; - if (insn->Instruction.Opcode != TGSI_OPCODE_TEX && - insn->Instruction.Opcode != TGSI_OPCODE_TXD) - mask |= 0x8; /* bias, lod or proj */ - - switch (tex->Texture) { - case TGSI_TEXTURE_1D: - mask &= 0x9; - break; - case TGSI_TEXTURE_SHADOW1D: - mask &= 0x5; - break; - case TGSI_TEXTURE_2D: - mask &= 0xb; - break; - default: - break; - } - } - return mask; - case TGSI_OPCODE_XPD: - x = 0; - if (mask & 1) x |= 0x6; - if (mask & 2) x |= 0x5; - if (mask & 4) x |= 0x3; - return x; - default: - break; - } - - return mask; -} - -static struct nv50_reg * -tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst) -{ - switch (dst->Register.File) { - case TGSI_FILE_TEMPORARY: - return &pc->temp[dst->Register.Index * 4 + c]; - case TGSI_FILE_OUTPUT: - return &pc->result[dst->Register.Index * 4 + c]; - case TGSI_FILE_ADDRESS: - { - struct nv50_reg *r = pc->addr[dst->Register.Index * 4 + c]; - if (!r) { - r = get_address_reg(pc, NULL); - r->index = dst->Register.Index * 4 + c; - pc->addr[r->index] = r; - } - assert(r); - return r; - } - case TGSI_FILE_NULL: - return NULL; - case TGSI_FILE_SYSTEM_VALUE: - assert(pc->sysval[dst->Register.Index].type == P_RESULT); - assert(c == 0); - return &pc->sysval[dst->Register.Index]; - default: - break; - } - - return NULL; -} - -static struct nv50_reg * -tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src, - int mod) -{ - struct nv50_reg *r = NULL; - struct nv50_reg *temp = NULL; - 
unsigned sgn, c, swz, cvn; - - if (src->Register.File != TGSI_FILE_CONSTANT) - assert(!src->Register.Indirect); - - sgn = tgsi_util_get_full_src_register_sign_mode(src, chan); - - c = tgsi_util_get_full_src_register_swizzle(src, chan); - switch (c) { - case TGSI_SWIZZLE_X: - case TGSI_SWIZZLE_Y: - case TGSI_SWIZZLE_Z: - case TGSI_SWIZZLE_W: - switch (src->Register.File) { - case TGSI_FILE_INPUT: - r = &pc->attr[src->Register.Index * 4 + c]; - - if (!src->Dimension.Dimension) - break; - r = reg_instance(pc, r); - r->vtx = src->Dimension.Index; - - if (!src->Dimension.Indirect) - break; - swz = tgsi_util_get_src_register_swizzle( - &src->DimIndirect, 0); - r->acc = -1; - r->indirect[1] = src->DimIndirect.Index * 4 + swz; - break; - case TGSI_FILE_TEMPORARY: - r = &pc->temp[src->Register.Index * 4 + c]; - break; - case TGSI_FILE_CONSTANT: - if (!src->Register.Indirect) { - r = &pc->param[src->Register.Index * 4 + c]; - break; - } - /* Indicate indirection by setting r->acc < 0 and - * use the index field to select the address reg. - */ - r = reg_instance(pc, NULL); - ctor_reg(r, P_CONST, -1, src->Register.Index * 4 + c); - - swz = tgsi_util_get_src_register_swizzle( - &src->Indirect, 0); - r->acc = -1; - r->indirect[0] = src->Indirect.Index * 4 + swz; - break; - case TGSI_FILE_IMMEDIATE: - r = &pc->immd[src->Register.Index * 4 + c]; - break; - case TGSI_FILE_SAMPLER: - return NULL; - case TGSI_FILE_ADDRESS: - r = pc->addr[src->Register.Index * 4 + c]; - assert(r); - break; - case TGSI_FILE_SYSTEM_VALUE: - assert(c == 0); - r = &pc->sysval[src->Register.Index]; - break; - default: - assert(0); - break; - } - break; - default: - assert(0); - break; - } - - cvn = (mod & NV50_MOD_I32) ? 
CVT_S32_S32 : CVT_F32_F32; - - switch (sgn) { - case TGSI_UTIL_SIGN_CLEAR: - r->mod = NV50_MOD_ABS; - break; - case TGSI_UTIL_SIGN_SET: - r->mod = NV50_MOD_NEG_ABS; - break; - case TGSI_UTIL_SIGN_TOGGLE: - r->mod = NV50_MOD_NEG; - break; - default: - assert(!r->mod && sgn == TGSI_UTIL_SIGN_KEEP); - break; - } - - if ((r->mod & mod) != r->mod) { - temp = temp_temp(pc, NULL); - emit_cvt(pc, temp, r, -1, cvn); - r->mod = 0; - r = temp; - } else - r->mod |= mod & NV50_MOD_I32; - - assert(r); - if (r->acc >= 0 && r->vtx < 0 && r != temp) - return reg_instance(pc, r); /* will clear r->mod */ - return r; -} - -/* return TRUE for ops that produce only a single result */ -static boolean -is_scalar_op(unsigned op) -{ - switch (op) { - case TGSI_OPCODE_COS: - case TGSI_OPCODE_DP2: - case TGSI_OPCODE_DP3: - case TGSI_OPCODE_DP4: - case TGSI_OPCODE_DPH: - case TGSI_OPCODE_EX2: - case TGSI_OPCODE_LG2: - case TGSI_OPCODE_POW: - case TGSI_OPCODE_RCP: - case TGSI_OPCODE_RSQ: - case TGSI_OPCODE_SIN: - /* - case TGSI_OPCODE_KIL: - case TGSI_OPCODE_LIT: - case TGSI_OPCODE_SCS: - */ - return TRUE; - default: - return FALSE; - } -} - -/* Returns a bitmask indicating which dst components depend - * on source s, component c (reverse of nv50_tgsi_src_mask). - */ -static unsigned -nv50_tgsi_dst_revdep(unsigned op, int s, int c) -{ - if (is_scalar_op(op)) - return 0x1; - - switch (op) { - case TGSI_OPCODE_DST: - return (1 << c) & (s ? 
0xa : 0x6); - case TGSI_OPCODE_XPD: - switch (c) { - case 0: return 0x6; - case 1: return 0x5; - case 2: return 0x3; - case 3: return 0x0; - default: - assert(0); - return 0x0; - } - case TGSI_OPCODE_EXP: - case TGSI_OPCODE_LOG: - case TGSI_OPCODE_LIT: - case TGSI_OPCODE_SCS: - case TGSI_OPCODE_TEX: - case TGSI_OPCODE_TXB: - case TGSI_OPCODE_TXL: - case TGSI_OPCODE_TXP: - /* these take care of dangerous swizzles themselves */ - return 0x0; - case TGSI_OPCODE_IF: - case TGSI_OPCODE_KIL: - /* don't call this function for these ops */ - assert(0); - return 0; - default: - /* linear vector instruction */ - return (1 << c); - } -} - -static INLINE boolean -has_pred(struct nv50_program_exec *e, unsigned cc) -{ - if (!is_long(e) || is_immd(e)) - return FALSE; - return ((e->inst[1] & 0x780) == (cc << 7)); -} - -/* on ENDIF see if we can do "@p0.neu single_op" instead of: - * join_at ENDIF - * @p0.eq bra ENDIF - * single_op - * ENDIF: nop.join - */ -static boolean -nv50_kill_branch(struct nv50_pc *pc) -{ - int lvl = pc->if_lvl; - - if (pc->if_insn[lvl]->next != pc->p->exec_tail) - return FALSE; - if (is_immd(pc->p->exec_tail)) - return FALSE; - - /* if ccode == 'true', the BRA is from an ELSE and the predicate - * reg may no longer be valid, since we currently always use $p0 - */ - if (has_pred(pc->if_insn[lvl], 0xf)) - return FALSE; - assert(pc->if_insn[lvl] && pc->if_join[lvl]); - - /* We'll use the exec allocated for JOIN_AT (we can't easily - * access nv50_program_exec's prev). 
- */ - pc->p->exec_size -= 4; /* remove JOIN_AT and BRA */ - - *pc->if_join[lvl] = *pc->p->exec_tail; - - FREE(pc->if_insn[lvl]); - FREE(pc->p->exec_tail); - - pc->p->exec_tail = pc->if_join[lvl]; - pc->p->exec_tail->next = NULL; - set_pred(pc, 0xd, 0, pc->p->exec_tail); - - return TRUE; -} - -static void -nv50_fp_move_results(struct nv50_pc *pc) -{ - struct nv50_reg reg; - unsigned i; - - ctor_reg(®, P_TEMP, -1, -1); - - for (i = 0; i < pc->result_nr * 4; ++i) { - if (pc->result[i].rhw < 0 || pc->result[i].hw < 0) - continue; - if (pc->result[i].rhw != pc->result[i].hw) { - reg.hw = pc->result[i].rhw; - emit_mov(pc, ®, &pc->result[i]); - } - } -} - -static boolean -nv50_program_tx_insn(struct nv50_pc *pc, - const struct tgsi_full_instruction *inst) -{ - struct nv50_reg *rdst[4], *dst[4], *brdc, *src[3][4], *temp; - unsigned mask, sat, unit = 0; - int i, c; - - mask = inst->Dst[0].Register.WriteMask; - sat = inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE; - - memset(src, 0, sizeof(src)); - - for (c = 0; c < 4; c++) { - if ((mask & (1 << c)) && !pc->r_dst[c]) - dst[c] = tgsi_dst(pc, c, &inst->Dst[0]); - else - dst[c] = pc->r_dst[c]; - rdst[c] = dst[c]; - } - - for (i = 0; i < inst->Instruction.NumSrcRegs; i++) { - const struct tgsi_full_src_register *fs = &inst->Src[i]; - unsigned src_mask; - int mod_supp; - - src_mask = nv50_tgsi_src_mask(inst, i); - mod_supp = get_supported_mods(inst, i); - - if (fs->Register.File == TGSI_FILE_SAMPLER) - unit = fs->Register.Index; - - for (c = 0; c < 4; c++) - if (src_mask & (1 << c)) - src[i][c] = tgsi_src(pc, c, fs, mod_supp); - } - - brdc = temp = pc->r_brdc; - if (brdc && brdc->type != P_TEMP) { - temp = temp_temp(pc, NULL); - if (sat) - brdc = temp; - } else - if (sat) { - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c)) || dst[c]->type == P_TEMP) - continue; - /* rdst[c] = dst[c]; */ /* done above */ - dst[c] = temp_temp(pc, NULL); - } - } - - assert(brdc || !is_scalar_op(inst->Instruction.Opcode)); - - switch 
(inst->Instruction.Opcode) { - case TGSI_OPCODE_ABS: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_cvt(pc, dst[c], src[0][c], -1, - CVT_ABS | CVT_F32_F32); - } - break; - case TGSI_OPCODE_ADD: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_add(pc, dst[c], src[0][c], src[1][c]); - } - break; - case TGSI_OPCODE_AND: - case TGSI_OPCODE_XOR: - case TGSI_OPCODE_OR: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_bitop2(pc, dst[c], src[0][c], src[1][c], - inst->Instruction.Opcode); - } - break; - case TGSI_OPCODE_ARL: - temp = temp_temp(pc, NULL); - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_cvt(pc, temp, src[0][c], -1, - CVT_FLOOR | CVT_S32_F32); - emit_arl(pc, dst[c], temp, 4); - } - break; - case TGSI_OPCODE_BGNLOOP: - pc->loop_brka[pc->loop_lvl] = emit_breakaddr(pc); - pc->loop_pos[pc->loop_lvl++] = pc->p->exec_size; - terminate_mbb(pc); - break; - case TGSI_OPCODE_BGNSUB: - assert(!pc->in_subroutine); - pc->in_subroutine = TRUE; - /* probably not necessary, but align to 8 byte boundary */ - if (!is_long(pc->p->exec_tail)) - convert_to_long(pc, pc->p->exec_tail); - break; - case TGSI_OPCODE_BRK: - assert(pc->loop_lvl > 0); - emit_break(pc, -1, 0); - break; - case TGSI_OPCODE_CAL: - assert(inst->Label.Label < pc->insn_nr); - emit_call(pc, -1, 0)->param.index = inst->Label.Label; - /* replaced by actual offset in nv50_program_fixup_insns */ - break; - case TGSI_OPCODE_CEIL: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_cvt(pc, dst[c], src[0][c], -1, - CVT_CEIL | CVT_F32_F32 | CVT_RI); - } - break; - case TGSI_OPCODE_CMP: - pc->allow32 = FALSE; - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_cvt(pc, NULL, src[0][c], 1, CVT_F32_F32); - emit_mov(pc, dst[c], src[1][c]); - set_pred(pc, 0x1, 1, pc->p->exec_tail); /* @SF */ - emit_mov(pc, dst[c], src[2][c]); - set_pred(pc, 0x6, 1, pc->p->exec_tail); /* @NSF */ - } - 
break; - case TGSI_OPCODE_CONT: - assert(pc->loop_lvl > 0); - emit_branch(pc, -1, 0)->param.index = - pc->loop_pos[pc->loop_lvl - 1]; - break; - case TGSI_OPCODE_COS: - if (mask & 8) { - emit_precossin(pc, temp, src[0][3]); - emit_flop(pc, NV50_FLOP_COS, dst[3], temp); - if (!(mask &= 7)) - break; - if (temp == dst[3]) - temp = brdc = temp_temp(pc, NULL); - } - emit_precossin(pc, temp, src[0][0]); - emit_flop(pc, NV50_FLOP_COS, brdc, temp); - break; - case TGSI_OPCODE_DDX: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_ddx(pc, dst[c], src[0][c]); - } - break; - case TGSI_OPCODE_DDY: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_ddy(pc, dst[c], src[0][c]); - } - break; - case TGSI_OPCODE_DP3: - emit_mul(pc, temp, src[0][0], src[1][0]); - emit_mad(pc, temp, src[0][1], src[1][1], temp); - emit_mad(pc, brdc, src[0][2], src[1][2], temp); - break; - case TGSI_OPCODE_DP4: - emit_mul(pc, temp, src[0][0], src[1][0]); - emit_mad(pc, temp, src[0][1], src[1][1], temp); - emit_mad(pc, temp, src[0][2], src[1][2], temp); - emit_mad(pc, brdc, src[0][3], src[1][3], temp); - break; - case TGSI_OPCODE_DPH: - emit_mul(pc, temp, src[0][0], src[1][0]); - emit_mad(pc, temp, src[0][1], src[1][1], temp); - emit_mad(pc, temp, src[0][2], src[1][2], temp); - emit_add(pc, brdc, src[1][3], temp); - break; - case TGSI_OPCODE_DST: - if (mask & (1 << 1)) - emit_mul(pc, dst[1], src[0][1], src[1][1]); - if (mask & (1 << 2)) - emit_mov(pc, dst[2], src[0][2]); - if (mask & (1 << 3)) - emit_mov(pc, dst[3], src[1][3]); - if (mask & (1 << 0)) - emit_mov_immdval(pc, dst[0], 1.0f); - break; - case TGSI_OPCODE_ELSE: - emit_branch(pc, -1, 0); - pc->if_insn[--pc->if_lvl]->param.index = pc->p->exec_size; - pc->if_insn[pc->if_lvl++] = pc->p->exec_tail; - terminate_mbb(pc); - break; - case TGSI_OPCODE_EMIT: - emit_prim_cmd(pc, 1); - break; - case TGSI_OPCODE_ENDIF: - pc->if_insn[--pc->if_lvl]->param.index = pc->p->exec_size; - - /* try to replace branch over 1 
insn with a predicated insn */ - if (nv50_kill_branch(pc) == TRUE) - break; - - if (pc->if_join[pc->if_lvl]) { - pc->if_join[pc->if_lvl]->param.index = pc->p->exec_size; - pc->if_join[pc->if_lvl] = NULL; - } - terminate_mbb(pc); - /* emit a NOP as join point, we could set it on the next - * one, but would have to make sure it is long and !immd - */ - JOIN_ON(emit_nop(pc)); - break; - case TGSI_OPCODE_ENDLOOP: - emit_branch(pc, -1, 0)->param.index = - pc->loop_pos[--pc->loop_lvl]; - pc->loop_brka[pc->loop_lvl]->param.index = pc->p->exec_size; - terminate_mbb(pc); - break; - case TGSI_OPCODE_ENDPRIM: - emit_prim_cmd(pc, 2); - break; - case TGSI_OPCODE_ENDSUB: - assert(pc->in_subroutine); - terminate_mbb(pc); - pc->in_subroutine = FALSE; - break; - case TGSI_OPCODE_EX2: - emit_preex2(pc, temp, src[0][0]); - emit_flop(pc, NV50_FLOP_EX2, brdc, temp); - break; - case TGSI_OPCODE_EXP: - { - struct nv50_reg *t[2]; - - assert(!temp); - t[0] = temp_temp(pc, NULL); - t[1] = temp_temp(pc, NULL); - - if (mask & 0x6) - emit_mov(pc, t[0], src[0][0]); - if (mask & 0x3) - emit_flr(pc, t[1], src[0][0]); - - if (mask & (1 << 1)) - emit_sub(pc, dst[1], t[0], t[1]); - if (mask & (1 << 0)) { - emit_preex2(pc, t[1], t[1]); - emit_flop(pc, NV50_FLOP_EX2, dst[0], t[1]); - } - if (mask & (1 << 2)) { - emit_preex2(pc, t[0], t[0]); - emit_flop(pc, NV50_FLOP_EX2, dst[2], t[0]); - } - if (mask & (1 << 3)) - emit_mov_immdval(pc, dst[3], 1.0f); - } - break; - case TGSI_OPCODE_F2I: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_cvt(pc, dst[c], src[0][c], -1, - CVT_TRUNC | CVT_S32_F32); - } - break; - case TGSI_OPCODE_F2U: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_cvt(pc, dst[c], src[0][c], -1, - CVT_TRUNC | CVT_U32_F32); - } - break; - case TGSI_OPCODE_FLR: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_flr(pc, dst[c], src[0][c]); - } - break; - case TGSI_OPCODE_FRC: - temp = temp_temp(pc, NULL); - for (c = 0; c < 
4; c++) { - if (!(mask & (1 << c))) - continue; - emit_flr(pc, temp, src[0][c]); - emit_sub(pc, dst[c], src[0][c], temp); - } - break; - case TGSI_OPCODE_I2F: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_cvt(pc, dst[c], src[0][c], -1, CVT_F32_S32); - } - break; - case TGSI_OPCODE_IF: - assert(pc->if_lvl < NV50_MAX_COND_NESTING); - emit_cvt(pc, NULL, src[0][0], 0, CVT_ABS | CVT_F32_F32); - pc->if_join[pc->if_lvl] = emit_joinat(pc); - pc->if_insn[pc->if_lvl++] = emit_branch(pc, 0, 2);; - terminate_mbb(pc); - break; - case TGSI_OPCODE_IMAX: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_minmax(pc, 0x08c, dst[c], src[0][c], src[1][c]); - } - break; - case TGSI_OPCODE_IMIN: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_minmax(pc, 0x0ac, dst[c], src[0][c], src[1][c]); - } - break; - case TGSI_OPCODE_INEG: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_cvt(pc, dst[c], src[0][c], -1, - CVT_S32_S32 | CVT_NEG); - } - break; - case TGSI_OPCODE_KIL: - assert(src[0][0] && src[0][1] && src[0][2] && src[0][3]); - emit_kil(pc, src[0][0]); - emit_kil(pc, src[0][1]); - emit_kil(pc, src[0][2]); - emit_kil(pc, src[0][3]); - break; - case TGSI_OPCODE_KILP: - emit_kil(pc, NULL); - break; - case TGSI_OPCODE_LIT: - emit_lit(pc, &dst[0], mask, &src[0][0]); - break; - case TGSI_OPCODE_LG2: - emit_flop(pc, NV50_FLOP_LG2, brdc, src[0][0]); - break; - case TGSI_OPCODE_LOG: - { - struct nv50_reg *t[2]; - - t[0] = temp_temp(pc, NULL); - if (mask & (1 << 1)) - t[1] = temp_temp(pc, NULL); - else - t[1] = t[0]; - - emit_cvt(pc, t[0], src[0][0], -1, CVT_ABS | CVT_F32_F32); - emit_flop(pc, NV50_FLOP_LG2, t[1], t[0]); - if (mask & (1 << 2)) - emit_mov(pc, dst[2], t[1]); - emit_flr(pc, t[1], t[1]); - if (mask & (1 << 0)) - emit_mov(pc, dst[0], t[1]); - if (mask & (1 << 1)) { - t[1]->mod = NV50_MOD_NEG; - emit_preex2(pc, t[1], t[1]); - t[1]->mod = 0; - emit_flop(pc, NV50_FLOP_EX2, t[1], t[1]); - 
emit_mul(pc, dst[1], t[0], t[1]); - } - if (mask & (1 << 3)) - emit_mov_immdval(pc, dst[3], 1.0f); - } - break; - case TGSI_OPCODE_LRP: - temp = temp_temp(pc, NULL); - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_sub(pc, temp, src[1][c], src[2][c]); - emit_mad(pc, dst[c], temp, src[0][c], src[2][c]); - } - break; - case TGSI_OPCODE_MAD: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_mad(pc, dst[c], src[0][c], src[1][c], src[2][c]); - } - break; - case TGSI_OPCODE_MAX: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_minmax(pc, 0x880, dst[c], src[0][c], src[1][c]); - } - break; - case TGSI_OPCODE_MIN: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_minmax(pc, 0x8a0, dst[c], src[0][c], src[1][c]); - } - break; - case TGSI_OPCODE_MOV: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_mov(pc, dst[c], src[0][c]); - } - break; - case TGSI_OPCODE_MUL: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_mul(pc, dst[c], src[0][c], src[1][c]); - } - break; - case TGSI_OPCODE_NOT: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_not(pc, dst[c], src[0][c]); - } - break; - case TGSI_OPCODE_POW: - emit_pow(pc, brdc, src[0][0], src[1][0]); - break; - case TGSI_OPCODE_RCP: - if (!sat && popcnt4(mask) == 1) - brdc = dst[ffs(mask) - 1]; - emit_flop(pc, NV50_FLOP_RCP, brdc, src[0][0]); - break; - case TGSI_OPCODE_RET: - if (pc->p->type == PIPE_SHADER_FRAGMENT && !pc->in_subroutine) - nv50_fp_move_results(pc); - emit_ret(pc, -1, 0); - break; - case TGSI_OPCODE_RSQ: - if (!sat && popcnt4(mask) == 1) - brdc = dst[ffs(mask) - 1]; - src[0][0]->mod |= NV50_MOD_ABS; - emit_flop(pc, NV50_FLOP_RSQ, brdc, src[0][0]); - break; - case TGSI_OPCODE_SAD: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_sad(pc, dst[c], src[0][c], src[1][c], src[2][c]); - } - break; - case TGSI_OPCODE_SCS: - temp = 
temp_temp(pc, NULL); - if (mask & 3) - emit_precossin(pc, temp, src[0][0]); - if (mask & (1 << 0)) - emit_flop(pc, NV50_FLOP_COS, dst[0], temp); - if (mask & (1 << 1)) - emit_flop(pc, NV50_FLOP_SIN, dst[1], temp); - if (mask & (1 << 2)) - emit_mov_immdval(pc, dst[2], 0.0); - if (mask & (1 << 3)) - emit_mov_immdval(pc, dst[3], 1.0); - break; - case TGSI_OPCODE_SHL: - case TGSI_OPCODE_ISHR: - case TGSI_OPCODE_USHR: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_shift(pc, dst[c], src[0][c], src[1][c], - inst->Instruction.Opcode); - } - break; - case TGSI_OPCODE_SIN: - if (mask & 8) { - emit_precossin(pc, temp, src[0][3]); - emit_flop(pc, NV50_FLOP_SIN, dst[3], temp); - if (!(mask &= 7)) - break; - if (temp == dst[3]) - temp = brdc = temp_temp(pc, NULL); - } - emit_precossin(pc, temp, src[0][0]); - emit_flop(pc, NV50_FLOP_SIN, brdc, temp); - break; - case TGSI_OPCODE_SLT: - case TGSI_OPCODE_SGE: - case TGSI_OPCODE_SEQ: - case TGSI_OPCODE_SGT: - case TGSI_OPCODE_SLE: - case TGSI_OPCODE_SNE: - case TGSI_OPCODE_ISLT: - case TGSI_OPCODE_ISGE: - case TGSI_OPCODE_USEQ: - case TGSI_OPCODE_USGE: - case TGSI_OPCODE_USLT: - case TGSI_OPCODE_USNE: - { - uint8_t cc, ty; - - map_tgsi_setop_hw(inst->Instruction.Opcode, &cc, &ty); - - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_set(pc, cc, dst[c], -1, src[0][c], src[1][c], ty); - } - } - break; - case TGSI_OPCODE_SUB: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_sub(pc, dst[c], src[0][c], src[1][c]); - } - break; - case TGSI_OPCODE_TEX: - emit_tex(pc, dst, mask, src[0], unit, - inst->Texture.Texture, FALSE, 0); - break; - case TGSI_OPCODE_TXB: - emit_tex(pc, dst, mask, src[0], unit, - inst->Texture.Texture, FALSE, -1); - break; - case TGSI_OPCODE_TXL: - emit_tex(pc, dst, mask, src[0], unit, - inst->Texture.Texture, FALSE, 1); - break; - case TGSI_OPCODE_TXP: - emit_tex(pc, dst, mask, src[0], unit, - inst->Texture.Texture, TRUE, 0); - break; - case 
TGSI_OPCODE_TRUNC: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_cvt(pc, dst[c], src[0][c], -1, - CVT_TRUNC | CVT_F32_F32 | CVT_RI); - } - break; - case TGSI_OPCODE_U2F: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_cvt(pc, dst[c], src[0][c], -1, CVT_F32_U32); - } - break; - case TGSI_OPCODE_UADD: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_add_b32(pc, dst[c], src[0][c], src[1][c]); - } - break; - case TGSI_OPCODE_UMAX: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_minmax(pc, 0x084, dst[c], src[0][c], src[1][c]); - } - break; - case TGSI_OPCODE_UMIN: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_minmax(pc, 0x0a4, dst[c], src[0][c], src[1][c]); - } - break; - case TGSI_OPCODE_UMAD: - { - assert(!temp); - temp = temp_temp(pc, NULL); - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_mul_u16(pc, temp, src[0][c], 0, src[1][c], 1); - emit_mad_u16(pc, temp, src[0][c], 1, src[1][c], 0, - temp); - emit_shl_imm(pc, temp, temp, 16); - emit_mad_u16(pc, temp, src[0][c], 0, src[1][c], 0, - temp); - emit_add_b32(pc, dst[c], temp, src[2][c]); - } - } - break; - case TGSI_OPCODE_UMUL: - { - assert(!temp); - temp = temp_temp(pc, NULL); - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_mul_u16(pc, temp, src[0][c], 0, src[1][c], 1); - emit_mad_u16(pc, temp, src[0][c], 1, src[1][c], 0, - temp); - emit_shl_imm(pc, temp, temp, 16); - emit_mad_u16(pc, dst[c], src[0][c], 0, src[1][c], 0, - temp); - } - } - break; - case TGSI_OPCODE_XPD: - temp = temp_temp(pc, NULL); - if (mask & (1 << 0)) { - emit_mul(pc, temp, src[0][2], src[1][1]); - emit_msb(pc, dst[0], src[0][1], src[1][2], temp); - } - if (mask & (1 << 1)) { - emit_mul(pc, temp, src[0][0], src[1][2]); - emit_msb(pc, dst[1], src[0][2], src[1][0], temp); - } - if (mask & (1 << 2)) { - emit_mul(pc, temp, src[0][1], src[1][0]); - emit_msb(pc, dst[2], 
src[0][0], src[1][1], temp); - } - if (mask & (1 << 3)) - emit_mov_immdval(pc, dst[3], 1.0); - break; - case TGSI_OPCODE_END: - if (pc->p->type == PIPE_SHADER_FRAGMENT) - nv50_fp_move_results(pc); - - if (!pc->p->exec_tail || - is_immd(pc->p->exec_tail) || - is_join(pc->p->exec_tail) || - is_control_flow(pc->p->exec_tail)) - emit_nop(pc); - - /* last insn must be long so it can have the exit bit set */ - if (!is_long(pc->p->exec_tail)) - convert_to_long(pc, pc->p->exec_tail); - - pc->p->exec_tail->inst[1] |= 1; /* set exit bit */ - - terminate_mbb(pc); - break; - default: - NOUVEAU_ERR("invalid opcode %d\n", inst->Instruction.Opcode); - return FALSE; - } - - if (brdc) { - if (sat) - emit_sat(pc, brdc, brdc); - for (c = 0; c < 4; c++) - if ((mask & (1 << c)) && dst[c] != brdc) - emit_mov(pc, dst[c], brdc); - } else - if (sat) { - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - /* In this case we saturate later, and dst[c] won't - * be another temp_temp (and thus lost), since rdst - * already is TEMP (see above). 
*/ - if (rdst[c]->type == P_TEMP && rdst[c]->index < 0) - continue; - emit_sat(pc, rdst[c], dst[c]); - } - } - - kill_temp_temp(pc, NULL); - pc->reg_instance_nr = 0; - - return TRUE; -} - -static void -prep_inspect_insn(struct nv50_pc *pc, const struct tgsi_full_instruction *insn) -{ - struct nv50_reg *r, *reg = NULL; - const struct tgsi_full_src_register *src; - const struct tgsi_dst_register *dst; - unsigned i, c, k, mask; - - dst = &insn->Dst[0].Register; - mask = dst->WriteMask; - - if (dst->File == TGSI_FILE_TEMPORARY) - reg = pc->temp; - else - if (dst->File == TGSI_FILE_OUTPUT) { - reg = pc->result; - - if (insn->Instruction.Opcode == TGSI_OPCODE_MOV && - dst->Index == pc->edgeflag_out && - insn->Src[0].Register.File == TGSI_FILE_INPUT) - pc->p->cfg.edgeflag_in = insn->Src[0].Register.Index; - } - - if (reg) { - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - reg[dst->Index * 4 + c].acc = pc->insn_nr; - } - } - - for (i = 0; i < insn->Instruction.NumSrcRegs; i++) { - src = &insn->Src[i]; - - if (src->Register.File == TGSI_FILE_TEMPORARY) - reg = pc->temp; - else - if (src->Register.File == TGSI_FILE_INPUT) - reg = pc->attr; - else - continue; - - mask = nv50_tgsi_src_mask(insn, i); - - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - k = tgsi_util_get_full_src_register_swizzle(src, c); - - r = ®[src->Register.Index * 4 + k]; - - /* If used before written, pre-allocate the reg, - * lest we overwrite results from a subroutine. - */ - if (!r->acc && r->type == P_TEMP) - alloc_reg(pc, r); - - r->acc = pc->insn_nr; - } - } -} - -/* Returns a bitmask indicating which dst components need to be - * written to temporaries first to avoid 'corrupting' sources. 
- * - * m[i] (out) indicate component to write in the i-th position - * rdep[c] (in) bitmasks of dst[i] that require dst[c] as source - */ -static unsigned -nv50_revdep_reorder(unsigned m[4], unsigned rdep[4]) -{ - unsigned i, c, x, unsafe = 0; - - for (c = 0; c < 4; c++) - m[c] = c; - - /* Swap as long as a dst component written earlier is depended on - * by one written later, but the next one isn't depended on by it. - */ - for (c = 0; c < 3; c++) { - if (rdep[m[c + 1]] & (1 << m[c])) - continue; /* if next one is depended on by us */ - for (i = c + 1; i < 4; i++) - /* if we are depended on by a later one */ - if (rdep[m[c]] & (1 << m[i])) - break; - if (i == 4) - continue; - /* now, swap */ - x = m[c]; - m[c] = m[c + 1]; - m[c + 1] = x; - - /* restart */ - c = 0; - } - - /* mark dependencies that could not be resolved by reordering */ - for (i = 0; i < 3; ++i) - for (c = i + 1; c < 4; ++c) - if (rdep[m[i]] & (1 << m[c])) - unsafe |= (1 << i); - - /* NOTE: $unsafe is with respect to order, not component */ - return unsafe; -} - -/* Select a suitable dst register for broadcasting scalar results, - * or return NULL if we have to allocate an extra TEMP. - * - * If e.g. only 1 component is written, we may also emit the final - * result to a write-only register. - */ -static struct nv50_reg * -tgsi_broadcast_dst(struct nv50_pc *pc, - const struct tgsi_full_dst_register *fd, unsigned mask) -{ - if (fd->Register.File == TGSI_FILE_TEMPORARY) { - int c = ffs(~mask & fd->Register.WriteMask); - if (c) - return tgsi_dst(pc, c - 1, fd); - } else { - int c = ffs(fd->Register.WriteMask) - 1; - if ((1 << c) == fd->Register.WriteMask) - return tgsi_dst(pc, c, fd); - } - - return NULL; -} - -/* Scan source swizzles and return a bitmask indicating dst regs that - * also occur among the src regs, and fill rdep for nv50_revdep_reoder. 
- */ -static unsigned -nv50_tgsi_scan_swizzle(const struct tgsi_full_instruction *insn, - unsigned rdep[4]) -{ - const struct tgsi_full_dst_register *fd = &insn->Dst[0]; - const struct tgsi_full_src_register *fs; - unsigned i, deqs = 0; - - for (i = 0; i < 4; ++i) - rdep[i] = 0; - - for (i = 0; i < insn->Instruction.NumSrcRegs; i++) { - unsigned chn, mask = nv50_tgsi_src_mask(insn, i); - int ms = get_supported_mods(insn, i); - - fs = &insn->Src[i]; - if (fs->Register.File != fd->Register.File || - fs->Register.Index != fd->Register.Index) - continue; - - for (chn = 0; chn < 4; ++chn) { - unsigned s, c; - - if (!(mask & (1 << chn))) /* src is not read */ - continue; - c = tgsi_util_get_full_src_register_swizzle(fs, chn); - s = tgsi_util_get_full_src_register_sign_mode(fs, chn); - - if (!(fd->Register.WriteMask & (1 << c))) - continue; - - if (s == TGSI_UTIL_SIGN_TOGGLE && !(ms & NV50_MOD_NEG)) - continue; - if (s == TGSI_UTIL_SIGN_CLEAR && !(ms & NV50_MOD_ABS)) - continue; - if ((s == TGSI_UTIL_SIGN_SET) && ((ms & 3) != 3)) - continue; - - rdep[c] |= nv50_tgsi_dst_revdep( - insn->Instruction.Opcode, i, chn); - deqs |= (1 << c); - } - } - - return deqs; -} - -static boolean -nv50_tgsi_insn(struct nv50_pc *pc, const union tgsi_full_token *tok) -{ - struct tgsi_full_instruction insn = tok->FullInstruction; - const struct tgsi_full_dst_register *fd; - unsigned i, deqs, rdep[4], m[4]; - - fd = &tok->FullInstruction.Dst[0]; - deqs = nv50_tgsi_scan_swizzle(&insn, rdep); - - if (is_scalar_op(insn.Instruction.Opcode)) { - pc->r_brdc = tgsi_broadcast_dst(pc, fd, deqs); - if (!pc->r_brdc) - pc->r_brdc = temp_temp(pc, NULL); - return nv50_program_tx_insn(pc, &insn); - } - pc->r_brdc = NULL; - - if (!deqs || (!rdep[0] && !rdep[1] && !rdep[2] && !rdep[3])) - return nv50_program_tx_insn(pc, &insn); - - deqs = nv50_revdep_reorder(m, rdep); - - for (i = 0; i < 4; ++i) { - assert(pc->r_dst[m[i]] == NULL); - - insn.Dst[0].Register.WriteMask = - fd->Register.WriteMask & (1 << m[i]); - 
- if (!insn.Dst[0].Register.WriteMask) - continue; - - if (deqs & (1 << i)) - pc->r_dst[m[i]] = alloc_temp(pc, NULL); - - if (!nv50_program_tx_insn(pc, &insn)) - return FALSE; - } - - for (i = 0; i < 4; i++) { - struct nv50_reg *reg = pc->r_dst[i]; - if (!reg) - continue; - pc->r_dst[i] = NULL; - - if (insn.Instruction.Saturate == TGSI_SAT_ZERO_ONE) - emit_sat(pc, tgsi_dst(pc, i, fd), reg); - else - emit_mov(pc, tgsi_dst(pc, i, fd), reg); - free_temp(pc, reg); - } - - return TRUE; -} - -static void -load_interpolant(struct nv50_pc *pc, struct nv50_reg *reg) -{ - struct nv50_reg *iv, **ppiv; - unsigned mode = pc->interp_mode[reg->index]; - - ppiv = (mode & INTERP_CENTROID) ? &pc->iv_c : &pc->iv_p; - iv = *ppiv; - - if ((mode & INTERP_PERSPECTIVE) && !iv) { - iv = *ppiv = alloc_temp(pc, NULL); - iv->rhw = popcnt4(pc->p->cfg.regs[1] >> 24) - 1; - - emit_interp(pc, iv, NULL, mode & INTERP_CENTROID); - emit_flop(pc, NV50_FLOP_RCP, iv, iv); - - /* XXX: when loading interpolants dynamically, move these - * to the program head, or make sure it can't be skipped. - */ - } - - emit_interp(pc, reg, iv, mode); -} - -/* The face input is always at v[255] (varying space), with a - * value of 0 for back-facing, and 0xffffffff for front-facing. 
- */ -static void -load_frontfacing(struct nv50_pc *pc, struct nv50_reg *sv) -{ - struct nv50_reg *temp = alloc_temp(pc, NULL); - int r_pred = 0; - - temp->rhw = 255; - emit_interp(pc, temp, NULL, INTERP_FLAT); - - emit_cvt(pc, sv, temp, r_pred, CVT_ABS | CVT_F32_S32); - - emit_not(pc, temp, temp); - set_pred(pc, 0x2, r_pred, pc->p->exec_tail); - emit_cvt(pc, sv, temp, -1, CVT_F32_S32); - set_pred(pc, 0x2, r_pred, pc->p->exec_tail); - - free_temp(pc, temp); -} - -static void -load_instance_id(struct nv50_pc *pc, unsigned index) -{ - struct nv50_reg reg, mem; - - ctor_reg(®, P_TEMP, -1, -1); - ctor_reg(&mem, P_CONST, -1, 24); /* startInstance */ - mem.buf_index = 2; - - emit_add_b32(pc, ®, &pc->sysval[index], &mem); - pc->sysval[index] = reg; -} - -static void -copy_semantic_info(struct nv50_program *p) -{ - unsigned i, id; - - for (i = 0; i < p->cfg.in_nr; ++i) { - id = p->cfg.in[i].id; - p->cfg.in[i].sn = p->info.input_semantic_name[id]; - p->cfg.in[i].si = p->info.input_semantic_index[id]; - } - - for (i = 0; i < p->cfg.out_nr; ++i) { - id = p->cfg.out[i].id; - p->cfg.out[i].sn = p->info.output_semantic_name[id]; - p->cfg.out[i].si = p->info.output_semantic_index[id]; - } -} - -static boolean -nv50_program_tx_prep(struct nv50_pc *pc) -{ - struct tgsi_parse_context tp; - struct nv50_program *p = pc->p; - boolean ret = FALSE; - unsigned i, c, instance_id = 0, vertex_id = 0, flat_nr = 0; - - tgsi_parse_init(&tp, pc->p->pipe.tokens); - while (!tgsi_parse_end_of_tokens(&tp)) { - const union tgsi_full_token *tok = &tp.FullToken; - - tgsi_parse_token(&tp); - switch (tok->Token.Type) { - case TGSI_TOKEN_TYPE_IMMEDIATE: - { - const struct tgsi_full_immediate *imm = - &tp.FullToken.FullImmediate; - - ctor_immd_4f32(pc, imm->u[0].Float, - imm->u[1].Float, - imm->u[2].Float, - imm->u[3].Float); - } - break; - case TGSI_TOKEN_TYPE_DECLARATION: - { - const struct tgsi_full_declaration *d; - unsigned si, last, first, mode; - - d = &tp.FullToken.FullDeclaration; - first = 
d->Range.First; - last = d->Range.Last; - - switch (d->Declaration.File) { - case TGSI_FILE_TEMPORARY: - break; - case TGSI_FILE_OUTPUT: - if (!d->Declaration.Semantic || - p->type == PIPE_SHADER_FRAGMENT) - break; - - si = d->Semantic.Index; - switch (d->Semantic.Name) { - case TGSI_SEMANTIC_BCOLOR: - p->cfg.two_side[si].hw = first; - if (p->cfg.out_nr > first) - p->cfg.out_nr = first; - break; - case TGSI_SEMANTIC_PSIZE: - p->cfg.psiz = first; - if (p->cfg.out_nr > first) - p->cfg.out_nr = first; - break; - case TGSI_SEMANTIC_EDGEFLAG: - pc->edgeflag_out = first; - break; - /* - case TGSI_SEMANTIC_CLIP_DISTANCE: - p->cfg.clpd = MIN2(p->cfg.clpd, first); - break; - */ - default: - break; - } - break; - case TGSI_FILE_INPUT: - { - if (p->type != PIPE_SHADER_FRAGMENT) - break; - - switch (d->Declaration.Interpolate) { - case TGSI_INTERPOLATE_CONSTANT: - mode = INTERP_FLAT; - flat_nr++; - break; - case TGSI_INTERPOLATE_PERSPECTIVE: - mode = INTERP_PERSPECTIVE; - p->cfg.regs[1] |= 0x08 << 24; - break; - default: - mode = INTERP_LINEAR; - break; - } - if (d->Declaration.Centroid) - mode |= INTERP_CENTROID; - - assert(last < 32); - for (i = first; i <= last; i++) - pc->interp_mode[i] = mode; - } - break; - case TGSI_FILE_SYSTEM_VALUE: - assert(d->Declaration.Semantic); - switch (d->Semantic.Name) { - case TGSI_SEMANTIC_FACE: - assert(p->type == PIPE_SHADER_FRAGMENT); - load_frontfacing(pc, - &pc->sysval[first]); - break; - case TGSI_SEMANTIC_INSTANCEID: - assert(p->type == PIPE_SHADER_VERTEX); - instance_id = first; - p->cfg.regs[0] |= (1 << 4); - break; - case TGSI_SEMANTIC_PRIMID: - assert(p->type != PIPE_SHADER_VERTEX); - p->cfg.prim_id = first; - break; - /* - case TGSI_SEMANTIC_PRIMIDIN: - assert(p->type == PIPE_SHADER_GEOMETRY); - pc->sysval[first].hw = 6; - p->cfg.regs[0] |= (1 << 8); - break; - case TGSI_SEMANTIC_VERTEXID: - assert(p->type == PIPE_SHADER_VERTEX); - vertex_id = first; - p->cfg.regs[0] |= (1 << 12) | (1 << 0); - break; - */ - } - break; - case 
TGSI_FILE_ADDRESS: - case TGSI_FILE_CONSTANT: - case TGSI_FILE_SAMPLER: - break; - default: - NOUVEAU_ERR("bad decl file %d\n", - d->Declaration.File); - goto out_err; - } - } - break; - case TGSI_TOKEN_TYPE_INSTRUCTION: - pc->insn_nr++; - prep_inspect_insn(pc, &tok->FullInstruction); - break; - default: - break; - } - } - - if (p->type == PIPE_SHADER_VERTEX || p->type == PIPE_SHADER_GEOMETRY) { - int rid = 0; - - if (p->type == PIPE_SHADER_GEOMETRY) { - for (i = 0; i < pc->attr_nr; ++i) { - p->cfg.in[i].hw = rid; - p->cfg.in[i].id = i; - - for (c = 0; c < 4; ++c) { - int n = i * 4 + c; - if (!pc->attr[n].acc) - continue; - pc->attr[n].hw = rid++; - p->cfg.in[i].mask |= 1 << c; - } - } - } else { - for (i = 0; i < pc->attr_nr * 4; ++i) { - if (pc->attr[i].acc) { - pc->attr[i].hw = rid++; - p->cfg.attr[i / 32] |= 1 << (i % 32); - } - } - if (p->cfg.regs[0] & (1 << 0)) - pc->sysval[vertex_id].hw = rid++; - if (p->cfg.regs[0] & (1 << 4)) { - pc->sysval[instance_id].hw = rid++; - load_instance_id(pc, instance_id); - } - } - - for (i = 0, rid = 0; i < pc->result_nr; ++i) { - p->cfg.out[i].hw = rid; - p->cfg.out[i].id = i; - - for (c = 0; c < 4; ++c) { - int n = i * 4 + c; - if (!pc->result[n].acc) - continue; - pc->result[n].hw = rid++; - p->cfg.out[i].mask |= 1 << c; - } - } - if (p->cfg.prim_id < 0x40) { - /* GP has to write to PrimitiveID */ - ctor_reg(&pc->sysval[p->cfg.prim_id], - P_RESULT, p->cfg.prim_id, rid); - p->cfg.prim_id = rid++; - } - - for (c = 0; c < 2; ++c) - if (p->cfg.two_side[c].hw < 0x40) - p->cfg.two_side[c] = p->cfg.out[ - p->cfg.two_side[c].hw]; - - if (p->cfg.psiz < 0x40) - p->cfg.psiz = p->cfg.out[p->cfg.psiz].hw; - - copy_semantic_info(p); - } else - if (p->type == PIPE_SHADER_FRAGMENT) { - int rid = 0, aid; - unsigned n = 0, m = pc->attr_nr - flat_nr; - - pc->allow32 = TRUE; - - /* do we read FragCoord ? 
*/ - if (pc->attr_nr && - p->info.input_semantic_name[0] == TGSI_SEMANTIC_POSITION) { - /* select FCRD components we want accessible */ - for (c = 0; c < 4; ++c) - if (pc->attr[c].acc) - p->cfg.regs[1] |= 1 << (24 + c); - aid = 0; - } else /* offset by 1 if FCRD.w is needed for pinterp */ - aid = popcnt4(p->cfg.regs[1] >> 24); - - /* non-flat interpolants have to be mapped to - * the lower hardware IDs, so sort them: - */ - for (i = 0; i < pc->attr_nr; i++) { - if (pc->interp_mode[i] == INTERP_FLAT) - p->cfg.in[m++].id = i; - else { - if (!(pc->interp_mode[i] & INTERP_PERSPECTIVE)) - p->cfg.in[n].linear = TRUE; - p->cfg.in[n++].id = i; - } - } - copy_semantic_info(p); - - for (n = 0; n < pc->attr_nr; ++n) { - p->cfg.in[n].hw = rid = aid; - i = p->cfg.in[n].id; - - if (p->info.input_semantic_name[i] == - TGSI_SEMANTIC_FACE) { - load_frontfacing(pc, &pc->attr[i * 4]); - continue; - } - - for (c = 0; c < 4; ++c) { - if (!pc->attr[i * 4 + c].acc) - continue; - pc->attr[i * 4 + c].rhw = rid++; - p->cfg.in[n].mask |= 1 << c; - - load_interpolant(pc, &pc->attr[i * 4 + c]); - } - aid += popcnt4(p->cfg.in[n].mask); - } - - m = popcnt4(p->cfg.regs[1] >> 24); - - /* set count of non-position inputs and of non-flat - * non-position inputs for FP_INTERPOLANT_CTRL - */ - p->cfg.regs[1] |= aid - m; - - if (flat_nr) { - i = p->cfg.in[pc->attr_nr - flat_nr].hw; - p->cfg.regs[1] |= (i - m) << 16; - } else - p->cfg.regs[1] |= p->cfg.regs[1] << 16; - - /* mark color semantic for light-twoside */ - n = 0x80; - for (i = 0; i < p->cfg.in_nr; i++) { - if (p->cfg.in[i].sn == TGSI_SEMANTIC_COLOR) { - n = MIN2(n, p->cfg.in[i].hw - m); - p->cfg.two_side[p->cfg.in[i].si] = p->cfg.in[i]; - - p->cfg.regs[0] += /* increase colour count */ - popcnt4(p->cfg.in[i].mask) << 16; - } - } - if (n < 0x80) - p->cfg.regs[0] += n; - - if (p->cfg.prim_id < 0x40) { - pc->sysval[p->cfg.prim_id].rhw = rid++; - emit_interp(pc, &pc->sysval[p->cfg.prim_id], NULL, - INTERP_FLAT); - /* increase 
FP_INTERPOLANT_CTRL_COUNT */ - p->cfg.regs[1] += 1; - } - - /* Initialize FP results: - * FragDepth is always first TGSI and last hw output - */ - i = p->info.writes_z ? 4 : 0; - for (rid = 0; i < pc->result_nr * 4; i++) - pc->result[i].rhw = rid++; - if (p->info.writes_z) - pc->result[2].rhw = rid++; - - p->cfg.high_result = rid; - - /* separate/different colour results for MRTs ? */ - if (pc->result_nr - (p->info.writes_z ? 1 : 0) > 1) - p->cfg.regs[2] |= 1; - } - - if (pc->immd_nr) { - int rid = 0; - - pc->immd = MALLOC(pc->immd_nr * 4 * sizeof(struct nv50_reg)); - if (!pc->immd) - goto out_err; - - for (i = 0; i < pc->immd_nr; i++) { - for (c = 0; c < 4; c++, rid++) - ctor_reg(&pc->immd[rid], P_IMMD, i, rid); - } - } - - ret = TRUE; -out_err: - if (pc->iv_p) - free_temp(pc, pc->iv_p); - if (pc->iv_c) - free_temp(pc, pc->iv_c); - - tgsi_parse_free(&tp); - return ret; -} - -static void -free_nv50_pc(struct nv50_pc *pc) -{ - if (pc->immd) - FREE(pc->immd); - if (pc->param) - FREE(pc->param); - if (pc->result) - FREE(pc->result); - if (pc->attr) - FREE(pc->attr); - if (pc->temp) - FREE(pc->temp); - if (pc->sysval) - FREE(pc->sysval); - if (pc->insn_pos) - FREE(pc->insn_pos); - - FREE(pc); -} - -static INLINE uint32_t -nv50_map_gs_output_prim(unsigned pprim) -{ - switch (pprim) { - case PIPE_PRIM_POINTS: - return NV50TCL_GP_OUTPUT_PRIMITIVE_TYPE_POINTS; - case PIPE_PRIM_LINE_STRIP: - return NV50TCL_GP_OUTPUT_PRIMITIVE_TYPE_LINE_STRIP; - case PIPE_PRIM_TRIANGLE_STRIP: - return NV50TCL_GP_OUTPUT_PRIMITIVE_TYPE_TRIANGLE_STRIP; - default: - NOUVEAU_ERR("invalid GS_OUTPUT_PRIMITIVE: %u\n", pprim); - abort(); - return 0; - } -} - -static boolean -ctor_nv50_pc(struct nv50_pc *pc, struct nv50_program *p) -{ - int i, c; - unsigned rtype[2] = { P_ATTR, P_RESULT }; - - pc->p = p; - pc->temp_nr = p->info.file_max[TGSI_FILE_TEMPORARY] + 1; - pc->attr_nr = p->info.file_max[TGSI_FILE_INPUT] + 1; - pc->result_nr = p->info.file_max[TGSI_FILE_OUTPUT] + 1; - pc->param_nr = 
p->info.file_max[TGSI_FILE_CONSTANT] + 1; - pc->addr_nr = p->info.file_max[TGSI_FILE_ADDRESS] + 1; - assert(pc->addr_nr <= 2); - pc->sysval_nr = p->info.file_max[TGSI_FILE_SYSTEM_VALUE] + 1; - - p->cfg.high_temp = 4; - - p->cfg.two_side[0].hw = 0x40; - p->cfg.two_side[1].hw = 0x40; - p->cfg.prim_id = 0x40; - - p->cfg.edgeflag_in = pc->edgeflag_out = 0xff; - - for (i = 0; i < p->info.num_properties; ++i) { - unsigned *data = &p->info.properties[i].data[0]; - - switch (p->info.properties[i].name) { - case TGSI_PROPERTY_GS_OUTPUT_PRIM: - p->cfg.prim_type = nv50_map_gs_output_prim(data[0]); - break; - case TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES: - p->cfg.vert_count = data[0]; - break; - default: - break; - } - } - - switch (p->type) { - case PIPE_SHADER_VERTEX: - p->cfg.psiz = 0x40; - p->cfg.clpd = 0x40; - p->cfg.out_nr = pc->result_nr; - break; - case PIPE_SHADER_GEOMETRY: - assert(p->cfg.prim_type); - assert(p->cfg.vert_count); - - p->cfg.psiz = 0x80; - p->cfg.clpd = 0x80; - p->cfg.prim_id = 0x80; - p->cfg.out_nr = pc->result_nr; - p->cfg.in_nr = pc->attr_nr; - - p->cfg.two_side[0].hw = 0x80; - p->cfg.two_side[1].hw = 0x80; - break; - case PIPE_SHADER_FRAGMENT: - rtype[0] = rtype[1] = P_TEMP; - - p->cfg.regs[0] = 0x01000004; - p->cfg.in_nr = pc->attr_nr; - - if (p->info.writes_z) { - p->cfg.regs[2] |= 0x00000100; - p->cfg.regs[3] |= 0x00000011; - } - if (p->info.uses_kill) - p->cfg.regs[2] |= 0x00100000; - break; - } - - if (pc->temp_nr) { - pc->temp = MALLOC(pc->temp_nr * 4 * sizeof(struct nv50_reg)); - if (!pc->temp) - return FALSE; - - for (i = 0; i < pc->temp_nr * 4; ++i) - ctor_reg(&pc->temp[i], P_TEMP, i / 4, -1); - } - - if (pc->attr_nr) { - pc->attr = MALLOC(pc->attr_nr * 4 * sizeof(struct nv50_reg)); - if (!pc->attr) - return FALSE; - - for (i = 0; i < pc->attr_nr * 4; ++i) - ctor_reg(&pc->attr[i], rtype[0], i / 4, -1); - } - - if (pc->result_nr) { - unsigned nr = pc->result_nr * 4; - - pc->result = MALLOC(nr * sizeof(struct nv50_reg)); - if (!pc->result) - 
return FALSE; - - for (i = 0; i < nr; ++i) - ctor_reg(&pc->result[i], rtype[1], i / 4, -1); - } - - if (pc->param_nr) { - int rid = 0; - - pc->param = MALLOC(pc->param_nr * 4 * sizeof(struct nv50_reg)); - if (!pc->param) - return FALSE; - - for (i = 0; i < pc->param_nr; ++i) - for (c = 0; c < 4; ++c, ++rid) - ctor_reg(&pc->param[rid], P_CONST, i, rid); - } - - if (pc->addr_nr) { - pc->addr = CALLOC(pc->addr_nr * 4, sizeof(struct nv50_reg *)); - if (!pc->addr) - return FALSE; - } - for (i = 0; i < NV50_SU_MAX_ADDR; ++i) - ctor_reg(&pc->r_addr[i], P_ADDR, -1, i + 1); - - if (pc->sysval_nr) { - pc->sysval = CALLOC(pc->sysval_nr, sizeof(struct nv50_reg *)); - if (!pc->sysval) - return FALSE; - /* will only ever use SYSTEM_VALUE[i].x (hopefully) */ - for (i = 0; i < pc->sysval_nr; ++i) - ctor_reg(&pc->sysval[i], rtype[0], i, -1); - } - - return TRUE; -} - -static void -nv50_program_fixup_insns(struct nv50_pc *pc) -{ - struct nv50_program_exec *e, **bra_list; - unsigned i, n, pos; - - bra_list = CALLOC(pc->p->exec_size, sizeof(struct nv50_program_exec *)); - - /* Collect branch instructions, we need to adjust their offsets - * when converting 32 bit instructions to 64 bit ones - */ - for (n = 0, e = pc->p->exec_head; e; e = e->next) - if (e->param.index >= 0 && !e->param.mask) - bra_list[n++] = e; - - /* Make sure we don't have any single 32 bit instructions. */ - for (e = pc->p->exec_head, pos = 0; e; e = e->next) { - pos += is_long(e) ? 
2 : 1; - - if ((pos & 1) && (!e->next || is_long(e->next))) { - for (i = 0; i < n; ++i) - if (bra_list[i]->param.index >= pos) - bra_list[i]->param.index += 1; - for (i = 0; i < pc->insn_nr; ++i) - if (pc->insn_pos[i] >= pos) - pc->insn_pos[i] += 1; - convert_to_long(pc, e); - ++pos; - } - } - - FREE(bra_list); - - if (!pc->p->info.opcode_count[TGSI_OPCODE_CAL]) - return; - - /* fill in CALL offsets */ - for (e = pc->p->exec_head; e; e = e->next) { - if ((e->inst[0] & 2) && (e->inst[0] >> 28) == 0x2) - e->param.index = pc->insn_pos[e->param.index]; - } -} - -static boolean -nv50_program_tx(struct nv50_program *p) -{ - struct tgsi_parse_context parse; - struct nv50_pc *pc; - boolean ret; - - pc = CALLOC_STRUCT(nv50_pc); - if (!pc) - return FALSE; - - ret = ctor_nv50_pc(pc, p); - if (ret == FALSE) - goto out_cleanup; - - ret = nv50_program_tx_prep(pc); - if (ret == FALSE) - goto out_cleanup; - - pc->insn_pos = MALLOC(pc->insn_nr * sizeof(unsigned)); - - tgsi_parse_init(&parse, pc->p->pipe.tokens); - while (!tgsi_parse_end_of_tokens(&parse)) { - const union tgsi_full_token *tok = &parse.FullToken; - - /* previously allow32 was FALSE for first & last instruction */ - pc->allow32 = TRUE; - - tgsi_parse_token(&parse); - - switch (tok->Token.Type) { - case TGSI_TOKEN_TYPE_INSTRUCTION: - pc->insn_pos[pc->insn_cur] = pc->p->exec_size; - ++pc->insn_cur; - ret = nv50_tgsi_insn(pc, tok); - if (ret == FALSE) - goto out_err; - break; - default: - break; - } - } - - nv50_program_fixup_insns(pc); - - p->param_nr = pc->param_nr * 4; - p->immd_nr = pc->immd_nr * 4; - p->immd = pc->immd_buf; - -out_err: - tgsi_parse_free(&parse); - -out_cleanup: - free_nv50_pc(pc); - return ret; -} - -static void -nv50_program_validate(struct nv50_context *nv50, struct nv50_program *p) -{ - if (nv50_program_tx(p) == FALSE) - assert(0); - p->translated = TRUE; -} - -static void -nv50_program_upload_data(struct nv50_context *nv50, uint32_t *map, - unsigned start, unsigned count, unsigned cbuf) -{ - 
struct nouveau_channel *chan = nv50->screen->base.channel; - struct nouveau_grobj *tesla = nv50->screen->tesla; - - while (count) { - unsigned nr = count > 2047 ? 2047 : count; - - BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1); - OUT_RING (chan, (cbuf << 0) | (start << 8)); - BEGIN_RING(chan, tesla, NV50TCL_CB_DATA(0) | 0x40000000, nr); - OUT_RINGp (chan, map, nr); - - map += nr; - start += nr; - count -= nr; - } -} - -static void -nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p) -{ - struct pipe_context *pipe = &nv50->pipe; - struct pipe_transfer *transfer; - - if (!p->data[0] && p->immd_nr) { - struct nouveau_resource *heap = nv50->screen->immd_heap; - - if (nouveau_resource_alloc(heap, p->immd_nr, p, &p->data[0])) { - while (heap->next && heap->size < p->immd_nr) { - struct nv50_program *evict = heap->next->priv; - nouveau_resource_free(&evict->data[0]); - } - - if (nouveau_resource_alloc(heap, p->immd_nr, p, - &p->data[0])) - assert(0); - } - - /* immediates only need to be uploaded again when freed */ - nv50_program_upload_data(nv50, p->immd, p->data[0]->start, - p->immd_nr, NV50_CB_PMISC); - } - - assert(p->param_nr <= 16384); - - if (p->param_nr) { - unsigned cb; - uint32_t *map = pipe_buffer_map(pipe, - nv50->constbuf[p->type], - PIPE_TRANSFER_READ, - &transfer); - switch (p->type) { - case PIPE_SHADER_GEOMETRY: cb = NV50_CB_PGP; break; - case PIPE_SHADER_FRAGMENT: cb = NV50_CB_PFP; break; - default: - cb = NV50_CB_PVP; - assert(p->type == PIPE_SHADER_VERTEX); - break; - } - - nv50_program_upload_data(nv50, map, 0, p->param_nr, cb); - pipe_buffer_unmap(pipe, nv50->constbuf[p->type], - transfer); - } -} - -static void -nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p) -{ - struct nouveau_channel *chan = nv50->screen->base.channel; - struct nouveau_grobj *tesla = nv50->screen->tesla; - struct nv50_program_exec *e; - uint32_t *up, i; - boolean upload = FALSE; - unsigned offset; - int width; - - if (!p->bo) 
{ - nouveau_bo_new(chan->device, NOUVEAU_BO_VRAM, 0x100, - p->exec_size * 4, &p->bo); - upload = TRUE; - } - - if (p->data[0] && p->data[0]->start != p->data_start[0]) - upload = TRUE; - - if (!upload) - return; - - up = MALLOC(p->exec_size * 4); - - for (i = 0, e = p->exec_head; e; e = e->next) { - unsigned ei, ci, bs; - - if (e->param.index >= 0 && e->param.mask) { - bs = (e->inst[1] >> 22) & 0x07; - assert(bs < 2); - ei = e->param.shift >> 5; - ci = e->param.index; - if (bs == 0) - ci += p->data[bs]->start; - - e->inst[ei] &= ~e->param.mask; - e->inst[ei] |= (ci << e->param.shift); - } else - if (e->param.index >= 0) { - /* zero mask means param is a jump/branch offset */ - assert(!(e->param.index & 1)); - /* seem to be 8 byte steps */ - ei = (e->param.index >> 1) + 0 /* START_ID */; - - e->inst[0] &= 0xf0000fff; - e->inst[0] |= ei << 12; - } - - up[i++] = e->inst[0]; - if (is_long(e)) - up[i++] = e->inst[1]; - } - assert(i == p->exec_size); - - if (p->data[0]) - p->data_start[0] = p->data[0]->start; - -#ifdef NV50_PROGRAM_DUMP - NOUVEAU_ERR("-------\n"); - for (e = p->exec_head; e; e = e->next) { - NOUVEAU_ERR("0x%08x\n", e->inst[0]); - if (is_long(e)) - NOUVEAU_ERR("0x%08x\n", e->inst[1]); - } -#endif - - /* SIFC_HEIGHT/SIFC_WIDTH of 65536 do not work, and are not reported - * as data error either. hw bug ? 
*/ -#define SIFC_MAX_WIDTH (65536 - 256) - offset = 0; - width = p->exec_size * 4; - while (width > 0) { - nv50_upload_sifc(nv50, p->bo, offset, NOUVEAU_BO_VRAM, - NV50_2D_DST_FORMAT_R8_UNORM, 65536, 1, 262144, - &up[offset / 4], NV50_2D_SIFC_FORMAT_R8_UNORM, - 0, 0, 0, MIN2(SIFC_MAX_WIDTH, width), 1, 1); - width -= SIFC_MAX_WIDTH; - offset += SIFC_MAX_WIDTH; - } - BEGIN_RING(chan, tesla, NV50TCL_CODE_CB_FLUSH, 1); - OUT_RING (chan, 0); - - FREE(up); -} - -struct nouveau_stateobj * -nv50_vertprog_validate(struct nv50_context *nv50) -{ - struct nouveau_grobj *tesla = nv50->screen->tesla; - struct nv50_program *p = nv50->vertprog; - struct nouveau_stateobj *so; - - if (!p->translated) { - nv50_program_validate(nv50, p); - if (!p->translated) - assert(0); - } - - nv50_program_validate_data(nv50, p); - nv50_program_validate_code(nv50, p); - - if (!(nv50->dirty & NV50_NEW_VERTPROG)) - return NULL; - - so = so_new(5, 7, 2); - so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2); - so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | - NOUVEAU_BO_HIGH, 0, 0); - so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | - NOUVEAU_BO_LOW, 0, 0); - so_method(so, tesla, NV50TCL_VP_ATTR_EN_0, 2); - so_data (so, p->cfg.attr[0]); - so_data (so, p->cfg.attr[1]); - so_method(so, tesla, NV50TCL_VP_REG_ALLOC_RESULT, 1); - so_data (so, p->cfg.high_result); - so_method(so, tesla, NV50TCL_VP_REG_ALLOC_TEMP, 1); - so_data (so, p->cfg.high_temp); - so_method(so, tesla, NV50TCL_VP_START_ID, 1); - so_data (so, 0); /* program start offset */ - return so; -} - -struct nouveau_stateobj * -nv50_fragprog_validate(struct nv50_context *nv50) -{ - struct nouveau_grobj *tesla = nv50->screen->tesla; - struct nv50_program *p = nv50->fragprog; - struct nouveau_stateobj *so; - - if (!p->translated) { - nv50_program_validate(nv50, p); - if (!p->translated) - assert(0); - } - - nv50_program_validate_data(nv50, p); - nv50_program_validate_code(nv50, p); - - if (!(nv50->dirty & NV50_NEW_FRAGPROG)) - return 
NULL; - - so = so_new(6, 7, 2); - so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2); - so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | - NOUVEAU_BO_HIGH, 0, 0); - so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | - NOUVEAU_BO_LOW, 0, 0); - so_method(so, tesla, NV50TCL_FP_REG_ALLOC_TEMP, 1); - so_data (so, p->cfg.high_temp); - so_method(so, tesla, NV50TCL_FP_RESULT_COUNT, 1); - so_data (so, p->cfg.high_result); - so_method(so, tesla, NV50TCL_FP_CONTROL, 1); - so_data (so, p->cfg.regs[2]); - so_method(so, tesla, NV50TCL_FP_CTRL_UNK196C, 1); - so_data (so, p->cfg.regs[3]); - so_method(so, tesla, NV50TCL_FP_START_ID, 1); - so_data (so, 0); /* program start offset */ - return so; -} - -struct nouveau_stateobj * -nv50_geomprog_validate(struct nv50_context *nv50) -{ - struct nouveau_grobj *tesla = nv50->screen->tesla; - struct nv50_program *p = nv50->geomprog; - struct nouveau_stateobj *so; - - if (!p->translated) { - nv50_program_validate(nv50, p); - if (!p->translated) - assert(0); - } - - nv50_program_validate_data(nv50, p); - nv50_program_validate_code(nv50, p); - - if (!(nv50->dirty & NV50_NEW_GEOMPROG)) - return NULL; - - so = so_new(6, 7, 2); - so_method(so, tesla, NV50TCL_GP_ADDRESS_HIGH, 2); - so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | - NOUVEAU_BO_HIGH, 0, 0); - so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | - NOUVEAU_BO_LOW, 0, 0); - so_method(so, tesla, NV50TCL_GP_REG_ALLOC_TEMP, 1); - so_data (so, p->cfg.high_temp); - so_method(so, tesla, NV50TCL_GP_REG_ALLOC_RESULT, 1); - so_data (so, p->cfg.high_result); - so_method(so, tesla, NV50TCL_GP_OUTPUT_PRIMITIVE_TYPE, 1); - so_data (so, p->cfg.prim_type); - so_method(so, tesla, NV50TCL_GP_VERTEX_OUTPUT_COUNT, 1); - so_data (so, p->cfg.vert_count); - so_method(so, tesla, NV50TCL_GP_START_ID, 1); - so_data (so, 0); - return so; -} - -static uint32_t -nv50_pntc_replace(struct nv50_context *nv50, uint32_t pntc[8], unsigned base) -{ - struct nv50_program *vp; - struct 
nv50_program *fp = nv50->fragprog; - unsigned i, c, m = base; - uint32_t origin = 0x00000010; - - vp = nv50->geomprog ? nv50->geomprog : nv50->vertprog; - - /* XXX: this might not work correctly in all cases yet - we'll - * just assume that an FP generic input that is not written in - * the VP is PointCoord. - */ - memset(pntc, 0, 8 * sizeof(uint32_t)); - - for (i = 0; i < fp->cfg.in_nr; i++) { - unsigned j, n = popcnt4(fp->cfg.in[i].mask); - - if (fp->cfg.in[i].sn != TGSI_SEMANTIC_GENERIC) { - m += n; - continue; - } - - for (j = 0; j < vp->cfg.out_nr; ++j) - if (vp->cfg.out[j].sn == fp->cfg.in[i].sn && - vp->cfg.out[j].si == fp->cfg.in[i].si) - break; - - if (j < vp->info.num_outputs) { - ubyte enable = - (nv50->rasterizer->pipe.sprite_coord_enable >> vp->cfg.out[j].si) & 1; - - if (enable == 0) { - m += n; - continue; - } - } - - /* this is either PointCoord or replaced by sprite coords */ - for (c = 0; c < 4; c++) { - if (!(fp->cfg.in[i].mask & (1 << c))) - continue; - pntc[m / 8] |= (c + 1) << ((m % 8) * 4); - ++m; - } - } - return (nv50->rasterizer->pipe.sprite_coord_mode == PIPE_SPRITE_COORD_LOWER_LEFT ? 0 : origin); -} - -static int -nv50_vec4_map(uint32_t *map32, int mid, uint8_t zval, uint32_t lin[4], - struct nv50_sreg4 *fpi, struct nv50_sreg4 *vpo) -{ - int c; - uint8_t mv = vpo->mask, mf = fpi->mask, oid = vpo->hw; - uint8_t *map = (uint8_t *)map32; - - for (c = 0; c < 4; ++c) { - if (mf & 1) { - if (fpi->linear == TRUE) - lin[mid / 32] |= 1 << (mid % 32); - if (mv & 1) - map[mid] = oid; - else - map[mid] = (c == 3) ? 
(zval + 1) : zval; - ++mid; - } - - oid += mv & 1; - mf >>= 1; - mv >>= 1; - } - - return mid; -} - -struct nouveau_stateobj * -nv50_fp_linkage_validate(struct nv50_context *nv50) -{ - struct nouveau_grobj *tesla = nv50->screen->tesla; - struct nv50_program *vp = nv50->vertprog; - struct nv50_program *fp = nv50->fragprog; - struct nouveau_stateobj *so; - struct nv50_sreg4 dummy; - int i, n, c, m = 0; - uint32_t map[16], lin[4], reg[6], pcrd[8]; - uint8_t zval = 0x40; - - if (nv50->geomprog) { - vp = nv50->geomprog; - zval = 0x80; - } - memset(map, 0, sizeof(map)); - memset(lin, 0, sizeof(lin)); - - reg[1] = 0x00000004; /* low and high clip distance map ids */ - reg[2] = 0x00000000; /* layer index map id (disabled, GP only) */ - reg[3] = 0x00000000; /* point size map id & enable */ - reg[5] = 0x00000000; /* primitive ID map slot */ - reg[0] = fp->cfg.regs[0]; /* colour semantic reg */ - reg[4] = fp->cfg.regs[1]; /* interpolant info */ - - dummy.linear = FALSE; - dummy.mask = 0xf; /* map all components of HPOS */ - m = nv50_vec4_map(map, m, zval, lin, &dummy, &vp->cfg.out[0]); - - dummy.mask = 0x0; - - if (vp->cfg.clpd < 0x40) { - for (c = 0; c < vp->cfg.clpd_nr; ++c) { - map[m / 4] |= (vp->cfg.clpd + c) << ((m % 4) * 8); - ++m; - } - reg[1] = (m << 8); - } - - reg[0] |= m << 8; /* adjust BFC0 id */ - - /* if light_twoside is active, it seems FFC0_ID == BFC0_ID is bad */ - if (nv50->rasterizer->pipe.light_twoside) { - struct nv50_sreg4 *vpo = &vp->cfg.two_side[0]; - struct nv50_sreg4 *fpi = &fp->cfg.two_side[0]; - - m = nv50_vec4_map(map, m, zval, lin, &fpi[0], &vpo[0]); - m = nv50_vec4_map(map, m, zval, lin, &fpi[1], &vpo[1]); - } - - reg[0] += m - 4; /* adjust FFC0 id */ - reg[4] |= m << 8; /* set mid where 'normal' FP inputs start */ - - for (i = 0; i < fp->cfg.in_nr; i++) { - /* maybe even remove these from cfg.io */ - if (fp->cfg.in[i].sn == TGSI_SEMANTIC_POSITION || - fp->cfg.in[i].sn == TGSI_SEMANTIC_FACE) - continue; - - for (n = 0; n < vp->cfg.out_nr; ++n) - 
if (vp->cfg.out[n].sn == fp->cfg.in[i].sn && - vp->cfg.out[n].si == fp->cfg.in[i].si) - break; - - m = nv50_vec4_map(map, m, zval, lin, &fp->cfg.in[i], - (n < vp->cfg.out_nr) ? - &vp->cfg.out[n] : &dummy); - } - /* PrimitiveID either is replaced by the system value, or - * written by the geometry shader into an output register - */ - if (fp->cfg.prim_id < 0x40) { - map[m / 4] |= vp->cfg.prim_id << ((m % 4) * 8); - reg[5] = m++; - } - - if (nv50->rasterizer->pipe.point_size_per_vertex) { - map[m / 4] |= vp->cfg.psiz << ((m % 4) * 8); - reg[3] = (m++ << 4) | 1; - } - - /* now fill the stateobj (at most 28 so_data) */ - so = so_new(10, 54, 0); - - n = (m + 3) / 4; - assert(m <= 64); - if (vp->type == PIPE_SHADER_GEOMETRY) { - so_method(so, tesla, NV50TCL_GP_RESULT_MAP_SIZE, 1); - so_data (so, m); - so_method(so, tesla, NV50TCL_GP_RESULT_MAP(0), n); - so_datap (so, map, n); - } else { - so_method(so, tesla, NV50TCL_VP_GP_BUILTIN_ATTR_EN, 1); - so_data (so, vp->cfg.regs[0]); - - so_method(so, tesla, NV50TCL_MAP_SEMANTIC_4, 1); - so_data (so, reg[5]); - - so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 1); - so_data (so, m); - so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), n); - so_datap (so, map, n); - } - - so_method(so, tesla, NV50TCL_MAP_SEMANTIC_0, 4); - so_datap (so, reg, 4); - - so_method(so, tesla, NV50TCL_FP_INTERPOLANT_CTRL, 1); - so_data (so, reg[4]); - - so_method(so, tesla, NV50TCL_NOPERSPECTIVE_BITMAP(0), 4); - so_datap (so, lin, 4); - - if (nv50->rasterizer->pipe.sprite_coord_enable) { - so_method(so, tesla, NV50TCL_POINT_SPRITE_CTRL, 1); - so_data (so, - nv50_pntc_replace(nv50, pcrd, (reg[4] >> 8) & 0xff)); - - so_method(so, tesla, NV50TCL_POINT_COORD_REPLACE_MAP(0), 8); - so_datap (so, pcrd, 8); - } - - so_method(so, tesla, NV50TCL_GP_ENABLE, 1); - so_data (so, (vp->type == PIPE_SHADER_GEOMETRY) ? 
1 : 0); - - return so; -} - -static int -construct_vp_gp_mapping(uint32_t *map32, int m, - struct nv50_program *vp, struct nv50_program *gp) -{ - uint8_t *map = (uint8_t *)map32; - int i, j, c; - - for (i = 0; i < gp->cfg.in_nr; ++i) { - uint8_t oid = 0, mv = 0, mg = gp->cfg.in[i].mask; - - for (j = 0; j < vp->cfg.out_nr; ++j) { - if (vp->cfg.out[j].sn == gp->cfg.in[i].sn && - vp->cfg.out[j].si == gp->cfg.in[i].si) { - mv = vp->cfg.out[j].mask; - oid = vp->cfg.out[j].hw; - break; - } - } - - for (c = 0; c < 4; ++c, mv >>= 1, mg >>= 1) { - if (mg & mv & 1) - map[m++] = oid; - else - if (mg & 1) - map[m++] = (c == 3) ? 0x41 : 0x40; - oid += mv & 1; - } - } - return m; -} - -struct nouveau_stateobj * -nv50_gp_linkage_validate(struct nv50_context *nv50) -{ - struct nouveau_grobj *tesla = nv50->screen->tesla; - struct nouveau_stateobj *so; - struct nv50_program *vp = nv50->vertprog; - struct nv50_program *gp = nv50->geomprog; - uint32_t map[16]; - int m = 0; - - if (!gp) - return NULL; - memset(map, 0, sizeof(map)); - - m = construct_vp_gp_mapping(map, m, vp, gp); - - so = so_new(3, 24 - 3, 0); - - so_method(so, tesla, NV50TCL_VP_GP_BUILTIN_ATTR_EN, 1); - so_data (so, vp->cfg.regs[0] | gp->cfg.regs[0]); - - assert(m <= 32); - so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 1); - so_data (so, m); - - m = (m + 3) / 4; - so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), m); - so_datap (so, map, m); - - return so; -} - -void -nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p) -{ - while (p->exec_head) { - struct nv50_program_exec *e = p->exec_head; - - p->exec_head = e->next; - FREE(e); - } - p->exec_tail = NULL; - p->exec_size = 0; + nouveau_bo_ref(NULL, &p->bo); - nouveau_bo_ref(NULL, &p->bo); + so_ref(NULL, &p->so); - FREE(p->immd); - nouveau_resource_free(&p->data[0]); + if (p->code) + FREE(p->code); - p->translated = 0; + p->translated = FALSE; } diff --git a/src/gallium/drivers/nv50/nv50_program.h b/src/gallium/drivers/nv50/nv50_program.h index 
1e3ad6bff0..654bce59f3 100644 --- a/src/gallium/drivers/nv50/nv50_program.h +++ b/src/gallium/drivers/nv50/nv50_program.h @@ -1,75 +1,116 @@ -#ifndef __NV50_PROGRAM_H__ -#define __NV50_PROGRAM_H__ +/* + * Copyright 2010 Ben Skeggs + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef __NV50_PROG_H__ +#define __NV50_PROG_H__ #include "pipe/p_state.h" #include "tgsi/tgsi_scan.h" +#include "nouveau/nouveau_class.h" -struct nv50_program_exec { - struct nv50_program_exec *next; +struct nv50_varying { + uint8_t id; /* tgsi index */ + uint8_t hw; /* hw index, nv50 wants flat FP inputs last */ - unsigned inst[2]; - struct { - int index; - unsigned mask; - unsigned shift; - } param; -}; - -struct nv50_sreg4 { - uint8_t hw; /* hw index, nv50 wants flat FP inputs last */ - uint8_t id; /* tgsi index */ - - uint8_t mask; - boolean linear; + uint8_t mask : 4; + uint8_t linear : 1; + uint8_t pad : 3; - ubyte sn, si; /* semantic name & index */ + ubyte sn; /* semantic name */ + ubyte si; /* semantic index */ }; struct nv50_program { - struct pipe_shader_state pipe; - struct tgsi_shader_info info; - boolean translated; - - unsigned type; - struct nv50_program_exec *exec_head; - struct nv50_program_exec *exec_tail; - unsigned exec_size; - struct nouveau_resource *data[1]; - unsigned data_start[1]; - - struct nouveau_bo *bo; - - uint32_t *immd; - unsigned immd_nr; - unsigned param_nr; - - struct { - unsigned high_temp; - unsigned high_result; - - uint32_t attr[2]; - uint32_t regs[4]; - - /* for VPs, io_nr doesn't count 'private' results (PSIZ etc.) 
*/ - unsigned in_nr, out_nr; - struct nv50_sreg4 in[PIPE_MAX_SHADER_INPUTS]; - struct nv50_sreg4 out[PIPE_MAX_SHADER_OUTPUTS]; - - /* FP colour inputs, VP/GP back colour outputs */ - struct nv50_sreg4 two_side[2]; - - /* GP only */ - unsigned vert_count; - uint8_t prim_type; - - /* VP & GP only */ - uint8_t clpd, clpd_nr; - uint8_t psiz; - uint8_t edgeflag_in; - - /* FP & GP only */ - uint8_t prim_id; - } cfg; + struct pipe_shader_state pipe; + + ubyte type; + boolean translated; + + struct nouveau_bo *bo; + struct nouveau_stateobj *so; + + uint32_t *code; + unsigned code_size; + unsigned code_start; /* offset inside bo */ + uint32_t *immd; + unsigned immd_size; + unsigned parm_size; /* size limit of uniform buffer */ + + ubyte max_gpr; /* REG_ALLOC_TEMP */ + ubyte max_out; /* REG_ALLOC_RESULT or FP_RESULT_COUNT */ + + ubyte in_nr; + ubyte out_nr; + struct nv50_varying in[16]; + struct nv50_varying out[16]; + + struct { + uint32_t attrs[3]; /* VP_ATTR_EN_0,1 and VP_GP_BUILTIN_ATTR_EN */ + ubyte psiz; + ubyte bfc[2]; + ubyte edgeflag; + ubyte clpd; + ubyte clpd_nr; + } vp; + + struct { + uint32_t flags[2]; /* 0x19a8, 196c */ + uint32_t interp; /* 0x1988 */ + uint32_t colors; /* 0x1904 */ + } fp; + + struct { + ubyte primid; /* primitive id output register */ + uint8_t vert_count; + uint8_t prim_type; /* point, line strip or tri strip */ + } gp; + + void *fixups; + unsigned num_fixups; }; -#endif +#define NV50_INTERP_LINEAR (1 << 0) +#define NV50_INTERP_FLAT (1 << 1) +#define NV50_INTERP_CENTROID (1 << 2) + +struct nv50_translation_info { + struct nv50_program *p; + unsigned inst_nr; + ubyte input_file; + ubyte output_file; + ubyte input_map[PIPE_MAX_SHADER_INPUTS][4]; + ubyte output_map[PIPE_MAX_SHADER_OUTPUTS][4]; + ubyte interp_mode[PIPE_MAX_SHADER_INPUTS]; + int input_access[PIPE_MAX_SHADER_INPUTS][4]; + int output_access[PIPE_MAX_SHADER_OUTPUTS][4]; + boolean indirect_inputs; + boolean indirect_outputs; + struct tgsi_shader_info scan; + uint32_t *immd32; + 
unsigned immd32_nr; + ubyte edgeflag_out; +}; + +int nv50_generate_code(struct nv50_translation_info *ti); +boolean nv50_program_tx(struct nv50_program *p); + +#endif /* __NV50_PROG_H__ */ diff --git a/src/gallium/drivers/nv50/nv50_push.c b/src/gallium/drivers/nv50/nv50_push.c index c3ac804146..481182dd8d 100644 --- a/src/gallium/drivers/nv50/nv50_push.c +++ b/src/gallium/drivers/nv50/nv50_push.c @@ -227,7 +227,7 @@ nv50_push_elements_instanced(struct pipe_context *pipe, ctx.idxbuf = NULL; ctx.vtx_size = 0; ctx.edgeflag = 0.5f; - ctx.edgeflag_attr = nv50->vertprog->cfg.edgeflag_in; + ctx.edgeflag_attr = nv50->vertprog->vp.edgeflag; /* map vertex buffers, determine vertex size */ for (i = 0; i < nv50->vtxelt->num_elements; i++) { diff --git a/src/gallium/drivers/nv50/nv50_shader_state.c b/src/gallium/drivers/nv50/nv50_shader_state.c new file mode 100644 index 0000000000..f7e6355286 --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_shader_state.c @@ -0,0 +1,619 @@ +/* + * Copyright 2008 Ben Skeggs + * Copyright 2010 Christoph Bumiller + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" +#include "util/u_inlines.h" + +#include "nv50_context.h" +#include "nv50_transfer.h" + +static void +nv50_transfer_constbuf(struct nv50_context *nv50, + struct pipe_resource *buf, unsigned size, unsigned cbi) +{ + struct pipe_context *pipe = &nv50->pipe; + struct pipe_transfer *transfer; + struct nouveau_channel *chan = nv50->screen->base.channel; + struct nouveau_grobj *tesla = nv50->screen->tesla; + uint32_t *map; + unsigned count, start; + + map = pipe_buffer_map(pipe, buf, PIPE_TRANSFER_READ, &transfer); + if (!map) + return; + + count = MIN2(buf->width0, size); + start = 0; + + while (count) { + unsigned nr = count; + nr = MIN2(nr, 2047); + + /* FIXME: emit relocs for unsuiTed MM */ + BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1); + OUT_RING (chan, (start << 8) | cbi); + BEGIN_RING_NI(chan, tesla, NV50TCL_CB_DATA(0), nr); + OUT_RINGp (chan, map, nr); + + count -= nr; + start += nr; + map += nr; + } + + pipe_buffer_unmap(pipe, buf, transfer); +} + +static void +nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p) +{ + struct nouveau_channel *chan = nv50->screen->base.channel; + struct nouveau_grobj *tesla = nv50->screen->tesla; + unsigned cbi; + + if (p->immd_size) { + uint32_t *data = p->immd; + unsigned count = p->immd_size / 4; + unsigned start = 0; + + while (count) { + unsigned nr = count; + nr = MIN2(nr, 2047); + + BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1); + OUT_RING (chan, (start << 8) | NV50_CB_PMISC); + BEGIN_RING_NI(chan, tesla, NV50TCL_CB_DATA(0), nr); + OUT_RINGp (chan, data, nr); + + count -= nr; + start += nr; + data += nr; + } + } + + if (p->parm_size == 0) + return; + + switch 
(p->type) { + case PIPE_SHADER_VERTEX: + cbi = NV50_CB_PVP; + break; + case PIPE_SHADER_FRAGMENT: + cbi = NV50_CB_PFP; + break; + case PIPE_SHADER_GEOMETRY: + cbi = NV50_CB_PGP; + break; + default: + assert(0); + break; + } + + nv50_transfer_constbuf(nv50, nv50->constbuf[p->type], p->parm_size, cbi); +} + +static void +nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p) +{ + struct nouveau_channel *chan = nv50->screen->base.channel; + struct nouveau_grobj *tesla = nv50->screen->tesla; + struct nouveau_grobj *eng2d = nv50->screen->eng2d; + int ret; + unsigned offset; + unsigned size = p->code_size; + uint32_t *data = p->code; + + assert(p->translated); + + /* TODO: use a single bo (for each type) for shader code */ + if (p->bo) + return; + ret = nouveau_bo_new(chan->device, NOUVEAU_BO_VRAM, 0x100, size, &p->bo); + assert(!ret); + + offset = p->code_start = 0; + + BEGIN_RING(chan, eng2d, NV50_2D_DST_FORMAT, 2); + OUT_RING (chan, NV50_2D_DST_FORMAT_R8_UNORM); + OUT_RING (chan, 1); + BEGIN_RING(chan, eng2d, NV50_2D_DST_PITCH, 1); + OUT_RING (chan, 0x40000); + BEGIN_RING(chan, eng2d, NV50_2D_DST_WIDTH, 2); + OUT_RING (chan, 0x10000); + OUT_RING (chan, 1); + + while (size) { + unsigned nr = size / 4; + + if (AVAIL_RING(chan) < 32) + FIRE_RING(chan); + + nr = MIN2(nr, AVAIL_RING(chan) - 18); + nr = MIN2(nr, 1792); + if (nr < (size / 4)) + nr &= ~0x3f; + assert(!(size & 3)); + + BEGIN_RING(chan, eng2d, NV50_2D_DST_ADDRESS_HIGH, 2); + OUT_RELOCh(chan, p->bo, offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + OUT_RELOCl(chan, p->bo, offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + BEGIN_RING(chan, eng2d, NV50_2D_SIFC_BITMAP_ENABLE, 2); + OUT_RING (chan, 0); + OUT_RING (chan, NV50_2D_SIFC_FORMAT_R8_UNORM); + BEGIN_RING(chan, eng2d, NV50_2D_SIFC_WIDTH, 10); + OUT_RING (chan, nr * 4); + OUT_RING (chan, 1); + OUT_RING (chan, 0); + OUT_RING (chan, 1); + OUT_RING (chan, 0); + OUT_RING (chan, 1); + OUT_RING (chan, 0); + OUT_RING (chan, 0); + OUT_RING (chan, 0); + 
OUT_RING (chan, 0); + + BEGIN_RING_NI(chan, eng2d, NV50_2D_SIFC_DATA, nr); + OUT_RINGp (chan, data, nr); + + data += nr; + offset += nr * 4; + size -= nr * 4; + } + + BEGIN_RING(chan, tesla, NV50TCL_CODE_CB_FLUSH, 1); + OUT_RING (chan, 0); +} + +static void +nv50_vp_update_stateobj(struct nv50_context *nv50, struct nv50_program *p) +{ + struct nouveau_grobj *tesla = nv50->screen->tesla; + struct nouveau_stateobj *so = so_new(5, 7, 2); + + nv50_program_validate_code(nv50, p); + + so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2); + so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | + NOUVEAU_BO_HIGH, 0, 0); + so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | + NOUVEAU_BO_LOW, 0, 0); + so_method(so, tesla, NV50TCL_VP_ATTR_EN_0, 2); + so_data (so, p->vp.attrs[0]); + so_data (so, p->vp.attrs[1]); + so_method(so, tesla, NV50TCL_VP_REG_ALLOC_RESULT, 1); + so_data (so, p->max_out); + so_method(so, tesla, NV50TCL_VP_REG_ALLOC_TEMP, 1); + so_data (so, p->max_gpr); + so_method(so, tesla, NV50TCL_VP_START_ID, 1); + so_data (so, p->code_start); + + so_ref(so, &p->so); + so_ref(NULL, &so); +} + +static void +nv50_fp_update_stateobj(struct nv50_context *nv50, struct nv50_program *p) +{ + struct nouveau_grobj *tesla = nv50->screen->tesla; + struct nouveau_stateobj *so = so_new(6, 7, 2); + + nv50_program_validate_code(nv50, p); + + so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2); + so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | + NOUVEAU_BO_HIGH, 0, 0); + so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | + NOUVEAU_BO_LOW, 0, 0); + so_method(so, tesla, NV50TCL_FP_REG_ALLOC_TEMP, 1); + so_data (so, p->max_gpr); + so_method(so, tesla, NV50TCL_FP_RESULT_COUNT, 1); + so_data (so, p->max_out); + so_method(so, tesla, NV50TCL_FP_CONTROL, 1); + so_data (so, p->fp.flags[0]); + so_method(so, tesla, NV50TCL_FP_CTRL_UNK196C, 1); + so_data (so, p->fp.flags[1]); + so_method(so, tesla, NV50TCL_FP_START_ID, 1); + so_data (so, p->code_start); + + so_ref(so, 
&p->so); + so_ref(NULL, &so); +} + +static void +nv50_gp_update_stateobj(struct nv50_context *nv50, struct nv50_program *p) +{ + struct nouveau_grobj *tesla = nv50->screen->tesla; + struct nouveau_stateobj *so = so_new(6, 7, 2); + + nv50_program_validate_code(nv50, p); + + so_method(so, tesla, NV50TCL_GP_ADDRESS_HIGH, 2); + so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | + NOUVEAU_BO_HIGH, 0, 0); + so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | + NOUVEAU_BO_LOW, 0, 0); + so_method(so, tesla, NV50TCL_GP_REG_ALLOC_TEMP, 1); + so_data (so, p->max_gpr); + so_method(so, tesla, NV50TCL_GP_REG_ALLOC_RESULT, 1); + so_data (so, p->max_out); + so_method(so, tesla, NV50TCL_GP_OUTPUT_PRIMITIVE_TYPE, 1); + so_data (so, p->gp.prim_type); + so_method(so, tesla, NV50TCL_GP_VERTEX_OUTPUT_COUNT, 1); + so_data (so, p->gp.vert_count); + so_method(so, tesla, NV50TCL_GP_START_ID, 1); + so_data (so, p->code_start); + + so_ref(so, &p->so); + so_ref(NULL, &so); +} + +static boolean +nv50_program_validate(struct nv50_program *p) +{ + p->translated = nv50_program_tx(p); + assert(p->translated); + return p->translated; +} + +struct nouveau_stateobj * +nv50_vertprog_validate(struct nv50_context *nv50) +{ + struct nv50_program *p = nv50->vertprog; + struct nouveau_stateobj *so = NULL; + + if (!p->translated) { + if (nv50_program_validate(p)) + nv50_vp_update_stateobj(nv50, p); + else + return NULL; + } + + if (nv50->dirty & NV50_NEW_VERTPROG_CB) + nv50_program_validate_data(nv50, p); + + if (!(nv50->dirty & NV50_NEW_VERTPROG)) + return NULL; + + nv50_program_validate_code(nv50, p); + + so_ref(p->so, &so); + return so; +} + +struct nouveau_stateobj * +nv50_fragprog_validate(struct nv50_context *nv50) +{ + struct nv50_program *p = nv50->fragprog; + struct nouveau_stateobj *so = NULL; + + if (!p->translated) { + if (nv50_program_validate(p)) + nv50_fp_update_stateobj(nv50, p); + else + return NULL; + } + + if (nv50->dirty & NV50_NEW_FRAGPROG_CB) + 
nv50_program_validate_data(nv50, p); + + if (!(nv50->dirty & NV50_NEW_FRAGPROG)) + return NULL; + + nv50_program_validate_code(nv50, p); + + so_ref(p->so, &so); + return so; +} + +struct nouveau_stateobj * +nv50_geomprog_validate(struct nv50_context *nv50) +{ + struct nv50_program *p = nv50->geomprog; + struct nouveau_stateobj *so = NULL; + + if (!p->translated) { + if (nv50_program_validate(p)) + nv50_gp_update_stateobj(nv50, p); + else + return NULL; + } + + if (nv50->dirty & NV50_NEW_GEOMPROG_CB) + nv50_program_validate_data(nv50, p); + + if (!(nv50->dirty & NV50_NEW_GEOMPROG)) + return NULL; + + nv50_program_validate_code(nv50, p); + + so_ref(p->so, &so); + return so; +} + +/* XXX: this might not work correctly in all cases yet: we assume that + * an FP generic input that is not written in the VP is gl_PointCoord. + */ +static uint32_t +nv50_pntc_replace(struct nv50_context *nv50, uint32_t pntc[8], unsigned m) +{ + struct nv50_program *vp = nv50->vertprog; + struct nv50_program *fp = nv50->fragprog; + unsigned i, c; + + memset(pntc, 0, 8 * sizeof(uint32_t)); + + if (nv50->geomprog) + vp = nv50->geomprog; + + for (i = 0; i < fp->in_nr; i++) { + unsigned j, n = util_bitcount(fp->in[i].mask); + + if (fp->in[i].sn != TGSI_SEMANTIC_GENERIC) { + m += n; + continue; + } + + for (j = 0; j < vp->out_nr; ++j) + if (vp->out[j].sn == fp->in[i].sn && vp->out[j].si == fp->in[i].si) + break; + + if (j < vp->out_nr) { + ubyte en = nv50->rasterizer->pipe.sprite_coord_enable; + + if (!(en & (1 << vp->out[j].si))) { + m += n; + continue; + } + } + + /* this is either PointCoord or replaced by sprite coords */ + for (c = 0; c < 4; c++) { + if (!(fp->in[i].mask & (1 << c))) + continue; + pntc[m / 8] |= (c + 1) << ((m % 8) * 4); + ++m; + } + } + if (nv50->rasterizer->pipe.sprite_coord_mode == PIPE_SPRITE_COORD_LOWER_LEFT) + return 0; + return (1 << 4); +} + +static int +nv50_vec4_map(uint32_t *map32, int mid, uint32_t lin[4], + struct nv50_varying *in, struct nv50_varying *out) +{ + 
int c; + uint8_t mv = out->mask, mf = in->mask, oid = out->hw; + uint8_t *map = (uint8_t *)map32; + + for (c = 0; c < 4; ++c) { + if (mf & 1) { + if (in->linear) + lin[mid / 32] |= 1 << (mid % 32); + if (mv & 1) + map[mid] = oid; + else + if (c == 3) + map[mid] |= 1; + ++mid; + } + + oid += mv & 1; + mf >>= 1; + mv >>= 1; + } + + return mid; +} + +struct nouveau_stateobj * +nv50_fp_linkage_validate(struct nv50_context *nv50) +{ + struct nouveau_grobj *tesla = nv50->screen->tesla; + struct nv50_program *vp; + struct nv50_program *fp = nv50->fragprog; + struct nouveau_stateobj *so; + struct nv50_varying dummy; + int i, n, c, m; + + uint32_t map[16], lin[4], pntc[8]; + + uint32_t interp = fp->fp.interp; + uint32_t colors = fp->fp.colors; + uint32_t clip = 0x04; + uint32_t psiz = 0x000; + uint32_t primid = 0; + uint32_t sysval = 0; + + if (nv50->geomprog) { + vp = nv50->geomprog; + memset(map, 0x80, sizeof(map)); + } else { + vp = nv50->vertprog; + memset(map, 0x40, sizeof(map)); + } + memset(lin, 0, sizeof(lin)); + + dummy.linear = 0; + dummy.mask = 0xf; /* map all components of HPOS */ + m = nv50_vec4_map(map, 0, lin, &dummy, &vp->out[0]); + + if (vp->vp.clpd < 0x40) { + for (c = 0; c < vp->vp.clpd_nr; ++c) { + map[m / 4] |= (vp->vp.clpd + c) << ((m % 4) * 8); + ++m; + } + clip |= vp->vp.clpd_nr << 8; + } + + colors |= m << 8; /* adjust BFC0 id */ + + /* if light_twoside is active, it seems FFC0_ID == BFC0_ID is bad */ + if (nv50->rasterizer->pipe.light_twoside) { + for (i = 0; i < 2; ++i) + m = nv50_vec4_map(map, m, lin, + &fp->in[fp->vp.bfc[i]], + &vp->out[vp->vp.bfc[i]]); + } + + colors += m - 4; /* adjust FFC0 id */ + interp |= m << 8; /* set mid where 'normal' FP inputs start */ + + dummy.mask = 0x0; + for (i = 0; i < fp->in_nr; i++) { + for (n = 0; n < vp->out_nr; ++n) + if (vp->out[n].sn == fp->in[i].sn && + vp->out[n].si == fp->in[i].si) + break; + + m = nv50_vec4_map(map, m, lin, + &fp->in[i], (n < vp->out_nr) ? 
&vp->out[n] : &dummy); + } + /* PrimitiveID either is replaced by the system value, or + * written by the geometry shader into an output register + */ + if (fp->gp.primid < 0x40) { + map[m / 4] |= vp->gp.primid << ((m % 4) * 8); + primid = m++; + } + + if (nv50->rasterizer->pipe.point_size_per_vertex) { + map[m / 4] |= vp->vp.psiz << ((m % 4) * 8); + psiz = (m++ << 4) | 1; + } + + /* now fill the stateobj (at most 28 so_data) */ + so = so_new(10, 54, 0); + + n = (m + 3) / 4; + assert(m <= 64); + if (vp->type == PIPE_SHADER_GEOMETRY) { + so_method(so, tesla, NV50TCL_GP_RESULT_MAP_SIZE, 1); + so_data (so, m); + so_method(so, tesla, NV50TCL_GP_RESULT_MAP(0), n); + so_datap (so, map, n); + } else { + so_method(so, tesla, NV50TCL_VP_GP_BUILTIN_ATTR_EN, 1); + so_data (so, vp->vp.attrs[2]); + + so_method(so, tesla, NV50TCL_MAP_SEMANTIC_4, 1); + so_data (so, primid); + + so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 1); + so_data (so, m); + so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), n); + so_datap (so, map, n); + } + + //colors = 0x01000404; + so_method(so, tesla, NV50TCL_MAP_SEMANTIC_0, 4); + so_data (so, colors); + so_data (so, clip); + so_data (so, sysval); + so_data (so, psiz); + + so_method(so, tesla, NV50TCL_FP_INTERPOLANT_CTRL, 1); + so_data (so, interp); + + so_method(so, tesla, NV50TCL_NOPERSPECTIVE_BITMAP(0), 4); + so_datap (so, lin, 4); + + if (nv50->rasterizer->pipe.sprite_coord_enable) { + so_method(so, tesla, NV50TCL_POINT_SPRITE_CTRL, 1); + so_data (so, + nv50_pntc_replace(nv50, pntc, (interp >> 8) & 0xff)); + + so_method(so, tesla, NV50TCL_POINT_COORD_REPLACE_MAP(0), 8); + so_datap (so, pntc, 8); + } + + so_method(so, tesla, NV50TCL_GP_ENABLE, 1); + so_data (so, (vp->type == PIPE_SHADER_GEOMETRY) ? 
1 : 0); + + return so; +} + +static int +nv50_vp_gp_mapping(uint32_t *map32, int m, + struct nv50_program *vp, struct nv50_program *gp) +{ + uint8_t *map = (uint8_t *)map32; + int i, j, c; + + for (i = 0; i < gp->in_nr; ++i) { + uint8_t oid = 0, mv = 0, mg = gp->in[i].mask; + + for (j = 0; j < vp->out_nr; ++j) { + if (vp->out[j].sn == gp->in[i].sn && + vp->out[j].si == gp->in[i].si) { + mv = vp->out[j].mask; + oid = vp->out[j].hw; + break; + } + } + + for (c = 0; c < 4; ++c, mv >>= 1, mg >>= 1) { + if (mg & mv & 1) + map[m++] = oid; + else + if (mg & 1) + map[m++] = (c == 3) ? 0x41 : 0x40; + oid += mv & 1; + } + } + return m; +} + +struct nouveau_stateobj * +nv50_gp_linkage_validate(struct nv50_context *nv50) +{ + struct nouveau_grobj *tesla = nv50->screen->tesla; + struct nouveau_stateobj *so; + struct nv50_program *vp = nv50->vertprog; + struct nv50_program *gp = nv50->geomprog; + uint32_t map[16]; + int m = 0; + + if (!gp) + return NULL; + memset(map, 0, sizeof(map)); + + m = nv50_vp_gp_mapping(map, m, vp, gp); + + so = so_new(3, 24 - 3, 0); + + so_method(so, tesla, NV50TCL_VP_GP_BUILTIN_ATTR_EN, 1); + so_data (so, vp->vp.attrs[2] | gp->vp.attrs[2]); + + assert(m <= 32); + so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 1); + so_data (so, m); + + m = (m + 3) / 4; + so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), m); + so_datap (so, map, m); + + return so; +} diff --git a/src/gallium/drivers/nv50/nv50_state.c b/src/gallium/drivers/nv50/nv50_state.c index 42c5a58318..0d744ab788 100644 --- a/src/gallium/drivers/nv50/nv50_state.c +++ b/src/gallium/drivers/nv50/nv50_state.c @@ -546,7 +546,6 @@ nv50_vp_state_create(struct pipe_context *pipe, p->pipe.tokens = tgsi_dup_tokens(cso->tokens); p->type = PIPE_SHADER_VERTEX; - tgsi_scan_shader(p->pipe.tokens, &p->info); return (void *)p; } @@ -578,7 +577,6 @@ nv50_fp_state_create(struct pipe_context *pipe, p->pipe.tokens = tgsi_dup_tokens(cso->tokens); p->type = PIPE_SHADER_FRAGMENT; - tgsi_scan_shader(p->pipe.tokens, 
&p->info); return (void *)p; } @@ -610,7 +608,6 @@ nv50_gp_state_create(struct pipe_context *pipe, p->pipe.tokens = tgsi_dup_tokens(cso->tokens); p->type = PIPE_SHADER_GEOMETRY; - tgsi_scan_shader(p->pipe.tokens, &p->info); return (void *)p; } diff --git a/src/gallium/drivers/nv50/nv50_state_validate.c b/src/gallium/drivers/nv50/nv50_state_validate.c index 524696f35d..8d662d8f60 100644 --- a/src/gallium/drivers/nv50/nv50_state_validate.c +++ b/src/gallium/drivers/nv50/nv50_state_validate.c @@ -81,6 +81,9 @@ validate_fb(struct nv50_context *nv50) case PIPE_FORMAT_R16G16B16A16_UNORM: so_data(so, NV50TCL_RT_FORMAT_R16G16B16A16_UNORM); break; + case PIPE_FORMAT_R16G16B16A16_FLOAT: + so_data(so, NV50TCL_RT_FORMAT_R16G16B16A16_FLOAT); + break; case PIPE_FORMAT_R32G32B32A32_FLOAT: so_data(so, NV50TCL_RT_FORMAT_R32G32B32A32_FLOAT); break; @@ -135,6 +138,12 @@ validate_fb(struct nv50_context *nv50) case PIPE_FORMAT_Z32_FLOAT: so_data(so, NV50TCL_ZETA_FORMAT_Z32_FLOAT); break; + case PIPE_FORMAT_Z32_FLOAT_S8X24_USCALED: + so_data(so, NV50TCL_ZETA_FORMAT_Z32_FLOAT_X24S8_UNORM); + break; + case PIPE_FORMAT_Z16_UNORM: + so_data(so, NV50TCL_ZETA_FORMAT_Z16_UNORM); + break; default: NOUVEAU_ERR("AIIII unknown format %s\n", util_format_name(fb->zsbuf->format)); diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c new file mode 100644 index 0000000000..aa15917774 --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c @@ -0,0 +1,1266 @@ + +#include + +#include "nv50_context.h" +#include "nv50_pc.h" + +#include "pipe/p_shader_tokens.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_util.h" + +#include "util/u_simple_list.h" +#include "tgsi/tgsi_dump.h" + +#define BLD_MAX_TEMPS 64 +#define BLD_MAX_ADDRS 4 +#define BLD_MAX_PREDS 4 +#define BLD_MAX_IMMDS 128 + +#define BLD_MAX_COND_NESTING 4 +#define BLD_MAX_LOOP_NESTING 4 +#define BLD_MAX_CALL_NESTING 2 + +/* collects all values assigned to the same TGSI register */ +struct 
bld_value_stack { + struct nv_value *top; + struct nv_value **body; + unsigned size; +}; + +static INLINE void +bld_push_value(struct bld_value_stack *stk) +{ + assert(!stk->size || (stk->body[stk->size - 1] != stk->top)); + + if (!(stk->size % 8)) { + unsigned old_sz = (stk->size + 0) * sizeof(struct nv_value *); + unsigned new_sz = (stk->size + 8) * sizeof(struct nv_value *); + stk->body = (struct nv_value **)REALLOC(stk->body, old_sz, new_sz); + } + stk->body[stk->size++] = stk->top; + stk->top = NULL; +} + +static INLINE void +bld_push_values(struct bld_value_stack *stacks, int n) +{ + int i, c; + + for (i = 0; i < n; ++i) + for (c = 0; c < 4; ++c) + if (stacks[i * 4 + c].top) + bld_push_value(&stacks[i * 4 + c]); +} + +#define FETCH_TEMP(i, c) (bld->tvs[i][c].top) +#define STORE_TEMP(i, c, v) (bld->tvs[i][c].top = (v)) +#define FETCH_ADDR(i, c) (bld->avs[i][c].top) +#define STORE_ADDR(i, c, v) (bld->avs[i][c].top = (v)) +#define FETCH_PRED(i, c) (bld->pvs[i][c].top) +#define STORE_PRED(i, c, v) (bld->pvs[i][c].top = (v)) +#define FETCH_OUTR(i, c) (bld->ovs[i][c].top) +#define STORE_OUTR(i, c, v) \ + do { \ + bld->ovs[i][c].top = (v); \ + bld->outputs_written[(i) / 8] |= 1 << (((i) * 4 + (c)) % 32); \ + } while (0) + +struct bld_context { + struct nv50_translation_info *ti; + + struct nv_pc *pc; + struct nv_basic_block *b; + + struct tgsi_parse_context parse[BLD_MAX_CALL_NESTING]; + int call_lvl; + + struct nv_basic_block *cond_bb[BLD_MAX_COND_NESTING]; + struct nv_basic_block *join_bb[BLD_MAX_COND_NESTING]; + struct nv_basic_block *else_bb[BLD_MAX_COND_NESTING]; + int cond_lvl; + struct nv_basic_block *loop_bb[BLD_MAX_LOOP_NESTING]; + int loop_lvl; + + struct bld_value_stack tvs[BLD_MAX_TEMPS][4]; /* TGSI_FILE_TEMPORARY */ + struct bld_value_stack avs[BLD_MAX_ADDRS][4]; /* TGSI_FILE_ADDRESS */ + struct bld_value_stack pvs[BLD_MAX_PREDS][4]; /* TGSI_FILE_PREDICATE */ + struct bld_value_stack ovs[PIPE_MAX_SHADER_OUTPUTS][4]; + + uint32_t 
outputs_written[PIPE_MAX_SHADER_OUTPUTS / 32]; + + struct nv_value *frgcrd[4]; + struct nv_value *sysval[4]; + + /* wipe on new BB */ + struct nv_value *saved_addr[4][2]; + struct nv_value *saved_inputs[128]; + struct nv_value *saved_immd[BLD_MAX_IMMDS]; + uint num_immds; +}; + +static INLINE struct nv_value * +bld_def(struct nv_instruction *i, int c, struct nv_value *value) +{ + i->def[c] = value; + value->insn = i; + return value; +} + +static INLINE struct nv_value * +find_by_bb(struct bld_value_stack *stack, struct nv_basic_block *b) +{ + int i; + + if (stack->top && stack->top->insn->bb == b) + return stack->top; + + for (i = stack->size - 1; i >= 0; --i) + if (stack->body[i]->insn->bb == b) + return stack->body[i]; + return NULL; +} + +/* fetch value from stack that was defined in the specified basic block, + * or search for first definitions in all of its predecessors + */ +static void +fetch_by_bb(struct bld_value_stack *stack, + struct nv_value **vals, int *n, + struct nv_basic_block *b) +{ + int i; + struct nv_value *val; + + assert(*n < 16); /* MAX_COND_NESTING */ + + val = find_by_bb(stack, b); + if (val) { + for (i = 0; i < *n; ++i) + if (vals[i] == val) + return; + vals[(*n)++] = val; + return; + } + for (i = 0; i < b->num_in; ++i) + fetch_by_bb(stack, vals, n, b->in[i]); +} + +static struct nv_value * +bld_fetch_global(struct bld_context *bld, struct bld_value_stack *stack) +{ + struct nv_value *vals[16], *phi = NULL; + int j, i = 0, n = 0; + + fetch_by_bb(stack, vals, &n, bld->pc->current_block); + + assert(n); + if (n == 1) + return vals[0]; + + debug_printf("phi required: %i candidates\n", n); + + while (i < n) { + struct nv_instruction *insn = new_instruction(bld->pc, NV_OP_PHI); + + j = phi ? 
1 : 0; + if (phi) + insn->src[0] = new_ref(bld->pc, phi); + + phi = new_value(bld->pc, vals[0]->reg.file, vals[0]->reg.type); + + bld_def(insn, 0, phi); + + for (; j < 4; ++j) { + insn->src[j] = new_ref(bld->pc, vals[i++]); + if (i == n) + break; + } + debug_printf("new phi: %i, %i in\n", phi->n, j); + } + + /* insert_at_head(list, phi) is done at end of block */ + return phi; +} + +static INLINE struct nv_value * +bld_imm_u32(struct bld_context *bld, uint32_t u) +{ + int i; + unsigned n = bld->num_immds; + + debug_printf("bld_imm_u32: 0x%08x\n", u); + + for (i = 0; i < n; ++i) + if (bld->saved_immd[i]->reg.imm.u32 == u) + return bld->saved_immd[i]; + assert(n < BLD_MAX_IMMDS); + + debug_printf("need new one\n"); + + bld->num_immds++; + + bld->saved_immd[n] = new_value(bld->pc, NV_FILE_IMM, NV_TYPE_U32); + bld->saved_immd[n]->reg.imm.u32 = u; + return bld->saved_immd[n]; +} + +static INLINE struct nv_value * +bld_imm_f32(struct bld_context *bld, float f) +{ + return bld_imm_u32(bld, fui(f)); +} + +#define SET_TYPE(v, t) ((v)->reg.type = NV_TYPE_##t) + +static struct nv_value * +bld_insn_1(struct bld_context *bld, uint opcode, struct nv_value *src0) +{ + struct nv_instruction *insn = new_instruction(bld->pc, opcode); + assert(insn); + + nv_reference(bld->pc, &insn->src[0], src0); /* NOTE: new_ref would suffice */ + + return bld_def(insn, 0, new_value(bld->pc, NV_FILE_GPR, src0->reg.type)); +} + +static struct nv_value * +bld_insn_2(struct bld_context *bld, uint opcode, + struct nv_value *src0, struct nv_value *src1) +{ + struct nv_instruction *insn = new_instruction(bld->pc, opcode); + + nv_reference(bld->pc, &insn->src[0], src0); + nv_reference(bld->pc, &insn->src[1], src1); + + return bld_def(insn, 0, new_value(bld->pc, NV_FILE_GPR, src0->reg.type)); +} + +static struct nv_value * +bld_insn_3(struct bld_context *bld, uint opcode, + struct nv_value *src0, struct nv_value *src1, + struct nv_value *src2) +{ + struct nv_instruction *insn = new_instruction(bld->pc, 
opcode); + + nv_reference(bld->pc, &insn->src[0], src0); + nv_reference(bld->pc, &insn->src[1], src1); + nv_reference(bld->pc, &insn->src[2], src2); + + return bld_def(insn, 0, new_value(bld->pc, NV_FILE_GPR, src0->reg.type)); +} + +#define BLD_INSN_1_EX(d, op, dt, s0, s0t) \ + do { \ + (d) = bld_insn_1(bld, (NV_OP_##op), (s0)); \ + (d)->reg.type = NV_TYPE_##dt; \ + (d)->insn->src[0]->typecast = NV_TYPE_##s0t; \ + } while(0) + +static struct nv_value * +bld_pow(struct bld_context *bld, struct nv_value *x, struct nv_value *e) +{ + struct nv_value *val; + + BLD_INSN_1_EX(val, LG2, F32, x, F32); + BLD_INSN_1_EX(val, MUL, F32, e, F32); + val = bld_insn_1(bld, NV_OP_PREEX2, val); + val = bld_insn_1(bld, NV_OP_EX2, val); + + return val; +} + +static INLINE struct nv_value * +bld_load_imm_f32(struct bld_context *bld, float f) +{ + return bld_insn_1(bld, NV_OP_MOV, bld_imm_f32(bld, f)); +} + +static INLINE struct nv_value * +bld_load_imm_u32(struct bld_context *bld, uint32_t u) +{ + return bld_insn_1(bld, NV_OP_MOV, bld_imm_u32(bld, u)); +} + +static struct nv_value * +bld_get_address(struct bld_context *bld, int id, struct nv_value *indirect) +{ + int i; + struct nv_instruction *nvi; + + for (i = 0; i < 4; ++i) { + if (!bld->saved_addr[i][0]) + break; + if (bld->saved_addr[i][1] == indirect) { + nvi = bld->saved_addr[i][0]->insn; + if (nvi->src[0]->value->reg.imm.u32 == id) + return bld->saved_addr[i][0]; + } + } + i &= 3; + + bld->saved_addr[i][0] = bld_load_imm_u32(bld, id); + bld->saved_addr[i][0]->reg.file = NV_FILE_ADDR; + bld->saved_addr[i][1] = indirect; + return bld->saved_addr[i][0]; +} + + +static struct nv_value * +bld_predicate(struct bld_context *bld, struct nv_value *src) +{ + struct nv_instruction *nvi = src->insn; + + if (nvi->opcode == NV_OP_LDA || + nvi->opcode == NV_OP_PHI || + nvi->bb != bld->pc->current_block) { + nvi = new_instruction(bld->pc, NV_OP_CVT); + nv_reference(bld->pc, &nvi->src[0], src); + } + + if (!nvi->flags_def) { + nvi->flags_def = 
new_value(bld->pc, NV_FILE_FLAGS, NV_TYPE_U16); + nvi->flags_def->insn = nvi; + } + return nvi->flags_def; +} + +static void +bld_kil(struct bld_context *bld, struct nv_value *src) +{ + struct nv_instruction *nvi; + + src = bld_predicate(bld, src); + nvi = new_instruction(bld->pc, NV_OP_KIL); + nvi->fixed = 1; + nvi->flags_src = new_ref(bld->pc, src); + nvi->cc = NV_CC_LT; +} + +static void +bld_flow(struct bld_context *bld, uint opcode, ubyte cc, + struct nv_value *src, boolean plan_reconverge) +{ + struct nv_instruction *nvi; + + if (plan_reconverge) + new_instruction(bld->pc, NV_OP_JOINAT)->fixed = 1; + + nvi = new_instruction(bld->pc, opcode); + nvi->is_terminator = 1; + nvi->cc = cc; + nvi->flags_src = new_ref(bld->pc, src); +} + +static ubyte +translate_setcc(unsigned opcode) +{ + switch (opcode) { + case TGSI_OPCODE_SLT: return NV_CC_LT; + case TGSI_OPCODE_SGE: return NV_CC_GE; + case TGSI_OPCODE_SEQ: return NV_CC_EQ; + case TGSI_OPCODE_SGT: return NV_CC_GT; + case TGSI_OPCODE_SLE: return NV_CC_LE; + case TGSI_OPCODE_SNE: return NV_CC_NE | NV_CC_U; + case TGSI_OPCODE_STR: return NV_CC_TR; + case TGSI_OPCODE_SFL: return NV_CC_FL; + + case TGSI_OPCODE_ISLT: return NV_CC_LT; + case TGSI_OPCODE_ISGE: return NV_CC_GE; + case TGSI_OPCODE_USEQ: return NV_CC_EQ; + case TGSI_OPCODE_USGE: return NV_CC_GE; + case TGSI_OPCODE_USLT: return NV_CC_LT; + case TGSI_OPCODE_USNE: return NV_CC_NE; + default: + assert(0); + return NV_CC_FL; + } +} + +static uint +translate_opcode(uint opcode) +{ + switch (opcode) { + case TGSI_OPCODE_ABS: return NV_OP_ABS; + case TGSI_OPCODE_ADD: + case TGSI_OPCODE_SUB: + case TGSI_OPCODE_UADD: return NV_OP_ADD; + case TGSI_OPCODE_AND: return NV_OP_AND; + case TGSI_OPCODE_EX2: return NV_OP_EX2; + case TGSI_OPCODE_CEIL: return NV_OP_CEIL; + case TGSI_OPCODE_FLR: return NV_OP_FLOOR; + case TGSI_OPCODE_TRUNC: return NV_OP_TRUNC; + case TGSI_OPCODE_DDX: return NV_OP_DFDX; + case TGSI_OPCODE_DDY: return NV_OP_DFDY; + case TGSI_OPCODE_F2I: + case 
TGSI_OPCODE_F2U: + case TGSI_OPCODE_I2F: + case TGSI_OPCODE_U2F: return NV_OP_CVT; + case TGSI_OPCODE_INEG: return NV_OP_NEG; + case TGSI_OPCODE_LG2: return NV_OP_LG2; + case TGSI_OPCODE_ISHR: + case TGSI_OPCODE_USHR: return NV_OP_SHR; + case TGSI_OPCODE_MAD: + case TGSI_OPCODE_UMAD: return NV_OP_MAD; + case TGSI_OPCODE_MAX: + case TGSI_OPCODE_IMAX: + case TGSI_OPCODE_UMAX: return NV_OP_MAX; + case TGSI_OPCODE_MIN: + case TGSI_OPCODE_IMIN: + case TGSI_OPCODE_UMIN: return NV_OP_MIN; + case TGSI_OPCODE_MUL: + case TGSI_OPCODE_UMUL: return NV_OP_MUL; + case TGSI_OPCODE_OR: return NV_OP_OR; + case TGSI_OPCODE_RCP: return NV_OP_RCP; + case TGSI_OPCODE_RSQ: return NV_OP_RSQ; + case TGSI_OPCODE_SAD: return NV_OP_SAD; + case TGSI_OPCODE_SHL: return NV_OP_SHL; + case TGSI_OPCODE_SLT: + case TGSI_OPCODE_SGE: + case TGSI_OPCODE_SEQ: + case TGSI_OPCODE_SGT: + case TGSI_OPCODE_SLE: + case TGSI_OPCODE_SNE: + case TGSI_OPCODE_ISLT: + case TGSI_OPCODE_ISGE: + case TGSI_OPCODE_USEQ: + case TGSI_OPCODE_USGE: + case TGSI_OPCODE_USLT: + case TGSI_OPCODE_USNE: return NV_OP_SET; + case TGSI_OPCODE_TEX: return NV_OP_TEX; + case TGSI_OPCODE_TXP: return NV_OP_TEX; + case TGSI_OPCODE_TXB: return NV_OP_TXB; + case TGSI_OPCODE_TXL: return NV_OP_TXL; + case TGSI_OPCODE_XOR: return NV_OP_XOR; + default: + return NV_OP_NOP; + } +} + +static ubyte +infer_src_type(unsigned opcode) +{ + switch (opcode) { + case TGSI_OPCODE_MOV: + case TGSI_OPCODE_AND: + case TGSI_OPCODE_OR: + case TGSI_OPCODE_XOR: + case TGSI_OPCODE_SAD: + case TGSI_OPCODE_U2F: + case TGSI_OPCODE_UADD: + case TGSI_OPCODE_UDIV: + case TGSI_OPCODE_UMOD: + case TGSI_OPCODE_UMAD: + case TGSI_OPCODE_UMUL: + case TGSI_OPCODE_UMAX: + case TGSI_OPCODE_UMIN: + case TGSI_OPCODE_USEQ: + case TGSI_OPCODE_USGE: + case TGSI_OPCODE_USLT: + case TGSI_OPCODE_USNE: + case TGSI_OPCODE_USHR: + return NV_TYPE_U32; + case TGSI_OPCODE_I2F: + case TGSI_OPCODE_IDIV: + case TGSI_OPCODE_IMAX: + case TGSI_OPCODE_IMIN: + case TGSI_OPCODE_INEG: + case 
TGSI_OPCODE_ISGE: + case TGSI_OPCODE_ISHR: + case TGSI_OPCODE_ISLT: + return NV_TYPE_S32; + default: + return NV_TYPE_F32; + } +} + +static ubyte +infer_dst_type(unsigned opcode) +{ + switch (opcode) { + case TGSI_OPCODE_MOV: + case TGSI_OPCODE_F2U: + case TGSI_OPCODE_AND: + case TGSI_OPCODE_OR: + case TGSI_OPCODE_XOR: + case TGSI_OPCODE_SAD: + case TGSI_OPCODE_UADD: + case TGSI_OPCODE_UDIV: + case TGSI_OPCODE_UMOD: + case TGSI_OPCODE_UMAD: + case TGSI_OPCODE_UMUL: + case TGSI_OPCODE_UMAX: + case TGSI_OPCODE_UMIN: + case TGSI_OPCODE_USEQ: + case TGSI_OPCODE_USGE: + case TGSI_OPCODE_USLT: + case TGSI_OPCODE_USNE: + case TGSI_OPCODE_USHR: + return NV_TYPE_U32; + case TGSI_OPCODE_F2I: + case TGSI_OPCODE_IDIV: + case TGSI_OPCODE_IMAX: + case TGSI_OPCODE_IMIN: + case TGSI_OPCODE_INEG: + case TGSI_OPCODE_ISGE: + case TGSI_OPCODE_ISHR: + case TGSI_OPCODE_ISLT: + return NV_TYPE_S32; + default: + return NV_TYPE_F32; + } +} + +static void +emit_store(struct bld_context *bld, const struct tgsi_full_instruction *inst, + unsigned chan, struct nv_value *value) +{ + const struct tgsi_full_dst_register *reg = &inst->Dst[0]; + + assert(chan < 4); + + if (inst->Instruction.Opcode != TGSI_OPCODE_MOV) + value->reg.type = infer_dst_type(inst->Instruction.Opcode); + + switch (inst->Instruction.Saturate) { + case TGSI_SAT_NONE: + break; + case TGSI_SAT_ZERO_ONE: + BLD_INSN_1_EX(value, SAT, F32, value, F32); + break; + case TGSI_SAT_MINUS_PLUS_ONE: + value = bld_insn_2(bld, NV_OP_MAX, value, bld_load_imm_f32(bld, -1.0f)); + value = bld_insn_2(bld, NV_OP_MIN, value, bld_load_imm_f32(bld, 1.0f)); + value->reg.type = NV_TYPE_F32; + break; + } + + switch (reg->Register.File) { + case TGSI_FILE_OUTPUT: + value = bld_insn_1(bld, NV_OP_MOV, value); + value->reg.file = bld->ti->output_file; + + if (bld->ti->p->type == PIPE_SHADER_FRAGMENT) { + STORE_OUTR(reg->Register.Index, chan, value); + } else { + value->insn->fixed = 1; + value->reg.id = bld->ti->output_map[reg->Register.Index][chan]; + } + 
break; + case TGSI_FILE_TEMPORARY: + assert(reg->Register.Index < BLD_MAX_TEMPS); + value->reg.file = NV_FILE_GPR; + if (value->insn->bb != bld->pc->current_block) + value = bld_insn_1(bld, NV_OP_MOV, value); + STORE_TEMP(reg->Register.Index, chan, value); + break; + case TGSI_FILE_ADDRESS: + assert(reg->Register.Index < BLD_MAX_ADDRS); + value->reg.file = NV_FILE_ADDR; + STORE_ADDR(reg->Register.Index, chan, value); + break; + } +} + +static INLINE uint32_t +bld_is_output_written(struct bld_context *bld, int i, int c) +{ + if (c < 0) + return bld->outputs_written[i / 8] & (0xf << ((i * 4) % 32)); + return bld->outputs_written[i / 8] & (1 << ((i * 4 + c) % 32)); +} + +static void +bld_export_outputs(struct bld_context *bld) +{ + struct nv_value *vals[4]; + struct nv_instruction *nvi; + int i, c, n; + + bld_push_values(&bld->ovs[0][0], PIPE_MAX_SHADER_OUTPUTS); + + for (i = 0; i < PIPE_MAX_SHADER_OUTPUTS; ++i) { + if (!bld_is_output_written(bld, i, -1)) + continue; + for (n = 0, c = 0; c < 4; ++c) { + if (!bld_is_output_written(bld, i, c)) + continue; + vals[n] = bld_fetch_global(bld, &bld->ovs[i][c]); + vals[n] = bld_insn_1(bld, NV_OP_MOV, vals[n]); + vals[n++]->reg.id = bld->ti->output_map[i][c]; + } + assert(n); + + (nvi = new_instruction(bld->pc, NV_OP_EXPORT))->fixed = 1; + + for (c = 0; c < n; ++c) + nvi->src[c] = new_ref(bld->pc, vals[c]); + } +} + +static void +bld_new_block(struct bld_context *bld, struct nv_basic_block *b) +{ + int i; + + bld_push_values(&bld->tvs[0][0], BLD_MAX_TEMPS); + bld_push_values(&bld->avs[0][0], BLD_MAX_ADDRS); + bld_push_values(&bld->pvs[0][0], BLD_MAX_PREDS); + bld_push_values(&bld->ovs[0][0], PIPE_MAX_SHADER_OUTPUTS); + + bld->pc->current_block = b; + + for (i = 0; i < 4; ++i) + bld->saved_addr[i][0] = NULL; +} + +static struct nv_value * +bld_saved_input(struct bld_context *bld, unsigned i, unsigned c) +{ + unsigned idx = bld->ti->input_map[i][c]; + + if (bld->ti->p->type != PIPE_SHADER_FRAGMENT) + return NULL; + if 
(bld->saved_inputs[idx]) + return bld->saved_inputs[idx]; + return NULL; +} + +static struct nv_value * +bld_interpolate(struct bld_context *bld, unsigned mode, struct nv_value *val) +{ + if (mode & (NV50_INTERP_LINEAR | NV50_INTERP_FLAT)) + val = bld_insn_1(bld, NV_OP_LINTERP, val); + else + val = bld_insn_2(bld, NV_OP_PINTERP, val, bld->frgcrd[3]); + + val->insn->flat = (mode & NV50_INTERP_FLAT) ? 1 : 0; + val->insn->centroid = (mode & NV50_INTERP_CENTROID) ? 1 : 0; + return val; +} + +static struct nv_value * +emit_fetch(struct bld_context *bld, const struct tgsi_full_instruction *insn, + const unsigned s, const unsigned chan) +{ + const struct tgsi_full_src_register *src = &insn->Src[s]; + struct nv_value *res; + unsigned idx, swz, dim_idx, ind_idx, ind_swz; + ubyte type = infer_src_type(insn->Instruction.Opcode); + + idx = src->Register.Index; + swz = tgsi_util_get_full_src_register_swizzle(src, chan); + dim_idx = -1; + ind_idx = -1; + ind_swz = 0; + + if (src->Register.Indirect) { + ind_idx = src->Indirect.Index; + ind_swz = tgsi_util_get_src_register_swizzle(&src->Indirect, 0); + } + + switch (src->Register.File) { + case TGSI_FILE_CONSTANT: + dim_idx = src->Dimension.Index ? 
src->Dimension.Index + 2 : 1; + assert(dim_idx < 14); + assert(dim_idx == 1); /* for now */ + + res = new_value(bld->pc, NV_FILE_MEM_C(dim_idx), type); + res->reg.type = type; + res->reg.id = (idx * 4 + swz) & 127; + res = bld_insn_1(bld, NV_OP_LDA, res); + + if (src->Register.Indirect) + res->insn->src[4] = new_ref(bld->pc, FETCH_ADDR(ind_idx, ind_swz)); + if (idx >= (128 / 4)) + res->insn->src[4] = + new_ref(bld->pc, bld_get_address(bld, (idx * 16) & ~0x1ff, NULL)); + break; + case TGSI_FILE_IMMEDIATE: + assert(idx < bld->ti->immd32_nr); + res = bld_load_imm_u32(bld, bld->ti->immd32[idx * 4 + swz]); + res->reg.type = type; + break; + case TGSI_FILE_INPUT: + res = bld_saved_input(bld, idx, swz); + if (res && (insn->Instruction.Opcode != TGSI_OPCODE_TXP)) + return res; + + res = new_value(bld->pc, bld->ti->input_file, type); + res->reg.id = bld->ti->input_map[idx][swz]; + + if (res->reg.file == NV_FILE_MEM_V) { + res = bld_interpolate(bld, bld->ti->interp_mode[idx], res); + } else { + assert(src->Dimension.Dimension == 0); + res = bld_insn_1(bld, NV_OP_LDA, res); + } + assert(res->reg.type == type); + + bld->saved_inputs[bld->ti->input_map[idx][swz]] = res; + break; + case TGSI_FILE_TEMPORARY: + /* this should be load from l[], with reload elimination later on */ + res = bld_fetch_global(bld, &bld->tvs[idx][swz]); + break; + case TGSI_FILE_ADDRESS: + res = bld_fetch_global(bld, &bld->avs[idx][swz]); + break; + case TGSI_FILE_PREDICATE: + res = bld_fetch_global(bld, &bld->pvs[idx][swz]); + break; + default: + NOUVEAU_ERR("illegal/unhandled src reg file: %d\n", src->Register.File); + abort(); + break; + } + + switch (tgsi_util_get_full_src_register_sign_mode(src, chan)) { + case TGSI_UTIL_SIGN_KEEP: + break; + case TGSI_UTIL_SIGN_CLEAR: + res = bld_insn_1(bld, NV_OP_ABS, res); + break; + case TGSI_UTIL_SIGN_TOGGLE: + res = bld_insn_1(bld, NV_OP_NEG, res); + break; + case TGSI_UTIL_SIGN_SET: + res = bld_insn_1(bld, NV_OP_ABS, res); + res = bld_insn_1(bld, NV_OP_NEG, 
res); + break; + default: + NOUVEAU_ERR("illegal/unhandled src reg sign mode\n"); + abort(); + break; + } + + return res; +} + +static void +bld_lit(struct bld_context *bld, struct nv_value *dst0[4], + const struct tgsi_full_instruction *insn) +{ + struct nv_value *val0, *zero; + unsigned mask = insn->Dst[0].Register.WriteMask; + + if (mask & ((1 << 0) | (1 << 3))) + dst0[3] = dst0[0] = bld_load_imm_f32(bld, 1.0f); + + if (mask & (3 << 1)) { + zero = bld_load_imm_f32(bld, 0.0f); + val0 = bld_insn_2(bld, NV_OP_MAX, emit_fetch(bld, insn, 0, 0), zero); + + if (mask & (1 << 1)) + dst0[1] = val0; + } + + if (mask & (1 << 2)) { + struct nv_value *val1, *val3, *src1, *src3; + struct nv_value *pos128 = bld_load_imm_f32(bld, 127.999999f); + struct nv_value *neg128 = bld_load_imm_f32(bld, -127.999999f); + + src1 = emit_fetch(bld, insn, 0, 1); + src3 = emit_fetch(bld, insn, 0, 3); + + val0->insn->flags_def = new_value(bld->pc, NV_FILE_FLAGS, NV_TYPE_U16); + val0->insn->flags_def->insn = val0->insn; + + val1 = bld_insn_2(bld, NV_OP_MAX, src1, zero); + val3 = bld_insn_2(bld, NV_OP_MAX, src3, neg128); + val3 = bld_insn_2(bld, NV_OP_MIN, val3, pos128); + val3 = bld_pow(bld, val1, val3); + + dst0[2] = bld_insn_1(bld, NV_OP_MOV, zero); + dst0[2]->insn->cc = NV_CC_LE; + dst0[2]->insn->flags_src = new_ref(bld->pc, val0->insn->flags_def); + + dst0[2] = bld_insn_2(bld, NV_OP_SELECT, val3, dst0[2]); + } +} + +static INLINE void +get_tex_dim(const struct tgsi_full_instruction *insn, int *dim, int *arg) +{ + switch (insn->Texture.Texture) { + case TGSI_TEXTURE_1D: + *arg = *dim = 1; + break; + case TGSI_TEXTURE_SHADOW1D: + *dim = 1; + *arg = 2; + break; + case TGSI_TEXTURE_UNKNOWN: + case TGSI_TEXTURE_2D: + case TGSI_TEXTURE_RECT: + *arg = *dim = 2; + break; + case TGSI_TEXTURE_SHADOW2D: + case TGSI_TEXTURE_SHADOWRECT: + *dim = 2; + *arg = 3; + break; + case TGSI_TEXTURE_3D: + case TGSI_TEXTURE_CUBE: + *dim = *arg = 3; + break; + default: + assert(0); + break; + } +} + +static void 
+load_proj_tex_coords(struct bld_context *bld, + struct nv_value *t[4], int dim, + const struct tgsi_full_instruction *insn) +{ + int c, mask = 0; + + t[3] = emit_fetch(bld, insn, 0, 3); + + if (t[3]->insn->opcode == NV_OP_PINTERP) { + t[3]->insn->opcode = NV_OP_LINTERP; + nv_reference(bld->pc, &t[3]->insn->src[1], NULL); + } + + t[3] = bld_insn_1(bld, NV_OP_RCP, t[3]); + + for (c = 0; c < dim; ++c) { + t[c] = emit_fetch(bld, insn, 0, c); + if (t[c]->insn->opcode == NV_OP_LINTERP) + t[c]->insn->opcode = NV_OP_PINTERP; + + if (t[c]->insn->opcode == NV_OP_PINTERP) + nv_reference(bld->pc, &t[c]->insn->src[1], t[3]); + else + mask |= 1 << c; + } + + for (c = 0; mask; ++c, mask >>= 1) { + if (!(mask & 1)) + continue; + t[c] = bld_insn_2(bld, NV_OP_MUL, t[c], t[3]); + } +} + +static void +bld_tex(struct bld_context *bld, struct nv_value *dst0[4], + const struct tgsi_full_instruction *insn) +{ + struct nv_value *t[4]; + struct nv_instruction *nvi; + uint opcode = translate_opcode(insn->Instruction.Opcode); + int arg, dim, c; + + get_tex_dim(insn, &dim, &arg); + + if (insn->Texture.Texture == TGSI_TEXTURE_CUBE) { + } + // else + if (insn->Instruction.Opcode == TGSI_OPCODE_TXP) { + load_proj_tex_coords(bld, t, dim, insn); + } else + for (c = 0; c < dim; ++c) + t[c] = emit_fetch(bld, insn, 0, c); + + if (arg != dim) + t[dim] = emit_fetch(bld, insn, 0, 2); + + if (insn->Instruction.Opcode == TGSI_OPCODE_TXB || + insn->Instruction.Opcode == TGSI_OPCODE_TXL) { + t[arg++] = emit_fetch(bld, insn, 0, 3); + } + + for (c = 0; c < arg; ++c) { + t[c] = bld_insn_1(bld, NV_OP_MOV, t[c]); + t[c]->reg.type = NV_TYPE_F32; + } + + nvi = new_instruction(bld->pc, opcode); + + for (c = 0; c < 4; ++c) { + nvi->def[c] = dst0[c] = new_value(bld->pc, NV_FILE_GPR, NV_TYPE_F32); + nvi->def[c]->insn = nvi; + } + for (c = 0; c < arg; ++c) + nvi->src[c] = new_ref(bld->pc, t[c]); + + nvi->tex_t = insn->Src[1].Register.Index; + nvi->tex_s = 0; + nvi->tex_mask = 0xf; + nvi->tex_cube = 
(insn->Texture.Texture == TGSI_TEXTURE_CUBE) ? 1 : 0; + nvi->tex_live = 0; + nvi->tex_argc = arg; +} + +#define FOR_EACH_DST0_ENABLED_CHANNEL(chan, inst) \ + for (chan = 0; chan < 4; ++chan) \ + if ((inst)->Dst[0].Register.WriteMask & (1 << chan)) + +static void +bld_instruction(struct bld_context *bld, + const struct tgsi_full_instruction *insn) +{ + struct nv_value *src0; + struct nv_value *src1; + struct nv_value *src2; + struct nv_value *dst0[4]; + struct nv_value *temp; + int c; + uint opcode = translate_opcode(insn->Instruction.Opcode); + + tgsi_dump_instruction(insn, 1); + + switch (insn->Instruction.Opcode) { + case TGSI_OPCODE_ADD: + case TGSI_OPCODE_MAX: + case TGSI_OPCODE_MIN: + case TGSI_OPCODE_MUL: + FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) { + src0 = emit_fetch(bld, insn, 0, c); + src1 = emit_fetch(bld, insn, 1, c); + dst0[c] = bld_insn_2(bld, opcode, src0, src1); + } + break; + case TGSI_OPCODE_CMP: + FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) { + src0 = emit_fetch(bld, insn, 0, c); + src1 = emit_fetch(bld, insn, 1, c); + src2 = emit_fetch(bld, insn, 2, c); + src0 = bld_predicate(bld, src0); + + src1 = bld_insn_1(bld, NV_OP_MOV, src1); + src1->insn->flags_src = new_ref(bld->pc, src0); + src1->insn->cc = NV_CC_LT; + + src2 = bld_insn_1(bld, NV_OP_MOV, src2); + src2->insn->flags_src = new_ref(bld->pc, src0); + src2->insn->cc = NV_CC_GE; + + dst0[c] = bld_insn_2(bld, NV_OP_SELECT, src1, src2); + } + break; + case TGSI_OPCODE_COS: + src0 = emit_fetch(bld, insn, 0, 0); + temp = bld_insn_1(bld, NV_OP_PRESIN, src0); + if (insn->Dst[0].Register.WriteMask & 7) + temp = bld_insn_1(bld, NV_OP_COS, temp); + for (c = 0; c < 3; ++c) + if (insn->Dst[0].Register.WriteMask & (1 << c)) + dst0[c] = temp; + if (!(insn->Dst[0].Register.WriteMask & (1 << 3))) + break; + /* XXX: if src0.x is src0.w, don't emit new insns */ + src0 = emit_fetch(bld, insn, 0, 3); + temp = bld_insn_1(bld, NV_OP_PRESIN, src0); + dst0[3] = bld_insn_1(bld, NV_OP_COS, temp); + break; + case 
TGSI_OPCODE_DP3: + src0 = emit_fetch(bld, insn, 0, 0); + src1 = emit_fetch(bld, insn, 1, 0); + temp = bld_insn_2(bld, NV_OP_MUL, src0, src1); + for (c = 1; c < 3; ++c) { + src0 = emit_fetch(bld, insn, 0, c); + src1 = emit_fetch(bld, insn, 1, c); + temp = bld_insn_3(bld, NV_OP_MAD, src0, src1, temp); + } + FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) + dst0[c] = temp; + break; + case TGSI_OPCODE_DP4: + src0 = emit_fetch(bld, insn, 0, 0); + src1 = emit_fetch(bld, insn, 1, 0); + temp = bld_insn_2(bld, NV_OP_MUL, src0, src1); + for (c = 1; c < 4; ++c) { + src0 = emit_fetch(bld, insn, 0, c); + src1 = emit_fetch(bld, insn, 1, c); + temp = bld_insn_3(bld, NV_OP_MAD, src0, src1, temp); + } + FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) + dst0[c] = temp; + break; + case TGSI_OPCODE_EX2: + src0 = emit_fetch(bld, insn, 0, 0); + temp = bld_insn_1(bld, NV_OP_PREEX2, src0); + temp = bld_insn_1(bld, NV_OP_EX2, temp); + FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) + dst0[c] = temp; + break; + case TGSI_OPCODE_FRC: + FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) { + src0 = emit_fetch(bld, insn, 0, c); + dst0[c] = bld_insn_1(bld, NV_OP_FLOOR, src0); + dst0[c] = bld_insn_2(bld, NV_OP_SUB, src0, dst0[c]); + } + break; + case TGSI_OPCODE_KIL: + for (c = 0; c < 4; ++c) { + src0 = emit_fetch(bld, insn, 0, c); + bld_kil(bld, src0); + } + break; + case TGSI_OPCODE_IF: + { + struct nv_basic_block *b = new_basic_block(bld->pc); + + nvbb_attach_block(bld->pc->current_block, b); + + bld->join_bb[bld->cond_lvl] = bld->pc->current_block; + bld->cond_bb[bld->cond_lvl] = bld->pc->current_block; + + src1 = bld_predicate(bld, emit_fetch(bld, insn, 0, 0)); + + bld_flow(bld, NV_OP_BRA, NV_CC_EQ, src1, FALSE); + + ++bld->cond_lvl; + bld_new_block(bld, b); + } + break; + case TGSI_OPCODE_ELSE: + { + struct nv_basic_block *b = new_basic_block(bld->pc); + + --bld->cond_lvl; + nvbb_attach_block(bld->join_bb[bld->cond_lvl], b); + + bld->cond_bb[bld->cond_lvl]->exit->target = b; + bld->cond_bb[bld->cond_lvl] = 
bld->pc->current_block; + + new_instruction(bld->pc, NV_OP_BRA)->is_terminator = 1; + + ++bld->cond_lvl; + bld_new_block(bld, b); + } + break; + case TGSI_OPCODE_ENDIF: /* XXX: deal with ENDIF; ENDIF; */ + { + struct nv_basic_block *b = new_basic_block(bld->pc); + + --bld->cond_lvl; + nvbb_attach_block(bld->pc->current_block, b); + nvbb_attach_block(bld->cond_bb[bld->cond_lvl], b); + + bld->cond_bb[bld->cond_lvl]->exit->target = b; + + if (0 && bld->join_bb[bld->cond_lvl]) { + bld->join_bb[bld->cond_lvl]->exit->prev->target = b; + + new_instruction(bld->pc, NV_OP_NOP)->is_join = TRUE; + } + + bld_new_block(bld, b); + } + break; + case TGSI_OPCODE_BGNLOOP: + assert(0); + break; + case TGSI_OPCODE_BRK: + assert(0); + break; + case TGSI_OPCODE_CONT: + assert(0); + break; + case TGSI_OPCODE_ENDLOOP: + assert(0); + break; + case TGSI_OPCODE_ABS: + case TGSI_OPCODE_CEIL: + case TGSI_OPCODE_FLR: + case TGSI_OPCODE_TRUNC: + case TGSI_OPCODE_DDX: + case TGSI_OPCODE_DDY: + FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) { + src0 = emit_fetch(bld, insn, 0, c); + dst0[c] = bld_insn_1(bld, opcode, src0); + } + break; + case TGSI_OPCODE_LIT: + bld_lit(bld, dst0, insn); + break; + case TGSI_OPCODE_LRP: + FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) { + src0 = emit_fetch(bld, insn, 0, c); + src1 = emit_fetch(bld, insn, 1, c); + src2 = emit_fetch(bld, insn, 2, c); + dst0[c] = bld_insn_2(bld, NV_OP_SUB, src1, src2); + dst0[c] = bld_insn_3(bld, NV_OP_MAD, dst0[c], src0, src2); + } + break; + case TGSI_OPCODE_MOV: + FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) + dst0[c] = emit_fetch(bld, insn, 0, c); + break; + case TGSI_OPCODE_MAD: + FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) { + src0 = emit_fetch(bld, insn, 0, c); + src1 = emit_fetch(bld, insn, 1, c); + src2 = emit_fetch(bld, insn, 2, c); + dst0[c] = bld_insn_3(bld, opcode, src0, src1, src2); + } + break; + case TGSI_OPCODE_POW: + src0 = emit_fetch(bld, insn, 0, 0); + src1 = emit_fetch(bld, insn, 1, 0); + temp = bld_pow(bld, src0, src1); + 
FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) + dst0[c] = temp; + break; + case TGSI_OPCODE_RCP: + case TGSI_OPCODE_LG2: + src0 = emit_fetch(bld, insn, 0, 0); + temp = bld_insn_1(bld, opcode, src0); + FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) + dst0[c] = temp; + break; + case TGSI_OPCODE_RSQ: + src0 = emit_fetch(bld, insn, 0, 0); + temp = bld_insn_1(bld, NV_OP_ABS, src0); + temp = bld_insn_1(bld, NV_OP_RSQ, temp); + FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) + dst0[c] = temp; + break; + case TGSI_OPCODE_SLT: + case TGSI_OPCODE_SGE: + case TGSI_OPCODE_SEQ: + case TGSI_OPCODE_SGT: + case TGSI_OPCODE_SLE: + case TGSI_OPCODE_SNE: + case TGSI_OPCODE_ISLT: + case TGSI_OPCODE_ISGE: + case TGSI_OPCODE_USEQ: + case TGSI_OPCODE_USGE: + case TGSI_OPCODE_USLT: + case TGSI_OPCODE_USNE: + FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) { + src0 = emit_fetch(bld, insn, 0, c); + src1 = emit_fetch(bld, insn, 1, c); + dst0[c] = bld_insn_2(bld, NV_OP_SET, src0, src1); + dst0[c]->insn->set_cond = translate_setcc(insn->Instruction.Opcode); + dst0[c]->reg.type = infer_dst_type(insn->Instruction.Opcode); + + if (dst0[c]->reg.type != NV_TYPE_F32) + break; + dst0[c] = bld_insn_1(bld, NV_OP_ABS, dst0[c]); + dst0[c]->insn->src[0]->typecast = NV_TYPE_S32; + dst0[c]->reg.type = NV_TYPE_S32; + dst0[c] = bld_insn_1(bld, NV_OP_CVT, dst0[c]); + dst0[c]->reg.type = NV_TYPE_F32; + } + break; + case TGSI_OPCODE_SUB: + FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) { + src0 = emit_fetch(bld, insn, 0, c); + src1 = emit_fetch(bld, insn, 1, c); + dst0[c] = bld_insn_2(bld, NV_OP_ADD, src0, src1); + dst0[c]->insn->src[1]->mod ^= NV_MOD_NEG; + } + break; + case TGSI_OPCODE_TEX: + case TGSI_OPCODE_TXB: + case TGSI_OPCODE_TXL: + case TGSI_OPCODE_TXP: + bld_tex(bld, dst0, insn); + break; + case TGSI_OPCODE_XPD: + FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) { + if (c == 3) { + dst0[3] = bld_imm_f32(bld, 1.0f); + break; + } + src0 = emit_fetch(bld, insn, 0, (c + 1) % 3); + src1 = emit_fetch(bld, insn, 1, (c + 2) % 3); + dst0[c] = bld_insn_2(bld, 
NV_OP_MUL, src0, src1); + + src0 = emit_fetch(bld, insn, 0, (c + 2) % 3); + src1 = emit_fetch(bld, insn, 1, (c + 1) % 3); + dst0[c] = bld_insn_3(bld, NV_OP_MAD, src0, src1, dst0[c]); + + dst0[c]->insn->src[2]->mod ^= NV_MOD_NEG; + } + break; + case TGSI_OPCODE_END: + if (bld->ti->p->type == PIPE_SHADER_FRAGMENT) + bld_export_outputs(bld); + break; + default: + NOUVEAU_ERR("nv_bld: unhandled opcode %u\n", insn->Instruction.Opcode); + abort(); + break; + } + + FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) + emit_store(bld, insn, c, dst0[c]); +} + +int +nv50_tgsi_to_nc(struct nv_pc *pc, struct nv50_translation_info *ti) +{ + struct bld_context *bld = CALLOC_STRUCT(bld_context); + int c; + + pc->root = pc->current_block = new_basic_block(pc); + + bld->pc = pc; + bld->ti = ti; + + pc->loop_nesting_bound = 1; /* XXX: should work with 0 */ + + c = util_bitcount(bld->ti->p->fp.interp >> 24); + if (c && ti->p->type == PIPE_SHADER_FRAGMENT) { + bld->frgcrd[3] = new_value(pc, NV_FILE_MEM_V, NV_TYPE_F32); + bld->frgcrd[3]->reg.id = c - 1; + bld->frgcrd[3] = bld_insn_1(bld, NV_OP_LINTERP, bld->frgcrd[3]); + bld->frgcrd[3] = bld_insn_1(bld, NV_OP_RCP, bld->frgcrd[3]); + } + + tgsi_parse_init(&bld->parse[0], ti->p->pipe.tokens); + + while (!tgsi_parse_end_of_tokens(&bld->parse[bld->call_lvl])) { + const union tgsi_full_token *tok = &bld->parse[bld->call_lvl].FullToken; + + tgsi_parse_token(&bld->parse[bld->call_lvl]); + + switch (tok->Token.Type) { + case TGSI_TOKEN_TYPE_INSTRUCTION: + bld_instruction(bld, &tok->FullInstruction); + break; + default: + break; + } + } + + FREE(bld); + return 0; +} + +#if 0 +/* If a variable is assigned in a loop, replace all references to the value + * from outside the loop with a phi value. 
+ */ +static void +bld_adjust_nv_refs(struct nv_pc *pc, struct nv_basic_block *b, + struct nv_value *old_val, + struct nv_value *new_val) +{ + struct nv_instruction *nvi; + + for (nvi = b->entry; nvi; nvi = nvi->next) { + int s; + for (s = 0; s < 5; ++s) { + if (!nvi->src[s]) + continue; + if (nvi->src[s]->value == old_val) + nv_reference(pc, &nvi->src[s], new_val); + } + if (nvi->flags_src && nvi->flags_src->value == old_val) + nv_reference(pc, &nvi->flags_src, new_val); + } + b->pass_seq = pc->pass_seq; + + if (b->out[0] && b->out[0]->pass_seq < pc->pass_seq) + bld_adjust_nv_refs(pc, b, old_val, new_val); + + if (b->out[1] && b->out[1]->pass_seq < pc->pass_seq) + bld_adjust_nv_refs(pc, b, old_val, new_val); +} +#endif diff --git a/src/gallium/drivers/nv50/nv50_vbo.c b/src/gallium/drivers/nv50/nv50_vbo.c index 864cb09352..6bd52884b5 100644 --- a/src/gallium/drivers/nv50/nv50_vbo.c +++ b/src/gallium/drivers/nv50/nv50_vbo.c @@ -519,7 +519,7 @@ nv50_vbo_static_attrib(struct nv50_context *nv50, unsigned attrib, so_data (so, fui(v[1])); break; case 1: - if (attrib == nv50->vertprog->cfg.edgeflag_in) { + if (attrib == nv50->vertprog->vp.edgeflag) { so_method(so, tesla, NV50TCL_EDGEFLAG_ENABLE, 1); so_data (so, v[0] ? 1 : 0); } @@ -560,7 +560,7 @@ nv50_vbo_validate(struct nv50_context *nv50) nv50->vbo_fifo = 0; if (nv50->screen->force_push || - nv50->vertprog->cfg.edgeflag_in < 16) + nv50->vertprog->vp.edgeflag < 16) nv50->vbo_fifo = 0xffff; for (i = 0; i < nv50->vtxbuf_nr; i++) { -- cgit v1.2.3 From f3af1201c578443dd0f72e73470dd1763888a41d Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Sat, 24 Jul 2010 12:49:15 +0200 Subject: nouveau: update nouveau_class.h Adds nvc0, new vertex formats, and dual source blending values. 
--- src/gallium/drivers/nouveau/nouveau_class.h | 1171 +++++++++++++++++++++++++-- src/gallium/drivers/nv50/nv50_state.c | 55 +- src/gallium/drivers/nv50/nv50_vbo.c | 17 +- 3 files changed, 1159 insertions(+), 84 deletions(-) (limited to 'src') diff --git a/src/gallium/drivers/nouveau/nouveau_class.h b/src/gallium/drivers/nouveau/nouveau_class.h index adfdd37b1b..975fd8f35a 100644 --- a/src/gallium/drivers/nouveau/nouveau_class.h +++ b/src/gallium/drivers/nouveau/nouveau_class.h @@ -735,6 +735,45 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #define NV50_MEMORY_TO_MEMORY_FORMAT_OFFSET_OUT_HIGH 0x0000023c +#define NVC0_MEMORY_TO_MEMORY_FORMAT 0x00009039 + +#define NVC0_MEMORY_TO_MEMORY_FORMAT_NOP 0x00000100 +#define NVC0_MEMORY_TO_MEMORY_FORMAT_SERIALIZE 0x00000110 +#define NVC0_MEMORY_TO_MEMORY_FORMAT_TILING_MODE_IN 0x00000204 +#define NVC0_MEMORY_TO_MEMORY_FORMAT_TILING_PITCH_IN 0x00000208 +#define NVC0_MEMORY_TO_MEMORY_FORMAT_TILING_HEIGHT_IN 0x0000020c +#define NVC0_MEMORY_TO_MEMORY_FORMAT_TILING_DEPTH_IN 0x00000210 +#define NVC0_MEMORY_TO_MEMORY_FORMAT_TILING_POSITION_IN_Z 0x00000214 +#define NVC0_MEMORY_TO_MEMORY_FORMAT_TILING_MODE_OUT 0x00000220 +#define NVC0_MEMORY_TO_MEMORY_FORMAT_TILING_PITCH_OUT 0x00000224 +#define NVC0_MEMORY_TO_MEMORY_FORMAT_TILING_HEIGHT_OUT 0x00000228 +#define NVC0_MEMORY_TO_MEMORY_FORMAT_TILING_DEPTH_OUT 0x0000022c +#define NVC0_MEMORY_TO_MEMORY_FORMAT_TILING_POSITION_OUT_Z 0x00000230 +#define NVC0_MEMORY_TO_MEMORY_FORMAT_OFFSET_OUT_HIGH 0x00000238 +#define NVC0_MEMORY_TO_MEMORY_FORMAT_OFFSET_OUT_LOW 0x0000023c +#define NVC0_MEMORY_TO_MEMORY_FORMAT_EXEC 0x00000300 +#define NVC0_MEMORY_TO_MEMORY_FORMAT_EXEC_PUSH (1 << 0) +#define NVC0_MEMORY_TO_MEMORY_FORMAT_EXEC_LINEAR_IN (1 << 4) +#define NVC0_MEMORY_TO_MEMORY_FORMAT_EXEC_LINEAR_OUT (1 << 8) +#define NVC0_MEMORY_TO_MEMORY_FORMAT_EXEC_NOTIFY (1 << 13) +#define NVC0_MEMORY_TO_MEMORY_FORMAT_EXEC_INC_SHIFT 20 +#define NVC0_MEMORY_TO_MEMORY_FORMAT_EXEC_INC_MASK 
0x00f00000 +#define NVC0_MEMORY_TO_MEMORY_FORMAT_DATA 0x00000304 +#define NVC0_MEMORY_TO_MEMORY_FORMAT_OFFSET_IN_HIGH 0x0000030c +#define NVC0_MEMORY_TO_MEMORY_FORMAT_OFFSET_IN_LOW 0x00000310 +#define NVC0_MEMORY_TO_MEMORY_FORMAT_PITCH_IN 0x00000314 +#define NVC0_MEMORY_TO_MEMORY_FORMAT_PITCH_OUT 0x00000318 +#define NVC0_MEMORY_TO_MEMORY_FORMAT_LINE_LENGTH_IN 0x0000031c +#define NVC0_MEMORY_TO_MEMORY_FORMAT_LINE_COUNT 0x00000320 +#define NVC0_MEMORY_TO_MEMORY_FORMAT_NOTIFY_ADDRESS_HIGH 0x0000032c +#define NVC0_MEMORY_TO_MEMORY_FORMAT_NOTIFY_ADDRESS_LOW 0x00000330 +#define NVC0_MEMORY_TO_MEMORY_FORMAT_NOTIFY 0x00000334 +#define NVC0_MEMORY_TO_MEMORY_FORMAT_TILING_POSITION_IN_X 0x00000344 +#define NVC0_MEMORY_TO_MEMORY_FORMAT_TILING_POSITION_IN_Y 0x00000348 +#define NVC0_MEMORY_TO_MEMORY_FORMAT_TILING_POSITION_OUT_X 0x0000034c +#define NVC0_MEMORY_TO_MEMORY_FORMAT_TILING_POSITION_OUT_Y 0x00000350 + + #define NV01_MEMORY_LOCAL_BANKED 0x0000003d @@ -4507,6 +4546,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #define NV20TCL_VTXFMT_TYPE_SHIFT 0 #define NV20TCL_VTXFMT_TYPE_MASK 0x0000000f #define NV20TCL_VTXFMT_TYPE_FLOAT 0x00000002 +#define NV20TCL_VTXFMT_TYPE_HALF 0x00000003 #define NV20TCL_VTXFMT_TYPE_UBYTE 0x00000004 #define NV20TCL_VTXFMT_TYPE_USHORT 0x00000005 #define NV20TCL_VTXFMT_SIZE_SHIFT 4 @@ -6990,6 +7030,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #define NV40TCL_VTXFMT_TYPE_SHIFT 0 #define NV40TCL_VTXFMT_TYPE_MASK 0x0000000f #define NV40TCL_VTXFMT_TYPE_FLOAT 0x00000002 +#define NV40TCL_VTXFMT_TYPE_HALF 0x00000003 #define NV40TCL_VTXFMT_TYPE_UBYTE 0x00000004 #define NV40TCL_VTXFMT_TYPE_USHORT 0x00000005 #define NV40TCL_VTXFMT_SIZE_SHIFT 4 @@ -7699,7 +7740,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
#define NV50TCL_DMA_TIC 0x000001a0 #define NV50TCL_DMA_TEXTURE 0x000001a4 #define NV50TCL_DMA_STRMOUT 0x000001a8 -#define NV50TCL_DMA_UNK01AC 0x000001ac +#define NV50TCL_DMA_CLIPID 0x000001ac #define NV50TCL_DMA_COLOR(x) (0x000001c0+((x)*4)) #define NV50TCL_DMA_COLOR__SIZE 0x00000008 #define NV50TCL_RT_ADDRESS_HIGH(x) (0x00000200+((x)*32)) @@ -7916,8 +7957,20 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #define NV50TCL_DEPTH_RANGE_FAR__SIZE 0x00000010 #define NV50TCL_VIEWPORT_CLIP_HORIZ(x) (0x00000d00+((x)*8)) #define NV50TCL_VIEWPORT_CLIP_HORIZ__SIZE 0x00000008 +#define NV50TCL_VIEWPORT_CLIP_HORIZ_MIN_SHIFT 0 +#define NV50TCL_VIEWPORT_CLIP_HORIZ_MIN_MASK 0x0000ffff +#define NV50TCL_VIEWPORT_CLIP_HORIZ_MAX_SHIFT 16 +#define NV50TCL_VIEWPORT_CLIP_HORIZ_MAX_MASK 0xffff0000 #define NV50TCL_VIEWPORT_CLIP_VERT(x) (0x00000d04+((x)*8)) #define NV50TCL_VIEWPORT_CLIP_VERT__SIZE 0x00000008 +#define NV50TCL_VIEWPORT_CLIP_VERT_MIN_SHIFT 0 +#define NV50TCL_VIEWPORT_CLIP_VERT_MIN_MASK 0x0000ffff +#define NV50TCL_VIEWPORT_CLIP_VERT_MAX_SHIFT 16 +#define NV50TCL_VIEWPORT_CLIP_VERT_MAX_MASK 0xffff0000 +#define NV50TCL_CLIPID_REGION_HORIZ(x) (0x00000d40+((x)*8)) +#define NV50TCL_CLIPID_REGION_HORIZ__SIZE 0x00000004 +#define NV50TCL_CLIPID_REGION_VERT(x) (0x00000d44+((x)*8)) +#define NV50TCL_CLIPID_REGION_VERT__SIZE 0x00000004 #define NV50TCL_VERTEX_BUFFER_FIRST 0x00000d74 #define NV50TCL_VERTEX_BUFFER_COUNT 0x00000d78 #define NV50TCL_CLEAR_COLOR(x) (0x00000d80+((x)*4)) @@ -7975,14 +8028,16 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
#define NV50TCL_GP_ADDRESS_LOW 0x00000f74 #define NV50TCL_VP_ADDRESS_HIGH 0x00000f7c #define NV50TCL_VP_ADDRESS_LOW 0x00000f80 -#define NV50TCL_UNK0F84_ADDRESS_HIGH 0x00000f84 -#define NV50TCL_UNK0F84_ADDRESS_LOW 0x00000f88 +#define NV50TCL_VERTEX_RUNOUT_HIGH 0x00000f84 +#define NV50TCL_VERTEX_RUNOUT_LOW 0x00000f88 #define NV50TCL_DEPTH_BOUNDS(x) (0x00000f9c+((x)*4)) #define NV50TCL_DEPTH_BOUNDS__SIZE 0x00000002 #define NV50TCL_FP_ADDRESS_HIGH 0x00000fa4 #define NV50TCL_FP_ADDRESS_LOW 0x00000fa8 #define NV50TCL_MSAA_MASK(x) (0x00000fbc+((x)*4)) #define NV50TCL_MSAA_MASK__SIZE 0x00000004 +#define NV50TCL_CLIPID_ADDRESS_HIGH 0x00000fcc +#define NV50TCL_CLIPID_ADDRESS_LOW 0x00000fd0 #define NV50TCL_ZETA_ADDRESS_HIGH 0x00000fe0 #define NV50TCL_ZETA_ADDRESS_LOW 0x00000fe4 #define NV50TCL_ZETA_FORMAT 0x00000fe8 @@ -8112,37 +8167,45 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #define NV50TCL_BLEND_EQUATION_RGB_FUNC_SUBTRACT 0x0000800a #define NV50TCL_BLEND_EQUATION_RGB_FUNC_REVERSE_SUBTRACT 0x0000800b #define NV50TCL_BLEND_FUNC_SRC_RGB 0x00001344 -#define NV50TCL_BLEND_FUNC_SRC_RGB_ZERO 0x00000000 -#define NV50TCL_BLEND_FUNC_SRC_RGB_ONE 0x00000001 -#define NV50TCL_BLEND_FUNC_SRC_RGB_SRC_COLOR 0x00000300 -#define NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_SRC_COLOR 0x00000301 -#define NV50TCL_BLEND_FUNC_SRC_RGB_SRC_ALPHA 0x00000302 -#define NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_SRC_ALPHA 0x00000303 -#define NV50TCL_BLEND_FUNC_SRC_RGB_DST_ALPHA 0x00000304 -#define NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_DST_ALPHA 0x00000305 -#define NV50TCL_BLEND_FUNC_SRC_RGB_DST_COLOR 0x00000306 -#define NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_DST_COLOR 0x00000307 -#define NV50TCL_BLEND_FUNC_SRC_RGB_SRC_ALPHA_SATURATE 0x00000308 -#define NV50TCL_BLEND_FUNC_SRC_RGB_CONSTANT_COLOR 0x00008001 -#define NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_CONSTANT_COLOR 0x00008002 -#define NV50TCL_BLEND_FUNC_SRC_RGB_CONSTANT_ALPHA 0x00008003 -#define 
NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_CONSTANT_ALPHA 0x00008004 +#define NV50TCL_BLEND_FUNC_SRC_RGB_ZERO 0x00004000 +#define NV50TCL_BLEND_FUNC_SRC_RGB_ONE 0x00004001 +#define NV50TCL_BLEND_FUNC_SRC_RGB_SRC_COLOR 0x00004300 +#define NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_SRC_COLOR 0x00004301 +#define NV50TCL_BLEND_FUNC_SRC_RGB_SRC_ALPHA 0x00004302 +#define NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_SRC_ALPHA 0x00004303 +#define NV50TCL_BLEND_FUNC_SRC_RGB_DST_ALPHA 0x00004304 +#define NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_DST_ALPHA 0x00004305 +#define NV50TCL_BLEND_FUNC_SRC_RGB_DST_COLOR 0x00004306 +#define NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_DST_COLOR 0x00004307 +#define NV50TCL_BLEND_FUNC_SRC_RGB_SRC_ALPHA_SATURATE 0x00004308 +#define NV50TCL_BLEND_FUNC_SRC_RGB_CONSTANT_COLOR 0x0000c001 +#define NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_CONSTANT_COLOR 0x0000c002 +#define NV50TCL_BLEND_FUNC_SRC_RGB_CONSTANT_ALPHA 0x0000c003 +#define NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_CONSTANT_ALPHA 0x0000c004 +#define NV50TCL_BLEND_FUNC_SRC_RGB_SRC1_COLOR 0x0000c900 +#define NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_SRC1_COLOR 0x0000c901 +#define NV50TCL_BLEND_FUNC_SRC_RGB_SRC1_ALPHA 0x0000c902 +#define NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_SRC1_ALPHA 0x0000c903 #define NV50TCL_BLEND_FUNC_DST_RGB 0x00001348 -#define NV50TCL_BLEND_FUNC_DST_RGB_ZERO 0x00000000 -#define NV50TCL_BLEND_FUNC_DST_RGB_ONE 0x00000001 -#define NV50TCL_BLEND_FUNC_DST_RGB_SRC_COLOR 0x00000300 -#define NV50TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_SRC_COLOR 0x00000301 -#define NV50TCL_BLEND_FUNC_DST_RGB_SRC_ALPHA 0x00000302 -#define NV50TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_SRC_ALPHA 0x00000303 -#define NV50TCL_BLEND_FUNC_DST_RGB_DST_ALPHA 0x00000304 -#define NV50TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_DST_ALPHA 0x00000305 -#define NV50TCL_BLEND_FUNC_DST_RGB_DST_COLOR 0x00000306 -#define NV50TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_DST_COLOR 0x00000307 -#define NV50TCL_BLEND_FUNC_DST_RGB_SRC_ALPHA_SATURATE 0x00000308 -#define 
NV50TCL_BLEND_FUNC_DST_RGB_CONSTANT_COLOR 0x00008001 -#define NV50TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_CONSTANT_COLOR 0x00008002 -#define NV50TCL_BLEND_FUNC_DST_RGB_CONSTANT_ALPHA 0x00008003 -#define NV50TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_CONSTANT_ALPHA 0x00008004 +#define NV50TCL_BLEND_FUNC_DST_RGB_ZERO 0x00004000 +#define NV50TCL_BLEND_FUNC_DST_RGB_ONE 0x00004001 +#define NV50TCL_BLEND_FUNC_DST_RGB_SRC_COLOR 0x00004300 +#define NV50TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_SRC_COLOR 0x00004301 +#define NV50TCL_BLEND_FUNC_DST_RGB_SRC_ALPHA 0x00004302 +#define NV50TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_SRC_ALPHA 0x00004303 +#define NV50TCL_BLEND_FUNC_DST_RGB_DST_ALPHA 0x00004304 +#define NV50TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_DST_ALPHA 0x00004305 +#define NV50TCL_BLEND_FUNC_DST_RGB_DST_COLOR 0x00004306 +#define NV50TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_DST_COLOR 0x00004307 +#define NV50TCL_BLEND_FUNC_DST_RGB_SRC_ALPHA_SATURATE 0x00004308 +#define NV50TCL_BLEND_FUNC_DST_RGB_CONSTANT_COLOR 0x0000c001 +#define NV50TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_CONSTANT_COLOR 0x0000c002 +#define NV50TCL_BLEND_FUNC_DST_RGB_CONSTANT_ALPHA 0x0000c003 +#define NV50TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_CONSTANT_ALPHA 0x0000c004 +#define NV50TCL_BLEND_FUNC_DST_RGB_SRC1_COLOR 0x0000c900 +#define NV50TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_SRC1_COLOR 0x0000c901 +#define NV50TCL_BLEND_FUNC_DST_RGB_SRC1_ALPHA 0x0000c902 +#define NV50TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_SRC1_ALPHA 0x0000c903 #define NV50TCL_BLEND_EQUATION_ALPHA 0x0000134c #define NV50TCL_BLEND_EQUATION_ALPHA_FUNC_ADD 0x00008006 #define NV50TCL_BLEND_EQUATION_ALPHA_MIN 0x00008007 @@ -8150,37 +8213,45 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
#define NV50TCL_BLEND_EQUATION_ALPHA_FUNC_SUBTRACT 0x0000800a #define NV50TCL_BLEND_EQUATION_ALPHA_FUNC_REVERSE_SUBTRACT 0x0000800b #define NV50TCL_BLEND_FUNC_SRC_ALPHA 0x00001350 -#define NV50TCL_BLEND_FUNC_SRC_ALPHA_ZERO 0x00000000 -#define NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE 0x00000001 -#define NV50TCL_BLEND_FUNC_SRC_ALPHA_SRC_COLOR 0x00000300 -#define NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_SRC_COLOR 0x00000301 -#define NV50TCL_BLEND_FUNC_SRC_ALPHA_SRC_ALPHA 0x00000302 -#define NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_SRC_ALPHA 0x00000303 -#define NV50TCL_BLEND_FUNC_SRC_ALPHA_DST_ALPHA 0x00000304 -#define NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_DST_ALPHA 0x00000305 -#define NV50TCL_BLEND_FUNC_SRC_ALPHA_DST_COLOR 0x00000306 -#define NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_DST_COLOR 0x00000307 -#define NV50TCL_BLEND_FUNC_SRC_ALPHA_SRC_ALPHA_SATURATE 0x00000308 -#define NV50TCL_BLEND_FUNC_SRC_ALPHA_CONSTANT_COLOR 0x00008001 -#define NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_CONSTANT_COLOR 0x00008002 -#define NV50TCL_BLEND_FUNC_SRC_ALPHA_CONSTANT_ALPHA 0x00008003 -#define NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_CONSTANT_ALPHA 0x00008004 +#define NV50TCL_BLEND_FUNC_SRC_ALPHA_ZERO 0x00004000 +#define NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE 0x00004001 +#define NV50TCL_BLEND_FUNC_SRC_ALPHA_SRC_COLOR 0x00004300 +#define NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_SRC_COLOR 0x00004301 +#define NV50TCL_BLEND_FUNC_SRC_ALPHA_SRC_ALPHA 0x00004302 +#define NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_SRC_ALPHA 0x00004303 +#define NV50TCL_BLEND_FUNC_SRC_ALPHA_DST_ALPHA 0x00004304 +#define NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_DST_ALPHA 0x00004305 +#define NV50TCL_BLEND_FUNC_SRC_ALPHA_DST_COLOR 0x00004306 +#define NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_DST_COLOR 0x00004307 +#define NV50TCL_BLEND_FUNC_SRC_ALPHA_SRC_ALPHA_SATURATE 0x00004308 +#define NV50TCL_BLEND_FUNC_SRC_ALPHA_CONSTANT_COLOR 0x0000c001 +#define NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_CONSTANT_COLOR 0x0000c002 +#define 
NV50TCL_BLEND_FUNC_SRC_ALPHA_CONSTANT_ALPHA 0x0000c003 +#define NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_CONSTANT_ALPHA 0x0000c004 +#define NV50TCL_BLEND_FUNC_SRC_ALPHA_SRC1_COLOR 0x0000c900 +#define NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_SRC1_COLOR 0x0000c901 +#define NV50TCL_BLEND_FUNC_SRC_ALPHA_SRC1_ALPHA 0x0000c902 +#define NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_SRC1_ALPHA 0x0000c903 #define NV50TCL_BLEND_FUNC_DST_ALPHA 0x00001358 -#define NV50TCL_BLEND_FUNC_DST_ALPHA_ZERO 0x00000000 -#define NV50TCL_BLEND_FUNC_DST_ALPHA_ONE 0x00000001 -#define NV50TCL_BLEND_FUNC_DST_ALPHA_SRC_COLOR 0x00000300 -#define NV50TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_SRC_COLOR 0x00000301 -#define NV50TCL_BLEND_FUNC_DST_ALPHA_SRC_ALPHA 0x00000302 -#define NV50TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_SRC_ALPHA 0x00000303 -#define NV50TCL_BLEND_FUNC_DST_ALPHA_DST_ALPHA 0x00000304 -#define NV50TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_DST_ALPHA 0x00000305 -#define NV50TCL_BLEND_FUNC_DST_ALPHA_DST_COLOR 0x00000306 -#define NV50TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_DST_COLOR 0x00000307 -#define NV50TCL_BLEND_FUNC_DST_ALPHA_SRC_ALPHA_SATURATE 0x00000308 -#define NV50TCL_BLEND_FUNC_DST_ALPHA_CONSTANT_COLOR 0x00008001 -#define NV50TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_CONSTANT_COLOR 0x00008002 -#define NV50TCL_BLEND_FUNC_DST_ALPHA_CONSTANT_ALPHA 0x00008003 -#define NV50TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_CONSTANT_ALPHA 0x00008004 +#define NV50TCL_BLEND_FUNC_DST_ALPHA_ZERO 0x00004000 +#define NV50TCL_BLEND_FUNC_DST_ALPHA_ONE 0x00004001 +#define NV50TCL_BLEND_FUNC_DST_ALPHA_SRC_COLOR 0x00004300 +#define NV50TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_SRC_COLOR 0x00004301 +#define NV50TCL_BLEND_FUNC_DST_ALPHA_SRC_ALPHA 0x00004302 +#define NV50TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_SRC_ALPHA 0x00004303 +#define NV50TCL_BLEND_FUNC_DST_ALPHA_DST_ALPHA 0x00004304 +#define NV50TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_DST_ALPHA 0x00004305 +#define NV50TCL_BLEND_FUNC_DST_ALPHA_DST_COLOR 0x00004306 +#define 
NV50TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_DST_COLOR 0x00004307 +#define NV50TCL_BLEND_FUNC_DST_ALPHA_SRC_ALPHA_SATURATE 0x00004308 +#define NV50TCL_BLEND_FUNC_DST_ALPHA_CONSTANT_COLOR 0x0000c001 +#define NV50TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_CONSTANT_COLOR 0x0000c002 +#define NV50TCL_BLEND_FUNC_DST_ALPHA_CONSTANT_ALPHA 0x0000c003 +#define NV50TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_CONSTANT_ALPHA 0x0000c004 +#define NV50TCL_BLEND_FUNC_DST_ALPHA_SRC1_COLOR 0x0000c900 +#define NV50TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_SRC1_COLOR 0x0000c901 +#define NV50TCL_BLEND_FUNC_DST_ALPHA_SRC1_ALPHA 0x0000c902 +#define NV50TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_SRC1_ALPHA 0x0000c903 #define NV50TCL_BLEND_ENABLE(x) (0x00001360+((x)*4)) #define NV50TCL_BLEND_ENABLE__SIZE 0x00000008 #define NV50TCL_STENCIL_FRONT_ENABLE 0x00001380 @@ -8239,6 +8310,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #define NV50TCL_FP_START_ID 0x00001414 #define NV50TCL_GP_VERTEX_OUTPUT_COUNT 0x00001420 #define NV50TCL_VB_ELEMENT_BASE 0x00001434 +#define NV50TCL_INSTANCE_BASE 0x00001438 #define NV50TCL_CODE_CB_FLUSH 0x00001440 #define NV50TCL_BIND_TSC(x) (0x00001444+((x)*8)) #define NV50TCL_BIND_TSC__SIZE 0x00000003 @@ -8256,6 +8328,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #define NV50TCL_BIND_TIC_TIC_MASK 0x7ffffe00 #define NV50TCL_STRMOUT_MAP(x) (0x00001480+((x)*4)) #define NV50TCL_STRMOUT_MAP__SIZE 0x00000020 +#define NV50TCL_CLIPID_HEIGHT 0x00001504 #define NV50TCL_VP_CLIP_DISTANCE_ENABLE 0x00001510 #define NV50TCL_VP_CLIP_DISTANCE_ENABLE_0 (1 << 0) #define NV50TCL_VP_CLIP_DISTANCE_ENABLE_1 (1 << 1) @@ -8340,7 +8413,14 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
#define NV50TCL_GP_BUILTIN_RESULT_EN 0x000015cc #define NV50TCL_GP_BUILTIN_RESULT_EN_VPORT_IDX (1 << 0) #define NV50TCL_GP_BUILTIN_RESULT_EN_LAYER_IDX (1 << 16) -#define NV50TCL_MULTISAMPLE_SAMPLES_LOG2 0x000015d0 +#define NV50TCL_MULTISAMPLE_MODE 0x000015d0 +#define NV50TCL_MULTISAMPLE_MODE_1X 0x00000000 +#define NV50TCL_MULTISAMPLE_MODE_2XMS 0x00000001 +#define NV50TCL_MULTISAMPLE_MODE_4XMS 0x00000002 +#define NV50TCL_MULTISAMPLE_MODE_8XMS 0x00000004 +#define NV50TCL_MULTISAMPLE_MODE_4XMS_4XCS 0x00000008 +#define NV50TCL_MULTISAMPLE_MODE_4XMS_12XCS 0x00000009 +#define NV50TCL_MULTISAMPLE_MODE_8XMS_8XCS 0x0000000a #define NV50TCL_VERTEX_BEGIN 0x000015dc #define NV50TCL_VERTEX_BEGIN_POINTS 0x00000000 #define NV50TCL_VERTEX_BEGIN_LINES 0x00000001 @@ -8356,6 +8436,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #define NV50TCL_VERTEX_BEGIN_LINE_STRIP_ADJACENCY 0x0000000b #define NV50TCL_VERTEX_BEGIN_TRIANGLES_ADJACENCY 0x0000000c #define NV50TCL_VERTEX_BEGIN_TRIANGLE_STRIP_ADJACENCY 0x0000000d +#define NV50TCL_VERTEX_BEGIN_PATCHES 0x0000000e #define NV50TCL_VERTEX_END 0x000015e0 #define NV50TCL_EDGEFLAG_ENABLE 0x000015e4 #define NV50TCL_VB_ELEMENT_U32 0x000015e8 @@ -8369,6 +8450,8 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #define NV50TCL_VB_ELEMENT_U16_I0_MASK 0x0000ffff #define NV50TCL_VB_ELEMENT_U16_I1_SHIFT 16 #define NV50TCL_VB_ELEMENT_U16_I1_MASK 0xffff0000 +#define NV50TCL_VERTEX_BASE_HIGH 0x000015f4 +#define NV50TCL_VERTEX_BASE_LOW 0x000015f8 #define NV50TCL_VERTEX_DATA 0x00001640 #define NV50TCL_PRIM_RESTART_ENABLE 0x00001644 #define NV50TCL_PRIM_RESTART_INDEX 0x00001648 @@ -8754,7 +8837,14 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
#define NV50TCL_VIEWPORT_TRANSFORM_EN 0x0000192c #define NV50TCL_VIEW_VOLUME_CLIP_CTRL 0x0000193c #define NV50TCL_VIEWPORT_CLIP_RECTS_EN 0x0000194c +#define NV50TCL_VIEWPORT_CLIP_MODE 0x00001950 +#define NV50TCL_VIEWPORT_CLIP_MODE_INCLUDE 0x00000000 +#define NV50TCL_VIEWPORT_CLIP_MODE_EXCLUDE 0x00000001 +#define NV50TCL_VIEWPORT_CLIP_MODE_UNKNOWN 0x00000002 #define NV50TCL_FP_CTRL_UNK196C 0x0000196c +#define NV50TCL_CLIPID_ENABLE 0x0000197c +#define NV50TCL_CLIPID_WIDTH 0x00001980 +#define NV50TCL_CLIPID_ID 0x00001984 #define NV50TCL_FP_INTERPOLANT_CTRL 0x00001988 #define NV50TCL_FP_INTERPOLANT_CTRL_UMASK_SHIFT 24 #define NV50TCL_FP_INTERPOLANT_CTRL_UMASK_MASK 0xff000000 @@ -8855,19 +8945,20 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #define NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_8_8 0x00c00000 #define NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_16 0x00d80000 #define NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_8 0x00e80000 +#define NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_2_10_10_10 0x01800000 #define NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_SHIFT 25 -#define NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_MASK 0x7e000000 -#define NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_FLOAT 0x7e000000 -#define NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_UNORM 0x24000000 -#define NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_SNORM 0x12000000 -#define NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_USCALED 0x5a000000 -#define NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_SSCALED 0x6c000000 -#define NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_UINT 0x48000000 -#define NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_SINT 0x36000000 +#define NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_MASK 0x0e000000 +#define NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_FLOAT 0x0e000000 +#define NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_UNORM 0x02000000 +#define NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_SNORM 0x04000000 +#define NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_USCALED 0x0a000000 +#define NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_SSCALED 0x0c000000 +#define NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_UINT 0x08000000 +#define NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_SINT 
0x06000000 #define NV50TCL_VERTEX_ARRAY_ATTRIB_BGRA (1 << 31) #define NV50TCL_QUERY_ADDRESS_HIGH 0x00001b00 #define NV50TCL_QUERY_ADDRESS_LOW 0x00001b04 -#define NV50TCL_QUERY_COUNTER 0x00001b08 +#define NV50TCL_QUERY_SEQUENCE 0x00001b08 #define NV50TCL_QUERY_GET 0x00001b0c @@ -9022,4 +9113,938 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #define NV50_COMPUTE_USER_PARAM__SIZE 0x00000040 +#define NVC0TCL 0x00009097 + +#define NVC0TCL_SEMAPHORE_ADDRESS_HIGH 0x00000010 +#define NVC0TCL_SEMAPHORE_ADDRESS_LOW 0x00000014 +#define NVC0TCL_NOP 0x00000100 +#define NVC0TCL_NOTIFY_ADDRESS_HIGH 0x00000104 +#define NVC0TCL_NOTIFY_ADDRESS_LOW 0x00000108 +#define NVC0TCL_NOTIFY 0x0000010c +#define NVC0TCL_SERIALIZE 0x00000110 +#define NVC0TCL_EARLY_FRAGMENT_TESTS 0x00000210 +#define NVC0TCL_TESS_MODE 0x00000320 +#define NVC0TCL_TESS_MODE_PRIM_SHIFT 0 +#define NVC0TCL_TESS_MODE_PRIM_MASK 0x0000000f +#define NVC0TCL_TESS_MODE_PRIM_ISOLINES 0x00000000 +#define NVC0TCL_TESS_MODE_PRIM_TRIANGLES 0x00000001 +#define NVC0TCL_TESS_MODE_PRIM_QUADS 0x00000002 +#define NVC0TCL_TESS_MODE_SPACING_SHIFT 4 +#define NVC0TCL_TESS_MODE_SPACING_MASK 0x000000f0 +#define NVC0TCL_TESS_MODE_SPACING_EQUAL 0x00000000 +#define NVC0TCL_TESS_MODE_SPACING_FRACTIONAL_ODD 0x00000010 +#define NVC0TCL_TESS_MODE_SPACING_FRACTIONAL_EVEN 0x00000020 +#define NVC0TCL_TESS_MODE_CW (1 << 8) +#define NVC0TCL_TESS_MODE_CONNECTED (1 << 9) +#define NVC0TCL_TESS_LEVEL_OUTER(x) (0x00000324+((x)*4)) +#define NVC0TCL_TESS_LEVEL_OUTER__SIZE 0x00000004 +#define NVC0TCL_TESS_LEVEL_INNER(x) (0x00000334+((x)*4)) +#define NVC0TCL_TESS_LEVEL_INNER__SIZE 0x00000002 +#define NVC0TCL_RASTERIZE_ENABLE 0x0000037c +#define NVC0TCL_TFB_BUFFER_ENABLE(x) (0x00000380+((x)*32)) +#define NVC0TCL_TFB_BUFFER_ENABLE__SIZE 0x00000004 +#define NVC0TCL_TFB_ADDRESS_HIGH(x) (0x00000384+((x)*32)) +#define NVC0TCL_TFB_ADDRESS_HIGH__SIZE 0x00000004 +#define NVC0TCL_TFB_ADDRESS_LOW(x) (0x00000388+((x)*32)) +#define 
NVC0TCL_TFB_ADDRESS_LOW__SIZE 0x00000004 +#define NVC0TCL_TFB_BUFFER_SIZE(x) (0x0000038c+((x)*32)) +#define NVC0TCL_TFB_BUFFER_SIZE__SIZE 0x00000004 +#define NVC0TCL_TFB_PRIMITIVE_ID(x) (0x00000390+((x)*32)) +#define NVC0TCL_TFB_PRIMITIVE_ID__SIZE 0x00000004 +#define NVC0TCL_TFB_UNK0700(x) (0x00000700+((x)*16)) +#define NVC0TCL_TFB_UNK0700__SIZE 0x00000004 +#define NVC0TCL_TFB_VARYING_COUNT(x) (0x00000704+((x)*16)) +#define NVC0TCL_TFB_VARYING_COUNT__SIZE 0x00000004 +#define NVC0TCL_TFB_BUFFER_STRIDE(x) (0x00000708+((x)*16)) +#define NVC0TCL_TFB_BUFFER_STRIDE__SIZE 0x00000004 +#define NVC0TCL_TFB_ENABLE 0x00000744 +#define NVC0TCL_LOCAL_BASE 0x0000077c +#define NVC0TCL_UNK0790_ADDRESS_HIGH 0x00000790 +#define NVC0TCL_UNK0790_ADDRESS_LOW 0x00000794 +#define NVC0TCL_RT_ADDRESS_HIGH(x) (0x00000800+((x)*32)) +#define NVC0TCL_RT_ADDRESS_HIGH__SIZE 0x00000008 +#define NVC0TCL_RT_ADDRESS_LOW(x) (0x00000804+((x)*32)) +#define NVC0TCL_RT_ADDRESS_LOW__SIZE 0x00000008 +#define NVC0TCL_RT_HORIZ(x) (0x00000808+((x)*32)) +#define NVC0TCL_RT_HORIZ__SIZE 0x00000008 +#define NVC0TCL_RT_VERT(x) (0x0000080c+((x)*32)) +#define NVC0TCL_RT_VERT__SIZE 0x00000008 +#define NVC0TCL_RT_FORMAT(x) (0x00000810+((x)*32)) +#define NVC0TCL_RT_FORMAT__SIZE 0x00000008 +#define NVC0TCL_RT_FORMAT_R32G32B32A32_FLOAT 0x000000c0 +#define NVC0TCL_RT_FORMAT_R32G32B32A32_SINT 0x000000c1 +#define NVC0TCL_RT_FORMAT_R32G32B32A32_UINT 0x000000c2 +#define NVC0TCL_RT_FORMAT_R32G32B32X32_FLOAT 0x000000c3 +#define NVC0TCL_RT_FORMAT_R16G16B16A16_UNORM 0x000000c6 +#define NVC0TCL_RT_FORMAT_R16G16B16A16_SNORM 0x000000c7 +#define NVC0TCL_RT_FORMAT_R16G16B16A16_SINT 0x000000c8 +#define NVC0TCL_RT_FORMAT_R16G16B16A16_UINT 0x000000c9 +#define NVC0TCL_RT_FORMAT_R16G16B16A16_FLOAT 0x000000ca +#define NVC0TCL_RT_FORMAT_R32G32_FLOAT 0x000000cb +#define NVC0TCL_RT_FORMAT_R32G32_SINT 0x000000cc +#define NVC0TCL_RT_FORMAT_R32G32_UINT 0x000000cd +#define NVC0TCL_RT_FORMAT_R16G16B16X16_FLOAT 0x000000ce +#define 
NVC0TCL_RT_FORMAT_A8R8G8B8_UNORM 0x000000cf +#define NVC0TCL_RT_FORMAT_A8R8G8B8_SRGB 0x000000d0 +#define NVC0TCL_RT_FORMAT_A2B10G10R10_UNORM 0x000000d1 +#define NVC0TCL_RT_FORMAT_A2B10G10R10_UINT 0x000000d2 +#define NVC0TCL_RT_FORMAT_A8B8G8R8_UNORM 0x000000d5 +#define NVC0TCL_RT_FORMAT_A8B8G8R8_SRGB 0x000000d6 +#define NVC0TCL_RT_FORMAT_A8B8G8R8_SNORM 0x000000d7 +#define NVC0TCL_RT_FORMAT_A8B8G8R8_SINT 0x000000d8 +#define NVC0TCL_RT_FORMAT_A8B8G8R8_UINT 0x000000d9 +#define NVC0TCL_RT_FORMAT_R16G16_UNORM 0x000000da +#define NVC0TCL_RT_FORMAT_R16G16_SNORM 0x000000db +#define NVC0TCL_RT_FORMAT_R16G16_SINT 0x000000dc +#define NVC0TCL_RT_FORMAT_R16G16_UINT 0x000000dd +#define NVC0TCL_RT_FORMAT_R16G16_FLOAT 0x000000de +#define NVC0TCL_RT_FORMAT_A2R10G10B10_UNORM 0x000000df +#define NVC0TCL_RT_FORMAT_B10G11R11_FLOAT 0x000000e0 +#define NVC0TCL_RT_FORMAT_R32_FLOAT 0x000000e5 +#define NVC0TCL_RT_FORMAT_X8R8G8B8_UNORM 0x000000e6 +#define NVC0TCL_RT_FORMAT_X8R8G8B8_SRGB 0x000000e7 +#define NVC0TCL_RT_FORMAT_R5G6B5_UNORM 0x000000e8 +#define NVC0TCL_RT_FORMAT_A1R5G5B5_UNORM 0x000000e9 +#define NVC0TCL_RT_FORMAT_R8G8_UNORM 0x000000ea +#define NVC0TCL_RT_FORMAT_R8G8_SNORM 0x000000eb +#define NVC0TCL_RT_FORMAT_R8G8_SINT 0x000000ec +#define NVC0TCL_RT_FORMAT_R8G8_UINT 0x000000ed +#define NVC0TCL_RT_FORMAT_R16_UNORM 0x000000ee +#define NVC0TCL_RT_FORMAT_R16_SNORM 0x000000ef +#define NVC0TCL_RT_FORMAT_R16_SINT 0x000000f0 +#define NVC0TCL_RT_FORMAT_R16_UINT 0x000000f1 +#define NVC0TCL_RT_FORMAT_R16_FLOAT 0x000000f2 +#define NVC0TCL_RT_FORMAT_R8_UNORM 0x000000f3 +#define NVC0TCL_RT_FORMAT_R8_SNORM 0x000000f4 +#define NVC0TCL_RT_FORMAT_R8_SINT 0x000000f5 +#define NVC0TCL_RT_FORMAT_R8_UINT 0x000000f6 +#define NVC0TCL_RT_FORMAT_A8_UNORM 0x000000f7 +#define NVC0TCL_RT_FORMAT_X1R5G5B5_UNORM 0x000000f8 +#define NVC0TCL_RT_FORMAT_X8B8G8R8_UNORM 0x000000f9 +#define NVC0TCL_RT_FORMAT_X8B8G8R8_SRGB 0x000000fa +#define NVC0TCL_RT_TILE_MODE(x) (0x00000814+((x)*32)) +#define 
NVC0TCL_RT_TILE_MODE__SIZE 0x00000008 +#define NVC0TCL_RT_ARRAY_MODE(x) (0x00000818+((x)*32)) +#define NVC0TCL_RT_ARRAY_MODE__SIZE 0x00000008 +#define NVC0TCL_RT_ARRAY_MODE_LAYERS_SHIFT 0 +#define NVC0TCL_RT_ARRAY_MODE_LAYERS_MASK 0x0000ffff +#define NVC0TCL_RT_ARRAY_MODE_VOLUME (1 << 16) +#define NVC0TCL_RT_LAYER_STRIDE(x) (0x0000081c+((x)*32)) +#define NVC0TCL_RT_LAYER_STRIDE__SIZE 0x00000008 +#define NVC0TCL_VIEWPORT_SCALE_X(x) (0x00000a00+((x)*32)) +#define NVC0TCL_VIEWPORT_SCALE_X__SIZE 0x00000010 +#define NVC0TCL_VIEWPORT_SCALE_Y(x) (0x00000a04+((x)*32)) +#define NVC0TCL_VIEWPORT_SCALE_Y__SIZE 0x00000010 +#define NVC0TCL_VIEWPORT_SCALE_Z(x) (0x00000a08+((x)*32)) +#define NVC0TCL_VIEWPORT_SCALE_Z__SIZE 0x00000010 +#define NVC0TCL_VIEWPORT_TRANSLATE_X(x) (0x00000a0c+((x)*32)) +#define NVC0TCL_VIEWPORT_TRANSLATE_X__SIZE 0x00000010 +#define NVC0TCL_VIEWPORT_TRANSLATE_Y(x) (0x00000a10+((x)*32)) +#define NVC0TCL_VIEWPORT_TRANSLATE_Y__SIZE 0x00000010 +#define NVC0TCL_VIEWPORT_TRANSLATE_Z(x) (0x00000a14+((x)*32)) +#define NVC0TCL_VIEWPORT_TRANSLATE_Z__SIZE 0x00000010 +#define NVC0TCL_VIEWPORT_HORIZ(x) (0x00000c00+((x)*16)) +#define NVC0TCL_VIEWPORT_HORIZ__SIZE 0x00000010 +#define NVC0TCL_VIEWPORT_HORIZ_X_SHIFT 0 +#define NVC0TCL_VIEWPORT_HORIZ_X_MASK 0x0000ffff +#define NVC0TCL_VIEWPORT_HORIZ_W_SHIFT 16 +#define NVC0TCL_VIEWPORT_HORIZ_W_MASK 0xffff0000 +#define NVC0TCL_VIEWPORT_VERT(x) (0x00000c04+((x)*16)) +#define NVC0TCL_VIEWPORT_VERT__SIZE 0x00000010 +#define NVC0TCL_VIEWPORT_VERT_Y_SHIFT 0 +#define NVC0TCL_VIEWPORT_VERT_Y_MASK 0x0000ffff +#define NVC0TCL_VIEWPORT_VERT_H_SHIFT 16 +#define NVC0TCL_VIEWPORT_VERT_H_MASK 0xffff0000 +#define NVC0TCL_DEPTH_RANGE_NEAR(x) (0x00000c08+((x)*16)) +#define NVC0TCL_DEPTH_RANGE_NEAR__SIZE 0x00000010 +#define NVC0TCL_DEPTH_RANGE_FAR(x) (0x00000c0c+((x)*16)) +#define NVC0TCL_DEPTH_RANGE_FAR__SIZE 0x00000010 +#define NVC0TCL_VIEWPORT_CLIP_HORIZ(x) (0x00000d00+((x)*8)) +#define NVC0TCL_VIEWPORT_CLIP_HORIZ__SIZE 0x00000008 +#define 
NVC0TCL_VIEWPORT_CLIP_HORIZ_MIN_SHIFT 0 +#define NVC0TCL_VIEWPORT_CLIP_HORIZ_MIN_MASK 0x0000ffff +#define NVC0TCL_VIEWPORT_CLIP_HORIZ_MAX_SHIFT 16 +#define NVC0TCL_VIEWPORT_CLIP_HORIZ_MAX_MASK 0xffff0000 +#define NVC0TCL_VIEWPORT_CLIP_VERT(x) (0x00000d04+((x)*8)) +#define NVC0TCL_VIEWPORT_CLIP_VERT__SIZE 0x00000008 +#define NVC0TCL_VIEWPORT_CLIP_VERT_MIN_SHIFT 0 +#define NVC0TCL_VIEWPORT_CLIP_VERT_MIN_MASK 0x0000ffff +#define NVC0TCL_VIEWPORT_CLIP_VERT_MAX_SHIFT 16 +#define NVC0TCL_VIEWPORT_CLIP_VERT_MAX_MASK 0xffff0000 +#define NVC0TCL_CLIPID_REGION_HORIZ(x) (0x00000d40+((x)*8)) +#define NVC0TCL_CLIPID_REGION_HORIZ__SIZE 0x00000004 +#define NVC0TCL_CLIPID_REGION_VERT(x) (0x00000d44+((x)*8)) +#define NVC0TCL_CLIPID_REGION_VERT__SIZE 0x00000004 +#define NVC0TCL_VERTEX_BUFFER_FIRST 0x00000d74 +#define NVC0TCL_VERTEX_BUFFER_COUNT 0x00000d78 +#define NVC0TCL_CLEAR_COLOR(x) (0x00000d80+((x)*4)) +#define NVC0TCL_CLEAR_COLOR__SIZE 0x00000004 +#define NVC0TCL_CLEAR_DEPTH 0x00000d90 +#define NVC0TCL_STACK_ADDRESS_HIGH 0x00000d94 +#define NVC0TCL_STACK_ADDRESS_LOW 0x00000d98 +#define NVC0TCL_STACK_SIZE_LOG 0x00000d9c +#define NVC0TCL_CLEAR_STENCIL 0x00000da0 +#define NVC0TCL_POLYGON_SMOOTH_ENABLE 0x00000db4 +#define NVC0TCL_POLYGON_OFFSET_POINT_ENABLE 0x00000dc0 +#define NVC0TCL_POLYGON_OFFSET_LINE_ENABLE 0x00000dc4 +#define NVC0TCL_POLYGON_OFFSET_FILL_ENABLE 0x00000dc8 +#define NVC0TCL_PATCH_VERTICES 0x00000dcc +#define NVC0TCL_WATCHDOG_TIMER 0x00000de4 +#define NVC0TCL_WINDOW_OFFSET_X 0x00000df8 +#define NVC0TCL_WINDOW_OFFSET_Y 0x00000dfc +#define NVC0TCL_SCISSOR_ENABLE(x) (0x00000e00+((x)*16)) +#define NVC0TCL_SCISSOR_ENABLE__SIZE 0x00000010 +#define NVC0TCL_SCISSOR_HORIZ(x) (0x00000e04+((x)*16)) +#define NVC0TCL_SCISSOR_HORIZ__SIZE 0x00000010 +#define NVC0TCL_SCISSOR_HORIZ_MIN_SHIFT 0 +#define NVC0TCL_SCISSOR_HORIZ_MIN_MASK 0x0000ffff +#define NVC0TCL_SCISSOR_HORIZ_MAX_SHIFT 16 +#define NVC0TCL_SCISSOR_HORIZ_MAX_MASK 0xffff0000 +#define NVC0TCL_SCISSOR_VERT(x) 
(0x00000e08+((x)*16)) +#define NVC0TCL_SCISSOR_VERT__SIZE 0x00000010 +#define NVC0TCL_SCISSOR_VERT_MIN_SHIFT 0 +#define NVC0TCL_SCISSOR_VERT_MIN_MASK 0x0000ffff +#define NVC0TCL_SCISSOR_VERT_MAX_SHIFT 16 +#define NVC0TCL_SCISSOR_VERT_MAX_MASK 0xffff0000 +#define NVC0TCL_LOCAL_WARPS_LOG_ALLOC 0x00000f44 +#define NVC0TCL_LOCAL_WARPS_NO_CLAMP 0x00000f48 +#define NVC0TCL_STACK_WARPS_LOG_ALLOC 0x00000f4c +#define NVC0TCL_STACK_WARPS_NO_CLAMP 0x00000f50 +#define NVC0TCL_STENCIL_BACK_FUNC_REF 0x00000f54 +#define NVC0TCL_STENCIL_BACK_MASK 0x00000f58 +#define NVC0TCL_STENCIL_BACK_FUNC_MASK 0x00000f5c +#define NVC0TCL_VERTEX_RUNOUT_HIGH 0x00000f84 +#define NVC0TCL_VERTEX_RUNOUT_LOW 0x00000f88 +#define NVC0TCL_DEPTH_BOUNDS(x) (0x00000f9c+((x)*4)) +#define NVC0TCL_DEPTH_BOUNDS__SIZE 0x00000002 +#define NVC0TCL_MSAA_MASK(x) (0x00000fbc+((x)*4)) +#define NVC0TCL_MSAA_MASK__SIZE 0x00000004 +#define NVC0TCL_CLIPID_ADDRESS_HIGH 0x00000fcc +#define NVC0TCL_CLIPID_ADDRESS_LOW 0x00000fd0 +#define NVC0TCL_ZETA_ADDRESS_HIGH 0x00000fe0 +#define NVC0TCL_ZETA_ADDRESS_LOW 0x00000fe4 +#define NVC0TCL_ZETA_FORMAT 0x00000fe8 +#define NVC0TCL_ZETA_FORMAT_Z32_FLOAT 0x0000000a +#define NVC0TCL_ZETA_FORMAT_Z16_UNORM 0x00000013 +#define NVC0TCL_ZETA_FORMAT_Z24S8_UNORM 0x00000014 +#define NVC0TCL_ZETA_FORMAT_X8Z24_UNORM 0x00000015 +#define NVC0TCL_ZETA_FORMAT_S8Z24_UNORM 0x00000016 +#define NVC0TCL_ZETA_FORMAT_Z32_FLOAT_X24S8_UNORM 0x00000019 +#define NVC0TCL_ZETA_TILE_MODE 0x00000fec +#define NVC0TCL_ZETA_LAYER_STRIDE 0x00000ff0 +#define NVC0TCL_SCREEN_SCISSOR_HORIZ 0x00000ff4 +#define NVC0TCL_SCREEN_SCISSOR_HORIZ_W_SHIFT 16 +#define NVC0TCL_SCREEN_SCISSOR_HORIZ_W_MASK 0xffff0000 +#define NVC0TCL_SCREEN_SCISSOR_HORIZ_X_SHIFT 0 +#define NVC0TCL_SCREEN_SCISSOR_HORIZ_X_MASK 0x0000ffff +#define NVC0TCL_SCREEN_SCISSOR_VERT 0x00000ff8 +#define NVC0TCL_SCREEN_SCISSOR_VERT_H_SHIFT 16 +#define NVC0TCL_SCREEN_SCISSOR_VERT_H_MASK 0xffff0000 +#define NVC0TCL_SCREEN_SCISSOR_VERT_Y_SHIFT 0 +#define 
NVC0TCL_SCREEN_SCISSOR_VERT_Y_MASK 0x0000ffff +#define NVC0TCL_VTX_ATTR_DEFINE 0x0000114c +#define NVC0TCL_VTX_ATTR_DEFINE_ATTR_SHIFT 0 +#define NVC0TCL_VTX_ATTR_DEFINE_ATTR_MASK 0x0000003f +#define NVC0TCL_VTX_ATTR_DEFINE_COMP_SHIFT 8 +#define NVC0TCL_VTX_ATTR_DEFINE_COMP_MASK 0x00000f00 +#define NVC0TCL_VTX_ATTR_DEFINE_SIZE_SHIFT 12 +#define NVC0TCL_VTX_ATTR_DEFINE_SIZE_MASK 0x0000f000 +#define NVC0TCL_VTX_ATTR_DEFINE_TYPE_SHIFT 16 +#define NVC0TCL_VTX_ATTR_DEFINE_TYPE_MASK 0x000f0000 +#define NVC0TCL_VTX_ATTR_DEFINE_TYPE_FLOAT 0x00070000 +#define NVC0TCL_VTX_ATTR_DEFINE_TYPE_UNORM 0x00010000 +#define NVC0TCL_VTX_ATTR_DEFINE_TYPE_SNORM 0x00020000 +#define NVC0TCL_VTX_ATTR_DEFINE_TYPE_USCALED 0x00050000 +#define NVC0TCL_VTX_ATTR_DEFINE_TYPE_SSCALED 0x00060000 +#define NVC0TCL_VTX_ATTR_DEFINE_TYPE_UINT 0x00040000 +#define NVC0TCL_VTX_ATTR_DEFINE_TYPE_SINT 0x00030000 +#define NVC0TCL_VTX_ATTR_DATA(x) (0x00001150+((x)*4)) +#define NVC0TCL_VTX_ATTR_DATA__SIZE 0x00000004 +#define NVC0TCL_VERTEX_ATTRIB_FORMAT(x) (0x00001160+((x)*4)) +#define NVC0TCL_VERTEX_ATTRIB_FORMAT__SIZE 0x00000020 +#define NVC0TCL_VERTEX_ATTRIB_FORMAT_BUFFER_SHIFT 0 +#define NVC0TCL_VERTEX_ATTRIB_FORMAT_BUFFER_MASK 0x0000003f +#define NVC0TCL_VERTEX_ATTRIB_FORMAT_CONST (1 << 6) +#define NVC0TCL_VERTEX_ATTRIB_FORMAT_OFFSET_SHIFT 7 +#define NVC0TCL_VERTEX_ATTRIB_FORMAT_OFFSET_MASK 0x001fff80 +#define NVC0TCL_VERTEX_ATTRIB_FORMAT_FORMAT_SHIFT 21 +#define NVC0TCL_VERTEX_ATTRIB_FORMAT_FORMAT_MASK 0x07e00000 +#define NVC0TCL_VERTEX_ATTRIB_FORMAT_FORMAT_32_32_32_32 0x00200000 +#define NVC0TCL_VERTEX_ATTRIB_FORMAT_FORMAT_32_32_32 0x00400000 +#define NVC0TCL_VERTEX_ATTRIB_FORMAT_FORMAT_16_16_16_16 0x00600000 +#define NVC0TCL_VERTEX_ATTRIB_FORMAT_FORMAT_32_32 0x00800000 +#define NVC0TCL_VERTEX_ATTRIB_FORMAT_FORMAT_16_16_16 0x00a00000 +#define NVC0TCL_VERTEX_ATTRIB_FORMAT_FORMAT_8_8_8_8 0x01400000 +#define NVC0TCL_VERTEX_ATTRIB_FORMAT_FORMAT_16_16 0x01e00000 +#define NVC0TCL_VERTEX_ATTRIB_FORMAT_FORMAT_32 
0x02400000 +#define NVC0TCL_VERTEX_ATTRIB_FORMAT_FORMAT_8_8_8 0x02600000 +#define NVC0TCL_VERTEX_ATTRIB_FORMAT_FORMAT_8_8 0x03000000 +#define NVC0TCL_VERTEX_ATTRIB_FORMAT_FORMAT_16 0x03600000 +#define NVC0TCL_VERTEX_ATTRIB_FORMAT_FORMAT_8 0x03a00000 +#define NVC0TCL_VERTEX_ATTRIB_FORMAT_FORMAT_2_10_10_10 0x06000000 +#define NVC0TCL_VERTEX_ATTRIB_FORMAT_TYPE_SHIFT 27 +#define NVC0TCL_VERTEX_ATTRIB_FORMAT_TYPE_MASK 0x78000000 +#define NVC0TCL_VERTEX_ATTRIB_FORMAT_TYPE_FLOAT 0x38000000 +#define NVC0TCL_VERTEX_ATTRIB_FORMAT_TYPE_UNORM 0x08000000 +#define NVC0TCL_VERTEX_ATTRIB_FORMAT_TYPE_SNORM 0x10000000 +#define NVC0TCL_VERTEX_ATTRIB_FORMAT_TYPE_USCALED 0x28000000 +#define NVC0TCL_VERTEX_ATTRIB_FORMAT_TYPE_SSCALED 0x30000000 +#define NVC0TCL_VERTEX_ATTRIB_FORMAT_TYPE_UINT 0x20000000 +#define NVC0TCL_VERTEX_ATTRIB_FORMAT_TYPE_SINT 0x18000000 +#define NVC0TCL_RT_CONTROL 0x0000121c +#define NVC0TCL_RT_CONTROL_COUNT_SHIFT 0 +#define NVC0TCL_RT_CONTROL_COUNT_MASK 0x0000000f +#define NVC0TCL_RT_CONTROL_MAP0_SHIFT 4 +#define NVC0TCL_RT_CONTROL_MAP0_MASK 0x00000070 +#define NVC0TCL_RT_CONTROL_MAP1_SHIFT 7 +#define NVC0TCL_RT_CONTROL_MAP1_MASK 0x00000380 +#define NVC0TCL_RT_CONTROL_MAP2_SHIFT 10 +#define NVC0TCL_RT_CONTROL_MAP2_MASK 0x00001c00 +#define NVC0TCL_RT_CONTROL_MAP3_SHIFT 13 +#define NVC0TCL_RT_CONTROL_MAP3_MASK 0x0000e000 +#define NVC0TCL_RT_CONTROL_MAP4_SHIFT 16 +#define NVC0TCL_RT_CONTROL_MAP4_MASK 0x00070000 +#define NVC0TCL_RT_CONTROL_MAP5_SHIFT 19 +#define NVC0TCL_RT_CONTROL_MAP5_MASK 0x00380000 +#define NVC0TCL_RT_CONTROL_MAP6_SHIFT 22 +#define NVC0TCL_RT_CONTROL_MAP6_MASK 0x01c00000 +#define NVC0TCL_RT_CONTROL_MAP7_SHIFT 25 +#define NVC0TCL_RT_CONTROL_MAP7_MASK 0x0e000000 +#define NVC0TCL_ZETA_HORIZ 0x00001228 +#define NVC0TCL_ZETA_VERT 0x0000122c +#define NVC0TCL_ZETA_ARRAY_MODE 0x00001230 +#define NVC0TCL_ZETA_ARRAY_MODE_LAYERS_SHIFT 0 +#define NVC0TCL_ZETA_ARRAY_MODE_LAYERS_MASK 0x0000ffff +#define NVC0TCL_ZETA_ARRAY_MODE_UNK (1 << 16) +#define 
NVC0TCL_LINKED_TSC 0x00001234 +#define NVC0TCL_FP_RESULT_COUNT 0x00001298 +#define NVC0TCL_DEPTH_TEST_ENABLE 0x000012cc +#define NVC0TCL_SHADE_MODEL 0x000012d4 +#define NVC0TCL_SHADE_MODEL_FLAT 0x00001d00 +#define NVC0TCL_SHADE_MODEL_SMOOTH 0x00001d01 +#define NVC0TCL_BLEND_INDEPENDENT 0x000012e4 +#define NVC0TCL_DEPTH_WRITE_ENABLE 0x000012e8 +#define NVC0TCL_ALPHA_TEST_ENABLE 0x000012ec +#define NVC0TCL_PM_SET(x) (0x000012f0+((x)*4)) +#define NVC0TCL_PM_SET__SIZE 0x00000004 +#define NVC0TCL_VB_ELEMENT_U8_SETUP 0x00001300 +#define NVC0TCL_VB_ELEMENT_U8_SETUP_OFFSET_SHIFT 30 +#define NVC0TCL_VB_ELEMENT_U8_SETUP_OFFSET_MASK 0xc0000000 +#define NVC0TCL_VB_ELEMENT_U8_SETUP_COUNT_SHIFT 0 +#define NVC0TCL_VB_ELEMENT_U8_SETUP_COUNT_MASK 0x3fffffff +#define NVC0TCL_VB_ELEMENT_U8 0x00001304 +#define NVC0TCL_VB_ELEMENT_U8_I0_SHIFT 0 +#define NVC0TCL_VB_ELEMENT_U8_I0_MASK 0x000000ff +#define NVC0TCL_VB_ELEMENT_U8_I1_SHIFT 8 +#define NVC0TCL_VB_ELEMENT_U8_I1_MASK 0x0000ff00 +#define NVC0TCL_VB_ELEMENT_U8_I2_SHIFT 16 +#define NVC0TCL_VB_ELEMENT_U8_I2_MASK 0x00ff0000 +#define NVC0TCL_VB_ELEMENT_U8_I3_SHIFT 24 +#define NVC0TCL_VB_ELEMENT_U8_I3_MASK 0xff000000 +#define NVC0TCL_DEPTH_TEST_FUNC 0x0000130c +#define NVC0TCL_DEPTH_TEST_FUNC_NEVER 0x00000200 +#define NVC0TCL_DEPTH_TEST_FUNC_LESS 0x00000201 +#define NVC0TCL_DEPTH_TEST_FUNC_EQUAL 0x00000202 +#define NVC0TCL_DEPTH_TEST_FUNC_LEQUAL 0x00000203 +#define NVC0TCL_DEPTH_TEST_FUNC_GREATER 0x00000204 +#define NVC0TCL_DEPTH_TEST_FUNC_NOTEQUAL 0x00000205 +#define NVC0TCL_DEPTH_TEST_FUNC_GEQUAL 0x00000206 +#define NVC0TCL_DEPTH_TEST_FUNC_ALWAYS 0x00000207 +#define NVC0TCL_ALPHA_TEST_REF 0x00001310 +#define NVC0TCL_ALPHA_TEST_FUNC 0x00001314 +#define NVC0TCL_ALPHA_TEST_FUNC_NEVER 0x00000200 +#define NVC0TCL_ALPHA_TEST_FUNC_LESS 0x00000201 +#define NVC0TCL_ALPHA_TEST_FUNC_EQUAL 0x00000202 +#define NVC0TCL_ALPHA_TEST_FUNC_LEQUAL 0x00000203 +#define NVC0TCL_ALPHA_TEST_FUNC_GREATER 0x00000204 +#define NVC0TCL_ALPHA_TEST_FUNC_NOTEQUAL 
0x00000205 +#define NVC0TCL_ALPHA_TEST_FUNC_GEQUAL 0x00000206 +#define NVC0TCL_ALPHA_TEST_FUNC_ALWAYS 0x00000207 +#define NVC0TCL_BLEND_COLOR(x) (0x0000131c+((x)*4)) +#define NVC0TCL_BLEND_COLOR__SIZE 0x00000004 +#define NVC0TCL_TIC_FLUSH 0x00001330 +#define NVC0TCL_TSC_FLUSH 0x00001334 +#define NVC0TCL_TEX_CACHE_CTL 0x00001338 +#define NVC0TCL_BLEND_EQUATION_RGB 0x00001340 +#define NVC0TCL_BLEND_EQUATION_RGB_FUNC_ADD 0x00008006 +#define NVC0TCL_BLEND_EQUATION_RGB_MIN 0x00008007 +#define NVC0TCL_BLEND_EQUATION_RGB_MAX 0x00008008 +#define NVC0TCL_BLEND_EQUATION_RGB_FUNC_SUBTRACT 0x0000800a +#define NVC0TCL_BLEND_EQUATION_RGB_FUNC_REVERSE_SUBTRACT 0x0000800b +#define NVC0TCL_BLEND_FUNC_SRC_RGB 0x00001344 +#define NVC0TCL_BLEND_FUNC_SRC_RGB_ZERO 0x00004000 +#define NVC0TCL_BLEND_FUNC_SRC_RGB_ONE 0x00004001 +#define NVC0TCL_BLEND_FUNC_SRC_RGB_SRC_COLOR 0x00004300 +#define NVC0TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_SRC_COLOR 0x00004301 +#define NVC0TCL_BLEND_FUNC_SRC_RGB_SRC_ALPHA 0x00004302 +#define NVC0TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_SRC_ALPHA 0x00004303 +#define NVC0TCL_BLEND_FUNC_SRC_RGB_DST_ALPHA 0x00004304 +#define NVC0TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_DST_ALPHA 0x00004305 +#define NVC0TCL_BLEND_FUNC_SRC_RGB_DST_COLOR 0x00004306 +#define NVC0TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_DST_COLOR 0x00004307 +#define NVC0TCL_BLEND_FUNC_SRC_RGB_SRC_ALPHA_SATURATE 0x00004308 +#define NVC0TCL_BLEND_FUNC_SRC_RGB_CONSTANT_COLOR 0x0000c001 +#define NVC0TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_CONSTANT_COLOR 0x0000c002 +#define NVC0TCL_BLEND_FUNC_SRC_RGB_CONSTANT_ALPHA 0x0000c003 +#define NVC0TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_CONSTANT_ALPHA 0x0000c004 +#define NVC0TCL_BLEND_FUNC_SRC_RGB_SRC1_COLOR 0x0000c900 +#define NVC0TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_SRC1_COLOR 0x0000c901 +#define NVC0TCL_BLEND_FUNC_SRC_RGB_SRC1_ALPHA 0x0000c902 +#define NVC0TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_SRC1_ALPHA 0x0000c903 +#define NVC0TCL_BLEND_FUNC_DST_RGB 0x00001348 +#define NVC0TCL_BLEND_FUNC_DST_RGB_ZERO 0x00004000 
+#define NVC0TCL_BLEND_FUNC_DST_RGB_ONE 0x00004001 +#define NVC0TCL_BLEND_FUNC_DST_RGB_SRC_COLOR 0x00004300 +#define NVC0TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_SRC_COLOR 0x00004301 +#define NVC0TCL_BLEND_FUNC_DST_RGB_SRC_ALPHA 0x00004302 +#define NVC0TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_SRC_ALPHA 0x00004303 +#define NVC0TCL_BLEND_FUNC_DST_RGB_DST_ALPHA 0x00004304 +#define NVC0TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_DST_ALPHA 0x00004305 +#define NVC0TCL_BLEND_FUNC_DST_RGB_DST_COLOR 0x00004306 +#define NVC0TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_DST_COLOR 0x00004307 +#define NVC0TCL_BLEND_FUNC_DST_RGB_SRC_ALPHA_SATURATE 0x00004308 +#define NVC0TCL_BLEND_FUNC_DST_RGB_CONSTANT_COLOR 0x0000c001 +#define NVC0TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_CONSTANT_COLOR 0x0000c002 +#define NVC0TCL_BLEND_FUNC_DST_RGB_CONSTANT_ALPHA 0x0000c003 +#define NVC0TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_CONSTANT_ALPHA 0x0000c004 +#define NVC0TCL_BLEND_FUNC_DST_RGB_SRC1_COLOR 0x0000c900 +#define NVC0TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_SRC1_COLOR 0x0000c901 +#define NVC0TCL_BLEND_FUNC_DST_RGB_SRC1_ALPHA 0x0000c902 +#define NVC0TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_SRC1_ALPHA 0x0000c903 +#define NVC0TCL_BLEND_EQUATION_ALPHA 0x0000134c +#define NVC0TCL_BLEND_EQUATION_ALPHA_FUNC_ADD 0x00008006 +#define NVC0TCL_BLEND_EQUATION_ALPHA_MIN 0x00008007 +#define NVC0TCL_BLEND_EQUATION_ALPHA_MAX 0x00008008 +#define NVC0TCL_BLEND_EQUATION_ALPHA_FUNC_SUBTRACT 0x0000800a +#define NVC0TCL_BLEND_EQUATION_ALPHA_FUNC_REVERSE_SUBTRACT 0x0000800b +#define NVC0TCL_BLEND_FUNC_SRC_ALPHA 0x00001350 +#define NVC0TCL_BLEND_FUNC_SRC_ALPHA_ZERO 0x00004000 +#define NVC0TCL_BLEND_FUNC_SRC_ALPHA_ONE 0x00004001 +#define NVC0TCL_BLEND_FUNC_SRC_ALPHA_SRC_COLOR 0x00004300 +#define NVC0TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_SRC_COLOR 0x00004301 +#define NVC0TCL_BLEND_FUNC_SRC_ALPHA_SRC_ALPHA 0x00004302 +#define NVC0TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_SRC_ALPHA 0x00004303 +#define NVC0TCL_BLEND_FUNC_SRC_ALPHA_DST_ALPHA 0x00004304 +#define 
NVC0TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_DST_ALPHA 0x00004305 +#define NVC0TCL_BLEND_FUNC_SRC_ALPHA_DST_COLOR 0x00004306 +#define NVC0TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_DST_COLOR 0x00004307 +#define NVC0TCL_BLEND_FUNC_SRC_ALPHA_SRC_ALPHA_SATURATE 0x00004308 +#define NVC0TCL_BLEND_FUNC_SRC_ALPHA_CONSTANT_COLOR 0x0000c001 +#define NVC0TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_CONSTANT_COLOR 0x0000c002 +#define NVC0TCL_BLEND_FUNC_SRC_ALPHA_CONSTANT_ALPHA 0x0000c003 +#define NVC0TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_CONSTANT_ALPHA 0x0000c004 +#define NVC0TCL_BLEND_FUNC_SRC_ALPHA_SRC1_COLOR 0x0000c900 +#define NVC0TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_SRC1_COLOR 0x0000c901 +#define NVC0TCL_BLEND_FUNC_SRC_ALPHA_SRC1_ALPHA 0x0000c902 +#define NVC0TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_SRC1_ALPHA 0x0000c903 +#define NVC0TCL_BLEND_FUNC_DST_ALPHA 0x00001358 +#define NVC0TCL_BLEND_FUNC_DST_ALPHA_ZERO 0x00004000 +#define NVC0TCL_BLEND_FUNC_DST_ALPHA_ONE 0x00004001 +#define NVC0TCL_BLEND_FUNC_DST_ALPHA_SRC_COLOR 0x00004300 +#define NVC0TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_SRC_COLOR 0x00004301 +#define NVC0TCL_BLEND_FUNC_DST_ALPHA_SRC_ALPHA 0x00004302 +#define NVC0TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_SRC_ALPHA 0x00004303 +#define NVC0TCL_BLEND_FUNC_DST_ALPHA_DST_ALPHA 0x00004304 +#define NVC0TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_DST_ALPHA 0x00004305 +#define NVC0TCL_BLEND_FUNC_DST_ALPHA_DST_COLOR 0x00004306 +#define NVC0TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_DST_COLOR 0x00004307 +#define NVC0TCL_BLEND_FUNC_DST_ALPHA_SRC_ALPHA_SATURATE 0x00004308 +#define NVC0TCL_BLEND_FUNC_DST_ALPHA_CONSTANT_COLOR 0x0000c001 +#define NVC0TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_CONSTANT_COLOR 0x0000c002 +#define NVC0TCL_BLEND_FUNC_DST_ALPHA_CONSTANT_ALPHA 0x0000c003 +#define NVC0TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_CONSTANT_ALPHA 0x0000c004 +#define NVC0TCL_BLEND_FUNC_DST_ALPHA_SRC1_COLOR 0x0000c900 +#define NVC0TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_SRC1_COLOR 0x0000c901 +#define NVC0TCL_BLEND_FUNC_DST_ALPHA_SRC1_ALPHA 0x0000c902 
+#define NVC0TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_SRC1_ALPHA 0x0000c903 +#define NVC0TCL_STENCIL_ENABLE 0x00001380 +#define NVC0TCL_STENCIL_FRONT_OP_FAIL 0x00001384 +#define NVC0TCL_STENCIL_FRONT_OP_FAIL_ZERO 0x00000000 +#define NVC0TCL_STENCIL_FRONT_OP_FAIL_INVERT 0x0000150a +#define NVC0TCL_STENCIL_FRONT_OP_FAIL_KEEP 0x00001e00 +#define NVC0TCL_STENCIL_FRONT_OP_FAIL_REPLACE 0x00001e01 +#define NVC0TCL_STENCIL_FRONT_OP_FAIL_INCR 0x00001e02 +#define NVC0TCL_STENCIL_FRONT_OP_FAIL_DECR 0x00001e03 +#define NVC0TCL_STENCIL_FRONT_OP_FAIL_INCR_WRAP 0x00008507 +#define NVC0TCL_STENCIL_FRONT_OP_FAIL_DECR_WRAP 0x00008508 +#define NVC0TCL_STENCIL_FRONT_OP_ZFAIL 0x00001388 +#define NVC0TCL_STENCIL_FRONT_OP_ZFAIL_ZERO 0x00000000 +#define NVC0TCL_STENCIL_FRONT_OP_ZFAIL_INVERT 0x0000150a +#define NVC0TCL_STENCIL_FRONT_OP_ZFAIL_KEEP 0x00001e00 +#define NVC0TCL_STENCIL_FRONT_OP_ZFAIL_REPLACE 0x00001e01 +#define NVC0TCL_STENCIL_FRONT_OP_ZFAIL_INCR 0x00001e02 +#define NVC0TCL_STENCIL_FRONT_OP_ZFAIL_DECR 0x00001e03 +#define NVC0TCL_STENCIL_FRONT_OP_ZFAIL_INCR_WRAP 0x00008507 +#define NVC0TCL_STENCIL_FRONT_OP_ZFAIL_DECR_WRAP 0x00008508 +#define NVC0TCL_STENCIL_FRONT_OP_ZPASS 0x0000138c +#define NVC0TCL_STENCIL_FRONT_OP_ZPASS_ZERO 0x00000000 +#define NVC0TCL_STENCIL_FRONT_OP_ZPASS_INVERT 0x0000150a +#define NVC0TCL_STENCIL_FRONT_OP_ZPASS_KEEP 0x00001e00 +#define NVC0TCL_STENCIL_FRONT_OP_ZPASS_REPLACE 0x00001e01 +#define NVC0TCL_STENCIL_FRONT_OP_ZPASS_INCR 0x00001e02 +#define NVC0TCL_STENCIL_FRONT_OP_ZPASS_DECR 0x00001e03 +#define NVC0TCL_STENCIL_FRONT_OP_ZPASS_INCR_WRAP 0x00008507 +#define NVC0TCL_STENCIL_FRONT_OP_ZPASS_DECR_WRAP 0x00008508 +#define NVC0TCL_STENCIL_FRONT_FUNC_FUNC 0x00001390 +#define NVC0TCL_STENCIL_FRONT_FUNC_FUNC_NEVER 0x00000200 +#define NVC0TCL_STENCIL_FRONT_FUNC_FUNC_LESS 0x00000201 +#define NVC0TCL_STENCIL_FRONT_FUNC_FUNC_EQUAL 0x00000202 +#define NVC0TCL_STENCIL_FRONT_FUNC_FUNC_LEQUAL 0x00000203 +#define NVC0TCL_STENCIL_FRONT_FUNC_FUNC_GREATER 0x00000204 +#define 
NVC0TCL_STENCIL_FRONT_FUNC_FUNC_NOTEQUAL 0x00000205 +#define NVC0TCL_STENCIL_FRONT_FUNC_FUNC_GEQUAL 0x00000206 +#define NVC0TCL_STENCIL_FRONT_FUNC_FUNC_ALWAYS 0x00000207 +#define NVC0TCL_STENCIL_FRONT_FUNC_REF 0x00001394 +#define NVC0TCL_STENCIL_FRONT_MASK 0x00001398 +#define NVC0TCL_STENCIL_FRONT_FUNC_MASK 0x0000139c +#define NVC0TCL_FRAG_COLOR_CLAMP_EN 0x000013a8 +#define NVC0TCL_Y_ORIGIN_BOTTOM 0x000013ac +#define NVC0TCL_LINE_WIDTH(x) (0x000013b0+((x)*4)) +#define NVC0TCL_LINE_WIDTH__SIZE 0x00000002 +#define NVC0TCL_POINT_COORD_REPLACE_MAP(x) (0x000013c0+((x)*4)) +#define NVC0TCL_POINT_COORD_REPLACE_MAP__SIZE 0x00000008 +#define NVC0TCL_GP_VERTEX_OUTPUT_COUNT 0x00001420 +#define NVC0TCL_FENCE 0x0000142c +#define NVC0TCL_VB_ELEMENT_BASE 0x00001434 +#define NVC0TCL_INSTANCE_BASE 0x00001438 +#define NVC0TCL_CODE_CB_FLUSH 0x00001440 +#define NVC0TCL_CLIPID_HEIGHT 0x00001504 +#define NVC0TCL_VP_CLIP_DISTANCE_ENABLE 0x00001510 +#define NVC0TCL_VP_CLIP_DISTANCE_ENABLE_0 (1 << 0) +#define NVC0TCL_VP_CLIP_DISTANCE_ENABLE_1 (1 << 1) +#define NVC0TCL_VP_CLIP_DISTANCE_ENABLE_2 (1 << 2) +#define NVC0TCL_VP_CLIP_DISTANCE_ENABLE_3 (1 << 3) +#define NVC0TCL_VP_CLIP_DISTANCE_ENABLE_4 (1 << 4) +#define NVC0TCL_VP_CLIP_DISTANCE_ENABLE_5 (1 << 5) +#define NVC0TCL_VP_CLIP_DISTANCE_ENABLE_6 (1 << 6) +#define NVC0TCL_VP_CLIP_DISTANCE_ENABLE_7 (1 << 7) +#define NVC0TCL_SAMPLECNT_ENABLE 0x00001514 +#define NVC0TCL_POINT_SIZE 0x00001518 +#define NVC0TCL_POINT_SPRITE_ENABLE 0x00001520 +#define NVC0TCL_SAMPLECNT_RESET 0x00001530 +#define NVC0TCL_MULTISAMPLE_ZETA_ENABLE 0x00001534 +#define NVC0TCL_ZETA_ENABLE 0x00001538 +#define NVC0TCL_MULTISAMPLE_CTRL 0x0000153c +#define NVC0TCL_MULTISAMPLE_CTRL_ALPHA_TO_COVERAGE (1 << 0) +#define NVC0TCL_MULTISAMPLE_CTRL_ALPHA_TO_ONE (1 << 4) +#define NVC0TCL_NOPERSPECTIVE_BITMAP(x) (0x00001540+((x)*4)) +#define NVC0TCL_NOPERSPECTIVE_BITMAP__SIZE 0x00000004 +#define NVC0TCL_COND_ADDRESS_HIGH 0x00001550 +#define NVC0TCL_COND_ADDRESS_LOW 0x00001554 
+#define NVC0TCL_COND_MODE 0x00001558 +#define NVC0TCL_COND_MODE_NEVER 0x00000000 +#define NVC0TCL_COND_MODE_ALWAYS 0x00000001 +#define NVC0TCL_COND_MODE_RES 0x00000002 +#define NVC0TCL_COND_MODE_NOT_RES_AND_NOT_ID 0x00000003 +#define NVC0TCL_COND_MODE_RES_OR_ID 0x00000004 +#define NVC0TCL_TSC_ADDRESS_HIGH 0x0000155c +#define NVC0TCL_TSC_ADDRESS_LOW 0x00001560 +#define NVC0TCL_TSC_LIMIT 0x00001564 +#define NVC0TCL_POLYGON_OFFSET_FACTOR 0x0000156c +#define NVC0TCL_LINE_SMOOTH_ENABLE 0x00001570 +#define NVC0TCL_TIC_ADDRESS_HIGH 0x00001574 +#define NVC0TCL_TIC_ADDRESS_LOW 0x00001578 +#define NVC0TCL_TIC_LIMIT 0x0000157c +#define NVC0TCL_PM_CONTROL(x) (0x00001580+((x)*4)) +#define NVC0TCL_PM_CONTROL__SIZE 0x00000004 +#define NVC0TCL_PM_CONTROL_UNK0 (1 << 0) +#define NVC0TCL_PM_CONTROL_UNK1_SHIFT 4 +#define NVC0TCL_PM_CONTROL_UNK1_MASK 0x00000070 +#define NVC0TCL_PM_CONTROL_UNK2_SHIFT 8 +#define NVC0TCL_PM_CONTROL_UNK2_MASK 0xffffff00 +#define NVC0TCL_STENCIL_TWO_SIDE_ENABLE 0x00001594 +#define NVC0TCL_STENCIL_BACK_OP_FAIL 0x00001598 +#define NVC0TCL_STENCIL_BACK_OP_FAIL_ZERO 0x00000000 +#define NVC0TCL_STENCIL_BACK_OP_FAIL_INVERT 0x0000150a +#define NVC0TCL_STENCIL_BACK_OP_FAIL_KEEP 0x00001e00 +#define NVC0TCL_STENCIL_BACK_OP_FAIL_REPLACE 0x00001e01 +#define NVC0TCL_STENCIL_BACK_OP_FAIL_INCR 0x00001e02 +#define NVC0TCL_STENCIL_BACK_OP_FAIL_DECR 0x00001e03 +#define NVC0TCL_STENCIL_BACK_OP_FAIL_INCR_WRAP 0x00008507 +#define NVC0TCL_STENCIL_BACK_OP_FAIL_DECR_WRAP 0x00008508 +#define NVC0TCL_STENCIL_BACK_OP_ZFAIL 0x0000159c +#define NVC0TCL_STENCIL_BACK_OP_ZFAIL_ZERO 0x00000000 +#define NVC0TCL_STENCIL_BACK_OP_ZFAIL_INVERT 0x0000150a +#define NVC0TCL_STENCIL_BACK_OP_ZFAIL_KEEP 0x00001e00 +#define NVC0TCL_STENCIL_BACK_OP_ZFAIL_REPLACE 0x00001e01 +#define NVC0TCL_STENCIL_BACK_OP_ZFAIL_INCR 0x00001e02 +#define NVC0TCL_STENCIL_BACK_OP_ZFAIL_DECR 0x00001e03 +#define NVC0TCL_STENCIL_BACK_OP_ZFAIL_INCR_WRAP 0x00008507 +#define NVC0TCL_STENCIL_BACK_OP_ZFAIL_DECR_WRAP 0x00008508 
+#define NVC0TCL_STENCIL_BACK_OP_ZPASS 0x000015a0 +#define NVC0TCL_STENCIL_BACK_OP_ZPASS_ZERO 0x00000000 +#define NVC0TCL_STENCIL_BACK_OP_ZPASS_INVERT 0x0000150a +#define NVC0TCL_STENCIL_BACK_OP_ZPASS_KEEP 0x00001e00 +#define NVC0TCL_STENCIL_BACK_OP_ZPASS_REPLACE 0x00001e01 +#define NVC0TCL_STENCIL_BACK_OP_ZPASS_INCR 0x00001e02 +#define NVC0TCL_STENCIL_BACK_OP_ZPASS_DECR 0x00001e03 +#define NVC0TCL_STENCIL_BACK_OP_ZPASS_INCR_WRAP 0x00008507 +#define NVC0TCL_STENCIL_BACK_OP_ZPASS_DECR_WRAP 0x00008508 +#define NVC0TCL_STENCIL_BACK_FUNC_FUNC 0x000015a4 +#define NVC0TCL_STENCIL_BACK_FUNC_FUNC_NEVER 0x00000200 +#define NVC0TCL_STENCIL_BACK_FUNC_FUNC_LESS 0x00000201 +#define NVC0TCL_STENCIL_BACK_FUNC_FUNC_EQUAL 0x00000202 +#define NVC0TCL_STENCIL_BACK_FUNC_FUNC_LEQUAL 0x00000203 +#define NVC0TCL_STENCIL_BACK_FUNC_FUNC_GREATER 0x00000204 +#define NVC0TCL_STENCIL_BACK_FUNC_FUNC_NOTEQUAL 0x00000205 +#define NVC0TCL_STENCIL_BACK_FUNC_FUNC_GEQUAL 0x00000206 +#define NVC0TCL_STENCIL_BACK_FUNC_FUNC_ALWAYS 0x00000207 +#define NVC0TCL_MULTISAMPLE_COLOR_ENABLE 0x000015b4 +#define NVC0TCL_FRAMEBUFFER_SRGB 0x000015b8 +#define NVC0TCL_POLYGON_OFFSET_UNITS 0x000015bc +#define NVC0TCL_GP_BUILTIN_RESULT_EN 0x000015cc +#define NVC0TCL_GP_BUILTIN_RESULT_EN_VPORT (1 << 0) +#define NVC0TCL_GP_BUILTIN_RESULT_EN_LAYER (1 << 16) +#define NVC0TCL_MULTISAMPLE_MODE 0x000015d0 +#define NVC0TCL_MULTISAMPLE_MODE_1X 0x00000000 +#define NVC0TCL_MULTISAMPLE_MODE_2XMS 0x00000001 +#define NVC0TCL_MULTISAMPLE_MODE_4XMS 0x00000002 +#define NVC0TCL_MULTISAMPLE_MODE_8XMS 0x00000004 +#define NVC0TCL_MULTISAMPLE_MODE_4XMS_4XCS 0x00000008 +#define NVC0TCL_MULTISAMPLE_MODE_4XMS_12XCS 0x00000009 +#define NVC0TCL_MULTISAMPLE_MODE_8XMS_8XCS 0x0000000a +#define NVC0TCL_EDGEFLAG_ENABLE 0x000015e4 +#define NVC0TCL_VB_ELEMENT_U32 0x000015e8 +#define NVC0TCL_VB_ELEMENT_U16_SETUP 0x000015ec +#define NVC0TCL_VB_ELEMENT_U16_SETUP_OFFSET_SHIFT 30 +#define NVC0TCL_VB_ELEMENT_U16_SETUP_OFFSET_MASK 0xc0000000 +#define 
NVC0TCL_VB_ELEMENT_U16_SETUP_COUNT_SHIFT 0 +#define NVC0TCL_VB_ELEMENT_U16_SETUP_COUNT_MASK 0x3fffffff +#define NVC0TCL_VB_ELEMENT_U16 0x000015f0 +#define NVC0TCL_VB_ELEMENT_U16_I0_SHIFT 0 +#define NVC0TCL_VB_ELEMENT_U16_I0_MASK 0x0000ffff +#define NVC0TCL_VB_ELEMENT_U16_I1_SHIFT 16 +#define NVC0TCL_VB_ELEMENT_U16_I1_MASK 0xffff0000 +#define NVC0TCL_VERTEX_BASE_HIGH 0x000015f4 +#define NVC0TCL_VERTEX_BASE_LOW 0x000015f8 +#define NVC0TCL_CODE_ADDRESS_HIGH 0x00001608 +#define NVC0TCL_CODE_ADDRESS_LOW 0x0000160c +#define NVC0TCL_VERTEX_BEGIN 0x00001618 +#define NVC0TCL_VERTEX_BEGIN_MODE_SHIFT 0 +#define NVC0TCL_VERTEX_BEGIN_MODE_MASK 0x0000000f +#define NVC0TCL_VERTEX_BEGIN_MODE_POINTS 0x00000000 +#define NVC0TCL_VERTEX_BEGIN_MODE_LINES 0x00000001 +#define NVC0TCL_VERTEX_BEGIN_MODE_LINE_LOOP 0x00000002 +#define NVC0TCL_VERTEX_BEGIN_MODE_LINE_STRIP 0x00000003 +#define NVC0TCL_VERTEX_BEGIN_MODE_TRIANGLES 0x00000004 +#define NVC0TCL_VERTEX_BEGIN_MODE_TRIANGLE_STRIP 0x00000005 +#define NVC0TCL_VERTEX_BEGIN_MODE_TRIANGLE_FAN 0x00000006 +#define NVC0TCL_VERTEX_BEGIN_MODE_QUADS 0x00000007 +#define NVC0TCL_VERTEX_BEGIN_MODE_QUAD_STRIP 0x00000008 +#define NVC0TCL_VERTEX_BEGIN_MODE_POLYGON 0x00000009 +#define NVC0TCL_VERTEX_BEGIN_MODE_LINES_ADJACENCY 0x0000000a +#define NVC0TCL_VERTEX_BEGIN_MODE_LINE_STRIP_ADJACENCY 0x0000000b +#define NVC0TCL_VERTEX_BEGIN_MODE_TRIANGLES_ADJACENCY 0x0000000c +#define NVC0TCL_VERTEX_BEGIN_MODE_TRIANGLE_STRIP_ADJACENCY 0x0000000d +#define NVC0TCL_VERTEX_BEGIN_MODE_PATCHES 0x0000000e +#define NVC0TCL_VERTEX_BEGIN_INSTANCE (1 << 26) +#define NVC0TCL_VERTEX_END 0x00001614 +#define NVC0TCL_VERTEX_DATA 0x00001640 +#define NVC0TCL_PRIM_RESTART_ENABLE 0x00001644 +#define NVC0TCL_PRIM_RESTART_INDEX 0x00001648 +#define NVC0TCL_POINT_SMOOTH_ENABLE 0x00001658 +#define NVC0TCL_POINT_SPRITE_CTRL 0x00001660 +#define NVC0TCL_LINE_STIPPLE_ENABLE 0x0000166c +#define NVC0TCL_LINE_STIPPLE_PATTERN 0x00001680 +#define NVC0TCL_PROVOKING_VERTEX_LAST 0x00001684 +#define 
NVC0TCL_VERTEX_TWO_SIDE_ENABLE 0x00001688 +#define NVC0TCL_POLYGON_STIPPLE_ENABLE 0x0000168c +#define NVC0TCL_POLYGON_STIPPLE_PATTERN(x) (0x00001700+((x)*4)) +#define NVC0TCL_POLYGON_STIPPLE_PATTERN__SIZE 0x00000020 +#define NVC0TCL_UNK17BC_ADDRESS_HIGH 0x000017bc +#define NVC0TCL_UNK17BC_ADDRESS_LOW 0x000017c0 +#define NVC0TCL_UNK17BC_LIMIT 0x000017c4 +#define NVC0TCL_VP_POINT_SIZE_EN 0x00001910 +#define NVC0TCL_CULL_FACE_ENABLE 0x00001918 +#define NVC0TCL_FRONT_FACE 0x0000191c +#define NVC0TCL_FRONT_FACE_CW 0x00000900 +#define NVC0TCL_FRONT_FACE_CCW 0x00000901 +#define NVC0TCL_CULL_FACE 0x00001920 +#define NVC0TCL_CULL_FACE_FRONT 0x00000404 +#define NVC0TCL_CULL_FACE_BACK 0x00000405 +#define NVC0TCL_CULL_FACE_FRONT_AND_BACK 0x00000408 +#define NVC0TCL_VIEWPORT_TRANSFORM_EN 0x0000192c +#define NVC0TCL_VIEW_VOLUME_CLIP_CTRL 0x0000193c +#define NVC0TCL_VIEWPORT_CLIP_RECTS_EN 0x0000194c +#define NVC0TCL_VIEWPORT_CLIP_MODE 0x00001950 +#define NVC0TCL_VIEWPORT_CLIP_MODE_INCLUDE 0x00000000 +#define NVC0TCL_VIEWPORT_CLIP_MODE_EXCLUDE 0x00000001 +#define NVC0TCL_VIEWPORT_CLIP_MODE_UNKNOWN 0x00000002 +#define NVC0TCL_FP_ZORDER_CTRL 0x0000196c +#define NVC0TCL_CLIPID_ENABLE 0x0000197c +#define NVC0TCL_CLIPID_WIDTH 0x00001980 +#define NVC0TCL_CLIPID_ID 0x00001984 +#define NVC0TCL_REG_MODE 0x000019a0 +#define NVC0TCL_REG_MODE_PACKED 0x00000001 +#define NVC0TCL_REG_MODE_STRIPED 0x00000002 +#define NVC0TCL_FP_CONTROL 0x000019a8 +#define NVC0TCL_FP_CONTROL_MULTIPLE_RESULTS (1 << 0) +#define NVC0TCL_FP_CONTROL_EXPORTS_Z (1 << 8) +#define NVC0TCL_FP_CONTROL_USES_KIL (1 << 20) +#define NVC0TCL_DEPTH_BOUNDS_EN 0x000019bc +#define NVC0TCL_LOGIC_OP_ENABLE 0x000019c4 +#define NVC0TCL_LOGIC_OP 0x000019c8 +#define NVC0TCL_LOGIC_OP_CLEAR 0x00001500 +#define NVC0TCL_LOGIC_OP_AND 0x00001501 +#define NVC0TCL_LOGIC_OP_AND_REVERSE 0x00001502 +#define NVC0TCL_LOGIC_OP_COPY 0x00001503 +#define NVC0TCL_LOGIC_OP_AND_INVERTED 0x00001504 +#define NVC0TCL_LOGIC_OP_NOOP 0x00001505 +#define 
NVC0TCL_LOGIC_OP_XOR 0x00001506 +#define NVC0TCL_LOGIC_OP_OR 0x00001507 +#define NVC0TCL_LOGIC_OP_NOR 0x00001508 +#define NVC0TCL_LOGIC_OP_EQUIV 0x00001509 +#define NVC0TCL_LOGIC_OP_INVERT 0x0000150a +#define NVC0TCL_LOGIC_OP_OR_REVERSE 0x0000150b +#define NVC0TCL_LOGIC_OP_COPY_INVERTED 0x0000150c +#define NVC0TCL_LOGIC_OP_OR_INVERTED 0x0000150d +#define NVC0TCL_LOGIC_OP_NAND 0x0000150e +#define NVC0TCL_LOGIC_OP_SET 0x0000150f +#define NVC0TCL_CLEAR_BUFFERS 0x000019d0 +#define NVC0TCL_CLEAR_BUFFERS_Z (1 << 0) +#define NVC0TCL_CLEAR_BUFFERS_S (1 << 1) +#define NVC0TCL_CLEAR_BUFFERS_R (1 << 2) +#define NVC0TCL_CLEAR_BUFFERS_G (1 << 3) +#define NVC0TCL_CLEAR_BUFFERS_B (1 << 4) +#define NVC0TCL_CLEAR_BUFFERS_A (1 << 5) +#define NVC0TCL_CLEAR_BUFFERS_RT_SHIFT 6 +#define NVC0TCL_CLEAR_BUFFERS_RT_MASK 0x000003c0 +#define NVC0TCL_CLEAR_BUFFERS_LAYER_SHIFT 10 +#define NVC0TCL_CLEAR_BUFFERS_LAYER_MASK 0x0007fc00 +#define NVC0TCL_COLOR_MASK(x) (0x00001a00+((x)*4)) +#define NVC0TCL_COLOR_MASK__SIZE 0x00000008 +#define NVC0TCL_COLOR_MASK_R_SHIFT 0 +#define NVC0TCL_COLOR_MASK_R_MASK 0x0000000f +#define NVC0TCL_COLOR_MASK_G_SHIFT 4 +#define NVC0TCL_COLOR_MASK_G_MASK 0x000000f0 +#define NVC0TCL_COLOR_MASK_B_SHIFT 8 +#define NVC0TCL_COLOR_MASK_B_MASK 0x00000f00 +#define NVC0TCL_COLOR_MASK_A_SHIFT 12 +#define NVC0TCL_COLOR_MASK_A_MASK 0x0000f000 +#define NVC0TCL_QUERY_ADDRESS_HIGH 0x00001b00 +#define NVC0TCL_QUERY_ADDRESS_LOW 0x00001b04 +#define NVC0TCL_QUERY_SEQUENCE 0x00001b08 +#define NVC0TCL_QUERY_GET 0x00001b0c +#define NVC0TCL_VERTEX_ARRAY_FETCH(x) (0x00001c00+((x)*16)) +#define NVC0TCL_VERTEX_ARRAY_FETCH__SIZE 0x00000020 +#define NVC0TCL_VERTEX_ARRAY_FETCH_STRIDE_SHIFT 0 +#define NVC0TCL_VERTEX_ARRAY_FETCH_STRIDE_MASK 0x00000fff +#define NVC0TCL_VERTEX_ARRAY_FETCH_ENABLE (1 << 12) +#define NVC0TCL_BLEND_EQUATIONI_RGB(x) (0x00001e04+((x)*32)) +#define NVC0TCL_BLEND_EQUATIONI_RGB__SIZE 0x00000008 +#define NVC0TCL_BLEND_EQUATIONI_RGB_FUNC_ADD 0x00008006 +#define 
NVC0TCL_BLEND_EQUATIONI_RGB_MIN 0x00008007 +#define NVC0TCL_BLEND_EQUATIONI_RGB_MAX 0x00008008 +#define NVC0TCL_BLEND_EQUATIONI_RGB_FUNC_SUBTRACT 0x0000800a +#define NVC0TCL_BLEND_EQUATIONI_RGB_FUNC_REVERSE_SUBTRACT 0x0000800b +#define NVC0TCL_BLEND_FUNCI_SRC_RGB(x) (0x00001e08+((x)*32)) +#define NVC0TCL_BLEND_FUNCI_SRC_RGB__SIZE 0x00000008 +#define NVC0TCL_BLEND_FUNCI_SRC_RGB_ZERO 0x00004000 +#define NVC0TCL_BLEND_FUNCI_SRC_RGB_ONE 0x00004001 +#define NVC0TCL_BLEND_FUNCI_SRC_RGB_SRC_COLOR 0x00004300 +#define NVC0TCL_BLEND_FUNCI_SRC_RGB_ONE_MINUS_SRC_COLOR 0x00004301 +#define NVC0TCL_BLEND_FUNCI_SRC_RGB_SRC_ALPHA 0x00004302 +#define NVC0TCL_BLEND_FUNCI_SRC_RGB_ONE_MINUS_SRC_ALPHA 0x00004303 +#define NVC0TCL_BLEND_FUNCI_SRC_RGB_DST_ALPHA 0x00004304 +#define NVC0TCL_BLEND_FUNCI_SRC_RGB_ONE_MINUS_DST_ALPHA 0x00004305 +#define NVC0TCL_BLEND_FUNCI_SRC_RGB_DST_COLOR 0x00004306 +#define NVC0TCL_BLEND_FUNCI_SRC_RGB_ONE_MINUS_DST_COLOR 0x00004307 +#define NVC0TCL_BLEND_FUNCI_SRC_RGB_SRC_ALPHA_SATURATE 0x00004308 +#define NVC0TCL_BLEND_FUNCI_SRC_RGB_CONSTANT_COLOR 0x0000c001 +#define NVC0TCL_BLEND_FUNCI_SRC_RGB_ONE_MINUS_CONSTANT_COLOR 0x0000c002 +#define NVC0TCL_BLEND_FUNCI_SRC_RGB_CONSTANT_ALPHA 0x0000c003 +#define NVC0TCL_BLEND_FUNCI_SRC_RGB_ONE_MINUS_CONSTANT_ALPHA 0x0000c004 +#define NVC0TCL_BLEND_FUNCI_SRC_RGB_SRC1_COLOR 0x0000c900 +#define NVC0TCL_BLEND_FUNCI_SRC_RGB_ONE_MINUS_SRC1_COLOR 0x0000c901 +#define NVC0TCL_BLEND_FUNCI_SRC_RGB_SRC1_ALPHA 0x0000c902 +#define NVC0TCL_BLEND_FUNCI_SRC_RGB_ONE_MINUS_SRC1_ALPHA 0x0000c903 +#define NVC0TCL_BLEND_FUNCI_DST_RGB(x) (0x00001e0c+((x)*32)) +#define NVC0TCL_BLEND_FUNCI_DST_RGB__SIZE 0x00000008 +#define NVC0TCL_BLEND_FUNCI_DST_RGB_ZERO 0x00004000 +#define NVC0TCL_BLEND_FUNCI_DST_RGB_ONE 0x00004001 +#define NVC0TCL_BLEND_FUNCI_DST_RGB_SRC_COLOR 0x00004300 +#define NVC0TCL_BLEND_FUNCI_DST_RGB_ONE_MINUS_SRC_COLOR 0x00004301 +#define NVC0TCL_BLEND_FUNCI_DST_RGB_SRC_ALPHA 0x00004302 +#define 
NVC0TCL_BLEND_FUNCI_DST_RGB_ONE_MINUS_SRC_ALPHA 0x00004303 +#define NVC0TCL_BLEND_FUNCI_DST_RGB_DST_ALPHA 0x00004304 +#define NVC0TCL_BLEND_FUNCI_DST_RGB_ONE_MINUS_DST_ALPHA 0x00004305 +#define NVC0TCL_BLEND_FUNCI_DST_RGB_DST_COLOR 0x00004306 +#define NVC0TCL_BLEND_FUNCI_DST_RGB_ONE_MINUS_DST_COLOR 0x00004307 +#define NVC0TCL_BLEND_FUNCI_DST_RGB_SRC_ALPHA_SATURATE 0x00004308 +#define NVC0TCL_BLEND_FUNCI_DST_RGB_CONSTANT_COLOR 0x0000c001 +#define NVC0TCL_BLEND_FUNCI_DST_RGB_ONE_MINUS_CONSTANT_COLOR 0x0000c002 +#define NVC0TCL_BLEND_FUNCI_DST_RGB_CONSTANT_ALPHA 0x0000c003 +#define NVC0TCL_BLEND_FUNCI_DST_RGB_ONE_MINUS_CONSTANT_ALPHA 0x0000c004 +#define NVC0TCL_BLEND_FUNCI_DST_RGB_SRC1_COLOR 0x0000c900 +#define NVC0TCL_BLEND_FUNCI_DST_RGB_ONE_MINUS_SRC1_COLOR 0x0000c901 +#define NVC0TCL_BLEND_FUNCI_DST_RGB_SRC1_ALPHA 0x0000c902 +#define NVC0TCL_BLEND_FUNCI_DST_RGB_ONE_MINUS_SRC1_ALPHA 0x0000c903 +#define NVC0TCL_BLEND_EQUATIONI_ALPHA(x) (0x00001e10+((x)*32)) +#define NVC0TCL_BLEND_EQUATIONI_ALPHA__SIZE 0x00000008 +#define NVC0TCL_BLEND_EQUATIONI_ALPHA_FUNC_ADD 0x00008006 +#define NVC0TCL_BLEND_EQUATIONI_ALPHA_MIN 0x00008007 +#define NVC0TCL_BLEND_EQUATIONI_ALPHA_MAX 0x00008008 +#define NVC0TCL_BLEND_EQUATIONI_ALPHA_FUNC_SUBTRACT 0x0000800a +#define NVC0TCL_BLEND_EQUATIONI_ALPHA_FUNC_REVERSE_SUBTRACT 0x0000800b +#define NVC0TCL_BLEND_FUNCI_SRC_ALPHA(x) (0x00001e14+((x)*32)) +#define NVC0TCL_BLEND_FUNCI_SRC_ALPHA__SIZE 0x00000008 +#define NVC0TCL_BLEND_FUNCI_SRC_ALPHA_ZERO 0x00004000 +#define NVC0TCL_BLEND_FUNCI_SRC_ALPHA_ONE 0x00004001 +#define NVC0TCL_BLEND_FUNCI_SRC_ALPHA_SRC_COLOR 0x00004300 +#define NVC0TCL_BLEND_FUNCI_SRC_ALPHA_ONE_MINUS_SRC_COLOR 0x00004301 +#define NVC0TCL_BLEND_FUNCI_SRC_ALPHA_SRC_ALPHA 0x00004302 +#define NVC0TCL_BLEND_FUNCI_SRC_ALPHA_ONE_MINUS_SRC_ALPHA 0x00004303 +#define NVC0TCL_BLEND_FUNCI_SRC_ALPHA_DST_ALPHA 0x00004304 +#define NVC0TCL_BLEND_FUNCI_SRC_ALPHA_ONE_MINUS_DST_ALPHA 0x00004305 +#define NVC0TCL_BLEND_FUNCI_SRC_ALPHA_DST_COLOR 
0x00004306 +#define NVC0TCL_BLEND_FUNCI_SRC_ALPHA_ONE_MINUS_DST_COLOR 0x00004307 +#define NVC0TCL_BLEND_FUNCI_SRC_ALPHA_SRC_ALPHA_SATURATE 0x00004308 +#define NVC0TCL_BLEND_FUNCI_SRC_ALPHA_CONSTANT_COLOR 0x0000c001 +#define NVC0TCL_BLEND_FUNCI_SRC_ALPHA_ONE_MINUS_CONSTANT_COLOR 0x0000c002 +#define NVC0TCL_BLEND_FUNCI_SRC_ALPHA_CONSTANT_ALPHA 0x0000c003 +#define NVC0TCL_BLEND_FUNCI_SRC_ALPHA_ONE_MINUS_CONSTANT_ALPHA 0x0000c004 +#define NVC0TCL_BLEND_FUNCI_SRC_ALPHA_SRC1_COLOR 0x0000c900 +#define NVC0TCL_BLEND_FUNCI_SRC_ALPHA_ONE_MINUS_SRC1_COLOR 0x0000c901 +#define NVC0TCL_BLEND_FUNCI_SRC_ALPHA_SRC1_ALPHA 0x0000c902 +#define NVC0TCL_BLEND_FUNCI_SRC_ALPHA_ONE_MINUS_SRC1_ALPHA 0x0000c903 +#define NVC0TCL_BLEND_FUNCI_DST_ALPHA(x) (0x00001e18+((x)*32)) +#define NVC0TCL_BLEND_FUNCI_DST_ALPHA__SIZE 0x00000008 +#define NVC0TCL_BLEND_FUNCI_DST_ALPHA_ZERO 0x00004000 +#define NVC0TCL_BLEND_FUNCI_DST_ALPHA_ONE 0x00004001 +#define NVC0TCL_BLEND_FUNCI_DST_ALPHA_SRC_COLOR 0x00004300 +#define NVC0TCL_BLEND_FUNCI_DST_ALPHA_ONE_MINUS_SRC_COLOR 0x00004301 +#define NVC0TCL_BLEND_FUNCI_DST_ALPHA_SRC_ALPHA 0x00004302 +#define NVC0TCL_BLEND_FUNCI_DST_ALPHA_ONE_MINUS_SRC_ALPHA 0x00004303 +#define NVC0TCL_BLEND_FUNCI_DST_ALPHA_DST_ALPHA 0x00004304 +#define NVC0TCL_BLEND_FUNCI_DST_ALPHA_ONE_MINUS_DST_ALPHA 0x00004305 +#define NVC0TCL_BLEND_FUNCI_DST_ALPHA_DST_COLOR 0x00004306 +#define NVC0TCL_BLEND_FUNCI_DST_ALPHA_ONE_MINUS_DST_COLOR 0x00004307 +#define NVC0TCL_BLEND_FUNCI_DST_ALPHA_SRC_ALPHA_SATURATE 0x00004308 +#define NVC0TCL_BLEND_FUNCI_DST_ALPHA_CONSTANT_COLOR 0x0000c001 +#define NVC0TCL_BLEND_FUNCI_DST_ALPHA_ONE_MINUS_CONSTANT_COLOR 0x0000c002 +#define NVC0TCL_BLEND_FUNCI_DST_ALPHA_CONSTANT_ALPHA 0x0000c003 +#define NVC0TCL_BLEND_FUNCI_DST_ALPHA_ONE_MINUS_CONSTANT_ALPHA 0x0000c004 +#define NVC0TCL_BLEND_FUNCI_DST_ALPHA_SRC1_COLOR 0x0000c900 +#define NVC0TCL_BLEND_FUNCI_DST_ALPHA_ONE_MINUS_SRC1_COLOR 0x0000c901 +#define NVC0TCL_BLEND_FUNCI_DST_ALPHA_SRC1_ALPHA 0x0000c902 +#define 
NVC0TCL_BLEND_FUNCI_DST_ALPHA_ONE_MINUS_SRC1_ALPHA 0x0000c903 +#define NVC0TCL_SP_SELECT(x) (0x00002000+((x)*64)) +#define NVC0TCL_SP_SELECT__SIZE 0x00000006 +#define NVC0TCL_SP_SELECT_ENABLE (1 << 0) +#define NVC0TCL_SP_SELECT_PROGRAM_SHIFT 4 +#define NVC0TCL_SP_SELECT_PROGRAM_MASK 0x000000f0 +#define NVC0TCL_SP_START_ID(x) (0x00002004+((x)*64)) +#define NVC0TCL_SP_START_ID__SIZE 0x00000006 +#define NVC0TCL_SP_GPR_ALLOC(x) (0x0000200c+((x)*64)) +#define NVC0TCL_SP_GPR_ALLOC__SIZE 0x00000006 +#define NVC0TCL_CB_SIZE 0x00002380 +#define NVC0TCL_CB_BIND(x) (0x00002410+((x)*32)) +#define NVC0TCL_CB_BIND__SIZE 0x00000005 +#define NVC0TCL_CB_BIND_VALID (1 << 0) +#define NVC0TCL_CB_BIND_INDEX_SHIFT 4 +#define NVC0TCL_CB_BIND_INDEX_MASK 0x000000f0 +#define NVC0TCL_BIND_TIC(x) (0x00002404+((x)*32)) +#define NVC0TCL_BIND_TIC__SIZE 0x00000005 +#define NVC0TCL_BIND_TIC_ACTIVE (1 << 0) +#define NVC0TCL_BIND_TIC_TEXTURE_SHIFT 1 +#define NVC0TCL_BIND_TIC_TEXTURE_MASK 0x000001fe +#define NVC0TCL_BIND_TIC_TIC_SHIFT 9 +#define NVC0TCL_BIND_TIC_TIC_MASK 0x7ffffe00 +#define NVC0TCL_TEX_LIMITS(x) (0x00002200+((x)*16)) +#define NVC0TCL_TEX_LIMITS__SIZE 0x00000005 +#define NVC0TCL_TEX_LIMITS_SAMPLERS_LOG2_SHIFT 0 +#define NVC0TCL_TEX_LIMITS_SAMPLERS_LOG2_MASK 0x0000000f +#define NVC0TCL_TEX_LIMITS_TEXTURES_LOG2_SHIFT 4 +#define NVC0TCL_TEX_LIMITS_TEXTURES_LOG2_MASK 0x000000f0 +#define NVC0TCL_CB_ADDR_HIGH 0x00002384 +#define NVC0TCL_CB_ADDR_LOW 0x00002388 +#define NVC0TCL_CB_POS 0x0000238c +#define NVC0TCL_CB_DATA(x) (0x00002390+((x)*4)) +#define NVC0TCL_CB_DATA__SIZE 0x00000010 +#define NVC0TCL_TFB_VARYING_LOCS(x) (0x00002800+((x)*4)) +#define NVC0TCL_TFB_VARYING_LOCS__SIZE 0x00000080 +#define NVC0TCL_UNK_UPLOAD_POS 0x00003800 +#define NVC0TCL_UNK_UPLOAD_DATA 0x00003804 +#define NVC0TCL_VERTEX_ARRAY_SELECT 0x00003820 +#define NVC0TCL_VERTEX_ARRAY_ADDRESS 0x00003824 +#define NVC0TCL_BLEND_ENABLEI 0x00003858 +#define NVC0TCL_POLYGON_MODE_FRONT 0x00003868 +#define 
NVC0TCL_POLYGON_MODE_FRONT_POINT 0x00001b00 +#define NVC0TCL_POLYGON_MODE_FRONT_LINE 0x00001b01 +#define NVC0TCL_POLYGON_MODE_FRONT_FILL 0x00001b02 +#define NVC0TCL_POLYGON_MODE_BACK 0x00003870 +#define NVC0TCL_POLYGON_MODE_BACK_POINT 0x00001b00 +#define NVC0TCL_POLYGON_MODE_BACK_LINE 0x00001b01 +#define NVC0TCL_POLYGON_MODE_BACK_FILL 0x00001b02 +#define NVC0TCL_GP_SELECT 0x00003878 +#define NVC0TCL_GP_SELECT_ENABLE (1 << 0) +#define NVC0TCL_GP_SELECT_PROGRAM_SHIFT 4 +#define NVC0TCL_GP_SELECT_PROGRAM_MASK 0x000000f0 +#define NVC0TCL_TEP_SELECT 0x00003880 +#define NVC0TCL_TEP_SELECT_ENABLE (1 << 0) +#define NVC0TCL_TEP_SELECT_PROGRAM_SHIFT 4 +#define NVC0TCL_TEP_SELECT_PROGRAM_MASK 0x000000f0 + + +#define NVC0_COMPUTE 0x000090c0 + +#define NVC0_COMPUTE_NOP 0x00000100 +#define NVC0_COMPUTE_NOTIFY 0x00000104 +#define NVC0_COMPUTE_SERIALIZE 0x00000110 +#define NVC0_COMPUTE_LOCAL_SIZE 0x00000204 +#define NVC0_COMPUTE_SHARED_BASE 0x00000214 +#define NVC0_COMPUTE_GRIDDIM_YX 0x00000238 +#define NVC0_COMPUTE_GRIDDIM_YX_X_SHIFT 0 +#define NVC0_COMPUTE_GRIDDIM_YX_X_MASK 0x0000ffff +#define NVC0_COMPUTE_GRIDDIM_YX_Y_SHIFT 16 +#define NVC0_COMPUTE_GRIDDIM_YX_Y_MASK 0xffff0000 +#define NVC0_COMPUTE_GRIDDIM_Z 0x0000023c +#define NVC0_COMPUTE_SHARED_SIZE 0x0000024c +#define NVC0_COMPUTE_BLOCK_ALLOC 0x00000250 +#define NVC0_COMPUTE_BLOCK_ALLOC_THREADS_SHIFT 0 +#define NVC0_COMPUTE_BLOCK_ALLOC_THREADS_MASK 0x0000ffff +#define NVC0_COMPUTE_BLOCK_ALLOC_BARRIERS_SHIFT 16 +#define NVC0_COMPUTE_BLOCK_ALLOC_BARRIERS_MASK 0xffff0000 +#define NVC0_COMPUTE_CP_GPR_ALLOC 0x000002c0 +#define NVC0_COMPUTE_GLOBAL_BASE 0x000002c8 +#define NVC0_COMPUTE_GLOBAL_BASE_HIGH_SHIFT 0 +#define NVC0_COMPUTE_GLOBAL_BASE_HIGH_MASK 0x000000ff +#define NVC0_COMPUTE_GLOBAL_BASE_INDEX_SHIFT 16 +#define NVC0_COMPUTE_GLOBAL_BASE_INDEX_MASK 0x00ff0000 +#define NVC0_COMPUTE_GLOBAL_BASE_FLAGS_SHIFT 28 +#define NVC0_COMPUTE_GLOBAL_BASE_FLAGS_MASK 0xf0000000 +#define NVC0_COMPUTE_LAUNCH 0x00000368 +#define 
NVC0_COMPUTE_BLOCKDIM_YX 0x000003ac +#define NVC0_COMPUTE_BLOCKDIM_YX_X_SHIFT 0 +#define NVC0_COMPUTE_BLOCKDIM_YX_X_MASK 0x0000ffff +#define NVC0_COMPUTE_BLOCKDIM_YX_Y_SHIFT 16 +#define NVC0_COMPUTE_BLOCKDIM_YX_Y_MASK 0xffff0000 +#define NVC0_COMPUTE_BLOCKDIM_Z 0x000003b0 +#define NVC0_COMPUTE_CP_START_ID 0x000003b4 +#define NVC0_COMPUTE_LOCAL_BASE 0x0000077c +#define NVC0_COMPUTE_UNK0790_ADDRESS_HIGH 0x00000790 +#define NVC0_COMPUTE_UNK0790_ADDRESS_LOW 0x00000794 +#define NVC0_COMPUTE_LINKED_TSC 0x00001234 +#define NVC0_COMPUTE_TSC_ADDRESS_HIGH 0x0000155c +#define NVC0_COMPUTE_TSC_ADDRESS_LOW 0x00001560 +#define NVC0_COMPUTE_TSC_LIMIT 0x00001564 +#define NVC0_COMPUTE_TIC_ADDRESS_HIGH 0x00001574 +#define NVC0_COMPUTE_TIC_ADDRESS_LOW 0x00001578 +#define NVC0_COMPUTE_TIC_LIMIT 0x0000157c +#define NVC0_COMPUTE_CODE_ADDRESS_HIGH 0x00001608 +#define NVC0_COMPUTE_CODE_ADDRESS_LOW 0x0000160c +#define NVC0_COMPUTE_CB_BIND 0x00001694 +#define NVC0_COMPUTE_CB_BIND_INDEX_SHIFT 1 +#define NVC0_COMPUTE_CB_BIND_INDEX_MASK 0xfffffffe +#define NVC0_COMPUTE_CB_BIND_VALID (1 << 0) +#define NVC0_COMPUTE_QUERY_ADDRESS_HIGH 0x00001b00 +#define NVC0_COMPUTE_QUERY_ADDRESS_LOW 0x00001b04 +#define NVC0_COMPUTE_QUERY_SEQUENCE 0x00001b08 +#define NVC0_COMPUTE_QUERY_GET 0x00001b0c +#define NVC0_COMPUTE_CB_ADDRESS_HIGH 0x00002384 +#define NVC0_COMPUTE_CB_ADDRESS_LOW 0x00002388 +#define NVC0_COMPUTE_CB_POS 0x0000238c +#define NVC0_COMPUTE_CB_DATA 0x00002390 + + #endif /* NOUVEAU_REG_H */ diff --git a/src/gallium/drivers/nv50/nv50_state.c b/src/gallium/drivers/nv50/nv50_state.c index 0d744ab788..88fee3630b 100644 --- a/src/gallium/drivers/nv50/nv50_state.c +++ b/src/gallium/drivers/nv50/nv50_state.c @@ -48,6 +48,53 @@ nv50_colormask(unsigned mask) return cmask; } +static INLINE uint32_t +nv50_blend_func(unsigned factor) /* Map a PIPE_BLENDFACTOR_* value to the matching NV50TCL_BLEND_FUNC_SRC_RGB_* constant; the caller emits the result with so_data() for the rgb/alpha src and dst factors alike. NOTE(review): presumably the DST_RGB/SRC_ALPHA/DST_ALPHA constants carry the same values as the SRC_RGB ones - confirm in nouveau_class.h. */ +{ + switch (factor) { + case PIPE_BLENDFACTOR_ZERO: + return NV50TCL_BLEND_FUNC_SRC_RGB_ZERO; + case PIPE_BLENDFACTOR_ONE: + return NV50TCL_BLEND_FUNC_SRC_RGB_ONE; + case
PIPE_BLENDFACTOR_SRC_COLOR: + return NV50TCL_BLEND_FUNC_SRC_RGB_SRC_COLOR; + case PIPE_BLENDFACTOR_INV_SRC_COLOR: + return NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_SRC_COLOR; + case PIPE_BLENDFACTOR_SRC_ALPHA: + return NV50TCL_BLEND_FUNC_SRC_RGB_SRC_ALPHA; + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: + return NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_SRC_ALPHA; + case PIPE_BLENDFACTOR_DST_ALPHA: + return NV50TCL_BLEND_FUNC_SRC_RGB_DST_ALPHA; + case PIPE_BLENDFACTOR_INV_DST_ALPHA: + return NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_DST_ALPHA; + case PIPE_BLENDFACTOR_DST_COLOR: + return NV50TCL_BLEND_FUNC_SRC_RGB_DST_COLOR; + case PIPE_BLENDFACTOR_INV_DST_COLOR: + return NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_DST_COLOR; + case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: + return NV50TCL_BLEND_FUNC_SRC_RGB_SRC_ALPHA_SATURATE; + case PIPE_BLENDFACTOR_CONST_COLOR: + return NV50TCL_BLEND_FUNC_SRC_RGB_CONSTANT_COLOR; + case PIPE_BLENDFACTOR_INV_CONST_COLOR: + return NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_CONSTANT_COLOR; + case PIPE_BLENDFACTOR_CONST_ALPHA: + return NV50TCL_BLEND_FUNC_SRC_RGB_CONSTANT_ALPHA; + case PIPE_BLENDFACTOR_INV_CONST_ALPHA: + return NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_CONSTANT_ALPHA; + case PIPE_BLENDFACTOR_SRC1_COLOR: + return NV50TCL_BLEND_FUNC_SRC_RGB_SRC1_COLOR; + case PIPE_BLENDFACTOR_INV_SRC1_COLOR: + return NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_SRC1_COLOR; + case PIPE_BLENDFACTOR_SRC1_ALPHA: + return NV50TCL_BLEND_FUNC_SRC_RGB_SRC1_ALPHA; + case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: + return NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_SRC1_ALPHA; + default: /* any factor not listed above falls back to the ZERO blend function */ + return NV50TCL_BLEND_FUNC_SRC_RGB_ZERO; + } +} + static void * nv50_blend_state_create(struct pipe_context *pipe, const struct pipe_blend_state *cso) @@ -80,12 +127,12 @@ nv50_blend_state_create(struct pipe_context *pipe, if (blend_enabled) { so_method(so, tesla, NV50TCL_BLEND_EQUATION_RGB, 5); so_data (so, nvgl_blend_eqn(cso->rt[0].rgb_func)); - so_data (so, 0x4000 | nvgl_blend_func(cso->rt[0].rgb_src_factor)); - so_data (so,
0x4000 | nvgl_blend_func(cso->rt[0].rgb_dst_factor)); + so_data (so, nv50_blend_func(cso->rt[0].rgb_src_factor)); + so_data (so, nv50_blend_func(cso->rt[0].rgb_dst_factor)); so_data (so, nvgl_blend_eqn(cso->rt[0].alpha_func)); - so_data (so, 0x4000 | nvgl_blend_func(cso->rt[0].alpha_src_factor)); + so_data (so, nv50_blend_func(cso->rt[0].alpha_src_factor)); so_method(so, tesla, NV50TCL_BLEND_FUNC_DST_ALPHA, 1); - so_data (so, 0x4000 | nvgl_blend_func(cso->rt[0].alpha_dst_factor)); + so_data (so, nv50_blend_func(cso->rt[0].alpha_dst_factor)); } if (cso->logicop_enable == 0 ) { diff --git a/src/gallium/drivers/nv50/nv50_vbo.c b/src/gallium/drivers/nv50/nv50_vbo.c index 6bd52884b5..996844b18f 100644 --- a/src/gallium/drivers/nv50/nv50_vbo.c +++ b/src/gallium/drivers/nv50/nv50_vbo.c @@ -29,6 +29,9 @@ #include "nv50_context.h" #include "nv50_resource.h" +/* VERTEX_ARRAY_ATTRIB_TYPE is duplicated for unknown reason */ +#define NV50_VAT(x) ((x) | ((x) << 3)) + static INLINE uint32_t nv50_vbo_type_to_hw(enum pipe_format format) { @@ -39,22 +42,22 @@ nv50_vbo_type_to_hw(enum pipe_format format) switch (desc->channel[0].type) { case UTIL_FORMAT_TYPE_FLOAT: - return NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_FLOAT; + return NV50_VAT(NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_FLOAT); case UTIL_FORMAT_TYPE_UNSIGNED: if (desc->channel[0].normalized) { - return NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_UNORM; + return NV50_VAT(NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_UNORM); } - return NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_USCALED; + return NV50_VAT(NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_USCALED); case UTIL_FORMAT_TYPE_SIGNED: if (desc->channel[0].normalized) { - return NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_SNORM; + return NV50_VAT(NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_SNORM); } - return NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_SSCALED; + return NV50_VAT(NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_SSCALED); /* case PIPE_FORMAT_TYPE_UINT: - return NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_UINT; + return NV50_VAT(NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_UINT); case 
PIPE_FORMAT_TYPE_SINT: - return NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_SINT; */ + return NV50_VAT(NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_SINT); */ default: return 0; } -- cgit v1.2.3 From d7aac107e64e1c4c1af30806817a2888e7a4a96c Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Sat, 24 Jul 2010 14:46:44 +0200 Subject: nv50: introduce the big formats table --- src/gallium/drivers/nv50/Makefile | 1 + src/gallium/drivers/nv50/SConscript | 1 + src/gallium/drivers/nv50/nv50_formats.c | 427 +++++++++++++++++++++++++ src/gallium/drivers/nv50/nv50_miptree.c | 3 + src/gallium/drivers/nv50/nv50_screen.c | 81 ++--- src/gallium/drivers/nv50/nv50_screen.h | 9 + src/gallium/drivers/nv50/nv50_state_validate.c | 75 +---- src/gallium/drivers/nv50/nv50_tex.c | 52 +-- src/gallium/drivers/nv50/nv50_texture.h | 9 + src/gallium/drivers/nv50/nv50_vbo.c | 100 +----- 10 files changed, 486 insertions(+), 272 deletions(-) create mode 100644 src/gallium/drivers/nv50/nv50_formats.c (limited to 'src') diff --git a/src/gallium/drivers/nv50/Makefile b/src/gallium/drivers/nv50/Makefile index 3943a9e257..bf1e8201a0 100644 --- a/src/gallium/drivers/nv50/Makefile +++ b/src/gallium/drivers/nv50/Makefile @@ -8,6 +8,7 @@ C_SOURCES = \ nv50_clear.c \ nv50_context.c \ nv50_draw.c \ + nv50_formats.c \ nv50_miptree.c \ nv50_query.c \ nv50_resource.c \ diff --git a/src/gallium/drivers/nv50/SConscript b/src/gallium/drivers/nv50/SConscript index 8625f92622..e4a93c15ce 100644 --- a/src/gallium/drivers/nv50/SConscript +++ b/src/gallium/drivers/nv50/SConscript @@ -9,6 +9,7 @@ nv50 = env.ConvenienceLibrary( 'nv50_clear.c', 'nv50_context.c', 'nv50_draw.c', + 'nv50_formats.c', 'nv50_miptree.c', 'nv50_query.c', 'nv50_program.c', diff --git a/src/gallium/drivers/nv50/nv50_formats.c b/src/gallium/drivers/nv50/nv50_formats.c new file mode 100644 index 0000000000..5b65cdaa02 --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_formats.c @@ -0,0 +1,427 @@ + +#include "nv50_screen.h" +#include "nv50_texture.h" +#include 
"nouveau/nouveau_class.h" +#include "pipe/p_defines.h" + +#define A_(cr, cg, cb, ca, t0, t1, t2, t3, sz, r) /* Expands to two comma-separated initializers: the TIC word (channel map cr/cg/cb/ca, per-channel types t0..t3, storage format sz) and the VERTEX_ARRAY_ATTRIB word (format sz, type t0 placed in both type fields, flag r in bit 31). */ \ + NV50TIC_0_0_MAPR_##cr | NV50TIC_0_0_TYPER_##t0 | \ + NV50TIC_0_0_MAPG_##cg | NV50TIC_0_0_TYPEG_##t1 | \ + NV50TIC_0_0_MAPB_##cb | NV50TIC_0_0_TYPEB_##t2 | \ + NV50TIC_0_0_MAPA_##ca | NV50TIC_0_0_TYPEA_##t3 | \ + NV50TIC_0_0_FMT_##sz, \ + NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_##sz | \ + NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_##t0 | \ + (NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_##t0 << 3) | ((unsigned)(r) << 31) /* shift in unsigned arithmetic: 1 << 31 overflows a signed int */ + +#define B_(cr, cg, cb, ca, t0, t1, t2, t3, sz, r) /* Same TIC word as A_, but the vertex attrib word is 0; r is accepted and ignored. */ \ + NV50TIC_0_0_MAPR_##cr | NV50TIC_0_0_TYPER_##t0 | \ + NV50TIC_0_0_MAPG_##cg | NV50TIC_0_0_TYPEG_##t1 | \ + NV50TIC_0_0_MAPB_##cb | NV50TIC_0_0_TYPEB_##t2 | \ + NV50TIC_0_0_MAPA_##ca | NV50TIC_0_0_TYPEA_##t3 | \ + NV50TIC_0_0_FMT_##sz, 0 + +#define VERTEX_BUFFER PIPE_BIND_VERTEX_BUFFER +#define SAMPLER_VIEW PIPE_BIND_SAMPLER_VIEW +#define RENDER_TARGET PIPE_BIND_RENDER_TARGET +#define DEPTH_STENCIL PIPE_BIND_DEPTH_STENCIL +#define SCANOUT PIPE_BIND_SCANOUT + +/* for vertex buffers: */ +#define NV50TIC_0_0_FMT_8_8_8 NV50TIC_0_0_FMT_8_8_8_8 +#define NV50TIC_0_0_FMT_16_16_16 NV50TIC_0_0_FMT_16_16_16_16 +#define NV50TIC_0_0_FMT_32_32_32 NV50TIC_0_0_FMT_32_32_32_32 + +const struct nv50_format nv50_format_table[PIPE_FORMAT_COUNT] = +{ + /* COMMON FORMATS */ + + [PIPE_FORMAT_B8G8R8A8_UNORM] = { NV50TCL_RT_FORMAT_A8R8G8B8_UNORM, + A_(C2, C1, C0, C3, UNORM, UNORM, UNORM, UNORM, 8_8_8_8, 1), + VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET | SCANOUT }, + + [PIPE_FORMAT_B8G8R8X8_UNORM] = { NV50TCL_RT_FORMAT_X8R8G8B8_UNORM, + A_(C2, C1, C0, ONE, UNORM, UNORM, UNORM, UNORM, 8_8_8_8, 1), + VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET | SCANOUT }, + + [PIPE_FORMAT_B8G8R8A8_SRGB] = { NV50TCL_RT_FORMAT_A8R8G8B8_SRGB, + A_(C2, C1, C0, C3, UNORM, UNORM, UNORM, UNORM, 8_8_8_8, 1), + VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET }, + + [PIPE_FORMAT_B8G8R8X8_SRGB] = { NV50TCL_RT_FORMAT_X8R8G8B8_SRGB, + A_(C2, C1, C0, ONE, UNORM, UNORM, UNORM,
UNORM, 8_8_8_8, 1), + VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET }, + + [PIPE_FORMAT_B5G6R5_UNORM] = { NV50TCL_RT_FORMAT_R5G6B5_UNORM, + B_(C2, C1, C0, ONE, UNORM, UNORM, UNORM, UNORM, 5_6_5, 1), + SAMPLER_VIEW | RENDER_TARGET | SCANOUT }, + + [PIPE_FORMAT_B5G5R5A1_UNORM] = { NV50TCL_RT_FORMAT_A1R5G5B5_UNORM, + B_(C2, C1, C0, C3, UNORM, UNORM, UNORM, UNORM, 1_5_5_5, 1), + SAMPLER_VIEW | RENDER_TARGET | SCANOUT }, + + [PIPE_FORMAT_B4G4R4A4_UNORM] = { 0, + B_(C2, C1, C0, C3, UNORM, UNORM, UNORM, UNORM, 4_4_4_4, 1), + SAMPLER_VIEW }, + + [PIPE_FORMAT_R10G10B10A2_UNORM] = { NV50TCL_RT_FORMAT_A2B10G10R10_UNORM, + A_(C0, C1, C2, C3, UNORM, UNORM, UNORM, UNORM, 2_10_10_10, 1), + SAMPLER_VIEW | RENDER_TARGET | SCANOUT }, + + [PIPE_FORMAT_B10G10R10A2_UNORM] = { NV50TCL_RT_FORMAT_A2R10G10B10_UNORM, + A_(C2, C1, C0, C3, UNORM, UNORM, UNORM, UNORM, 2_10_10_10, 1), + SAMPLER_VIEW | RENDER_TARGET }, + + /* DEPTH/STENCIL FORMATS */ + + [PIPE_FORMAT_Z16_UNORM] = { NV50TCL_ZETA_FORMAT_Z16_UNORM, + B_(C0, C0, C0, ONE, UNORM, UINT, UINT, UINT, 16_DEPTH, 0), + SAMPLER_VIEW | DEPTH_STENCIL }, + + [PIPE_FORMAT_Z24_UNORM_S8_USCALED] = { NV50TCL_ZETA_FORMAT_S8Z24_UNORM, + B_(C0, C0, C0, ONE, UNORM, UINT, UINT, UINT, 8_24, 0), + SAMPLER_VIEW | DEPTH_STENCIL }, + + [PIPE_FORMAT_Z24X8_UNORM] = { NV50TCL_ZETA_FORMAT_X8Z24_UNORM, + B_(C0, C0, C0, ONE, UNORM, UINT, UINT, UINT, 8_24, 0), + SAMPLER_VIEW | DEPTH_STENCIL }, + + [PIPE_FORMAT_S8_USCALED_Z24_UNORM] = { NV50TCL_ZETA_FORMAT_S8Z24_UNORM, + B_(C1, C1, C1, ONE, UINT, UNORM, UINT, UINT, 24_8, 0), + SAMPLER_VIEW | DEPTH_STENCIL }, + + [PIPE_FORMAT_Z32_FLOAT] = { NV50TCL_ZETA_FORMAT_Z32_FLOAT, + B_(C0, C0, C0, ONE, FLOAT, UINT, UINT, UINT, 32_DEPTH, 0), + SAMPLER_VIEW | DEPTH_STENCIL }, + + [PIPE_FORMAT_Z32_FLOAT_S8X24_USCALED] = { + NV50TCL_ZETA_FORMAT_Z32_FLOAT_X24S8_UNORM, + B_(C0, C0, C0, ONE, FLOAT, UINT, UINT, UINT, 32_8, 0), + SAMPLER_VIEW | DEPTH_STENCIL }, + + /* LUMINANCE, ALPHA, INTENSITY */ + + [PIPE_FORMAT_L8_UNORM] = { 0, + 
A_(C0, C0, C0, ONE, UNORM, UNORM, UNORM, UNORM, 8, 0), + SAMPLER_VIEW }, + + [PIPE_FORMAT_L8_SRGB] = { 0, + A_(C0, C0, C0, ONE, UNORM, UNORM, UNORM, UNORM, 8, 0), + SAMPLER_VIEW }, + + [PIPE_FORMAT_I8_UNORM] = { 0, + A_(C0, C0, C0, C0, UNORM, UNORM, UNORM, UNORM, 8, 0), + SAMPLER_VIEW }, + + [PIPE_FORMAT_A8_UNORM] = { NV50TCL_RT_FORMAT_A8_UNORM, + A_(ZERO, ZERO, ZERO, C0, UNORM, UNORM, UNORM, UNORM, 8, 0), + SAMPLER_VIEW | RENDER_TARGET }, + + [PIPE_FORMAT_L8A8_UNORM] = { 0, + A_(C0, C0, C0, C1, UNORM, UNORM, UNORM, UNORM, 8_8, 0), + SAMPLER_VIEW }, + + [PIPE_FORMAT_L8A8_SRGB] = { 0, + A_(C0, C0, C0, C1, UNORM, UNORM, UNORM, UNORM, 8_8, 0), + SAMPLER_VIEW }, + + /* DXT, RGTC */ + + [PIPE_FORMAT_DXT1_RGB] = { 0, + B_(C0, C1, C2, ONE, UNORM, UNORM, UNORM, UNORM, DXT1, 0), + SAMPLER_VIEW }, + + [PIPE_FORMAT_DXT1_RGBA] = { 0, + B_(C0, C1, C2, C3, UNORM, UNORM, UNORM, UNORM, DXT1, 0), + SAMPLER_VIEW }, + + [PIPE_FORMAT_DXT3_RGBA] = { 0, + B_(C0, C1, C2, C3, UNORM, UNORM, UNORM, UNORM, DXT3, 0), + SAMPLER_VIEW }, + + [PIPE_FORMAT_DXT5_RGBA] = { 0, + B_(C0, C1, C2, C3, UNORM, UNORM, UNORM, UNORM, DXT5, 0), + SAMPLER_VIEW }, + + [PIPE_FORMAT_RGTC1_UNORM] = { 0, + B_(C0, ZERO, ZERO, ONE, UNORM, UNORM, UNORM, UNORM, RGTC1, 0), + SAMPLER_VIEW }, + + [PIPE_FORMAT_RGTC2_UNORM] = { 0, + B_(C0, C1, ZERO, ONE, UNORM, UNORM, UNORM, UNORM, RGTC2, 0), + SAMPLER_VIEW }, + + [PIPE_FORMAT_RGTC2_SNORM] = { 0, + B_(C0, C1, ZERO, ONE, SNORM, SNORM, SNORM, SNORM, RGTC2, 0), + SAMPLER_VIEW }, + + /* FLOAT 16 */ + + [PIPE_FORMAT_R16G16B16A16_FLOAT] = { NV50TCL_RT_FORMAT_R16G16B16A16_FLOAT, + A_(C0, C1, C2, C3, FLOAT, FLOAT, FLOAT, FLOAT, 16_16_16_16, 0), + VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET }, + + [PIPE_FORMAT_R16G16B16_FLOAT] = { NV50TCL_RT_FORMAT_R16G16B16X16_FLOAT, + A_(C0, C1, C2, ONE, FLOAT, FLOAT, FLOAT, FLOAT, 16_16_16, 0), + VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET }, + + [PIPE_FORMAT_R16G16_FLOAT] = { NV50TCL_RT_FORMAT_R16G16_FLOAT, + A_(C0, C1, ZERO, ONE, FLOAT, FLOAT, 
FLOAT, FLOAT, 16_16, 0), + VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET }, + + [PIPE_FORMAT_R16_FLOAT] = { NV50TCL_RT_FORMAT_R16_FLOAT, + A_(C0, ZERO, ZERO, ONE, FLOAT, FLOAT, FLOAT, FLOAT, 16, 0), + VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET }, + + /* FLOAT 32 */ + + [PIPE_FORMAT_R32G32B32A32_FLOAT] = { NV50TCL_RT_FORMAT_R32G32B32A32_FLOAT, + A_(C0, C1, C2, C3, FLOAT, FLOAT, FLOAT, FLOAT, 32_32_32_32, 0), + VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET }, + + [PIPE_FORMAT_R32G32B32_FLOAT] = { NV50TCL_RT_FORMAT_R32G32B32X32_FLOAT, + A_(C0, C1, C2, ONE, FLOAT, FLOAT, FLOAT, FLOAT, 32_32_32, 0), + VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET }, + + [PIPE_FORMAT_R32G32_FLOAT] = { NV50TCL_RT_FORMAT_R32G32_FLOAT, + A_(C0, C1, ZERO, ONE, FLOAT, FLOAT, FLOAT, FLOAT, 32_32, 0), + VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET }, + + [PIPE_FORMAT_R32_FLOAT] = { NV50TCL_RT_FORMAT_R32_FLOAT, + A_(C0, ZERO, ZERO, ONE, FLOAT, FLOAT, FLOAT, FLOAT, 32, 0), + VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET }, + + /* ODD FORMATS */ + + [PIPE_FORMAT_R11G11B10_FLOAT] = { NV50TCL_RT_FORMAT_B10G11R11_FLOAT, + B_(C0, C1, C2, ONE, FLOAT, FLOAT, FLOAT, FLOAT, 10_11_11, 0), + SAMPLER_VIEW | RENDER_TARGET }, + + [PIPE_FORMAT_R9G9B9E5_FLOAT] = { 0, + B_(C0, C1, C2, ONE, FLOAT, FLOAT, FLOAT, FLOAT, 5_9_9_9, 0), + SAMPLER_VIEW }, + + /* SNORM 32 */ + + [PIPE_FORMAT_R32G32B32A32_SNORM] = { 0, + A_(C0, C1, C2, C3, FLOAT, FLOAT, FLOAT, FLOAT, 32_32_32_32, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + [PIPE_FORMAT_R32G32B32_SNORM] = { 0, + A_(C0, C1, C2, ONE, SNORM, SNORM, SNORM, SNORM, 32_32_32, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + [PIPE_FORMAT_R32G32_SNORM] = { 0, + A_(C0, C1, ZERO, ONE, SNORM, SNORM, SNORM, SNORM, 32_32, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + [PIPE_FORMAT_R32_SNORM] = { 0, + A_(C0, ZERO, ZERO, ONE, SNORM, SNORM, SNORM, SNORM, 32, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + /* UNORM 32 */ + + [PIPE_FORMAT_R32G32B32A32_UNORM] = { 0, + A_(C0, C1, C2, C3, FLOAT, FLOAT, FLOAT, FLOAT, 
32_32_32_32, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + [PIPE_FORMAT_R32G32B32_UNORM] = { 0, + A_(C0, C1, C2, ONE, UNORM, UNORM, UNORM, UNORM, 32_32_32, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + [PIPE_FORMAT_R32G32_UNORM] = { 0, + A_(C0, C1, ZERO, ONE, UNORM, UNORM, UNORM, UNORM, 32_32, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + [PIPE_FORMAT_R32_UNORM] = { 0, + A_(C0, ZERO, ZERO, ONE, UNORM, UNORM, UNORM, UNORM, 32, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + /* SNORM 16 */ + + [PIPE_FORMAT_R16G16B16A16_SNORM] = { NV50TCL_RT_FORMAT_R16G16B16A16_SNORM, + A_(C0, C1, C2, C3, SNORM, SNORM, SNORM, SNORM, 16_16_16_16, 0), + VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET }, + + [PIPE_FORMAT_R16G16B16_SNORM] = { 0, + A_(C0, C1, C2, ONE, SNORM, SNORM, SNORM, SNORM, 16_16_16, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + [PIPE_FORMAT_R16G16_SNORM] = { NV50TCL_RT_FORMAT_R16G16_SNORM, + A_(C0, C1, C2, C3, SNORM, SNORM, SNORM, SNORM, 16_16, 0), + VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET }, + + [PIPE_FORMAT_R16_SNORM] = { 0, + A_(C0, ZERO, ZERO, ONE, SNORM, SNORM, SNORM, SNORM, 16, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + /* UNORM 16 */ + + [PIPE_FORMAT_R16G16B16A16_UNORM] = { NV50TCL_RT_FORMAT_R16G16B16A16_UNORM, + A_(C0, C1, C2, C3, UNORM, UNORM, UNORM, UNORM, 16_16_16_16, 0), + VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET }, + + [PIPE_FORMAT_R16G16B16_UNORM] = { 0, + A_(C0, C1, C2, ONE, UNORM, UNORM, UNORM, UNORM, 16_16_16, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + [PIPE_FORMAT_R16G16_UNORM] = { NV50TCL_RT_FORMAT_R16G16_UNORM, + A_(C0, C1, C2, C3, UNORM, UNORM, UNORM, UNORM, 16_16, 0), + VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET }, + + [PIPE_FORMAT_R16_UNORM] = { 0, + A_(C0, ZERO, ZERO, ONE, UNORM, UNORM, UNORM, UNORM, 16, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + /* SNORM 8 */ + + [PIPE_FORMAT_R8G8B8A8_SNORM] = { NV50TCL_RT_FORMAT_A8B8G8R8_SNORM, + A_(C0, C1, C2, C3, SNORM, SNORM, SNORM, SNORM, 8_8_8_8, 0), + VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET }, + + 
[PIPE_FORMAT_R8G8B8_SNORM] = { 0, + A_(C0, C1, C2, ONE, SNORM, SNORM, SNORM, SNORM, 8_8_8, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + [PIPE_FORMAT_R8G8_SNORM] = { NV50TCL_RT_FORMAT_R8G8_SNORM, + A_(C0, C1, ZERO, ONE, SNORM, SNORM, SNORM, SNORM, 8_8, 0), + VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET }, + + [PIPE_FORMAT_R8_SNORM] = { NV50TCL_RT_FORMAT_R8_SNORM, + A_(C0, ZERO, ZERO, ONE, SNORM, SNORM, SNORM, SNORM, 8, 0), + VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET }, + + /* UNORM 8 */ + + [PIPE_FORMAT_R8G8B8A8_UNORM] = { NV50TCL_RT_FORMAT_A8B8G8R8_UNORM, + A_(C0, C1, C2, C3, UNORM, UNORM, UNORM, UNORM, 8_8_8_8, 0), + VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET }, + + [PIPE_FORMAT_R8G8B8A8_SRGB] = { NV50TCL_RT_FORMAT_A8B8G8R8_SRGB, + A_(C0, C1, C2, C3, UNORM, UNORM, UNORM, UNORM, 8_8_8_8, 0), + SAMPLER_VIEW | RENDER_TARGET }, + + [PIPE_FORMAT_R8G8B8_UNORM] = { NV50TCL_RT_FORMAT_X8B8G8R8_UNORM, + A_(C0, C1, C2, ONE, UNORM, UNORM, UNORM, UNORM, 8_8_8, 0), + VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET }, + + [PIPE_FORMAT_R8G8B8_SRGB] = { NV50TCL_RT_FORMAT_X8B8G8R8_SRGB, + A_(C0, C1, C2, ONE, UNORM, UNORM, UNORM, UNORM, 8_8_8, 0), + SAMPLER_VIEW | RENDER_TARGET }, + + [PIPE_FORMAT_R8G8_UNORM] = { NV50TCL_RT_FORMAT_R8G8_UNORM, + A_(C0, C1, ZERO, ONE, UNORM, UNORM, UNORM, UNORM, 8_8, 0), + VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET }, + + [PIPE_FORMAT_R8_UNORM] = { NV50TCL_RT_FORMAT_R8_UNORM, + A_(C0, ZERO, ZERO, ONE, UNORM, UNORM, UNORM, UNORM, 8, 0), + VERTEX_BUFFER | SAMPLER_VIEW | RENDER_TARGET }, + + /* SSCALED 32 */ + + [PIPE_FORMAT_R32G32B32A32_SSCALED] = { 0, + A_(C0, C1, C2, C3, SSCALED, SSCALED, SSCALED, SSCALED, 32_32_32_32, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + [PIPE_FORMAT_R32G32B32_SSCALED] = { 0, + A_(C0, C1, C2, ONE, SSCALED, SSCALED, SSCALED, SSCALED, 32_32_32, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + [PIPE_FORMAT_R32G32_SSCALED] = { 0, + A_(C0, C1, ZERO, ONE, SSCALED, SSCALED, SSCALED, SSCALED, 32_32, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + 
[PIPE_FORMAT_R32_SSCALED] = { 0, + A_(C0, ZERO, ZERO, ONE, SSCALED, SSCALED, SSCALED, SSCALED, 32, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + /* USCALED 32 */ + + [PIPE_FORMAT_R32G32B32A32_USCALED] = { 0, + A_(C0, C1, C2, C3, USCALED, USCALED, USCALED, USCALED, 32_32_32_32, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + [PIPE_FORMAT_R32G32B32_USCALED] = { 0, + A_(C0, C1, C2, ONE, USCALED, USCALED, USCALED, USCALED, 32_32_32, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + [PIPE_FORMAT_R32G32_USCALED] = { 0, + A_(C0, C1, ZERO, ONE, USCALED, USCALED, USCALED, USCALED, 32_32, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + [PIPE_FORMAT_R32_USCALED] = { 0, + A_(C0, ZERO, ZERO, ONE, USCALED, USCALED, USCALED, USCALED, 32, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + /* SSCALED 16 */ + + [PIPE_FORMAT_R16G16B16A16_SSCALED] = { 0, + A_(C0, C1, C2, C3, SSCALED, SSCALED, SSCALED, SSCALED, 16_16_16_16, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + [PIPE_FORMAT_R16G16B16_SSCALED] = { 0, + A_(C0, C1, C2, ONE, SSCALED, SSCALED, SSCALED, SSCALED, 16_16_16, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + [PIPE_FORMAT_R16G16_SSCALED] = { 0, + A_(C0, C1, ZERO, ONE, SSCALED, SSCALED, SSCALED, SSCALED, 16_16, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + [PIPE_FORMAT_R16_SSCALED] = { 0, + A_(C0, ZERO, ZERO, ONE, SSCALED, SSCALED, SSCALED, SSCALED, 16, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + /* USCALED 16 */ + + [PIPE_FORMAT_R16G16B16A16_USCALED] = { 0, + A_(C0, C1, C2, C3, USCALED, USCALED, USCALED, USCALED, 16_16_16_16, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + [PIPE_FORMAT_R16G16B16_USCALED] = { 0, + A_(C0, C1, C2, ONE, USCALED, USCALED, USCALED, USCALED, 16_16_16, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + [PIPE_FORMAT_R16G16_USCALED] = { 0, + A_(C0, C1, ZERO, ONE, USCALED, USCALED, USCALED, USCALED, 16_16, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + [PIPE_FORMAT_R16_USCALED] = { 0, + A_(C0, ZERO, ZERO, ONE, USCALED, USCALED, USCALED, USCALED, 16, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + /* SSCALED 8 */ + + 
[PIPE_FORMAT_R8G8B8A8_SSCALED] = { 0, + A_(C0, C1, C2, C3, SSCALED, SSCALED, SSCALED, SSCALED, 8_8_8_8, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + [PIPE_FORMAT_R8G8B8_SSCALED] = { 0, + A_(C0, C1, C2, ONE, SSCALED, SSCALED, SSCALED, SSCALED, 8_8_8, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + [PIPE_FORMAT_R8G8_SSCALED] = { 0, + A_(C0, C1, ZERO, ONE, SSCALED, SSCALED, SSCALED, SSCALED, 8_8, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + [PIPE_FORMAT_R8_SSCALED] = { 0, + A_(C0, ZERO, ZERO, ONE, SSCALED, SSCALED, SSCALED, SSCALED, 8, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + /* USCALED 8 */ + + [PIPE_FORMAT_R8G8B8A8_USCALED] = { 0, + A_(C0, C1, C2, C3, USCALED, USCALED, USCALED, USCALED, 8_8_8_8, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + [PIPE_FORMAT_R8G8B8_USCALED] = { 0, + A_(C0, C1, C2, ONE, USCALED, USCALED, USCALED, USCALED, 8_8_8, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + [PIPE_FORMAT_R8G8_USCALED] = { 0, + A_(C0, C1, ZERO, ONE, USCALED, USCALED, USCALED, USCALED, 8_8, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, + + [PIPE_FORMAT_R8_USCALED] = { 0, + A_(C0, ZERO, ZERO, ONE, USCALED, USCALED, USCALED, USCALED, 8, 0), + VERTEX_BUFFER | SAMPLER_VIEW }, +}; diff --git a/src/gallium/drivers/nv50/nv50_miptree.c b/src/gallium/drivers/nv50/nv50_miptree.c index b7cd92158f..12b5ad106c 100644 --- a/src/gallium/drivers/nv50/nv50_miptree.c +++ b/src/gallium/drivers/nv50/nv50_miptree.c @@ -159,6 +159,9 @@ nv50_miptree_create(struct pipe_screen *pscreen, const struct pipe_resource *tmp case PIPE_FORMAT_Z24_UNORM_S8_USCALED: tile_flags = 0x2800; break; + case PIPE_FORMAT_Z32_FLOAT_S8X24_USCALED: + tile_flags = 0xe000; + break; case PIPE_FORMAT_R32G32B32A32_FLOAT: case PIPE_FORMAT_R32G32B32_FLOAT: tile_flags = 0x7400; diff --git a/src/gallium/drivers/nv50/nv50_screen.c b/src/gallium/drivers/nv50/nv50_screen.c index ca4b01b12b..e0c06c29ba 100644 --- a/src/gallium/drivers/nv50/nv50_screen.c +++ b/src/gallium/drivers/nv50/nv50_screen.c @@ -34,75 +34,38 @@ nv50_screen_is_format_supported(struct 
pipe_screen *pscreen, enum pipe_format format, enum pipe_texture_target target, unsigned sample_count, - unsigned tex_usage, unsigned geom_flags) + unsigned usage, unsigned geom_flags) { if (sample_count > 1) return FALSE; - if (tex_usage & PIPE_BIND_RENDER_TARGET) { + if (!util_format_s3tc_enabled) { switch (format) { - case PIPE_FORMAT_B8G8R8X8_UNORM: - case PIPE_FORMAT_B8G8R8A8_UNORM: - case PIPE_FORMAT_B5G6R5_UNORM: - case PIPE_FORMAT_R16G16B16A16_SNORM: - case PIPE_FORMAT_R16G16B16A16_UNORM: - case PIPE_FORMAT_R32G32B32A32_FLOAT: - case PIPE_FORMAT_R16G16_SNORM: - case PIPE_FORMAT_R16G16_UNORM: - return TRUE; - default: - break; - } - } else - if (tex_usage & PIPE_BIND_DEPTH_STENCIL) { - switch (format) { - case PIPE_FORMAT_Z32_FLOAT: - case PIPE_FORMAT_S8_USCALED_Z24_UNORM: - case PIPE_FORMAT_Z24X8_UNORM: - case PIPE_FORMAT_Z24_UNORM_S8_USCALED: - return TRUE; - default: - break; - } - } else { - if (tex_usage & PIPE_BIND_SAMPLER_VIEW) { - switch (format) { - case PIPE_FORMAT_DXT1_RGB: - case PIPE_FORMAT_DXT1_RGBA: - case PIPE_FORMAT_DXT3_RGBA: - case PIPE_FORMAT_DXT5_RGBA: - return util_format_s3tc_enabled; - default: - break; - } - } - switch (format) { - case PIPE_FORMAT_B8G8R8A8_UNORM: - case PIPE_FORMAT_B8G8R8X8_UNORM: - case PIPE_FORMAT_B8G8R8A8_SRGB: - case PIPE_FORMAT_B8G8R8X8_SRGB: - case PIPE_FORMAT_B5G5R5A1_UNORM: - case PIPE_FORMAT_B4G4R4A4_UNORM: - case PIPE_FORMAT_B5G6R5_UNORM: - case PIPE_FORMAT_L8_UNORM: - case PIPE_FORMAT_A8_UNORM: - case PIPE_FORMAT_I8_UNORM: - case PIPE_FORMAT_L8A8_UNORM: - case PIPE_FORMAT_S8_USCALED_Z24_UNORM: - case PIPE_FORMAT_Z24_UNORM_S8_USCALED: - case PIPE_FORMAT_Z32_FLOAT: - case PIPE_FORMAT_R16G16B16A16_SNORM: - case PIPE_FORMAT_R16G16B16A16_UNORM: - case PIPE_FORMAT_R32G32B32A32_FLOAT: - case PIPE_FORMAT_R16G16_SNORM: - case PIPE_FORMAT_R16G16_UNORM: - return TRUE; + case PIPE_FORMAT_DXT1_RGB: + case PIPE_FORMAT_DXT1_RGBA: + case PIPE_FORMAT_DXT3_RGBA: + case PIPE_FORMAT_DXT5_RGBA: + return FALSE; default: break; 
} } - return FALSE; + switch (format) { + case PIPE_FORMAT_Z16_UNORM: + if ((nouveau_screen(pscreen)->device->chipset & 0xf0) != 0xa0) + return FALSE; + break; + default: + break; + } + + /* transfers & shared are always supported */ + usage &= ~(PIPE_BIND_TRANSFER_READ | + PIPE_BIND_TRANSFER_WRITE | + PIPE_BIND_SHARED); + + return (nv50_format_table[format].usage & usage) == usage; } static int diff --git a/src/gallium/drivers/nv50/nv50_screen.h b/src/gallium/drivers/nv50/nv50_screen.h index fbf15a7596..a491ba31b2 100644 --- a/src/gallium/drivers/nv50/nv50_screen.h +++ b/src/gallium/drivers/nv50/nv50_screen.h @@ -38,4 +38,13 @@ nv50_screen(struct pipe_screen *screen) extern void nv50_screen_relocs(struct nv50_screen *); +struct nv50_format { + uint32_t rt; + uint32_t tic; + uint32_t vtx; + uint32_t usage; +}; + +extern const struct nv50_format nv50_format_table[]; + #endif diff --git a/src/gallium/drivers/nv50/nv50_state_validate.c b/src/gallium/drivers/nv50/nv50_state_validate.c index 8d662d8f60..f1d8202dff 100644 --- a/src/gallium/drivers/nv50/nv50_state_validate.c +++ b/src/gallium/drivers/nv50/nv50_state_validate.c @@ -56,6 +56,8 @@ validate_fb(struct nv50_context *nv50) assert(h == fb->cbufs[i]->height); } + assert(nv50_format_table[fb->cbufs[i]->format].rt); + so_method(so, tesla, NV50TCL_RT_HORIZ(i), 2); so_data (so, fb->cbufs[i]->width); so_data (so, fb->cbufs[i]->height); @@ -65,42 +67,9 @@ validate_fb(struct nv50_context *nv50) NOUVEAU_BO_HIGH | NOUVEAU_BO_RDWR, 0, 0); so_reloc (so, bo, fb->cbufs[i]->offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_LOW | NOUVEAU_BO_RDWR, 0, 0); - switch (fb->cbufs[i]->format) { - case PIPE_FORMAT_B8G8R8A8_UNORM: - so_data(so, NV50TCL_RT_FORMAT_A8R8G8B8_UNORM); - break; - case PIPE_FORMAT_B8G8R8X8_UNORM: - so_data(so, NV50TCL_RT_FORMAT_X8R8G8B8_UNORM); - break; - case PIPE_FORMAT_B5G6R5_UNORM: - so_data(so, NV50TCL_RT_FORMAT_R5G6B5_UNORM); - break; - case PIPE_FORMAT_R16G16B16A16_SNORM: - so_data(so, 
NV50TCL_RT_FORMAT_R16G16B16A16_SNORM); - break; - case PIPE_FORMAT_R16G16B16A16_UNORM: - so_data(so, NV50TCL_RT_FORMAT_R16G16B16A16_UNORM); - break; - case PIPE_FORMAT_R16G16B16A16_FLOAT: - so_data(so, NV50TCL_RT_FORMAT_R16G16B16A16_FLOAT); - break; - case PIPE_FORMAT_R32G32B32A32_FLOAT: - so_data(so, NV50TCL_RT_FORMAT_R32G32B32A32_FLOAT); - break; - case PIPE_FORMAT_R16G16_SNORM: - so_data(so, NV50TCL_RT_FORMAT_R16G16_SNORM); - break; - case PIPE_FORMAT_R16G16_UNORM: - so_data(so, NV50TCL_RT_FORMAT_R16G16_UNORM); - break; - default: - NOUVEAU_ERR("AIIII unknown format %s\n", - util_format_name(fb->cbufs[i]->format)); - so_data(so, NV50TCL_RT_FORMAT_X8R8G8B8_UNORM); - break; - } - so_data(so, nv50_miptree(pt)-> - level[fb->cbufs[i]->level].tile_mode << 4); + so_data (so, nv50_format_table[fb->cbufs[i]->format].rt); + so_data (so, nv50_miptree(pt)-> + level[fb->cbufs[i]->level].tile_mode << 4); so_data(so, 0x00000000); so_method(so, tesla, NV50TCL_RT_ARRAY_MODE, 1); @@ -120,39 +89,17 @@ validate_fb(struct nv50_context *nv50) assert(h == fb->zsbuf->height); } + assert(nv50_format_table[fb->zsbuf->format].rt); + so_method(so, tesla, NV50TCL_ZETA_ADDRESS_HIGH, 5); so_reloc (so, bo, fb->zsbuf->offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_HIGH | NOUVEAU_BO_RDWR, 0, 0); so_reloc (so, bo, fb->zsbuf->offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_LOW | NOUVEAU_BO_RDWR, 0, 0); - switch (fb->zsbuf->format) { - case PIPE_FORMAT_Z24_UNORM_S8_USCALED: - so_data(so, NV50TCL_ZETA_FORMAT_S8Z24_UNORM); - break; - case PIPE_FORMAT_Z24X8_UNORM: - so_data(so, NV50TCL_ZETA_FORMAT_X8Z24_UNORM); - break; - case PIPE_FORMAT_S8_USCALED_Z24_UNORM: - so_data(so, NV50TCL_ZETA_FORMAT_Z24S8_UNORM); - break; - case PIPE_FORMAT_Z32_FLOAT: - so_data(so, NV50TCL_ZETA_FORMAT_Z32_FLOAT); - break; - case PIPE_FORMAT_Z32_FLOAT_S8X24_USCALED: - so_data(so, NV50TCL_ZETA_FORMAT_Z32_FLOAT_X24S8_UNORM); - break; - case PIPE_FORMAT_Z16_UNORM: - so_data(so, NV50TCL_ZETA_FORMAT_Z16_UNORM); - break; - default: - 
NOUVEAU_ERR("AIIII unknown format %s\n", - util_format_name(fb->zsbuf->format)); - so_data(so, NV50TCL_ZETA_FORMAT_S8Z24_UNORM); - break; - } - so_data(so, nv50_miptree(pt)-> - level[fb->zsbuf->level].tile_mode << 4); - so_data(so, 0x00000000); + so_data (so, nv50_format_table[fb->zsbuf->format].rt); + so_data (so, nv50_miptree(pt)-> + level[fb->zsbuf->level].tile_mode << 4); + so_data (so, 0x00000000); so_method(so, tesla, NV50TCL_ZETA_ENABLE, 1); so_data (so, 1); diff --git a/src/gallium/drivers/nv50/nv50_tex.c b/src/gallium/drivers/nv50/nv50_tex.c index 5ea0c1d726..5535818370 100644 --- a/src/gallium/drivers/nv50/nv50_tex.c +++ b/src/gallium/drivers/nv50/nv50_tex.c @@ -29,56 +29,6 @@ #include "util/u_format.h" -#define _MIXED(pf, t0, t1, t2, t3, cr, cg, cb, ca, f) \ -[PIPE_FORMAT_##pf] = ( \ - NV50TIC_0_0_MAPR_##cr | NV50TIC_0_0_TYPER_##t0 | \ - NV50TIC_0_0_MAPG_##cg | NV50TIC_0_0_TYPEG_##t1 | \ - NV50TIC_0_0_MAPB_##cb | NV50TIC_0_0_TYPEB_##t2 | \ - NV50TIC_0_0_MAPA_##ca | NV50TIC_0_0_TYPEA_##t3 | \ - NV50TIC_0_0_FMT_##f) - -#define _(pf, t, cr, cg, cb, ca, f) _MIXED(pf, t, t, t, t, cr, cg, cb, ca, f) - -static const uint32_t nv50_texture_formats[PIPE_FORMAT_COUNT] = -{ - _(B8G8R8A8_UNORM, UNORM, C2, C1, C0, C3, 8_8_8_8), - _(B8G8R8A8_SRGB, UNORM, C2, C1, C0, C3, 8_8_8_8), - _(B8G8R8X8_UNORM, UNORM, C2, C1, C0, ONE, 8_8_8_8), - _(B8G8R8X8_SRGB, UNORM, C2, C1, C0, ONE, 8_8_8_8), - _(B5G5R5A1_UNORM, UNORM, C2, C1, C0, C3, 1_5_5_5), - _(B4G4R4A4_UNORM, UNORM, C2, C1, C0, C3, 4_4_4_4), - - _(B5G6R5_UNORM, UNORM, C2, C1, C0, ONE, 5_6_5), - - _(L8_UNORM, UNORM, C0, C0, C0, ONE, 8), - _(L8_SRGB, UNORM, C0, C0, C0, ONE, 8), - _(A8_UNORM, UNORM, ZERO, ZERO, ZERO, C0, 8), - _(I8_UNORM, UNORM, C0, C0, C0, C0, 8), - - _(L8A8_UNORM, UNORM, C0, C0, C0, C1, 8_8), - _(L8A8_SRGB, UNORM, C0, C0, C0, C1, 8_8), - - _(DXT1_RGB, UNORM, C0, C1, C2, ONE, DXT1), - _(DXT1_RGBA, UNORM, C0, C1, C2, C3, DXT1), - _(DXT3_RGBA, UNORM, C0, C1, C2, C3, DXT3), - _(DXT5_RGBA, UNORM, C0, C1, C2, 
C3, DXT5), - - _MIXED(S8_USCALED_Z24_UNORM, UINT, UNORM, UINT, UINT, C1, C1, C1, ONE, 24_8), - _MIXED(Z24_UNORM_S8_USCALED, UNORM, UINT, UINT, UINT, C0, C0, C0, ONE, 8_24), - - _(R16G16B16A16_SNORM, UNORM, C0, C1, C2, C3, 16_16_16_16), - _(R16G16B16A16_UNORM, SNORM, C0, C1, C2, C3, 16_16_16_16), - _(R32G32B32A32_FLOAT, FLOAT, C0, C1, C2, C3, 32_32_32_32), - - _(R16G16_SNORM, SNORM, C0, C1, ZERO, ONE, 16_16), - _(R16G16_UNORM, UNORM, C0, C1, ZERO, ONE, 16_16), - - _MIXED(Z32_FLOAT, FLOAT, UINT, UINT, UINT, C0, C0, C0, ONE, 32_DEPTH) -}; - -#undef _ -#undef _MIXED - static INLINE uint32_t nv50_tic_swizzle(uint32_t tc, unsigned swz) { @@ -106,7 +56,7 @@ nv50_tex_construct(struct nv50_sampler_view *view) struct nv50_miptree *mt = nv50_miptree(view->pipe.texture); uint32_t swz[4], *tic = view->tic; - tic[0] = nv50_texture_formats[view->pipe.format]; + tic[0] = nv50_format_table[view->pipe.format].tic; swz[0] = nv50_tic_swizzle(tic[0], view->pipe.swizzle_r); swz[1] = nv50_tic_swizzle(tic[0], view->pipe.swizzle_g); diff --git a/src/gallium/drivers/nv50/nv50_texture.h b/src/gallium/drivers/nv50/nv50_texture.h index 3475d3e432..b4939943e8 100644 --- a/src/gallium/drivers/nv50/nv50_texture.h +++ b/src/gallium/drivers/nv50/nv50_texture.h @@ -45,24 +45,32 @@ #define NV50TIC_0_0_TYPEA_SNORM 0x00008000 #define NV50TIC_0_0_TYPEA_SINT 0x00018000 #define NV50TIC_0_0_TYPEA_UINT 0x00020000 +#define NV50TIC_0_0_TYPEA_SSCALED 0x00028000 +#define NV50TIC_0_0_TYPEA_USCALED 0x00030000 #define NV50TIC_0_0_TYPEA_FLOAT 0x00038000 #define NV50TIC_0_0_TYPEB_MASK 0x00007000 #define NV50TIC_0_0_TYPEB_UNORM 0x00002000 #define NV50TIC_0_0_TYPEB_SNORM 0x00001000 #define NV50TIC_0_0_TYPEB_SINT 0x00003000 #define NV50TIC_0_0_TYPEB_UINT 0x00004000 +#define NV50TIC_0_0_TYPEB_SSCALED 0x00005000 +#define NV50TIC_0_0_TYPEB_USCALED 0x00006000 #define NV50TIC_0_0_TYPEB_FLOAT 0x00007000 #define NV50TIC_0_0_TYPEG_MASK 0x00000e00 #define NV50TIC_0_0_TYPEG_UNORM 0x00000400 #define NV50TIC_0_0_TYPEG_SNORM 
0x00000200 #define NV50TIC_0_0_TYPEG_SINT 0x00000600 #define NV50TIC_0_0_TYPEG_UINT 0x00000800 +#define NV50TIC_0_0_TYPEG_SSCALED 0x00000a00 +#define NV50TIC_0_0_TYPEG_USCALED 0x00000c00 #define NV50TIC_0_0_TYPEG_FLOAT 0x00000e00 #define NV50TIC_0_0_TYPER_MASK 0x000001c0 #define NV50TIC_0_0_TYPER_UNORM 0x00000080 #define NV50TIC_0_0_TYPER_SNORM 0x00000040 #define NV50TIC_0_0_TYPER_SINT 0x000000c0 #define NV50TIC_0_0_TYPER_UINT 0x00000100 +#define NV50TIC_0_0_TYPER_SSCALED 0x00000140 +#define NV50TIC_0_0_TYPER_USCALED 0x00000180 #define NV50TIC_0_0_TYPER_FLOAT 0x000001c0 #define NV50TIC_0_0_FMT_MASK 0x0000003f #define NV50TIC_0_0_FMT_32_32_32_32 0x00000001 @@ -90,6 +98,7 @@ #define NV50TIC_0_0_FMT_8_24 0x0000002a #define NV50TIC_0_0_FMT_32_DEPTH 0x0000002f #define NV50TIC_0_0_FMT_32_8 0x00000030 +#define NV50TIC_0_0_FMT_16_DEPTH 0x0000003a #define NV50TIC_0_1_OFFSET_LOW_MASK 0xffffffff #define NV50TIC_0_1_OFFSET_LOW_SHIFT 0 diff --git a/src/gallium/drivers/nv50/nv50_vbo.c b/src/gallium/drivers/nv50/nv50_vbo.c index 996844b18f..4fe0df5683 100644 --- a/src/gallium/drivers/nv50/nv50_vbo.c +++ b/src/gallium/drivers/nv50/nv50_vbo.c @@ -29,99 +29,6 @@ #include "nv50_context.h" #include "nv50_resource.h" -/* VERTEX_ARRAY_ATTRIB_TYPE is duplicated for unknown reason */ -#define NV50_VAT(x) ((x) | ((x) << 3)) - -static INLINE uint32_t -nv50_vbo_type_to_hw(enum pipe_format format) -{ - const struct util_format_description *desc; - - desc = util_format_description(format); - assert(desc); - - switch (desc->channel[0].type) { - case UTIL_FORMAT_TYPE_FLOAT: - return NV50_VAT(NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_FLOAT); - case UTIL_FORMAT_TYPE_UNSIGNED: - if (desc->channel[0].normalized) { - return NV50_VAT(NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_UNORM); - } - return NV50_VAT(NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_USCALED); - case UTIL_FORMAT_TYPE_SIGNED: - if (desc->channel[0].normalized) { - return NV50_VAT(NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_SNORM); - } - return 
NV50_VAT(NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_SSCALED); - /* - case PIPE_FORMAT_TYPE_UINT: - return NV50_VAT(NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_UINT); - case PIPE_FORMAT_TYPE_SINT: - return NV50_VAT(NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_SINT); */ - default: - return 0; - } -} - -static INLINE uint32_t -nv50_vbo_size_to_hw(unsigned size, unsigned nr_c) -{ - static const uint32_t hw_values[] = { - 0, 0, 0, 0, - NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_8, - NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_8_8, - NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_8_8_8, - NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_8_8_8_8, - NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_16, - NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_16_16, - NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_16_16_16, - NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_16_16_16_16, - 0, 0, 0, 0, - NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_32, - NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_32_32, - NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_32_32_32, - NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_32_32_32_32 }; - - /* we'd also have R11G11B10 and R10G10B10A2 */ - - assert(nr_c > 0 && nr_c <= 4); - - if (size > 32) - return 0; - size >>= (3 - 2); - - return hw_values[size + (nr_c - 1)]; -} - -static INLINE uint32_t -nv50_vbo_vtxelt_to_hw(struct pipe_vertex_element *ve) -{ - uint32_t hw_type, hw_size; - enum pipe_format pf = ve->src_format; - const struct util_format_description *desc; - unsigned size, nr_components; - - desc = util_format_description(pf); - assert(desc); - - size = util_format_get_component_bits(pf, UTIL_FORMAT_COLORSPACE_RGB, 0); - nr_components = util_format_get_nr_components(pf); - - hw_type = nv50_vbo_type_to_hw(pf); - hw_size = nv50_vbo_size_to_hw(size, nr_components); - - if (!hw_type || !hw_size) { - NOUVEAU_ERR("unsupported vbo format: %s\n", util_format_name(pf)); - abort(); - return 0x24e80000; - } - - if (desc->swizzle[0] == UTIL_FORMAT_SWIZZLE_Z) /* BGRA */ - hw_size |= (1 << 31); /* no real swizzle bits :-( */ - - return (hw_type | hw_size); -} - struct instance { struct nouveau_bo *bo; unsigned delta; @@ 
-543,11 +450,8 @@ nv50_vtxelt_construct(struct nv50_vtxelt_stateobj *cso) { unsigned i; - for (i = 0; i < cso->num_elements; ++i) { - struct pipe_vertex_element *ve = &cso->pipe[i]; - - cso->hw[i] = nv50_vbo_vtxelt_to_hw(ve); - } + for (i = 0; i < cso->num_elements; ++i) + cso->hw[i] = nv50_format_table[cso->pipe[i].src_format].vtx; } struct nouveau_stateobj * -- cgit v1.2.3 From 1d1bb206122b719d6959eceddd511a0294816a9a Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Sat, 24 Jul 2010 21:17:21 +0200 Subject: nv50: don't produce MOV immediate to output reg in store opt --- src/gallium/drivers/nv50/nv50_pc_emit.c | 12 ++++++------ src/gallium/drivers/nv50/nv50_pc_optimize.c | 12 ++++++++---- 2 files changed, 14 insertions(+), 10 deletions(-) (limited to 'src') diff --git a/src/gallium/drivers/nv50/nv50_pc_emit.c b/src/gallium/drivers/nv50/nv50_pc_emit.c index b917d23232..51304670a1 100644 --- a/src/gallium/drivers/nv50/nv50_pc_emit.c +++ b/src/gallium/drivers/nv50/nv50_pc_emit.c @@ -539,26 +539,26 @@ emit_mov(struct nv_pc *pc, struct nv_instruction *i) pc->emit[1] |= DREG(i->def[0])->id << 4; } else if (SFILE(i, 0) == NV_FILE_IMM) { - if (i->opcode == NV_OP_LDA) + if (i->opcode == NV_OP_LDA) { emit_ld(pc, i); - else { + } else { pc->emit[0] = 0x10008001; pc->emit[1] = 0x00000003; - emit_form_IMM(pc, i, 0); + emit_form_IMM(pc, i, 0); } } else { pc->emit[0] = 0x10000000; pc->emit[0] |= DREG(i->def[0])->id << 2; pc->emit[0] |= SREG(i->src[0])->id << 9; - if (!i->is_long) + if (!i->is_long) { pc->emit[0] |= 0x8000; - else { + } else { pc->emit[0] |= 0x00000001; pc->emit[1] = 0x0403c000; - set_pred(pc, i); + set_pred(pc, i); } } diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c index 0811420e42..f81384f00d 100644 --- a/src/gallium/drivers/nv50/nv50_pc_optimize.c +++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c @@ -266,11 +266,10 @@ nv_pass_fold_stores(struct nv_pass *ctx, struct nv_basic_block *b) int j; for 
(sti = b->entry; sti; sti = sti->next) { - if (!sti->def[0]) + if (!sti->def[0] || sti->def[0]->reg.file != NV_FILE_OUT) continue; - if (sti->def[0]->reg.file != NV_FILE_OUT) - continue; + /* only handling MOV to $oX here */ if (sti->opcode != NV_OP_MOV && sti->opcode != NV_OP_STA) continue; @@ -282,8 +281,13 @@ nv_pass_fold_stores(struct nv_pass *ctx, struct nv_basic_block *b) if (nvi->def[0]->refc > 1) continue; + /* cannot MOV immediate to $oX */ + if (nvi->src[0]->value->reg.file == NV_FILE_IMM) + continue; + nvi->def[0] = sti->def[0]; - nvi->fixed = 1; + sti->def[0] = NULL; + nvi->fixed = sti->fixed; sti->fixed = 0; } DESCEND_ARBITRARY(j, nv_pass_fold_stores); -- cgit v1.2.3 From 4baaf1d4c32053a191d8718e46dab95d25f119a5 Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Sat, 24 Jul 2010 21:18:51 +0200 Subject: nv50: change back accidentally swapped UNORM,SNORM vertex type --- src/gallium/drivers/nouveau/nouveau_class.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'src') diff --git a/src/gallium/drivers/nouveau/nouveau_class.h b/src/gallium/drivers/nouveau/nouveau_class.h index 975fd8f35a..f44979e562 100644 --- a/src/gallium/drivers/nouveau/nouveau_class.h +++ b/src/gallium/drivers/nouveau/nouveau_class.h @@ -8949,8 +8949,8 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
#define NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_SHIFT 25 #define NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_MASK 0x0e000000 #define NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_FLOAT 0x0e000000 -#define NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_UNORM 0x02000000 -#define NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_SNORM 0x04000000 +#define NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_SNORM 0x02000000 +#define NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_UNORM 0x04000000 #define NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_USCALED 0x0a000000 #define NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_SSCALED 0x0c000000 #define NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_UINT 0x08000000 @@ -9352,8 +9352,8 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #define NVC0TCL_VTX_ATTR_DEFINE_TYPE_SHIFT 16 #define NVC0TCL_VTX_ATTR_DEFINE_TYPE_MASK 0x000f0000 #define NVC0TCL_VTX_ATTR_DEFINE_TYPE_FLOAT 0x00070000 -#define NVC0TCL_VTX_ATTR_DEFINE_TYPE_UNORM 0x00010000 -#define NVC0TCL_VTX_ATTR_DEFINE_TYPE_SNORM 0x00020000 +#define NVC0TCL_VTX_ATTR_DEFINE_TYPE_SNORM 0x00010000 +#define NVC0TCL_VTX_ATTR_DEFINE_TYPE_UNORM 0x00020000 #define NVC0TCL_VTX_ATTR_DEFINE_TYPE_USCALED 0x00050000 #define NVC0TCL_VTX_ATTR_DEFINE_TYPE_SSCALED 0x00060000 #define NVC0TCL_VTX_ATTR_DEFINE_TYPE_UINT 0x00040000 @@ -9385,8 +9385,8 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
#define NVC0TCL_VERTEX_ATTRIB_FORMAT_TYPE_SHIFT 27 #define NVC0TCL_VERTEX_ATTRIB_FORMAT_TYPE_MASK 0x78000000 #define NVC0TCL_VERTEX_ATTRIB_FORMAT_TYPE_FLOAT 0x38000000 -#define NVC0TCL_VERTEX_ATTRIB_FORMAT_TYPE_UNORM 0x08000000 -#define NVC0TCL_VERTEX_ATTRIB_FORMAT_TYPE_SNORM 0x10000000 +#define NVC0TCL_VERTEX_ATTRIB_FORMAT_TYPE_SNORM 0x08000000 +#define NVC0TCL_VERTEX_ATTRIB_FORMAT_TYPE_UNORM 0x10000000 #define NVC0TCL_VERTEX_ATTRIB_FORMAT_TYPE_USCALED 0x28000000 #define NVC0TCL_VERTEX_ATTRIB_FORMAT_TYPE_SSCALED 0x30000000 #define NVC0TCL_VERTEX_ATTRIB_FORMAT_TYPE_UINT 0x20000000 -- cgit v1.2.3 From bb9d634730b7e97050e50d9238764a99099fbc7f Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Sat, 24 Jul 2010 22:16:05 +0200 Subject: nv50: add/fix some license headers --- src/gallium/drivers/nv50/nv50_formats.c | 21 ++++++++++++++++ src/gallium/drivers/nv50/nv50_pc.c | 21 ++++++++++++++++ src/gallium/drivers/nv50/nv50_pc.h | 37 ++++++++++++++++------------- src/gallium/drivers/nv50/nv50_pc_emit.c | 37 ++++++++++++++++------------- src/gallium/drivers/nv50/nv50_pc_optimize.c | 21 ++++++++++++++++ src/gallium/drivers/nv50/nv50_pc_print.c | 21 ++++++++++++++++ src/gallium/drivers/nv50/nv50_pc_regalloc.c | 22 +++++++++++++++++ src/gallium/drivers/nv50/nv50_tgsi_to_nc.c | 21 ++++++++++++++++ 8 files changed, 169 insertions(+), 32 deletions(-) (limited to 'src') diff --git a/src/gallium/drivers/nv50/nv50_formats.c b/src/gallium/drivers/nv50/nv50_formats.c index 5b65cdaa02..433c74e611 100644 --- a/src/gallium/drivers/nv50/nv50_formats.c +++ b/src/gallium/drivers/nv50/nv50_formats.c @@ -1,3 +1,24 @@ +/* + * Copyright 2010 Christoph Bumiller + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies 
of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ #include "nv50_screen.h" #include "nv50_texture.h" diff --git a/src/gallium/drivers/nv50/nv50_pc.c b/src/gallium/drivers/nv50/nv50_pc.c index 8aba0a32b7..89dbc7aa20 100644 --- a/src/gallium/drivers/nv50/nv50_pc.c +++ b/src/gallium/drivers/nv50/nv50_pc.c @@ -1,3 +1,24 @@ +/* + * Copyright 2010 Christoph Bumiller + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ #include "nv50_pc.h" #include "nv50_program.h" diff --git a/src/gallium/drivers/nv50/nv50_pc.h b/src/gallium/drivers/nv50/nv50_pc.h index 3ab48d0afd..3db300dabb 100644 --- a/src/gallium/drivers/nv50/nv50_pc.h +++ b/src/gallium/drivers/nv50/nv50_pc.h @@ -1,19 +1,24 @@ -/*************************************************************************/ -/* Copyright (C) 2010 I */ -/* */ -/* This program is free software: you can redistribute it and/or modify */ -/* it under the terms of the GNU General Public License as published by */ -/* the Free Software Foundation, either version 3 of the License, or */ -/* (at your option) any later version. */ -/* */ -/* This program is distributed in the hope that it will be useful, */ -/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ -/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ -/* GNU General Public License for more details. */ -/* */ -/* You should have received a copy of the GNU General Public License */ -/* along with this program. If not, see . 
*/ -/*************************************************************************/ +/* + * Copyright 2010 Christoph Bumiller + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ #ifndef __NV50_COMPILER_H__ #define __NV50_COMPILER_H__ diff --git a/src/gallium/drivers/nv50/nv50_pc_emit.c b/src/gallium/drivers/nv50/nv50_pc_emit.c index 51304670a1..728e2b145d 100644 --- a/src/gallium/drivers/nv50/nv50_pc_emit.c +++ b/src/gallium/drivers/nv50/nv50_pc_emit.c @@ -1,19 +1,24 @@ -/*************************************************************************/ -/* Copyright (C) 2009 */ -/* */ -/* This program is free software: you can redistribute it and/or modify */ -/* it under the terms of the GNU General Public License as published by */ -/* the Free Software Foundation, either version 3 of the License, or */ -/* (at your option) any later version. 
*/ -/* */ -/* This program is distributed in the hope that it will be useful, */ -/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ -/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ -/* GNU General Public License for more details. */ -/* */ -/* You should have received a copy of the GNU General Public License */ -/* along with this program. If not, see . */ -/*************************************************************************/ +/* + * Copyright 2010 Christoph Bumiller + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ #include "nv50_context.h" #include "nv50_pc.h" diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c index f81384f00d..a514c59e6a 100644 --- a/src/gallium/drivers/nv50/nv50_pc_optimize.c +++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c @@ -1,3 +1,24 @@ +/* + * Copyright 2010 Christoph Bumiller + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ #include "nv50_pc.h" diff --git a/src/gallium/drivers/nv50/nv50_pc_print.c b/src/gallium/drivers/nv50/nv50_pc_print.c index 09512ffb88..00b50b4edc 100644 --- a/src/gallium/drivers/nv50/nv50_pc_print.c +++ b/src/gallium/drivers/nv50/nv50_pc_print.c @@ -1,3 +1,24 @@ +/* + * Copyright 2010 Christoph Bumiller + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ #include "nv50_context.h" #include "nv50_pc.h" diff --git a/src/gallium/drivers/nv50/nv50_pc_regalloc.c b/src/gallium/drivers/nv50/nv50_pc_regalloc.c index eb446d641a..3cec219d1a 100644 --- a/src/gallium/drivers/nv50/nv50_pc_regalloc.c +++ b/src/gallium/drivers/nv50/nv50_pc_regalloc.c @@ -1,3 +1,25 @@ +/* + * Copyright 2010 Christoph Bumiller + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + /* * XXX: phi function live intervals start at first ordinary instruction, * add_range should be taking care of that already ... 
diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c index aa15917774..5b69d520bc 100644 --- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c +++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c @@ -1,3 +1,24 @@ +/* + * Copyright 2010 Christoph Bumiller + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ #include -- cgit v1.2.3 From 5811c6926450c4aafd2f9c87a2c6fe73b517f2c6 Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Sun, 25 Jul 2010 22:21:38 +0200 Subject: nv50: simple reload elimination and local CSE --- src/gallium/drivers/nv50/nv50_pc.c | 18 +++ src/gallium/drivers/nv50/nv50_pc.h | 10 +- src/gallium/drivers/nv50/nv50_pc_optimize.c | 168 +++++++++++++++++++++++++--- src/gallium/drivers/nv50/nv50_pc_print.c | 6 +- 4 files changed, 178 insertions(+), 24 deletions(-) (limited to 'src') diff --git a/src/gallium/drivers/nv50/nv50_pc.c b/src/gallium/drivers/nv50/nv50_pc.c index 89dbc7aa20..e09f94074d 100644 --- a/src/gallium/drivers/nv50/nv50_pc.c +++ b/src/gallium/drivers/nv50/nv50_pc.c @@ -163,6 +163,24 @@ nv_nvi_refcount(struct nv_instruction *nvi) return rc; } +int +nvcg_replace_value(struct nv_pc *pc, struct nv_value *old_val, + struct nv_value *new_val) +{ + int i, n; + + if (old_val == new_val) + return old_val->refc; + + for (i = 0, n = 0; i < pc->num_refs; ++i) { + if (pc->refs[i]->value == old_val) { + ++n; + nv_reference(pc, &pc->refs[i], new_val); + } + } + return n; +} + static void nv_pc_free_refs(struct nv_pc *pc) { diff --git a/src/gallium/drivers/nv50/nv50_pc.h b/src/gallium/drivers/nv50/nv50_pc.h index 3db300dabb..ffcdaf44af 100644 --- a/src/gallium/drivers/nv50/nv50_pc.h +++ b/src/gallium/drivers/nv50/nv50_pc.h @@ -363,11 +363,11 @@ new_ref(struct nv_pc *pc, struct nv_value *val) const unsigned old_size = pc->num_refs * sizeof(struct nv_ref *); const unsigned new_size = (pc->num_refs + 64) * sizeof(struct nv_ref *); - pc->refs = REALLOC(pc->refs, old_size, new_size); + pc->refs = REALLOC(pc->refs, old_size, new_size); - ref = CALLOC(64, sizeof(struct nv_ref)); - for (i = 0; i < 64; ++i) - pc->refs[pc->num_refs + i] = &ref[i]; + ref = CALLOC(64, sizeof(struct nv_ref)); + for (i = 0; i < 64; ++i) + pc->refs[pc->num_refs + i] = &ref[i]; } ref = pc->refs[pc->num_refs++]; @@ -426,6 +426,8 @@ int nv_nvi_refcount(struct nv_instruction *); 
void nv_nvi_delete(struct nv_instruction *); void nv_nvi_permute(struct nv_instruction *, struct nv_instruction *); void nvbb_attach_block(struct nv_basic_block *parent, struct nv_basic_block *); +int nvcg_replace_value(struct nv_pc *pc, struct nv_value *old_val, + struct nv_value *new_val); int nv_pc_exec_pass0(struct nv_pc *pc); int nv_pc_exec_pass1(struct nv_pc *pc); diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c index a514c59e6a..0018131fb5 100644 --- a/src/gallium/drivers/nv50/nv50_pc_optimize.c +++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c @@ -570,31 +570,99 @@ nv_pass_lower_cond(struct nv_pass *ctx, struct nv_basic_block *b) } #endif -/* TODO: reload elimination, redundant store elimination */ +/* TODO: redundant store elimination */ -struct nv_pass_reldelim { +struct load_record { + struct load_record *next; + uint64_t data; + struct nv_value *value; +}; + +#define LOAD_RECORD_POOL_SIZE 1024 + +struct nv_pass_reld_elim { struct nv_pc *pc; + + struct load_record *imm; + struct load_record *mem_s; + struct load_record *mem_v; + struct load_record *mem_c[16]; + struct load_record *mem_l; + + struct load_record pool[LOAD_RECORD_POOL_SIZE]; + int alloc; }; static int -nv_pass_reload_elim(struct nv_pass_reldelim *ctx, struct nv_basic_block *b) +nv_pass_reload_elim(struct nv_pass_reld_elim *ctx, struct nv_basic_block *b) { - int j; + struct load_record **rec, *it; struct nv_instruction *ld, *next; + uint64_t data; + struct nv_value *val; + int j; for (ld = b->entry; ld; ld = next) { next = ld->next; + if (!ld->src[0]) + continue; + val = ld->src[0]->value; + rec = NULL; if (ld->opcode == NV_OP_LINTERP || ld->opcode == NV_OP_PINTERP) { - + data = val->reg.id; + rec = &ctx->mem_v; } else if (ld->opcode == NV_OP_LDA) { - + data = val->reg.id; + if (val->reg.file >= NV_FILE_MEM_C(0) && + val->reg.file <= NV_FILE_MEM_C(15)) + rec = &ctx->mem_c[val->reg.file - NV_FILE_MEM_C(0)]; + else + if (val->reg.file == 
NV_FILE_MEM_S) + rec = &ctx->mem_s; + else + if (val->reg.file == NV_FILE_MEM_L) + rec = &ctx->mem_l; } else - if (ld->opcode == NV_OP_MOV) { - + if ((ld->opcode == NV_OP_MOV) && (val->reg.file == NV_FILE_IMM)) { + data = val->reg.imm.u32; + rec = &ctx->imm; + } + + if (!rec || !ld->def[0]->refc) + continue; + + for (it = *rec; it; it = it->next) + if (it->data == data) + break; + + if (it) { +#if 1 + nvcg_replace_value(ctx->pc, ld->def[0], it->value); +#else + ld->opcode = NV_OP_MOV; + nv_reference(ctx->pc, &ld->src[0], it->value); +#endif + } else { + if (ctx->alloc == LOAD_RECORD_POOL_SIZE) + continue; + it = &ctx->pool[ctx->alloc++]; + it->next = *rec; + it->data = data; + it->value = ld->def[0]; + *rec = it; } } + + ctx->imm = NULL; + ctx->mem_s = NULL; + ctx->mem_v = NULL; + for (j = 0; j < 16; ++j) + ctx->mem_c[j] = NULL; + ctx->mem_l = NULL; + ctx->alloc = 0; + DESCEND_ARBITRARY(j, nv_pass_reload_elim); return 0; @@ -678,23 +746,74 @@ nv_pass_flatten(struct nv_pass *ctx, struct nv_basic_block *b) return 0; } +/* local common subexpression elimination, stupid O(n^2) implementation */ +static int +nv_pass_cse(struct nv_pass *ctx, struct nv_basic_block *b) +{ + struct nv_instruction *ir, *ik, *next; + struct nv_instruction *entry = b->phi ? 
b->phi : b->entry; + int s; + unsigned int reps; + + do { + reps = 0; + for (ir = entry; ir; ir = next) { + next = ir->next; + for (ik = entry; ik != ir; ik = ik->next) { + if (ir->opcode != ik->opcode) + continue; + + if (ik->opcode == NV_OP_LDA || + ik->opcode == NV_OP_STA || + ik->opcode == NV_OP_MOV || + nv_is_vector_op(ik->opcode)) + continue; /* ignore loads, stores & moves */ + + if (ik->src[4] || ir->src[4]) + continue; /* don't mess with address registers */ + + for (s = 0; s < 3; ++s) { + struct nv_value *a, *b; + + if (!ik->src[s]) { + if (ir->src[s]) + break; + continue; + } + if (ik->src[s]->mod != ir->src[s]->mod) + break; + a = ik->src[s]->value; + b = ir->src[s]->value; + if (a == b) + continue; + if (a->reg.file != b->reg.file || + a->reg.id < 0 || + a->reg.id != b->reg.id) + break; + } + if (s == 3) { + nv_nvi_delete(ir); + ++reps; + nvcg_replace_value(ctx->pc, ir->def[0], ik->def[0]); + break; + } + } + } + } while(reps); + + DESCEND_ARBITRARY(s, nv_pass_cse); + + return 0; +} + int nv_pc_exec_pass0(struct nv_pc *pc) { - struct nv_pass_reldelim *reldelim; + struct nv_pass_reld_elim *reldelim; struct nv_pass pass; struct nv_pass_dce dce; int ret; - reldelim = CALLOC_STRUCT(nv_pass_reldelim); - reldelim->pc = pc; - - ret = nv_pass_reload_elim(reldelim, pc->root); - - FREE(reldelim); - if (ret) - return ret; - pass.pc = pc; pc->pass_seq++; @@ -720,6 +839,19 @@ nv_pc_exec_pass0(struct nv_pc *pc) if (ret) return ret; + reldelim = CALLOC_STRUCT(nv_pass_reld_elim); + reldelim->pc = pc; + pc->pass_seq++; + ret = nv_pass_reload_elim(reldelim, pc->root); + FREE(reldelim); + if (ret) + return ret; + + pc->pass_seq++; + ret = nv_pass_cse(&pass, pc->root); + if (ret) + return ret; + pc->pass_seq++; ret = nv_pass_lower_mods(&pass, pc->root); if (ret) diff --git a/src/gallium/drivers/nv50/nv50_pc_print.c b/src/gallium/drivers/nv50/nv50_pc_print.c index 00b50b4edc..82080779c3 100644 --- a/src/gallium/drivers/nv50/nv50_pc_print.c +++ 
b/src/gallium/drivers/nv50/nv50_pc_print.c @@ -181,7 +181,7 @@ nv_print_address(const char c, int buf, struct nv_value *a, int offset) static INLINE void nv_print_cond(struct nv_instruction *nvi) { - PRINT("%s%s%s$c%i ", + PRINT("%s%s %s$c%i ", gree, nv_cond_name(nvi->cc), mgta, nv_value_id(nvi->flags_src->value)); } @@ -198,7 +198,7 @@ nv_print_value(struct nv_value *value, struct nv_value *ind, ubyte type) PRINT(" %s%s", gree, nv_type_name(type)); if (!nv_value_allocated(value)) - reg_pfx = '%'; + reg_pfx = nv_value_allocated(value->join) ? '&' : '%'; switch (value->reg.file) { case NV_FILE_GPR: @@ -268,6 +268,8 @@ nv_print_instruction(struct nv_instruction *i) { int j; + PRINT("%i: ", i->serial); + if (i->flags_src) nv_print_cond(i); -- cgit v1.2.3 From a3ba99b3037bad629622766d4e08d48ab6d20aae Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Sun, 25 Jul 2010 23:32:18 +0200 Subject: nv50: fix constant_operand opt mul by 2 case --- src/gallium/drivers/nv50/nv50_pc_optimize.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'src') diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c index 0018131fb5..107ef0f4bf 100644 --- a/src/gallium/drivers/nv50/nv50_pc_optimize.c +++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c @@ -465,11 +465,7 @@ constant_operand(struct nv_pc *pc, if ((type == NV_TYPE_F32 && val->reg.imm.f32 == 2.0f) || (NV_TYPE_ISINT(type) && val->reg.imm.u32 == 2)) { nvi->opcode = NV_OP_ADD; - nv_reference(pc, &nvi->src[s], NULL); - if (!s) { - nvi->src[0] = nvi->src[1]; - nvi->src[1] = NULL; - } + nv_reference(pc, &nvi->src[s], nvi->src[t]->value); } else if (type == NV_TYPE_F32 && val->reg.imm.f32 == -1.0f) { nvi->opcode = NV_OP_NEG; -- cgit v1.2.3 From e1ad3bd2f25832147814fcfe72166898bc07f11a Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Mon, 26 Jul 2010 00:56:12 +0200 Subject: nv50: permit usage of undefined TGSI TEMPs --- src/gallium/drivers/nv50/nv50_tgsi_to_nc.c 
| 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'src') diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c index 5b69d520bc..3d5843ee0e 100644 --- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c +++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c @@ -174,7 +174,8 @@ bld_fetch_global(struct bld_context *bld, struct bld_value_stack *stack) fetch_by_bb(stack, vals, &n, bld->pc->current_block); - assert(n); + if (n == 0) + return NULL; if (n == 1) return vals[0]; @@ -606,6 +607,7 @@ bld_export_outputs(struct bld_context *bld) if (!bld_is_output_written(bld, i, c)) continue; vals[n] = bld_fetch_global(bld, &bld->ovs[i][c]); + assert(vals[n]); vals[n] = bld_insn_1(bld, NV_OP_MOV, vals[n]); vals[n++]->reg.id = bld->ti->output_map[i][c]; } @@ -734,6 +736,10 @@ emit_fetch(struct bld_context *bld, const struct tgsi_full_instruction *insn, abort(); break; } + if (!res) { + debug_printf("WARNING: undefined source value in TGSI instruction\n"); + return bld_load_imm_u32(bld, 0); + } switch (tgsi_util_get_full_src_register_sign_mode(src, chan)) { case TGSI_UTIL_SIGN_KEEP: -- cgit v1.2.3 From 7d34e79e449284c6a833c2e58c714ea1e48669dd Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Mon, 26 Jul 2010 11:18:56 +0200 Subject: nv50: add missing 2nd source for POW multiplication --- src/gallium/drivers/nv50/nv50_tgsi_to_nc.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'src') diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c index 3d5843ee0e..da7fe746f4 100644 --- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c +++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c @@ -278,13 +278,21 @@ bld_insn_3(struct bld_context *bld, uint opcode, (d)->insn->src[0]->typecast = NV_TYPE_##s0t; \ } while(0) +#define BLD_INSN_2_EX(d, op, dt, s0, s0t, s1, s1t) \ + do { \ + (d) = bld_insn_2(bld, (NV_OP_##op), (s0), (s1)); \ + (d)->reg.type = NV_TYPE_##dt; \ + 
(d)->insn->src[0]->typecast = NV_TYPE_##s0t; \ + (d)->insn->src[1]->typecast = NV_TYPE_##s1t; \ + } while(0) + static struct nv_value * bld_pow(struct bld_context *bld, struct nv_value *x, struct nv_value *e) { struct nv_value *val; BLD_INSN_1_EX(val, LG2, F32, x, F32); - BLD_INSN_1_EX(val, MUL, F32, e, F32); + BLD_INSN_2_EX(val, MUL, F32, e, F32, val, F32); val = bld_insn_1(bld, NV_OP_PREEX2, val); val = bld_insn_1(bld, NV_OP_EX2, val); -- cgit v1.2.3 From 28ded2585ca856b67b8cc0dd7c1de000b3fc729b Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Mon, 26 Jul 2010 11:32:27 +0200 Subject: nv50: add signed RGTC1 to format table, allow 2_10_10_10 for vbufs --- src/gallium/drivers/nv50/nv50_formats.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'src') diff --git a/src/gallium/drivers/nv50/nv50_formats.c b/src/gallium/drivers/nv50/nv50_formats.c index 433c74e611..e1c7dae306 100644 --- a/src/gallium/drivers/nv50/nv50_formats.c +++ b/src/gallium/drivers/nv50/nv50_formats.c @@ -86,12 +86,12 @@ const struct nv50_format nv50_format_table[PIPE_FORMAT_COUNT] = SAMPLER_VIEW }, [PIPE_FORMAT_R10G10B10A2_UNORM] = { NV50TCL_RT_FORMAT_A2B10G10R10_UNORM, - A_(C0, C1, C2, C3, UNORM, UNORM, UNORM, UNORM, 2_10_10_10, 1), - SAMPLER_VIEW | RENDER_TARGET | SCANOUT }, + A_(C0, C1, C2, C3, UNORM, UNORM, UNORM, UNORM, 2_10_10_10, 0), + SAMPLER_VIEW | RENDER_TARGET | VERTEX_BUFFER | SCANOUT }, [PIPE_FORMAT_B10G10R10A2_UNORM] = { NV50TCL_RT_FORMAT_A2R10G10B10_UNORM, A_(C2, C1, C0, C3, UNORM, UNORM, UNORM, UNORM, 2_10_10_10, 1), - SAMPLER_VIEW | RENDER_TARGET }, + SAMPLER_VIEW | RENDER_TARGET | VERTEX_BUFFER }, /* DEPTH/STENCIL FORMATS */ @@ -168,6 +168,10 @@ const struct nv50_format nv50_format_table[PIPE_FORMAT_COUNT] = B_(C0, ZERO, ZERO, ONE, UNORM, UNORM, UNORM, UNORM, RGTC1, 0), SAMPLER_VIEW }, + [PIPE_FORMAT_RGTC1_SNORM] = { 0, + B_(C0, ZERO, ZERO, ONE, SNORM, SNORM, SNORM, SNORM, RGTC1, 0), + SAMPLER_VIEW }, + [PIPE_FORMAT_RGTC2_UNORM] = { 0, B_(C0, C1, 
ZERO, ONE, UNORM, UNORM, UNORM, UNORM, RGTC2, 0), SAMPLER_VIEW }, -- cgit v1.2.3 From 582311ca979ac2316807cdffb15e7a25000693f4 Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Mon, 26 Jul 2010 15:06:58 +0200 Subject: nv50: fix for empty BBs --- src/gallium/drivers/nv50/nv50_pc.c | 30 +++++++++++----------------- src/gallium/drivers/nv50/nv50_pc.h | 1 - src/gallium/drivers/nv50/nv50_pc_optimize.c | 31 +++++++++++++++++------------ src/gallium/drivers/nv50/nv50_pc_print.c | 7 ++++--- 4 files changed, 34 insertions(+), 35 deletions(-) (limited to 'src') diff --git a/src/gallium/drivers/nv50/nv50_pc.c b/src/gallium/drivers/nv50/nv50_pc.c index e09f94074d..0e8aadf5a9 100644 --- a/src/gallium/drivers/nv50/nv50_pc.c +++ b/src/gallium/drivers/nv50/nv50_pc.c @@ -254,7 +254,7 @@ nv50_emit_program(struct nv_pc *pc) assert(pc->emit == &code[pc->bin_size / 4]); /* XXX: we can do better than this ... */ - if ((pc->emit[-1] & 3) == 3) { + if ((pc->emit[-2] & 2) || (pc->emit[-1] & 3) == 3) { pc->emit[0] = 0xf0000001; pc->emit[1] = 0xe0000000; pc->bin_size += 8; @@ -347,16 +347,16 @@ nvbb_insert_phi(struct nv_basic_block *b, struct nv_instruction *i) b->entry->prev = i; } else { b->entry = i; - b->exit = i; + b->exit = i; } } else { assert(b->entry); if (b->entry->opcode == NV_OP_PHI) { /* insert after entry */ - assert(b->entry == b->exit); + assert(b->entry == b->exit); b->entry->next = i; i->prev = b->entry; b->entry = i; - b->exit = i; + b->exit = i; } else { /* insert before entry */ assert(b->entry->prev && b->exit); i->next = b->entry; @@ -396,12 +396,9 @@ nv_nvi_delete(struct nv_instruction *nvi) debug_printf("REM: "); nv_print_instruction(nvi); - for (j = 0; j < 4; ++j) { - if (!nvi->src[j]) - break; - --(nvi->src[j]->value->refc); - nvi->src[j] = NULL; - } + for (j = 0; j < 5; ++j) + nv_reference(NULL, &nvi->src[j], NULL); + nv_reference(NULL, &nvi->flags_src, NULL); if (nvi->next) nvi->next->prev = nvi->prev; @@ -414,19 +411,16 @@ nv_nvi_delete(struct 
nv_instruction *nvi) nvi->prev->next = nvi->next; if (nvi == b->entry) { - assert(nvi->opcode != NV_OP_PHI || !nvi->next); - - if (!nvi->next || (nvi->opcode == NV_OP_PHI)) - b->entry = nvi->prev; - else - b->entry = nvi->next; + /* PHIs don't get hooked to b->entry */ + b->entry = nvi->next; + assert(!nvi->prev || nvi->prev->opcode == NV_OP_PHI); } if (nvi == b->phi) { - assert(!nvi->prev); if (nvi->opcode != NV_OP_PHI) - debug_printf("WARN: b->phi points to non-PHI instruction\n"); + debug_printf("NOTE: b->phi points to non-PHI instruction\n"); + assert(!nvi->prev); if (!nvi->next || nvi->next->opcode != NV_OP_PHI) b->phi = NULL; else diff --git a/src/gallium/drivers/nv50/nv50_pc.h b/src/gallium/drivers/nv50/nv50_pc.h index ffcdaf44af..da3f984783 100644 --- a/src/gallium/drivers/nv50/nv50_pc.h +++ b/src/gallium/drivers/nv50/nv50_pc.h @@ -402,7 +402,6 @@ nv_reference(struct nv_pc *pc, struct nv_ref **d, struct nv_value *s) ++(s->refc); } } else { - assert(*d); *d = NULL; } } diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c index 107ef0f4bf..42f3a8634e 100644 --- a/src/gallium/drivers/nv50/nv50_pc_optimize.c +++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c @@ -122,15 +122,29 @@ nvi_isnop(struct nv_instruction *nvi) static void nv_pc_pass_pre_emission(struct nv_pc *pc, struct nv_basic_block *b) { + struct nv_basic_block *in; struct nv_instruction *nvi, *next; int j; uint size, n32 = 0; b->priv = 0; - if (pc->num_blocks) - b->bin_pos = pc->bb_list[pc->num_blocks - 1]->bin_pos + - pc->bb_list[pc->num_blocks - 1]->bin_size; + for (j = pc->num_blocks - 1; j >= 0 && !pc->bb_list[j]->bin_size; --j); + if (j >= 0) { + in = pc->bb_list[j]; + + /* check for no-op branches (BRA $PC+8) */ + if (in->exit && in->exit->opcode == NV_OP_BRA && in->exit->target == b) { + in->bin_size -= 8; + pc->bin_size -= 8; + + for (++j; j < pc->num_blocks; ++j) + pc->bb_list[j]->bin_pos -= 8; + + nv_nvi_delete(in->exit); + } + b->bin_pos = 
in->bin_pos + in->bin_size; + } pc->bb_list[pc->num_blocks++] = b; @@ -183,7 +197,7 @@ nv_pc_pass_pre_emission(struct nv_pc *pc, struct nv_basic_block *b) b->exit->prev->is_long = 1; } } - assert(!b->exit || b->exit->is_long); + assert(!b->entry || (b->exit && b->exit->is_long)); pc->bin_size += b->bin_size *= 4; @@ -194,15 +208,6 @@ nv_pc_pass_pre_emission(struct nv_pc *pc, struct nv_basic_block *b) if (!b->out[1] && ++(b->out[0]->priv) != b->out[0]->num_in) return; -#if 0 - /* delete ELSE branch */ - if (b->entry && - b->entry->opcode == NV_OP_BRA && b->entry->target == b->out[0]) { - nv_nvi_delete(b->entry); - b->bin_size -= 2; - pc->bin_size -= 8; - } -#endif for (j = 0; j < 2; ++j) if (b->out[j] && b->out[j] != b) nv_pc_pass_pre_emission(pc, b->out[j]); diff --git a/src/gallium/drivers/nv50/nv50_pc_print.c b/src/gallium/drivers/nv50/nv50_pc_print.c index 82080779c3..c2c3eb25bc 100644 --- a/src/gallium/drivers/nv50/nv50_pc_print.c +++ b/src/gallium/drivers/nv50/nv50_pc_print.c @@ -290,6 +290,9 @@ nv_print_instruction(struct nv_instruction *i) */ if (i->def[0]) nv_print_value(i->def[0], NULL, NV_TYPE_ANY); + else + if (i->target) + PRINT(" %s(BB:%i)", orng, i->target->id); else PRINT(" #"); @@ -304,7 +307,5 @@ nv_print_instruction(struct nv_instruction *i) (j == nv50_indirect_opnd(i)) ? i->src[4]->value : NULL); } - if (!i->is_long) - PRINT(" %ss", norm); - PRINT("\n"); + PRINT(" %s%c\n", norm, i->is_long ? 'l' : 's'); } -- cgit v1.2.3 From 5de5e4fd5c7c6d55e9b3aadbaae0ca34e2662e2c Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Tue, 27 Jul 2010 17:56:13 +0200 Subject: nv50: insert MOVs also for PHI sources from dominating block Otherwise we get live range conflicts for operands that are written only in e.g. an ELSE block but not the IF block. 
--- src/gallium/drivers/nv50/nv50_pc_print.c | 12 ++++--- src/gallium/drivers/nv50/nv50_pc_regalloc.c | 53 ++++++++++++++--------------- 2 files changed, 32 insertions(+), 33 deletions(-) (limited to 'src') diff --git a/src/gallium/drivers/nv50/nv50_pc_print.c b/src/gallium/drivers/nv50/nv50_pc_print.c index c2c3eb25bc..c812dbd066 100644 --- a/src/gallium/drivers/nv50/nv50_pc_print.c +++ b/src/gallium/drivers/nv50/nv50_pc_print.c @@ -181,9 +181,11 @@ nv_print_address(const char c, int buf, struct nv_value *a, int offset) static INLINE void nv_print_cond(struct nv_instruction *nvi) { - PRINT("%s%s %s$c%i ", + char pfx = nv_value_allocated(nvi->flags_src->value->join) ? '$' : '%'; + + PRINT("%s%s %s%cc%i ", gree, nv_cond_name(nvi->cc), - mgta, nv_value_id(nvi->flags_src->value)); + mgta, pfx, nv_value_id(nvi->flags_src->value)); } static INLINE void @@ -197,8 +199,8 @@ nv_print_value(struct nv_value *value, struct nv_value *ind, ubyte type) if (value->reg.file != NV_FILE_FLAGS) PRINT(" %s%s", gree, nv_type_name(type)); - if (!nv_value_allocated(value)) - reg_pfx = nv_value_allocated(value->join) ? '&' : '%'; + if (!nv_value_allocated(value->join)) + reg_pfx = '%'; switch (value->reg.file) { case NV_FILE_GPR: @@ -301,7 +303,7 @@ nv_print_instruction(struct nv_instruction *i) continue; if (i->src[j]->mod) - PRINT(" %s", nv_modifier_string(i->src[j]->mod)); + PRINT(" %s%s", gree, nv_modifier_string(i->src[j]->mod)); nv_print_ref(i->src[j], (j == nv50_indirect_opnd(i)) ? 
diff --git a/src/gallium/drivers/nv50/nv50_pc_regalloc.c b/src/gallium/drivers/nv50/nv50_pc_regalloc.c index 3cec219d1a..568384fd82 100644 --- a/src/gallium/drivers/nv50/nv50_pc_regalloc.c +++ b/src/gallium/drivers/nv50/nv50_pc_regalloc.c @@ -56,6 +56,25 @@ struct nv_pc_pass { uint pass_seq; }; +/* check if bf (future) can be reached from bp (past) */ +static boolean +bb_reachable_by(struct nv_basic_block *bf, struct nv_basic_block *bp, + struct nv_basic_block *bt) +{ + if (bf == bp) + return TRUE; + if (bp == bt) + return FALSE; + + if (bp->out[0] && bp->out[0] != bp && + bb_reachable_by(bf, bp->out[0], bt)) + return TRUE; + if (bp->out[1] && bp->out[1] != bp && + bb_reachable_by(bf, bp->out[1], bt)) + return TRUE; + return FALSE; +} + static void ranges_coalesce(struct nv_range *range) { @@ -422,7 +441,7 @@ pass_generate_phi_movs(struct nv_pc_pass *ctx, struct nv_basic_block *b) if (!i->src[j]) j = 3; else - if (i->src[j]->value->insn->bb == p) + if (bb_reachable_by(pn, i->src[j]->value->insn->bb, b)) break; } if (j >= 4) @@ -580,25 +599,6 @@ live_set_test(struct nv_basic_block *b, struct nv_ref *ref) return b->live_set[n / 32] & (1 << (n % 32)); } -/* check if bf (future) can be reached from bp (past) */ -static boolean -bb_reachable_by(struct nv_basic_block *bf, struct nv_basic_block *bp, - struct nv_basic_block *bt) -{ - if (bf == bp) - return TRUE; - if (bp == bt) - return FALSE; - - if (bp->out[0] && bp->out[0] != bp && - bb_reachable_by(bf, bp->out[0], bt)) - return TRUE; - if (bp->out[1] && bp->out[1] != bp && - bb_reachable_by(bf, bp->out[1], bt)) - return TRUE; - return FALSE; -} - /* The live set of a block contains those values that are live immediately * before the beginning of the block. 
*/ @@ -918,12 +918,6 @@ pass_linear_scan(struct nv_pc_pass *ctx, int iter) return 0; } -static int -pass_eliminate_moves(struct nv_pc_pass *ctx) -{ - return 0; -} - int nv_pc_exec_pass1(struct nv_pc *pc) { @@ -971,6 +965,11 @@ nv_pc_exec_pass1(struct nv_pc *pc) goto out; } +#ifdef NV50_RA_DEBUG_LIVEI + for (i = 0; i < pc->num_values; ++i) + livei_print(&pc->values[i]); +#endif + for (i = 0; i < 2; ++i) { ret = pass_join_values(ctx, i); if (ret) @@ -981,8 +980,6 @@ nv_pc_exec_pass1(struct nv_pc *pc) } assert(!ret && "joining"); - ret = pass_eliminate_moves(ctx); - for (i = 0; i < pc->num_values; ++i) livei_release(&pc->values[i]); -- cgit v1.2.3 From 5705b45b6a050f908120779e6049853931a8025a Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Tue, 27 Jul 2010 18:25:37 +0200 Subject: nv50: explicitly set src type for SET ops Need to do this more nicely for all ops. --- src/gallium/drivers/nv50/nv50_tgsi_to_nc.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'src') diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c index da7fe746f4..aafb5e8295 100644 --- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c +++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c @@ -20,6 +20,8 @@ * SOFTWARE. 
*/ +/* XXX: need to clean this up so we get the typecasting right more naturally */ + #include #include "nv50_context.h" @@ -1173,6 +1175,10 @@ bld_instruction(struct bld_context *bld, dst0[c]->insn->set_cond = translate_setcc(insn->Instruction.Opcode); dst0[c]->reg.type = infer_dst_type(insn->Instruction.Opcode); + dst0[c]->insn->src[0]->typecast = + dst0[c]->insn->src[1]->typecast = + infer_src_type(insn->Instruction.Opcode); + if (dst0[c]->reg.type != NV_TYPE_F32) break; dst0[c] = bld_insn_1(bld, NV_OP_ABS, dst0[c]); -- cgit v1.2.3 From fa67cabe7a9f1343e96c7c8a105e82dc05e3de44 Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Sat, 31 Jul 2010 17:52:54 +0200 Subject: nv50: fixes for nested IFs --- src/gallium/drivers/nv50/nv50_pc.c | 15 +++ src/gallium/drivers/nv50/nv50_pc.h | 1 + src/gallium/drivers/nv50/nv50_pc_optimize.c | 1 + src/gallium/drivers/nv50/nv50_pc_regalloc.c | 175 +++++++++++++++++----------- src/gallium/drivers/nv50/nv50_tgsi_to_nc.c | 4 - 5 files changed, 127 insertions(+), 69 deletions(-) (limited to 'src') diff --git a/src/gallium/drivers/nv50/nv50_pc.c b/src/gallium/drivers/nv50/nv50_pc.c index 0e8aadf5a9..614982db2d 100644 --- a/src/gallium/drivers/nv50/nv50_pc.c +++ b/src/gallium/drivers/nv50/nv50_pc.c @@ -464,3 +464,18 @@ void nvbb_attach_block(struct nv_basic_block *parent, struct nv_basic_block *b) b->in[b->num_in++] = parent; } + +int +nvbb_dominated_by(struct nv_basic_block *b, struct nv_basic_block *d) +{ + int j, n; + + if (b == d) + return 1; + + n = 0; + for (j = 0; j < b->num_in; ++j) + n += nvbb_dominated_by(b->in[j], d); + + return n && (n == b->num_in); +} diff --git a/src/gallium/drivers/nv50/nv50_pc.h b/src/gallium/drivers/nv50/nv50_pc.h index da3f984783..4b191c508a 100644 --- a/src/gallium/drivers/nv50/nv50_pc.h +++ b/src/gallium/drivers/nv50/nv50_pc.h @@ -425,6 +425,7 @@ int nv_nvi_refcount(struct nv_instruction *); void nv_nvi_delete(struct nv_instruction *); void nv_nvi_permute(struct nv_instruction *, struct 
nv_instruction *); void nvbb_attach_block(struct nv_basic_block *parent, struct nv_basic_block *); +int nvbb_dominated_by(struct nv_basic_block *, struct nv_basic_block *); int nvcg_replace_value(struct nv_pc *pc, struct nv_value *old_val, struct nv_value *new_val); diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c index 42f3a8634e..1f2f1630f4 100644 --- a/src/gallium/drivers/nv50/nv50_pc_optimize.c +++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c @@ -815,6 +815,7 @@ nv_pc_exec_pass0(struct nv_pc *pc) struct nv_pass_dce dce; int ret; + pass.n = 0; pass.pc = pc; pc->pass_seq++; diff --git a/src/gallium/drivers/nv50/nv50_pc_regalloc.c b/src/gallium/drivers/nv50/nv50_pc_regalloc.c index 568384fd82..941ec9f6f8 100644 --- a/src/gallium/drivers/nv50/nv50_pc_regalloc.c +++ b/src/gallium/drivers/nv50/nv50_pc_regalloc.c @@ -20,19 +20,6 @@ * SOFTWARE. */ -/* - * XXX: phi function live intervals start at first ordinary instruction, - * add_range should be taking care of that already ... - * - * XXX: TEX must choose TEX's def as representative - * - * XXX: Aieee! Must materialize MOVs if source is in other basic block! - * -- absolutely, or we cannot execute the MOV conditionally at all - * XXX: Aieee! Must include PHIs in LVA so we pull through liveness if - * PHI source is e.g. in dominator block. - * -- seems we lose liveness somehow, track that - */ - #include "nv50_context.h" #include "nv50_pc.h" @@ -143,7 +130,6 @@ add_range(struct nv_value *val, struct nv_basic_block *b, int end) bgn = val->insn->serial; if (bgn < b->entry->serial || bgn > b->exit->serial) bgn = b->entry->serial; - // debug_printf("add_range(value %i): [%i, %i)\n", val->n, bgn, end); if (bgn > end) { debug_printf("Aieee! 
BLOCK [%i, %i], RANGE [%i, %i)\n", @@ -391,25 +377,45 @@ try_join_values(struct nv_pc_pass *ctx, struct nv_value *a, struct nv_value *b) do_join_values(ctx, a, b); } -/* For each operand of each phi in b, generate a new value by inserting a MOV - * at the end of the block it is coming from and replace the operand with it. - * This eliminates liveness conflicts. +/* For phi functions with sources from blocks that are not direct predecessors, + * if such a source is to be used in an earlier predecessor, we need to add an + * additional phi function. Used when inserting the MOVs below. + */ +static struct nv_value * +propagate_phi(struct nv_pc *pc, struct nv_instruction *phi, int s) +{ + struct nv_basic_block *b = pc->current_block; + struct nv_value *val = phi->src[s]->value; + struct nv_instruction *nvi = new_instruction(pc, NV_OP_PHI); + int i, k; + + (nvi->def[0] = new_value(pc, val->reg.file, val->reg.type))->insn = nvi; + + for (k = 0, i = 0; i < 4 && phi->src[i]; ++i) { + if (bb_reachable_by(b, phi->src[i]->value->insn->bb, b)) + nvi->src[k++] = new_ref(pc, phi->src[i]->value); + } + return nvi->def[0]; +} + +/* For IF blocks without ELSE blocks, insert an empty block for the MOVs. + * Insert additional PHIs for cases where a direct MOV wouldn't be valid. 
*/ static int -pass_generate_phi_movs(struct nv_pc_pass *ctx, struct nv_basic_block *b) +pass_generate_phi_movs_1(struct nv_pc_pass *ctx, struct nv_basic_block *b) { - struct nv_instruction *i, *i2; - struct nv_basic_block *p, *pn; + struct nv_instruction *i, *ni; struct nv_value *val; + struct nv_basic_block *p, *pn; int n, j; b->pass_seq = ctx->pc->pass_seq; for (n = 0; n < b->num_in; ++n) { - p = b->in[n]; + p = pn = b->in[n]; assert(p); - if (b->num_in > 1 && p->out[0] && p->out[1]) { /* if without else */ + if (b->num_in > 1 && p->out[0] && p->out[1]) { pn = new_basic_block(ctx->pc); if (p->out[0] == b) @@ -426,58 +432,99 @@ pass_generate_phi_movs(struct nv_pc_pass *ctx, struct nv_basic_block *b) break; } } - pn->out[0] = b; pn->in[0] = p; pn->num_in = 1; - } else - pn = p; + } ctx->pc->current_block = pn; - /* every block with PHIs will also have other operations */ for (i = b->phi; i && i->opcode == NV_OP_PHI; i = i->next) { - for (j = 0; j < 4; ++j) { - if (!i->src[j]) - j = 3; - else + for (j = 0; j < 4 && i->src[j]; ++j) { if (bb_reachable_by(pn, i->src[j]->value->insn->bb, b)) break; } - if (j >= 4) + if (j >= 4 || !i->src[j]) continue; - assert(i->src[j]); val = i->src[j]->value; - /* XXX: should probably not insert this after terminator */ - i2 = new_instruction(ctx->pc, NV_OP_MOV); - - i2->def[0] = new_value(ctx->pc, val->reg.file, val->reg.type); - i2->src[0] = new_ref (ctx->pc, val); - i2->def[0]->insn = i2; - - nv_reference(ctx->pc, &i->src[j], i2->def[0]); + if (!nvbb_dominated_by(pn, val->insn->bb)) + nv_reference(ctx->pc, &i->src[j], propagate_phi(ctx->pc, i, j)); } if (pn != p && pn->exit) { - /* XXX: this branch should probably be eliminated */ ctx->pc->current_block = b->in[n ? 
0 : 1]; - i2 = new_instruction(ctx->pc, NV_OP_BRA); - i2->target = b; - i2->is_terminator = 1; + ni = new_instruction(ctx->pc, NV_OP_BRA); + ni->target = b; + ni->is_terminator = 1; } } - if (b->out[0] && b->out[0]->pass_seq < ctx->pc->pass_seq) { - pass_generate_phi_movs(ctx, b->out[0]); - } + for (j = 0; j < 2; ++j) + if (b->out[j] && b->out[j]->pass_seq < ctx->pc->pass_seq) + pass_generate_phi_movs_1(ctx, b->out[j]); + + return 0; +} + +/* Now everything should be in order and we can insert the MOVs. */ +static int +pass_generate_phi_movs_2(struct nv_pc_pass *ctx, struct nv_basic_block *b) +{ + struct nv_instruction *i, *mov; + struct nv_value *val; + struct nv_basic_block *p; + int n, j; + + b->pass_seq = ctx->pc->pass_seq; + + for (n = 0; n < b->num_in; ++n) { + ctx->pc->current_block = p = b->in[n]; + + for (i = b->phi; i && i->opcode == NV_OP_PHI; i = i->next) { + for (j = 0; j < 4 && i->src[j]; ++j) { + if (bb_reachable_by(p, i->src[j]->value->insn->bb, b)) + break; + } + if (j >= 4 || !i->src[j]) + continue; + val = i->src[j]->value; + + mov = new_instruction(ctx->pc, NV_OP_MOV); + + /* TODO: insert instruction at correct position in the first place */ + if (mov->prev && mov->prev->target) + nv_nvi_permute(mov->prev, mov); + + mov->def[0] = new_value(ctx->pc, val->reg.file, val->reg.type); + mov->def[0]->insn = mov; + mov->src[0] = new_ref(ctx->pc, val); - if (b->out[1] && b->out[1]->pass_seq < ctx->pc->pass_seq) { - pass_generate_phi_movs(ctx, b->out[1]); + nv_reference(ctx->pc, &i->src[j], mov->def[0]); + } } + for (j = 1; j >= 0; --j) /* different order for the sake of diversity */ + if (b->out[j] && b->out[j]->pass_seq < ctx->pc->pass_seq) + pass_generate_phi_movs_2(ctx, b->out[j]); + return 0; } +/* For each operand of each PHI in b, generate a new value by inserting a MOV + * at the end of the block it is coming from and replace the operand with its + * result. 
This eliminates liveness conflicts and enables us to let values be + * copied to the right register if such a conflict exists nonetheless. + */ +static INLINE int +pass_generate_phi_movs(struct nv_pc_pass *ctx, struct nv_basic_block *b) +{ + if (pass_generate_phi_movs_1(ctx, b)) + return 1; + + ++ctx->pc->pass_seq; + return pass_generate_phi_movs_2(ctx, b); +} + static int pass_join_values(struct nv_pc_pass *ctx, int iter) { @@ -525,6 +572,7 @@ pass_join_values(struct nv_pc_pass *ctx, int iter) return 0; } +/* Order the instructions so that live intervals can be expressed in numbers. */ static int pass_order_instructions(struct nv_pc_pass *ctx, struct nv_basic_block *b) { @@ -560,7 +608,7 @@ bb_live_set_print(struct nv_pc *pc, struct nv_basic_block *b) int j; struct nv_value *val; - debug_printf("live_set of %p: ", b); + debug_printf("LIVE-INs of BB:%i: ", b->id); for (j = 0; j < pc->num_values; ++j) { if (!(b->live_set[j / 32] & (1 << (j % 32)))) @@ -579,16 +627,12 @@ live_set_add(struct nv_basic_block *b, struct nv_value *val) { if (!val->insn) /* don't add non-def values */ return; - /* debug_printf("live[%p] <- %i\n", b, val->n); */ - b->live_set[val->n / 32] |= 1 << (val->n % 32); } static INLINE void live_set_rem(struct nv_basic_block *b, struct nv_value *val) { - /* if (val->insn) - debug_printf("live[%p] -> %i\n", b, val->n); */ b->live_set[val->n / 32] &= ~(1 << (val->n % 32)); } @@ -600,7 +644,7 @@ live_set_test(struct nv_basic_block *b, struct nv_ref *ref) } /* The live set of a block contains those values that are live immediately - * before the beginning of the block. + * before the beginning of the block, so do a backwards scan. 
*/ static int pass_build_live_sets(struct nv_pc_pass *ctx, struct nv_basic_block *b) @@ -608,6 +652,14 @@ pass_build_live_sets(struct nv_pc_pass *ctx, struct nv_basic_block *b) struct nv_instruction *i; int j, n, ret = 0; + debug_printf("pass_build_live_sets BB:%i\n", b->id); + + if (b->pass_seq >= ctx->pc->pass_seq) { + debug_printf("already visited\n"); + return 0; + } + b->pass_seq = ctx->pc->pass_seq; + /* slight hack for undecidedness: set phi = entry if it's undefined */ if (!b->phi) b->phi = b->entry; @@ -638,23 +690,18 @@ pass_build_live_sets(struct nv_pc_pass *ctx, struct nv_basic_block *b) if (bb_reachable_by(b, i->src[j]->value->insn->bb, b->out[n])) { live_set_add(b, i->src[j]->value); - debug_printf("%p: live set + %i\n", b, i->src[j]->value->n); + debug_printf("BB:%i liveset + %i\n", b->id, i->src[j]->value->n); } else { live_set_rem(b, i->src[j]->value); - debug_printf("%p: live set - %i\n", b, i->src[j]->value->n); + debug_printf("BB:%i liveset - %i\n", b->id, i->src[j]->value->n); } } } } - if (b->pass_seq >= ctx->pc->pass_seq) - return 0; - b->pass_seq = ctx->pc->pass_seq; - - debug_printf("%s: visiting block %p\n", __FUNCTION__, b); - if (!b->entry) return 0; + bb_live_set_print(ctx->pc, b); for (i = b->exit; i; i = i->prev) { @@ -786,8 +833,6 @@ pass_build_intervals(struct nv_pc_pass *ctx, struct nv_basic_block *b) if (b->out[1] && b->out[1]->pass_seq < ctx->pc->pass_seq) pass_build_intervals(ctx, b->out[1]); - debug_printf("built intervals for block %p\n", b); - return 0; } diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c index aafb5e8295..8846ef08b5 100644 --- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c +++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c @@ -212,15 +212,11 @@ bld_imm_u32(struct bld_context *bld, uint32_t u) int i; unsigned n = bld->num_immds; - debug_printf("bld_imm_u32: 0x%08x\n", u); - for (i = 0; i < n; ++i) if (bld->saved_immd[i]->reg.imm.u32 == u) return 
bld->saved_immd[i]; assert(n < BLD_MAX_IMMDS); - debug_printf("need new one\n"); - bld->num_immds++; bld->saved_immd[n] = new_value(bld->pc, NV_FILE_IMM, NV_TYPE_U32); -- cgit v1.2.3 From 2c695d38e6b194572becf82300fba5e34b1fd7d7 Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Sat, 31 Jul 2010 20:56:42 +0200 Subject: nv50: don't eliminate loads to dedicated values --- src/gallium/drivers/nv50/nv50_pc_optimize.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'src') diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c index 1f2f1630f4..324f8bb2da 100644 --- a/src/gallium/drivers/nv50/nv50_pc_optimize.c +++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c @@ -639,12 +639,10 @@ nv_pass_reload_elim(struct nv_pass_reld_elim *ctx, struct nv_basic_block *b) break; if (it) { -#if 1 - nvcg_replace_value(ctx->pc, ld->def[0], it->value); -#else - ld->opcode = NV_OP_MOV; - nv_reference(ctx->pc, &ld->src[0], it->value); -#endif + if (ld->def[0]->reg.id >= 0) + it->value = ld->def[0]; + else + nvcg_replace_value(ctx->pc, ld->def[0], it->value); } else { if (ctx->alloc == LOAD_RECORD_POOL_SIZE) continue; -- cgit v1.2.3 From 720e0c430d0a66cbf5adfcf40030f27e55ad6c6a Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Sat, 31 Jul 2010 21:30:35 +0200 Subject: nv50: fix constbuf validation We only uploaded up to the highest offset a program would use, and if the constant buffer isn't changed when a new program is used, the new program is missing the rest of them. Might want to introduce a "fill state" for user mem constbufs. 
--- src/gallium/drivers/nv50/nv50_shader_state.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'src') diff --git a/src/gallium/drivers/nv50/nv50_shader_state.c b/src/gallium/drivers/nv50/nv50_shader_state.c index f7e6355286..3d5df596ef 100644 --- a/src/gallium/drivers/nv50/nv50_shader_state.c +++ b/src/gallium/drivers/nv50/nv50_shader_state.c @@ -44,7 +44,7 @@ nv50_transfer_constbuf(struct nv50_context *nv50, if (!map) return; - count = MIN2(buf->width0, size); + count = buf->width0; /* MIN2(buf->width0, size); */ start = 0; while (count) { @@ -92,8 +92,13 @@ nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p) } } + /* If the state tracker doesn't change the constbuf, and it is first + * validated with a program that doesn't use it, this check prevents + * it from even being uploaded. */ + /* if (p->parm_size == 0) return; + */ switch (p->type) { case PIPE_SHADER_VERTEX: -- cgit v1.2.3 From aaa8802a22d83fd89d7e306b7d03fa587a19aa0a Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Thu, 5 Aug 2010 00:11:56 +0200 Subject: nv50: build proper phi functions in the first place --- src/gallium/drivers/nv50/nv50_pc.c | 39 +++++++- src/gallium/drivers/nv50/nv50_pc.h | 3 + src/gallium/drivers/nv50/nv50_pc_optimize.c | 4 + src/gallium/drivers/nv50/nv50_pc_regalloc.c | 140 +++++----------------------- src/gallium/drivers/nv50/nv50_tgsi_to_nc.c | 127 +++++++++++++++++++------ 5 files changed, 166 insertions(+), 147 deletions(-) (limited to 'src') diff --git a/src/gallium/drivers/nv50/nv50_pc.c b/src/gallium/drivers/nv50/nv50_pc.c index 614982db2d..e32d28a9ce 100644 --- a/src/gallium/drivers/nv50/nv50_pc.c +++ b/src/gallium/drivers/nv50/nv50_pc.c @@ -394,7 +394,7 @@ nv_nvi_delete(struct nv_instruction *nvi) struct nv_basic_block *b = nvi->bb; int j; - debug_printf("REM: "); nv_print_instruction(nvi); + /* debug_printf("REM: "); nv_print_instruction(nvi); */ for (j = 0; j < 5; ++j) nv_reference(NULL, &nvi->src[j], NULL); @@ 
-477,5 +477,40 @@ nvbb_dominated_by(struct nv_basic_block *b, struct nv_basic_block *d) for (j = 0; j < b->num_in; ++j) n += nvbb_dominated_by(b->in[j], d); - return n && (n == b->num_in); + return (n && (n == b->num_in)) ? 1 : 0; +} + +/* check if bf (future) can be reached from bp (past) */ +boolean +nvbb_reachable_by(struct nv_basic_block *bf, struct nv_basic_block *bp, + struct nv_basic_block *bt) +{ + if (bf == bp) + return TRUE; + if (bp == bt) + return FALSE; + + if (bp->out[0] && bp->out[0] != bp && + nvbb_reachable_by(bf, bp->out[0], bt)) + return TRUE; + if (bp->out[1] && bp->out[1] != bp && + nvbb_reachable_by(bf, bp->out[1], bt)) + return TRUE; + return FALSE; +} + +struct nv_basic_block * +nvbb_dom_frontier(struct nv_basic_block *b) +{ + struct nv_basic_block *df = b->out[0]; + + assert(df); + while (nvbb_dominated_by(df, b) || + (!nvbb_dominated_by(df->in[0], b) && + (!df->in[1] || !nvbb_dominated_by(df->in[1], b)))) { + df = df->out[0]; + assert(df); + } + assert(df); + return df; } diff --git a/src/gallium/drivers/nv50/nv50_pc.h b/src/gallium/drivers/nv50/nv50_pc.h index 4b191c508a..987043c7a0 100644 --- a/src/gallium/drivers/nv50/nv50_pc.h +++ b/src/gallium/drivers/nv50/nv50_pc.h @@ -426,6 +426,9 @@ void nv_nvi_delete(struct nv_instruction *); void nv_nvi_permute(struct nv_instruction *, struct nv_instruction *); void nvbb_attach_block(struct nv_basic_block *parent, struct nv_basic_block *); int nvbb_dominated_by(struct nv_basic_block *, struct nv_basic_block *); +boolean nvbb_reachable_by(struct nv_basic_block *, struct nv_basic_block *, + struct nv_basic_block *); +struct nv_basic_block *nvbb_dom_frontier(struct nv_basic_block *); int nvcg_replace_value(struct nv_pc *pc, struct nv_value *old_val, struct nv_value *new_val); diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c index 324f8bb2da..f2f8d0eaa3 100644 --- a/src/gallium/drivers/nv50/nv50_pc_optimize.c +++ 
b/src/gallium/drivers/nv50/nv50_pc_optimize.c @@ -771,6 +771,10 @@ nv_pass_cse(struct nv_pass *ctx, struct nv_basic_block *b) if (ik->src[4] || ir->src[4]) continue; /* don't mess with address registers */ + if (ik->flags_src || ir->flags_src || + ik->flags_def || ir->flags_def) + continue; /* and also not with flags, for now */ + for (s = 0; s < 3; ++s) { struct nv_value *a, *b; diff --git a/src/gallium/drivers/nv50/nv50_pc_regalloc.c b/src/gallium/drivers/nv50/nv50_pc_regalloc.c index 941ec9f6f8..172e44f62b 100644 --- a/src/gallium/drivers/nv50/nv50_pc_regalloc.c +++ b/src/gallium/drivers/nv50/nv50_pc_regalloc.c @@ -43,25 +43,6 @@ struct nv_pc_pass { uint pass_seq; }; -/* check if bf (future) can be reached from bp (past) */ -static boolean -bb_reachable_by(struct nv_basic_block *bf, struct nv_basic_block *bp, - struct nv_basic_block *bt) -{ - if (bf == bp) - return TRUE; - if (bp == bt) - return FALSE; - - if (bp->out[0] && bp->out[0] != bp && - bb_reachable_by(bf, bp->out[0], bt)) - return TRUE; - if (bp->out[1] && bp->out[1] != bp && - bb_reachable_by(bf, bp->out[1], bt)) - return TRUE; - return FALSE; -} - static void ranges_coalesce(struct nv_range *range) { @@ -377,32 +358,13 @@ try_join_values(struct nv_pc_pass *ctx, struct nv_value *a, struct nv_value *b) do_join_values(ctx, a, b); } -/* For phi functions with sources from blocks that are not direct predecessors, - * if such a source is to be used in an earlier predecessor, we need to add an - * additional phi function. Used when inserting the MOVs below. 
- */ -static struct nv_value * -propagate_phi(struct nv_pc *pc, struct nv_instruction *phi, int s) -{ - struct nv_basic_block *b = pc->current_block; - struct nv_value *val = phi->src[s]->value; - struct nv_instruction *nvi = new_instruction(pc, NV_OP_PHI); - int i, k; - - (nvi->def[0] = new_value(pc, val->reg.file, val->reg.type))->insn = nvi; - - for (k = 0, i = 0; i < 4 && phi->src[i]; ++i) { - if (bb_reachable_by(b, phi->src[i]->value->insn->bb, b)) - nvi->src[k++] = new_ref(pc, phi->src[i]->value); - } - return nvi->def[0]; -} - -/* For IF blocks without ELSE blocks, insert an empty block for the MOVs. - * Insert additional PHIs for cases where a direct MOV wouldn't be valid. +/* For each operand of each PHI in b, generate a new value by inserting a MOV + * at the end of the block it is coming from and replace the operand with its + * result. This eliminates liveness conflicts and enables us to let values be + * copied to the right register if such a conflict exists nonetheless. */ static int -pass_generate_phi_movs_1(struct nv_pc_pass *ctx, struct nv_basic_block *b) +pass_generate_phi_movs(struct nv_pc_pass *ctx, struct nv_basic_block *b) { struct nv_instruction *i, *ni; struct nv_value *val; @@ -426,31 +388,36 @@ pass_generate_phi_movs_1(struct nv_pc_pass *ctx, struct nv_basic_block *b) if (p->exit->target == b) /* target to new else-block */ p->exit->target = pn; - for (j = 0; j < b->num_in; ++j) { - if (b->in[j] == p) { - b->in[j] = pn; - break; - } - } + b->in[n] = pn; + pn->out[0] = b; pn->in[0] = p; pn->num_in = 1; } - ctx->pc->current_block = pn; for (i = b->phi; i && i->opcode == NV_OP_PHI; i = i->next) { for (j = 0; j < 4 && i->src[j]; ++j) { - if (bb_reachable_by(pn, i->src[j]->value->insn->bb, b)) + if (nvbb_reachable_by(p, i->src[j]->value->insn->bb, b)) break; } if (j >= 4 || !i->src[j]) continue; val = i->src[j]->value; - if (!nvbb_dominated_by(pn, val->insn->bb)) - nv_reference(ctx->pc, &i->src[j], propagate_phi(ctx->pc, i, j)); + ni = 
new_instruction(ctx->pc, NV_OP_MOV); + + /* TODO: insert instruction at correct position in the first place */ + if (ni->prev && ni->prev->target) + nv_nvi_permute(ni->prev, ni); + + ni->def[0] = new_value(ctx->pc, val->reg.file, val->reg.type); + ni->def[0]->insn = ni; + ni->src[0] = new_ref(ctx->pc, val); + + nv_reference(ctx->pc, &i->src[j], ni->def[0]); } + if (pn != p && pn->exit) { ctx->pc->current_block = b->in[n ? 0 : 1]; ni = new_instruction(ctx->pc, NV_OP_BRA); @@ -461,70 +428,11 @@ pass_generate_phi_movs_1(struct nv_pc_pass *ctx, struct nv_basic_block *b) for (j = 0; j < 2; ++j) if (b->out[j] && b->out[j]->pass_seq < ctx->pc->pass_seq) - pass_generate_phi_movs_1(ctx, b->out[j]); + pass_generate_phi_movs(ctx, b->out[j]); return 0; } -/* Now everything should be in order and we can insert the MOVs. */ -static int -pass_generate_phi_movs_2(struct nv_pc_pass *ctx, struct nv_basic_block *b) -{ - struct nv_instruction *i, *mov; - struct nv_value *val; - struct nv_basic_block *p; - int n, j; - - b->pass_seq = ctx->pc->pass_seq; - - for (n = 0; n < b->num_in; ++n) { - ctx->pc->current_block = p = b->in[n]; - - for (i = b->phi; i && i->opcode == NV_OP_PHI; i = i->next) { - for (j = 0; j < 4 && i->src[j]; ++j) { - if (bb_reachable_by(p, i->src[j]->value->insn->bb, b)) - break; - } - if (j >= 4 || !i->src[j]) - continue; - val = i->src[j]->value; - - mov = new_instruction(ctx->pc, NV_OP_MOV); - - /* TODO: insert instruction at correct position in the first place */ - if (mov->prev && mov->prev->target) - nv_nvi_permute(mov->prev, mov); - - mov->def[0] = new_value(ctx->pc, val->reg.file, val->reg.type); - mov->def[0]->insn = mov; - mov->src[0] = new_ref(ctx->pc, val); - - nv_reference(ctx->pc, &i->src[j], mov->def[0]); - } - } - - for (j = 1; j >= 0; --j) /* different order for the sake of diversity */ - if (b->out[j] && b->out[j]->pass_seq < ctx->pc->pass_seq) - pass_generate_phi_movs_2(ctx, b->out[j]); - - return 0; -} - -/* For each operand of each PHI in b, 
generate a new value by inserting a MOV - * at the end of the block it is coming from and replace the operand with its - * result. This eliminates liveness conflicts and enables us to let values be - * copied to the right register if such a conflict exists nonetheless. - */ -static INLINE int -pass_generate_phi_movs(struct nv_pc_pass *ctx, struct nv_basic_block *b) -{ - if (pass_generate_phi_movs_1(ctx, b)) - return 1; - - ++ctx->pc->pass_seq; - return pass_generate_phi_movs_2(ctx, b); -} - static int pass_join_values(struct nv_pc_pass *ctx, int iter) { @@ -688,7 +596,7 @@ pass_build_live_sets(struct nv_pc_pass *ctx, struct nv_basic_block *b) break; assert(i->src[j]->value->insn); - if (bb_reachable_by(b, i->src[j]->value->insn->bb, b->out[n])) { + if (nvbb_reachable_by(b, i->src[j]->value->insn->bb, b->out[n])) { live_set_add(b, i->src[j]->value); debug_printf("BB:%i liveset + %i\n", b->id, i->src[j]->value->n); } else { @@ -774,7 +682,7 @@ pass_build_intervals(struct nv_pc_pass *ctx, struct nv_basic_block *b) if (!i->src[s]) break; assert(i->src[s]->value->insn); - if (bb_reachable_by(b, i->src[s]->value->insn->bb, b->out[j])) + if (nvbb_reachable_by(b, i->src[s]->value->insn->bb, b->out[j])) live_set_add(b, i->src[s]->value); else live_set_rem(b, i->src[s]->value); @@ -978,7 +886,7 @@ nv_pc_exec_pass1(struct nv_pc *pc) nv_print_program(ctx->pc->root); - ctx->insns = CALLOC(pc->num_instructions, sizeof(struct nv_instruction *)); + ctx->insns = CALLOC(NV_PC_MAX_INSTRUCTIONS, sizeof(struct nv_instruction *)); pc->pass_seq++; ret = pass_generate_phi_movs(ctx, pc->root); diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c index 8846ef08b5..6a9259c898 100644 --- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c +++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c @@ -51,16 +51,22 @@ struct bld_value_stack { }; static INLINE void -bld_push_value(struct bld_value_stack *stk) +bld_vals_push_val(struct bld_value_stack *stk, struct 
nv_value *val) { - assert(!stk->size || (stk->body[stk->size - 1] != stk->top)); + assert(!stk->size || (stk->body[stk->size - 1] != val)); if (!(stk->size % 8)) { unsigned old_sz = (stk->size + 0) * sizeof(struct nv_value *); unsigned new_sz = (stk->size + 8) * sizeof(struct nv_value *); stk->body = (struct nv_value **)REALLOC(stk->body, old_sz, new_sz); } - stk->body[stk->size++] = stk->top; + stk->body[stk->size++] = val; +} + +static INLINE void +bld_vals_push(struct bld_value_stack *stk) +{ + bld_vals_push_val(stk, stk->top); stk->top = NULL; } @@ -72,7 +78,7 @@ bld_push_values(struct bld_value_stack *stacks, int n) for (i = 0; i < n; ++i) for (c = 0; c < 4; ++c) if (stacks[i * 4 + c].top) - bld_push_value(&stacks[i * 4 + c]); + bld_vals_push(&stacks[i * 4 + c]); } #define FETCH_TEMP(i, c) (bld->tvs[i][c].top) @@ -121,6 +127,17 @@ struct bld_context { uint num_immds; }; +static INLINE void +bld_warn_uninitialized(struct bld_context *bld, int kind, + struct bld_value_stack *stk, struct nv_basic_block *b) +{ + long i = (stk - &bld->tvs[0][0]) / 4; + long c = (stk - &bld->tvs[0][0]) & 3; + + debug_printf("WARNING: TEMP[%li].%li %s used uninitialized in BB:%i\n", + i, c, kind ? 
"may be" : "is", b->id); +} + static INLINE struct nv_value * bld_def(struct nv_instruction *i, int c, struct nv_value *value) { @@ -168,42 +185,91 @@ fetch_by_bb(struct bld_value_stack *stack, fetch_by_bb(stack, vals, n, b->in[i]); } +static INLINE struct nv_value * +bld_load_imm_u32(struct bld_context *bld, uint32_t u); + static struct nv_value * -bld_fetch_global(struct bld_context *bld, struct bld_value_stack *stack) +bld_phi(struct bld_context *bld, struct nv_basic_block *b, + struct bld_value_stack *stack) { - struct nv_value *vals[16], *phi = NULL; - int j, i = 0, n = 0; + struct nv_basic_block *in; + struct nv_value *vals[16], *val; + struct nv_instruction *phi; + int i, j, n; + + do { + i = n = 0; + fetch_by_bb(stack, vals, &n, b); + + if (!n) { + bld_warn_uninitialized(bld, 0, stack, b); + return NULL; + } - fetch_by_bb(stack, vals, &n, bld->pc->current_block); + if (n == 1) { + if (nvbb_dominated_by(b, vals[0]->insn->bb)) + break; - if (n == 0) - return NULL; - if (n == 1) - return vals[0]; + bld_warn_uninitialized(bld, 1, stack, b); + + /* back-tracking to insert missing value of other path */ + in = b; + while (in->in[0]) { + if (in->num_in == 1) { + in = in->in[0]; + } else { + if (!nvbb_reachable_by(in->in[0], vals[0]->insn->bb, b)) { + in = in->in[0]; + break; + } + if (!nvbb_reachable_by(in->in[1], vals[0]->insn->bb, b)) { + in = in->in[1]; + break; + } + in = in->in[0]; + } + } + bld->pc->current_block = in; + + /* should make this a no-op */ + bld_vals_push_val(stack, bld_load_imm_u32(bld, 0)); + continue; + } - debug_printf("phi required: %i candidates\n", n); + for (i = 0; i < n; ++i) { + if (nvbb_dominated_by(b, vals[i]->insn->bb)) + continue; - while (i < n) { - struct nv_instruction *insn = new_instruction(bld->pc, NV_OP_PHI); + for (j = 0; j < b->num_in; ++j) + if (nvbb_dominated_by(b->in[j], vals[i]->insn->bb)) + break; + if (j == b->num_in) { + in = nvbb_dom_frontier(vals[i]->insn->bb); + val = bld_phi(bld, in, stack); + 
bld_vals_push_val(stack, val); + break; + } + } + } while(i < n); - j = phi ? 1 : 0; - if (phi) - insn->src[0] = new_ref(bld->pc, phi); + bld->pc->current_block = b; - phi = new_value(bld->pc, vals[0]->reg.file, vals[0]->reg.type); + if (n == 1) + return vals[0]; - bld_def(insn, 0, phi); + phi = new_instruction(bld->pc, NV_OP_PHI); - for (; j < 4; ++j) { - insn->src[j] = new_ref(bld->pc, vals[i++]); - if (i == n) - break; - } - debug_printf("new phi: %i, %i in\n", phi->n, j); - } + bld_def(phi, 0, new_value(bld->pc, vals[0]->reg.file, vals[0]->reg.type)); + for (i = 0; i < n; ++i) + phi->src[i] = new_ref(bld->pc, vals[i]); - /* insert_at_head(list, phi) is done at end of block */ - return phi; + return phi->def[0]; +} + +static INLINE struct nv_value * +bld_fetch_global(struct bld_context *bld, struct bld_value_stack *stack) +{ + return bld_phi(bld, bld->pc->current_block, stack); } static INLINE struct nv_value * @@ -640,6 +706,9 @@ bld_new_block(struct bld_context *bld, struct nv_basic_block *b) for (i = 0; i < 4; ++i) bld->saved_addr[i][0] = NULL; + + for (i = 0; i < 128; ++i) + bld->saved_inputs[i] = NULL; } static struct nv_value * -- cgit v1.2.3 From fc1d72d15d929b629be399d977ad05611f01fc59 Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Thu, 5 Aug 2010 12:29:23 +0200 Subject: nv50: fix reg count --- src/gallium/drivers/nv50/nv50_pc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'src') diff --git a/src/gallium/drivers/nv50/nv50_pc.c b/src/gallium/drivers/nv50/nv50_pc.c index e32d28a9ce..ed92261488 100644 --- a/src/gallium/drivers/nv50/nv50_pc.c +++ b/src/gallium/drivers/nv50/nv50_pc.c @@ -312,8 +312,8 @@ nv50_generate_code(struct nv50_translation_info *ti) ti->p->immd_size = pc->immd_count * 4; ti->p->immd = pc->immd_buf; - ti->p->max_gpr = (pc->max_reg[NV_FILE_GPR] + 1) >> 1; - ti->p->max_gpr++; + /* highest 16 bit reg to num of 32 bit regs */ + ti->p->max_gpr = (pc->max_reg[NV_FILE_GPR] >> 1) + 1; ti->p->fixups = pc->fixups; 
ti->p->num_fixups = pc->num_fixups; -- cgit v1.2.3 From 3a68fcfb6b406cf864afbf200e436fc384fd0865 Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Tue, 10 Aug 2010 17:36:25 +0200 Subject: nv50: begin implementing loops --- src/gallium/drivers/nv50/nv50_pc.c | 168 +++++++++++++----- src/gallium/drivers/nv50/nv50_pc.h | 20 ++- src/gallium/drivers/nv50/nv50_pc_emit.c | 2 +- src/gallium/drivers/nv50/nv50_pc_optimize.c | 28 +-- src/gallium/drivers/nv50/nv50_pc_regalloc.c | 52 +++--- src/gallium/drivers/nv50/nv50_program.c | 3 + src/gallium/drivers/nv50/nv50_program.h | 11 ++ src/gallium/drivers/nv50/nv50_tgsi_to_nc.c | 259 ++++++++++++++++++++++++---- 8 files changed, 416 insertions(+), 127 deletions(-) (limited to 'src') diff --git a/src/gallium/drivers/nv50/nv50_pc.c b/src/gallium/drivers/nv50/nv50_pc.c index ed92261488..7601049126 100644 --- a/src/gallium/drivers/nv50/nv50_pc.c +++ b/src/gallium/drivers/nv50/nv50_pc.c @@ -75,7 +75,8 @@ nv50_nvi_can_use_imm(struct nv_instruction *nvi, int s) case NV_OP_XOR: case NV_OP_SHL: case NV_OP_SHR: - return (s == 1) && (nvi->def[0]->reg.file == NV_FILE_GPR); + return (s == 1) && (nvi->src[0]->value->reg.file == NV_FILE_GPR) && + (nvi->def[0]->reg.file == NV_FILE_GPR); case NV_OP_MOV: assert(s == 0); return (nvi->def[0]->reg.file == NV_FILE_GPR); @@ -87,6 +88,12 @@ nv50_nvi_can_use_imm(struct nv_instruction *nvi, int s) boolean nv50_nvi_can_load(struct nv_instruction *nvi, int s, struct nv_value *value) { + int i; + + for (i = 0; i < 3 && nvi->src[i]; ++i) + if (nvi->src[i]->value->reg.file == NV_FILE_IMM) + return FALSE; + switch (nvi->opcode) { case NV_OP_ABS: case NV_OP_ADD: @@ -189,37 +196,89 @@ nv_pc_free_refs(struct nv_pc *pc) FREE(pc->refs[i]); } +static const char * +edge_name(ubyte type) +{ + switch (type) { + case CFG_EDGE_FORWARD: return "forward"; + case CFG_EDGE_BACK: return "back"; + case CFG_EDGE_LOOP_ENTER: return "loop"; + case CFG_EDGE_LOOP_LEAVE: return "break"; + default: + return "?"; + } +} + void 
-nv_print_program(struct nv_basic_block *b) +nv_pc_pass_in_order(struct nv_basic_block *root, nv_pc_pass_func f, void *priv) { - struct nv_instruction *i = b->phi; + struct nv_basic_block *bb[64], *bbb[16], *b; + int j, p, pp; + + bb[0] = root; + p = 1; + pp = 0; + + while (p > 0) { + b = bb[--p]; + b->priv = 0; + + for (j = 1; j >= 0; --j) { + if (!b->out[j]) + continue; + + switch (b->out_kind[j]) { + case CFG_EDGE_BACK: + continue; + case CFG_EDGE_FORWARD: + if (++b->out[j]->priv == b->out[j]->num_in) + bb[p++] = b->out[j]; + break; + case CFG_EDGE_LOOP_ENTER: + bb[p++] = b->out[j]; + break; + case CFG_EDGE_LOOP_LEAVE: + bbb[pp++] = b->out[j]; + break; + default: + assert(0); + break; + } + } + + f(priv, b); - b->priv = 0; + if (!p) + while (pp > 0) + bb[p++] = bbb[--pp]; + } +} + +static void +nv_do_print_program(void *priv, struct nv_basic_block *b) +{ + struct nv_instruction *i = b->phi; debug_printf("=== BB %i ", b->id); if (b->out[0]) - debug_printf("(--0> %i) ", b->out[0]->id); + debug_printf("[%s -> %i] ", edge_name(b->out_kind[0]), b->out[0]->id); if (b->out[1]) - debug_printf("(--1> %i) ", b->out[1]->id); + debug_printf("[%s -> %i] ", edge_name(b->out_kind[1]), b->out[1]->id); debug_printf("===\n"); + i = b->phi; if (!i) i = b->entry; for (; i; i = i->next) nv_print_instruction(i); +} - if (!b->out[0]) { - debug_printf("END\n\n"); - return; - } - if (!b->out[1] && ++(b->out[0]->priv) != b->out[0]->num_in) - return; - - if (b->out[0] != b) - nv_print_program(b->out[0]); +void +nv_print_program(struct nv_basic_block *root) +{ + nv_pc_pass_in_order(root, nv_do_print_program, root); - if (b->out[1] && b->out[1] != b) - nv_print_program(b->out[1]); + debug_printf("END\n\n"); } static INLINE void @@ -254,7 +313,7 @@ nv50_emit_program(struct nv_pc *pc) assert(pc->emit == &code[pc->bin_size / 4]); /* XXX: we can do better than this ... 
*/ - if ((pc->emit[-2] & 2) || (pc->emit[-1] & 3) == 3) { + if (!(pc->emit[-2] & 1) || (pc->emit[-2] & 2) || (pc->emit[-1] & 3) == 3) { pc->emit[0] = 0xf0000001; pc->emit[1] = 0xe0000000; pc->bin_size += 8; @@ -281,6 +340,7 @@ nv50_generate_code(struct nv50_translation_info *ti) ret = nv50_tgsi_to_nc(pc, ti); if (ret) goto out; + nv_print_program(pc->root); /* optimization */ ret = nv_pc_exec_pass0(pc); @@ -454,30 +514,40 @@ nv_nvi_permute(struct nv_instruction *i1, struct nv_instruction *i2) i1->next->prev = i1; } -void nvbb_attach_block(struct nv_basic_block *parent, struct nv_basic_block *b) +void +nvbb_attach_block(struct nv_basic_block *parent, + struct nv_basic_block *b, ubyte edge_kind) { + assert(b->num_in < 8); + if (parent->out[0]) { assert(!parent->out[1]); parent->out[1] = b; - } else + parent->out_kind[1] = edge_kind; + } else { parent->out[0] = b; + parent->out_kind[0] = edge_kind; + } - b->in[b->num_in++] = parent; + b->in[b->num_in] = parent; + b->in_kind[b->num_in++] = edge_kind; } -int +/* NOTE: all BRKs are treated as conditional, so there are 2 outgoing BBs */ + +boolean nvbb_dominated_by(struct nv_basic_block *b, struct nv_basic_block *d) { - int j, n; + int j; if (b == d) - return 1; + return TRUE; - n = 0; for (j = 0; j < b->num_in; ++j) - n += nvbb_dominated_by(b->in[j], d); + if ((b->in_kind[j] != CFG_EDGE_BACK) && !nvbb_dominated_by(b->in[j], d)) + return FALSE; - return (n && (n == b->num_in)) ? 1 : 0; + return j ? 
TRUE : FALSE; } /* check if bf (future) can be reached from bp (past) */ @@ -490,27 +560,45 @@ nvbb_reachable_by(struct nv_basic_block *bf, struct nv_basic_block *bp, if (bp == bt) return FALSE; - if (bp->out[0] && bp->out[0] != bp && + if (bp->out[0] && bp->out_kind[0] != CFG_EDGE_BACK && nvbb_reachable_by(bf, bp->out[0], bt)) return TRUE; - if (bp->out[1] && bp->out[1] != bp && + if (bp->out[1] && bp->out_kind[1] != CFG_EDGE_BACK && nvbb_reachable_by(bf, bp->out[1], bt)) return TRUE; return FALSE; } +static struct nv_basic_block * +nvbb_find_dom_frontier(struct nv_basic_block *b, struct nv_basic_block *df) +{ + int i; + + if (!nvbb_dominated_by(df, b)) { + for (i = 0; i < df->num_in; ++i) { + if (df->in_kind[i] == CFG_EDGE_BACK) + continue; + if (nvbb_dominated_by(df->in[i], b)) + return df; + } + } + for (i = 0; i < 2 && b->out[i]; ++i) { + if (b->out_kind[i] == CFG_EDGE_BACK) + continue; + if ((df = nvbb_find_dom_frontier(b, b->out[i]))) + return df; + } + return NULL; +} + struct nv_basic_block * nvbb_dom_frontier(struct nv_basic_block *b) { - struct nv_basic_block *df = b->out[0]; - - assert(df); - while (nvbb_dominated_by(df, b) || - (!nvbb_dominated_by(df->in[0], b) && - (!df->in[1] || !nvbb_dominated_by(df->in[1], b)))) { - df = df->out[0]; - assert(df); - } - assert(df); - return df; + struct nv_basic_block *df; + int i; + + for (i = 0; i < 2 && b->out[i]; ++i) + if ((df = nvbb_find_dom_frontier(b, b->out[i]))) + return df; + return NULL; } diff --git a/src/gallium/drivers/nv50/nv50_pc.h b/src/gallium/drivers/nv50/nv50_pc.h index 987043c7a0..8b1c9b3a72 100644 --- a/src/gallium/drivers/nv50/nv50_pc.h +++ b/src/gallium/drivers/nv50/nv50_pc.h @@ -246,6 +246,11 @@ struct nv_instruction { ubyte quadop; }; +#define CFG_EDGE_FORWARD 0 +#define CFG_EDGE_BACK 1 +#define CFG_EDGE_LOOP_ENTER 2 +#define CFG_EDGE_LOOP_LEAVE 4 + struct nv_basic_block { struct nv_instruction *entry; /* first non-phi instruction */ struct nv_instruction *exit; @@ -253,8 +258,10 @@ struct 
nv_basic_block { int num_instructions; struct nv_basic_block *out[2]; /* no indirect branches -> 2 */ - struct nv_basic_block **in; + struct nv_basic_block *in[8]; /* hope that suffices */ uint num_in; + ubyte out_kind[2]; + ubyte in_kind[8]; int id; struct nv_basic_block *last_visitor; @@ -383,7 +390,6 @@ new_basic_block(struct nv_pc *pc) { struct nv_basic_block *bb = CALLOC_STRUCT(nv_basic_block); - bb->in = CALLOC(sizeof(struct nv_basic_block *), 4); bb->id = pc->num_blocks++; return bb; } @@ -414,6 +420,7 @@ const char *nv_opcode_name(uint opcode); void nv_print_instruction(struct nv_instruction *); /* nv50_pc.c */ + void nv_print_program(struct nv_basic_block *b); boolean nv_op_commutative(uint opcode); @@ -424,14 +431,19 @@ ubyte nv50_supported_src_mods(uint opcode, int s); int nv_nvi_refcount(struct nv_instruction *); void nv_nvi_delete(struct nv_instruction *); void nv_nvi_permute(struct nv_instruction *, struct nv_instruction *); -void nvbb_attach_block(struct nv_basic_block *parent, struct nv_basic_block *); -int nvbb_dominated_by(struct nv_basic_block *, struct nv_basic_block *); +void nvbb_attach_block(struct nv_basic_block *parent, + struct nv_basic_block *, ubyte edge_kind); +boolean nvbb_dominated_by(struct nv_basic_block *, struct nv_basic_block *); boolean nvbb_reachable_by(struct nv_basic_block *, struct nv_basic_block *, struct nv_basic_block *); struct nv_basic_block *nvbb_dom_frontier(struct nv_basic_block *); int nvcg_replace_value(struct nv_pc *pc, struct nv_value *old_val, struct nv_value *new_val); +typedef void (*nv_pc_pass_func)(void *priv, struct nv_basic_block *b); + +void nv_pc_pass_in_order(struct nv_basic_block *, nv_pc_pass_func, void *); + int nv_pc_exec_pass0(struct nv_pc *pc); int nv_pc_exec_pass1(struct nv_pc *pc); int nv_pc_exec_pass2(struct nv_pc *pc); diff --git a/src/gallium/drivers/nv50/nv50_pc_emit.c b/src/gallium/drivers/nv50/nv50_pc_emit.c index 728e2b145d..35bd5ff10f 100644 --- a/src/gallium/drivers/nv50/nv50_pc_emit.c 
+++ b/src/gallium/drivers/nv50/nv50_pc_emit.c @@ -694,7 +694,7 @@ emit_flow(struct nv_pc *pc, struct nv_instruction *i, ubyte flow_op) set_pred(pc, i); - if (i->target) { + if (i->target && (i->opcode != NV_OP_BREAK)) { new_fixup(pc, NV_FIXUP_CFLOW_RELOC, i->target->bin_pos, 0x7ff800, 11); pc->emit[0] |= (i->target->bin_pos / 4) << 11; } diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c index f2f8d0eaa3..e4b5d321db 100644 --- a/src/gallium/drivers/nv50/nv50_pc_optimize.c +++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c @@ -120,15 +120,14 @@ nvi_isnop(struct nv_instruction *nvi) } static void -nv_pc_pass_pre_emission(struct nv_pc *pc, struct nv_basic_block *b) +nv_pc_pass_pre_emission(void *priv, struct nv_basic_block *b) { + struct nv_pc *pc = (struct nv_pc *)priv; struct nv_basic_block *in; struct nv_instruction *nvi, *next; int j; uint size, n32 = 0; - b->priv = 0; - for (j = pc->num_blocks - 1; j >= 0 && !pc->bb_list[j]->bin_size; --j); if (j >= 0) { in = pc->bb_list[j]; @@ -200,17 +199,6 @@ nv_pc_pass_pre_emission(struct nv_pc *pc, struct nv_basic_block *b) assert(!b->entry || (b->exit && b->exit->is_long)); pc->bin_size += b->bin_size *= 4; - - /* descend CFG */ - - if (!b->out[0]) - return; - if (!b->out[1] && ++(b->out[0]->priv) != b->out[0]->num_in) - return; - - for (j = 0; j < 2; ++j) - if (b->out[j] && b->out[j] != b) - nv_pc_pass_pre_emission(pc, b->out[j]); } int @@ -219,9 +207,9 @@ nv_pc_exec_pass2(struct nv_pc *pc) debug_printf("preparing %u blocks for emission\n", pc->num_blocks); pc->bb_list = CALLOC(pc->num_blocks, sizeof(struct nv_basic_block *)); - pc->num_blocks = 0; - nv_pc_pass_pre_emission(pc, pc->root); + + nv_pc_pass_in_order(pc->root, nv_pc_pass_pre_emission, pc); return 0; } @@ -307,8 +295,11 @@ nv_pass_fold_stores(struct nv_pass *ctx, struct nv_basic_block *b) if (nvi->def[0]->refc > 1) continue; - /* cannot MOV immediate to $oX */ - if (nvi->src[0]->value->reg.file == NV_FILE_IMM) + 
/* cannot write to $oX when using immediate */ + for (j = 0; j < 4 && nvi->src[j]; ++j) + if (nvi->src[j]->value->reg.file == NV_FILE_IMM) + break; + if (j < 4) continue; nvi->def[0] = sti->def[0]; @@ -339,7 +330,6 @@ nv_pass_fold_loads(struct nv_pass *ctx, struct nv_basic_block *b) if (is_immd_move(ld) && nv50_nvi_can_use_imm(nvi, j)) { nv_reference(ctx->pc, &nvi->src[j], ld->src[0]->value); - debug_printf("folded immediate %i\n", ld->def[0]->n); continue; } diff --git a/src/gallium/drivers/nv50/nv50_pc_regalloc.c b/src/gallium/drivers/nv50/nv50_pc_regalloc.c index 172e44f62b..d45dd7f95f 100644 --- a/src/gallium/drivers/nv50/nv50_pc_regalloc.c +++ b/src/gallium/drivers/nv50/nv50_pc_regalloc.c @@ -358,6 +358,18 @@ try_join_values(struct nv_pc_pass *ctx, struct nv_value *a, struct nv_value *b) do_join_values(ctx, a, b); } +static INLINE boolean +need_new_else_block(struct nv_basic_block *b, struct nv_basic_block *p) +{ + int i = 0, n = 0; + + for (; i < 2; ++i) + if (p->out[i] && p->out_kind[i] != CFG_EDGE_LOOP_LEAVE) + ++n; + + return (b->num_in > 1) && (n == 2); +} + /* For each operand of each PHI in b, generate a new value by inserting a MOV * at the end of the block it is coming from and replace the operand with its * result. This eliminates liveness conflicts and enables us to let values be @@ -377,7 +389,7 @@ pass_generate_phi_movs(struct nv_pc_pass *ctx, struct nv_basic_block *b) p = pn = b->in[n]; assert(p); - if (b->num_in > 1 && p->out[0] && p->out[1]) { + if (need_new_else_block(b, p)) { pn = new_basic_block(ctx->pc); if (p->out[0] == b) @@ -481,32 +493,19 @@ pass_join_values(struct nv_pc_pass *ctx, int iter) } /* Order the instructions so that live intervals can be expressed in numbers. 
*/ -static int -pass_order_instructions(struct nv_pc_pass *ctx, struct nv_basic_block *b) +static void +pass_order_instructions(void *priv, struct nv_basic_block *b) { + struct nv_pc_pass *ctx = (struct nv_pc_pass *)priv; struct nv_instruction *i; - b->priv = 0; + b->pass_seq = ctx->pc->pass_seq; assert(!b->exit || !b->exit->next); for (i = b->phi; i; i = i->next) { i->serial = ctx->num_insns; ctx->insns[ctx->num_insns++] = i; } - - b->pass_seq = ctx->pc->pass_seq; - - if (!b->out[0]) - return 0; - if (!b->out[1] && ++(b->out[0]->priv) != b->out[0]->num_in) - return 0; - - if (b->out[0] != b) - pass_order_instructions(ctx, b->out[0]); - if (b->out[1] && b->out[1] != b) - pass_order_instructions(ctx, b->out[1]); - - return 0; } static void @@ -691,13 +690,15 @@ pass_build_intervals(struct nv_pc_pass *ctx, struct nv_basic_block *b) } /* remaining live-outs are live until the end */ - for (j = 0; j < ctx->pc->num_values; ++j) { - if (!(b->live_set[j / 32] & (1 << (j % 32)))) - continue; + if (b->exit) { + for (j = 0; j < ctx->pc->num_values; ++j) { + if (!(b->live_set[j / 32] & (1 << (j % 32)))) + continue; #ifdef NV50_RA_DEBUG_LIVEI - debug_printf("adding range for live value %i\n", j); + debug_printf("adding range for live value %i\n", j); #endif - add_range(&ctx->pc->values[j], b, b->exit->serial + 1); + add_range(&ctx->pc->values[j], b, b->exit->serial + 1); + } } debug_printf("%s: looping through instructions now\n", __func__); @@ -905,10 +906,7 @@ nv_pc_exec_pass1(struct nv_pc *pc) } pc->pass_seq++; - ret = pass_order_instructions(ctx, pc->root); - assert(!ret && "order instructions"); - if (ret) - goto out; + nv_pc_pass_in_order(pc->root, pass_order_instructions, ctx); pc->pass_seq++; ret = pass_build_intervals(ctx, pc->root); diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c index 26d1be8db8..54cd36f868 100644 --- a/src/gallium/drivers/nv50/nv50_program.c +++ b/src/gallium/drivers/nv50/nv50_program.c @@ -27,6 +27,7 
@@ #include "pipe/p_shader_tokens.h" #include "tgsi/tgsi_parse.h" #include "tgsi/tgsi_util.h" +#include "tgsi/tgsi_dump.h" static INLINE unsigned bitcount4(const uint32_t val) @@ -186,6 +187,8 @@ prog_immediate(struct nv50_translation_info *ti, int c; unsigned n = ++ti->immd32_nr; + tgsi_dump_immediate(imm); + if (n == (1 << (ffs(n) - 1))) ti->immd32 = REALLOC(ti->immd32, (n / 2) * 16, (n * 2) * 16); diff --git a/src/gallium/drivers/nv50/nv50_program.h b/src/gallium/drivers/nv50/nv50_program.h index 654bce59f3..1184d9be3b 100644 --- a/src/gallium/drivers/nv50/nv50_program.h +++ b/src/gallium/drivers/nv50/nv50_program.h @@ -92,6 +92,15 @@ struct nv50_program { #define NV50_INTERP_FLAT (1 << 1) #define NV50_INTERP_CENTROID (1 << 2) +#define NV50_PROG_MAX_SUBROUTINES 8 + +/* analyze TGSI and see which TEMP[] are used as subroutine inputs/outputs */ +struct nv50_subroutine { + int id; + uint32_t argv[4][1]; /* 4 bitmasks, for each of xyzw, only allow 32 TEMPs */ + uint32_t retv[4][1]; +}; + struct nv50_translation_info { struct nv50_program *p; unsigned inst_nr; @@ -108,6 +117,8 @@ struct nv50_translation_info { uint32_t *immd32; unsigned immd32_nr; ubyte edgeflag_out; + struct nv50_subroutine subr[NV50_PROG_MAX_SUBROUTINES]; + int subr_nr; }; int nv50_generate_code(struct nv50_translation_info *ti); diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c index 6a9259c898..da33adcaa4 100644 --- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c +++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c @@ -22,6 +22,19 @@ /* XXX: need to clean this up so we get the typecasting right more naturally */ +/* LOOP FIXME 1 + * In bld_store_loop_var, only replace values that belong to the TGSI register + * written. + * For TGSI MOV, we only associate the source value with the value tracker of + * the destination, instead of generating an actual MOV. + * + * Possible solution: generate PHI functions in loop headers in advance. 
+ */ +/* LOOP FIXME 2: + * In fetch_by_bb, when going back through a break-block, we miss all of the + * definitions from inside the loop. + */ + #include #include "nv50_context.h" @@ -48,6 +61,8 @@ struct bld_value_stack { struct nv_value *top; struct nv_value **body; unsigned size; + uint16_t loop_use; /* 1 bit per loop level, indicates if used/defd */ + uint16_t loop_def; }; static INLINE void @@ -81,19 +96,6 @@ bld_push_values(struct bld_value_stack *stacks, int n) bld_vals_push(&stacks[i * 4 + c]); } -#define FETCH_TEMP(i, c) (bld->tvs[i][c].top) -#define STORE_TEMP(i, c, v) (bld->tvs[i][c].top = (v)) -#define FETCH_ADDR(i, c) (bld->avs[i][c].top) -#define STORE_ADDR(i, c, v) (bld->avs[i][c].top = (v)) -#define FETCH_PRED(i, c) (bld->pvs[i][c].top) -#define STORE_PRED(i, c, v) (bld->pvs[i][c].top = (v)) -#define FETCH_OUTR(i, c) (bld->ovs[i][c].top) -#define STORE_OUTR(i, c, v) \ - do { \ - bld->ovs[i][c].top = (v); \ - bld->outputs_written[(i) / 8] |= 1 << (((i) * 4 + (c)) % 32); \ - } while (0) - struct bld_context { struct nv50_translation_info *ti; @@ -108,6 +110,7 @@ struct bld_context { struct nv_basic_block *else_bb[BLD_MAX_COND_NESTING]; int cond_lvl; struct nv_basic_block *loop_bb[BLD_MAX_LOOP_NESTING]; + struct nv_basic_block *brkt_bb[BLD_MAX_LOOP_NESTING]; int loop_lvl; struct bld_value_stack tvs[BLD_MAX_TEMPS][4]; /* TGSI_FILE_TEMPORARY */ @@ -127,6 +130,51 @@ struct bld_context { uint num_immds; }; +static INLINE struct nv_value * +bld_fetch(struct bld_context *bld, struct bld_value_stack *stk, int i, int c) +{ + stk[i * 4 + c].loop_use |= 1 << bld->loop_lvl; + + return stk[i * 4 + c].top; +} + +static void +bld_store_loop_var(struct bld_context *, struct bld_value_stack *); + +static INLINE void +bld_store(struct bld_context *bld, struct bld_value_stack *stk, int i, int c, + struct nv_value *val) +{ + bld_store_loop_var(bld, &stk[i * 4 + c]); + + stk[i * 4 + c].top = val; +} + +static INLINE void +bld_clear_def_use(struct bld_value_stack *stk, 
int n, int lvl) +{ + int i; + const uint16_t mask = ~(1 << lvl); + + for (i = 0; i < n * 4; ++i) { + stk[i].loop_def &= mask; + stk[i].loop_use &= mask; + } +} + +#define FETCH_TEMP(i, c) bld_fetch(bld, &bld->tvs[0][0], i, c) +#define STORE_TEMP(i, c, v) bld_store(bld, &bld->tvs[0][0], i, c, (v)) +#define FETCH_ADDR(i, c) bld_fetch(bld, &bld->avs[0][0], i, c) +#define STORE_ADDR(i, c, v) bld_store(bld, &bld->avs[0][0], i, c, (v)) +#define FETCH_PRED(i, c) bld_fetch(bld, &bld->pvs[0][0], i, c) +#define STORE_PRED(i, c, v) bld_store(bld, &bld->pvs[0][0], i, c, (v)) + +#define STORE_OUTR(i, c, v) \ + do { \ + bld->ovs[i][c].top = (v); \ + bld->outputs_written[(i) / 8] |= 1 << (((i) * 4 + (c)) % 32); \ + } while (0) + static INLINE void bld_warn_uninitialized(struct bld_context *bld, int kind, struct bld_value_stack *stk, struct nv_basic_block *b) @@ -134,8 +182,8 @@ bld_warn_uninitialized(struct bld_context *bld, int kind, long i = (stk - &bld->tvs[0][0]) / 4; long c = (stk - &bld->tvs[0][0]) & 3; - debug_printf("WARNING: TEMP[%li].%li %s used uninitialized in BB:%i\n", - i, c, kind ? "may be" : "is", b->id); + debug_printf("WARNING: TEMP[%li].%c %s used uninitialized in BB:%i\n", + i, (int)('x' + c), kind ? 
"may be" : "is", b->id); } static INLINE struct nv_value * @@ -182,7 +230,8 @@ fetch_by_bb(struct bld_value_stack *stack, return; } for (i = 0; i < b->num_in; ++i) - fetch_by_bb(stack, vals, n, b->in[i]); + if (b->in_kind[i] != CFG_EDGE_BACK) + fetch_by_bb(stack, vals, n, b->in[i]); } static INLINE struct nv_value * @@ -237,12 +286,15 @@ bld_phi(struct bld_context *bld, struct nv_basic_block *b, } for (i = 0; i < n; ++i) { + /* if value dominates b, continue to the redefinitions */ if (nvbb_dominated_by(b, vals[i]->insn->bb)) continue; + /* if value dominates any in-block, b should be the dom frontier */ for (j = 0; j < b->num_in; ++j) if (nvbb_dominated_by(b->in[j], vals[i]->insn->bb)) break; + /* otherwise, find the dominance frontier and put the phi there */ if (j == b->num_in) { in = nvbb_dom_frontier(vals[i]->insn->bb); val = bld_phi(bld, in, stack); @@ -269,6 +321,7 @@ bld_phi(struct bld_context *bld, struct nv_basic_block *b, static INLINE struct nv_value * bld_fetch_global(struct bld_context *bld, struct bld_value_stack *stack) { + stack->loop_use |= 1 << bld->loop_lvl; return bld_phi(bld, bld->pc->current_block, stack); } @@ -290,6 +343,79 @@ bld_imm_u32(struct bld_context *bld, uint32_t u) return bld->saved_immd[n]; } +static void +bld_replace_value(struct nv_pc *, struct nv_basic_block *, struct nv_value *, + struct nv_value *); + +/* When setting a variable inside a loop, and we have used it before in the + * loop, we need to insert a phi function in the loop header. 
+ */ +static void +bld_store_loop_var(struct bld_context *bld, struct bld_value_stack *stk) +{ + struct nv_basic_block *bb; + struct nv_instruction *phi; + struct nv_value *val; + int ll; + uint16_t loop_def = stk->loop_def; + + if (!(ll = bld->loop_lvl)) + return; + stk->loop_def |= 1 << ll; + + if ((~stk->loop_use | loop_def) & (1 << ll)) + return; + +#if 0 + debug_printf("TEMP[%li].%c used before loop redef (def=%x/use=%x)\n", + (stk - &bld->tvs[0][0]) / 4, + (int)('x' + ((stk - &bld->tvs[0][0]) & 3)), + loop_def, stk->loop_use); +#endif + + stk->loop_def |= 1 << ll; + + assert(bld->loop_bb[ll - 1]->num_in == 1); + + /* get last assignment from outside this loop, could be from bld_phi */ + val = stk->body[stk->size - 1]; + + /* create the phi in the loop entry block */ + + bb = bld->pc->current_block; + bld->pc->current_block = bld->loop_bb[ll - 1]; + + phi = new_instruction(bld->pc, NV_OP_PHI); + + bld_def(phi, 0, new_value(bld->pc, val->reg.file, val->reg.type)); + + bld->pc->pass_seq++; + bld_replace_value(bld->pc, bld->loop_bb[ll - 1], val, phi->def[0]); + + assert(!stk->top); + bld_vals_push_val(stk, phi->def[0]); + + phi->target = (struct nv_basic_block *)stk; /* cheat */ + + nv_reference(bld->pc, &phi->src[0], val); + nv_reference(bld->pc, &phi->src[1], phi->def[0]); + + bld->pc->current_block = bb; +} + +static void +bld_loop_end(struct bld_context *bld, struct nv_basic_block *bb) +{ + struct nv_instruction *phi; + struct nv_value *val; + + for (phi = bb->phi; phi && phi->opcode == NV_OP_PHI; phi = phi->next) { + val = bld_fetch_global(bld, (struct bld_value_stack *)phi->target); + nv_reference(bld->pc, &phi->src[1], val); + phi->target = NULL; + } +} + static INLINE struct nv_value * bld_imm_f32(struct bld_context *bld, float f) { @@ -432,7 +558,8 @@ bld_kil(struct bld_context *bld, struct nv_value *src) static void bld_flow(struct bld_context *bld, uint opcode, ubyte cc, - struct nv_value *src, boolean plan_reconverge) + struct nv_value *src, struct 
nv_basic_block *target, + boolean plan_reconverge) { struct nv_instruction *nvi; @@ -442,7 +569,9 @@ bld_flow(struct bld_context *bld, uint opcode, ubyte cc, nvi = new_instruction(bld->pc, opcode); nvi->is_terminator = 1; nvi->cc = cc; - nvi->flags_src = new_ref(bld->pc, src); + nvi->target = target; + if (src) + nvi->flags_src = new_ref(bld->pc, src); } static ubyte @@ -1105,14 +1234,14 @@ bld_instruction(struct bld_context *bld, { struct nv_basic_block *b = new_basic_block(bld->pc); - nvbb_attach_block(bld->pc->current_block, b); + nvbb_attach_block(bld->pc->current_block, b, CFG_EDGE_FORWARD); bld->join_bb[bld->cond_lvl] = bld->pc->current_block; bld->cond_bb[bld->cond_lvl] = bld->pc->current_block; src1 = bld_predicate(bld, emit_fetch(bld, insn, 0, 0)); - bld_flow(bld, NV_OP_BRA, NV_CC_EQ, src1, FALSE); + bld_flow(bld, NV_OP_BRA, NV_CC_EQ, src1, NULL, FALSE); ++bld->cond_lvl; bld_new_block(bld, b); @@ -1123,7 +1252,7 @@ bld_instruction(struct bld_context *bld, struct nv_basic_block *b = new_basic_block(bld->pc); --bld->cond_lvl; - nvbb_attach_block(bld->join_bb[bld->cond_lvl], b); + nvbb_attach_block(bld->join_bb[bld->cond_lvl], b, CFG_EDGE_FORWARD); bld->cond_bb[bld->cond_lvl]->exit->target = b; bld->cond_bb[bld->cond_lvl] = bld->pc->current_block; @@ -1134,13 +1263,13 @@ bld_instruction(struct bld_context *bld, bld_new_block(bld, b); } break; - case TGSI_OPCODE_ENDIF: /* XXX: deal with ENDIF; ENDIF; */ + case TGSI_OPCODE_ENDIF: { struct nv_basic_block *b = new_basic_block(bld->pc); --bld->cond_lvl; - nvbb_attach_block(bld->pc->current_block, b); - nvbb_attach_block(bld->cond_bb[bld->cond_lvl], b); + nvbb_attach_block(bld->pc->current_block, b, CFG_EDGE_FORWARD); + nvbb_attach_block(bld->cond_bb[bld->cond_lvl], b, CFG_EDGE_FORWARD); bld->cond_bb[bld->cond_lvl]->exit->target = b; @@ -1154,16 +1283,58 @@ bld_instruction(struct bld_context *bld, } break; case TGSI_OPCODE_BGNLOOP: - assert(0); + { + struct nv_basic_block *bl = new_basic_block(bld->pc); + struct 
nv_basic_block *bb = new_basic_block(bld->pc); + + bld->loop_bb[bld->loop_lvl] = bl; + bld->brkt_bb[bld->loop_lvl] = bb; + + bld_flow(bld, NV_OP_BREAKADDR, NV_CC_TR, NULL, bb, FALSE); + + nvbb_attach_block(bld->pc->current_block, bl, CFG_EDGE_LOOP_ENTER); + + bld_new_block(bld, bld->loop_bb[bld->loop_lvl++]); + + if (bld->loop_lvl == bld->pc->loop_nesting_bound) + bld->pc->loop_nesting_bound++; + + bld_clear_def_use(&bld->tvs[0][0], BLD_MAX_TEMPS, bld->loop_lvl); + bld_clear_def_use(&bld->avs[0][0], BLD_MAX_ADDRS, bld->loop_lvl); + bld_clear_def_use(&bld->pvs[0][0], BLD_MAX_PREDS, bld->loop_lvl); + } break; case TGSI_OPCODE_BRK: - assert(0); + { + struct nv_basic_block *bb = bld->brkt_bb[bld->loop_lvl - 1]; + + bld_flow(bld, NV_OP_BREAK, NV_CC_TR, NULL, bb, FALSE); + + /* XXX: don't do this for redundant BRKs */ + nvbb_attach_block(bld->pc->current_block, bb, CFG_EDGE_LOOP_LEAVE); + } break; case TGSI_OPCODE_CONT: - assert(0); + { + struct nv_basic_block *bb = bld->loop_bb[bld->loop_lvl - 1]; + + bld_flow(bld, NV_OP_BRA, NV_CC_TR, NULL, bb, FALSE); + + nvbb_attach_block(bld->pc->current_block, bb, CFG_EDGE_BACK); + } break; case TGSI_OPCODE_ENDLOOP: - assert(0); + { + struct nv_basic_block *bb = bld->loop_bb[--bld->loop_lvl]; + + bld_flow(bld, NV_OP_BRA, NV_CC_TR, NULL, bb, FALSE); + + nvbb_attach_block(bld->pc->current_block, bb, CFG_EDGE_BACK); + + bld_loop_end(bld, bb); /* replace loop-side operand of the phis */ + + bld_new_block(bld, bld->brkt_bb[bld->loop_lvl]); + } break; case TGSI_OPCODE_ABS: case TGSI_OPCODE_CEIL: @@ -1298,6 +1469,17 @@ bld_instruction(struct bld_context *bld, emit_store(bld, insn, c, dst0[c]); } +static INLINE void +bld_free_value_trackers(struct bld_value_stack *base, int n) +{ + int i, c; + + for (i = 0; i < n; ++i) + for (c = 0; c < 4; ++c) + if (base[i * 4 + c].body) + FREE(base[i * 4 + c].body); +} + int nv50_tgsi_to_nc(struct nv_pc *pc, struct nv50_translation_info *ti) { @@ -1309,7 +1491,7 @@ nv50_tgsi_to_nc(struct nv_pc *pc, 
struct nv50_translation_info *ti) bld->pc = pc; bld->ti = ti; - pc->loop_nesting_bound = 1; /* XXX: should work with 0 */ + pc->loop_nesting_bound = 1; c = util_bitcount(bld->ti->p->fp.interp >> 24); if (c && ti->p->type == PIPE_SHADER_FRAGMENT) { @@ -1335,18 +1517,23 @@ nv50_tgsi_to_nc(struct nv_pc *pc, struct nv50_translation_info *ti) } } + bld_free_value_trackers(&bld->tvs[0][0], BLD_MAX_TEMPS); + bld_free_value_trackers(&bld->avs[0][0], BLD_MAX_ADDRS); + bld_free_value_trackers(&bld->pvs[0][0], BLD_MAX_PREDS); + + bld_free_value_trackers(&bld->ovs[0][0], PIPE_MAX_SHADER_OUTPUTS); + FREE(bld); return 0; } -#if 0 /* If a variable is assigned in a loop, replace all references to the value * from outside the loop with a phi value. */ static void -bld_adjust_nv_refs(struct nv_pc *pc, struct nv_basic_block *b, - struct nv_value *old_val, - struct nv_value *new_val) +bld_replace_value(struct nv_pc *pc, struct nv_basic_block *b, + struct nv_value *old_val, + struct nv_value *new_val) { struct nv_instruction *nvi; @@ -1361,12 +1548,12 @@ bld_adjust_nv_refs(struct nv_pc *pc, struct nv_basic_block *b, if (nvi->flags_src && nvi->flags_src->value == old_val) nv_reference(pc, &nvi->flags_src, new_val); } + b->pass_seq = pc->pass_seq; if (b->out[0] && b->out[0]->pass_seq < pc->pass_seq) - bld_adjust_nv_refs(pc, b, old_val, new_val); + bld_replace_value(pc, b->out[0], old_val, new_val); if (b->out[1] && b->out[1]->pass_seq < pc->pass_seq) - bld_adjust_nv_refs(pc, b, old_val, new_val); + bld_replace_value(pc, b->out[1], old_val, new_val); } -#endif -- cgit v1.2.3 From 34e0db4c509fd669a7713c63848a98d89463ce1a Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Wed, 11 Aug 2010 18:44:26 +0200 Subject: nv50: more constant folding --- src/gallium/drivers/nv50/nv50_pc_optimize.c | 204 ++++++++++++++++++++++++---- 1 file changed, 177 insertions(+), 27 deletions(-) (limited to 'src') diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c 
b/src/gallium/drivers/nv50/nv50_pc_optimize.c index e4b5d321db..64ffeaf430 100644 --- a/src/gallium/drivers/nv50/nv50_pc_optimize.c +++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c @@ -248,18 +248,24 @@ check_swap_src_0_1(struct nv_instruction *nvi) return; assert(src0 && src1); + if (src1->value->reg.file == NV_FILE_IMM) { + /* should only be present from folding a constant MUL part of a MAD */ + assert(nvi->opcode == NV_OP_ADD); + return; + } + if (is_cmem_load(src0->value->insn)) { if (!is_cmem_load(src1->value->insn)) { nvi->src[0] = src1; - nvi->src[1] = src0; - /* debug_printf("swapping cmem load to 1\n"); */ + nvi->src[1] = src0; + /* debug_printf("swapping cmem load to 1\n"); */ } } else if (is_smem_load(src1->value->insn)) { if (!is_smem_load(src0->value->insn)) { nvi->src[0] = src1; - nvi->src[1] = src0; - /* debug_printf("swapping smem load to 0\n"); */ + nvi->src[1] = src0; + /* debug_printf("swapping smem load to 0\n"); */ } } @@ -435,47 +441,168 @@ find_immediate(struct nv_ref *ref) return (src->reg.file == NV_FILE_IMM) ? 
src : NULL; } +static void +modifiers_apply(uint32_t *val, ubyte type, ubyte mod) +{ + if (mod & NV_MOD_ABS) { + if (type == NV_TYPE_F32) + *val &= 0x7fffffff; + else + if ((*val) & (1 << 31)) + *val = ~(*val) + 1; + } + if (mod & NV_MOD_NEG) { + if (type == NV_TYPE_F32) + *val ^= 0x80000000; + else + *val = ~(*val) + 1; + } +} + +static INLINE uint +modifiers_opcode(ubyte mod) +{ + switch (mod) { + case NV_MOD_NEG: return NV_OP_NEG; + case NV_MOD_ABS: return NV_OP_ABS; + case 0: + return NV_OP_MOV; + default: + return NV_OP_NOP; + } +} + +static void +constant_expression(struct nv_pc *pc, struct nv_instruction *nvi, + struct nv_value *src0, struct nv_value *src1) +{ + struct nv_value *val; + union { + float f32; + uint32_t u32; + int32_t s32; + } u0, u1, u; + ubyte type; + + if (!nvi->def[0]) + return; + type = nvi->def[0]->reg.type; + + u.u32 = 0; + u0.u32 = src0->reg.imm.u32; + u1.u32 = src1->reg.imm.u32; + + modifiers_apply(&u0.u32, type, nvi->src[0]->mod); + modifiers_apply(&u0.u32, type, nvi->src[1]->mod); + + switch (nvi->opcode) { + case NV_OP_MAD: + if (nvi->src[2]->value->reg.file != NV_FILE_GPR) + return; + /* fall through */ + case NV_OP_MUL: + switch (type) { + case NV_TYPE_F32: u.f32 = u0.f32 * u1.f32; break; + case NV_TYPE_U32: u.u32 = u0.u32 * u1.u32; break; + case NV_TYPE_S32: u.s32 = u0.s32 * u1.s32; break; + default: + assert(0); + break; + } + break; + case NV_OP_ADD: + switch (type) { + case NV_TYPE_F32: u.f32 = u0.f32 + u1.f32; break; + case NV_TYPE_U32: u.u32 = u0.u32 + u1.u32; break; + case NV_TYPE_S32: u.s32 = u0.s32 + u1.s32; break; + default: + assert(0); + break; + } + break; + case NV_OP_SUB: + switch (type) { + case NV_TYPE_F32: u.f32 = u0.f32 - u1.f32; + case NV_TYPE_U32: u.u32 = u0.u32 - u1.u32; + case NV_TYPE_S32: u.s32 = u0.s32 - u1.s32; + default: + assert(0); + break; + } + break; + default: + return; + } + + nvi->opcode = NV_OP_MOV; + + val = new_value(pc, NV_FILE_IMM, type); + + val->reg.imm.u32 = u.u32; + + nv_reference(pc, 
&nvi->src[1], NULL); + nv_reference(pc, &nvi->src[0], val); + + if (nvi->src[2]) { /* from MAD */ + nvi->src[1] = nvi->src[0]; + nvi->src[0] = nvi->src[2]; + nvi->src[2] = NULL; + nvi->opcode = NV_OP_ADD; + } +} + static void constant_operand(struct nv_pc *pc, struct nv_instruction *nvi, struct nv_value *val, int s) { + union { + float f32; + uint32_t u32; + int32_t s32; + } u; int t = s ? 0 : 1; + uint op; ubyte type; if (!nvi->def[0]) return; type = nvi->def[0]->reg.type; + u.u32 = val->reg.imm.u32; + modifiers_apply(&u.u32, type, nvi->src[s]->mod); + switch (nvi->opcode) { case NV_OP_MUL: - if ((type == NV_TYPE_F32 && val->reg.imm.f32 == 1.0f) || - (NV_TYPE_ISINT(type) && val->reg.imm.u32 == 1)) { - nvi->opcode = NV_OP_MOV; + if ((type == NV_TYPE_F32 && u.f32 == 1.0f) || + (NV_TYPE_ISINT(type) && u.u32 == 1)) { + if ((op = modifiers_opcode(nvi->src[t]->mod)) == NV_OP_NOP) + break; + nvi->opcode = op; nv_reference(pc, &nvi->src[s], NULL); - if (!s) { - nvi->src[0] = nvi->src[1]; - nvi->src[1] = NULL; - } + nvi->src[0] = nvi->src[t]; + nvi->src[1] = NULL; } else - if ((type == NV_TYPE_F32 && val->reg.imm.f32 == 2.0f) || - (NV_TYPE_ISINT(type) && val->reg.imm.u32 == 2)) { + if ((type == NV_TYPE_F32 && u.f32 == 2.0f) || + (NV_TYPE_ISINT(type) && u.u32 == 2)) { nvi->opcode = NV_OP_ADD; nv_reference(pc, &nvi->src[s], nvi->src[t]->value); + nvi->src[s]->mod = nvi->src[t]->mod; } else - if (type == NV_TYPE_F32 && val->reg.imm.f32 == -1.0f) { - nvi->opcode = NV_OP_NEG; + if (type == NV_TYPE_F32 && u.f32 == -1.0f) { + if (nvi->src[t]->mod & NV_MOD_NEG) + nvi->opcode = NV_OP_MOV; + else + nvi->opcode = NV_OP_NEG; nv_reference(pc, &nvi->src[s], NULL); nvi->src[0] = nvi->src[t]; nvi->src[1] = NULL; } else - if (type == NV_TYPE_F32 && val->reg.imm.f32 == -2.0f) { + if (type == NV_TYPE_F32 && u.f32 == -2.0f) { nvi->opcode = NV_OP_ADD; - assert(!nvi->src[s]->mod); nv_reference(pc, &nvi->src[s], nvi->src[t]->value); - nvi->src[t]->mod ^= NV_MOD_NEG; - nvi->src[s]->mod |= 
NV_MOD_NEG; + nvi->src[s]->mod = (nvi->src[t]->mod ^= NV_MOD_NEG); } else - if (val->reg.imm.u32 == 0) { + if (u.u32 == 0) { nvi->opcode = NV_OP_MOV; nv_reference(pc, &nvi->src[t], NULL); if (s) { @@ -485,13 +612,29 @@ constant_operand(struct nv_pc *pc, } break; case NV_OP_ADD: - if (val->reg.imm.u32 == 0) { - nvi->opcode = NV_OP_MOV; + if (u.u32 == 0) { + if ((op = modifiers_opcode(nvi->src[t]->mod)) == NV_OP_NOP) + break; + nvi->opcode = op; nv_reference(pc, &nvi->src[s], NULL); nvi->src[0] = nvi->src[t]; nvi->src[1] = NULL; } break; + case NV_OP_RCP: + u.f32 = 1.0f / u.f32; + (val = new_value(pc, NV_FILE_IMM, NV_TYPE_F32))->reg.imm.f32 = u.f32; + nvi->opcode = NV_OP_MOV; + assert(s == 0); + nv_reference(pc, &nvi->src[0], val); + break; + case NV_OP_RSQ: + u.f32 = 1.0f / sqrtf(u.f32); + (val = new_value(pc, NV_FILE_IMM, NV_TYPE_F32))->reg.imm.f32 = u.f32; + nvi->opcode = NV_OP_MOV; + assert(s == 0); + nv_reference(pc, &nvi->src[0], val); + break; default: break; } @@ -509,11 +652,18 @@ nv_pass_lower_arith(struct nv_pass *ctx, struct nv_basic_block *b) next = nvi->next; - if ((src = find_immediate(nvi->src[0])) != NULL) - constant_operand(ctx->pc, nvi, src, 0); - else - if ((src = find_immediate(nvi->src[1])) != NULL) - constant_operand(ctx->pc, nvi, src, 1); + src0 = find_immediate(nvi->src[0]); + src1 = find_immediate(nvi->src[1]); + + if (src0 && src1) + constant_expression(ctx->pc, nvi, src0, src1); + else { + if (src0) + constant_operand(ctx->pc, nvi, src0, 0); + else + if (src1) + constant_operand(ctx->pc, nvi, src1, 1); + } /* try to combine MUL, ADD into MAD */ if (nvi->opcode != NV_OP_ADD) -- cgit v1.2.3 From 4de293bb9acd1ecda683f735af32f7485a0f213e Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Sun, 15 Aug 2010 21:37:50 +0200 Subject: nv50: loops part 2 At least the mesa demo glsl/mandelbrot should work now. 
--- src/gallium/drivers/nv50/nv50_pc.h | 8 +- src/gallium/drivers/nv50/nv50_pc_emit.c | 1 + src/gallium/drivers/nv50/nv50_pc_optimize.c | 4 +- src/gallium/drivers/nv50/nv50_pc_print.c | 2 +- src/gallium/drivers/nv50/nv50_screen.c | 27 ++++ src/gallium/drivers/nv50/nv50_screen.h | 4 +- src/gallium/drivers/nv50/nv50_tgsi_to_nc.c | 233 ++++++++++++++++++---------- 7 files changed, 189 insertions(+), 90 deletions(-) (limited to 'src') diff --git a/src/gallium/drivers/nv50/nv50_pc.h b/src/gallium/drivers/nv50/nv50_pc.h index 8b1c9b3a72..b24a3067b8 100644 --- a/src/gallium/drivers/nv50/nv50_pc.h +++ b/src/gallium/drivers/nv50/nv50_pc.h @@ -47,7 +47,7 @@ #define NV_OP_SHL 17 #define NV_OP_SHR 18 #define NV_OP_RCP 19 -/* gap */ +#define NV_OP_UNDEF 20 #define NV_OP_RSQ 21 #define NV_OP_LG2 22 #define NV_OP_SIN 23 @@ -360,6 +360,12 @@ new_value(struct nv_pc *pc, ubyte file, ubyte type) return value; } +static INLINE struct nv_value * +new_value_like(struct nv_pc *pc, struct nv_value *like) +{ + return new_value(pc, like->reg.file, like->reg.type); +} + static INLINE struct nv_ref * new_ref(struct nv_pc *pc, struct nv_value *val) { diff --git a/src/gallium/drivers/nv50/nv50_pc_emit.c b/src/gallium/drivers/nv50/nv50_pc_emit.c index 35bd5ff10f..fe44b327ab 100644 --- a/src/gallium/drivers/nv50/nv50_pc_emit.c +++ b/src/gallium/drivers/nv50/nv50_pc_emit.c @@ -1130,6 +1130,7 @@ nv50_emit_instruction(struct nv_pc *pc, struct nv_instruction *i) pc->emit[1] = 0xe0000000; break; case NV_OP_PHI: + case NV_OP_UNDEF: case NV_OP_SUB: NOUVEAU_ERR("operation \"%s\" should have been eliminated\n", nv_opcode_name(i->opcode)); diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c index 64ffeaf430..daf63a3d20 100644 --- a/src/gallium/drivers/nv50/nv50_pc_optimize.c +++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c @@ -89,7 +89,7 @@ inst_cullable(struct nv_instruction *nvi) static INLINE boolean nvi_isnop(struct nv_instruction *nvi) { - if 
(nvi->opcode == NV_OP_EXPORT) + if (nvi->opcode == NV_OP_EXPORT || nvi->opcode == NV_OP_UNDEF) return TRUE; if (nvi->fixed || @@ -849,7 +849,7 @@ nv_pass_dce(struct nv_pass_dce *ctx, struct nv_basic_block *b) int j; struct nv_instruction *nvi, *next; - for (nvi = b->entry; nvi; nvi = next) { + for (nvi = b->phi ? b->phi : b->entry; nvi; nvi = next) { next = nvi->next; if (inst_cullable(nvi)) { diff --git a/src/gallium/drivers/nv50/nv50_pc_print.c b/src/gallium/drivers/nv50/nv50_pc_print.c index c812dbd066..a4f567bde4 100644 --- a/src/gallium/drivers/nv50/nv50_pc_print.c +++ b/src/gallium/drivers/nv50/nv50_pc_print.c @@ -59,7 +59,7 @@ static const char *nv_opcode_names[NV_OP_COUNT + 1] = { "shl", "shr", "rcp", - "(undefined)", + "undef", "rsqrt", "lg2", "sin", diff --git a/src/gallium/drivers/nv50/nv50_screen.c b/src/gallium/drivers/nv50/nv50_screen.c index e0c06c29ba..78137d6940 100644 --- a/src/gallium/drivers/nv50/nv50_screen.c +++ b/src/gallium/drivers/nv50/nv50_screen.c @@ -253,14 +253,23 @@ nv50_screen_relocs(struct nv50_screen *screen) } } +#ifndef NOUVEAU_GETPARAM_GRAPH_UNITS +# define NOUVEAU_GETPARAM_GRAPH_UNITS 13 +#endif + +extern int nouveau_device_get_param(struct nouveau_device *dev, + uint64_t param, uint64_t *value); + struct pipe_screen * nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev) { struct nv50_screen *screen = CALLOC_STRUCT(nv50_screen); struct nouveau_channel *chan; struct pipe_screen *pscreen; + uint64_t value; unsigned chipset = dev->chipset; unsigned tesla_class = 0; + unsigned stack_size; int ret, i; const unsigned rl = NOUVEAU_BO_VRAM | NOUVEAU_BO_RD; @@ -478,6 +487,24 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev) OUT_RING (chan, 0x121 | (NV50_CB_PGP << 12)); OUT_RING (chan, 0x131 | (NV50_CB_PFP << 12)); + /* shader stack */ + nouveau_device_get_param(dev, NOUVEAU_GETPARAM_GRAPH_UNITS, &value); + + stack_size = util_bitcount(value & 0xffff); + stack_size *= util_bitcount((value >> 24) & 
0xf); + stack_size *= 32 * 64 * 8; + + ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 1 << 16, + stack_size, &screen->stack_bo); + if (ret) { + nv50_screen_destroy(pscreen); + return NULL; + } + BEGIN_RING(chan, screen->tesla, NV50TCL_STACK_ADDRESS_HIGH, 3); + OUT_RELOCh(chan, screen->stack_bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + OUT_RELOCl(chan, screen->stack_bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + OUT_RING (chan, 4); + /* Vertex array limits - max them out */ for (i = 0; i < 16; i++) { BEGIN_RING(chan, screen->tesla, diff --git a/src/gallium/drivers/nv50/nv50_screen.h b/src/gallium/drivers/nv50/nv50_screen.h index a491ba31b2..1517f5608f 100644 --- a/src/gallium/drivers/nv50/nv50_screen.h +++ b/src/gallium/drivers/nv50/nv50_screen.h @@ -22,11 +22,11 @@ struct nv50_screen { struct nouveau_resource *immd_heap; - struct pipe_resource *strm_vbuf[16]; - struct nouveau_bo *tic; struct nouveau_bo *tsc; + struct nouveau_bo *stack_bo; + boolean force_push; }; diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c index da33adcaa4..7e77ed6ef6 100644 --- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c +++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c @@ -22,19 +22,6 @@ /* XXX: need to clean this up so we get the typecasting right more naturally */ -/* LOOP FIXME 1 - * In bld_store_loop_var, only replace values that belong to the TGSI register - * written. - * For TGSI MOV, we only associate the source value with the value tracker of - * the destination, instead of generating an actual MOV. - * - * Possible solution: generate PHI functions in loop headers in advance. - */ -/* LOOP FIXME 2: - * In fetch_by_bb, when going back through a break-block, we miss all of the - * definitions from inside the loop. 
- */ - #include #include "nv50_context.h" @@ -78,6 +65,24 @@ bld_vals_push_val(struct bld_value_stack *stk, struct nv_value *val) stk->body[stk->size++] = val; } +static INLINE boolean +bld_vals_del_val(struct bld_value_stack *stk, struct nv_value *val) +{ + unsigned i; + + for (i = stk->size - 1; i >= 0; --i) + if (stk->body[i] == val) + break; + if (i < 0) + return FALSE; + + if (i != stk->size - 1) + stk->body[i] = stk->body[stk->size - 1]; + + --stk->size; /* XXX: old size in REALLOC */ + return TRUE; +} + static INLINE void bld_vals_push(struct bld_value_stack *stk) { @@ -118,7 +123,7 @@ struct bld_context { struct bld_value_stack pvs[BLD_MAX_PREDS][4]; /* TGSI_FILE_PREDICATE */ struct bld_value_stack ovs[PIPE_MAX_SHADER_OUTPUTS][4]; - uint32_t outputs_written[PIPE_MAX_SHADER_OUTPUTS / 32]; + uint32_t outputs_written[(PIPE_MAX_SHADER_OUTPUTS + 31) / 32]; struct nv_value *frgcrd[4]; struct nv_value *sysval[4]; @@ -130,6 +135,21 @@ struct bld_context { uint num_immds; }; +static INLINE ubyte +bld_stack_file(struct bld_context *bld, struct bld_value_stack *stk) +{ + if (stk < &bld->avs[0][0]) + return NV_FILE_GPR; + else + if (stk < &bld->pvs[0][0]) + return NV_FILE_ADDR; + else + if (stk < &bld->ovs[0][0]) + return NV_FILE_FLAGS; + else + return NV_FILE_OUT; +} + static INLINE struct nv_value * bld_fetch(struct bld_context *bld, struct bld_value_stack *stk, int i, int c) { @@ -138,16 +158,29 @@ bld_fetch(struct bld_context *bld, struct bld_value_stack *stk, int i, int c) return stk[i * 4 + c].top; } -static void -bld_store_loop_var(struct bld_context *, struct bld_value_stack *); +static struct nv_value * +bld_loop_phi(struct bld_context *, struct bld_value_stack *, struct nv_value *); +/* If a variable is defined in a loop without prior use, we don't need + * a phi in the loop header to account for backwards flow. + * + * However, if this variable is then also used outside the loop, we do + * need a phi after all. 
But we must not use this phi's def inside the + * loop, so we can eliminate the phi if it is unused later. + */ static INLINE void bld_store(struct bld_context *bld, struct bld_value_stack *stk, int i, int c, struct nv_value *val) { - bld_store_loop_var(bld, &stk[i * 4 + c]); + const uint16_t m = 1 << bld->loop_lvl; + + stk = &stk[i * 4 + c]; - stk[i * 4 + c].top = val; + if (bld->loop_lvl && !(m & (stk->loop_def | stk->loop_use))) + bld_loop_phi(bld, stk, val); + + stk->top = val; + stk->loop_def |= 1 << bld->loop_lvl; } static INLINE void @@ -182,6 +215,9 @@ bld_warn_uninitialized(struct bld_context *bld, int kind, long i = (stk - &bld->tvs[0][0]) / 4; long c = (stk - &bld->tvs[0][0]) & 3; + if (c == 3) + c = -1; + debug_printf("WARNING: TEMP[%li].%c %s used uninitialized in BB:%i\n", i, (int)('x' + c), kind ? "may be" : "is", b->id); } @@ -237,6 +273,14 @@ fetch_by_bb(struct bld_value_stack *stack, static INLINE struct nv_value * bld_load_imm_u32(struct bld_context *bld, uint32_t u); +static INLINE struct nv_value * +bld_undef(struct bld_context *bld, ubyte file) +{ + struct nv_instruction *nvi = new_instruction(bld->pc, NV_OP_UNDEF); + + return bld_def(nvi, 0, new_value(bld->pc, file, NV_TYPE_U32)); +} + static struct nv_value * bld_phi(struct bld_context *bld, struct nv_basic_block *b, struct bld_value_stack *stack) @@ -267,21 +311,19 @@ bld_phi(struct bld_context *bld, struct nv_basic_block *b, if (in->num_in == 1) { in = in->in[0]; } else { - if (!nvbb_reachable_by(in->in[0], vals[0]->insn->bb, b)) { + if (!nvbb_reachable_by(in->in[0], vals[0]->insn->bb, b)) in = in->in[0]; - break; - } - if (!nvbb_reachable_by(in->in[1], vals[0]->insn->bb, b)) { + else + if (!nvbb_reachable_by(in->in[1], vals[0]->insn->bb, b)) in = in->in[1]; - break; - } - in = in->in[0]; + else + in = in->in[0]; } } bld->pc->current_block = in; /* should make this a no-op */ - bld_vals_push_val(stack, bld_load_imm_u32(bld, 0)); + bld_vals_push_val(stack, bld_undef(bld, 
vals[0]->reg.file)); continue; } @@ -318,10 +360,55 @@ bld_phi(struct bld_context *bld, struct nv_basic_block *b, return phi->def[0]; } +static struct nv_value * +bld_loop_phi(struct bld_context *bld, struct bld_value_stack *stack, + struct nv_value *def) +{ + struct nv_basic_block *bb = bld->pc->current_block; + struct nv_instruction *phi; + struct nv_value *val; + + val = bld_phi(bld, bld->pc->current_block, stack); + if (!val) { + bld->pc->current_block = bld->loop_bb[bld->loop_lvl - 1]->in[0]; + + val = bld_undef(bld, bld_stack_file(bld, stack)); + } + + bld->pc->current_block = bld->loop_bb[bld->loop_lvl - 1]; + + phi = new_instruction(bld->pc, NV_OP_PHI); + + bld_def(phi, 0, new_value_like(bld->pc, val)); + if (!def) + def = phi->def[0]; + + bld_vals_push_val(stack, phi->def[0]); + + phi->target = (struct nv_basic_block *)stack; /* cheat */ + + nv_reference(bld->pc, &phi->src[0], val); + nv_reference(bld->pc, &phi->src[1], def); + + bld->pc->current_block = bb; + + return phi->def[0]; +} + static INLINE struct nv_value * bld_fetch_global(struct bld_context *bld, struct bld_value_stack *stack) { - stack->loop_use |= 1 << bld->loop_lvl; + const uint16_t m = 1 << bld->loop_lvl; + const uint16_t use = stack->loop_use; + + stack->loop_use |= m; + + /* If neither used nor def'd inside the loop, build a phi in foresight, + * so we don't have to replace stuff later on, which requires tracking. + */ + if (bld->loop_lvl && !((use | stack->loop_def) & m)) + return bld_loop_phi(bld, stack, NULL); + return bld_phi(bld, bld->pc->current_block, stack); } @@ -347,72 +434,50 @@ static void bld_replace_value(struct nv_pc *, struct nv_basic_block *, struct nv_value *, struct nv_value *); -/* When setting a variable inside a loop, and we have used it before in the - * loop, we need to insert a phi function in the loop header. +/* Replace the source of the phi in the loop header by the last assignment, + * or eliminate the phi function if there is no assignment inside the loop. 
+ * + * Redundancy situation 1 - (used) but (not redefined) value: + * %3 = phi %0, %3 = %3 is used + * %3 = phi %0, %4 = is new definition + * + * Redundancy situation 2 - (not used) but (redefined) value: + * %3 = phi %0, %2 = %2 is used, %3 could be used outside, deleted by DCE */ static void -bld_store_loop_var(struct bld_context *bld, struct bld_value_stack *stk) +bld_loop_end(struct bld_context *bld, struct nv_basic_block *bb) { - struct nv_basic_block *bb; - struct nv_instruction *phi; + struct nv_instruction *phi, *next; struct nv_value *val; - int ll; - uint16_t loop_def = stk->loop_def; - - if (!(ll = bld->loop_lvl)) - return; - stk->loop_def |= 1 << ll; - - if ((~stk->loop_use | loop_def) & (1 << ll)) - return; - -#if 0 - debug_printf("TEMP[%li].%c used before loop redef (def=%x/use=%x)\n", - (stk - &bld->tvs[0][0]) / 4, - (int)('x' + ((stk - &bld->tvs[0][0]) & 3)), - loop_def, stk->loop_use); -#endif + struct bld_value_stack *stk; + int s; - stk->loop_def |= 1 << ll; + for (phi = bb->phi; phi && phi->opcode == NV_OP_PHI; phi = next) { + next = phi->next; - assert(bld->loop_bb[ll - 1]->num_in == 1); - - /* get last assignment from outside this loop, could be from bld_phi */ - val = stk->body[stk->size - 1]; - - /* create the phi in the loop entry block */ - - bb = bld->pc->current_block; - bld->pc->current_block = bld->loop_bb[ll - 1]; - - phi = new_instruction(bld->pc, NV_OP_PHI); + stk = (struct bld_value_stack *)phi->target; + phi->target = NULL; - bld_def(phi, 0, new_value(bld->pc, val->reg.file, val->reg.type)); + val = bld_fetch_global(bld, stk); - bld->pc->pass_seq++; - bld_replace_value(bld->pc, bld->loop_bb[ll - 1], val, phi->def[0]); + nv_reference(bld->pc, &phi->src[1], val); - assert(!stk->top); - bld_vals_push_val(stk, phi->def[0]); + s = -1; + if (phi->src[0]->value == phi->def[0] || + phi->src[0]->value == phi->src[1]->value) + s = 1; + else + if (phi->src[1]->value == phi->def[0]) + s = 0; - phi->target = (struct nv_basic_block *)stk; /* 
cheat */ + if (s >= 0) { + bld_vals_del_val(stk, phi->def[0]); - nv_reference(bld->pc, &phi->src[0], val); - nv_reference(bld->pc, &phi->src[1], phi->def[0]); + ++bld->pc->pass_seq; + bld_replace_value(bld->pc, bb, phi->def[0], phi->src[s]->value); - bld->pc->current_block = bb; -} - -static void -bld_loop_end(struct bld_context *bld, struct nv_basic_block *bb) -{ - struct nv_instruction *phi; - struct nv_value *val; - - for (phi = bb->phi; phi && phi->opcode == NV_OP_PHI; phi = phi->next) { - val = bld_fetch_global(bld, (struct bld_value_stack *)phi->target); - nv_reference(bld->pc, &phi->src[1], val); - phi->target = NULL; + nv_nvi_delete(phi); + } } } @@ -437,7 +502,7 @@ bld_insn_1(struct bld_context *bld, uint opcode, struct nv_value *src0) static struct nv_value * bld_insn_2(struct bld_context *bld, uint opcode, - struct nv_value *src0, struct nv_value *src1) + struct nv_value *src0, struct nv_value *src1) { struct nv_instruction *insn = new_instruction(bld->pc, opcode); @@ -449,8 +514,8 @@ bld_insn_2(struct bld_context *bld, uint opcode, static struct nv_value * bld_insn_3(struct bld_context *bld, uint opcode, - struct nv_value *src0, struct nv_value *src1, - struct nv_value *src2) + struct nv_value *src0, struct nv_value *src1, + struct nv_value *src2) { struct nv_instruction *insn = new_instruction(bld->pc, opcode); -- cgit v1.2.3 From e7a0bfa69a6ce45bb53baa8220eae418225c5649 Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Mon, 16 Aug 2010 15:21:23 +0200 Subject: nv50: flatten simple IF/ELSE/ENDIF constructs Less branching means less instructions and less thread divergence. 
--- src/gallium/drivers/nv50/nv50_pc.c | 14 ++++ src/gallium/drivers/nv50/nv50_pc.h | 1 + src/gallium/drivers/nv50/nv50_pc_optimize.c | 116 +++++++++++++++++++++++----- src/gallium/drivers/nv50/nv50_tgsi_to_nc.c | 16 +++- 4 files changed, 123 insertions(+), 24 deletions(-) (limited to 'src') diff --git a/src/gallium/drivers/nv50/nv50_pc.c b/src/gallium/drivers/nv50/nv50_pc.c index 7601049126..5041fc7505 100644 --- a/src/gallium/drivers/nv50/nv50_pc.c +++ b/src/gallium/drivers/nv50/nv50_pc.c @@ -125,6 +125,20 @@ nv50_nvi_can_load(struct nv_instruction *nvi, int s, struct nv_value *value) } } +/* Return whether this instruction can be executed conditionally. */ +boolean +nv50_nvi_can_predicate(struct nv_instruction *nvi) +{ + int i; + + if (nvi->flags_src) + return FALSE; + for (i = 0; i < 4 && nvi->src[i]; ++i) + if (nvi->src[i]->value->reg.file == NV_FILE_IMM) + return FALSE; + return TRUE; +} + ubyte nv50_supported_src_mods(uint opcode, int s) { diff --git a/src/gallium/drivers/nv50/nv50_pc.h b/src/gallium/drivers/nv50/nv50_pc.h index b24a3067b8..28208ad247 100644 --- a/src/gallium/drivers/nv50/nv50_pc.h +++ b/src/gallium/drivers/nv50/nv50_pc.h @@ -432,6 +432,7 @@ void nv_print_program(struct nv_basic_block *b); boolean nv_op_commutative(uint opcode); int nv50_indirect_opnd(struct nv_instruction *); boolean nv50_nvi_can_use_imm(struct nv_instruction *, int s); +boolean nv50_nvi_can_predicate(struct nv_instruction *); boolean nv50_nvi_can_load(struct nv_instruction *, int s, struct nv_value *); ubyte nv50_supported_src_mods(uint opcode, int s); int nv_nvi_refcount(struct nv_instruction *); diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c index daf63a3d20..4cf387257d 100644 --- a/src/gallium/drivers/nv50/nv50_pc_optimize.c +++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c @@ -119,6 +119,15 @@ nvi_isnop(struct nv_instruction *nvi) return values_equal(nvi->def[0], nvi->src[0]->value); } +struct nv_pass { + struct 
nv_pc *pc; + int n; + void *priv; +}; + +static int +nv_pass_flatten(struct nv_pass *ctx, struct nv_basic_block *b); + static void nv_pc_pass_pre_emission(void *priv, struct nv_basic_block *b) { @@ -204,6 +213,13 @@ nv_pc_pass_pre_emission(void *priv, struct nv_basic_block *b) int nv_pc_exec_pass2(struct nv_pc *pc) { + struct nv_pass pass; + + pass.pc = pc; + + pc->pass_seq++; + nv_pass_flatten(&pass, pc->root); + debug_printf("preparing %u blocks for emission\n", pc->num_blocks); pc->bb_list = CALLOC(pc->num_blocks, sizeof(struct nv_basic_block *)); @@ -273,12 +289,6 @@ check_swap_src_0_1(struct nv_instruction *nvi) nvi->set_cond = cc_swapped[nvi->set_cond]; } -struct nv_pass { - struct nv_pc *pc; - int n; - void *priv; -}; - static int nv_pass_fold_stores(struct nv_pass *ctx, struct nv_basic_block *b) { @@ -863,24 +873,95 @@ nv_pass_dce(struct nv_pass_dce *ctx, struct nv_basic_block *b) return 0; } +/* Register allocation inserted ELSE blocks for all IF/ENDIF without ELSE. + * Returns TRUE if @bb initiates an IF/ELSE/ENDIF clause, or is an IF with + * BREAK and dummy ELSE block. 
+ */ static INLINE boolean -bb_simple_if_endif(struct nv_basic_block *bb) +bb_is_if_else_endif(struct nv_basic_block *bb) +{ + if (!bb->out[0] || !bb->out[1]) + return FALSE; + + if (bb->out[0]->out_kind[0] == CFG_EDGE_LOOP_LEAVE) { + return (bb->out[0]->out[1] == bb->out[1]->out[0] && + !bb->out[1]->out[1]); + } else { + return (bb->out[0]->out[0] == bb->out[1]->out[0] && + !bb->out[0]->out[1] && + !bb->out[1]->out[1]); + } +} + +/* predicate instructions and remove branch at the end */ +static void +predicate_instructions(struct nv_pc *pc, struct nv_basic_block *b, + struct nv_value *p, ubyte cc) { - return (bb->out[0] && bb->out[1] && - bb->out[0]->out[0] == bb->out[1] && - !bb->out[0]->out[1]); + struct nv_instruction *nvi; + + if (!b->entry) + return; + for (nvi = b->entry; nvi->next; nvi = nvi->next) { + if (!nvi_isnop(nvi)) { + nvi->cc = cc; + nv_reference(pc, &nvi->flags_src, p); + } + } + + if (nvi->opcode == NV_OP_BRA) + nv_nvi_delete(nvi); + else + if (!nvi_isnop(nvi)) { + nvi->cc = cc; + nv_reference(pc, &nvi->flags_src, p); + } } +/* NOTE: Run this after register allocation, we can just cut out the cflow + * instructions and hook the predicates to the conditional OPs if they are + * not using immediates; better than inserting SELECT to join definitions. + * + * NOTE: Should adapt prior optimization to make this possible more often. 
+ */ static int nv_pass_flatten(struct nv_pass *ctx, struct nv_basic_block *b) { - int j; + struct nv_instruction *nvi; + struct nv_value *pred; + int i; + int n0 = 0, n1 = 0; + + if (bb_is_if_else_endif(b)) { + + debug_printf("nv_pass_flatten: IF/ELSE/ENDIF construct at BB:%i\n", b->id); - if (bb_simple_if_endif(b)) { - ++ctx->n; - debug_printf("nv_pass_flatten: total IF/ENDIF constructs: %i\n", ctx->n); + for (n0 = 0, nvi = b->out[0]->entry; nvi; nvi = nvi->next, ++n0) + if (!nv50_nvi_can_predicate(nvi)) + break; + if (!nvi) { + for (n1 = 0, nvi = b->out[1]->entry; nvi; nvi = nvi->next, ++n1) + if (!nv50_nvi_can_predicate(nvi)) + break; + if (nvi) { + debug_printf("cannot predicate: "); nv_print_instruction(nvi); + } + } else { + debug_printf("cannot predicate: "); nv_print_instruction(nvi); + } + + if (!nvi && n0 < 12 && n1 < 12) { /* 12 as arbitrary limit */ + assert(b->exit && b->exit->flags_src); + pred = b->exit->flags_src->value; + + predicate_instructions(ctx->pc, b->out[0], pred, NV_CC_NE | NV_CC_U); + predicate_instructions(ctx->pc, b->out[1], pred, NV_CC_EQ); + + assert(b->exit && b->exit->opcode == NV_OP_BRA); + nv_nvi_delete(b->exit); + } } - DESCEND_ARBITRARY(j, nv_pass_flatten); + DESCEND_ARBITRARY(i, nv_pass_flatten); return 0; } @@ -960,11 +1041,6 @@ nv_pc_exec_pass0(struct nv_pc *pc) pass.n = 0; pass.pc = pc; - pc->pass_seq++; - ret = nv_pass_flatten(&pass, pc->root); - if (ret) - return ret; - /* Do this first, so we don't have to pay attention * to whether sources are supported memory loads. 
*/ diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c index 7e77ed6ef6..b23c285dc1 100644 --- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c +++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c @@ -591,7 +591,7 @@ bld_get_address(struct bld_context *bld, int id, struct nv_value *indirect) static struct nv_value * -bld_predicate(struct bld_context *bld, struct nv_value *src) +bld_predicate(struct bld_context *bld, struct nv_value *src, boolean bool_only) { struct nv_instruction *nvi = src->insn; @@ -600,6 +600,14 @@ bld_predicate(struct bld_context *bld, struct nv_value *src) nvi->bb != bld->pc->current_block) { nvi = new_instruction(bld->pc, NV_OP_CVT); nv_reference(bld->pc, &nvi->src[0], src); + } else + if (bool_only) { + while (nvi->opcode == NV_OP_ABS || nvi->opcode == NV_OP_CVT || + nvi->opcode == NV_OP_NEG) { + /* TGSI SET gets conversion to f32, we only need source 0/~0 */ + if (!nvi->def[0]->insn->flags_src) + nvi = nvi->src[0]->value->insn; + } } if (!nvi->flags_def) { @@ -614,7 +622,7 @@ bld_kil(struct bld_context *bld, struct nv_value *src) { struct nv_instruction *nvi; - src = bld_predicate(bld, src); + src = bld_predicate(bld, src, FALSE); nvi = new_instruction(bld->pc, NV_OP_KIL); nvi->fixed = 1; nvi->flags_src = new_ref(bld->pc, src); @@ -1223,7 +1231,7 @@ bld_instruction(struct bld_context *bld, src0 = emit_fetch(bld, insn, 0, c); src1 = emit_fetch(bld, insn, 1, c); src2 = emit_fetch(bld, insn, 2, c); - src0 = bld_predicate(bld, src0); + src0 = bld_predicate(bld, src0, FALSE); src1 = bld_insn_1(bld, NV_OP_MOV, src1); src1->insn->flags_src = new_ref(bld->pc, src0); @@ -1304,7 +1312,7 @@ bld_instruction(struct bld_context *bld, bld->join_bb[bld->cond_lvl] = bld->pc->current_block; bld->cond_bb[bld->cond_lvl] = bld->pc->current_block; - src1 = bld_predicate(bld, emit_fetch(bld, insn, 0, 0)); + src1 = bld_predicate(bld, emit_fetch(bld, insn, 0, 0), TRUE); bld_flow(bld, NV_OP_BRA, NV_CC_EQ, src1, NULL, FALSE); 
-- cgit v1.2.3 From 6c5c55723d32f8933ffb5fc6b5beb209eca84ca8 Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Mon, 16 Aug 2010 17:18:30 +0200 Subject: nv50: fix thinko in store to output reg possible check --- src/gallium/drivers/nv50/nv50_pc_optimize.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src') diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c index 4cf387257d..5d575461ca 100644 --- a/src/gallium/drivers/nv50/nv50_pc_optimize.c +++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c @@ -315,7 +315,7 @@ nv_pass_fold_stores(struct nv_pass *ctx, struct nv_basic_block *b) for (j = 0; j < 4 && nvi->src[j]; ++j) if (nvi->src[j]->value->reg.file == NV_FILE_IMM) break; - if (j < 4) + if (j < 4 && nvi->src[j]) continue; nvi->def[0] = sti->def[0]; -- cgit v1.2.3 From 62f933a6f617050a267079b27360eaae2d0e1a70 Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Mon, 16 Aug 2010 18:00:39 +0200 Subject: nv50: generate JOINs for outermost IF clauses --- src/gallium/drivers/nv50/nv50_pc.h | 3 ++- src/gallium/drivers/nv50/nv50_pc_emit.c | 11 ++++++++++- src/gallium/drivers/nv50/nv50_pc_optimize.c | 16 +++++++++++++--- src/gallium/drivers/nv50/nv50_pc_print.c | 1 + src/gallium/drivers/nv50/nv50_tgsi_to_nc.c | 11 +++++------ 5 files changed, 31 insertions(+), 11 deletions(-) (limited to 'src') diff --git a/src/gallium/drivers/nv50/nv50_pc.h b/src/gallium/drivers/nv50/nv50_pc.h index 28208ad247..d24375100d 100644 --- a/src/gallium/drivers/nv50/nv50_pc.h +++ b/src/gallium/drivers/nv50/nv50_pc.h @@ -83,7 +83,8 @@ #define NV_OP_NOP 53 #define NV_OP_SELECT 54 #define NV_OP_EXPORT 55 -#define NV_OP_COUNT 56 +#define NV_OP_JOIN 56 +#define NV_OP_COUNT 57 #define NV_FILE_GPR 0 #define NV_FILE_OUT 1 diff --git a/src/gallium/drivers/nv50/nv50_pc_emit.c b/src/gallium/drivers/nv50/nv50_pc_emit.c index fe44b327ab..3a3b277c13 100644 --- a/src/gallium/drivers/nv50/nv50_pc_emit.c +++ 
b/src/gallium/drivers/nv50/nv50_pc_emit.c @@ -38,7 +38,7 @@ const ubyte nv50_inst_min_size_tab[NV_OP_COUNT] = 0, 0, 0, 8, 8, 4, 4, 4, 8, 4, 4, 8, 8, 8, 8, 8, /* 15 */ 8, 8, 8, 4, 0, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, /* 31 */ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, /* 47 */ - 4, 8, 8, 8, 8, 8, 0, 0 + 4, 8, 8, 8, 8, 8, 0, 0, 8 }; /* XXX: silence, you ! */ @@ -71,6 +71,9 @@ nv50_inst_min_size(struct nv_instruction *i) if (i->flags_def || i->flags_src || i->src[4]) return 8; + if (i->is_join) + return 8; + if (i->src[2]) { if (i->saturate || i->src[2]->mod) return 8; @@ -1126,6 +1129,7 @@ nv50_emit_instruction(struct nv_pc *pc, struct nv_instruction *i) emit_flow(pc, i, 0xa); break; case NV_OP_NOP: + case NV_OP_JOIN: pc->emit[0] = 0xf0000001; pc->emit[1] = 0xe0000000; break; @@ -1141,5 +1145,10 @@ nv50_emit_instruction(struct nv_pc *pc, struct nv_instruction *i) break; } + if (i->is_join) { + assert(i->is_long && !(pc->emit[1] & 1)); + pc->emit[1] |= 2; + } + assert((pc->emit[0] & 1) == i->is_long); } diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c index 5d575461ca..b35dd72841 100644 --- a/src/gallium/drivers/nv50/nv50_pc_optimize.c +++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c @@ -80,7 +80,7 @@ inst_commutation_legal(struct nv_instruction *a, static INLINE boolean inst_cullable(struct nv_instruction *nvi) { - return (!(nvi->is_terminator || + return (!(nvi->is_terminator || nvi->is_join || nvi->target || nvi->fixed || nv_nvi_refcount(nvi))); @@ -95,7 +95,8 @@ nvi_isnop(struct nv_instruction *nvi) if (nvi->fixed || nvi->is_terminator || nvi->flags_src || - nvi->flags_def) + nvi->flags_def || + nvi->is_join) return FALSE; if (nvi->def[0]->join->reg.id < 0) @@ -934,7 +935,7 @@ nv_pass_flatten(struct nv_pass *ctx, struct nv_basic_block *b) if (bb_is_if_else_endif(b)) { - debug_printf("nv_pass_flatten: IF/ELSE/ENDIF construct at BB:%i\n", b->id); + debug_printf("pass_flatten: IF/ELSE/ENDIF construct at 
BB:%i\n", b->id); for (n0 = 0, nvi = b->out[0]->entry; nvi; nvi = nvi->next, ++n0) if (!nv50_nvi_can_predicate(nvi)) @@ -959,6 +960,15 @@ nv_pass_flatten(struct nv_pass *ctx, struct nv_basic_block *b) assert(b->exit && b->exit->opcode == NV_OP_BRA); nv_nvi_delete(b->exit); + + if (b->exit && b->exit->opcode == NV_OP_JOINAT) + nv_nvi_delete(b->exit); + + if ((nvi = b->out[0]->out[0]->entry)) { + nvi->is_join = 0; + if (nvi->opcode == NV_OP_JOIN) + nv_nvi_delete(nvi); + } } } DESCEND_ARBITRARY(i, nv_pass_flatten); diff --git a/src/gallium/drivers/nv50/nv50_pc_print.c b/src/gallium/drivers/nv50/nv50_pc_print.c index a4f567bde4..7bdeb1c78d 100644 --- a/src/gallium/drivers/nv50/nv50_pc_print.c +++ b/src/gallium/drivers/nv50/nv50_pc_print.c @@ -95,6 +95,7 @@ static const char *nv_opcode_names[NV_OP_COUNT + 1] = { "nop", "select", "export", + "join", "BAD_OP" }; diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c index b23c285dc1..d6c5a8d660 100644 --- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c +++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c @@ -1314,7 +1314,7 @@ bld_instruction(struct bld_context *bld, src1 = bld_predicate(bld, emit_fetch(bld, insn, 0, 0), TRUE); - bld_flow(bld, NV_OP_BRA, NV_CC_EQ, src1, NULL, FALSE); + bld_flow(bld, NV_OP_BRA, NV_CC_EQ, src1, NULL, (bld->cond_lvl == 0)); ++bld->cond_lvl; bld_new_block(bld, b); @@ -1346,13 +1346,12 @@ bld_instruction(struct bld_context *bld, bld->cond_bb[bld->cond_lvl]->exit->target = b; - if (0 && bld->join_bb[bld->cond_lvl]) { - bld->join_bb[bld->cond_lvl]->exit->prev->target = b; + bld_new_block(bld, b); - new_instruction(bld->pc, NV_OP_NOP)->is_join = TRUE; + if (!bld->cond_lvl && bld->join_bb[bld->cond_lvl]) { + bld->join_bb[bld->cond_lvl]->exit->prev->target = b; + new_instruction(bld->pc, NV_OP_JOIN)->is_join = TRUE; } - - bld_new_block(bld, b); } break; case TGSI_OPCODE_BGNLOOP: -- cgit v1.2.3 From ce1629564d1cce80b2762d266640e3181a68e848 Mon Sep 17 00:00:00 
2001 From: Christoph Bumiller Date: Tue, 17 Aug 2010 11:51:51 +0200 Subject: nv50: more TGSI opcodes (SIN, SCS, ARL, RET, KILP) --- src/gallium/drivers/nv50/nv50_pc_emit.c | 22 +++++++++++++++++ src/gallium/drivers/nv50/nv50_pc_optimize.c | 13 +++++----- src/gallium/drivers/nv50/nv50_tgsi_to_nc.c | 38 ++++++++++++++++++++++++++--- 3 files changed, 63 insertions(+), 10 deletions(-) (limited to 'src') diff --git a/src/gallium/drivers/nv50/nv50_pc_emit.c b/src/gallium/drivers/nv50/nv50_pc_emit.c index 3a3b277c13..b5f4383aa1 100644 --- a/src/gallium/drivers/nv50/nv50_pc_emit.c +++ b/src/gallium/drivers/nv50/nv50_pc_emit.c @@ -747,9 +747,31 @@ emit_bitop2(struct nv_pc *pc, struct nv_instruction *i) } } +static void +emit_arl(struct nv_pc *pc, struct nv_instruction *i) +{ + assert(SFILE(i, 0) == NV_FILE_GPR); + assert(SFILE(i, 1) == NV_FILE_IMM); + + assert(!i->flags_def); + + pc->emit[0] = 0x00000001; + pc->emit[1] = 0xc0000000; + + set_dst(pc, i->def[0]); + set_pred(pc, i); + set_src_0(pc, i->src[0]); + pc->emit[0] |= (get_immd_u32(i->src[1]) & 0x3f) << 16; +} + static void emit_shift(struct nv_pc *pc, struct nv_instruction *i) { + if (DFILE(i, 0) == NV_FILE_ADDR) { + emit_arl(pc, i); + return; + } + pc->emit[0] = 0x30000001; pc->emit[1] = 0xc4000000; diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c index b35dd72841..3e6e09a904 100644 --- a/src/gallium/drivers/nv50/nv50_pc_optimize.c +++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c @@ -293,14 +293,15 @@ check_swap_src_0_1(struct nv_instruction *nvi) static int nv_pass_fold_stores(struct nv_pass *ctx, struct nv_basic_block *b) { - struct nv_instruction *nvi, *sti; + struct nv_instruction *nvi, *sti, *next; int j; - for (sti = b->entry; sti; sti = sti->next) { - if (!sti->def[0] || sti->def[0]->reg.file != NV_FILE_OUT) - continue; + for (sti = b->entry; sti; sti = next) { + next = sti->next; /* only handling MOV to $oX here */ + if (!sti->def[0] || 
sti->def[0]->reg.file != NV_FILE_OUT) + continue; if (sti->opcode != NV_OP_MOV && sti->opcode != NV_OP_STA) continue; @@ -320,9 +321,9 @@ nv_pass_fold_stores(struct nv_pass *ctx, struct nv_basic_block *b) continue; nvi->def[0] = sti->def[0]; - sti->def[0] = NULL; nvi->fixed = sti->fixed; - sti->fixed = 0; + + nv_nvi_delete(sti); } DESCEND_ARBITRARY(j, nv_pass_fold_stores); diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c index d6c5a8d660..dafff725b8 100644 --- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c +++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c @@ -685,6 +685,8 @@ translate_opcode(uint opcode) case TGSI_OPCODE_CEIL: return NV_OP_CEIL; case TGSI_OPCODE_FLR: return NV_OP_FLOOR; case TGSI_OPCODE_TRUNC: return NV_OP_TRUNC; + case TGSI_OPCODE_COS: return NV_OP_COS; + case TGSI_OPCODE_SIN: return NV_OP_SIN; case TGSI_OPCODE_DDX: return NV_OP_DFDX; case TGSI_OPCODE_DDY: return NV_OP_DFDY; case TGSI_OPCODE_F2I: @@ -1226,6 +1228,14 @@ bld_instruction(struct bld_context *bld, dst0[c] = bld_insn_2(bld, opcode, src0, src1); } break; + case TGSI_OPCODE_ARL: + src1 = bld_imm_u32(bld, 4); + FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) { + src0 = emit_fetch(bld, insn, 0, c); + (temp = bld_insn_1(bld, NV_OP_FLOOR, temp))->reg.type = NV_TYPE_S32; + dst0[c] = bld_insn_2(bld, NV_OP_SHL, temp, src1); + } + break; case TGSI_OPCODE_CMP: FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) { src0 = emit_fetch(bld, insn, 0, c); @@ -1245,19 +1255,19 @@ bld_instruction(struct bld_context *bld, } break; case TGSI_OPCODE_COS: + case TGSI_OPCODE_SIN: src0 = emit_fetch(bld, insn, 0, 0); temp = bld_insn_1(bld, NV_OP_PRESIN, src0); if (insn->Dst[0].Register.WriteMask & 7) - temp = bld_insn_1(bld, NV_OP_COS, temp); + temp = bld_insn_1(bld, opcode, temp); for (c = 0; c < 3; ++c) if (insn->Dst[0].Register.WriteMask & (1 << c)) dst0[c] = temp; if (!(insn->Dst[0].Register.WriteMask & (1 << 3))) break; - /* XXX: if src0.x is src0.w, don't emit new insns */ src0 = 
emit_fetch(bld, insn, 0, 3); temp = bld_insn_1(bld, NV_OP_PRESIN, src0); - dst0[3] = bld_insn_1(bld, NV_OP_COS, temp); + dst0[3] = bld_insn_1(bld, opcode, temp); break; case TGSI_OPCODE_DP3: src0 = emit_fetch(bld, insn, 0, 0); @@ -1303,6 +1313,9 @@ bld_instruction(struct bld_context *bld, bld_kil(bld, src0); } break; + case TGSI_OPCODE_KILP: + (new_instruction(bld->pc, NV_OP_KIL))->fixed = 1; + break; case TGSI_OPCODE_IF: { struct nv_basic_block *b = new_basic_block(bld->pc); @@ -1496,6 +1509,20 @@ bld_instruction(struct bld_context *bld, dst0[c]->reg.type = NV_TYPE_F32; } break; + case TGSI_OPCODE_SCS: + if (insn->Dst[0].Register.WriteMask & 0x3) { + src0 = emit_fetch(bld, insn, 0, 0); + temp = bld_insn_1(bld, NV_OP_PRESIN, src0); + if (insn->Dst[0].Register.WriteMask & 0x1) + dst0[0] = bld_insn_1(bld, NV_OP_COS, temp); + if (insn->Dst[0].Register.WriteMask & 0x2) + dst0[1] = bld_insn_1(bld, NV_OP_SIN, temp); + } + if (insn->Dst[0].Register.WriteMask & 0x4) + dst0[2] = bld_imm_f32(bld, 0.0f); + if (insn->Dst[0].Register.WriteMask & 0x8) + dst0[3] = bld_imm_f32(bld, 1.0f); + break; case TGSI_OPCODE_SUB: FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) { src0 = emit_fetch(bld, insn, 0, c); @@ -1527,12 +1554,15 @@ bld_instruction(struct bld_context *bld, dst0[c]->insn->src[2]->mod ^= NV_MOD_NEG; } break; + case TGSI_OPCODE_RET: + (new_instruction(bld->pc, NV_OP_RET))->fixed = 1; + break; case TGSI_OPCODE_END: if (bld->ti->p->type == PIPE_SHADER_FRAGMENT) bld_export_outputs(bld); break; default: - NOUVEAU_ERR("nv_bld: unhandled opcode %u\n", insn->Instruction.Opcode); + NOUVEAU_ERR("unhandled opcode %u\n", insn->Instruction.Opcode); abort(); break; } -- cgit v1.2.3 From cb75082768d516d684a69588266b92b06e19b7bd Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Tue, 17 Aug 2010 13:07:12 +0200 Subject: nv50: fix PSIZ and PRIMID mapping Initializing map to 0x40 (0x80) instead of 0 now, so need to clear it first. 
--- src/gallium/drivers/nv50/nv50_shader_state.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'src') diff --git a/src/gallium/drivers/nv50/nv50_shader_state.c b/src/gallium/drivers/nv50/nv50_shader_state.c index 3d5df596ef..5f70df3662 100644 --- a/src/gallium/drivers/nv50/nv50_shader_state.c +++ b/src/gallium/drivers/nv50/nv50_shader_state.c @@ -496,16 +496,19 @@ nv50_fp_linkage_validate(struct nv50_context *nv50) m = nv50_vec4_map(map, m, lin, &fp->in[i], (n < vp->out_nr) ? &vp->out[n] : &dummy); } + /* PrimitiveID either is replaced by the system value, or * written by the geometry shader into an output register */ if (fp->gp.primid < 0x40) { - map[m / 4] |= vp->gp.primid << ((m % 4) * 8); + i = (m % 4) * 8; + map[m / 4] = (map[m / 4] & ~(0xff << i)) | (vp->gp.primid << i); primid = m++; } if (nv50->rasterizer->pipe.point_size_per_vertex) { - map[m / 4] |= vp->vp.psiz << ((m % 4) * 8); + i = (m % 4) * 8; + map[m / 4] = (map[m / 4] & ~(0xff << i)) | (vp->vp.psiz << i); psiz = (m++ << 4) | 1; } @@ -532,7 +535,6 @@ nv50_fp_linkage_validate(struct nv50_context *nv50) so_datap (so, map, n); } - //colors = 0x01000404; so_method(so, tesla, NV50TCL_MAP_SEMANTIC_0, 4); so_data (so, colors); so_data (so, clip); -- cgit v1.2.3 From 3e27785f3ebe6620805f97cb5c17ec8bd28bc1e8 Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Tue, 17 Aug 2010 15:27:56 +0200 Subject: nv50: check dst compatibility in CSE --- src/gallium/drivers/nv50/nv50_pc_optimize.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'src') diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c index 3e6e09a904..80f3bb34b0 100644 --- a/src/gallium/drivers/nv50/nv50_pc_optimize.c +++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c @@ -1007,6 +1007,13 @@ nv_pass_cse(struct nv_pass *ctx, struct nv_basic_block *b) ik->flags_def || ir->flags_def) continue; /* and also not with flags, for now */ + assert(ik->def[0] && ir->def[0]); + + 
if (ik->def[0]->reg.file == NV_FILE_OUT || + ir->def[0]->reg.file == NV_FILE_OUT || + !values_equal(ik->def[0], ir->def[0])) + continue; + for (s = 0; s < 3; ++s) { struct nv_value *a, *b; -- cgit v1.2.3 From 1bbbc8e0c8230d33cb1eae89cc47b5296edefc10 Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Tue, 17 Aug 2010 19:03:11 +0200 Subject: nv50: initialize edgeflag input index --- src/gallium/drivers/nv50/nv50_program.c | 1 + 1 file changed, 1 insertion(+) (limited to 'src') diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c index 54cd36f868..d47941d3b1 100644 --- a/src/gallium/drivers/nv50/nv50_program.c +++ b/src/gallium/drivers/nv50/nv50_program.c @@ -484,6 +484,7 @@ nv50_prog_scan(struct nv50_translation_info *ti) struct tgsi_parse_context parse; int ret; + p->vp.edgeflag = 0x40; p->vp.psiz = 0x40; p->vp.bfc[0] = 0x40; p->vp.bfc[1] = 0x40; -- cgit v1.2.3 From eaab76457818fad0926b84c663440e8987e1f19f Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Wed, 18 Aug 2010 14:36:47 +0200 Subject: nv50: emit predicate for interp --- src/gallium/drivers/nv50/nv50_pc_emit.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'src') diff --git a/src/gallium/drivers/nv50/nv50_pc_emit.c b/src/gallium/drivers/nv50/nv50_pc_emit.c index b5f4383aa1..bc151c3a80 100644 --- a/src/gallium/drivers/nv50/nv50_pc_emit.c +++ b/src/gallium/drivers/nv50/nv50_pc_emit.c @@ -596,8 +596,12 @@ emit_interp(struct nv_pc *pc, struct nv_instruction *i) if (i->centroid) pc->emit[0] |= 1 << 24; + assert(i->is_long || !i->flags_src); + if (i->is_long) { - pc->emit[1] |= 0x0780 | + set_pred(pc, i); + + pc->emit[1] |= (pc->emit[0] & (3 << 24)) >> (24 - 16) | (pc->emit[0] & (1 << 8)) >> (18 - 8); -- cgit v1.2.3 From 33f45c5a8afd353ad9bbd8647fa5c6dfc59cdfd7 Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Sun, 22 Aug 2010 22:59:01 +0200 Subject: nv50: DP2, fix ARL --- src/gallium/drivers/nv50/nv50_tgsi_to_nc.c | 44 
++++++++++++++++++------------ 1 file changed, 27 insertions(+), 17 deletions(-) (limited to 'src') diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c index dafff725b8..7b2ccef704 100644 --- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c +++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c @@ -1199,6 +1199,25 @@ bld_tex(struct bld_context *bld, struct nv_value *dst0[4], nvi->tex_argc = arg; } +static INLINE struct nv_value * +bld_dot(struct bld_context *bld, const struct tgsi_full_instruction *insn, + int n) +{ + struct nv_value *dotp, *src0, *src1; + int c; + + src0 = emit_fetch(bld, insn, 0, 0); + src1 = emit_fetch(bld, insn, 1, 0); + dotp = bld_insn_2(bld, NV_OP_MUL, src0, src1); + + for (c = 1; c < n; ++c) { + src0 = emit_fetch(bld, insn, 0, c); + src1 = emit_fetch(bld, insn, 1, c); + dotp = bld_insn_3(bld, NV_OP_MAD, src0, src1, dotp); + } + return dotp; +} + #define FOR_EACH_DST0_ENABLED_CHANNEL(chan, inst) \ for (chan = 0; chan < 4; ++chan) \ if ((inst)->Dst[0].Register.WriteMask & (1 << chan)) @@ -1232,7 +1251,7 @@ bld_instruction(struct bld_context *bld, src1 = bld_imm_u32(bld, 4); FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) { src0 = emit_fetch(bld, insn, 0, c); - (temp = bld_insn_1(bld, NV_OP_FLOOR, temp))->reg.type = NV_TYPE_S32; + (temp = bld_insn_1(bld, NV_OP_FLOOR, src0))->reg.type = NV_TYPE_S32; dst0[c] = bld_insn_2(bld, NV_OP_SHL, temp, src1); } break; @@ -1269,27 +1288,18 @@ bld_instruction(struct bld_context *bld, temp = bld_insn_1(bld, NV_OP_PRESIN, src0); dst0[3] = bld_insn_1(bld, opcode, temp); break; + case TGSI_OPCODE_DP2: + temp = bld_dot(bld, insn, 2); + FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) + dst0[c] = temp; + break; case TGSI_OPCODE_DP3: - src0 = emit_fetch(bld, insn, 0, 0); - src1 = emit_fetch(bld, insn, 1, 0); - temp = bld_insn_2(bld, NV_OP_MUL, src0, src1); - for (c = 1; c < 3; ++c) { - src0 = emit_fetch(bld, insn, 0, c); - src1 = emit_fetch(bld, insn, 1, c); - temp = bld_insn_3(bld, NV_OP_MAD, 
src0, src1, temp); - } + temp = bld_dot(bld, insn, 3); FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) dst0[c] = temp; break; case TGSI_OPCODE_DP4: - src0 = emit_fetch(bld, insn, 0, 0); - src1 = emit_fetch(bld, insn, 1, 0); - temp = bld_insn_2(bld, NV_OP_MUL, src0, src1); - for (c = 1; c < 4; ++c) { - src0 = emit_fetch(bld, insn, 0, c); - src1 = emit_fetch(bld, insn, 1, c); - temp = bld_insn_3(bld, NV_OP_MAD, src0, src1, temp); - } + temp = bld_dot(bld, insn, 4); FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) dst0[c] = temp; break; -- cgit v1.2.3 From 0df5e84b01f5420e37006a32c916835af2aa4314 Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Sun, 22 Aug 2010 23:09:55 +0200 Subject: nv50: yet another case we need a nop.exit --- src/gallium/drivers/nv50/nv50_pc.c | 2 +- src/gallium/drivers/nv50/nv50_shader_state.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'src') diff --git a/src/gallium/drivers/nv50/nv50_pc.c b/src/gallium/drivers/nv50/nv50_pc.c index 5041fc7505..b9d274414d 100644 --- a/src/gallium/drivers/nv50/nv50_pc.c +++ b/src/gallium/drivers/nv50/nv50_pc.c @@ -327,7 +327,7 @@ nv50_emit_program(struct nv_pc *pc) assert(pc->emit == &code[pc->bin_size / 4]); /* XXX: we can do better than this ... 
*/ - if (!(pc->emit[-2] & 1) || (pc->emit[-2] & 2) || (pc->emit[-1] & 3) == 3) { + if (!(pc->emit[-2] & 1) || (pc->emit[-2] & 2) || (pc->emit[-1] & 3)) { pc->emit[0] = 0xf0000001; pc->emit[1] = 0xe0000000; pc->bin_size += 8; diff --git a/src/gallium/drivers/nv50/nv50_shader_state.c b/src/gallium/drivers/nv50/nv50_shader_state.c index 5f70df3662..a244753c4d 100644 --- a/src/gallium/drivers/nv50/nv50_shader_state.c +++ b/src/gallium/drivers/nv50/nv50_shader_state.c @@ -547,7 +547,7 @@ nv50_fp_linkage_validate(struct nv50_context *nv50) so_method(so, tesla, NV50TCL_NOPERSPECTIVE_BITMAP(0), 4); so_datap (so, lin, 4); - if (nv50->rasterizer->pipe.sprite_coord_enable) { + if (nv50->rasterizer->pipe.sprite_coord_enable) { /* XXX: gl_PointCoord */ so_method(so, tesla, NV50TCL_POINT_SPRITE_CTRL, 1); so_data (so, nv50_pntc_replace(nv50, pntc, (interp >> 8) & 0xff)); -- cgit v1.2.3 From bae181f78d6ff5e37ef3c022563b2077c0247c2b Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Mon, 23 Aug 2010 14:25:13 +0200 Subject: nv50: fix check for sprite/point coord enable --- src/gallium/drivers/nv50/nv50_shader_state.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'src') diff --git a/src/gallium/drivers/nv50/nv50_shader_state.c b/src/gallium/drivers/nv50/nv50_shader_state.c index a244753c4d..f187a074e6 100644 --- a/src/gallium/drivers/nv50/nv50_shader_state.c +++ b/src/gallium/drivers/nv50/nv50_shader_state.c @@ -384,7 +384,7 @@ nv50_pntc_replace(struct nv50_context *nv50, uint32_t pntc[8], unsigned m) break; if (j < vp->out_nr) { - ubyte en = nv50->rasterizer->pipe.sprite_coord_enable; + uint32_t en = nv50->rasterizer->pipe.sprite_coord_enable; if (!(en & (1 << vp->out[j].si))) { m += n; @@ -547,7 +547,7 @@ nv50_fp_linkage_validate(struct nv50_context *nv50) so_method(so, tesla, NV50TCL_NOPERSPECTIVE_BITMAP(0), 4); so_datap (so, lin, 4); - if (nv50->rasterizer->pipe.sprite_coord_enable) { /* XXX: gl_PointCoord */ + if 
(nv50->rasterizer->pipe.point_quad_rasterization) { so_method(so, tesla, NV50TCL_POINT_SPRITE_CTRL, 1); so_data (so, nv50_pntc_replace(nv50, pntc, (interp >> 8) & 0xff)); -- cgit v1.2.3 From db1874272c325e3e19fb7f386ec82f36e7a24496 Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Tue, 24 Aug 2010 11:21:06 +0200 Subject: nv50: handle TEXTURE_SWIZZLE and GEOMETRY_SHADER4 caps GP support will probably be re-added soon. --- src/gallium/drivers/nv50/nv50_screen.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'src') diff --git a/src/gallium/drivers/nv50/nv50_screen.c b/src/gallium/drivers/nv50/nv50_screen.c index 78137d6940..fc75d81d54 100644 --- a/src/gallium/drivers/nv50/nv50_screen.c +++ b/src/gallium/drivers/nv50/nv50_screen.c @@ -84,6 +84,8 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) return 1; case PIPE_CAP_GLSL: return 1; + case PIPE_CAP_GEOMETRY_SHADER4: + return 0; case PIPE_CAP_ANISOTROPIC_FILTER: return 1; case PIPE_CAP_POINT_SPRITE: @@ -105,6 +107,8 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_TEXTURE_MIRROR_CLAMP: case PIPE_CAP_TEXTURE_MIRROR_REPEAT: return 1; + case PIPE_CAP_TEXTURE_SWIZZLE: + return 1; case PIPE_CAP_TGSI_CONT_SUPPORTED: return 1; case PIPE_CAP_BLEND_EQUATION_SEPARATE: -- cgit v1.2.3 From 3844c365947082550565accefd996c10fbb15cc4 Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Sat, 28 Aug 2010 17:05:11 +0200 Subject: nv50: set the FragDepth output index --- src/gallium/drivers/nv50/nv50_program.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src') diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c index d47941d3b1..d4a75dc64a 100644 --- a/src/gallium/drivers/nv50/nv50_program.c +++ b/src/gallium/drivers/nv50/nv50_program.c @@ -461,7 +461,7 @@ nv50_fragprog_prepare(struct nv50_translation_info *ti) } if (depr < p->out_nr) { p->out[depr].mask = 0x4; - p->out[depr].hw = p->max_out++; + 
p->out[depr].hw = ti->output_map[depr][2] = p->max_out++; } return 0; -- cgit v1.2.3 From d90502b2b468732e2a42985580bbbe9d9fdfd14e Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Tue, 31 Aug 2010 13:17:07 +0200 Subject: nv50: turn off verbose debug output by default --- src/gallium/drivers/nv50/nv50_pc.c | 12 ++++++--- src/gallium/drivers/nv50/nv50_pc.h | 6 +++++ src/gallium/drivers/nv50/nv50_pc_emit.c | 11 ++++---- src/gallium/drivers/nv50/nv50_pc_optimize.c | 30 ++++++---------------- src/gallium/drivers/nv50/nv50_pc_regalloc.c | 40 ++++++----------------------- src/gallium/drivers/nv50/nv50_program.c | 9 ++++--- src/gallium/drivers/nv50/nv50_tgsi_to_nc.c | 12 +++++---- 7 files changed, 49 insertions(+), 71 deletions(-) (limited to 'src') diff --git a/src/gallium/drivers/nv50/nv50_pc.c b/src/gallium/drivers/nv50/nv50_pc.c index b9d274414d..1c12fe1b9e 100644 --- a/src/gallium/drivers/nv50/nv50_pc.c +++ b/src/gallium/drivers/nv50/nv50_pc.c @@ -20,6 +20,8 @@ * SOFTWARE. */ +/* #define NV50PC_DEBUG */ + #include "nv50_pc.h" #include "nv50_program.h" @@ -311,7 +313,7 @@ nv50_emit_program(struct nv_pc *pc) uint32_t *code = pc->emit; int n; - debug_printf("emitting program: size = %u\n", pc->bin_size); + NV50_DBGMSG("emitting program: size = %u\n", pc->bin_size); for (n = 0; n < pc->num_blocks; ++n) { struct nv_instruction *i; @@ -336,7 +338,9 @@ nv50_emit_program(struct nv_pc *pc) pc->emit = code; code[pc->bin_size / 4 - 1] |= 1; +#ifdef NV50PC_DEBUG nvcg_show_bincode(pc); +#endif return 0; } @@ -354,7 +358,9 @@ nv50_generate_code(struct nv50_translation_info *ti) ret = nv50_tgsi_to_nc(pc, ti); if (ret) goto out; +#ifdef NV50PC_DEBUG nv_print_program(pc->root); +#endif /* optimization */ ret = nv_pc_exec_pass0(pc); @@ -392,7 +398,7 @@ nv50_generate_code(struct nv50_translation_info *ti) ti->p->fixups = pc->fixups; ti->p->num_fixups = pc->num_fixups; - debug_printf("SHADER TRANSLATION - %s\n", ret ? 
"failure" : "success"); + NV50_DBGMSG("SHADER TRANSLATION - %s\n", ret ? "failure" : "success"); out: nv_pc_free_refs(pc); @@ -492,7 +498,7 @@ nv_nvi_delete(struct nv_instruction *nvi) if (nvi == b->phi) { if (nvi->opcode != NV_OP_PHI) - debug_printf("NOTE: b->phi points to non-PHI instruction\n"); + NV50_DBGMSG("NOTE: b->phi points to non-PHI instruction\n"); assert(!nvi->prev); if (!nvi->next || nvi->next->opcode != NV_OP_PHI) diff --git a/src/gallium/drivers/nv50/nv50_pc.h b/src/gallium/drivers/nv50/nv50_pc.h index d24375100d..48918f46d5 100644 --- a/src/gallium/drivers/nv50/nv50_pc.h +++ b/src/gallium/drivers/nv50/nv50_pc.h @@ -23,6 +23,12 @@ #ifndef __NV50_COMPILER_H__ #define __NV50_COMPILER_H__ +#ifdef NV50PC_DEBUG +# define NV50_DBGMSG(args...) debug_printf(args) +#else +# define NV50_DBGMSG(args...) +#endif + #include "pipe/p_defines.h" #include "util/u_inlines.h" #include "util/u_memory.h" diff --git a/src/gallium/drivers/nv50/nv50_pc_emit.c b/src/gallium/drivers/nv50/nv50_pc_emit.c index bc151c3a80..7808335e50 100644 --- a/src/gallium/drivers/nv50/nv50_pc_emit.c +++ b/src/gallium/drivers/nv50/nv50_pc_emit.c @@ -239,8 +239,7 @@ set_dst(struct nv_pc *pc, struct nv_value *value) struct nv_reg *reg = &value->join->reg; if (reg->id < 0) { - debug_printf("WARNING: unused dst, hope we can bucket it !\n"); - pc->emit[0] |= 127 << 2; + pc->emit[0] |= (127 << 2) | 1; /* set 'long'-bit to catch bugs */ pc->emit[1] |= 0x8; return; } @@ -249,7 +248,7 @@ set_dst(struct nv_pc *pc, struct nv_value *value) pc->emit[1] |= 0x8; else if (reg->file == NV_FILE_ADDR) - assert(0); + assert(0); pc->emit[0] |= reg->id << 2; } @@ -801,8 +800,8 @@ emit_flop(struct nv_pc *pc, struct nv_instruction *i) pc->emit[0] = 0x90000000; - assert(SREG(src0)->type == NV_TYPE_F32); - assert(SREG(src0)->file == NV_FILE_GPR); + assert(STYPE(i, 0) == NV_TYPE_F32); + assert(SFILE(i, 0) == NV_FILE_GPR); if (!i->is_long) { emit_form_MUL(pc, i); @@ -1057,7 +1056,7 @@ emit_ddy(struct nv_pc *pc, struct 
nv_instruction *i) void nv50_emit_instruction(struct nv_pc *pc, struct nv_instruction *i) { - // nv_print_instruction(i); + /* nv_print_instruction(i); */ switch (i->opcode) { case NV_OP_MOV: diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c index 80f3bb34b0..4b1cd56fc1 100644 --- a/src/gallium/drivers/nv50/nv50_pc_optimize.c +++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c @@ -20,6 +20,8 @@ * SOFTWARE. */ +/* #define NV50PC_DEBUG */ + #include "nv50_pc.h" #define DESCEND_ARBITRARY(j, f) \ @@ -109,7 +111,7 @@ nvi_isnop(struct nv_instruction *nvi) return FALSE; if (nvi->src[0]->value->join->reg.id < 0) { - debug_printf("nvi_isnop: orphaned value detected\n"); + NV50_DBGMSG("nvi_isnop: orphaned value detected\n"); return TRUE; } @@ -176,9 +178,6 @@ nv_pc_pass_pre_emission(void *priv, struct nv_basic_block *b) nv50_inst_min_size(nvi->next) == 4 && inst_commutation_legal(nvi, nvi->next)) { ++n32; - debug_printf("permuting: "); - nv_print_instruction(nvi); - nv_print_instruction(nvi->next); nv_nvi_permute(nvi, nvi->next); next = nvi; } else { @@ -193,7 +192,7 @@ nv_pc_pass_pre_emission(void *priv, struct nv_basic_block *b) } if (!b->entry) { - debug_printf("block %p is now empty\n", b); + NV50_DBGMSG("block %p is now empty\n", b); } else if (!b->exit->is_long) { assert(n32); @@ -221,7 +220,7 @@ nv_pc_exec_pass2(struct nv_pc *pc) pc->pass_seq++; nv_pass_flatten(&pass, pc->root); - debug_printf("preparing %u blocks for emission\n", pc->num_blocks); + NV50_DBGMSG("preparing %u blocks for emission\n", pc->num_blocks); pc->bb_list = CALLOC(pc->num_blocks, sizeof(struct nv_basic_block *)); pc->num_blocks = 0; @@ -708,21 +707,6 @@ nv_pass_lower_arith(struct nv_pass *ctx, struct nv_basic_block *b) return 0; } -/* -set $r2 g f32 $r2 $r3 -cvt abs rn f32 $r2 s32 $r2 -cvt f32 $c0 # f32 $r2 -e $c0 bra 0x80 -*/ -#if 0 -static int -nv_pass_lower_cond(struct nv_pass *ctx, struct nv_basic_block *b) -{ - /* XXX: easier in IR builder 
for now */ - return 0; -} -#endif - /* TODO: redundant store elimination */ struct load_record { @@ -936,7 +920,7 @@ nv_pass_flatten(struct nv_pass *ctx, struct nv_basic_block *b) if (bb_is_if_else_endif(b)) { - debug_printf("pass_flatten: IF/ELSE/ENDIF construct at BB:%i\n", b->id); + NV50_DBGMSG("pass_flatten: IF/ELSE/ENDIF construct at BB:%i\n", b->id); for (n0 = 0, nvi = b->out[0]->entry; nvi; nvi = nvi->next, ++n0) if (!nv50_nvi_can_predicate(nvi)) @@ -945,11 +929,13 @@ nv_pass_flatten(struct nv_pass *ctx, struct nv_basic_block *b) for (n1 = 0, nvi = b->out[1]->entry; nvi; nvi = nvi->next, ++n1) if (!nv50_nvi_can_predicate(nvi)) break; +#ifdef NV50_PC_DEBUG if (nvi) { debug_printf("cannot predicate: "); nv_print_instruction(nvi); } } else { debug_printf("cannot predicate: "); nv_print_instruction(nvi); +#endif } if (!nvi && n0 < 12 && n1 < 12) { /* 12 as arbitrary limit */ diff --git a/src/gallium/drivers/nv50/nv50_pc_regalloc.c b/src/gallium/drivers/nv50/nv50_pc_regalloc.c index d45dd7f95f..59462cc11e 100644 --- a/src/gallium/drivers/nv50/nv50_pc_regalloc.c +++ b/src/gallium/drivers/nv50/nv50_pc_regalloc.c @@ -20,6 +20,8 @@ * SOFTWARE. */ +/* #define NV50PC_DEBUG */ + #include "nv50_context.h" #include "nv50_pc.h" @@ -112,15 +114,8 @@ add_range(struct nv_value *val, struct nv_basic_block *b, int end) if (bgn < b->entry->serial || bgn > b->exit->serial) bgn = b->entry->serial; - if (bgn > end) { - debug_printf("Aieee! 
BLOCK [%i, %i], RANGE [%i, %i)\n", - b->entry->serial, b->exit->serial, bgn, end); - } assert(bgn <= end); - if (bgn < val->insn->serial) - debug_printf("WARNING: leaking value %i ?\n", val->n); - add_range_ex(val, bgn, end, NULL); } @@ -559,12 +554,8 @@ pass_build_live_sets(struct nv_pc_pass *ctx, struct nv_basic_block *b) struct nv_instruction *i; int j, n, ret = 0; - debug_printf("pass_build_live_sets BB:%i\n", b->id); - - if (b->pass_seq >= ctx->pc->pass_seq) { - debug_printf("already visited\n"); + if (b->pass_seq >= ctx->pc->pass_seq) return 0; - } b->pass_seq = ctx->pc->pass_seq; /* slight hack for undecidedness: set phi = entry if it's undefined */ @@ -595,13 +586,10 @@ pass_build_live_sets(struct nv_pc_pass *ctx, struct nv_basic_block *b) break; assert(i->src[j]->value->insn); - if (nvbb_reachable_by(b, i->src[j]->value->insn->bb, b->out[n])) { + if (nvbb_reachable_by(b, i->src[j]->value->insn->bb, b->out[n])) live_set_add(b, i->src[j]->value); - debug_printf("BB:%i liveset + %i\n", b->id, i->src[j]->value->n); - } else { + else live_set_rem(b, i->src[j]->value); - debug_printf("BB:%i liveset - %i\n", b->id, i->src[j]->value->n); - } } } } @@ -653,7 +641,7 @@ static void collect_live_values(struct nv_basic_block *b, const int n) } } -/* NOTE: the live intervals of phi functions start the the first non-phi instruction */ +/* NOTE: the live intervals of phi functions start at the first non-phi insn. 
*/ static int pass_build_intervals(struct nv_pc_pass *ctx, struct nv_basic_block *b) { @@ -661,8 +649,6 @@ pass_build_intervals(struct nv_pc_pass *ctx, struct nv_basic_block *b) int j, s; const int n = (ctx->pc->num_values + 31) / 32; - debug_printf("building intervals for BB %i\n", b->id); - /* verify that first block does not have live-in values */ if (b->num_in == 0) for (j = 0; j < n; ++j) @@ -700,7 +686,6 @@ pass_build_intervals(struct nv_pc_pass *ctx, struct nv_basic_block *b) add_range(&ctx->pc->values[j], b, b->exit->serial + 1); } } - debug_printf("%s: looping through instructions now\n", __func__); i_stop = b->entry ? b->entry->prev : NULL; @@ -763,8 +748,6 @@ insert_ordered_tail(struct nv_value *list, struct nv_value *nval) { struct nv_value *elem = list->prev; - // debug_printf("inserting value %i\n", nval->n); - for (elem = list->prev; elem != list && elem->livei->bgn > nval->livei->bgn; elem = elem->prev); @@ -818,8 +801,6 @@ pass_linear_scan(struct nv_pc_pass *ctx, int iter) foreach_s(cur, tmp[0], &unhandled) { remove_from_list(cur); - /* debug_printf("handling value %i\n", cur->n); */ - foreach_s(val, tmp[1], &active) { if (livei_end(val) <= cur->livei->bgn) { reg_release(&free, val); @@ -878,23 +859,19 @@ nv_pc_exec_pass1(struct nv_pc *pc) struct nv_pc_pass *ctx; int i, ret; - debug_printf("REGISTER ALLOCATION - entering\n"); + NV50_DBGMSG("REGISTER ALLOCATION - entering\n"); ctx = CALLOC_STRUCT(nv_pc_pass); if (!ctx) return -1; ctx->pc = pc; - nv_print_program(ctx->pc->root); - ctx->insns = CALLOC(NV_PC_MAX_INSTRUCTIONS, sizeof(struct nv_instruction *)); pc->pass_seq++; ret = pass_generate_phi_movs(ctx, pc->root); assert(!ret); - nv_print_program(ctx->pc->root); - for (i = 0; i < pc->loop_nesting_bound; ++i) { pc->pass_seq++; ret = pass_build_live_sets(ctx, pc->root); @@ -934,8 +911,7 @@ nv_pc_exec_pass1(struct nv_pc *pc) for (i = 0; i < pc->num_values; ++i) livei_release(&pc->values[i]); - debug_printf("REGISTER ALLOCATION - leaving\n"); - 
nv_print_program(ctx->pc->root); + NV50_DBGMSG("REGISTER ALLOCATION - leaving\n"); out: FREE(ctx); diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c index d4a75dc64a..182a591eb3 100644 --- a/src/gallium/drivers/nv50/nv50_program.c +++ b/src/gallium/drivers/nv50/nv50_program.c @@ -20,6 +20,8 @@ * SOFTWARE. */ +/* #define NV50_PROGRAM_DEBUG */ + #include "nv50_program.h" #include "nv50_pc.h" #include "nv50_context.h" @@ -187,8 +189,6 @@ prog_immediate(struct nv50_translation_info *ti, int c; unsigned n = ++ti->immd32_nr; - tgsi_dump_immediate(imm); - if (n == (1 << (ffs(n) - 1))) ti->immd32 = REALLOC(ti->immd32, (n / 2) * 16, (n * 2) * 16); @@ -228,7 +228,6 @@ prog_decl(struct nv50_translation_info *ti, sn = decl->Semantic.Name; si = decl->Semantic.Index; } - tgsi_dump_declaration(decl); switch (decl->Declaration.File) { case TGSI_FILE_INPUT: @@ -492,6 +491,10 @@ nv50_prog_scan(struct nv50_translation_info *ti) tgsi_scan_shader(p->pipe.tokens, &ti->scan); +#ifdef NV50_PROGRAM_DEBUG + tgsi_dump(p->pipe.tokens, 0); +#endif + tgsi_parse_init(&parse, p->pipe.tokens); while (!tgsi_parse_end_of_tokens(&parse)) { tgsi_parse_token(&parse); diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c index 7b2ccef704..115b5df939 100644 --- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c +++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c @@ -20,6 +20,8 @@ * SOFTWARE. 
*/ +/* #define NV50_TGSI2NC_DEBUG */ + /* XXX: need to clean this up so we get the typecasting right more naturally */ #include @@ -1015,10 +1017,8 @@ emit_fetch(struct bld_context *bld, const struct tgsi_full_instruction *insn, abort(); break; } - if (!res) { - debug_printf("WARNING: undefined source value in TGSI instruction\n"); - return bld_load_imm_u32(bld, 0); - } + if (!res) + return bld_undef(bld, NV_FILE_GPR); switch (tgsi_util_get_full_src_register_sign_mode(src, chan)) { case TGSI_UTIL_SIGN_KEEP: @@ -1234,7 +1234,9 @@ bld_instruction(struct bld_context *bld, int c; uint opcode = translate_opcode(insn->Instruction.Opcode); - tgsi_dump_instruction(insn, 1); +#ifdef NV50_TGSI2NC_DEBUG + debug_printf("bld_instruction:"); tgsi_dump_instruction(insn, 1); +#endif switch (insn->Instruction.Opcode) { case TGSI_OPCODE_ADD: -- cgit v1.2.3 From 0a8292e096bc37eeb225bf7d3854b6b6edc4bceb Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Wed, 1 Sep 2010 17:54:56 +0200 Subject: nv50: attempt at making more complicated loops work Nested loops, and loops with multiple exits (BREAK, CONT). 
--- src/gallium/drivers/nv50/nv50_pc.c | 20 +++++-- src/gallium/drivers/nv50/nv50_pc.h | 6 ++ src/gallium/drivers/nv50/nv50_pc_optimize.c | 14 +++-- src/gallium/drivers/nv50/nv50_pc_regalloc.c | 85 ++++++++++++++++++----------- src/gallium/drivers/nv50/nv50_tgsi_to_nc.c | 70 +++++++++++++++++++----- 5 files changed, 138 insertions(+), 57 deletions(-) (limited to 'src') diff --git a/src/gallium/drivers/nv50/nv50_pc.c b/src/gallium/drivers/nv50/nv50_pc.c index 1c12fe1b9e..b03f5b27f6 100644 --- a/src/gallium/drivers/nv50/nv50_pc.c +++ b/src/gallium/drivers/nv50/nv50_pc.c @@ -220,6 +220,7 @@ edge_name(ubyte type) case CFG_EDGE_BACK: return "back"; case CFG_EDGE_LOOP_ENTER: return "loop"; case CFG_EDGE_LOOP_LEAVE: return "break"; + case CFG_EDGE_FAKE: return "fake"; default: return "?"; } @@ -247,6 +248,7 @@ nv_pc_pass_in_order(struct nv_basic_block *root, nv_pc_pass_func f, void *priv) case CFG_EDGE_BACK: continue; case CFG_EDGE_FORWARD: + case CFG_EDGE_FAKE: if (++b->out[j]->priv == b->out[j]->num_in) bb[p++] = b->out[j]; break; @@ -264,9 +266,11 @@ nv_pc_pass_in_order(struct nv_basic_block *root, nv_pc_pass_func f, void *priv) f(priv, b); - if (!p) - while (pp > 0) - bb[p++] = bbb[--pp]; + if (!p) { + p = pp; + for (; pp > 0; --pp) + bb[pp - 1] = bbb[pp - 1]; + } } } @@ -366,11 +370,17 @@ nv50_generate_code(struct nv50_translation_info *ti) ret = nv_pc_exec_pass0(pc); if (ret) goto out; +#ifdef NV50PC_DEBUG + nv_print_program(pc->root); +#endif /* register allocation */ ret = nv_pc_exec_pass1(pc); if (ret) goto out; +#ifdef NV50PC_DEBUG + nv_print_program(pc->root); +#endif /* prepare for emission */ ret = nv_pc_exec_pass2(pc); @@ -580,10 +590,10 @@ nvbb_reachable_by(struct nv_basic_block *bf, struct nv_basic_block *bp, if (bp == bt) return FALSE; - if (bp->out[0] && bp->out_kind[0] != CFG_EDGE_BACK && + if (bp->out[0] && !IS_WALL_EDGE(bp->out_kind[0]) && nvbb_reachable_by(bf, bp->out[0], bt)) return TRUE; - if (bp->out[1] && bp->out_kind[1] != CFG_EDGE_BACK && + if 
(bp->out[1] && !IS_WALL_EDGE(bp->out_kind[1]) && nvbb_reachable_by(bf, bp->out[1], bt)) return TRUE; return FALSE; diff --git a/src/gallium/drivers/nv50/nv50_pc.h b/src/gallium/drivers/nv50/nv50_pc.h index 48918f46d5..2bb3ea4374 100644 --- a/src/gallium/drivers/nv50/nv50_pc.h +++ b/src/gallium/drivers/nv50/nv50_pc.h @@ -257,6 +257,12 @@ struct nv_instruction { #define CFG_EDGE_BACK 1 #define CFG_EDGE_LOOP_ENTER 2 #define CFG_EDGE_LOOP_LEAVE 4 +#define CFG_EDGE_FAKE 8 + +/* 'WALL' edge means where reachability check doesn't follow */ +/* 'LOOP' edge means just having to do with loops */ +#define IS_LOOP_EDGE(k) ((k) & 7) +#define IS_WALL_EDGE(k) ((k) & 9) struct nv_basic_block { struct nv_instruction *entry; /* first non-phi instruction */ diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c index 4b1cd56fc1..1d2710a8ac 100644 --- a/src/gallium/drivers/nv50/nv50_pc_optimize.c +++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c @@ -362,6 +362,9 @@ nv_pass_fold_loads(struct nv_pass *ctx, struct nv_basic_block *b) nv_reference(ctx->pc, &nvi->src[j], ld->src[0]->value); if (ld->src[4]) nv_reference(ctx->pc, &nvi->src[4], ld->src[4]->value); + + if (!nv_nvi_refcount(ld)) + nv_nvi_delete(ld); } } DESCEND_ARBITRARY(j, nv_pass_fold_loads); @@ -504,7 +507,7 @@ constant_expression(struct nv_pc *pc, struct nv_instruction *nvi, u1.u32 = src1->reg.imm.u32; modifiers_apply(&u0.u32, type, nvi->src[0]->mod); - modifiers_apply(&u0.u32, type, nvi->src[1]->mod); + modifiers_apply(&u1.u32, type, nvi->src[1]->mod); switch (nvi->opcode) { case NV_OP_MAD: @@ -951,7 +954,9 @@ nv_pass_flatten(struct nv_pass *ctx, struct nv_basic_block *b) if (b->exit && b->exit->opcode == NV_OP_JOINAT) nv_nvi_delete(b->exit); - if ((nvi = b->out[0]->out[0]->entry)) { + i = (b->out[0]->out_kind[0] == CFG_EDGE_LOOP_LEAVE) ? 
1 : 0; + + if ((nvi = b->out[0]->out[i]->entry)) { nvi->is_join = 0; if (nvi->opcode == NV_OP_JOIN) nv_nvi_delete(nvi); @@ -980,7 +985,8 @@ nv_pass_cse(struct nv_pass *ctx, struct nv_basic_block *b) if (ir->opcode != ik->opcode) continue; - if (ik->opcode == NV_OP_LDA || + if (!ir->def[0] || !ik->def[0] || + ik->opcode == NV_OP_LDA || ik->opcode == NV_OP_STA || ik->opcode == NV_OP_MOV || nv_is_vector_op(ik->opcode)) @@ -993,8 +999,6 @@ nv_pass_cse(struct nv_pass *ctx, struct nv_basic_block *b) ik->flags_def || ir->flags_def) continue; /* and also not with flags, for now */ - assert(ik->def[0] && ir->def[0]); - if (ik->def[0]->reg.file == NV_FILE_OUT || ir->def[0]->reg.file == NV_FILE_OUT || !values_equal(ik->def[0], ir->def[0])) diff --git a/src/gallium/drivers/nv50/nv50_pc_regalloc.c b/src/gallium/drivers/nv50/nv50_pc_regalloc.c index 59462cc11e..81decf8d4a 100644 --- a/src/gallium/drivers/nv50/nv50_pc_regalloc.c +++ b/src/gallium/drivers/nv50/nv50_pc_regalloc.c @@ -22,6 +22,10 @@ /* #define NV50PC_DEBUG */ +/* #define NV50_RA_DEBUG_LIVEI */ +/* #define NV50_RA_DEBUG_LIVE_SETS */ +/* #define NV50_RA_DEBUG_JOIN */ + #include "nv50_context.h" #include "nv50_pc.h" @@ -119,7 +123,7 @@ add_range(struct nv_value *val, struct nv_basic_block *b, int end) add_range_ex(val, bgn, end, NULL); } -#ifdef NV50_RA_DEBUG_JOIN +#if defined(NV50_RA_DEBUG_JOIN) || defined(NV50_RA_DEBUG_LIVEI) static void livei_print(struct nv_value *a) { @@ -359,16 +363,37 @@ need_new_else_block(struct nv_basic_block *b, struct nv_basic_block *p) int i = 0, n = 0; for (; i < 2; ++i) - if (p->out[i] && p->out_kind[i] != CFG_EDGE_LOOP_LEAVE) + if (p->out[i] && !IS_LOOP_EDGE(p->out_kind[i])) ++n; return (b->num_in > 1) && (n == 2); } +static int +phi_opnd_for_bb(struct nv_instruction *phi, struct nv_basic_block *b, + struct nv_basic_block *tb) +{ + int i, j; + + for (j = -1, i = 0; i < 4 && phi->src[i]; ++i) { + if (!nvbb_reachable_by(b, phi->src[i]->value->insn->bb, tb)) + continue; + /* NOTE: 
back-edges are ignored by the reachable-by check */ + if (j < 0 || !nvbb_reachable_by(phi->src[j]->value->insn->bb, + phi->src[i]->value->insn->bb, tb)) + j = i; + } + return j; +} + /* For each operand of each PHI in b, generate a new value by inserting a MOV * at the end of the block it is coming from and replace the operand with its * result. This eliminates liveness conflicts and enables us to let values be * copied to the right register if such a conflict exists nonetheless. + * + * These MOVs are also crucial in making sure the live intervals of phi srces + * are extended until the end of the loop, since they are not included in the + * live-in sets. */ static int pass_generate_phi_movs(struct nv_pc_pass *ctx, struct nv_basic_block *b) @@ -404,14 +429,17 @@ pass_generate_phi_movs(struct nv_pc_pass *ctx, struct nv_basic_block *b) ctx->pc->current_block = pn; for (i = b->phi; i && i->opcode == NV_OP_PHI; i = i->next) { - for (j = 0; j < 4 && i->src[j]; ++j) { - if (nvbb_reachable_by(p, i->src[j]->value->insn->bb, b)) - break; - } - if (j >= 4 || !i->src[j]) + if ((j = phi_opnd_for_bb(i, p, b)) < 0) continue; val = i->src[j]->value; + if (i->src[j]->flags) { + val = val->insn->src[0]->value; + while (j < 4 && i->src[j]) + ++j; + assert(j < 4); + } + ni = new_instruction(ctx->pc, NV_OP_MOV); /* TODO: insert instruction at correct position in the first place */ @@ -423,6 +451,8 @@ pass_generate_phi_movs(struct nv_pc_pass *ctx, struct nv_basic_block *b) ni->src[0] = new_ref(ctx->pc, val); nv_reference(ctx->pc, &i->src[j], ni->def[0]); + + i->src[j]->flags = 1; } if (pn != p && pn->exit) { @@ -452,8 +482,8 @@ pass_join_values(struct nv_pc_pass *ctx, int iter) case NV_OP_PHI: if (!iter) continue; - try_join_values(ctx, i->src[0]->value, i->src[1]->value); - try_join_values(ctx, i->def[0], i->src[0]->value); + for (c = 0; c < 4 && i->src[c]; ++c) + try_join_values(ctx, i->def[0], i->src[c]->value); break; case NV_OP_MOV: if (iter && i->src[0]->value->insn && @@ 
-576,22 +606,6 @@ pass_build_live_sets(struct nv_pc_pass *ctx, struct nv_basic_block *b) for (j = 0; j < (ctx->pc->num_values + 31) / 32; ++j) b->live_set[j] |= b->out[n]->live_set[j]; } - - /* Kick values out of our live set that are created in incoming - * blocks of our successors that are not us. - */ - for (i = b->out[n]->phi; i && i->opcode == NV_OP_PHI; i = i->next) { - for (j = 0; j < 4; ++j) { - if (!i->src[j]) - break; - assert(i->src[j]->value->insn); - - if (nvbb_reachable_by(b, i->src[j]->value->insn->bb, b->out[n])) - live_set_add(b, i->src[j]->value); - else - live_set_rem(b, i->src[j]->value); - } - } } if (!b->entry) @@ -599,7 +613,7 @@ pass_build_live_sets(struct nv_pc_pass *ctx, struct nv_basic_block *b) bb_live_set_print(ctx->pc, b); - for (i = b->exit; i; i = i->prev) { + for (i = b->exit; i != b->entry->prev; i = i->prev) { for (j = 0; j < 4; j++) { if (!i->def[j]) break; @@ -617,6 +631,9 @@ pass_build_live_sets(struct nv_pc_pass *ctx, struct nv_basic_block *b) if (i->flags_src) live_set_add(b, i->flags_src->value); } + for (i = b->phi; i && i->opcode == NV_OP_PHI; i = i->next) + live_set_rem(b, i->def[0]); + bb_live_set_print(ctx->pc, b); return 0; @@ -680,10 +697,12 @@ pass_build_intervals(struct nv_pc_pass *ctx, struct nv_basic_block *b) for (j = 0; j < ctx->pc->num_values; ++j) { if (!(b->live_set[j / 32] & (1 << (j % 32)))) continue; + add_range(&ctx->pc->values[j], b, b->exit->serial + 1); #ifdef NV50_RA_DEBUG_LIVEI - debug_printf("adding range for live value %i\n", j); + debug_printf("adding range for live value %i: ", j); + livei_print(&ctx->pc->values[j]); #endif - add_range(&ctx->pc->values[j], b, b->exit->serial + 1); + } } @@ -702,20 +721,22 @@ pass_build_intervals(struct nv_pc_pass *ctx, struct nv_basic_block *b) for (j = 0; j < 5; ++j) { if (i->src[j] && !live_set_test(b, i->src[j])) { live_set_add(b, i->src[j]->value); + add_range(i->src[j]->value, b, i->serial); #ifdef NV50_RA_DEBUG_LIVEI - debug_printf("adding range for source 
that ends living: %i\n", + debug_printf("adding range for source %i (ends living): ", i->src[j]->value->n); + livei_print(i->src[j]->value); #endif - add_range(i->src[j]->value, b, i->serial); } } if (i->flags_src && !live_set_test(b, i->flags_src)) { live_set_add(b, i->flags_src->value); + add_range(i->flags_src->value, b, i->serial); #ifdef NV50_RA_DEBUG_LIVEI - debug_printf("adding range for source that ends living: %i\n", + debug_printf("adding range for source %i (ends living): ", i->flags_src->value->n); + livei_print(i->flags_src->value); #endif - add_range(i->flags_src->value, b, i->serial); } } diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c index 115b5df939..8b18a9c025 100644 --- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c +++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c @@ -120,6 +120,8 @@ struct bld_context { struct nv_basic_block *brkt_bb[BLD_MAX_LOOP_NESTING]; int loop_lvl; + ubyte out_kind; /* CFG_EDGE_FORWARD, or FAKE in case of BREAK/CONT */ + struct bld_value_stack tvs[BLD_MAX_TEMPS][4]; /* TGSI_FILE_TEMPORARY */ struct bld_value_stack avs[BLD_MAX_ADDRS][4]; /* TGSI_FILE_ADDRESS */ struct bld_value_stack pvs[BLD_MAX_PREDS][4]; /* TGSI_FILE_PREDICATE */ @@ -268,7 +270,7 @@ fetch_by_bb(struct bld_value_stack *stack, return; } for (i = 0; i < b->num_in; ++i) - if (b->in_kind[i] != CFG_EDGE_BACK) + if (!IS_WALL_EDGE(b->in_kind[i])) fetch_by_bb(stack, vals, n, b->in[i]); } @@ -362,18 +364,31 @@ bld_phi(struct bld_context *bld, struct nv_basic_block *b, return phi->def[0]; } +/* Insert a phi function in the loop header. + * For nested loops, we need to insert phi functions in all the outer + * loop headers if they don't have one yet. 
+ * + * @def: redefinition from inside loop, or NULL if to be replaced later + */ static struct nv_value * bld_loop_phi(struct bld_context *bld, struct bld_value_stack *stack, struct nv_value *def) { - struct nv_basic_block *bb = bld->pc->current_block; struct nv_instruction *phi; - struct nv_value *val; + struct nv_basic_block *bb = bld->pc->current_block; + struct nv_value *val = NULL; - val = bld_phi(bld, bld->pc->current_block, stack); + if (bld->loop_lvl > 1) { + --bld->loop_lvl; + if (!((stack->loop_def | stack->loop_use) & (1 << bld->loop_lvl))) + val = bld_loop_phi(bld, stack, NULL); + ++bld->loop_lvl; + } + + if (!val) + val = bld_phi(bld, bld->pc->current_block, stack); /* old definition */ if (!val) { bld->pc->current_block = bld->loop_bb[bld->loop_lvl - 1]->in[0]; - val = bld_undef(bld, bld_stack_file(bld, stack)); } @@ -449,10 +464,11 @@ bld_replace_value(struct nv_pc *, struct nv_basic_block *, struct nv_value *, static void bld_loop_end(struct bld_context *bld, struct nv_basic_block *bb) { + struct nv_basic_block *save = bld->pc->current_block; struct nv_instruction *phi, *next; struct nv_value *val; struct bld_value_stack *stk; - int s; + int i, s, n; for (phi = bb->phi; phi && phi->opcode == NV_OP_PHI; phi = next) { next = phi->next; @@ -460,19 +476,33 @@ bld_loop_end(struct bld_context *bld, struct nv_basic_block *bb) stk = (struct bld_value_stack *)phi->target; phi->target = NULL; - val = bld_fetch_global(bld, stk); + for (s = 1, n = 0; n < bb->num_in; ++n) { + if (bb->in_kind[n] != CFG_EDGE_BACK) + continue; - nv_reference(bld->pc, &phi->src[1], val); + assert(s < 4); + bld->pc->current_block = bb->in[n]; + val = bld_fetch_global(bld, stk); + + for (i = 0; i < 4; ++i) + if (phi->src[i] && phi->src[i]->value == val) + break; + if (i == 4) + nv_reference(bld->pc, &phi->src[s++], val); + } + bld->pc->current_block = save; - s = -1; if (phi->src[0]->value == phi->def[0] || phi->src[0]->value == phi->src[1]->value) s = 1; else if (phi->src[1]->value 
== phi->def[0]) s = 0; + else + continue; if (s >= 0) { + /* eliminate the phi */ bld_vals_del_val(stk, phi->def[0]); ++bld->pc->pass_seq; @@ -915,6 +945,8 @@ bld_new_block(struct bld_context *bld, struct nv_basic_block *b) for (i = 0; i < 128; ++i) bld->saved_inputs[i] = NULL; + + bld->out_kind = CFG_EDGE_FORWARD; } static struct nv_value * @@ -1366,7 +1398,7 @@ bld_instruction(struct bld_context *bld, struct nv_basic_block *b = new_basic_block(bld->pc); --bld->cond_lvl; - nvbb_attach_block(bld->pc->current_block, b, CFG_EDGE_FORWARD); + nvbb_attach_block(bld->pc->current_block, b, bld->out_kind); nvbb_attach_block(bld->cond_bb[bld->cond_lvl], b, CFG_EDGE_FORWARD); bld->cond_bb[bld->cond_lvl]->exit->target = b; @@ -1407,8 +1439,10 @@ bld_instruction(struct bld_context *bld, bld_flow(bld, NV_OP_BREAK, NV_CC_TR, NULL, bb, FALSE); - /* XXX: don't do this for redundant BRKs */ - nvbb_attach_block(bld->pc->current_block, bb, CFG_EDGE_LOOP_LEAVE); + if (bld->out_kind == CFG_EDGE_FORWARD) /* else we already had BRK/CONT */ + nvbb_attach_block(bld->pc->current_block, bb, CFG_EDGE_LOOP_LEAVE); + + bld->out_kind = CFG_EDGE_FAKE; } break; case TGSI_OPCODE_CONT: @@ -1418,11 +1452,17 @@ bld_instruction(struct bld_context *bld, bld_flow(bld, NV_OP_BRA, NV_CC_TR, NULL, bb, FALSE); nvbb_attach_block(bld->pc->current_block, bb, CFG_EDGE_BACK); + + if ((bb = bld->join_bb[bld->cond_lvl - 1])) { + bld->join_bb[bld->cond_lvl - 1] = NULL; + nv_nvi_delete(bb->exit->prev); + } + bld->out_kind = CFG_EDGE_FAKE; } break; case TGSI_OPCODE_ENDLOOP: { - struct nv_basic_block *bb = bld->loop_bb[--bld->loop_lvl]; + struct nv_basic_block *bb = bld->loop_bb[bld->loop_lvl - 1]; bld_flow(bld, NV_OP_BRA, NV_CC_TR, NULL, bb, FALSE); @@ -1430,7 +1470,7 @@ bld_instruction(struct bld_context *bld, bld_loop_end(bld, bb); /* replace loop-side operand of the phis */ - bld_new_block(bld, bld->brkt_bb[bld->loop_lvl]); + bld_new_block(bld, bld->brkt_bb[--bld->loop_lvl]); } break; case TGSI_OPCODE_ABS: @@ 
-1651,7 +1691,7 @@ bld_replace_value(struct nv_pc *pc, struct nv_basic_block *b, { struct nv_instruction *nvi; - for (nvi = b->entry; nvi; nvi = nvi->next) { + for (nvi = b->phi ? b->phi : b->entry; nvi; nvi = nvi->next) { int s; for (s = 0; s < 5; ++s) { if (!nvi->src[s]) -- cgit v1.2.3 From 7145ab214f1bd0d84671936dddb87db05f2861f6 Mon Sep 17 00:00:00 2001 From: Ben Skeggs Date: Sat, 28 Aug 2010 18:08:26 +0200 Subject: nv50: DST --- src/gallium/drivers/nv50/nv50_tgsi_to_nc.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'src') diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c index 8b18a9c025..0ea2912846 100644 --- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c +++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c @@ -1337,6 +1337,19 @@ bld_instruction(struct bld_context *bld, FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) dst0[c] = temp; break; + case TGSI_OPCODE_DST: + if (insn->Dst[0].Register.WriteMask & 1) + dst0[0] = bld_imm_f32(bld, 1.0f); + if (insn->Dst[0].Register.WriteMask & 2) { + src0 = emit_fetch(bld, insn, 0, 1); + src1 = emit_fetch(bld, insn, 1, 1); + dst0[1] = bld_insn_2(bld, NV_OP_MUL, src0, src1); + } + if (insn->Dst[0].Register.WriteMask & 4) + dst0[2] = emit_fetch(bld, insn, 0, 2); + if (insn->Dst[0].Register.WriteMask & 8) + dst0[3] = emit_fetch(bld, insn, 1, 3); + break; case TGSI_OPCODE_EX2: src0 = emit_fetch(bld, insn, 0, 0); temp = bld_insn_1(bld, NV_OP_PREEX2, src0); -- cgit v1.2.3 From e02c63bc10fd935537441917a10fef63fb3f9bfa Mon Sep 17 00:00:00 2001 From: Ben Skeggs Date: Sat, 28 Aug 2010 18:10:09 +0200 Subject: nv50: DPH --- src/gallium/drivers/nv50/nv50_tgsi_to_nc.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'src') diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c index 0ea2912846..5ac61f108e 100644 --- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c +++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c @@ -1337,6 +1337,13 @@ 
bld_instruction(struct bld_context *bld, FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) dst0[c] = temp; break; + case TGSI_OPCODE_DPH: + src0 = bld_dot(bld, insn, 3); + src1 = emit_fetch(bld, insn, 1, 3); + temp = bld_insn_2(bld, NV_OP_ADD, src0, src1); + FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) + dst0[c] = temp; + break; case TGSI_OPCODE_DST: if (insn->Dst[0].Register.WriteMask & 1) dst0[0] = bld_imm_f32(bld, 1.0f); -- cgit v1.2.3 From 917c79b384af9da95d2fe3ad86d488478d0d7718 Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Tue, 31 Aug 2010 19:03:35 +0200 Subject: nv50: SSG --- src/gallium/drivers/nv50/nv50_pc_emit.c | 4 ++-- src/gallium/drivers/nv50/nv50_pc_optimize.c | 13 +++++-------- src/gallium/drivers/nv50/nv50_tgsi_to_nc.c | 11 +++++++++++ 3 files changed, 18 insertions(+), 10 deletions(-) (limited to 'src') diff --git a/src/gallium/drivers/nv50/nv50_pc_emit.c b/src/gallium/drivers/nv50/nv50_pc_emit.c index 7808335e50..e1d7bc6459 100644 --- a/src/gallium/drivers/nv50/nv50_pc_emit.c +++ b/src/gallium/drivers/nv50/nv50_pc_emit.c @@ -729,7 +729,7 @@ emit_bitop2(struct nv_pc *pc, struct nv_instruction *i) { pc->emit[0] = 0xd0000000; - if (SFILE(i, 0) == NV_FILE_IMM) { + if (SFILE(i, 1) == NV_FILE_IMM) { emit_form_IMM(pc, i, 0); if (i->opcode == NV_OP_OR) @@ -761,7 +761,7 @@ emit_arl(struct nv_pc *pc, struct nv_instruction *i) pc->emit[0] = 0x00000001; pc->emit[1] = 0xc0000000; - set_dst(pc, i->def[0]); + pc->emit[0] |= (i->def[0]->reg.id + 1) << 2; set_pred(pc, i); set_src_0(pc, i->src[0]); pc->emit[0] |= (get_immd_u32(i->src[1]) & 0x3f) << 16; diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c index 1d2710a8ac..4a3a51512e 100644 --- a/src/gallium/drivers/nv50/nv50_pc_optimize.c +++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c @@ -264,11 +264,8 @@ check_swap_src_0_1(struct nv_instruction *nvi) return; assert(src0 && src1); - if (src1->value->reg.file == NV_FILE_IMM) { - /* should only be present from folding a 
constant MUL part of a MAD */ - assert(nvi->opcode == NV_OP_ADD); + if (src1->value->reg.file == NV_FILE_IMM) return; - } if (is_cmem_load(src0->value->insn)) { if (!is_cmem_load(src1->value->insn)) { @@ -305,7 +302,7 @@ nv_pass_fold_stores(struct nv_pass *ctx, struct nv_basic_block *b) continue; nvi = sti->src[0]->value->insn; - if (!nvi || nvi->opcode == NV_OP_PHI) + if (!nvi || nvi->opcode == NV_OP_PHI || nv_is_vector_op(nvi->opcode)) continue; assert(nvi->def[0] == sti->src[0]->value); @@ -536,9 +533,9 @@ constant_expression(struct nv_pc *pc, struct nv_instruction *nvi, break; case NV_OP_SUB: switch (type) { - case NV_TYPE_F32: u.f32 = u0.f32 - u1.f32; - case NV_TYPE_U32: u.u32 = u0.u32 - u1.u32; - case NV_TYPE_S32: u.s32 = u0.s32 - u1.s32; + case NV_TYPE_F32: u.f32 = u0.f32 - u1.f32; break; + case NV_TYPE_U32: u.u32 = u0.u32 - u1.u32; break; + case NV_TYPE_S32: u.s32 = u0.s32 - u1.s32; break; default: assert(0); break; diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c index 5ac61f108e..0a4c88c817 100644 --- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c +++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c @@ -1595,6 +1595,17 @@ bld_instruction(struct bld_context *bld, if (insn->Dst[0].Register.WriteMask & 0x8) dst0[3] = bld_imm_f32(bld, 1.0f); break; + case TGSI_OPCODE_SSG: + FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) { + src0 = emit_fetch(bld, insn, 0, c); + src1 = bld_predicate(bld, src0, FALSE); + temp = bld_insn_2(bld, NV_OP_AND, src0, bld_imm_u32(bld, 0x80000000)); + temp = bld_insn_2(bld, NV_OP_OR, temp, bld_imm_f32(bld, 1.0f)); + dst0[c] = bld_insn_2(bld, NV_OP_XOR, temp, temp); + dst0[c]->insn->cc = NV_CC_EQ; + nv_reference(bld->pc, &dst0[c]->insn->flags_src, src1); + } + break; case TGSI_OPCODE_SUB: FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) { src0 = emit_fetch(bld, insn, 0, c); -- cgit v1.2.3 From 07fe7c2f02dbf4e0c385aaf3f21ee858f0ae974c Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Tue, 31 Aug 2010 
19:09:15 +0200 Subject: nv50: make FrontFacing -1 or +1 --- src/gallium/drivers/nv50/nv50_tgsi_to_nc.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'src') diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c index 0a4c88c817..c98d5e126a 100644 --- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c +++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c @@ -964,6 +964,14 @@ bld_saved_input(struct bld_context *bld, unsigned i, unsigned c) static struct nv_value * bld_interpolate(struct bld_context *bld, unsigned mode, struct nv_value *val) { + if (val->reg.id == 255) { + /* gl_FrontFacing: 0/~0 to -1.0/+1.0 */ + val = bld_insn_1(bld, NV_OP_LINTERP, val); + val = bld_insn_2(bld, NV_OP_SHL, val, bld_imm_u32(bld, 31)); + val->insn->src[0]->typecast = NV_TYPE_U32; + val = bld_insn_2(bld, NV_OP_XOR, val, bld_imm_f32(bld, -1.0f)); + val->insn->src[0]->typecast = NV_TYPE_U32; + } else if (mode & (NV50_INTERP_LINEAR | NV50_INTERP_FLAT)) val = bld_insn_1(bld, NV_OP_LINTERP, val); else @@ -1029,9 +1037,8 @@ emit_fetch(struct bld_context *bld, const struct tgsi_full_instruction *insn, } else { assert(src->Dimension.Dimension == 0); res = bld_insn_1(bld, NV_OP_LDA, res); + assert(res->reg.type == type); } - assert(res->reg.type == type); - bld->saved_inputs[bld->ti->input_map[idx][swz]] = res; break; case TGSI_FILE_TEMPORARY: -- cgit v1.2.3 From 6f9978050eb8648888a728fc09b99e279c2b7b15 Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Tue, 31 Aug 2010 19:17:46 +0200 Subject: nv50: re-add proper TEXBIAS sequence --- src/gallium/drivers/nv50/nv50_pc.c | 29 ++++ src/gallium/drivers/nv50/nv50_pc.h | 9 +- src/gallium/drivers/nv50/nv50_pc_emit.c | 28 +++- src/gallium/drivers/nv50/nv50_pc_optimize.c | 34 ++--- src/gallium/drivers/nv50/nv50_pc_print.c | 5 +- src/gallium/drivers/nv50/nv50_pc_regalloc.c | 8 +- src/gallium/drivers/nv50/nv50_tgsi_to_nc.c | 212 ++++++++++++++++++++++++---- 7 files changed, 258 
insertions(+), 67 deletions(-) (limited to 'src') diff --git a/src/gallium/drivers/nv50/nv50_pc.c b/src/gallium/drivers/nv50/nv50_pc.c index b03f5b27f6..28e32eadb7 100644 --- a/src/gallium/drivers/nv50/nv50_pc.c +++ b/src/gallium/drivers/nv50/nv50_pc.c @@ -204,6 +204,35 @@ nvcg_replace_value(struct nv_pc *pc, struct nv_value *old_val, return n; } +struct nv_value * +nvcg_find_constant(struct nv_ref *ref) +{ + struct nv_value *src; + + if (!ref) + return NULL; + + src = ref->value; + while (src->insn && src->insn->opcode == NV_OP_MOV) { + assert(!src->insn->src[0]->mod); + src = src->insn->src[0]->value; + } + if ((src->reg.file == NV_FILE_IMM) || + (src->insn && src->insn->opcode == NV_OP_LDA && + src->insn->src[0]->value->reg.file >= NV_FILE_MEM_C(0) && + src->insn->src[0]->value->reg.file <= NV_FILE_MEM_C(15))) + return src; + return NULL; +} + +struct nv_value * +nvcg_find_immediate(struct nv_ref *ref) +{ + struct nv_value *src = nvcg_find_constant(ref); + + return (src && src->reg.file == NV_FILE_IMM) ? 
src : NULL; +} + static void nv_pc_free_refs(struct nv_pc *pc) { diff --git a/src/gallium/drivers/nv50/nv50_pc.h b/src/gallium/drivers/nv50/nv50_pc.h index 2bb3ea4374..adc46dec8d 100644 --- a/src/gallium/drivers/nv50/nv50_pc.h +++ b/src/gallium/drivers/nv50/nv50_pc.h @@ -126,6 +126,7 @@ #define NV_TYPE_ISINT(t) ((t) <= 5) #define NV_TYPE_ISFLT(t) ((t) & 0x08) +/* $cX registers contain 4 bits: OCSZ (Z is bit 0) */ #define NV_CC_FL 0x0 #define NV_CC_LT 0x1 #define NV_CC_EQ 0x2 @@ -135,6 +136,10 @@ #define NV_CC_GE 0x6 #define NV_CC_U 0x8 #define NV_CC_TR 0xf +#define NV_CC_O 0x10 +#define NV_CC_C 0x11 +#define NV_CC_A 0x12 +#define NV_CC_S 0x13 #define NV_PC_MAX_INSTRUCTIONS 2048 #define NV_PC_MAX_VALUES (NV_PC_MAX_INSTRUCTIONS * 4) @@ -241,7 +246,7 @@ struct nv_instruction { ubyte saturate : 1; ubyte centroid : 1; ubyte flat : 1; - ubyte padding : 4; + ubyte lanes : 4; ubyte tex_live : 1; /* */ ubyte tex_t; /* TIC binding */ @@ -459,6 +464,8 @@ boolean nvbb_reachable_by(struct nv_basic_block *, struct nv_basic_block *, struct nv_basic_block *nvbb_dom_frontier(struct nv_basic_block *); int nvcg_replace_value(struct nv_pc *pc, struct nv_value *old_val, struct nv_value *new_val); +struct nv_value *nvcg_find_immediate(struct nv_ref *); +struct nv_value *nvcg_find_constant(struct nv_ref *); typedef void (*nv_pc_pass_func)(void *priv, struct nv_basic_block *b); diff --git a/src/gallium/drivers/nv50/nv50_pc_emit.c b/src/gallium/drivers/nv50/nv50_pc_emit.c index e1d7bc6459..bb0a6f32d1 100644 --- a/src/gallium/drivers/nv50/nv50_pc_emit.c +++ b/src/gallium/drivers/nv50/nv50_pc_emit.c @@ -540,8 +540,9 @@ emit_mov(struct nv_pc *pc, struct nv_instruction *i) set_a16_bits(pc, SREG(i->src[0])->id); } else if (DFILE(i, 0) == NV_FILE_FLAGS) { - pc->emit[0] = 0x000001fd; - pc->emit[1] = 0xa0000788 | (1 << 6); + pc->emit[0] = 0x00000001; + pc->emit[1] = 0xa0000000 | (1 << 6); + set_pred(pc, i); pc->emit[0] |= SREG(i->src[0])->id << 9; pc->emit[1] |= DREG(i->def[0])->id << 4; } else @@ 
-984,7 +985,7 @@ emit_tex(struct nv_pc *pc, struct nv_instruction *i) pc->emit[0] |= i->tex_t << 9; pc->emit[0] |= i->tex_s << 17; - pc->emit[0] |= i->tex_argc << 22; + pc->emit[0] |= (i->tex_argc - 1) << 22; pc->emit[0] |= (i->tex_mask & 0x3) << 25; pc->emit[1] |= (i->tex_mask & 0xc) << 12; @@ -1000,8 +1001,6 @@ emit_tex(struct nv_pc *pc, struct nv_instruction *i) else if (i->opcode == NV_OP_TXL) pc->emit[1] |= 0x40000000; - else - pc->emit[0] -= 1 << 22; } static void @@ -1053,6 +1052,20 @@ emit_ddy(struct nv_pc *pc, struct nv_instruction *i) set_pred_wr(pc, i); } +static void +emit_quadop(struct nv_pc *pc, struct nv_instruction *i) +{ + pc->emit[0] = 0xc0000000; + pc->emit[1] = 0x80000000; + + emit_form_ADD(pc, i); + + pc->emit[0] |= i->lanes << 16; + + pc->emit[0] |= (i->quadop & 0x03) << 20; + pc->emit[1] |= (i->quadop & 0xfc) << 20; +} + void nv50_emit_instruction(struct nv_pc *pc, struct nv_instruction *i) { @@ -1132,6 +1145,9 @@ nv50_emit_instruction(struct nv_pc *pc, struct nv_instruction *i) case NV_OP_TXL: emit_tex(pc, i); break; + case NV_OP_QUADOP: + emit_quadop(pc, i); + break; case NV_OP_KIL: emit_flow(pc, i, 0x0); break; @@ -1162,7 +1178,7 @@ nv50_emit_instruction(struct nv_pc *pc, struct nv_instruction *i) case NV_OP_UNDEF: case NV_OP_SUB: NOUVEAU_ERR("operation \"%s\" should have been eliminated\n", - nv_opcode_name(i->opcode)); + nv_opcode_name(i->opcode)); break; default: NOUVEAU_ERR("unhandled NV_OP: %d\n", i->opcode); diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c index 4a3a51512e..fb95da30f2 100644 --- a/src/gallium/drivers/nv50/nv50_pc_optimize.c +++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c @@ -94,14 +94,17 @@ nvi_isnop(struct nv_instruction *nvi) if (nvi->opcode == NV_OP_EXPORT || nvi->opcode == NV_OP_UNDEF) return TRUE; - if (nvi->fixed || - nvi->is_terminator || - nvi->flags_src || + /* NOTE: 'fixed' now only means that it shouldn't be optimized away, + * but we can still remove 
it if it is a no-op move. + */ + if (/* nvi->fixed || */ + /* nvi->flags_src || */ /* cond. MOV to same register is still NOP */ nvi->flags_def || + nvi->is_terminator || nvi->is_join) return FALSE; - if (nvi->def[0]->join->reg.id < 0) + if (nvi->def[0] && nvi->def[0]->join->reg.id < 0) return TRUE; if (nvi->opcode != NV_OP_MOV && nvi->opcode != NV_OP_SELECT) @@ -436,22 +439,6 @@ nv_pass_lower_mods(struct nv_pass *ctx, struct nv_basic_block *b) #define SRC_IS_MUL(s) ((s)->insn && (s)->insn->opcode == NV_OP_MUL) -static struct nv_value * -find_immediate(struct nv_ref *ref) -{ - struct nv_value *src; - - if (!ref) - return NULL; - - src = ref->value; - while (src->insn && src->insn->opcode == NV_OP_MOV) { - assert(!src->insn->src[0]->mod); - src = src->insn->src[0]->value; - } - return (src->reg.file == NV_FILE_IMM) ? src : NULL; -} - static void modifiers_apply(uint32_t *val, ubyte type, ubyte mod) { @@ -663,8 +650,8 @@ nv_pass_lower_arith(struct nv_pass *ctx, struct nv_basic_block *b) next = nvi->next; - src0 = find_immediate(nvi->src[0]); - src1 = find_immediate(nvi->src[1]); + src0 = nvcg_find_immediate(nvi->src[0]); + src1 = nvcg_find_immediate(nvi->src[1]); if (src0 && src1) constant_expression(ctx->pc, nvi, src0, src1); @@ -778,6 +765,7 @@ nv_pass_reload_elim(struct nv_pass_reld_elim *ctx, struct nv_basic_block *b) if (ld->def[0]->reg.id >= 0) it->value = ld->def[0]; else + if (!ld->fixed) nvcg_replace_value(ctx->pc, ld->def[0], it->value); } else { if (ctx->alloc == LOAD_RECORD_POOL_SIZE) @@ -979,7 +967,7 @@ nv_pass_cse(struct nv_pass *ctx, struct nv_basic_block *b) for (ir = entry; ir; ir = next) { next = ir->next; for (ik = entry; ik != ir; ik = ik->next) { - if (ir->opcode != ik->opcode) + if (ir->opcode != ik->opcode || ir->fixed) continue; if (!ir->def[0] || !ik->def[0] || diff --git a/src/gallium/drivers/nv50/nv50_pc_print.c b/src/gallium/drivers/nv50/nv50_pc_print.c index 7bdeb1c78d..01a6f00997 100644 --- a/src/gallium/drivers/nv50/nv50_pc_print.c +++ 
b/src/gallium/drivers/nv50/nv50_pc_print.c @@ -102,7 +102,8 @@ static const char *nv_opcode_names[NV_OP_COUNT + 1] = { static const char *nv_cond_names[] = { "never", "lt" , "eq" , "le" , "gt" , "ne" , "ge" , "", - "never", "ltu", "equ", "leu", "gtu", "neu", "geu", "" + "never", "ltu", "equ", "leu", "gtu", "neu", "geu", "", + "o", "c", "a", "s" }; static const char *nv_modifier_strings[] = @@ -144,7 +145,7 @@ nv_type_name(ubyte type) static INLINE const char * nv_cond_name(ubyte cc) { - return nv_cond_names[MIN2(cc, 15)]; + return nv_cond_names[MIN2(cc, 19)]; } static INLINE const char * diff --git a/src/gallium/drivers/nv50/nv50_pc_regalloc.c b/src/gallium/drivers/nv50/nv50_pc_regalloc.c index 81decf8d4a..e689d349f1 100644 --- a/src/gallium/drivers/nv50/nv50_pc_regalloc.c +++ b/src/gallium/drivers/nv50/nv50_pc_regalloc.c @@ -493,10 +493,10 @@ pass_join_values(struct nv_pc_pass *ctx, int iter) case NV_OP_SELECT: if (!iter) break; - assert(join_allowed(ctx, i->def[0], i->src[0]->value)); - assert(join_allowed(ctx, i->def[0], i->src[1]->value)); - do_join_values(ctx, i->def[0], i->src[0]->value); - do_join_values(ctx, i->def[0], i->src[1]->value); + for (c = 0; c < 4 && i->src[c]; ++c) { + assert(join_allowed(ctx, i->def[0], i->src[c]->value)); + do_join_values(ctx, i->def[0], i->src[c]->value); + } break; case NV_OP_TEX: case NV_OP_TXB: diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c index c98d5e126a..27d851e9fd 100644 --- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c +++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c @@ -1156,8 +1156,8 @@ get_tex_dim(const struct tgsi_full_instruction *insn, int *dim, int *arg) static void load_proj_tex_coords(struct bld_context *bld, - struct nv_value *t[4], int dim, - const struct tgsi_full_instruction *insn) + struct nv_value *t[4], int dim, + const struct tgsi_full_instruction *insn) { int c, mask = 0; @@ -1188,59 +1188,209 @@ load_proj_tex_coords(struct bld_context *bld, } } 
+/* For a quad of threads / top left, top right, bottom left, bottom right + * pixels, do a different operation, and take src0 from a specific thread. + */ +#define QOP_ADD 0 +#define QOP_SUBR 1 +#define QOP_SUB 2 +#define QOP_MOV1 3 + +#define QOP(a, b, c, d) \ + ((QOP_##a << 0) | (QOP_##b << 2) | (QOP_##c << 4) | (QOP_##d << 6)) + +static INLINE struct nv_value * +bld_quadop(struct bld_context *bld, ubyte qop, struct nv_value *src0, int lane, + struct nv_value *src1, boolean wp) +{ + struct nv_value *val = bld_insn_2(bld, NV_OP_QUADOP, src0, src1); + val->insn->lanes = lane; + val->insn->quadop = qop; + if (wp) { + val->insn->flags_def = new_value(bld->pc, NV_FILE_FLAGS, NV_TYPE_U16); + val->insn->flags_def->insn = val->insn; + } + return val; +} + +static INLINE struct nv_value * +bld_cmov(struct bld_context *bld, + struct nv_value *src, ubyte cc, struct nv_value *cr) +{ + src = bld_insn_1(bld, NV_OP_MOV, src); + + src->insn->cc = cc; + src->insn->flags_src = new_ref(bld->pc, cr); + + return src; +} + +static struct nv_instruction * +emit_tex(struct bld_context *bld, uint opcode, + struct nv_value *dst[4], struct nv_value *t_in[4], + int argc, int tic, int tsc, int cube) +{ + struct nv_value *t[4]; + struct nv_instruction *nvi; + int c; + + /* the inputs to a tex instruction must be separate values */ + for (c = 0; c < argc; ++c) { + t[c] = bld_insn_1(bld, NV_OP_MOV, t_in[c]); + t[c]->reg.type = NV_TYPE_F32; + t[c]->insn->fixed = 1; + } + + nvi = new_instruction(bld->pc, opcode); + + for (c = 0; c < 4; ++c) + dst[c] = bld_def(nvi, c, new_value(bld->pc, NV_FILE_GPR, NV_TYPE_F32)); + + for (c = 0; c < argc; ++c) + nvi->src[c] = new_ref(bld->pc, t[c]); + + nvi->tex_t = tic; + nvi->tex_s = tsc; + nvi->tex_mask = 0xf; + nvi->tex_cube = cube; + nvi->tex_live = 0; + nvi->tex_argc = argc; + + return nvi; +} + +static void +bld_texlod_sequence(struct bld_context *bld, + struct nv_value *dst[4], struct nv_value *t[4], int arg, + int tic, int tsc, int cube) +{ + 
emit_tex(bld, NV_OP_TXL, dst, t, arg, tic, tsc, cube); /* TODO */ +} + + +/* The lanes of a quad are grouped by the bit in the condition register + * they have set, which is selected by differing bias values. + * Move the input values for TEX into a new register set for each group + * and execute TEX only for a specific group. + * We always need to use 4 new registers for the inputs/outputs because + * the implicitly calculated derivatives must be correct. + */ +static void +bld_texbias_sequence(struct bld_context *bld, + struct nv_value *dst[4], struct nv_value *t[4], int arg, + int tic, int tsc, int cube) +{ + struct nv_instruction *sel, *tex; + struct nv_value *bit[4], *cr[4], *res[4][4], *val; + int l, c; + + const ubyte cc[4] = { NV_CC_EQ, NV_CC_S, NV_CC_C, NV_CC_O }; + + for (l = 0; l < 4; ++l) { + bit[l] = bld_load_imm_u32(bld, 1 << l); + + val = bld_quadop(bld, QOP(SUBR, SUBR, SUBR, SUBR), + t[arg - 1], l, t[arg - 1], TRUE); + + cr[l] = bld_cmov(bld, bit[l], NV_CC_EQ, val->insn->flags_def); + + cr[l]->reg.file = NV_FILE_FLAGS; + cr[l]->reg.type = NV_TYPE_U16; + } + + sel = new_instruction(bld->pc, NV_OP_SELECT); + + for (l = 0; l < 4; ++l) + sel->src[l] = new_ref(bld->pc, cr[l]); + + bld_def(sel, 0, new_value(bld->pc, NV_FILE_FLAGS, NV_TYPE_U16)); + + for (l = 0; l < 4; ++l) { + tex = emit_tex(bld, NV_OP_TXB, dst, t, arg, tic, tsc, cube); + + tex->cc = cc[l]; + tex->flags_src = new_ref(bld->pc, sel->def[0]); + + for (c = 0; c < 4; ++c) + res[l][c] = tex->def[c]; + } + + for (l = 0; l < 4; ++l) + for (c = 0; c < 4; ++c) + res[l][c] = bld_cmov(bld, res[l][c], cc[l], sel->def[0]); + + for (c = 0; c < 4; ++c) { + sel = new_instruction(bld->pc, NV_OP_SELECT); + + for (l = 0; l < 4; ++l) + sel->src[l] = new_ref(bld->pc, res[l][c]); + + bld_def(sel, 0, (dst[c] = new_value(bld->pc, NV_FILE_GPR, NV_TYPE_F32))); + } +} + +static boolean +bld_is_constant(struct nv_value *val) +{ + if (val->reg.file == NV_FILE_IMM) + return TRUE; + return val->insn && 
nvcg_find_constant(val->insn->src[0]); +} + static void bld_tex(struct bld_context *bld, struct nv_value *dst0[4], const struct tgsi_full_instruction *insn) { - struct nv_value *t[4]; - struct nv_instruction *nvi; + struct nv_value *t[4], *s[3]; uint opcode = translate_opcode(insn->Instruction.Opcode); int arg, dim, c; + const int tic = insn->Src[1].Register.Index; + const int tsc = 0; + const int cube = (insn->Texture.Texture == TGSI_TEXTURE_CUBE) ? 1 : 0; get_tex_dim(insn, &dim, &arg); - if (insn->Texture.Texture == TGSI_TEXTURE_CUBE) { - } - // else - if (insn->Instruction.Opcode == TGSI_OPCODE_TXP) { + if (!cube && insn->Instruction.Opcode == TGSI_OPCODE_TXP) load_proj_tex_coords(bld, t, dim, insn); - } else + else for (c = 0; c < dim; ++c) t[c] = emit_fetch(bld, insn, 0, c); - if (arg != dim) - t[dim] = emit_fetch(bld, insn, 0, 2); + if (cube) { + assert(dim >= 3); + for (c = 0; c < 3; ++c) + s[c] = bld_insn_1(bld, NV_OP_ABS, t[c]); - if (insn->Instruction.Opcode == TGSI_OPCODE_TXB || - insn->Instruction.Opcode == TGSI_OPCODE_TXL) { - t[arg++] = emit_fetch(bld, insn, 0, 3); - } + s[0] = bld_insn_2(bld, NV_OP_MAX, s[0], s[1]); + s[0] = bld_insn_2(bld, NV_OP_MAX, s[0], s[2]); + s[0] = bld_insn_1(bld, NV_OP_RCP, s[0]); - for (c = 0; c < arg; ++c) { - t[c] = bld_insn_1(bld, NV_OP_MOV, t[c]); - t[c]->reg.type = NV_TYPE_F32; + for (c = 0; c < 3; ++c) + t[c] = bld_insn_2(bld, NV_OP_MUL, t[c], s[0]); } - nvi = new_instruction(bld->pc, opcode); + if (arg != dim) + t[dim] = emit_fetch(bld, insn, 0, 2); - for (c = 0; c < 4; ++c) { - nvi->def[c] = dst0[c] = new_value(bld->pc, NV_FILE_GPR, NV_TYPE_F32); - nvi->def[c]->insn = nvi; + if (opcode == NV_OP_TXB || opcode == NV_OP_TXL) { + t[arg++] = emit_fetch(bld, insn, 0, 3); + + if ((bld->ti->p->type == PIPE_SHADER_FRAGMENT) && + !bld_is_constant(t[arg - 1])) { + if (opcode == NV_OP_TXB) + bld_texbias_sequence(bld, dst0, t, arg, tic, tsc, cube); + else + bld_texlod_sequence(bld, dst0, t, arg, tic, tsc, cube); + return; + } } 
- for (c = 0; c < arg; ++c) - nvi->src[c] = new_ref(bld->pc, t[c]); - nvi->tex_t = insn->Src[1].Register.Index; - nvi->tex_s = 0; - nvi->tex_mask = 0xf; - nvi->tex_cube = (insn->Texture.Texture == TGSI_TEXTURE_CUBE) ? 1 : 0; - nvi->tex_live = 0; - nvi->tex_argc = arg; + emit_tex(bld, opcode, dst0, t, arg, tic, tsc, cube); } static INLINE struct nv_value * bld_dot(struct bld_context *bld, const struct tgsi_full_instruction *insn, - int n) + int n) { struct nv_value *dotp, *src0, *src1; int c; -- cgit v1.2.3 From e08f70a41d1012a0270468866614485a3415168e Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Tue, 31 Aug 2010 20:36:45 +0200 Subject: nv50: make use of TGSI immediate type --- src/gallium/drivers/nv50/nv50_program.c | 14 ++++++++++---- src/gallium/drivers/nv50/nv50_program.h | 1 + src/gallium/drivers/nv50/nv50_tgsi_to_nc.c | 10 +++++++++- 3 files changed, 20 insertions(+), 5 deletions(-) (limited to 'src') diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c index 182a591eb3..523603ca3a 100644 --- a/src/gallium/drivers/nv50/nv50_program.c +++ b/src/gallium/drivers/nv50/nv50_program.c @@ -187,13 +187,14 @@ prog_immediate(struct nv50_translation_info *ti, const struct tgsi_full_immediate *imm) { int c; - unsigned n = ++ti->immd32_nr; + unsigned n = ti->immd32_nr++; - if (n == (1 << (ffs(n) - 1))) - ti->immd32 = REALLOC(ti->immd32, (n / 2) * 16, (n * 2) * 16); + assert(ti->immd32_nr <= ti->scan.immediate_count); for (c = 0; c < 4; ++c) - ti->immd32[(n - 1) * 4 + c] = imm->u[c].Uint; + ti->immd32[n * 4 + c] = imm->u[c].Uint; + + ti->immd32_ty[n] = imm->Immediate.DataType; } static INLINE unsigned @@ -495,6 +496,9 @@ nv50_prog_scan(struct nv50_translation_info *ti) tgsi_dump(p->pipe.tokens, 0); #endif + ti->immd32 = (uint32_t *)MALLOC(ti->scan.immediate_count * 16); + ti->immd32_ty = (ubyte *)MALLOC(ti->scan.immediate_count * sizeof(ubyte)); + tgsi_parse_init(&parse, p->pipe.tokens); while 
(!tgsi_parse_end_of_tokens(&parse)) { tgsi_parse_token(&parse); @@ -561,6 +565,8 @@ nv50_program_tx(struct nv50_program *p) out: if (ti->immd32) FREE(ti->immd32); + if (ti->immd32_ty) + FREE(ti->immd32_ty); FREE(ti); return ret ? FALSE : TRUE; } diff --git a/src/gallium/drivers/nv50/nv50_program.h b/src/gallium/drivers/nv50/nv50_program.h index 1184d9be3b..639f06217e 100644 --- a/src/gallium/drivers/nv50/nv50_program.h +++ b/src/gallium/drivers/nv50/nv50_program.h @@ -116,6 +116,7 @@ struct nv50_translation_info { struct tgsi_shader_info scan; uint32_t *immd32; unsigned immd32_nr; + ubyte *immd32_ty; ubyte edgeflag_out; struct nv50_subroutine subr[NV50_PROG_MAX_SUBROUTINES]; int subr_nr; diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c index 27d851e9fd..141d2cd325 100644 --- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c +++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c @@ -1022,7 +1022,15 @@ emit_fetch(struct bld_context *bld, const struct tgsi_full_instruction *insn, case TGSI_FILE_IMMEDIATE: assert(idx < bld->ti->immd32_nr); res = bld_load_imm_u32(bld, bld->ti->immd32[idx * 4 + swz]); - res->reg.type = type; + + switch (bld->ti->immd32_ty[idx]) { + case TGSI_IMM_FLOAT32: res->reg.type = NV_TYPE_F32; break; + case TGSI_IMM_UINT32: res->reg.type = NV_TYPE_U32; break; + case TGSI_IMM_INT32: res->reg.type = NV_TYPE_S32; break; + default: + res->reg.type = type; + break; + } break; case TGSI_FILE_INPUT: res = bld_saved_input(bld, idx, swz); -- cgit v1.2.3 From 8e6ba3c8cc41701b4391d0772bf2318604972ae9 Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Wed, 1 Sep 2010 12:41:59 +0200 Subject: nv50: must join SELECT inputs before MOV inputs --- src/gallium/drivers/nv50/nv50_pc_regalloc.c | 32 +++++++++++++++++------------ 1 file changed, 19 insertions(+), 13 deletions(-) (limited to 'src') diff --git a/src/gallium/drivers/nv50/nv50_pc_regalloc.c b/src/gallium/drivers/nv50/nv50_pc_regalloc.c index 
e689d349f1..d401706b5b 100644 --- a/src/gallium/drivers/nv50/nv50_pc_regalloc.c +++ b/src/gallium/drivers/nv50/nv50_pc_regalloc.c @@ -480,18 +480,18 @@ pass_join_values(struct nv_pc_pass *ctx, int iter) switch (i->opcode) { case NV_OP_PHI: - if (!iter) - continue; + if (iter != 2) + break; for (c = 0; c < 4 && i->src[c]; ++c) try_join_values(ctx, i->def[0], i->src[c]->value); break; case NV_OP_MOV: - if (iter && i->src[0]->value->insn && + if ((iter == 2) && i->src[0]->value->insn && !nv_is_vector_op(i->src[0]->value->join->insn->opcode)) try_join_values(ctx, i->def[0], i->src[0]->value); break; case NV_OP_SELECT: - if (!iter) + if (iter != 1) break; for (c = 0; c < 4 && i->src[c]; ++c) { assert(join_allowed(ctx, i->def[0], i->src[c]->value)); @@ -919,15 +919,21 @@ nv_pc_exec_pass1(struct nv_pc *pc) livei_print(&pc->values[i]); #endif - for (i = 0; i < 2; ++i) { - ret = pass_join_values(ctx, i); - if (ret) - goto out; - ret = pass_linear_scan(ctx, i); - if (ret) - goto out; - } - assert(!ret && "joining"); + ret = pass_join_values(ctx, 0); + if (ret) + goto out; + ret = pass_linear_scan(ctx, 0); + if (ret) + goto out; + ret = pass_join_values(ctx, 1); + if (ret) + goto out; + ret = pass_join_values(ctx, 2); + if (ret) + goto out; + ret = pass_linear_scan(ctx, 1); + if (ret) + goto out; for (i = 0; i < pc->num_values; ++i) livei_release(&pc->values[i]); -- cgit v1.2.3 From a79da61a4b5dd94fdacc0e7196510e8d27c8a157 Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Wed, 1 Sep 2010 12:42:15 +0200 Subject: nv50: fix XPD, was negated --- src/gallium/drivers/nv50/nv50_tgsi_to_nc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'src') diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c index 141d2cd325..6bd2de4c74 100644 --- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c +++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c @@ -1791,12 +1791,12 @@ bld_instruction(struct bld_context *bld, dst0[3] = 
bld_imm_f32(bld, 1.0f); break; } - src0 = emit_fetch(bld, insn, 0, (c + 1) % 3); - src1 = emit_fetch(bld, insn, 1, (c + 2) % 3); + src0 = emit_fetch(bld, insn, 1, (c + 1) % 3); + src1 = emit_fetch(bld, insn, 0, (c + 2) % 3); dst0[c] = bld_insn_2(bld, NV_OP_MUL, src0, src1); - src0 = emit_fetch(bld, insn, 0, (c + 2) % 3); - src1 = emit_fetch(bld, insn, 1, (c + 1) % 3); + src0 = emit_fetch(bld, insn, 0, (c + 1) % 3); + src1 = emit_fetch(bld, insn, 1, (c + 2) % 3); dst0[c] = bld_insn_3(bld, NV_OP_MAD, src0, src1, dst0[c]); dst0[c]->insn->src[2]->mod ^= NV_MOD_NEG; -- cgit v1.2.3 From 9f9ae4eee1939dd15853b8cd1a4fad2c7197aa9a Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Thu, 2 Sep 2010 18:28:39 +0200 Subject: nv50: fix find_dom_frontier --- src/gallium/drivers/nv50/nv50_pc.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'src') diff --git a/src/gallium/drivers/nv50/nv50_pc.c b/src/gallium/drivers/nv50/nv50_pc.c index 28e32eadb7..c2f2ab3ef3 100644 --- a/src/gallium/drivers/nv50/nv50_pc.c +++ b/src/gallium/drivers/nv50/nv50_pc.c @@ -631,6 +631,7 @@ nvbb_reachable_by(struct nv_basic_block *bf, struct nv_basic_block *bp, static struct nv_basic_block * nvbb_find_dom_frontier(struct nv_basic_block *b, struct nv_basic_block *df) { + struct nv_basic_block *out; int i; if (!nvbb_dominated_by(df, b)) { @@ -641,11 +642,11 @@ nvbb_find_dom_frontier(struct nv_basic_block *b, struct nv_basic_block *df) return df; } } - for (i = 0; i < 2 && b->out[i]; ++i) { - if (b->out_kind[i] == CFG_EDGE_BACK) + for (i = 0; i < 2 && df->out[i]; ++i) { + if (df->out_kind[i] == CFG_EDGE_BACK) continue; - if ((df = nvbb_find_dom_frontier(b, b->out[i]))) - return df; + if ((out = nvbb_find_dom_frontier(b, df->out[i]))) + return out; } return NULL; } -- cgit v1.2.3 From 443abc80db9e1a288ce770e76cccd43664348098 Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Thu, 2 Sep 2010 18:27:01 +0200 Subject: nv50: fix build-predicate function --- 
src/gallium/drivers/nv50/nv50_pc.c | 15 ++++++++++++- src/gallium/drivers/nv50/nv50_pc.h | 20 ++++++++++++++++- src/gallium/drivers/nv50/nv50_pc_optimize.c | 9 ++++++++ src/gallium/drivers/nv50/nv50_tgsi_to_nc.c | 34 +++++++++++++++++++---------- 4 files changed, 65 insertions(+), 13 deletions(-) (limited to 'src') diff --git a/src/gallium/drivers/nv50/nv50_pc.c b/src/gallium/drivers/nv50/nv50_pc.c index c2f2ab3ef3..e34c0553eb 100644 --- a/src/gallium/drivers/nv50/nv50_pc.c +++ b/src/gallium/drivers/nv50/nv50_pc.c @@ -121,7 +121,7 @@ nv50_nvi_can_load(struct nv_instruction *nvi, int s, struct nv_value *value) return FALSE; case NV_OP_MOV: assert(s == 0); - return TRUE; + return /* TRUE */ FALSE; /* don't turn MOVs into loads */ default: return FALSE; } @@ -507,6 +507,19 @@ nvbb_insert_tail(struct nv_basic_block *b, struct nv_instruction *i) b->num_instructions++; } +void +nvi_insert_after(struct nv_instruction *at, struct nv_instruction *ni) +{ + if (!at->next) { + nvbb_insert_tail(at->bb, ni); + return; + } + ni->next = at->next; + ni->prev = at; + ni->next->prev = ni; + ni->prev->next = ni; +} + void nv_nvi_delete(struct nv_instruction *nvi) { diff --git a/src/gallium/drivers/nv50/nv50_pc.h b/src/gallium/drivers/nv50/nv50_pc.h index adc46dec8d..703d32d334 100644 --- a/src/gallium/drivers/nv50/nv50_pc.h +++ b/src/gallium/drivers/nv50/nv50_pc.h @@ -347,9 +347,10 @@ struct nv_pc { }; void nvbb_insert_tail(struct nv_basic_block *, struct nv_instruction *); +void nvi_insert_after(struct nv_instruction *, struct nv_instruction *); static INLINE struct nv_instruction * -new_instruction(struct nv_pc *pc, uint opcode) +nv_alloc_instruction(struct nv_pc *pc, uint opcode) { struct nv_instruction *insn; @@ -359,10 +360,27 @@ new_instruction(struct nv_pc *pc, uint opcode) insn->cc = NV_CC_TR; insn->opcode = opcode; + return insn; +} + +static INLINE struct nv_instruction * +new_instruction(struct nv_pc *pc, uint opcode) +{ + struct nv_instruction *insn = 
nv_alloc_instruction(pc, opcode); + nvbb_insert_tail(pc->current_block, insn); return insn; } +static INLINE struct nv_instruction * +new_instruction_at(struct nv_pc *pc, struct nv_instruction *at, uint opcode) +{ + struct nv_instruction *insn = nv_alloc_instruction(pc, opcode); + + nvi_insert_after(at, insn); + return insn; +} + static INLINE struct nv_value * new_value(struct nv_pc *pc, ubyte file, ubyte type) { diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c index fb95da30f2..1ed5032175 100644 --- a/src/gallium/drivers/nv50/nv50_pc_optimize.c +++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c @@ -636,6 +636,15 @@ constant_operand(struct nv_pc *pc, default: break; } + + if (nvi->opcode == NV_OP_MOV && nvi->flags_def) { + struct nv_instruction *cvt = new_instruction_at(pc, nvi, NV_OP_CVT); + + nv_reference(pc, &cvt->src[0], nvi->def[0]); + + cvt->flags_def = nvi->flags_def; + nvi->flags_def = NULL; + } } static int diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c index 6bd2de4c74..e1c6ed87bf 100644 --- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c +++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c @@ -625,23 +625,35 @@ bld_get_address(struct bld_context *bld, int id, struct nv_value *indirect) static struct nv_value * bld_predicate(struct bld_context *bld, struct nv_value *src, boolean bool_only) { - struct nv_instruction *nvi = src->insn; + struct nv_instruction *s0i, *nvi = src->insn; - if (nvi->opcode == NV_OP_LDA || - nvi->opcode == NV_OP_PHI || - nvi->bb != bld->pc->current_block) { - nvi = new_instruction(bld->pc, NV_OP_CVT); - nv_reference(bld->pc, &nvi->src[0], src); + if (!nvi) { + nvi = bld_insn_1(bld, + (src->reg.file == NV_FILE_IMM) ? 
NV_OP_MOV : NV_OP_LDA, + src)->insn; + src = nvi->def[0]; } else if (bool_only) { - while (nvi->opcode == NV_OP_ABS || nvi->opcode == NV_OP_CVT || - nvi->opcode == NV_OP_NEG) { - /* TGSI SET gets conversion to f32, we only need source 0/~0 */ - if (!nvi->def[0]->insn->flags_src) - nvi = nvi->src[0]->value->insn; + while (nvi->opcode == NV_OP_ABS || nvi->opcode == NV_OP_NEG || + nvi->opcode == NV_OP_CVT) { + s0i = nvi->src[0]->value->insn; + if (!s0i || + s0i->opcode == NV_OP_LDA || + s0i->opcode == NV_OP_MOV || + s0i->opcode == NV_OP_PHI) + break; + nvi = s0i; + assert(!nvi->flags_src); } } + if (nvi->opcode == NV_OP_LDA || + nvi->opcode == NV_OP_MOV || + nvi->opcode == NV_OP_PHI || nvi->bb != bld->pc->current_block) { + nvi = new_instruction(bld->pc, NV_OP_CVT); + nv_reference(bld->pc, &nvi->src[0], src); + } + if (!nvi->flags_def) { nvi->flags_def = new_value(bld->pc, NV_FILE_FLAGS, NV_TYPE_U16); nvi->flags_def->insn = nvi; -- cgit v1.2.3 From 9e4901402cf50405be28ce6311f10e22196fbc35 Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Fri, 3 Sep 2010 14:26:47 +0200 Subject: nv50: load address register before using it, not after --- src/gallium/drivers/nv50/nv50_tgsi_to_nc.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'src') diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c index e1c6ed87bf..386dbda423 100644 --- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c +++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c @@ -1000,6 +1000,7 @@ emit_fetch(struct bld_context *bld, const struct tgsi_full_instruction *insn, { const struct tgsi_full_src_register *src = &insn->Src[s]; struct nv_value *res; + struct nv_value *ptr = NULL; unsigned idx, swz, dim_idx, ind_idx, ind_swz; ubyte type = infer_src_type(insn->Instruction.Opcode); @@ -1012,7 +1013,11 @@ emit_fetch(struct bld_context *bld, const struct tgsi_full_instruction *insn, if (src->Register.Indirect) { ind_idx = src->Indirect.Index; 
ind_swz = tgsi_util_get_src_register_swizzle(&src->Indirect, 0); + + ptr = FETCH_ADDR(ind_idx, ind_swz); } + if (idx >= (128 / 4) && src->Register.File == TGSI_FILE_CONSTANT) + ptr = bld_get_address(bld, (idx * 16) & ~0x1ff, ptr); switch (src->Register.File) { case TGSI_FILE_CONSTANT: @@ -1025,11 +1030,8 @@ emit_fetch(struct bld_context *bld, const struct tgsi_full_instruction *insn, res->reg.id = (idx * 4 + swz) & 127; res = bld_insn_1(bld, NV_OP_LDA, res); - if (src->Register.Indirect) - res->insn->src[4] = new_ref(bld->pc, FETCH_ADDR(ind_idx, ind_swz)); - if (idx >= (128 / 4)) - res->insn->src[4] = - new_ref(bld->pc, bld_get_address(bld, (idx * 16) & ~0x1ff, NULL)); + if (ptr) + res->insn->src[4] = new_ref(bld->pc, ptr); break; case TGSI_FILE_IMMEDIATE: assert(idx < bld->ti->immd32_nr); -- cgit v1.2.3 From 217542a061ef31150b1b04f1b45b6099bcc153fe Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Sun, 5 Sep 2010 19:06:17 +0200 Subject: nv50: save tgsi instructions --- src/gallium/drivers/nv50/nv50_program.c | 5 +++++ src/gallium/drivers/nv50/nv50_program.h | 1 + 2 files changed, 6 insertions(+) (limited to 'src') diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c index 523603ca3a..d7d3030e2f 100644 --- a/src/gallium/drivers/nv50/nv50_program.c +++ b/src/gallium/drivers/nv50/nv50_program.c @@ -499,6 +499,8 @@ nv50_prog_scan(struct nv50_translation_info *ti) ti->immd32 = (uint32_t *)MALLOC(ti->scan.immediate_count * 16); ti->immd32_ty = (ubyte *)MALLOC(ti->scan.immediate_count * sizeof(ubyte)); + ti->insns = MALLOC(ti->scan.num_instructions * sizeof(ti->insns[0])); + tgsi_parse_init(&parse, p->pipe.tokens); while (!tgsi_parse_end_of_tokens(&parse)) { tgsi_parse_token(&parse); @@ -511,6 +513,7 @@ nv50_prog_scan(struct nv50_translation_info *ti) prog_decl(ti, &parse.FullToken.FullDeclaration); break; case TGSI_TOKEN_TYPE_INSTRUCTION: + ti->insns[ti->inst_nr] = parse.FullToken.FullInstruction; prog_inst(ti, 
&parse.FullToken.FullInstruction, ++ti->inst_nr); break; } @@ -567,6 +570,8 @@ out: FREE(ti->immd32); if (ti->immd32_ty) FREE(ti->immd32_ty); + if (ti->insns) + FREE(ti->insns); FREE(ti); return ret ? FALSE : TRUE; } diff --git a/src/gallium/drivers/nv50/nv50_program.h b/src/gallium/drivers/nv50/nv50_program.h index 639f06217e..3c3f1f7f97 100644 --- a/src/gallium/drivers/nv50/nv50_program.h +++ b/src/gallium/drivers/nv50/nv50_program.h @@ -104,6 +104,7 @@ struct nv50_subroutine { struct nv50_translation_info { struct nv50_program *p; unsigned inst_nr; + struct tgsi_full_instruction *insns; ubyte input_file; ubyte output_file; ubyte input_map[PIPE_MAX_SHADER_INPUTS][4]; -- cgit v1.2.3 From d91b8865ec2bb41f9b58ad5ce2df7f6f48f98281 Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Tue, 7 Sep 2010 15:40:34 +0200 Subject: nv50: prepare for having multiple functions At some point we'll want to support real subroutines instead of just inlining them into the main shader. Since recursive calls are forbidden, we can just save all used registers to a fixed local memory region and restore them on a return, no need for a stack pointer. 
--- src/gallium/drivers/nv50/nv50_pc.c | 48 ++++++++++++++++------ src/gallium/drivers/nv50/nv50_pc.h | 12 +++--- src/gallium/drivers/nv50/nv50_pc_optimize.c | 56 +++++++++++++++++-------- src/gallium/drivers/nv50/nv50_pc_regalloc.c | 23 ++++++++--- src/gallium/drivers/nv50/nv50_program.c | 63 +++++++++++++++++++++++++++-- src/gallium/drivers/nv50/nv50_program.h | 16 ++++---- src/gallium/drivers/nv50/nv50_screen.c | 3 +- src/gallium/drivers/nv50/nv50_tgsi_to_nc.c | 2 +- 8 files changed, 171 insertions(+), 52 deletions(-) (limited to 'src') diff --git a/src/gallium/drivers/nv50/nv50_pc.c b/src/gallium/drivers/nv50/nv50_pc.c index e34c0553eb..c54f16e4c5 100644 --- a/src/gallium/drivers/nv50/nv50_pc.c +++ b/src/gallium/drivers/nv50/nv50_pc.c @@ -304,7 +304,7 @@ nv_pc_pass_in_order(struct nv_basic_block *root, nv_pc_pass_func f, void *priv) } static void -nv_do_print_program(void *priv, struct nv_basic_block *b) +nv_do_print_function(void *priv, struct nv_basic_block *b) { struct nv_instruction *i = b->phi; @@ -323,11 +323,23 @@ nv_do_print_program(void *priv, struct nv_basic_block *b) } void -nv_print_program(struct nv_basic_block *root) +nv_print_function(struct nv_basic_block *root) { - nv_pc_pass_in_order(root, nv_do_print_program, root); + if (root->subroutine) + debug_printf("SUBROUTINE %i\n", root->subroutine); + else + debug_printf("MAIN\n"); - debug_printf("END\n\n"); + nv_pc_pass_in_order(root, nv_do_print_function, root); +} + +void +nv_print_program(struct nv_pc *pc) +{ + int i; + for (i = 0; i < pc->num_subroutines + 1; ++i) + if (pc->root[i]) + nv_print_function(pc->root[i]); } static INLINE void @@ -388,11 +400,18 @@ nv50_generate_code(struct nv50_translation_info *ti) if (!pc) return 1; + pc->root = CALLOC(ti->subr_nr + 1, sizeof(pc->root[0])); + if (!pc->root) { + FREE(pc); + return 1; + } + pc->num_subroutines = ti->subr_nr; + ret = nv50_tgsi_to_nc(pc, ti); if (ret) goto out; #ifdef NV50PC_DEBUG - nv_print_program(pc->root); + nv_print_program(pc); 
#endif /* optimization */ @@ -400,7 +419,7 @@ nv50_generate_code(struct nv50_translation_info *ti) if (ret) goto out; #ifdef NV50PC_DEBUG - nv_print_program(pc->root); + nv_print_program(pc); #endif /* register allocation */ @@ -408,7 +427,7 @@ nv50_generate_code(struct nv50_translation_info *ti) if (ret) goto out; #ifdef NV50PC_DEBUG - nv_print_program(pc->root); + nv_print_program(pc); #endif /* prepare for emission */ @@ -441,16 +460,19 @@ nv50_generate_code(struct nv50_translation_info *ti) out: nv_pc_free_refs(pc); - if (ret) { + + if (pc->bb_list) + FREE(pc->bb_list); + + if (ret) { /* on success, these will be referenced by nv50_program */ if (pc->emit) - free(pc->emit); + FREE(pc->emit); if (pc->immd_buf) - free(pc->immd_buf); + FREE(pc->immd_buf); if (pc->fixups) - free(pc->fixups); + FREE(pc->fixups); } - free(pc); - + FREE(pc); return ret; } diff --git a/src/gallium/drivers/nv50/nv50_pc.h b/src/gallium/drivers/nv50/nv50_pc.h index 703d32d334..d9cc775572 100644 --- a/src/gallium/drivers/nv50/nv50_pc.h +++ b/src/gallium/drivers/nv50/nv50_pc.h @@ -282,7 +282,7 @@ struct nv_basic_block { ubyte in_kind[8]; int id; - struct nv_basic_block *last_visitor; + int subroutine; uint priv; uint pass_seq; @@ -314,10 +314,10 @@ nv_fixup_apply(uint32_t *bin, struct nv_fixup *fixup, uint32_t data) bin[fixup->offset / 4] = val; } -struct nv_pc { - struct nv50_translation_info *ti; +struct nv50_translation_info; - struct nv_basic_block *root; +struct nv_pc { + struct nv_basic_block **root; struct nv_basic_block *current_block; struct nv_basic_block *parent_block; @@ -332,6 +332,7 @@ struct nv_pc { int num_instructions; int num_refs; int num_blocks; + int num_subroutines; int max_reg[4]; @@ -463,7 +464,8 @@ void nv_print_instruction(struct nv_instruction *); /* nv50_pc.c */ -void nv_print_program(struct nv_basic_block *b); +void nv_print_function(struct nv_basic_block *root); +void nv_print_program(struct nv_pc *); boolean nv_op_commutative(uint opcode); int 
nv50_indirect_opnd(struct nv_instruction *); diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c index 1ed5032175..4f5bdc1f9f 100644 --- a/src/gallium/drivers/nv50/nv50_pc_optimize.c +++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c @@ -213,23 +213,36 @@ nv_pc_pass_pre_emission(void *priv, struct nv_basic_block *b) pc->bin_size += b->bin_size *= 4; } -int -nv_pc_exec_pass2(struct nv_pc *pc) +static int +nv_pc_pass2(struct nv_pc *pc, struct nv_basic_block *root) { struct nv_pass pass; pass.pc = pc; pc->pass_seq++; - nv_pass_flatten(&pass, pc->root); + + nv_pass_flatten(&pass, root); + + nv_pc_pass_in_order(root, nv_pc_pass_pre_emission, pc); + + return 0; +} + +int +nv_pc_exec_pass2(struct nv_pc *pc) +{ + int i, ret; NV50_DBGMSG("preparing %u blocks for emission\n", pc->num_blocks); - pc->bb_list = CALLOC(pc->num_blocks, sizeof(struct nv_basic_block *)); - pc->num_blocks = 0; + pc->bb_list = CALLOC(pc->num_blocks, sizeof(pc->bb_list[0])); - nv_pc_pass_in_order(pc->root, nv_pc_pass_pre_emission, pc); + pc->num_blocks = 0; + for (i = 0; i < pc->num_subroutines + 1; ++i) + if (pc->root[i] && (ret = nv_pc_pass2(pc, pc->root[i]))) + return ret; return 0; } @@ -1032,8 +1045,8 @@ nv_pass_cse(struct nv_pass *ctx, struct nv_basic_block *b) return 0; } -int -nv_pc_exec_pass0(struct nv_pc *pc) +static int +nv_pc_pass0(struct nv_pc *pc, struct nv_basic_block *root) { struct nv_pass_reld_elim *reldelim; struct nv_pass pass; @@ -1047,35 +1060,35 @@ nv_pc_exec_pass0(struct nv_pc *pc) * to whether sources are supported memory loads. 
*/ pc->pass_seq++; - ret = nv_pass_lower_arith(&pass, pc->root); + ret = nv_pass_lower_arith(&pass, root); if (ret) return ret; pc->pass_seq++; - ret = nv_pass_fold_loads(&pass, pc->root); + ret = nv_pass_fold_loads(&pass, root); if (ret) return ret; pc->pass_seq++; - ret = nv_pass_fold_stores(&pass, pc->root); + ret = nv_pass_fold_stores(&pass, root); if (ret) return ret; reldelim = CALLOC_STRUCT(nv_pass_reld_elim); reldelim->pc = pc; pc->pass_seq++; - ret = nv_pass_reload_elim(reldelim, pc->root); + ret = nv_pass_reload_elim(reldelim, root); FREE(reldelim); if (ret) return ret; pc->pass_seq++; - ret = nv_pass_cse(&pass, pc->root); + ret = nv_pass_cse(&pass, root); if (ret) return ret; pc->pass_seq++; - ret = nv_pass_lower_mods(&pass, pc->root); + ret = nv_pass_lower_mods(&pass, root); if (ret) return ret; @@ -1083,14 +1096,25 @@ nv_pc_exec_pass0(struct nv_pc *pc) do { dce.removed = 0; pc->pass_seq++; - ret = nv_pass_dce(&dce, pc->root); + ret = nv_pass_dce(&dce, root); if (ret) return ret; } while (dce.removed); - ret = nv_pass_tex_mask(&pass, pc->root); + ret = nv_pass_tex_mask(&pass, root); if (ret) return ret; return ret; } + +int +nv_pc_exec_pass0(struct nv_pc *pc) +{ + int i, ret; + + for (i = 0; i < pc->num_subroutines + 1; ++i) + if (pc->root[i] && (ret = nv_pc_pass0(pc, pc->root[i]))) + return ret; + return 0; +} diff --git a/src/gallium/drivers/nv50/nv50_pc_regalloc.c b/src/gallium/drivers/nv50/nv50_pc_regalloc.c index d401706b5b..2998343db5 100644 --- a/src/gallium/drivers/nv50/nv50_pc_regalloc.c +++ b/src/gallium/drivers/nv50/nv50_pc_regalloc.c @@ -874,8 +874,8 @@ pass_linear_scan(struct nv_pc_pass *ctx, int iter) return 0; } -int -nv_pc_exec_pass1(struct nv_pc *pc) +static int +nv_pc_pass1(struct nv_pc *pc, struct nv_basic_block *root) { struct nv_pc_pass *ctx; int i, ret; @@ -890,12 +890,12 @@ nv_pc_exec_pass1(struct nv_pc *pc) ctx->insns = CALLOC(NV_PC_MAX_INSTRUCTIONS, sizeof(struct nv_instruction *)); pc->pass_seq++; - ret = 
pass_generate_phi_movs(ctx, pc->root); + ret = pass_generate_phi_movs(ctx, root); assert(!ret); for (i = 0; i < pc->loop_nesting_bound; ++i) { pc->pass_seq++; - ret = pass_build_live_sets(ctx, pc->root); + ret = pass_build_live_sets(ctx, root); assert(!ret && "live sets"); if (ret) { NOUVEAU_ERR("failed to build live sets (iteration %d)\n", i); @@ -904,10 +904,10 @@ nv_pc_exec_pass1(struct nv_pc *pc) } pc->pass_seq++; - nv_pc_pass_in_order(pc->root, pass_order_instructions, ctx); + nv_pc_pass_in_order(root, pass_order_instructions, ctx); pc->pass_seq++; - ret = pass_build_intervals(ctx, pc->root); + ret = pass_build_intervals(ctx, root); assert(!ret && "build intervals"); if (ret) { NOUVEAU_ERR("failed to build live intervals\n"); @@ -944,3 +944,14 @@ out: FREE(ctx); return ret; } + +int +nv_pc_exec_pass1(struct nv_pc *pc) +{ + int i, ret; + + for (i = 0; i < pc->num_subroutines + 1; ++i) + if (pc->root[i] && (ret = nv_pc_pass1(pc, pc->root[i]))) + return ret; + return 0; +} diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c index d7d3030e2f..925028700c 100644 --- a/src/gallium/drivers/nv50/nv50_program.c +++ b/src/gallium/drivers/nv50/nv50_program.c @@ -147,10 +147,17 @@ prog_inst(struct nv50_translation_info *ti, int s, c, k; unsigned mask; + if (inst->Instruction.Opcode == TGSI_OPCODE_BGNSUB) { + ti->subr[ti->subr_nr].pos = id - 1; + ti->subr[ti->subr_nr].id = ti->subr_nr + 1; /* id 0 is main program */ + ++ti->subr_nr; + } + if (inst->Dst[0].Register.File == TGSI_FILE_OUTPUT) { + dst = &inst->Dst[0].Register; + for (c = 0; c < 4; ++c) { - dst = &inst->Dst[0].Register; - if (inst->Dst[0].Register.Indirect) + if (dst->Indirect) nv50_indirect_outputs(ti, id); if (!(dst->WriteMask & (1 << c))) continue; @@ -182,6 +189,44 @@ prog_inst(struct nv50_translation_info *ti, } } +/* Probably should introduce something like struct tgsi_function_declaration + * instead of trying to guess inputs/outputs. 
+ */ +static void +prog_subroutine_inst(struct nv50_subroutine *subr, + const struct tgsi_full_instruction *inst) +{ + const struct tgsi_dst_register *dst; + const struct tgsi_src_register *src; + int s, c, k; + unsigned mask; + + for (s = 0; s < inst->Instruction.NumSrcRegs; ++s) { + src = &inst->Src[s].Register; + if (src->File != TGSI_FILE_TEMPORARY) + continue; + mask = nv50_tgsi_src_mask(inst, s); + + assert(!inst->Src[s].Register.Indirect); + + for (c = 0; c < 4; ++c) { + k = tgsi_util_get_full_src_register_swizzle(&inst->Src[s], c); + + if ((mask & (1 << c)) && k < TGSI_SWIZZLE_W) + if (!(subr->retv[src->Index / 32][k] & (1 << (src->Index % 32)))) + subr->argv[src->Index / 32][k] |= 1 << (src->Index % 32); + } + } + + if (inst->Dst[0].Register.File == TGSI_FILE_TEMPORARY) { + dst = &inst->Dst[0].Register; + + for (c = 0; c < 4; ++c) + if (dst->WriteMask & (1 << c)) + subr->retv[dst->Index / 32][c] |= 1 << (dst->Index % 32); + } +} + static void prog_immediate(struct nv50_translation_info *ti, const struct tgsi_full_immediate *imm) @@ -482,7 +527,7 @@ nv50_prog_scan(struct nv50_translation_info *ti) { struct nv50_program *p = ti->p; struct tgsi_parse_context parse; - int ret; + int ret, i; p->vp.edgeflag = 0x40; p->vp.psiz = 0x40; @@ -496,6 +541,9 @@ nv50_prog_scan(struct nv50_translation_info *ti) tgsi_dump(p->pipe.tokens, 0); #endif + ti->subr = + CALLOC(ti->scan.opcode_count[TGSI_OPCODE_BGNSUB], sizeof(ti->subr[0])); + ti->immd32 = (uint32_t *)MALLOC(ti->scan.immediate_count * 16); ti->immd32_ty = (ubyte *)MALLOC(ti->scan.immediate_count * sizeof(ubyte)); @@ -519,6 +567,13 @@ nv50_prog_scan(struct nv50_translation_info *ti) } } + /* Scan to determine which registers are inputs/outputs of a subroutine. 
*/ + for (i = 0; i < ti->subr_nr; ++i) { + int pc = ti->subr[i].id; + while (ti->insns[pc].Instruction.Opcode != TGSI_OPCODE_ENDSUB) + prog_subroutine_inst(&ti->subr[i], &ti->insns[pc++]); + } + p->in_nr = ti->scan.file_max[TGSI_FILE_INPUT] + 1; p->out_nr = ti->scan.file_max[TGSI_FILE_OUTPUT] + 1; @@ -572,6 +627,8 @@ out: FREE(ti->immd32_ty); if (ti->insns) FREE(ti->insns); + if (ti->subr) + FREE(ti->subr); FREE(ti); return ret ? FALSE : TRUE; } diff --git a/src/gallium/drivers/nv50/nv50_program.h b/src/gallium/drivers/nv50/nv50_program.h index 3c3f1f7f97..918baf325f 100644 --- a/src/gallium/drivers/nv50/nv50_program.h +++ b/src/gallium/drivers/nv50/nv50_program.h @@ -27,6 +27,8 @@ #include "tgsi/tgsi_scan.h" #include "nouveau/nouveau_class.h" +#define NV50_CAP_MAX_PROGRAM_TEMPS (128 / 4) + struct nv50_varying { uint8_t id; /* tgsi index */ uint8_t hw; /* hw index, nv50 wants flat FP inputs last */ @@ -92,13 +94,13 @@ struct nv50_program { #define NV50_INTERP_FLAT (1 << 1) #define NV50_INTERP_CENTROID (1 << 2) -#define NV50_PROG_MAX_SUBROUTINES 8 - /* analyze TGSI and see which TEMP[] are used as subroutine inputs/outputs */ struct nv50_subroutine { - int id; - uint32_t argv[4][1]; /* 4 bitmasks, for each of xyzw, only allow 32 TEMPs */ - uint32_t retv[4][1]; + unsigned id; + unsigned pos; + /* function inputs and outputs */ + uint32_t argv[NV50_CAP_MAX_PROGRAM_TEMPS][4]; + uint32_t retv[NV50_CAP_MAX_PROGRAM_TEMPS][4]; }; struct nv50_translation_info { @@ -119,8 +121,8 @@ struct nv50_translation_info { unsigned immd32_nr; ubyte *immd32_ty; ubyte edgeflag_out; - struct nv50_subroutine subr[NV50_PROG_MAX_SUBROUTINES]; - int subr_nr; + struct nv50_subroutine *subr; + unsigned subr_nr; }; int nv50_generate_code(struct nv50_translation_info *ti); diff --git a/src/gallium/drivers/nv50/nv50_screen.c b/src/gallium/drivers/nv50/nv50_screen.c index fc75d81d54..c1efa443da 100644 --- a/src/gallium/drivers/nv50/nv50_screen.c +++ b/src/gallium/drivers/nv50/nv50_screen.c @@ -26,6 
+26,7 @@ #include "nv50_context.h" #include "nv50_screen.h" #include "nv50_resource.h" +#include "nv50_program.h" #include "nouveau/nouveau_stateobj.h" @@ -152,7 +153,7 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) return 0; case PIPE_CAP_MAX_VS_TEMPS: case PIPE_CAP_MAX_FS_TEMPS: /* no spilling atm */ - return 128 / 4; + return NV50_CAP_MAX_PROGRAM_TEMPS; case PIPE_CAP_DEPTH_CLAMP: return 1; default: diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c index 386dbda423..dea8fa0663 100644 --- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c +++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c @@ -1850,7 +1850,7 @@ nv50_tgsi_to_nc(struct nv_pc *pc, struct nv50_translation_info *ti) struct bld_context *bld = CALLOC_STRUCT(bld_context); int c; - pc->root = pc->current_block = new_basic_block(pc); + pc->root[0] = pc->current_block = new_basic_block(pc); bld->pc = pc; bld->ti = ti; -- cgit v1.2.3 From d8dcff79702860eae92d3d35b461c9b71114c1c5 Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Tue, 7 Sep 2010 19:02:10 +0200 Subject: nv50: don't parse again in tgsi_2_nc --- src/gallium/drivers/nv50/nv50_tgsi_to_nc.c | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) (limited to 'src') diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c index dea8fa0663..983fcb2fbf 100644 --- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c +++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c @@ -1849,6 +1849,7 @@ nv50_tgsi_to_nc(struct nv_pc *pc, struct nv50_translation_info *ti) { struct bld_context *bld = CALLOC_STRUCT(bld_context); int c; + unsigned ip; pc->root[0] = pc->current_block = new_basic_block(pc); @@ -1865,21 +1866,8 @@ nv50_tgsi_to_nc(struct nv_pc *pc, struct nv50_translation_info *ti) bld->frgcrd[3] = bld_insn_1(bld, NV_OP_RCP, bld->frgcrd[3]); } - tgsi_parse_init(&bld->parse[0], ti->p->pipe.tokens); - - while 
(!tgsi_parse_end_of_tokens(&bld->parse[bld->call_lvl])) { - const union tgsi_full_token *tok = &bld->parse[bld->call_lvl].FullToken; - - tgsi_parse_token(&bld->parse[bld->call_lvl]); - - switch (tok->Token.Type) { - case TGSI_TOKEN_TYPE_INSTRUCTION: - bld_instruction(bld, &tok->FullInstruction); - break; - default: - break; - } - } + for (ip = 0; ip < ti->inst_nr; ++ip) + bld_instruction(bld, &ti->insns[ip]); bld_free_value_trackers(&bld->tvs[0][0], BLD_MAX_TEMPS); bld_free_value_trackers(&bld->avs[0][0], BLD_MAX_ADDRS); -- cgit v1.2.3 From f30810cb68a53c4fef360778a230126ed0ee0ee3 Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Thu, 9 Sep 2010 19:12:54 +0200 Subject: nv50: use actual loads/stores if TEMPs are accessed indirectly --- src/gallium/drivers/nv50/nv50_pc.c | 2 ++ src/gallium/drivers/nv50/nv50_pc.h | 3 ++ src/gallium/drivers/nv50/nv50_pc_emit.c | 28 +++++++++++---- src/gallium/drivers/nv50/nv50_pc_optimize.c | 19 ++++++---- src/gallium/drivers/nv50/nv50_pc_print.c | 3 ++ src/gallium/drivers/nv50/nv50_program.c | 7 ++++ src/gallium/drivers/nv50/nv50_program.h | 1 + src/gallium/drivers/nv50/nv50_screen.c | 25 ++++++++++--- src/gallium/drivers/nv50/nv50_screen.h | 3 +- src/gallium/drivers/nv50/nv50_tgsi_to_nc.c | 54 ++++++++++++++++++++++++++--- 10 files changed, 122 insertions(+), 23 deletions(-) (limited to 'src') diff --git a/src/gallium/drivers/nv50/nv50_pc.c b/src/gallium/drivers/nv50/nv50_pc.c index c54f16e4c5..637b3cf2fe 100644 --- a/src/gallium/drivers/nv50/nv50_pc.c +++ b/src/gallium/drivers/nv50/nv50_pc.c @@ -414,6 +414,8 @@ nv50_generate_code(struct nv50_translation_info *ti) nv_print_program(pc); #endif + pc->opt_reload_elim = ti->store_to_memory ? 
FALSE : TRUE; + /* optimization */ ret = nv_pc_exec_pass0(pc); if (ret) diff --git a/src/gallium/drivers/nv50/nv50_pc.h b/src/gallium/drivers/nv50/nv50_pc.h index d9cc775572..ba32ab08ab 100644 --- a/src/gallium/drivers/nv50/nv50_pc.h +++ b/src/gallium/drivers/nv50/nv50_pc.h @@ -345,6 +345,9 @@ struct nv_pc { struct nv_fixup *fixups; int num_fixups; + + /* optimization enables */ + boolean opt_reload_elim; }; void nvbb_insert_tail(struct nv_basic_block *, struct nv_instruction *); diff --git a/src/gallium/drivers/nv50/nv50_pc_emit.c b/src/gallium/drivers/nv50/nv50_pc_emit.c index bb0a6f32d1..8c64b19875 100644 --- a/src/gallium/drivers/nv50/nv50_pc_emit.c +++ b/src/gallium/drivers/nv50/nv50_pc_emit.c @@ -412,25 +412,25 @@ emit_form_IMM(struct nv_pc *pc, struct nv_instruction *i, ubyte mod_mask) } static void -set_ld_st_size(struct nv_pc *pc, ubyte type) +set_ld_st_size(struct nv_pc *pc, int s, ubyte type) { switch (type) { case NV_TYPE_F64: - pc->emit[1] |= 0x8000; + pc->emit[1] |= 0x8000 << s; break; case NV_TYPE_F32: case NV_TYPE_S32: case NV_TYPE_U32: - pc->emit[1] |= 0xc000; + pc->emit[1] |= 0xc000 << s; break; case NV_TYPE_S16: - pc->emit[1] |= 0x6000; + pc->emit[1] |= 0x6000 << s; break; case NV_TYPE_U16: - pc->emit[1] |= 0x4000; + pc->emit[1] |= 0x4000 << s; break; case NV_TYPE_S8: - pc->emit[1] |= 0x2000; + pc->emit[1] |= 0x2000 << s; break; default: break; @@ -473,12 +473,14 @@ emit_ld(struct nv_pc *pc, struct nv_instruction *i) if (sf == NV_FILE_MEM_L) { pc->emit[0] = 0xd0000001; pc->emit[1] = 0x40000000; + + set_addr(pc, i); } else { NOUVEAU_ERR("invalid ld source file\n"); abort(); } - set_ld_st_size(pc, STYPE(i, 0)); + set_ld_st_size(pc, (sf == NV_FILE_MEM_L) ? 
8 : 0, STYPE(i, 0)); set_dst(pc, i->def[0]); set_pred_wr(pc, i); @@ -495,7 +497,19 @@ emit_ld(struct nv_pc *pc, struct nv_instruction *i) static void emit_st(struct nv_pc *pc, struct nv_instruction *i) { + assert(SFILE(i, 1) == NV_FILE_GPR); + assert(SFILE(i, 0) == NV_FILE_MEM_L); + + pc->emit[0] = 0xd0000001; + pc->emit[1] = 0x60000000; + SID(pc, i->src[1], 2); + SID(pc, i->src[0], 9); + + set_ld_st_size(pc, 8, STYPE(i, 1)); + + set_addr(pc, i); + set_pred(pc, i); } static int diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c index 4f5bdc1f9f..09d232abda 100644 --- a/src/gallium/drivers/nv50/nv50_pc_optimize.c +++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c @@ -82,6 +82,8 @@ inst_commutation_legal(struct nv_instruction *a, static INLINE boolean inst_cullable(struct nv_instruction *nvi) { + if (nvi->opcode == NV_OP_STA) + return FALSE; return (!(nvi->is_terminator || nvi->is_join || nvi->target || nvi->fixed || @@ -739,6 +741,7 @@ struct nv_pass_reld_elim { int alloc; }; +/* TODO: properly handle loads from l[] memory in the presence of stores */ static int nv_pass_reload_elim(struct nv_pass_reld_elim *ctx, struct nv_basic_block *b) { @@ -1074,13 +1077,15 @@ nv_pc_pass0(struct nv_pc *pc, struct nv_basic_block *root) if (ret) return ret; - reldelim = CALLOC_STRUCT(nv_pass_reld_elim); - reldelim->pc = pc; - pc->pass_seq++; - ret = nv_pass_reload_elim(reldelim, root); - FREE(reldelim); - if (ret) - return ret; + if (pc->opt_reload_elim) { + reldelim = CALLOC_STRUCT(nv_pass_reld_elim); + reldelim->pc = pc; + pc->pass_seq++; + ret = nv_pass_reload_elim(reldelim, root); + FREE(reldelim); + if (ret) + return ret; + } pc->pass_seq++; ret = nv_pass_cse(&pass, root); diff --git a/src/gallium/drivers/nv50/nv50_pc_print.c b/src/gallium/drivers/nv50/nv50_pc_print.c index 01a6f00997..74c3970f40 100644 --- a/src/gallium/drivers/nv50/nv50_pc_print.c +++ b/src/gallium/drivers/nv50/nv50_pc_print.c @@ -217,6 +217,9 @@ 
nv_print_value(struct nv_value *value, struct nv_value *ind, ubyte type) case NV_FILE_FLAGS: PRINT(" %s%cc%i", mgta, reg_pfx, nv_value_id(value)); break; + case NV_FILE_MEM_L: + nv_print_address('l', -1, ind, 4 * nv_value_id(value)); + break; case NV_FILE_MEM_S: nv_print_address('s', -1, ind, 4 * nv_value_id(value)); break; diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c index 925028700c..24952f70f1 100644 --- a/src/gallium/drivers/nv50/nv50_program.c +++ b/src/gallium/drivers/nv50/nv50_program.c @@ -168,10 +168,17 @@ prog_inst(struct nv50_translation_info *ti, inst->Src[0].Register.File == TGSI_FILE_INPUT && dst->Index == ti->edgeflag_out) ti->p->vp.edgeflag = inst->Src[0].Register.Index; + } else + if (inst->Dst[0].Register.File == TGSI_FILE_TEMPORARY) { + if (inst->Dst[0].Register.Indirect) + ti->store_to_memory = TRUE; } for (s = 0; s < inst->Instruction.NumSrcRegs; ++s) { src = &inst->Src[s].Register; + if (src->File == TGSI_FILE_TEMPORARY) + if (inst->Src[s].Register.Indirect) + ti->store_to_memory = TRUE; if (src->File != TGSI_FILE_INPUT) continue; mask = nv50_tgsi_src_mask(inst, s); diff --git a/src/gallium/drivers/nv50/nv50_program.h b/src/gallium/drivers/nv50/nv50_program.h index 918baf325f..a1b2bde97b 100644 --- a/src/gallium/drivers/nv50/nv50_program.h +++ b/src/gallium/drivers/nv50/nv50_program.h @@ -116,6 +116,7 @@ struct nv50_translation_info { int output_access[PIPE_MAX_SHADER_OUTPUTS][4]; boolean indirect_inputs; boolean indirect_outputs; + boolean store_to_memory; struct tgsi_shader_info scan; uint32_t *immd32; unsigned immd32_nr; diff --git a/src/gallium/drivers/nv50/nv50_screen.c b/src/gallium/drivers/nv50/nv50_screen.c index c1efa443da..24a6d8055c 100644 --- a/src/gallium/drivers/nv50/nv50_screen.c +++ b/src/gallium/drivers/nv50/nv50_screen.c @@ -274,7 +274,7 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev) uint64_t value; unsigned chipset = dev->chipset; unsigned 
tesla_class = 0; - unsigned stack_size; + unsigned stack_size, local_size, max_warps; int ret, i; const unsigned rl = NOUVEAU_BO_VRAM | NOUVEAU_BO_RD; @@ -495,9 +495,10 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev) /* shader stack */ nouveau_device_get_param(dev, NOUVEAU_GETPARAM_GRAPH_UNITS, &value); - stack_size = util_bitcount(value & 0xffff); - stack_size *= util_bitcount((value >> 24) & 0xf); - stack_size *= 32 * 64 * 8; + max_warps = util_bitcount(value & 0xffff); + max_warps *= util_bitcount((value >> 24) & 0xf) * 32; + + stack_size = max_warps * 64 * 8; ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 1 << 16, stack_size, &screen->stack_bo); @@ -510,6 +511,22 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev) OUT_RELOCl(chan, screen->stack_bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); OUT_RING (chan, 4); + local_size = (NV50_CAP_MAX_PROGRAM_TEMPS * 16) * max_warps * 32; + + ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 1 << 16, + local_size, &screen->local_bo); + if (ret) { + nv50_screen_destroy(pscreen); + return NULL; + } + + local_size = NV50_CAP_MAX_PROGRAM_TEMPS * 16; + + BEGIN_RING(chan, screen->tesla, NV50TCL_LOCAL_ADDRESS_HIGH, 3); + OUT_RELOCh(chan, screen->local_bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + OUT_RELOCl(chan, screen->local_bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + OUT_RING (chan, util_unsigned_logbase2(local_size / 8)); + /* Vertex array limits - max them out */ for (i = 0; i < 16; i++) { BEGIN_RING(chan, screen->tesla, diff --git a/src/gallium/drivers/nv50/nv50_screen.h b/src/gallium/drivers/nv50/nv50_screen.h index 1517f5608f..ad6bdeb27c 100644 --- a/src/gallium/drivers/nv50/nv50_screen.h +++ b/src/gallium/drivers/nv50/nv50_screen.h @@ -25,7 +25,8 @@ struct nv50_screen { struct nouveau_bo *tic; struct nouveau_bo *tsc; - struct nouveau_bo *stack_bo; + struct nouveau_bo *stack_bo; /* control flow stack */ + struct nouveau_bo *local_bo; /* l[] memory */ boolean force_push; }; diff --git 
a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c index 983fcb2fbf..f4fee4e0f2 100644 --- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c +++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c @@ -558,6 +558,38 @@ bld_insn_3(struct bld_context *bld, uint opcode, return bld_def(insn, 0, new_value(bld->pc, NV_FILE_GPR, src0->reg.type)); } +static void +bld_lmem_store(struct bld_context *bld, struct nv_value *ptr, int ofst, + struct nv_value *val) +{ + struct nv_instruction *insn = new_instruction(bld->pc, NV_OP_STA); + struct nv_value *loc; + + loc = new_value(bld->pc, NV_FILE_MEM_L, NV_TYPE_U32); + + loc->reg.id = ofst * 4; + + nv_reference(bld->pc, &insn->src[0], loc); + nv_reference(bld->pc, &insn->src[1], val); + nv_reference(bld->pc, &insn->src[4], ptr); +} + +static struct nv_value * +bld_lmem_load(struct bld_context *bld, struct nv_value *ptr, int ofst) +{ + struct nv_value *loc, *val; + + loc = new_value(bld->pc, NV_FILE_MEM_L, NV_TYPE_U32); + + loc->reg.id = ofst * 4; + + val = bld_insn_1(bld, NV_OP_LDA, loc); + + nv_reference(bld->pc, &val->insn->src[4], ptr); + + return val; +} + #define BLD_INSN_1_EX(d, op, dt, s0, s0t) \ do { \ (d) = bld_insn_1(bld, (NV_OP_##op), (s0)); \ @@ -854,10 +886,18 @@ infer_dst_type(unsigned opcode) static void emit_store(struct bld_context *bld, const struct tgsi_full_instruction *inst, - unsigned chan, struct nv_value *value) + unsigned chan, struct nv_value *value) { + struct nv_value *ptr; const struct tgsi_full_dst_register *reg = &inst->Dst[0]; + if (reg->Register.Indirect) { + ptr = FETCH_ADDR(reg->Indirect.Index, + tgsi_util_get_src_register_swizzle(&reg->Indirect, 0)); + } else { + ptr = NULL; + } + assert(chan < 4); if (inst->Instruction.Opcode != TGSI_OPCODE_MOV) @@ -893,7 +933,11 @@ emit_store(struct bld_context *bld, const struct tgsi_full_instruction *inst, value->reg.file = NV_FILE_GPR; if (value->insn->bb != bld->pc->current_block) value = bld_insn_1(bld, NV_OP_MOV, value); -
STORE_TEMP(reg->Register.Index, chan, value); + + if (bld->ti->store_to_memory) + bld_lmem_store(bld, ptr, reg->Register.Index * 4 + chan, value); + else + STORE_TEMP(reg->Register.Index, chan, value); break; case TGSI_FILE_ADDRESS: assert(reg->Register.Index < BLD_MAX_ADDRS); @@ -1064,8 +1108,10 @@ emit_fetch(struct bld_context *bld, const struct tgsi_full_instruction *insn, bld->saved_inputs[bld->ti->input_map[idx][swz]] = res; break; case TGSI_FILE_TEMPORARY: - /* this should be load from l[], with reload elimination later on */ - res = bld_fetch_global(bld, &bld->tvs[idx][swz]); + if (bld->ti->store_to_memory) + res = bld_lmem_load(bld, ptr, idx * 4 + swz); + else + res = bld_fetch_global(bld, &bld->tvs[idx][swz]); break; case TGSI_FILE_ADDRESS: res = bld_fetch_global(bld, &bld->avs[idx][swz]); -- cgit v1.2.3 From 9cc80e25db3d0bfd38015a197de3a1a80b6733ab Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Thu, 9 Sep 2010 19:17:55 +0200 Subject: nv50: create value references with the right type Since atm our OPs aren't typed but instead values are, we need to take care if they're used as different types (e.g. a load makes a value u32 by default). Maybe this should be changed (also to match TGSI), but it should work as well if done properly. 
--- src/gallium/drivers/nv50/nv50_pc.h | 9 ++++-- src/gallium/drivers/nv50/nv50_tgsi_to_nc.c | 46 +++++++++++++++--------------- 2 files changed, 29 insertions(+), 26 deletions(-) (limited to 'src') diff --git a/src/gallium/drivers/nv50/nv50_pc.h b/src/gallium/drivers/nv50/nv50_pc.h index ba32ab08ab..ccddae063c 100644 --- a/src/gallium/drivers/nv50/nv50_pc.h +++ b/src/gallium/drivers/nv50/nv50_pc.h @@ -189,6 +189,7 @@ struct nv_reg { int id; ubyte file; ubyte type; /* type of generating instruction's result */ + ubyte as_type; /* default type for new references to this value */ union { float f32; double f64; @@ -396,14 +397,16 @@ new_value(struct nv_pc *pc, ubyte file, ubyte type) value->join = value; value->reg.id = -1; value->reg.file = file; - value->reg.type = type; + value->reg.type = value->reg.as_type = type; return value; } static INLINE struct nv_value * new_value_like(struct nv_pc *pc, struct nv_value *like) { - return new_value(pc, like->reg.file, like->reg.type); + struct nv_value *val = new_value(pc, like->reg.file, like->reg.type); + val->reg.as_type = like->reg.as_type; + return val; } static INLINE struct nv_ref * @@ -425,7 +428,7 @@ new_ref(struct nv_pc *pc, struct nv_value *val) ref = pc->refs[pc->num_refs++]; ref->value = val; - ref->typecast = val->reg.type; + ref->typecast = val->reg.as_type; ++val->refc; return ref; diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c index f4fee4e0f2..50f0151b53 100644 --- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c +++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c @@ -22,8 +22,6 @@ /* #define NV50_TGSI2NC_DEBUG */ -/* XXX: need to clean this up so we get the typecasting right more naturally */ - #include #include "nv50_context.h" @@ -519,17 +517,16 @@ bld_imm_f32(struct bld_context *bld, float f) return bld_imm_u32(bld, fui(f)); } -#define SET_TYPE(v, t) ((v)->reg.type = NV_TYPE_##t) +#define SET_TYPE(v, t) ((v)->reg.type = (v)->reg.as_type = (t)) static struct 
nv_value * bld_insn_1(struct bld_context *bld, uint opcode, struct nv_value *src0) { struct nv_instruction *insn = new_instruction(bld->pc, opcode); - assert(insn); - nv_reference(bld->pc, &insn->src[0], src0); /* NOTE: new_ref would suffice */ + nv_reference(bld->pc, &insn->src[0], src0); - return bld_def(insn, 0, new_value(bld->pc, NV_FILE_GPR, src0->reg.type)); + return bld_def(insn, 0, new_value(bld->pc, NV_FILE_GPR, src0->reg.as_type)); } static struct nv_value * @@ -541,7 +538,7 @@ bld_insn_2(struct bld_context *bld, uint opcode, nv_reference(bld->pc, &insn->src[0], src0); nv_reference(bld->pc, &insn->src[1], src1); - return bld_def(insn, 0, new_value(bld->pc, NV_FILE_GPR, src0->reg.type)); + return bld_def(insn, 0, new_value(bld->pc, NV_FILE_GPR, src0->reg.as_type)); } static struct nv_value * @@ -555,7 +552,7 @@ bld_insn_3(struct bld_context *bld, uint opcode, nv_reference(bld->pc, &insn->src[1], src1); nv_reference(bld->pc, &insn->src[2], src2); - return bld_def(insn, 0, new_value(bld->pc, NV_FILE_GPR, src0->reg.type)); + return bld_def(insn, 0, new_value(bld->pc, NV_FILE_GPR, src0->reg.as_type)); } static void @@ -593,14 +590,14 @@ bld_lmem_load(struct bld_context *bld, struct nv_value *ptr, int ofst) #define BLD_INSN_1_EX(d, op, dt, s0, s0t) \ do { \ (d) = bld_insn_1(bld, (NV_OP_##op), (s0)); \ - (d)->reg.type = NV_TYPE_##dt; \ + SET_TYPE(d, NV_TYPE_##dt); \ (d)->insn->src[0]->typecast = NV_TYPE_##s0t; \ } while(0) #define BLD_INSN_2_EX(d, op, dt, s0, s0t, s1, s1t) \ do { \ (d) = bld_insn_2(bld, (NV_OP_##op), (s0), (s1)); \ - (d)->reg.type = NV_TYPE_##dt; \ + SET_TYPE(d, NV_TYPE_##dt); \ (d)->insn->src[0]->typecast = NV_TYPE_##s0t; \ (d)->insn->src[1]->typecast = NV_TYPE_##s1t; \ } while(0) @@ -910,9 +907,9 @@ emit_store(struct bld_context *bld, const struct tgsi_full_instruction *inst, BLD_INSN_1_EX(value, SAT, F32, value, F32); break; case TGSI_SAT_MINUS_PLUS_ONE: + value->reg.as_type = NV_TYPE_F32; value = bld_insn_2(bld, NV_OP_MAX, value, 
bld_load_imm_f32(bld, -1.0f)); value = bld_insn_2(bld, NV_OP_MIN, value, bld_load_imm_f32(bld, 1.0f)); - value->reg.type = NV_TYPE_F32; break; } @@ -1070,7 +1067,7 @@ emit_fetch(struct bld_context *bld, const struct tgsi_full_instruction *insn, assert(dim_idx == 1); /* for now */ res = new_value(bld->pc, NV_FILE_MEM_C(dim_idx), type); - res->reg.type = type; + SET_TYPE(res, type); res->reg.id = (idx * 4 + swz) & 127; res = bld_insn_1(bld, NV_OP_LDA, res); @@ -1082,11 +1079,11 @@ emit_fetch(struct bld_context *bld, const struct tgsi_full_instruction *insn, res = bld_load_imm_u32(bld, bld->ti->immd32[idx * 4 + swz]); switch (bld->ti->immd32_ty[idx]) { - case TGSI_IMM_FLOAT32: res->reg.type = NV_TYPE_F32; break; - case TGSI_IMM_UINT32: res->reg.type = NV_TYPE_U32; break; - case TGSI_IMM_INT32: res->reg.type = NV_TYPE_S32; break; + case TGSI_IMM_FLOAT32: SET_TYPE(res, NV_TYPE_F32); break; + case TGSI_IMM_UINT32: SET_TYPE(res, NV_TYPE_U32); break; + case TGSI_IMM_INT32: SET_TYPE(res, NV_TYPE_S32); break; default: - res->reg.type = type; + SET_TYPE(res, type); break; } break; @@ -1127,6 +1124,9 @@ emit_fetch(struct bld_context *bld, const struct tgsi_full_instruction *insn, if (!res) return bld_undef(bld, NV_FILE_GPR); + if (insn->Instruction.Opcode != TGSI_OPCODE_MOV) + res->reg.as_type = type; + switch (tgsi_util_get_full_src_register_sign_mode(src, chan)) { case TGSI_UTIL_SIGN_KEEP: break; @@ -1305,7 +1305,7 @@ emit_tex(struct bld_context *bld, uint opcode, /* the inputs to a tex instruction must be separate values */ for (c = 0; c < argc; ++c) { t[c] = bld_insn_1(bld, NV_OP_MOV, t_in[c]); - t[c]->reg.type = NV_TYPE_F32; + SET_TYPE(t[c], NV_TYPE_F32); t[c]->insn->fixed = 1; } @@ -1363,7 +1363,7 @@ bld_texbias_sequence(struct bld_context *bld, cr[l] = bld_cmov(bld, bit[l], NV_CC_EQ, val->insn->flags_def); cr[l]->reg.file = NV_FILE_FLAGS; - cr[l]->reg.type = NV_TYPE_U16; + SET_TYPE(cr[l], NV_TYPE_U16); } sel = new_instruction(bld->pc, NV_OP_SELECT); @@ -1510,7 +1510,8 
@@ bld_instruction(struct bld_context *bld, src1 = bld_imm_u32(bld, 4); FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) { src0 = emit_fetch(bld, insn, 0, c); - (temp = bld_insn_1(bld, NV_OP_FLOOR, src0))->reg.type = NV_TYPE_S32; + temp = bld_insn_1(bld, NV_OP_FLOOR, src0); + SET_TYPE(temp, NV_TYPE_S32); dst0[c] = bld_insn_2(bld, NV_OP_SHL, temp, src1); } break; @@ -1791,7 +1792,7 @@ bld_instruction(struct bld_context *bld, src1 = emit_fetch(bld, insn, 1, c); dst0[c] = bld_insn_2(bld, NV_OP_SET, src0, src1); dst0[c]->insn->set_cond = translate_setcc(insn->Instruction.Opcode); - dst0[c]->reg.type = infer_dst_type(insn->Instruction.Opcode); + SET_TYPE(dst0[c], infer_dst_type(insn->Instruction.Opcode)); dst0[c]->insn->src[0]->typecast = dst0[c]->insn->src[1]->typecast = @@ -1799,11 +1800,10 @@ bld_instruction(struct bld_context *bld, if (dst0[c]->reg.type != NV_TYPE_F32) break; + dst0[c]->reg.as_type = NV_TYPE_S32; dst0[c] = bld_insn_1(bld, NV_OP_ABS, dst0[c]); - dst0[c]->insn->src[0]->typecast = NV_TYPE_S32; - dst0[c]->reg.type = NV_TYPE_S32; dst0[c] = bld_insn_1(bld, NV_OP_CVT, dst0[c]); - dst0[c]->reg.type = NV_TYPE_F32; + SET_TYPE(dst0[c], NV_TYPE_F32); } break; case TGSI_OPCODE_SCS: -- cgit v1.2.3 From 246ebd7df1854db22a7f46302ecb1b5d56b68855 Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Thu, 9 Sep 2010 19:18:42 +0200 Subject: nv50: duplicate interps in load_proj_tex_coords Otherwise we might clobber the origin interpolation result or use the result of the RCP before its definition. 
--- src/gallium/drivers/nv50/nv50_tgsi_to_nc.c | 39 +++++++++++++++++++++++++++--- 1 file changed, 35 insertions(+), 4 deletions(-) (limited to 'src') diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c index 50f0151b53..4168bbbc95 100644 --- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c +++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c @@ -555,6 +555,34 @@ bld_insn_3(struct bld_context *bld, uint opcode, return bld_def(insn, 0, new_value(bld->pc, NV_FILE_GPR, src0->reg.as_type)); } +static struct nv_value * +bld_duplicate_insn(struct bld_context *bld, struct nv_instruction *nvi) +{ + struct nv_instruction *dupi = new_instruction(bld->pc, nvi->opcode); + int c; + + if (nvi->def[0]) + bld_def(dupi, 0, new_value_like(bld->pc, nvi->def[0])); + + if (nvi->flags_def) { + dupi->flags_def = new_value_like(bld->pc, nvi->flags_def); + dupi->flags_def->insn = dupi; + } + + for (c = 0; c < 5; ++c) + if (nvi->src[c]) + nv_reference(bld->pc, &dupi->src[c], nvi->src[c]->value); + if (nvi->flags_src) + nv_reference(bld->pc, &dupi->flags_src, nvi->flags_src->value); + + dupi->cc = nvi->cc; + dupi->saturate = nvi->saturate; + dupi->centroid = nvi->centroid; + dupi->flat = nvi->flat; + + return dupi->def[0]; +} + static void bld_lmem_store(struct bld_context *bld, struct nv_value *ptr, int ofst, struct nv_value *val) @@ -1232,6 +1260,7 @@ load_proj_tex_coords(struct bld_context *bld, t[3] = emit_fetch(bld, insn, 0, 3); if (t[3]->insn->opcode == NV_OP_PINTERP) { + t[3] = bld_duplicate_insn(bld, t[3]->insn); t[3]->insn->opcode = NV_OP_LINTERP; nv_reference(bld->pc, &t[3]->insn->src[1], NULL); } @@ -1240,13 +1269,15 @@ load_proj_tex_coords(struct bld_context *bld, for (c = 0; c < dim; ++c) { t[c] = emit_fetch(bld, insn, 0, c); - if (t[c]->insn->opcode == NV_OP_LINTERP) - t[c]->insn->opcode = NV_OP_PINTERP; - if (t[c]->insn->opcode == NV_OP_PINTERP) + if (t[c]->insn->opcode == NV_OP_LINTERP || + t[c]->insn->opcode == NV_OP_PINTERP) { + t[c] 
= bld_duplicate_insn(bld, t[c]->insn); + t[c]->insn->opcode = NV_OP_PINTERP; nv_reference(bld->pc, &t[c]->insn->src[1], t[3]); - else + } else { mask |= 1 << c; + } } for (c = 0; mask; ++c, mask >>= 1) { -- cgit v1.2.3 From 6b14a3eb191ab798e524f2413180256fbcc2b33e Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Thu, 9 Sep 2010 19:19:08 +0200 Subject: nv50: address regs are 16 bit --- src/gallium/drivers/nv50/nv50_pc_print.c | 4 +++- src/gallium/drivers/nv50/nv50_tgsi_to_nc.c | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'src') diff --git a/src/gallium/drivers/nv50/nv50_pc_print.c b/src/gallium/drivers/nv50/nv50_pc_print.c index 74c3970f40..a71401979c 100644 --- a/src/gallium/drivers/nv50/nv50_pc_print.c +++ b/src/gallium/drivers/nv50/nv50_pc_print.c @@ -171,12 +171,14 @@ nv_value_allocated(struct nv_value *value) static INLINE void nv_print_address(const char c, int buf, struct nv_value *a, int offset) { + const char ac = (a && nv_value_allocated(a)) ? '$' : '%'; + if (buf >= 0) PRINT(" %s%c%i[", cyan, c, buf); else PRINT(" %s%c[", cyan, c); if (a) - PRINT("%s$a%i%s+", mgta, nv_value_id(a), cyan); + PRINT("%s%ca%i%s+", mgta, ac, nv_value_id(a), cyan); PRINT("%s0x%x%s]", orng, offset, cyan); } diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c index 4168bbbc95..6fd749b35f 100644 --- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c +++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c @@ -674,6 +674,7 @@ bld_get_address(struct bld_context *bld, int id, struct nv_value *indirect) bld->saved_addr[i][0] = bld_load_imm_u32(bld, id); bld->saved_addr[i][0]->reg.file = NV_FILE_ADDR; + bld->saved_addr[i][0]->reg.type = NV_TYPE_U16; bld->saved_addr[i][1] = indirect; return bld->saved_addr[i][0]; } @@ -967,6 +968,7 @@ emit_store(struct bld_context *bld, const struct tgsi_full_instruction *inst, case TGSI_FILE_ADDRESS: assert(reg->Register.Index < BLD_MAX_ADDRS); value->reg.file = NV_FILE_ADDR; + 
value->reg.type = NV_TYPE_U16; STORE_ADDR(reg->Register.Index, chan, value); break; } -- cgit v1.2.3 From 6997da9f3cf22b9d11ffdfa6ad25b68ef4913fc3 Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Thu, 9 Sep 2010 19:09:38 +0200 Subject: nv50: fix can_load check for 3rd source --- src/gallium/drivers/nv50/nv50_pc.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'src') diff --git a/src/gallium/drivers/nv50/nv50_pc.c b/src/gallium/drivers/nv50/nv50_pc.c index 637b3cf2fe..e4df742a80 100644 --- a/src/gallium/drivers/nv50/nv50_pc.c +++ b/src/gallium/drivers/nv50/nv50_pc.c @@ -112,13 +112,11 @@ nv50_nvi_can_load(struct nv_instruction *nvi, int s, struct nv_value *value) if (s == 0 && (value->reg.file == NV_FILE_MEM_S || value->reg.file == NV_FILE_MEM_P)) return TRUE; - if (s == 1 && - value->reg.file >= NV_FILE_MEM_C(0) && - value->reg.file <= NV_FILE_MEM_C(15)) - return TRUE; - if (s == 2 && nvi->src[1]->value->reg.file == NV_FILE_GPR) - return TRUE; - return FALSE; + if (value->reg.file < NV_FILE_MEM_C(0) || + value->reg.file > NV_FILE_MEM_C(15)) + return FALSE; + return (s == 1) || + ((s == 2) && (nvi->src[1]->value->reg.file == NV_FILE_GPR)); case NV_OP_MOV: assert(s == 0); return /* TRUE */ FALSE; /* don't turn MOVs into loads */ -- cgit v1.2.3 From 7a4a537be1460b09b192fdf4d92680aad6c9e951 Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Sun, 12 Sep 2010 00:46:38 +0200 Subject: nv50: reduce bb_reachable_by runtime from pot to linear As a by-product, remove the memory leak of nv_basic_blocks. 
--- src/gallium/drivers/nv50/nv50_pc.c | 105 ++++++++++++++++++++++++---- src/gallium/drivers/nv50/nv50_pc.h | 16 +++-- src/gallium/drivers/nv50/nv50_pc_optimize.c | 4 +- 3 files changed, 104 insertions(+), 21 deletions(-) (limited to 'src') diff --git a/src/gallium/drivers/nv50/nv50_pc.c b/src/gallium/drivers/nv50/nv50_pc.c index e4df742a80..e063888eb5 100644 --- a/src/gallium/drivers/nv50/nv50_pc.c +++ b/src/gallium/drivers/nv50/nv50_pc.c @@ -340,6 +340,66 @@ nv_print_program(struct nv_pc *pc) nv_print_function(pc->root[i]); } +#ifdef NV50_PC_DEBUG +static void +nv_do_print_cfgraph(struct nv_pc *pc, FILE *f, struct nv_basic_block *b) +{ + int i; + + b->pass_seq = pc->pass_seq; + + fprintf(f, "\t%i [shape=box]\n", b->id); + + for (i = 0; i < 2; ++i) { + if (!b->out[i]) + continue; + switch (b->out_kind[i]) { + case CFG_EDGE_FORWARD: + fprintf(f, "\t%i -> %i;\n", b->id, b->out[i]->id); + break; + case CFG_EDGE_LOOP_ENTER: + fprintf(f, "\t%i -> %i [color=green];\n", b->id, b->out[i]->id); + break; + case CFG_EDGE_LOOP_LEAVE: + fprintf(f, "\t%i -> %i [color=red];\n", b->id, b->out[i]->id); + break; + case CFG_EDGE_BACK: + fprintf(f, "\t%i -> %i;\n", b->id, b->out[i]->id); + continue; + case CFG_EDGE_FAKE: + fprintf(f, "\t%i -> %i [style=dotted];\n", b->id, b->out[i]->id); + break; + default: + assert(0); + break; + } + if (b->out[i]->pass_seq < pc->pass_seq) + nv_do_print_cfgraph(pc, f, b->out[i]); + } +} + +/* Print the control flow graph of subroutine @subr (0 == MAIN) to a file. 
*/ +static void +nv_print_cfgraph(struct nv_pc *pc, const char *filepath, int subr) +{ + FILE *f; + + f = fopen(filepath, "a"); + if (!f) + return; + + fprintf(f, "digraph G {\n"); + + ++pc->pass_seq; + + nv_do_print_cfgraph(pc, f, pc->root[subr]); + + fprintf(f, "}\n"); + + fclose(f); +} +#endif + static INLINE void nvcg_show_bincode(struct nv_pc *pc) { @@ -393,6 +453,7 @@ nv50_generate_code(struct nv50_translation_info *ti) { struct nv_pc *pc; int ret; + int i; pc = CALLOC_STRUCT(nv_pc); if (!pc) @@ -428,6 +489,7 @@ nv50_generate_code(struct nv50_translation_info *ti) goto out; #ifdef NV50PC_DEBUG nv_print_program(pc); + nv_print_cfgraph(pc, "nv50_shader_cfgraph.dot", 0); #endif /* prepare for emission */ @@ -461,8 +523,8 @@ nv50_generate_code(struct nv50_translation_info *ti) out: nv_pc_free_refs(pc); - if (pc->bb_list) - FREE(pc->bb_list); + for (i = 0; i < pc->num_blocks; ++i) + FREE(pc->bb_list[i]); if (ret) { /* on success, these will be referenced by nv50_program */ if (pc->emit) @@ -644,23 +706,38 @@ nvbb_dominated_by(struct nv_basic_block *b, struct nv_basic_block *d) return j ? 
TRUE : FALSE; } -/* check if bf (future) can be reached from bp (past) */ +/* check if @bf (future) can be reached from @bp (past), stop at @bt */ boolean nvbb_reachable_by(struct nv_basic_block *bf, struct nv_basic_block *bp, struct nv_basic_block *bt) { - if (bf == bp) - return TRUE; - if (bp == bt) - return FALSE; + struct nv_basic_block *q[NV_PC_MAX_BASIC_BLOCKS], *b; + int i, p, n; - if (bp->out[0] && !IS_WALL_EDGE(bp->out_kind[0]) && - nvbb_reachable_by(bf, bp->out[0], bt)) - return TRUE; - if (bp->out[1] && !IS_WALL_EDGE(bp->out_kind[1]) && - nvbb_reachable_by(bf, bp->out[1], bt)) - return TRUE; - return FALSE; + p = 0; + n = 1; + q[0] = bp; + + while (p < n) { + b = q[p++]; + + if (b == bf) + break; + if (b == bt) + continue; + assert(n <= (1024 - 2)); + + for (i = 0; i < 2; ++i) { + if (b->out[i] && !IS_WALL_EDGE(b->out_kind[i]) && !b->out[i]->priv) { + q[n] = b->out[i]; + q[n++]->priv = 1; + } + } + } + for (--n; n >= 0; --n) + q[n]->priv = 0; + + return (b == bf); } static struct nv_basic_block * diff --git a/src/gallium/drivers/nv50/nv50_pc.h b/src/gallium/drivers/nv50/nv50_pc.h index ccddae063c..e8d9942307 100644 --- a/src/gallium/drivers/nv50/nv50_pc.h +++ b/src/gallium/drivers/nv50/nv50_pc.h @@ -144,6 +144,8 @@ #define NV_PC_MAX_INSTRUCTIONS 2048 #define NV_PC_MAX_VALUES (NV_PC_MAX_INSTRUCTIONS * 4) +#define NV_PC_MAX_BASIC_BLOCKS 1024 + static INLINE boolean nv_is_vector_op(uint opcode) { @@ -284,7 +286,7 @@ struct nv_basic_block { int id; int subroutine; - uint priv; + uint priv; /* reset to 0 after you're done */ uint pass_seq; uint32_t bin_pos; /* position, size in emitted code */ @@ -328,7 +330,7 @@ struct nv_pc { struct nv_value values[NV_PC_MAX_VALUES]; struct nv_instruction instructions[NV_PC_MAX_INSTRUCTIONS]; struct nv_ref **refs; - struct nv_basic_block **bb_list; + struct nv_basic_block *bb_list[NV_PC_MAX_BASIC_BLOCKS]; int num_values; int num_instructions; int num_refs; @@ -437,9 +439,15 @@ new_ref(struct nv_pc *pc, struct nv_value *val) 
static INLINE struct nv_basic_block * new_basic_block(struct nv_pc *pc) { - struct nv_basic_block *bb = CALLOC_STRUCT(nv_basic_block); + struct nv_basic_block *bb; + + if (pc->num_blocks >= NV_PC_MAX_BASIC_BLOCKS) + return NULL; + + bb = CALLOC_STRUCT(nv_basic_block); - bb->id = pc->num_blocks++; + bb->id = pc->num_blocks; + pc->bb_list[pc->num_blocks++] = bb; return bb; } diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c index 09d232abda..edda6c0691 100644 --- a/src/gallium/drivers/nv50/nv50_pc_optimize.c +++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c @@ -238,9 +238,7 @@ nv_pc_exec_pass2(struct nv_pc *pc) NV50_DBGMSG("preparing %u blocks for emission\n", pc->num_blocks); - pc->bb_list = CALLOC(pc->num_blocks, sizeof(pc->bb_list[0])); - - pc->num_blocks = 0; + pc->num_blocks = 0; /* will reorder bb_list */ for (i = 0; i < pc->num_subroutines + 1; ++i) if (pc->root[i] && (ret = nv_pc_pass2(pc, pc->root[i]))) -- cgit v1.2.3 From fc31a25afa2d28dea9bbda08ce8deab5aa96b684 Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Sun, 12 Sep 2010 00:56:16 +0200 Subject: nv50: minor compiler fixes and cleanups --- src/gallium/drivers/nv50/nv50_pc.c | 4 +++- src/gallium/drivers/nv50/nv50_pc_regalloc.c | 5 +++++ src/gallium/drivers/nv50/nv50_shader_state.c | 2 +- src/gallium/drivers/nv50/nv50_tgsi_to_nc.c | 16 ++++++++++------ 4 files changed, 19 insertions(+), 8 deletions(-) (limited to 'src') diff --git a/src/gallium/drivers/nv50/nv50_pc.c b/src/gallium/drivers/nv50/nv50_pc.c index e063888eb5..26ad9b4e3d 100644 --- a/src/gallium/drivers/nv50/nv50_pc.c +++ b/src/gallium/drivers/nv50/nv50_pc.c @@ -237,6 +237,7 @@ nv_pc_free_refs(struct nv_pc *pc) int i; for (i = 0; i < pc->num_refs; i += 64) FREE(pc->refs[i]); + FREE(pc->refs); } static const char * @@ -525,7 +526,8 @@ out: for (i = 0; i < pc->num_blocks; ++i) FREE(pc->bb_list[i]); - + if (pc->root) + FREE(pc->root); if (ret) { /* on success, these will be referenced by 
nv50_program */ if (pc->emit) FREE(pc->emit); diff --git a/src/gallium/drivers/nv50/nv50_pc_regalloc.c b/src/gallium/drivers/nv50/nv50_pc_regalloc.c index 2998343db5..b9d5ba5ef6 100644 --- a/src/gallium/drivers/nv50/nv50_pc_regalloc.c +++ b/src/gallium/drivers/nv50/nv50_pc_regalloc.c @@ -888,6 +888,10 @@ nv_pc_pass1(struct nv_pc *pc, struct nv_basic_block *root) ctx->pc = pc; ctx->insns = CALLOC(NV_PC_MAX_INSTRUCTIONS, sizeof(struct nv_instruction *)); + if (!ctx->insns) { + FREE(ctx); + return -1; + } pc->pass_seq++; ret = pass_generate_phi_movs(ctx, root); @@ -941,6 +945,7 @@ nv_pc_pass1(struct nv_pc *pc, struct nv_basic_block *root) NV50_DBGMSG("REGISTER ALLOCATION - leaving\n"); out: + FREE(ctx->insns); FREE(ctx); return ret; } diff --git a/src/gallium/drivers/nv50/nv50_shader_state.c b/src/gallium/drivers/nv50/nv50_shader_state.c index f187a074e6..564f7e5324 100644 --- a/src/gallium/drivers/nv50/nv50_shader_state.c +++ b/src/gallium/drivers/nv50/nv50_shader_state.c @@ -44,7 +44,7 @@ nv50_transfer_constbuf(struct nv50_context *nv50, if (!map) return; - count = buf->width0; /* MIN2(buf->width0, size); */ + count = (buf->width0 + 3) / 4; start = 0; while (count) { diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c index 6fd749b35f..5994d1c27e 100644 --- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c +++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c @@ -39,7 +39,7 @@ #define BLD_MAX_PREDS 4 #define BLD_MAX_IMMDS 128 -#define BLD_MAX_COND_NESTING 4 +#define BLD_MAX_COND_NESTING 8 #define BLD_MAX_LOOP_NESTING 4 #define BLD_MAX_CALL_NESTING 2 @@ -70,14 +70,14 @@ bld_vals_del_val(struct bld_value_stack *stk, struct nv_value *val) { unsigned i; - for (i = stk->size - 1; i >= 0; --i) - if (stk->body[i] == val) + for (i = stk->size; i > 0; --i) + if (stk->body[i - 1] == val) break; - if (i < 0) + if (!i) return FALSE; - if (i != stk->size - 1) - stk->body[i] = stk->body[stk->size - 1]; + if (i != stk->size) + stk->body[i - 
1] = stk->body[stk->size - 1]; --stk->size; /* XXX: old size in REALLOC */ return TRUE; @@ -1643,6 +1643,8 @@ bld_instruction(struct bld_context *bld, { struct nv_basic_block *b = new_basic_block(bld->pc); + assert(bld->cond_lvl < BLD_MAX_COND_NESTING); + nvbb_attach_block(bld->pc->current_block, b, CFG_EDGE_FORWARD); bld->join_bb[bld->cond_lvl] = bld->pc->current_block; @@ -1695,6 +1697,8 @@ bld_instruction(struct bld_context *bld, struct nv_basic_block *bl = new_basic_block(bld->pc); struct nv_basic_block *bb = new_basic_block(bld->pc); + assert(bld->loop_lvl < BLD_MAX_LOOP_NESTING); + bld->loop_bb[bld->loop_lvl] = bl; bld->brkt_bb[bld->loop_lvl] = bb; -- cgit v1.2.3 From 9b39fb1b6127fecf2fbb41926caca2bbb559a1d0 Mon Sep 17 00:00:00 2001 From: Xavier Chantry Date: Sat, 11 Sep 2010 20:18:25 +0200 Subject: nv50: fix size of outputs_written array --- src/gallium/drivers/nv50/nv50_tgsi_to_nc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src') diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c index 5994d1c27e..978bba4d57 100644 --- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c +++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c @@ -125,7 +125,7 @@ struct bld_context { struct bld_value_stack pvs[BLD_MAX_PREDS][4]; /* TGSI_FILE_PREDICATE */ struct bld_value_stack ovs[PIPE_MAX_SHADER_OUTPUTS][4]; - uint32_t outputs_written[(PIPE_MAX_SHADER_OUTPUTS + 31) / 32]; + uint32_t outputs_written[(PIPE_MAX_SHADER_OUTPUTS + 7) / 8]; struct nv_value *frgcrd[4]; struct nv_value *sysval[4]; -- cgit v1.2.3 From d4fd11a628b0e48d76fab4a0b94470a7592faf26 Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Sun, 12 Sep 2010 11:19:24 +0200 Subject: nv50: cannot move from local mem to output reg directly --- src/gallium/drivers/nv50/nv50_pc.c | 3 ++- src/gallium/drivers/nv50/nv50_pc_optimize.c | 3 ++- src/gallium/drivers/nv50/nv50_pc_print.c | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) (limited to 'src') diff 
--git a/src/gallium/drivers/nv50/nv50_pc.c b/src/gallium/drivers/nv50/nv50_pc.c index 26ad9b4e3d..0511acfd57 100644 --- a/src/gallium/drivers/nv50/nv50_pc.c +++ b/src/gallium/drivers/nv50/nv50_pc.c @@ -57,6 +57,7 @@ nv50_indirect_opnd(struct nv_instruction *i) switch (i->opcode) { case NV_OP_MOV: case NV_OP_LDA: + case NV_OP_STA: return 0; default: return 1; @@ -341,7 +342,7 @@ nv_print_program(struct nv_pc *pc) nv_print_function(pc->root[i]); } -#ifdef NV50_PC_DEBUG +#ifdef NV50PC_DEBUG static void nv_do_print_cfgraph(struct nv_pc *pc, FILE *f, struct nv_basic_block *b) { diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c index edda6c0691..8653bc6e63 100644 --- a/src/gallium/drivers/nv50/nv50_pc_optimize.c +++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c @@ -327,7 +327,8 @@ nv_pass_fold_stores(struct nv_pass *ctx, struct nv_basic_block *b) /* cannot write to $oX when using immediate */ for (j = 0; j < 4 && nvi->src[j]; ++j) - if (nvi->src[j]->value->reg.file == NV_FILE_IMM) + if (nvi->src[j]->value->reg.file == NV_FILE_IMM || + nvi->src[j]->value->reg.file == NV_FILE_MEM_L) break; if (j < 4 && nvi->src[j]) continue; diff --git a/src/gallium/drivers/nv50/nv50_pc_print.c b/src/gallium/drivers/nv50/nv50_pc_print.c index a71401979c..984f6cbe17 100644 --- a/src/gallium/drivers/nv50/nv50_pc_print.c +++ b/src/gallium/drivers/nv50/nv50_pc_print.c @@ -220,7 +220,7 @@ nv_print_value(struct nv_value *value, struct nv_value *ind, ubyte type) PRINT(" %s%cc%i", mgta, reg_pfx, nv_value_id(value)); break; case NV_FILE_MEM_L: - nv_print_address('l', -1, ind, 4 * nv_value_id(value)); + nv_print_address('l', -1, ind, nv_value_id(value)); break; case NV_FILE_MEM_S: nv_print_address('s', -1, ind, 4 * nv_value_id(value)); -- cgit v1.2.3 From fdb00ac1efc7c12aeed1a7e705c5a5dd258b7d54 Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Sun, 12 Sep 2010 11:37:07 +0200 Subject: nv50: newlines in shader bincode printing --- 
src/gallium/drivers/nv50/nv50_pc.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'src') diff --git a/src/gallium/drivers/nv50/nv50_pc.c b/src/gallium/drivers/nv50/nv50_pc.c index 0511acfd57..c934450d42 100644 --- a/src/gallium/drivers/nv50/nv50_pc.c +++ b/src/gallium/drivers/nv50/nv50_pc.c @@ -405,10 +405,13 @@ nv_print_cfgraph(struct nv_pc *pc, const char *filepath, int subr) static INLINE void nvcg_show_bincode(struct nv_pc *pc) { - int i; + unsigned i; - for (i = 0; i < pc->bin_size / 4; ++i) + for (i = 0; i < pc->bin_size / 4; ++i) { debug_printf("0x%08x ", pc->emit[i]); + if ((i % 16) == 15) + debug_printf("\n"); + } debug_printf("\n"); } -- cgit v1.2.3 From 1fa812d84aa4dcb03f3e64fd46abe5b02ac985d1 Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Sun, 12 Sep 2010 11:37:45 +0200 Subject: nv50: match TEMP limit with nv50 ir builder Mesa doesn't respect it anyway, but this makes it assert rather than threads access areas of l[] that don't belong to them. 
--- src/gallium/drivers/nv50/nv50_program.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src') diff --git a/src/gallium/drivers/nv50/nv50_program.h b/src/gallium/drivers/nv50/nv50_program.h index a1b2bde97b..d8b6e8d6d1 100644 --- a/src/gallium/drivers/nv50/nv50_program.h +++ b/src/gallium/drivers/nv50/nv50_program.h @@ -27,7 +27,7 @@ #include "tgsi/tgsi_scan.h" #include "nouveau/nouveau_class.h" -#define NV50_CAP_MAX_PROGRAM_TEMPS (128 / 4) +#define NV50_CAP_MAX_PROGRAM_TEMPS 64 struct nv50_varying { uint8_t id; /* tgsi index */ -- cgit v1.2.3 From 98c87c382d080ff5a048564e942e649fbaf43879 Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Sun, 12 Sep 2010 14:31:01 +0200 Subject: nv50: handle TGSI EXP and LOG again --- src/gallium/drivers/nv50/nv50_pc_optimize.c | 2 ++ src/gallium/drivers/nv50/nv50_tgsi_to_nc.c | 48 +++++++++++++++++++++++++++-- 2 files changed, 47 insertions(+), 3 deletions(-) (limited to 'src') diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c index 8653bc6e63..ea1da6268d 100644 --- a/src/gallium/drivers/nv50/nv50_pc_optimize.c +++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c @@ -322,6 +322,8 @@ nv_pass_fold_stores(struct nv_pass *ctx, struct nv_basic_block *b) continue; assert(nvi->def[0] == sti->src[0]->value); + if (nvi->opcode == NV_OP_SELECT) + continue; if (nvi->def[0]->refc > 1) continue; diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c index 978bba4d57..b4f5a884c4 100644 --- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c +++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c @@ -214,6 +214,7 @@ static INLINE void bld_warn_uninitialized(struct bld_context *bld, int kind, struct bld_value_stack *stk, struct nv_basic_block *b) { +#ifdef NV50_TGSI2NC_DEBUG long i = (stk - &bld->tvs[0][0]) / 4; long c = (stk - &bld->tvs[0][0]) & 3; @@ -222,6 +223,7 @@ bld_warn_uninitialized(struct bld_context *bld, int kind, 
debug_printf("WARNING: TEMP[%li].%c %s used uninitialized in BB:%i\n", i, (int)('x' + c), kind ? "may be" : "is", b->id); +#endif } static INLINE struct nv_value * @@ -646,7 +648,10 @@ bld_pow(struct bld_context *bld, struct nv_value *x, struct nv_value *e) static INLINE struct nv_value * bld_load_imm_f32(struct bld_context *bld, float f) { - return bld_insn_1(bld, NV_OP_MOV, bld_imm_f32(bld, f)); + struct nv_value *imm = bld_insn_1(bld, NV_OP_MOV, bld_imm_f32(bld, f)); + + SET_TYPE(imm, NV_TYPE_F32); + return imm; } static INLINE struct nv_value * @@ -944,6 +949,8 @@ emit_store(struct bld_context *bld, const struct tgsi_full_instruction *inst, switch (reg->Register.File) { case TGSI_FILE_OUTPUT: + if (!value->insn && (bld->ti->output_file == NV_FILE_OUT)) + value = bld_insn_1(bld, NV_OP_MOV, value); value = bld_insn_1(bld, NV_OP_MOV, value); value->reg.file = bld->ti->output_file; @@ -956,9 +963,9 @@ emit_store(struct bld_context *bld, const struct tgsi_full_instruction *inst, break; case TGSI_FILE_TEMPORARY: assert(reg->Register.Index < BLD_MAX_TEMPS); - value->reg.file = NV_FILE_GPR; - if (value->insn->bb != bld->pc->current_block) + if (!value->insn || (value->insn->bb != bld->pc->current_block)) value = bld_insn_1(bld, NV_OP_MOV, value); + value->reg.file = NV_FILE_GPR; if (bld->ti->store_to_memory) bld_lmem_store(bld, ptr, reg->Register.Index * 4 + chan, value); @@ -1616,6 +1623,23 @@ bld_instruction(struct bld_context *bld, if (insn->Dst[0].Register.WriteMask & 8) dst0[3] = emit_fetch(bld, insn, 1, 3); break; + case TGSI_OPCODE_EXP: + src0 = emit_fetch(bld, insn, 0, 0); + temp = bld_insn_1(bld, NV_OP_FLOOR, src0); + + if (insn->Dst[0].Register.WriteMask & 2) + dst0[1] = bld_insn_2(bld, NV_OP_SUB, src0, temp); + if (insn->Dst[0].Register.WriteMask & 1) { + temp = bld_insn_1(bld, NV_OP_PREEX2, temp); + dst0[0] = bld_insn_1(bld, NV_OP_EX2, temp); + } + if (insn->Dst[0].Register.WriteMask & 4) { + temp = bld_insn_1(bld, NV_OP_PREEX2, src0); + dst0[2] = 
bld_insn_1(bld, NV_OP_EX2, temp); + } + if (insn->Dst[0].Register.WriteMask & 8) + dst0[3] = bld_imm_f32(bld, 1.0f); + break; case TGSI_OPCODE_EX2: src0 = emit_fetch(bld, insn, 0, 0); temp = bld_insn_1(bld, NV_OP_PREEX2, src0); @@ -1798,6 +1822,24 @@ bld_instruction(struct bld_context *bld, FOR_EACH_DST0_ENABLED_CHANNEL(c, insn) dst0[c] = temp; break; + case TGSI_OPCODE_LOG: + src0 = emit_fetch(bld, insn, 0, 0); + src0 = bld_insn_1(bld, NV_OP_ABS, src0); + temp = bld_insn_1(bld, NV_OP_LG2, src0); + dst0[2] = temp; + if (insn->Dst[0].Register.WriteMask & 3) { + temp = bld_insn_1(bld, NV_OP_FLOOR, temp); + dst0[0] = temp; + } + if (insn->Dst[0].Register.WriteMask & 2) { + temp = bld_insn_1(bld, NV_OP_PREEX2, temp); + temp = bld_insn_1(bld, NV_OP_EX2, temp); + temp = bld_insn_1(bld, NV_OP_RCP, temp); + dst0[1] = bld_insn_2(bld, NV_OP_MUL, src0, temp); + } + if (insn->Dst[0].Register.WriteMask & 8) + dst0[3] = bld_imm_f32(bld, 1.0f); + break; case TGSI_OPCODE_RCP: case TGSI_OPCODE_LG2: src0 = emit_fetch(bld, insn, 0, 0); -- cgit v1.2.3 From cca3906a9b1d994c431ceeccccbde0ce87a2f6b4 Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Sun, 12 Sep 2010 19:43:22 +0200 Subject: nv50: check for immediates when turning MUL ADD into MAD --- src/gallium/drivers/nv50/nv50_pc_optimize.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'src') diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c index ea1da6268d..fba60984ac 100644 --- a/src/gallium/drivers/nv50/nv50_pc_optimize.c +++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c @@ -562,6 +562,11 @@ constant_expression(struct nv_pc *pc, struct nv_instruction *nvi, nvi->src[0] = nvi->src[2]; nvi->src[2] = NULL; nvi->opcode = NV_OP_ADD; + + if (val->reg.imm.u32 == 0) { + nvi->src[1] = NULL; + nvi->opcode = NV_OP_MOV; + } } } @@ -703,6 +708,10 @@ nv_pass_lower_arith(struct nv_pass *ctx, struct nv_basic_block *b) else continue; + /* could have an immediate from above 
constant_* */ + if (src0->reg.file != NV_FILE_GPR || src1->reg.file != NV_FILE_GPR) + continue; + nvi->opcode = NV_OP_MAD; mod = nvi->src[(src == src0) ? 0 : 1]->mod; nv_reference(ctx->pc, &nvi->src[(src == src0) ? 0 : 1], NULL); -- cgit v1.2.3 From 1f1411f2ccc7f808d181c09f925b0780306a05ca Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Sun, 12 Sep 2010 20:55:09 +0200 Subject: nv50: interp cannot write flags reg --- src/gallium/drivers/nv50/nv50_pc.c | 21 +++++++++++++++++++++ src/gallium/drivers/nv50/nv50_pc.h | 1 + src/gallium/drivers/nv50/nv50_tgsi_to_nc.c | 10 +++------- 3 files changed, 25 insertions(+), 7 deletions(-) (limited to 'src') diff --git a/src/gallium/drivers/nv50/nv50_pc.c b/src/gallium/drivers/nv50/nv50_pc.c index c934450d42..78aca8fd56 100644 --- a/src/gallium/drivers/nv50/nv50_pc.c +++ b/src/gallium/drivers/nv50/nv50_pc.c @@ -170,6 +170,27 @@ nv50_supported_src_mods(uint opcode, int s) } } +/* We may want an opcode table. */ +boolean +nv50_op_can_write_flags(uint opcode) +{ + if (nv_is_vector_op(opcode)) + return FALSE; + switch (opcode) { /* obvious ones like KIL, CALL, etc. 
not included */ + case NV_OP_PHI: + case NV_OP_MOV: + case NV_OP_LINTERP: + case NV_OP_PINTERP: + case NV_OP_LDA: + return FALSE; + default: + break; + } + if (opcode >= NV_OP_RCP && opcode <= NV_OP_PREEX2) + return FALSE; + return TRUE; +} + int nv_nvi_refcount(struct nv_instruction *nvi) { diff --git a/src/gallium/drivers/nv50/nv50_pc.h b/src/gallium/drivers/nv50/nv50_pc.h index e8d9942307..8f15a82026 100644 --- a/src/gallium/drivers/nv50/nv50_pc.h +++ b/src/gallium/drivers/nv50/nv50_pc.h @@ -486,6 +486,7 @@ int nv50_indirect_opnd(struct nv_instruction *); boolean nv50_nvi_can_use_imm(struct nv_instruction *, int s); boolean nv50_nvi_can_predicate(struct nv_instruction *); boolean nv50_nvi_can_load(struct nv_instruction *, int s, struct nv_value *); +boolean nv50_op_can_write_flags(uint opcode); ubyte nv50_supported_src_mods(uint opcode, int s); int nv_nvi_refcount(struct nv_instruction *); void nv_nvi_delete(struct nv_instruction *); diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c index b4f5a884c4..8ad0b18c79 100644 --- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c +++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c @@ -700,19 +700,15 @@ bld_predicate(struct bld_context *bld, struct nv_value *src, boolean bool_only) while (nvi->opcode == NV_OP_ABS || nvi->opcode == NV_OP_NEG || nvi->opcode == NV_OP_CVT) { s0i = nvi->src[0]->value->insn; - if (!s0i || - s0i->opcode == NV_OP_LDA || - s0i->opcode == NV_OP_MOV || - s0i->opcode == NV_OP_PHI) + if (!s0i || !nv50_op_can_write_flags(s0i->opcode)) break; nvi = s0i; assert(!nvi->flags_src); } } - if (nvi->opcode == NV_OP_LDA || - nvi->opcode == NV_OP_MOV || - nvi->opcode == NV_OP_PHI || nvi->bb != bld->pc->current_block) { + if (!nv50_op_can_write_flags(nvi->opcode) || + nvi->bb != bld->pc->current_block) { nvi = new_instruction(bld->pc, NV_OP_CVT); nv_reference(bld->pc, &nvi->src[0], src); } -- cgit v1.2.3 From 3b3c20744f2ea90f6aaae33b337bdc5e135f3198 Mon Sep 17 00:00:00 
2001 From: Christoph Bumiller Date: Sun, 12 Sep 2010 23:11:30 +0200 Subject: nv50: MOV TEMP[0], -CONST[0] must be float32 negation --- src/gallium/drivers/nv50/nv50_tgsi_to_nc.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'src') diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c index 8ad0b18c79..54d6fb960f 100644 --- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c +++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c @@ -1075,7 +1075,7 @@ emit_fetch(struct bld_context *bld, const struct tgsi_full_instruction *insn, const struct tgsi_full_src_register *src = &insn->Src[s]; struct nv_value *res; struct nv_value *ptr = NULL; - unsigned idx, swz, dim_idx, ind_idx, ind_swz; + unsigned idx, swz, dim_idx, ind_idx, ind_swz, sgn; ubyte type = infer_src_type(insn->Instruction.Opcode); idx = src->Register.Index; @@ -1157,10 +1157,15 @@ emit_fetch(struct bld_context *bld, const struct tgsi_full_instruction *insn, if (!res) return bld_undef(bld, NV_FILE_GPR); + sgn = tgsi_util_get_full_src_register_sign_mode(src, chan); + if (insn->Instruction.Opcode != TGSI_OPCODE_MOV) res->reg.as_type = type; + else + if (sgn != TGSI_UTIL_SIGN_KEEP) /* apparently "MOV A, -B" assumes float */ + res->reg.as_type = NV_TYPE_F32; - switch (tgsi_util_get_full_src_register_sign_mode(src, chan)) { + switch (sgn) { case TGSI_UTIL_SIGN_KEEP: break; case TGSI_UTIL_SIGN_CLEAR: -- cgit v1.2.3 From 0b8170103c8eaff46b75e89608198b3eb564bc52 Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Mon, 13 Sep 2010 00:59:38 +0200 Subject: nv50: fix indirect CONST access with large or negative offsets --- src/gallium/drivers/nv50/nv50_pc_emit.c | 6 ++++-- src/gallium/drivers/nv50/nv50_tgsi_to_nc.c | 9 ++++++++- 2 files changed, 12 insertions(+), 3 deletions(-) (limited to 'src') diff --git a/src/gallium/drivers/nv50/nv50_pc_emit.c b/src/gallium/drivers/nv50/nv50_pc_emit.c index 8c64b19875..1eb44741f1 100644 --- 
a/src/gallium/drivers/nv50/nv50_pc_emit.c +++ b/src/gallium/drivers/nv50/nv50_pc_emit.c @@ -696,7 +696,9 @@ emit_add_b32(struct nv_pc *pc, struct nv_instruction *i) static void emit_add_a16(struct nv_pc *pc, struct nv_instruction *i) { - pc->emit[0] = 0xd0000001 | (get_immd_u32(i->src[0]) << 9); + int s = (i->opcode == NV_OP_MOV) ? 0 : 1; + + pc->emit[0] = 0xd0000001 | ((uint16_t)get_immd_u32(i->src[s]) << 9); pc->emit[1] = 0x20000000; pc->emit[0] |= (DREG(i->def[0])->id + 1) << 2; @@ -704,7 +706,7 @@ emit_add_a16(struct nv_pc *pc, struct nv_instruction *i) set_pred(pc, i); if (i->src[1]) - set_a16_bits(pc, SREG(i->src[1])->id); + set_a16_bits(pc, SREG(i->src[1])->id + 1); } static void diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c index 54d6fb960f..a2b6901c81 100644 --- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c +++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c @@ -665,6 +665,7 @@ bld_get_address(struct bld_context *bld, int id, struct nv_value *indirect) { int i; struct nv_instruction *nvi; + struct nv_value *val; for (i = 0; i < 4; ++i) { if (!bld->saved_addr[i][0]) @@ -677,7 +678,13 @@ bld_get_address(struct bld_context *bld, int id, struct nv_value *indirect) } i &= 3; - bld->saved_addr[i][0] = bld_load_imm_u32(bld, id); + val = bld_imm_u32(bld, id); + if (indirect) + val = bld_insn_2(bld, NV_OP_ADD, indirect, val); + else + val = bld_insn_1(bld, NV_OP_MOV, val); + + bld->saved_addr[i][0] = val; bld->saved_addr[i][0]->reg.file = NV_FILE_ADDR; bld->saved_addr[i][0]->reg.type = NV_TYPE_U16; bld->saved_addr[i][1] = indirect; -- cgit v1.2.3 From 60f34e9f60c288a67132d91a82ec66378eb318ad Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Mon, 13 Sep 2010 17:04:48 +0200 Subject: nv50: fix TXP depth comparison value --- src/gallium/drivers/nv50/nv50_tgsi_to_nc.c | 38 +++++++++++++++++------------- 1 file changed, 22 insertions(+), 16 deletions(-) (limited to 'src') diff --git 
a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c index a2b6901c81..90d81d3e17 100644 --- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c +++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c @@ -1269,10 +1269,14 @@ get_tex_dim(const struct tgsi_full_instruction *insn, int *dim, int *arg) static void load_proj_tex_coords(struct bld_context *bld, - struct nv_value *t[4], int dim, + struct nv_value *t[4], int dim, int arg, const struct tgsi_full_instruction *insn) { - int c, mask = 0; + int c, mask; + + mask = (1 << dim) - 1; + if (arg != dim) + mask |= 4; /* depth comparison value */ t[3] = emit_fetch(bld, insn, 0, 3); @@ -1284,17 +1288,19 @@ load_proj_tex_coords(struct bld_context *bld, t[3] = bld_insn_1(bld, NV_OP_RCP, t[3]); - for (c = 0; c < dim; ++c) { + for (c = 0; c < 4; ++c) { + if (!(mask & (1 << c))) + continue; t[c] = emit_fetch(bld, insn, 0, c); - if (t[c]->insn->opcode == NV_OP_LINTERP || - t[c]->insn->opcode == NV_OP_PINTERP) { - t[c] = bld_duplicate_insn(bld, t[c]->insn); - t[c]->insn->opcode = NV_OP_PINTERP; - nv_reference(bld->pc, &t[c]->insn->src[1], t[3]); - } else { - mask |= 1 << c; - } + if (t[c]->insn->opcode != NV_OP_LINTERP && + t[c]->insn->opcode != NV_OP_PINTERP) + continue; + t[c] = bld_duplicate_insn(bld, t[c]->insn); + t[c]->insn->opcode = NV_OP_PINTERP; + nv_reference(bld->pc, &t[c]->insn->src[1], t[3]); + + mask &= ~(1 << c); } for (c = 0; mask; ++c, mask >>= 1) { @@ -1467,10 +1473,13 @@ bld_tex(struct bld_context *bld, struct nv_value *dst0[4], get_tex_dim(insn, &dim, &arg); if (!cube && insn->Instruction.Opcode == TGSI_OPCODE_TXP) - load_proj_tex_coords(bld, t, dim, insn); - else + load_proj_tex_coords(bld, t, dim, arg, insn); + else { for (c = 0; c < dim; ++c) t[c] = emit_fetch(bld, insn, 0, c); + if (arg != dim) + t[dim] = emit_fetch(bld, insn, 0, 2); + } if (cube) { assert(dim >= 3); @@ -1485,9 +1494,6 @@ bld_tex(struct bld_context *bld, struct nv_value *dst0[4], t[c] = bld_insn_2(bld, NV_OP_MUL, 
t[c], s[0]); } - if (arg != dim) - t[dim] = emit_fetch(bld, insn, 0, 2); - if (opcode == NV_OP_TXB || opcode == NV_OP_TXL) { t[arg++] = emit_fetch(bld, insn, 0, 3); -- cgit v1.2.3 From 16d8f5fee51a4a86f5f0c15228b48d5668ab2be2 Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Mon, 13 Sep 2010 21:13:36 +0200 Subject: nv50: consider address register in reload elimination --- src/gallium/drivers/nv50/nv50_pc_optimize.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'src') diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c index fba60984ac..3ff6db7dd2 100644 --- a/src/gallium/drivers/nv50/nv50_pc_optimize.c +++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c @@ -732,7 +732,7 @@ nv_pass_lower_arith(struct nv_pass *ctx, struct nv_basic_block *b) struct load_record { struct load_record *next; - uint64_t data; + uint64_t data[2]; struct nv_value *value; }; @@ -757,7 +757,7 @@ nv_pass_reload_elim(struct nv_pass_reld_elim *ctx, struct nv_basic_block *b) { struct load_record **rec, *it; struct nv_instruction *ld, *next; - uint64_t data; + uint64_t data[2]; struct nv_value *val; int j; @@ -769,11 +769,13 @@ nv_pass_reload_elim(struct nv_pass_reld_elim *ctx, struct nv_basic_block *b) rec = NULL; if (ld->opcode == NV_OP_LINTERP || ld->opcode == NV_OP_PINTERP) { - data = val->reg.id; + data[0] = val->reg.id; + data[1] = 0; rec = &ctx->mem_v; } else if (ld->opcode == NV_OP_LDA) { - data = val->reg.id; + data[0] = val->reg.id; + data[1] = ld->src[4] ? 
ld->src[4]->value->n : ~0ULL; if (val->reg.file >= NV_FILE_MEM_C(0) && val->reg.file <= NV_FILE_MEM_C(15)) rec = &ctx->mem_c[val->reg.file - NV_FILE_MEM_C(0)]; @@ -785,7 +787,8 @@ nv_pass_reload_elim(struct nv_pass_reld_elim *ctx, struct nv_basic_block *b) rec = &ctx->mem_l; } else if ((ld->opcode == NV_OP_MOV) && (val->reg.file == NV_FILE_IMM)) { - data = val->reg.imm.u32; + data[0] = val->reg.imm.u32; + data[1] = 0; rec = &ctx->imm; } @@ -793,7 +796,7 @@ nv_pass_reload_elim(struct nv_pass_reld_elim *ctx, struct nv_basic_block *b) continue; for (it = *rec; it; it = it->next) - if (it->data == data) + if (it->data[0] == data[0] && it->data[1] == data[1]) break; if (it) { @@ -807,7 +810,8 @@ nv_pass_reload_elim(struct nv_pass_reld_elim *ctx, struct nv_basic_block *b) continue; it = &ctx->pool[ctx->alloc++]; it->next = *rec; - it->data = data; + it->data[0] = data[0]; + it->data[1] = data[1]; it->value = ld->def[0]; *rec = it; } -- cgit v1.2.3 From c46e7a05e501e02b10dbc06772c0ef01308f60d5 Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Wed, 15 Sep 2010 13:59:09 +0200 Subject: nv50: improve and fix modifier folding optimization Execute before folding loads, because we don't check if it's legal in lower_mods. Ensure that a value's insn pointer is updated when transferring it to a different instruction. 
--- src/gallium/drivers/nv50/nv50_pc.c | 1 + src/gallium/drivers/nv50/nv50_pc.h | 1 - src/gallium/drivers/nv50/nv50_pc_emit.c | 5 +++ src/gallium/drivers/nv50/nv50_pc_optimize.c | 65 ++++++++++++++++++----------- 4 files changed, 46 insertions(+), 26 deletions(-) (limited to 'src') diff --git a/src/gallium/drivers/nv50/nv50_pc.c b/src/gallium/drivers/nv50/nv50_pc.c index 78aca8fd56..2706d88779 100644 --- a/src/gallium/drivers/nv50/nv50_pc.c +++ b/src/gallium/drivers/nv50/nv50_pc.c @@ -104,6 +104,7 @@ nv50_nvi_can_load(struct nv_instruction *nvi, int s, struct nv_value *value) case NV_OP_FLOOR: case NV_OP_TRUNC: case NV_OP_CVT: + case NV_OP_NEG: case NV_OP_MAD: case NV_OP_MUL: case NV_OP_SAT: diff --git a/src/gallium/drivers/nv50/nv50_pc.h b/src/gallium/drivers/nv50/nv50_pc.h index 8f15a82026..92c6be5f6e 100644 --- a/src/gallium/drivers/nv50/nv50_pc.h +++ b/src/gallium/drivers/nv50/nv50_pc.h @@ -220,7 +220,6 @@ struct nv_value { struct nv_ref { struct nv_value *value; - struct nv_instruction *insn; ubyte mod; ubyte typecast; ubyte flags; /* not used yet */ diff --git a/src/gallium/drivers/nv50/nv50_pc_emit.c b/src/gallium/drivers/nv50/nv50_pc_emit.c index 1eb44741f1..137a531dd6 100644 --- a/src/gallium/drivers/nv50/nv50_pc_emit.c +++ b/src/gallium/drivers/nv50/nv50_pc_emit.c @@ -654,6 +654,8 @@ emit_add_f32(struct nv_pc *pc, struct nv_instruction *i) { pc->emit[0] = 0xb0000000; + assert(!((i->src[0]->mod | i->src[1]->mod) & NV_MOD_ABS)); + if (SFILE(i, 1) == NV_FILE_IMM) { emit_form_IMM(pc, i, 0); @@ -665,6 +667,9 @@ emit_add_f32(struct nv_pc *pc, struct nv_instruction *i) if (i->src[0]->mod & NV_MOD_NEG) pc->emit[1] |= 1 << 26; if (i->src[1]->mod & NV_MOD_NEG) pc->emit[1] |= 1 << 27; + + if (i->saturate) + pc->emit[1] |= 0x20000000; } else { emit_form_MUL(pc, i); diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c index 3ff6db7dd2..921ed15691 100644 --- a/src/gallium/drivers/nv50/nv50_pc_optimize.c +++ 
b/src/gallium/drivers/nv50/nv50_pc_optimize.c @@ -336,6 +336,7 @@ nv_pass_fold_stores(struct nv_pass *ctx, struct nv_basic_block *b) continue; nvi->def[0] = sti->def[0]; + nvi->def[0]->insn = nvi; nvi->fixed = sti->fixed; nv_nvi_delete(sti); @@ -374,7 +375,7 @@ nv_pass_fold_loads(struct nv_pass *ctx, struct nv_basic_block *b) if (j == 0 && ld->src[4]) /* can't load shared mem */ continue; - /* fold it ! */ /* XXX: ref->insn */ + /* fold it ! */ nv_reference(ctx->pc, &nvi->src[j], ld->src[0]->value); if (ld->src[4]) nv_reference(ctx->pc, &nvi->src[4], ld->src[4]->value); @@ -388,6 +389,7 @@ nv_pass_fold_loads(struct nv_pass *ctx, struct nv_basic_block *b) return 0; } +/* NOTE: Assumes loads have not yet been folded. */ static int nv_pass_lower_mods(struct nv_pass *ctx, struct nv_basic_block *b) { @@ -402,14 +404,7 @@ nv_pass_lower_mods(struct nv_pass *ctx, struct nv_basic_block *b) nvi->src[1]->mod ^= NV_MOD_NEG; } - /* should not put any modifiers on NEG and ABS */ - assert(nvi->opcode != NV_MOD_NEG || !nvi->src[0]->mod); - assert(nvi->opcode != NV_MOD_ABS || !nvi->src[0]->mod); - - for (j = 0; j < 4; ++j) { - if (!nvi->src[j]) - break; - + for (j = 0; j < 4 && nvi->src[j]; ++j) { mi = nvi->src[j]->value->insn; if (!mi) continue; @@ -421,16 +416,32 @@ nv_pass_lower_mods(struct nv_pass *ctx, struct nv_basic_block *b) if (mi->opcode == NV_OP_ABS) mod = NV_MOD_ABS; else continue; + assert(!(mod & mi->src[0]->mod & NV_MOD_NEG)); + + mod |= mi->src[0]->mod; + + if (mi->flags_def || mi->flags_src) + continue; - if (nvi->opcode == NV_OP_ABS) + if ((nvi->opcode == NV_OP_ABS) || (nvi->src[j]->mod & NV_MOD_ABS)) { + /* abs neg [abs] = abs */ mod &= ~(NV_MOD_NEG | NV_MOD_ABS); - else - if (nvi->opcode == NV_OP_NEG && mod == NV_MOD_NEG) { - nvi->opcode = NV_OP_MOV; + } else + if ((nvi->opcode == NV_OP_NEG) && (mod & NV_MOD_NEG)) { + /* neg as opcode and modifier on same insn cannot occur */ + /* neg neg abs = abs, neg neg = identity */ + assert(j == 0); + if (mod & NV_MOD_ABS) 
+ nvi->opcode = NV_OP_ABS; + else + if (nvi->flags_def) + nvi->opcode = NV_OP_CVT; + else + nvi->opcode = NV_OP_MOV; mod = 0; } - if (!(nv50_supported_src_mods(nvi->opcode, j) & mod)) + if ((nv50_supported_src_mods(nvi->opcode, j) & mod) != mod) continue; nv_reference(ctx->pc, &nvi->src[j], mi->src[0]->value); @@ -441,11 +452,15 @@ nv_pass_lower_mods(struct nv_pass *ctx, struct nv_basic_block *b) if (nvi->opcode == NV_OP_SAT) { mi = nvi->src[0]->value->insn; - if ((mi->opcode == NV_OP_MAD) && !mi->flags_def) { - mi->saturate = 1; - mi->def[0] = nvi->def[0]; - nv_nvi_delete(nvi); - } + if (mi->opcode != NV_OP_ADD || mi->opcode != NV_OP_MAD) + continue; + if (mi->flags_def || mi->def[0]->refc > 1) + continue; + + mi->saturate = 1; + mi->def[0] = nvi->def[0]; + mi->def[0]->insn = mi; + nv_nvi_delete(nvi); } } DESCEND_ARBITRARY(j, nv_pass_lower_mods); @@ -956,7 +971,7 @@ nv_pass_flatten(struct nv_pass *ctx, struct nv_basic_block *b) for (n1 = 0, nvi = b->out[1]->entry; nvi; nvi = nvi->next, ++n1) if (!nv50_nvi_can_predicate(nvi)) break; -#ifdef NV50_PC_DEBUG +#ifdef NV50PC_DEBUG if (nvi) { debug_printf("cannot predicate: "); nv_print_instruction(nvi); } @@ -1081,6 +1096,11 @@ nv_pc_pass0(struct nv_pc *pc, struct nv_basic_block *root) if (ret) return ret; + pc->pass_seq++; + ret = nv_pass_lower_mods(&pass, root); + if (ret) + return ret; + pc->pass_seq++; ret = nv_pass_fold_loads(&pass, root); if (ret) @@ -1106,11 +1126,6 @@ nv_pc_pass0(struct nv_pc *pc, struct nv_basic_block *root) if (ret) return ret; - pc->pass_seq++; - ret = nv_pass_lower_mods(&pass, root); - if (ret) - return ret; - dce.pc = pc; do { dce.removed = 0; -- cgit v1.2.3 From 84d170bbcef8e26017ac8e2f3bacbaeb20f889d3 Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Wed, 15 Sep 2010 15:21:41 +0200 Subject: nv50: put low limit on REG_ALLOC_TEMP and FP_RESULT_COUNT --- src/gallium/drivers/nv50/nv50_pc.c | 4 ++-- src/gallium/drivers/nv50/nv50_program.c | 3 +++ 2 files changed, 5 insertions(+), 2 
deletions(-) (limited to 'src') diff --git a/src/gallium/drivers/nv50/nv50_pc.c b/src/gallium/drivers/nv50/nv50_pc.c index 2706d88779..bb464ec4c9 100644 --- a/src/gallium/drivers/nv50/nv50_pc.c +++ b/src/gallium/drivers/nv50/nv50_pc.c @@ -539,8 +539,8 @@ nv50_generate_code(struct nv50_translation_info *ti) ti->p->immd_size = pc->immd_count * 4; ti->p->immd = pc->immd_buf; - /* highest 16 bit reg to num of 32 bit regs */ - ti->p->max_gpr = (pc->max_reg[NV_FILE_GPR] >> 1) + 1; + /* highest 16 bit reg to num of 32 bit regs, limit to >= 4 */ + ti->p->max_gpr = MAX2(4, (pc->max_reg[NV_FILE_GPR] >> 1) + 1); ti->p->fixups = pc->fixups; ti->p->num_fixups = pc->num_fixups; diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c index 24952f70f1..b3600f7ba7 100644 --- a/src/gallium/drivers/nv50/nv50_program.c +++ b/src/gallium/drivers/nv50/nv50_program.c @@ -514,6 +514,9 @@ nv50_fragprog_prepare(struct nv50_translation_info *ti) if (depr < p->out_nr) { p->out[depr].mask = 0x4; p->out[depr].hw = ti->output_map[depr][2] = p->max_out++; + } else { + /* allowed values are 1, 4, 5, 8, 9, ... */ + p->max_out = MAX2(4, p->max_out); } return 0; -- cgit v1.2.3