diff options
Diffstat (limited to 'src/gallium/drivers/nv50/nv50_pc_optimize.c')
-rw-r--r-- | src/gallium/drivers/nv50/nv50_pc_optimize.c | 717 |
1 files changed, 717 insertions, 0 deletions
diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c new file mode 100644 index 0000000000..0811420e42 --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c @@ -0,0 +1,717 @@ + +#include "nv50_pc.h" + +#define DESCEND_ARBITRARY(j, f) \ +do { \ + b->pass_seq = ctx->pc->pass_seq; \ + \ + for (j = 0; j < 2; ++j) \ + if (b->out[j] && b->out[j]->pass_seq < ctx->pc->pass_seq) \ + f(ctx, b->out[j]); \ +} while (0) + +extern unsigned nv50_inst_min_size(struct nv_instruction *); + +struct nv_pc_pass { + struct nv_pc *pc; +}; + +static INLINE boolean +values_equal(struct nv_value *a, struct nv_value *b) +{ + /* XXX: sizes */ + return (a->reg.file == b->reg.file && a->join->reg.id == b->join->reg.id); +} + +static INLINE boolean +inst_commutation_check(struct nv_instruction *a, + struct nv_instruction *b) +{ + int si, di; + + for (di = 0; di < 4; ++di) { + if (!a->def[di]) + break; + for (si = 0; si < 5; ++si) { + if (!b->src[si]) + continue; + if (values_equal(a->def[di], b->src[si]->value)) + return FALSE; + } + } + + if (b->flags_src && b->flags_src->value == a->flags_def) + return FALSE; + + return TRUE; +} + +/* Check whether we can swap the order of the instructions, + * where a & b may be either the earlier or the later one. + */ +static boolean +inst_commutation_legal(struct nv_instruction *a, + struct nv_instruction *b) +{ + return inst_commutation_check(a, b) && inst_commutation_check(b, a); +} + +static INLINE boolean +inst_cullable(struct nv_instruction *nvi) +{ + return (!(nvi->is_terminator || + nvi->target || + nvi->fixed || + nv_nvi_refcount(nvi))); +} + +static INLINE boolean +nvi_isnop(struct nv_instruction *nvi) +{ + if (nvi->opcode == NV_OP_EXPORT) + return TRUE; + + if (nvi->fixed || + nvi->is_terminator || + nvi->flags_src || + nvi->flags_def) + return FALSE; + + if (nvi->def[0]->join->reg.id < 0) + return TRUE; + + if (nvi->opcode != NV_OP_MOV && nvi->opcode != NV_OP_SELECT) + return FALSE; + + if (nvi->def[0]->reg.file != nvi->src[0]->value->reg.file) + return FALSE; + + if (nvi->src[0]->value->join->reg.id < 0) { + debug_printf("nvi_isnop: orphaned value detected\n"); + return TRUE; + } + + if (nvi->opcode == NV_OP_SELECT) + if (!values_equal(nvi->def[0], nvi->src[1]->value)) + return FALSE; + + return values_equal(nvi->def[0], nvi->src[0]->value); +} + +static void +nv_pc_pass_pre_emission(struct nv_pc *pc, struct nv_basic_block *b) +{ + struct nv_instruction *nvi, *next; + int j; + uint size, n32 = 0; + + b->priv = 0; + + if (pc->num_blocks) + b->bin_pos = pc->bb_list[pc->num_blocks - 1]->bin_pos + + pc->bb_list[pc->num_blocks - 1]->bin_size; + + pc->bb_list[pc->num_blocks++] = b; + + /* visit node */ + + for (nvi = b->entry; nvi; nvi = next) { + next = nvi->next; + if (nvi_isnop(nvi)) + nv_nvi_delete(nvi); + } + + for (nvi = b->entry; nvi; nvi = next) { + next = nvi->next; + + size = nv50_inst_min_size(nvi); + if (nvi->next && size < 8) + ++n32; + else + if ((n32 & 1) && nvi->next && + nv50_inst_min_size(nvi->next) == 4 && + inst_commutation_legal(nvi, nvi->next)) { + ++n32; + debug_printf("permuting: "); + nv_print_instruction(nvi); + nv_print_instruction(nvi->next); + nv_nvi_permute(nvi, nvi->next); + next = nvi; + } else { + nvi->is_long = 1; + + b->bin_size += n32 & 1; + if (n32 & 1) + nvi->prev->is_long = 1; + n32 = 0; + } + b->bin_size += 1 + nvi->is_long; + } + + if (!b->entry) { + debug_printf("block %p is now empty\n", b); + } else + if (!b->exit->is_long) { + assert(n32); + b->exit->is_long = 1; + b->bin_size += 1; + + /* might have del'd a hole tail of instructions */ + if (!b->exit->prev->is_long && !(n32 & 1)) { + b->bin_size += 1; + b->exit->prev->is_long = 1; + } + } + assert(!b->exit || b->exit->is_long); + + pc->bin_size += b->bin_size *= 4; + + /* descend CFG */ + + if (!b->out[0]) + return; + if (!b->out[1] && ++(b->out[0]->priv) != b->out[0]->num_in) + return; + +#if 0 + /* delete ELSE branch */ + if (b->entry && + b->entry->opcode == NV_OP_BRA && b->entry->target == b->out[0]) { + nv_nvi_delete(b->entry); + b->bin_size -= 2; + pc->bin_size -= 8; + } +#endif + for (j = 0; j < 2; ++j) + if (b->out[j] && b->out[j] != b) + nv_pc_pass_pre_emission(pc, b->out[j]); +} + +int +nv_pc_exec_pass2(struct nv_pc *pc) +{ + debug_printf("preparing %u blocks for emission\n", pc->num_blocks); + + pc->bb_list = CALLOC(pc->num_blocks, sizeof(struct nv_basic_block *)); + + pc->num_blocks = 0; + nv_pc_pass_pre_emission(pc, pc->root); + + return 0; +} + +static INLINE boolean +is_cmem_load(struct nv_instruction *nvi) +{ + return (nvi->opcode == NV_OP_LDA && + nvi->src[0]->value->reg.file >= NV_FILE_MEM_C(0) && + nvi->src[0]->value->reg.file <= NV_FILE_MEM_C(15)); +} + +static INLINE boolean +is_smem_load(struct nv_instruction *nvi) +{ + return (nvi->opcode == NV_OP_LDA && + (nvi->src[0]->value->reg.file == NV_FILE_MEM_S || + nvi->src[0]->value->reg.file <= NV_FILE_MEM_P)); +} + +static INLINE boolean +is_immd_move(struct nv_instruction *nvi) +{ + return (nvi->opcode == NV_OP_MOV && + nvi->src[0]->value->reg.file == NV_FILE_IMM); +} + +static INLINE void +check_swap_src_0_1(struct nv_instruction *nvi) +{ + static const ubyte cc_swapped[8] = { 0, 4, 2, 6, 1, 5, 3, 7 }; + + struct nv_ref *src0 = nvi->src[0], *src1 = nvi->src[1]; + + if (!nv_op_commutative(nvi->opcode)) + return; + assert(src0 && src1); + + if (is_cmem_load(src0->value->insn)) { + if (!is_cmem_load(src1->value->insn)) { + nvi->src[0] = src1; + nvi->src[1] = src0; + /* debug_printf("swapping cmem load to 1\n"); */ + } + } else + if (is_smem_load(src1->value->insn)) { + if (!is_smem_load(src0->value->insn)) { + nvi->src[0] = src1; + nvi->src[1] = src0; + /* debug_printf("swapping smem load to 0\n"); */ + } + } + + if (nvi->opcode == NV_OP_SET && nvi->src[0] != src0) + nvi->set_cond = cc_swapped[nvi->set_cond]; +} + +struct nv_pass { + struct nv_pc *pc; + int n; + void *priv; +}; + +static int +nv_pass_fold_stores(struct nv_pass *ctx, struct nv_basic_block *b) +{ + struct nv_instruction *nvi, *sti; + int j; + + for (sti = b->entry; sti; sti = sti->next) { + if (!sti->def[0]) + continue; + + if (sti->def[0]->reg.file != NV_FILE_OUT) + continue; + if (sti->opcode != NV_OP_MOV && sti->opcode != NV_OP_STA) + continue; + + nvi = sti->src[0]->value->insn; + if (!nvi || nvi->opcode == NV_OP_PHI) + continue; + assert(nvi->def[0] == sti->src[0]->value); + + if (nvi->def[0]->refc > 1) + continue; + + nvi->def[0] = sti->def[0]; + nvi->fixed = 1; + sti->fixed = 0; + } + DESCEND_ARBITRARY(j, nv_pass_fold_stores); + + return 0; +} + +static int +nv_pass_fold_loads(struct nv_pass *ctx, struct nv_basic_block *b) +{ + struct nv_instruction *nvi, *ld; + int j; + + for (nvi = b->entry; nvi; nvi = nvi->next) { + check_swap_src_0_1(nvi); + + for (j = 0; j < 3; ++j) { + if (!nvi->src[j]) + break; + ld = nvi->src[j]->value->insn; + if (!ld) + continue; + + if (is_immd_move(ld) && nv50_nvi_can_use_imm(nvi, j)) { + nv_reference(ctx->pc, &nvi->src[j], ld->src[0]->value); + debug_printf("folded immediate %i\n", ld->def[0]->n); + continue; + } + + if (ld->opcode != NV_OP_LDA) + continue; + if (!nv50_nvi_can_load(nvi, j, ld->src[0]->value)) + continue; + + if (j == 0 && ld->src[4]) /* can't load shared mem */ + continue; + + /* fold it ! */ /* XXX: ref->insn */ + nv_reference(ctx->pc, &nvi->src[j], ld->src[0]->value); + if (ld->src[4]) + nv_reference(ctx->pc, &nvi->src[4], ld->src[4]->value); + } + } + DESCEND_ARBITRARY(j, nv_pass_fold_loads); + + return 0; +} + +static int +nv_pass_lower_mods(struct nv_pass *ctx, struct nv_basic_block *b) +{ + int j; + struct nv_instruction *nvi, *mi, *next; + ubyte mod; + + for (nvi = b->entry; nvi; nvi = next) { + next = nvi->next; + if (nvi->opcode == NV_OP_SUB) { + nvi->opcode = NV_OP_ADD; + nvi->src[1]->mod ^= NV_MOD_NEG; + } + + /* should not put any modifiers on NEG and ABS */ + assert(nvi->opcode != NV_MOD_NEG || !nvi->src[0]->mod); + assert(nvi->opcode != NV_MOD_ABS || !nvi->src[0]->mod); + + for (j = 0; j < 4; ++j) { + if (!nvi->src[j]) + break; + + mi = nvi->src[j]->value->insn; + if (!mi) + continue; + if (mi->def[0]->refc > 1) + continue; + + if (mi->opcode == NV_OP_NEG) mod = NV_MOD_NEG; + else + if (mi->opcode == NV_OP_ABS) mod = NV_MOD_ABS; + else + continue; + + if (nvi->opcode == NV_OP_ABS) + mod &= ~(NV_MOD_NEG | NV_MOD_ABS); + else + if (nvi->opcode == NV_OP_NEG && mod == NV_MOD_NEG) { + nvi->opcode = NV_OP_MOV; + mod = 0; + } + + if (!(nv50_supported_src_mods(nvi->opcode, j) & mod)) + continue; + + nv_reference(ctx->pc, &nvi->src[j], mi->src[0]->value); + + nvi->src[j]->mod ^= mod; + } + + if (nvi->opcode == NV_OP_SAT) { + mi = nvi->src[0]->value->insn; + + if ((mi->opcode == NV_OP_MAD) && !mi->flags_def) { + mi->saturate = 1; + mi->def[0] = nvi->def[0]; + nv_nvi_delete(nvi); + } + } + } + DESCEND_ARBITRARY(j, nv_pass_lower_mods); + + return 0; +} + +#define SRC_IS_MUL(s) ((s)->insn && (s)->insn->opcode == NV_OP_MUL) + +static struct nv_value * +find_immediate(struct nv_ref *ref) +{ + struct nv_value *src; + + if (!ref) + return NULL; + + src = ref->value; + while (src->insn && src->insn->opcode == NV_OP_MOV) { + assert(!src->insn->src[0]->mod); + src = src->insn->src[0]->value; + } + return (src->reg.file == NV_FILE_IMM) ? src : NULL; +} + +static void +constant_operand(struct nv_pc *pc, + struct nv_instruction *nvi, struct nv_value *val, int s) +{ + int t = s ? 0 : 1; + ubyte type; + + if (!nvi->def[0]) + return; + type = nvi->def[0]->reg.type; + + switch (nvi->opcode) { + case NV_OP_MUL: + if ((type == NV_TYPE_F32 && val->reg.imm.f32 == 1.0f) || + (NV_TYPE_ISINT(type) && val->reg.imm.u32 == 1)) { + nvi->opcode = NV_OP_MOV; + nv_reference(pc, &nvi->src[s], NULL); + if (!s) { + nvi->src[0] = nvi->src[1]; + nvi->src[1] = NULL; + } + } else + if ((type == NV_TYPE_F32 && val->reg.imm.f32 == 2.0f) || + (NV_TYPE_ISINT(type) && val->reg.imm.u32 == 2)) { + nvi->opcode = NV_OP_ADD; + nv_reference(pc, &nvi->src[s], NULL); + if (!s) { + nvi->src[0] = nvi->src[1]; + nvi->src[1] = NULL; + } + } else + if (type == NV_TYPE_F32 && val->reg.imm.f32 == -1.0f) { + nvi->opcode = NV_OP_NEG; + nv_reference(pc, &nvi->src[s], NULL); + nvi->src[0] = nvi->src[t]; + nvi->src[1] = NULL; + } else + if (type == NV_TYPE_F32 && val->reg.imm.f32 == -2.0f) { + nvi->opcode = NV_OP_ADD; + assert(!nvi->src[s]->mod); + nv_reference(pc, &nvi->src[s], nvi->src[t]->value); + nvi->src[t]->mod ^= NV_MOD_NEG; + nvi->src[s]->mod |= NV_MOD_NEG; + } else + if (val->reg.imm.u32 == 0) { + nvi->opcode = NV_OP_MOV; + nv_reference(pc, &nvi->src[t], NULL); + if (s) { + nvi->src[0] = nvi->src[1]; + nvi->src[1] = NULL; + } + } + break; + case NV_OP_ADD: + if (val->reg.imm.u32 == 0) { + nvi->opcode = NV_OP_MOV; + nv_reference(pc, &nvi->src[s], NULL); + nvi->src[0] = nvi->src[t]; + nvi->src[1] = NULL; + } + break; + default: + break; + } +} + +static int +nv_pass_lower_arith(struct nv_pass *ctx, struct nv_basic_block *b) +{ + struct nv_instruction *nvi, *next; + int j; + + for (nvi = b->entry; nvi; nvi = next) { + struct nv_value *src0, *src1, *src; + int mod; + + next = nvi->next; + + if ((src = find_immediate(nvi->src[0])) != NULL) + constant_operand(ctx->pc, nvi, src, 0); + else + if ((src = find_immediate(nvi->src[1])) != NULL) + constant_operand(ctx->pc, nvi, src, 1); + + /* try to combine MUL, ADD into MAD */ + if (nvi->opcode != NV_OP_ADD) + continue; + + src0 = nvi->src[0]->value; + src1 = nvi->src[1]->value; + + if (SRC_IS_MUL(src0) && src0->refc == 1) + src = src0; + else + if (SRC_IS_MUL(src1) && src1->refc == 1) + src = src1; + else + continue; + + nvi->opcode = NV_OP_MAD; + mod = nvi->src[(src == src0) ? 0 : 1]->mod; + nv_reference(ctx->pc, &nvi->src[(src == src0) ? 0 : 1], NULL); + nvi->src[2] = nvi->src[(src == src0) ? 1 : 0]; + + assert(!(mod & ~NV_MOD_NEG)); + nvi->src[0] = new_ref(ctx->pc, src->insn->src[0]->value); + nvi->src[1] = new_ref(ctx->pc, src->insn->src[1]->value); + nvi->src[0]->mod = src->insn->src[0]->mod ^ mod; + nvi->src[1]->mod = src->insn->src[1]->mod; + } + DESCEND_ARBITRARY(j, nv_pass_lower_arith); + + return 0; +} + +/* +set $r2 g f32 $r2 $r3 +cvt abs rn f32 $r2 s32 $r2 +cvt f32 $c0 # f32 $r2 +e $c0 bra 0x80 +*/ +#if 0 +static int +nv_pass_lower_cond(struct nv_pass *ctx, struct nv_basic_block *b) +{ + /* XXX: easier in IR builder for now */ + return 0; +} +#endif + +/* TODO: reload elimination, redundant store elimination */ + +struct nv_pass_reldelim { + struct nv_pc *pc; +}; + +static int +nv_pass_reload_elim(struct nv_pass_reldelim *ctx, struct nv_basic_block *b) +{ + int j; + struct nv_instruction *ld, *next; + + for (ld = b->entry; ld; ld = next) { + next = ld->next; + + if (ld->opcode == NV_OP_LINTERP || ld->opcode == NV_OP_PINTERP) { + + } else + if (ld->opcode == NV_OP_LDA) { + + } else + if (ld->opcode == NV_OP_MOV) { + + } + } + DESCEND_ARBITRARY(j, nv_pass_reload_elim); + + return 0; +} + +static int +nv_pass_tex_mask(struct nv_pass *ctx, struct nv_basic_block *b) +{ + int i, c, j; + + for (i = 0; i < ctx->pc->num_instructions; ++i) { + struct nv_instruction *nvi = &ctx->pc->instructions[i]; + struct nv_value *def[4]; + + if (!nv_is_vector_op(nvi->opcode)) + continue; + nvi->tex_mask = 0; + + for (c = 0; c < 4; ++c) { + if (nvi->def[c]->refc) + nvi->tex_mask |= 1 << c; + def[c] = nvi->def[c]; + } + + j = 0; + for (c = 0; c < 4; ++c) + if (nvi->tex_mask & (1 << c)) + nvi->def[j++] = def[c]; + for (c = 0; c < 4; ++c) + if (!(nvi->tex_mask & (1 << c))) + nvi->def[j++] = def[c]; + assert(j == 4); + } + return 0; +} + +struct nv_pass_dce { + struct nv_pc *pc; + uint removed; +}; + +static int +nv_pass_dce(struct nv_pass_dce *ctx, struct nv_basic_block *b) +{ + int j; + struct nv_instruction *nvi, *next; + + for (nvi = b->entry; nvi; nvi = next) { + next = nvi->next; + + if (inst_cullable(nvi)) { + nv_nvi_delete(nvi); + + ++ctx->removed; + } + } + DESCEND_ARBITRARY(j, nv_pass_dce); + + return 0; +} + +static INLINE boolean +bb_simple_if_endif(struct nv_basic_block *bb) +{ + return (bb->out[0] && bb->out[1] && + bb->out[0]->out[0] == bb->out[1] && + !bb->out[0]->out[1]); +} + +static int +nv_pass_flatten(struct nv_pass *ctx, struct nv_basic_block *b) +{ + int j; + + if (bb_simple_if_endif(b)) { + ++ctx->n; + debug_printf("nv_pass_flatten: total IF/ENDIF constructs: %i\n", ctx->n); + } + DESCEND_ARBITRARY(j, nv_pass_flatten); + + return 0; +} + +int +nv_pc_exec_pass0(struct nv_pc *pc) +{ + struct nv_pass_reldelim *reldelim; + struct nv_pass pass; + struct nv_pass_dce dce; + int ret; + + reldelim = CALLOC_STRUCT(nv_pass_reldelim); + reldelim->pc = pc; + + ret = nv_pass_reload_elim(reldelim, pc->root); + + FREE(reldelim); + if (ret) + return ret; + + pass.pc = pc; + + pc->pass_seq++; + ret = nv_pass_flatten(&pass, pc->root); + if (ret) + return ret; + + /* Do this first, so we don't have to pay attention + * to whether sources are supported memory loads. + */ + pc->pass_seq++; + ret = nv_pass_lower_arith(&pass, pc->root); + if (ret) + return ret; + + pc->pass_seq++; + ret = nv_pass_fold_loads(&pass, pc->root); + if (ret) + return ret; + + pc->pass_seq++; + ret = nv_pass_fold_stores(&pass, pc->root); + if (ret) + return ret; + + pc->pass_seq++; + ret = nv_pass_lower_mods(&pass, pc->root); + if (ret) + return ret; + + dce.pc = pc; + do { + dce.removed = 0; + pc->pass_seq++; + ret = nv_pass_dce(&dce, pc->root); + if (ret) + return ret; + } while (dce.removed); + + ret = nv_pass_tex_mask(&pass, pc->root); + if (ret) + return ret; + + return ret; +} |