diff options
| author | Christoph Bumiller <e0425955@student.tuwien.ac.at> | 2010-08-16 15:21:23 +0200 | 
|---|---|---|
| committer | Christoph Bumiller <e0425955@student.tuwien.ac.at> | 2010-08-17 00:47:46 +0200 | 
| commit | e7a0bfa69a6ce45bb53baa8220eae418225c5649 (patch) | |
| tree | 599ff01869b30151d57cee914549eff988798918 /src | |
| parent | 4de293bb9acd1ecda683f735af32f7485a0f213e (diff) | |
nv50: flatten simple IF/ELSE/ENDIF constructs
Less branching means fewer instructions and less thread divergence.
Diffstat (limited to 'src')
| -rw-r--r-- | src/gallium/drivers/nv50/nv50_pc.c | 14 | ||||
| -rw-r--r-- | src/gallium/drivers/nv50/nv50_pc.h | 1 | ||||
| -rw-r--r-- | src/gallium/drivers/nv50/nv50_pc_optimize.c | 116 | ||||
| -rw-r--r-- | src/gallium/drivers/nv50/nv50_tgsi_to_nc.c | 16 | 
4 files changed, 123 insertions, 24 deletions
| diff --git a/src/gallium/drivers/nv50/nv50_pc.c b/src/gallium/drivers/nv50/nv50_pc.c index 7601049126..5041fc7505 100644 --- a/src/gallium/drivers/nv50/nv50_pc.c +++ b/src/gallium/drivers/nv50/nv50_pc.c @@ -125,6 +125,20 @@ nv50_nvi_can_load(struct nv_instruction *nvi, int s, struct nv_value *value)     }  } +/* Return whether this instruction can be executed conditionally. */ +boolean +nv50_nvi_can_predicate(struct nv_instruction *nvi) +{ +   int i; + +   if (nvi->flags_src) +      return FALSE; +   for (i = 0; i < 4 && nvi->src[i]; ++i) +      if (nvi->src[i]->value->reg.file == NV_FILE_IMM) +         return FALSE; +   return TRUE; +} +  ubyte  nv50_supported_src_mods(uint opcode, int s)  { diff --git a/src/gallium/drivers/nv50/nv50_pc.h b/src/gallium/drivers/nv50/nv50_pc.h index b24a3067b8..28208ad247 100644 --- a/src/gallium/drivers/nv50/nv50_pc.h +++ b/src/gallium/drivers/nv50/nv50_pc.h @@ -432,6 +432,7 @@ void nv_print_program(struct nv_basic_block *b);  boolean nv_op_commutative(uint opcode);  int nv50_indirect_opnd(struct nv_instruction *);  boolean nv50_nvi_can_use_imm(struct nv_instruction *, int s); +boolean nv50_nvi_can_predicate(struct nv_instruction *);  boolean nv50_nvi_can_load(struct nv_instruction *, int s, struct nv_value *);  ubyte nv50_supported_src_mods(uint opcode, int s);  int nv_nvi_refcount(struct nv_instruction *); diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c index daf63a3d20..4cf387257d 100644 --- a/src/gallium/drivers/nv50/nv50_pc_optimize.c +++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c @@ -119,6 +119,15 @@ nvi_isnop(struct nv_instruction *nvi)     return values_equal(nvi->def[0], nvi->src[0]->value);  } +struct nv_pass { +   struct nv_pc *pc; +   int n; +   void *priv; +}; + +static int +nv_pass_flatten(struct nv_pass *ctx, struct nv_basic_block *b); +  static void  nv_pc_pass_pre_emission(void *priv, struct nv_basic_block *b)  { @@ -204,6 +213,13 @@ 
nv_pc_pass_pre_emission(void *priv, struct nv_basic_block *b)  int  nv_pc_exec_pass2(struct nv_pc *pc)  { +   struct nv_pass pass; + +   pass.pc = pc; + +   pc->pass_seq++; +   nv_pass_flatten(&pass, pc->root); +     debug_printf("preparing %u blocks for emission\n", pc->num_blocks);     pc->bb_list = CALLOC(pc->num_blocks, sizeof(struct nv_basic_block *)); @@ -273,12 +289,6 @@ check_swap_src_0_1(struct nv_instruction *nvi)        nvi->set_cond = cc_swapped[nvi->set_cond];  } -struct nv_pass { -   struct nv_pc *pc; -   int n; -   void *priv; -}; -  static int  nv_pass_fold_stores(struct nv_pass *ctx, struct nv_basic_block *b)  { @@ -863,24 +873,95 @@ nv_pass_dce(struct nv_pass_dce *ctx, struct nv_basic_block *b)     return 0;  } +/* Register allocation inserted ELSE blocks for all IF/ENDIF without ELSE. + * Returns TRUE if @bb initiates an IF/ELSE/ENDIF clause, or is an IF with + * BREAK and dummy ELSE block. + */  static INLINE boolean -bb_simple_if_endif(struct nv_basic_block *bb) +bb_is_if_else_endif(struct nv_basic_block *bb) +{ +   if (!bb->out[0] || !bb->out[1]) +      return FALSE; + +   if (bb->out[0]->out_kind[0] == CFG_EDGE_LOOP_LEAVE) { +      return (bb->out[0]->out[1] == bb->out[1]->out[0] && +              !bb->out[1]->out[1]); +   } else { +      return (bb->out[0]->out[0] == bb->out[1]->out[0] && +              !bb->out[0]->out[1] && +              !bb->out[1]->out[1]); +   } +} + +/* predicate instructions and remove branch at the end */ +static void +predicate_instructions(struct nv_pc *pc, struct nv_basic_block *b, +                       struct nv_value *p, ubyte cc)  { -   return (bb->out[0] && bb->out[1] && -           bb->out[0]->out[0] == bb->out[1] && -           !bb->out[0]->out[1]); +   struct nv_instruction *nvi; + +   if (!b->entry) +      return; +   for (nvi = b->entry; nvi->next; nvi = nvi->next) { +      if (!nvi_isnop(nvi)) { +         nvi->cc = cc; +         nv_reference(pc, &nvi->flags_src, p); +      } +   } + +   if 
(nvi->opcode == NV_OP_BRA) +      nv_nvi_delete(nvi); +   else +   if (!nvi_isnop(nvi)) { +      nvi->cc = cc; +      nv_reference(pc, &nvi->flags_src, p); +   }  } +/* NOTE: Run this after register allocation, we can just cut out the cflow + * instructions and hook the predicates to the conditional OPs if they are + * not using immediates; better than inserting SELECT to join definitions. + * + * NOTE: Should adapt prior optimization to make this possible more often. + */  static int  nv_pass_flatten(struct nv_pass *ctx, struct nv_basic_block *b)  { -   int j; +   struct nv_instruction *nvi; +   struct nv_value *pred; +   int i; +   int n0 = 0, n1 = 0; + +   if (bb_is_if_else_endif(b)) { + +      debug_printf("nv_pass_flatten: IF/ELSE/ENDIF construct at BB:%i\n", b->id); -   if (bb_simple_if_endif(b)) { -      ++ctx->n; -      debug_printf("nv_pass_flatten: total IF/ENDIF constructs: %i\n", ctx->n); +      for (n0 = 0, nvi = b->out[0]->entry; nvi; nvi = nvi->next, ++n0) +         if (!nv50_nvi_can_predicate(nvi)) +            break; +      if (!nvi) { +         for (n1 = 0, nvi = b->out[1]->entry; nvi; nvi = nvi->next, ++n1) +            if (!nv50_nvi_can_predicate(nvi)) +               break; +         if (nvi) { +            debug_printf("cannot predicate: "); nv_print_instruction(nvi); +         } +      } else { +         debug_printf("cannot predicate: "); nv_print_instruction(nvi); +      } + +      if (!nvi && n0 < 12 && n1 < 12) { /* 12 as arbitrary limit */ +         assert(b->exit && b->exit->flags_src); +         pred = b->exit->flags_src->value; + +         predicate_instructions(ctx->pc, b->out[0], pred, NV_CC_NE | NV_CC_U); +         predicate_instructions(ctx->pc, b->out[1], pred, NV_CC_EQ); + +         assert(b->exit && b->exit->opcode == NV_OP_BRA); +         nv_nvi_delete(b->exit); +      }     } -   DESCEND_ARBITRARY(j, nv_pass_flatten); +   DESCEND_ARBITRARY(i, nv_pass_flatten);     return 0;  } @@ -960,11 +1041,6 @@ nv_pc_exec_pass0(struct 
nv_pc *pc)     pass.n = 0;     pass.pc = pc; -   pc->pass_seq++; -   ret = nv_pass_flatten(&pass, pc->root); -   if (ret) -      return ret; -     /* Do this first, so we don't have to pay attention      * to whether sources are supported memory loads.      */ diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c index 7e77ed6ef6..b23c285dc1 100644 --- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c +++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c @@ -591,7 +591,7 @@ bld_get_address(struct bld_context *bld, int id, struct nv_value *indirect)  static struct nv_value * -bld_predicate(struct bld_context *bld, struct nv_value *src) +bld_predicate(struct bld_context *bld, struct nv_value *src, boolean bool_only)  {     struct nv_instruction *nvi = src->insn; @@ -600,6 +600,14 @@ bld_predicate(struct bld_context *bld, struct nv_value *src)         nvi->bb != bld->pc->current_block) {        nvi = new_instruction(bld->pc, NV_OP_CVT);        nv_reference(bld->pc, &nvi->src[0], src); +   } else +   if (bool_only) { +      while (nvi->opcode == NV_OP_ABS || nvi->opcode == NV_OP_CVT || +             nvi->opcode == NV_OP_NEG) { +         /* TGSI SET gets conversion to f32, we only need source 0/~0 */ +         if (!nvi->def[0]->insn->flags_src) +            nvi = nvi->src[0]->value->insn; +      }     }     if (!nvi->flags_def) { @@ -614,7 +622,7 @@ bld_kil(struct bld_context *bld, struct nv_value *src)  {     struct nv_instruction *nvi; -   src = bld_predicate(bld, src); +   src = bld_predicate(bld, src, FALSE);     nvi = new_instruction(bld->pc, NV_OP_KIL);     nvi->fixed = 1;     nvi->flags_src = new_ref(bld->pc, src); @@ -1223,7 +1231,7 @@ bld_instruction(struct bld_context *bld,           src0 = emit_fetch(bld, insn, 0, c);           src1 = emit_fetch(bld, insn, 1, c);           src2 = emit_fetch(bld, insn, 2, c); -         src0 = bld_predicate(bld, src0); +         src0 = bld_predicate(bld, src0, FALSE);           src1 = 
bld_insn_1(bld, NV_OP_MOV, src1);           src1->insn->flags_src = new_ref(bld->pc, src0); @@ -1304,7 +1312,7 @@ bld_instruction(struct bld_context *bld,        bld->join_bb[bld->cond_lvl] = bld->pc->current_block;        bld->cond_bb[bld->cond_lvl] = bld->pc->current_block; -      src1 = bld_predicate(bld, emit_fetch(bld, insn, 0, 0)); +      src1 = bld_predicate(bld, emit_fetch(bld, insn, 0, 0), TRUE);        bld_flow(bld, NV_OP_BRA, NV_CC_EQ, src1, NULL, FALSE); | 
