diff options
Diffstat (limited to 'src/mesa/drivers/dri/i965/brw_fs.cpp')
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_fs.cpp | 345 |
1 files changed, 108 insertions, 237 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index 1a5808f44e..174f622d59 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -179,10 +179,6 @@ type_size(const struct glsl_type *type) } } -static const fs_reg reg_undef; -static const fs_reg reg_null_f(ARF, BRW_ARF_NULL, BRW_REGISTER_TYPE_F); -static const fs_reg reg_null_d(ARF, BRW_ARF_NULL, BRW_REGISTER_TYPE_D); - int fs_visitor::virtual_grf_alloc(int size) { @@ -503,7 +499,6 @@ fs_visitor::emit_frontfacing_interpolation(ir_variable *ir) *reg, fs_reg(1))); } else { - fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type); struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD); /* bit 31 is "primitive is back face", so checking < (1 << 31) gives * us front face @@ -2228,6 +2223,47 @@ fs_visitor::generate_discard_and(fs_inst *inst, struct brw_reg mask) } void +fs_visitor::generate_spill(fs_inst *inst, struct brw_reg src) +{ + assert(inst->mlen != 0); + + brw_MOV(p, + retype(brw_message_reg(inst->base_mrf + 1), BRW_REGISTER_TYPE_UD), + retype(src, BRW_REGISTER_TYPE_UD)); + brw_oword_block_write(p, brw_message_reg(inst->base_mrf), 1, inst->offset); +} + +void +fs_visitor::generate_unspill(fs_inst *inst, struct brw_reg dst) +{ + assert(inst->mlen != 0); + + /* Clear any post destination dependencies that would be ignored by + * the block read. See the B-Spec for pre-gen5 send instruction. + * + * This could use a better solution, since texture sampling and + * math reads could potentially run into it as well -- anywhere + * that we have a SEND with a destination that is a register that + * was written but not read within the last N instructions (what's + * N? unsure). This is rare because of dead code elimination, but + * not impossible. + */ + if (intel->gen == 4 && !intel->is_g4x) + brw_MOV(p, brw_null_reg(), dst); + + brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf), 1, + inst->offset); + + if (intel->gen == 4 && !intel->is_g4x) { + /* gen4 errata: destination from a send can't be used as a + * destination until it's been read. Just read it so we don't + * have to worry. + */ + brw_MOV(p, brw_null_reg(), dst); + } +} + +void fs_visitor::assign_curb_setup() { c->prog_data.first_curbe_grf = c->key.nr_payload_regs; @@ -2311,222 +2347,6 @@ fs_visitor::assign_urb_setup() this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length; } -static void -assign_reg(int *reg_hw_locations, fs_reg *reg) -{ - if (reg->file == GRF && reg->reg != 0) { - assert(reg->reg_offset >= 0); - reg->hw_reg = reg_hw_locations[reg->reg] + reg->reg_offset; - reg->reg = 0; - } -} - -void -fs_visitor::assign_regs_trivial() -{ - int last_grf = 0; - int hw_reg_mapping[this->virtual_grf_next]; - int i; - - hw_reg_mapping[0] = 0; - hw_reg_mapping[1] = this->first_non_payload_grf; - for (i = 2; i < this->virtual_grf_next; i++) { - hw_reg_mapping[i] = (hw_reg_mapping[i - 1] + - this->virtual_grf_sizes[i - 1]); - } - last_grf = hw_reg_mapping[i - 1] + this->virtual_grf_sizes[i - 1]; - - foreach_iter(exec_list_iterator, iter, this->instructions) { - fs_inst *inst = (fs_inst *)iter.get(); - - assign_reg(hw_reg_mapping, &inst->dst); - assign_reg(hw_reg_mapping, &inst->src[0]); - assign_reg(hw_reg_mapping, &inst->src[1]); - } - - this->grf_used = last_grf + 1; -} - -void -fs_visitor::assign_regs() -{ - int last_grf = 0; - int hw_reg_mapping[this->virtual_grf_next + 1]; - int base_reg_count = BRW_MAX_GRF - this->first_non_payload_grf; - int class_sizes[base_reg_count]; - int class_count = 0; - int aligned_pair_class = -1; - - /* Set up the register classes. - * - * The base registers store a scalar value. For texture samples, - * we get virtual GRFs composed of 4 contiguous hw register. For - * structures and arrays, we store them as contiguous larger things - * than that, though we should be able to do better most of the - * time. - */ - class_sizes[class_count++] = 1; - if (brw->has_pln && intel->gen < 6) { - /* Always set up the (unaligned) pairs for gen5, so we can find - * them for making the aligned pair class. - */ - class_sizes[class_count++] = 2; - } - for (int r = 1; r < this->virtual_grf_next; r++) { - int i; - - for (i = 0; i < class_count; i++) { - if (class_sizes[i] == this->virtual_grf_sizes[r]) - break; - } - if (i == class_count) { - if (this->virtual_grf_sizes[r] >= base_reg_count) { - fprintf(stderr, "Object too large to register allocate.\n"); - this->fail = true; - } - - class_sizes[class_count++] = this->virtual_grf_sizes[r]; - } - } - - int ra_reg_count = 0; - int class_base_reg[class_count]; - int class_reg_count[class_count]; - int classes[class_count + 1]; - - for (int i = 0; i < class_count; i++) { - class_base_reg[i] = ra_reg_count; - class_reg_count[i] = base_reg_count - (class_sizes[i] - 1); - ra_reg_count += class_reg_count[i]; - } - - struct ra_regs *regs = ra_alloc_reg_set(ra_reg_count); - for (int i = 0; i < class_count; i++) { - classes[i] = ra_alloc_reg_class(regs); - - for (int i_r = 0; i_r < class_reg_count[i]; i_r++) { - ra_class_add_reg(regs, classes[i], class_base_reg[i] + i_r); - } - - /* Add conflicts between our contiguous registers aliasing - * base regs and other register classes' contiguous registers - * that alias base regs, or the base regs themselves for classes[0]. - */ - for (int c = 0; c <= i; c++) { - for (int i_r = 0; i_r < class_reg_count[i]; i_r++) { - for (int c_r = MAX2(0, i_r - (class_sizes[c] - 1)); - c_r < MIN2(class_reg_count[c], i_r + class_sizes[i]); - c_r++) { - - if (0) { - printf("%d/%d conflicts %d/%d\n", - class_sizes[i], this->first_non_payload_grf + i_r, - class_sizes[c], this->first_non_payload_grf + c_r); - } - - ra_add_reg_conflict(regs, - class_base_reg[i] + i_r, - class_base_reg[c] + c_r); - } - } - } - } - - /* Add a special class for aligned pairs, which we'll put delta_x/y - * in on gen5 so that we can do PLN. - */ - if (brw->has_pln && intel->gen < 6) { - int reg_count = (base_reg_count - 1) / 2; - int unaligned_pair_class = 1; - assert(class_sizes[unaligned_pair_class] == 2); - - aligned_pair_class = class_count; - classes[aligned_pair_class] = ra_alloc_reg_class(regs); - class_sizes[aligned_pair_class] = 2; - class_base_reg[aligned_pair_class] = 0; - class_reg_count[aligned_pair_class] = 0; - int start = (this->first_non_payload_grf & 1) ? 1 : 0; - - for (int i = 0; i < reg_count; i++) { - ra_class_add_reg(regs, classes[aligned_pair_class], - class_base_reg[unaligned_pair_class] + i * 2 + start); - } - class_count++; - } - - ra_set_finalize(regs); - - struct ra_graph *g = ra_alloc_interference_graph(regs, - this->virtual_grf_next); - /* Node 0 is just a placeholder to keep virtual_grf[] mapping 1:1 - * with nodes. - */ - ra_set_node_class(g, 0, classes[0]); - - for (int i = 1; i < this->virtual_grf_next; i++) { - for (int c = 0; c < class_count; c++) { - if (class_sizes[c] == this->virtual_grf_sizes[i]) { - if (aligned_pair_class >= 0 && - this->delta_x.reg == i) { - ra_set_node_class(g, i, classes[aligned_pair_class]); - } else { - ra_set_node_class(g, i, classes[c]); - } - break; - } - } - - for (int j = 1; j < i; j++) { - if (virtual_grf_interferes(i, j)) { - ra_add_node_interference(g, i, j); - } - } - } - - /* FINISHME: Handle spilling */ - if (!ra_allocate_no_spills(g)) { - fprintf(stderr, "Failed to allocate registers.\n"); - this->fail = true; - return; - } - - /* Get the chosen virtual registers for each node, and map virtual - * regs in the register classes back down to real hardware reg - * numbers. - */ - hw_reg_mapping[0] = 0; /* unused */ - for (int i = 1; i < this->virtual_grf_next; i++) { - int reg = ra_get_node_reg(g, i); - int hw_reg = -1; - - for (int c = 0; c < class_count; c++) { - if (reg >= class_base_reg[c] && - reg < class_base_reg[c] + class_reg_count[c]) { - hw_reg = reg - class_base_reg[c]; - break; - } - } - - assert(hw_reg >= 0); - hw_reg_mapping[i] = this->first_non_payload_grf + hw_reg; - last_grf = MAX2(last_grf, - hw_reg_mapping[i] + this->virtual_grf_sizes[i] - 1); - } - - foreach_iter(exec_list_iterator, iter, this->instructions) { - fs_inst *inst = (fs_inst *)iter.get(); - - assign_reg(hw_reg_mapping, &inst->dst); - assign_reg(hw_reg_mapping, &inst->src[0]); - assign_reg(hw_reg_mapping, &inst->src[1]); - } - - this->grf_used = last_grf + 1; - - talloc_free(g); - talloc_free(regs); -} - /** * Split large virtual GRFs into separate components if we can. * @@ -2621,6 +2441,7 @@ fs_visitor::calculate_live_intervals() int *use = talloc_array(mem_ctx, int, num_vars); int loop_depth = 0; int loop_start = 0; + int bb_header_ip = 0; for (int i = 0; i < num_vars; i++) { def[i] = 1 << 30; @@ -2638,12 +2459,8 @@ fs_visitor::calculate_live_intervals() loop_depth--; if (loop_depth == 0) { - /* FINISHME: - * - * Patches up any vars marked for use within the loop as - * live until the end. This is conservative, as there - * will often be variables defined and used inside the - * loop but dead at the end of the loop body. + /* Patches up the use of vars marked for being live across + * the whole loop. */ for (int i = 0; i < num_vars; i++) { if (use[i] == loop_start) { @@ -2652,22 +2469,53 @@ fs_visitor::calculate_live_intervals() } } } else { - int eip = ip; - - if (loop_depth) - eip = loop_start; - for (unsigned int i = 0; i < 3; i++) { if (inst->src[i].file == GRF && inst->src[i].reg != 0) { - use[inst->src[i].reg] = MAX2(use[inst->src[i].reg], eip); + int reg = inst->src[i].reg; + + if (!loop_depth || (this->virtual_grf_sizes[reg] == 1 && + def[reg] >= bb_header_ip)) { + use[reg] = ip; + } else { + def[reg] = MIN2(loop_start, def[reg]); + use[reg] = loop_start; + + /* Nobody else is going to go smash our start to + * later in the loop now, because def[reg] now + * points before the bb header. + */ + } } } if (inst->dst.file == GRF && inst->dst.reg != 0) { - def[inst->dst.reg] = MIN2(def[inst->dst.reg], eip); + int reg = inst->dst.reg; + + if (!loop_depth || (this->virtual_grf_sizes[reg] == 1 && + !inst->predicated)) { + def[reg] = MIN2(def[reg], ip); + } else { + def[reg] = MIN2(def[reg], loop_start); + } } } ip++; + + /* Set the basic block header IP. This is used for determining + * if a complete def of single-register virtual GRF in a loop + * dominates a use in the same basic block. It's a quick way to + * reduce the live interval range of most register used in a + * loop. + */ + if (inst->opcode == BRW_OPCODE_IF || + inst->opcode == BRW_OPCODE_ELSE || + inst->opcode == BRW_OPCODE_ENDIF || + inst->opcode == BRW_OPCODE_DO || + inst->opcode == BRW_OPCODE_WHILE || + inst->opcode == BRW_OPCODE_BREAK || + inst->opcode == BRW_OPCODE_CONTINUE) { + bb_header_ip = ip; + } } talloc_free(this->virtual_grf_def); @@ -3257,6 +3105,15 @@ fs_visitor::generate_code() case FS_OPCODE_DDY: generate_ddy(inst, dst, src[0]); break; + + case FS_OPCODE_SPILL: + generate_spill(inst, src[0]); + break; + + case FS_OPCODE_UNSPILL: + generate_unspill(inst, dst); + break; + case FS_OPCODE_FB_WRITE: generate_fb_write(inst); break; @@ -3362,10 +3219,25 @@ brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c) progress = v.dead_code_eliminate() || progress; } while (progress); + if (0) { + /* Debug of register spilling: Go spill everything. */ + int virtual_grf_count = v.virtual_grf_next; + for (int i = 1; i < virtual_grf_count; i++) { + v.spill_reg(i); + } + v.calculate_live_intervals(); + } + if (0) v.assign_regs_trivial(); - else - v.assign_regs(); + else { + while (!v.assign_regs()) { + if (v.fail) + break; + + v.calculate_live_intervals(); + } + } } if (!v.fail) @@ -3408,7 +3280,6 @@ brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c) } c->prog_data.total_grf = v.grf_used; - c->prog_data.total_scratch = 0; return GL_TRUE; } |