diff options
Diffstat (limited to 'src/mesa')
-rw-r--r-- | src/mesa/drivers/dri/i965/Makefile | 1 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_eu.h | 15 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_eu_emit.c | 118 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_fs.cpp | 345 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_fs.h | 15 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp | 418 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_wm.c | 21 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_wm_emit.c | 8 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_wm_state.c | 5 | ||||
-rw-r--r-- | src/mesa/main/framebuffer.c | 1 | ||||
-rw-r--r-- | src/mesa/main/image.c | 48 | ||||
-rw-r--r-- | src/mesa/program/register_allocate.c | 63 | ||||
-rw-r--r-- | src/mesa/program/register_allocate.h | 2 | ||||
-rw-r--r-- | src/mesa/state_tracker/st_format.c | 2 |
14 files changed, 767 insertions, 295 deletions
diff --git a/src/mesa/drivers/dri/i965/Makefile b/src/mesa/drivers/dri/i965/Makefile index 917d39061d..e3ca863fe5 100644 --- a/src/mesa/drivers/dri/i965/Makefile +++ b/src/mesa/drivers/dri/i965/Makefile @@ -108,6 +108,7 @@ CXX_SOURCES = \ brw_cubemap_normalize.cpp \ brw_fs.cpp \ brw_fs_channel_expressions.cpp \ + brw_fs_reg_allocate.cpp \ brw_fs_vector_splitting.cpp ASM_SOURCES = diff --git a/src/mesa/drivers/dri/i965/brw_eu.h b/src/mesa/drivers/dri/i965/brw_eu.h index 8ffa7c760a..0e3ccfa46c 100644 --- a/src/mesa/drivers/dri/i965/brw_eu.h +++ b/src/mesa/drivers/dri/i965/brw_eu.h @@ -897,9 +897,11 @@ void brw_math2(struct brw_compile *p, struct brw_reg src0, struct brw_reg src1); -void brw_dp_READ_16( struct brw_compile *p, - struct brw_reg dest, - GLuint scratch_offset ); +void brw_oword_block_read(struct brw_compile *p, + struct brw_reg dest, + struct brw_reg mrf, + int num_regs, + GLuint offset); void brw_dp_READ_4( struct brw_compile *p, struct brw_reg dest, @@ -918,9 +920,10 @@ void brw_dp_READ_4_vs_relative(struct brw_compile *p, GLuint offset, GLuint bind_table_index); -void brw_dp_WRITE_16( struct brw_compile *p, - struct brw_reg src, - GLuint scratch_offset ); +void brw_oword_block_write(struct brw_compile *p, + struct brw_reg mrf, + int num_regs, + GLuint offset); /* If/else/endif. Works by manipulating the execution flags on each * channel. diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c index 399b99c960..6fbc39672f 100644 --- a/src/mesa/drivers/dri/i965/brw_eu_emit.c +++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c @@ -1353,38 +1353,66 @@ void brw_math_16( struct brw_compile *p, /** - * Write block of 16 dwords/floats to the data port Render Cache scratch buffer. - * Scratch offset should be a multiple of 64. - * Used for register spilling. + * Write a block of OWORDs (half a GRF each) from the scratch buffer, + * using a constant offset per channel. + * + * The offset must be aligned to oword size (16 bytes). Used for + * register spilling. */ -void brw_dp_WRITE_16( struct brw_compile *p, - struct brw_reg src, - GLuint scratch_offset ) +void brw_oword_block_write(struct brw_compile *p, + struct brw_reg mrf, + int num_regs, + GLuint offset) { struct intel_context *intel = &p->brw->intel; - GLuint msg_reg_nr = 1; + uint32_t msg_control; + int mlen; + + mrf = retype(mrf, BRW_REGISTER_TYPE_UD); + + if (num_regs == 1) { + msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS; + mlen = 2; + } else { + msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS; + mlen = 3; + } + + /* Set up the message header. This is g0, with g0.2 filled with + * the offset. We don't want to leave our offset around in g0 or + * it'll screw up texture samples, so set it up inside the message + * reg. + */ { brw_push_insn_state(p); brw_set_mask_control(p, BRW_MASK_DISABLE); brw_set_compression_control(p, BRW_COMPRESSION_NONE); + brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); + /* set message header global offset field (reg 0, element 2) */ brw_MOV(p, - retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_D), - brw_imm_d(scratch_offset)); + retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, + mrf.nr, + 2), BRW_REGISTER_TYPE_UD), + brw_imm_ud(offset)); brw_pop_insn_state(p); } { - GLuint msg_length = 3; struct brw_reg dest; struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND); int send_commit_msg; + struct brw_reg src_header = retype(brw_vec8_grf(0, 0), + BRW_REGISTER_TYPE_UW); - insn->header.predicate_control = 0; /* XXX */ - insn->header.compression_control = BRW_COMPRESSION_NONE; - insn->header.destreg__conditionalmod = msg_reg_nr; + if (insn->header.compression_control != BRW_COMPRESSION_NONE) { + insn->header.compression_control = BRW_COMPRESSION_NONE; + src_header = vec16(src_header); + } + assert(insn->header.predicate_control == BRW_PREDICATE_NONE); + insn->header.destreg__conditionalmod = mrf.nr; /* Until gen6, writes followed by reads from the same location * are not guaranteed to be ordered unless write_commit is set. @@ -1400,19 +1428,19 @@ void brw_dp_WRITE_16( struct brw_compile *p, dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW); send_commit_msg = 0; } else { - dest = brw_uw16_grf(0, 0); + dest = src_header; send_commit_msg = 1; } brw_set_dest(insn, dest); - brw_set_src0(insn, src); + brw_set_src0(insn, brw_null_reg()); brw_set_dp_write_message(p->brw, insn, 255, /* binding table index (255=stateless) */ - BRW_DATAPORT_OWORD_BLOCK_4_OWORDS, /* msg_control */ + msg_control, BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE, /* msg_type */ - msg_length, + mlen, GL_TRUE, /* header_present */ 0, /* pixel scoreboard */ send_commit_msg, /* response_length */ @@ -1423,46 +1451,68 @@ void brw_dp_WRITE_16( struct brw_compile *p, /** - * Read block of 16 dwords/floats from the data port Render Cache scratch buffer. - * Scratch offset should be a multiple of 64. - * Used for register spilling. + * Read a block of owords (half a GRF each) from the scratch buffer + * using a constant index per channel. + * + * Offset must be aligned to oword size (16 bytes). Used for register + * spilling. */ -void brw_dp_READ_16( struct brw_compile *p, - struct brw_reg dest, - GLuint scratch_offset ) +void +brw_oword_block_read(struct brw_compile *p, + struct brw_reg dest, + struct brw_reg mrf, + int num_regs, + GLuint offset) { - GLuint msg_reg_nr = 1; + uint32_t msg_control; + int rlen; + + mrf = retype(mrf, BRW_REGISTER_TYPE_UD); + dest = retype(dest, BRW_REGISTER_TYPE_UW); + + if (num_regs == 1) { + msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS; + rlen = 1; + } else { + msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS; + rlen = 2; + } + { brw_push_insn_state(p); brw_set_compression_control(p, BRW_COMPRESSION_NONE); brw_set_mask_control(p, BRW_MASK_DISABLE); + brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); + /* set message header global offset field (reg 0, element 2) */ brw_MOV(p, - retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_D), - brw_imm_d(scratch_offset)); + retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, + mrf.nr, + 2), BRW_REGISTER_TYPE_UD), + brw_imm_ud(offset)); brw_pop_insn_state(p); } { struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND); - - insn->header.predicate_control = 0; /* XXX */ - insn->header.compression_control = BRW_COMPRESSION_NONE; - insn->header.destreg__conditionalmod = msg_reg_nr; - + + assert(insn->header.predicate_control == 0); + insn->header.compression_control = BRW_COMPRESSION_NONE; + insn->header.destreg__conditionalmod = mrf.nr; + brw_set_dest(insn, dest); /* UW? */ - brw_set_src0(insn, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW)); + brw_set_src0(insn, brw_null_reg()); brw_set_dp_read_message(p->brw, insn, 255, /* binding table index (255=stateless) */ - BRW_DATAPORT_OWORD_BLOCK_4_OWORDS, + msg_control, BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */ 1, /* target cache (render/scratch) */ 1, /* msg_length */ - 2, /* response_length */ + rlen, 0); /* eot */ } } diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index 1a5808f44e..174f622d59 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -179,10 +179,6 @@ type_size(const struct glsl_type *type) } } -static const fs_reg reg_undef; -static const fs_reg reg_null_f(ARF, BRW_ARF_NULL, BRW_REGISTER_TYPE_F); -static const fs_reg reg_null_d(ARF, BRW_ARF_NULL, BRW_REGISTER_TYPE_D); - int fs_visitor::virtual_grf_alloc(int size) { @@ -503,7 +499,6 @@ fs_visitor::emit_frontfacing_interpolation(ir_variable *ir) *reg, fs_reg(1))); } else { - fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type); struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD); /* bit 31 is "primitive is back face", so checking < (1 << 31) gives * us front face @@ -2228,6 +2223,47 @@ fs_visitor::generate_discard_and(fs_inst *inst, struct brw_reg mask) } void +fs_visitor::generate_spill(fs_inst *inst, struct brw_reg src) +{ + assert(inst->mlen != 0); + + brw_MOV(p, + retype(brw_message_reg(inst->base_mrf + 1), BRW_REGISTER_TYPE_UD), + retype(src, BRW_REGISTER_TYPE_UD)); + brw_oword_block_write(p, brw_message_reg(inst->base_mrf), 1, inst->offset); +} + +void +fs_visitor::generate_unspill(fs_inst *inst, struct brw_reg dst) +{ + assert(inst->mlen != 0); + + /* Clear any post destination dependencies that would be ignored by + * the block read. See the B-Spec for pre-gen5 send instruction. + * + * This could use a better solution, since texture sampling and + * math reads could potentially run into it as well -- anywhere + * that we have a SEND with a destination that is a register that + * was written but not read within the last N instructions (what's + * N? unsure). This is rare because of dead code elimination, but + * not impossible. + */ + if (intel->gen == 4 && !intel->is_g4x) + brw_MOV(p, brw_null_reg(), dst); + + brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf), 1, + inst->offset); + + if (intel->gen == 4 && !intel->is_g4x) { + /* gen4 errata: destination from a send can't be used as a + * destination until it's been read. Just read it so we don't + * have to worry. + */ + brw_MOV(p, brw_null_reg(), dst); + } +} + +void fs_visitor::assign_curb_setup() { c->prog_data.first_curbe_grf = c->key.nr_payload_regs; @@ -2311,222 +2347,6 @@ fs_visitor::assign_urb_setup() this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length; } -static void -assign_reg(int *reg_hw_locations, fs_reg *reg) -{ - if (reg->file == GRF && reg->reg != 0) { - assert(reg->reg_offset >= 0); - reg->hw_reg = reg_hw_locations[reg->reg] + reg->reg_offset; - reg->reg = 0; - } -} - -void -fs_visitor::assign_regs_trivial() -{ - int last_grf = 0; - int hw_reg_mapping[this->virtual_grf_next]; - int i; - - hw_reg_mapping[0] = 0; - hw_reg_mapping[1] = this->first_non_payload_grf; - for (i = 2; i < this->virtual_grf_next; i++) { - hw_reg_mapping[i] = (hw_reg_mapping[i - 1] + - this->virtual_grf_sizes[i - 1]); - } - last_grf = hw_reg_mapping[i - 1] + this->virtual_grf_sizes[i - 1]; - - foreach_iter(exec_list_iterator, iter, this->instructions) { - fs_inst *inst = (fs_inst *)iter.get(); - - assign_reg(hw_reg_mapping, &inst->dst); - assign_reg(hw_reg_mapping, &inst->src[0]); - assign_reg(hw_reg_mapping, &inst->src[1]); - } - - this->grf_used = last_grf + 1; -} - -void -fs_visitor::assign_regs() -{ - int last_grf = 0; - int hw_reg_mapping[this->virtual_grf_next + 1]; - int base_reg_count = BRW_MAX_GRF - this->first_non_payload_grf; - int class_sizes[base_reg_count]; - int class_count = 0; - int aligned_pair_class = -1; - - /* Set up the register classes. - * - * The base registers store a scalar value. For texture samples, - * we get virtual GRFs composed of 4 contiguous hw register. For - * structures and arrays, we store them as contiguous larger things - * than that, though we should be able to do better most of the - * time. - */ - class_sizes[class_count++] = 1; - if (brw->has_pln && intel->gen < 6) { - /* Always set up the (unaligned) pairs for gen5, so we can find - * them for making the aligned pair class. - */ - class_sizes[class_count++] = 2; - } - for (int r = 1; r < this->virtual_grf_next; r++) { - int i; - - for (i = 0; i < class_count; i++) { - if (class_sizes[i] == this->virtual_grf_sizes[r]) - break; - } - if (i == class_count) { - if (this->virtual_grf_sizes[r] >= base_reg_count) { - fprintf(stderr, "Object too large to register allocate.\n"); - this->fail = true; - } - - class_sizes[class_count++] = this->virtual_grf_sizes[r]; - } - } - - int ra_reg_count = 0; - int class_base_reg[class_count]; - int class_reg_count[class_count]; - int classes[class_count + 1]; - - for (int i = 0; i < class_count; i++) { - class_base_reg[i] = ra_reg_count; - class_reg_count[i] = base_reg_count - (class_sizes[i] - 1); - ra_reg_count += class_reg_count[i]; - } - - struct ra_regs *regs = ra_alloc_reg_set(ra_reg_count); - for (int i = 0; i < class_count; i++) { - classes[i] = ra_alloc_reg_class(regs); - - for (int i_r = 0; i_r < class_reg_count[i]; i_r++) { - ra_class_add_reg(regs, classes[i], class_base_reg[i] + i_r); - } - - /* Add conflicts between our contiguous registers aliasing - * base regs and other register classes' contiguous registers - * that alias base regs, or the base regs themselves for classes[0]. - */ - for (int c = 0; c <= i; c++) { - for (int i_r = 0; i_r < class_reg_count[i]; i_r++) { - for (int c_r = MAX2(0, i_r - (class_sizes[c] - 1)); - c_r < MIN2(class_reg_count[c], i_r + class_sizes[i]); - c_r++) { - - if (0) { - printf("%d/%d conflicts %d/%d\n", - class_sizes[i], this->first_non_payload_grf + i_r, - class_sizes[c], this->first_non_payload_grf + c_r); - } - - ra_add_reg_conflict(regs, - class_base_reg[i] + i_r, - class_base_reg[c] + c_r); - } - } - } - } - - /* Add a special class for aligned pairs, which we'll put delta_x/y - * in on gen5 so that we can do PLN. - */ - if (brw->has_pln && intel->gen < 6) { - int reg_count = (base_reg_count - 1) / 2; - int unaligned_pair_class = 1; - assert(class_sizes[unaligned_pair_class] == 2); - - aligned_pair_class = class_count; - classes[aligned_pair_class] = ra_alloc_reg_class(regs); - class_sizes[aligned_pair_class] = 2; - class_base_reg[aligned_pair_class] = 0; - class_reg_count[aligned_pair_class] = 0; - int start = (this->first_non_payload_grf & 1) ? 1 : 0; - - for (int i = 0; i < reg_count; i++) { - ra_class_add_reg(regs, classes[aligned_pair_class], - class_base_reg[unaligned_pair_class] + i * 2 + start); - } - class_count++; - } - - ra_set_finalize(regs); - - struct ra_graph *g = ra_alloc_interference_graph(regs, - this->virtual_grf_next); - /* Node 0 is just a placeholder to keep virtual_grf[] mapping 1:1 - * with nodes. - */ - ra_set_node_class(g, 0, classes[0]); - - for (int i = 1; i < this->virtual_grf_next; i++) { - for (int c = 0; c < class_count; c++) { - if (class_sizes[c] == this->virtual_grf_sizes[i]) { - if (aligned_pair_class >= 0 && - this->delta_x.reg == i) { - ra_set_node_class(g, i, classes[aligned_pair_class]); - } else { - ra_set_node_class(g, i, classes[c]); - } - break; - } - } - - for (int j = 1; j < i; j++) { - if (virtual_grf_interferes(i, j)) { - ra_add_node_interference(g, i, j); - } - } - } - - /* FINISHME: Handle spilling */ - if (!ra_allocate_no_spills(g)) { - fprintf(stderr, "Failed to allocate registers.\n"); - this->fail = true; - return; - } - - /* Get the chosen virtual registers for each node, and map virtual - * regs in the register classes back down to real hardware reg - * numbers. - */ - hw_reg_mapping[0] = 0; /* unused */ - for (int i = 1; i < this->virtual_grf_next; i++) { - int reg = ra_get_node_reg(g, i); - int hw_reg = -1; - - for (int c = 0; c < class_count; c++) { - if (reg >= class_base_reg[c] && - reg < class_base_reg[c] + class_reg_count[c]) { - hw_reg = reg - class_base_reg[c]; - break; - } - } - - assert(hw_reg >= 0); - hw_reg_mapping[i] = this->first_non_payload_grf + hw_reg; - last_grf = MAX2(last_grf, - hw_reg_mapping[i] + this->virtual_grf_sizes[i] - 1); - } - - foreach_iter(exec_list_iterator, iter, this->instructions) { - fs_inst *inst = (fs_inst *)iter.get(); - - assign_reg(hw_reg_mapping, &inst->dst); - assign_reg(hw_reg_mapping, &inst->src[0]); - assign_reg(hw_reg_mapping, &inst->src[1]); - } - - this->grf_used = last_grf + 1; - - talloc_free(g); - talloc_free(regs); -} - /** * Split large virtual GRFs into separate components if we can. * @@ -2621,6 +2441,7 @@ fs_visitor::calculate_live_intervals() int *use = talloc_array(mem_ctx, int, num_vars); int loop_depth = 0; int loop_start = 0; + int bb_header_ip = 0; for (int i = 0; i < num_vars; i++) { def[i] = 1 << 30; @@ -2638,12 +2459,8 @@ fs_visitor::calculate_live_intervals() loop_depth--; if (loop_depth == 0) { - /* FINISHME: - * - * Patches up any vars marked for use within the loop as - * live until the end. This is conservative, as there - * will often be variables defined and used inside the - * loop but dead at the end of the loop body. + /* Patches up the use of vars marked for being live across + * the whole loop. */ for (int i = 0; i < num_vars; i++) { if (use[i] == loop_start) { @@ -2652,22 +2469,53 @@ fs_visitor::calculate_live_intervals() } } } else { - int eip = ip; - - if (loop_depth) - eip = loop_start; - for (unsigned int i = 0; i < 3; i++) { if (inst->src[i].file == GRF && inst->src[i].reg != 0) { - use[inst->src[i].reg] = MAX2(use[inst->src[i].reg], eip); + int reg = inst->src[i].reg; + + if (!loop_depth || (this->virtual_grf_sizes[reg] == 1 && + def[reg] >= bb_header_ip)) { + use[reg] = ip; + } else { + def[reg] = MIN2(loop_start, def[reg]); + use[reg] = loop_start; + + /* Nobody else is going to go smash our start to + * later in the loop now, because def[reg] now + * points before the bb header. + */ + } } } if (inst->dst.file == GRF && inst->dst.reg != 0) { - def[inst->dst.reg] = MIN2(def[inst->dst.reg], eip); + int reg = inst->dst.reg; + + if (!loop_depth || (this->virtual_grf_sizes[reg] == 1 && + !inst->predicated)) { + def[reg] = MIN2(def[reg], ip); + } else { + def[reg] = MIN2(def[reg], loop_start); + } } } ip++; + + /* Set the basic block header IP. This is used for determining + * if a complete def of single-register virtual GRF in a loop + * dominates a use in the same basic block. It's a quick way to + * reduce the live interval range of most register used in a + * loop. + */ + if (inst->opcode == BRW_OPCODE_IF || + inst->opcode == BRW_OPCODE_ELSE || + inst->opcode == BRW_OPCODE_ENDIF || + inst->opcode == BRW_OPCODE_DO || + inst->opcode == BRW_OPCODE_WHILE || + inst->opcode == BRW_OPCODE_BREAK || + inst->opcode == BRW_OPCODE_CONTINUE) { + bb_header_ip = ip; + } } talloc_free(this->virtual_grf_def); @@ -3257,6 +3105,15 @@ fs_visitor::generate_code() case FS_OPCODE_DDY: generate_ddy(inst, dst, src[0]); break; + + case FS_OPCODE_SPILL: + generate_spill(inst, src[0]); + break; + + case FS_OPCODE_UNSPILL: + generate_unspill(inst, dst); + break; + case FS_OPCODE_FB_WRITE: generate_fb_write(inst); break; @@ -3362,10 +3219,25 @@ brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c) progress = v.dead_code_eliminate() || progress; } while (progress); + if (0) { + /* Debug of register spilling: Go spill everything. */ + int virtual_grf_count = v.virtual_grf_next; + for (int i = 1; i < virtual_grf_count; i++) { + v.spill_reg(i); + } + v.calculate_live_intervals(); + } + if (0) v.assign_regs_trivial(); - else - v.assign_regs(); + else { + while (!v.assign_regs()) { + if (v.fail) + break; + + v.calculate_live_intervals(); + } + } } if (!v.fail) @@ -3408,7 +3280,6 @@ brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c) } c->prog_data.total_grf = v.grf_used; - c->prog_data.total_scratch = 0; return GL_TRUE; } diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h index b8126083f9..de7137a7db 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.h +++ b/src/mesa/drivers/dri/i965/brw_fs.h @@ -74,6 +74,8 @@ enum fs_opcodes { FS_OPCODE_TXL, FS_OPCODE_DISCARD_NOT, FS_OPCODE_DISCARD_AND, + FS_OPCODE_SPILL, + FS_OPCODE_UNSPILL, }; @@ -196,6 +198,7 @@ public: this->shadow_compare = false; this->mlen = 0; this->base_mrf = 0; + this->offset = 0; } fs_inst() @@ -281,6 +284,7 @@ public: bool eot; bool header_present; bool shadow_compare; + uint32_t offset; /* spill/unspill offset */ /** @{ * Annotation for the generated IR. One of the two can be set. @@ -359,8 +363,10 @@ public: void assign_curb_setup(); void calculate_urb_setup(); void assign_urb_setup(); - void assign_regs(); + bool assign_regs(); void assign_regs_trivial(); + int choose_spill_reg(struct ra_graph *g); + void spill_reg(int spill_reg); void split_virtual_grfs(); void calculate_live_intervals(); bool propagate_constants(); @@ -378,6 +384,8 @@ public: void generate_discard_and(fs_inst *inst, struct brw_reg temp); void generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg src); void generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src); + void generate_spill(fs_inst *inst, struct brw_reg src); + void generate_unspill(fs_inst *inst, struct brw_reg dst); void emit_dummy_fs(); fs_reg *emit_fragcoord_interpolation(ir_variable *ir); @@ -391,6 +399,7 @@ public: fs_inst *emit_math(fs_opcodes op, fs_reg dst, fs_reg src0, fs_reg src1); void emit_bool_to_cond_code(ir_rvalue *condition); void emit_if_gen6(ir_if *ir); + void emit_unspill(fs_inst *inst, fs_reg reg, uint32_t spill_offset); void emit_fb_writes(); void emit_assignment_writes(fs_reg &l, fs_reg &r, @@ -444,5 +453,9 @@ public: int grf_used; }; +static const fs_reg reg_undef; +static const fs_reg reg_null_f(ARF, BRW_ARF_NULL, BRW_REGISTER_TYPE_F); +static const fs_reg reg_null_d(ARF, BRW_ARF_NULL, BRW_REGISTER_TYPE_D); + GLboolean brw_do_channel_expressions(struct exec_list *instructions); GLboolean brw_do_vector_splitting(struct exec_list *instructions); diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp new file mode 100644 index 0000000000..b5bfd00d5f --- /dev/null +++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp @@ -0,0 +1,418 @@ +/* + * Copyright © 2010 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Authors: + * Eric Anholt <eric@anholt.net> + * + */ + +extern "C" { + +#include <sys/types.h> + +#include "main/macros.h" +#include "main/shaderobj.h" +#include "main/uniforms.h" +#include "program/prog_parameter.h" +#include "program/prog_print.h" +#include "program/prog_optimize.h" +#include "program/register_allocate.h" +#include "program/sampler.h" +#include "program/hash_table.h" +#include "brw_context.h" +#include "brw_eu.h" +#include "brw_wm.h" +#include "talloc.h" +} +#include "brw_fs.h" +#include "../glsl/glsl_types.h" +#include "../glsl/ir_optimization.h" +#include "../glsl/ir_print_visitor.h" + +static void +assign_reg(int *reg_hw_locations, fs_reg *reg) +{ + if (reg->file == GRF && reg->reg != 0) { + assert(reg->reg_offset >= 0); + reg->hw_reg = reg_hw_locations[reg->reg] + reg->reg_offset; + reg->reg = 0; + } +} + +void +fs_visitor::assign_regs_trivial() +{ + int last_grf = 0; + int hw_reg_mapping[this->virtual_grf_next]; + int i; + + hw_reg_mapping[0] = 0; + hw_reg_mapping[1] = this->first_non_payload_grf; + for (i = 2; i < this->virtual_grf_next; i++) { + hw_reg_mapping[i] = (hw_reg_mapping[i - 1] + + this->virtual_grf_sizes[i - 1]); + } + last_grf = hw_reg_mapping[i - 1] + this->virtual_grf_sizes[i - 1]; + + foreach_iter(exec_list_iterator, iter, this->instructions) { + fs_inst *inst = (fs_inst *)iter.get(); + + assign_reg(hw_reg_mapping, &inst->dst); + assign_reg(hw_reg_mapping, &inst->src[0]); + assign_reg(hw_reg_mapping, &inst->src[1]); + } + + this->grf_used = last_grf + 1; +} + +bool +fs_visitor::assign_regs() +{ + int last_grf = 0; + int hw_reg_mapping[this->virtual_grf_next + 1]; + int base_reg_count = BRW_MAX_GRF - this->first_non_payload_grf; + int class_sizes[base_reg_count]; + int class_count = 0; + int aligned_pair_class = -1; + + /* Set up the register classes. + * + * The base registers store a scalar value. For texture samples, + * we get virtual GRFs composed of 4 contiguous hw register. For + * structures and arrays, we store them as contiguous larger things + * than that, though we should be able to do better most of the + * time. + */ + class_sizes[class_count++] = 1; + if (brw->has_pln && intel->gen < 6) { + /* Always set up the (unaligned) pairs for gen5, so we can find + * them for making the aligned pair class. + */ + class_sizes[class_count++] = 2; + } + for (int r = 1; r < this->virtual_grf_next; r++) { + int i; + + for (i = 0; i < class_count; i++) { + if (class_sizes[i] == this->virtual_grf_sizes[r]) + break; + } + if (i == class_count) { + if (this->virtual_grf_sizes[r] >= base_reg_count) { + fprintf(stderr, "Object too large to register allocate.\n"); + this->fail = true; + } + + class_sizes[class_count++] = this->virtual_grf_sizes[r]; + } + } + + int ra_reg_count = 0; + int class_base_reg[class_count]; + int class_reg_count[class_count]; + int classes[class_count + 1]; + + for (int i = 0; i < class_count; i++) { + class_base_reg[i] = ra_reg_count; + class_reg_count[i] = base_reg_count - (class_sizes[i] - 1); + ra_reg_count += class_reg_count[i]; + } + + struct ra_regs *regs = ra_alloc_reg_set(ra_reg_count); + for (int i = 0; i < class_count; i++) { + classes[i] = ra_alloc_reg_class(regs); + + for (int i_r = 0; i_r < class_reg_count[i]; i_r++) { + ra_class_add_reg(regs, classes[i], class_base_reg[i] + i_r); + } + + /* Add conflicts between our contiguous registers aliasing + * base regs and other register classes' contiguous registers + * that alias base regs, or the base regs themselves for classes[0]. + */ + for (int c = 0; c <= i; c++) { + for (int i_r = 0; i_r < class_reg_count[i]; i_r++) { + for (int c_r = MAX2(0, i_r - (class_sizes[c] - 1)); + c_r < MIN2(class_reg_count[c], i_r + class_sizes[i]); + c_r++) { + + if (0) { + printf("%d/%d conflicts %d/%d\n", + class_sizes[i], this->first_non_payload_grf + i_r, + class_sizes[c], this->first_non_payload_grf + c_r); + } + + ra_add_reg_conflict(regs, + class_base_reg[i] + i_r, + class_base_reg[c] + c_r); + } + } + } + } + + /* Add a special class for aligned pairs, which we'll put delta_x/y + * in on gen5 so that we can do PLN. + */ + if (brw->has_pln && intel->gen < 6) { + int reg_count = (base_reg_count - 1) / 2; + int unaligned_pair_class = 1; + assert(class_sizes[unaligned_pair_class] == 2); + + aligned_pair_class = class_count; + classes[aligned_pair_class] = ra_alloc_reg_class(regs); + class_sizes[aligned_pair_class] = 2; + class_base_reg[aligned_pair_class] = 0; + class_reg_count[aligned_pair_class] = 0; + int start = (this->first_non_payload_grf & 1) ? 1 : 0; + + for (int i = 0; i < reg_count; i++) { + ra_class_add_reg(regs, classes[aligned_pair_class], + class_base_reg[unaligned_pair_class] + i * 2 + start); + } + class_count++; + } + + ra_set_finalize(regs); + + struct ra_graph *g = ra_alloc_interference_graph(regs, + this->virtual_grf_next); + /* Node 0 is just a placeholder to keep virtual_grf[] mapping 1:1 + * with nodes. + */ + ra_set_node_class(g, 0, classes[0]); + + for (int i = 1; i < this->virtual_grf_next; i++) { + for (int c = 0; c < class_count; c++) { + if (class_sizes[c] == this->virtual_grf_sizes[i]) { + if (aligned_pair_class >= 0 && + this->delta_x.reg == i) { + ra_set_node_class(g, i, classes[aligned_pair_class]); + } else { + ra_set_node_class(g, i, classes[c]); + } + break; + } + } + + for (int j = 1; j < i; j++) { + if (virtual_grf_interferes(i, j)) { + ra_add_node_interference(g, i, j); + } + } + } + + if (!ra_allocate_no_spills(g)) { + /* Failed to allocate registers. Spill a reg, and the caller will + * loop back into here to try again. + */ + int reg = choose_spill_reg(g); + if (reg == -1) { + this->fail = true; + } else { + spill_reg(reg); + } + + + talloc_free(g); + talloc_free(regs); + + return false; + } + + /* Get the chosen virtual registers for each node, and map virtual + * regs in the register classes back down to real hardware reg + * numbers. + */ + hw_reg_mapping[0] = 0; /* unused */ + for (int i = 1; i < this->virtual_grf_next; i++) { + int reg = ra_get_node_reg(g, i); + int hw_reg = -1; + + for (int c = 0; c < class_count; c++) { + if (reg >= class_base_reg[c] && + reg < class_base_reg[c] + class_reg_count[c]) { + hw_reg = reg - class_base_reg[c]; + break; + } + } + + assert(hw_reg >= 0); + hw_reg_mapping[i] = this->first_non_payload_grf + hw_reg; + last_grf = MAX2(last_grf, + hw_reg_mapping[i] + this->virtual_grf_sizes[i] - 1); + } + + foreach_iter(exec_list_iterator, iter, this->instructions) { + fs_inst *inst = (fs_inst *)iter.get(); + + assign_reg(hw_reg_mapping, &inst->dst); + assign_reg(hw_reg_mapping, &inst->src[0]); + assign_reg(hw_reg_mapping, &inst->src[1]); + } + + this->grf_used = last_grf + 1; + + talloc_free(g); + talloc_free(regs); + + return true; +} + +void +fs_visitor::emit_unspill(fs_inst *inst, fs_reg dst, uint32_t spill_offset) +{ + int size = virtual_grf_sizes[dst.reg]; + dst.reg_offset = 0; + + for (int chan = 0; chan < size; chan++) { + fs_inst *unspill_inst = new(mem_ctx) fs_inst(FS_OPCODE_UNSPILL, + dst); + dst.reg_offset++; + unspill_inst->offset = spill_offset + chan * REG_SIZE; + unspill_inst->ir = inst->ir; + unspill_inst->annotation = inst->annotation; + + /* Choose a MRF that won't conflict with an MRF that's live across the + * spill. Nothing else will make it up to MRF 14/15. + */ + unspill_inst->base_mrf = 14; + unspill_inst->mlen = 1; /* header contains offset */ + inst->insert_before(unspill_inst); + } +} + +int +fs_visitor::choose_spill_reg(struct ra_graph *g) +{ + float loop_scale = 1.0; + float spill_costs[this->virtual_grf_next]; + bool no_spill[this->virtual_grf_next]; + + for (int i = 0; i < this->virtual_grf_next; i++) { + spill_costs[i] = 0.0; + no_spill[i] = false; + } + + /* Calculate costs for spilling nodes. Call it a cost of 1 per + * spill/unspill we'll have to do, and guess that the insides of + * loops run 10 times. + */ + foreach_iter(exec_list_iterator, iter, this->instructions) { + fs_inst *inst = (fs_inst *)iter.get(); + + for (unsigned int i = 0; i < 3; i++) { + if (inst->src[i].file == GRF) { + int size = virtual_grf_sizes[inst->src[i].reg]; + spill_costs[inst->src[i].reg] += size * loop_scale; + } + } + + if (inst->dst.file == GRF) { + int size = virtual_grf_sizes[inst->dst.reg]; + spill_costs[inst->dst.reg] += size * loop_scale; + } + + switch (inst->opcode) { + + case BRW_OPCODE_DO: + loop_scale *= 10; + break; + + case BRW_OPCODE_WHILE: + loop_scale /= 10; + break; + + case FS_OPCODE_SPILL: + if (inst->src[0].file == GRF) + no_spill[inst->src[0].reg] = true; + break; + + case FS_OPCODE_UNSPILL: + if (inst->dst.file == GRF) + no_spill[inst->dst.reg] = true; + break; + } + } + + for (int i = 0; i < this->virtual_grf_next; i++) { + if (!no_spill[i]) + ra_set_node_spill_cost(g, i, spill_costs[i]); + } + + return ra_get_best_spill_node(g); +} + +void +fs_visitor::spill_reg(int spill_reg) +{ + int size = virtual_grf_sizes[spill_reg]; + unsigned int spill_offset = c->last_scratch; + assert(ALIGN(spill_offset, 16) == spill_offset); /* oword read/write req. */ + c->last_scratch += size * REG_SIZE; + + /* Generate spill/unspill instructions for the objects being + * spilled. Right now, we spill or unspill the whole thing to a + * virtual grf of the same size. For most instructions, though, we + * could just spill/unspill the GRF being accessed. + */ + foreach_iter(exec_list_iterator, iter, this->instructions) { + fs_inst *inst = (fs_inst *)iter.get(); + + for (unsigned int i = 0; i < 3; i++) { + if (inst->src[i].file == GRF && + inst->src[i].reg == spill_reg) { + inst->src[i].reg = virtual_grf_alloc(size); + emit_unspill(inst, inst->src[i], spill_offset); + } + } + + if (inst->dst.file == GRF && + inst->dst.reg == spill_reg) { + inst->dst.reg = virtual_grf_alloc(size); + + /* Since we spill/unspill the whole thing even if we access + * just a component, we may need to unspill before the + * instruction we're spilling for. + */ + if (size != 1 || inst->predicated) { + emit_unspill(inst, inst->dst, spill_offset); + } + + fs_reg spill_src = inst->dst; + spill_src.reg_offset = 0; + spill_src.abs = false; + spill_src.negate = false; + + for (int chan = 0; chan < size; chan++) { + fs_inst *spill_inst = new(mem_ctx) fs_inst(FS_OPCODE_SPILL, + reg_null_f, spill_src); + spill_src.reg_offset++; + spill_inst->offset = spill_offset + chan * REG_SIZE; + spill_inst->ir = inst->ir; + spill_inst->annotation = inst->annotation; + spill_inst->base_mrf = 14; + spill_inst->mlen = 2; /* header, value */ + inst->insert_after(spill_inst); + } + } + } +} diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c index f2ce756564..7f3ba5f058 100644 --- a/src/mesa/drivers/dri/i965/brw_wm.c +++ b/src/mesa/drivers/dri/i965/brw_wm.c @@ -114,14 +114,6 @@ brw_wm_non_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c) /* how many general-purpose registers are used */ c->prog_data.total_grf = c->max_wm_grf; - /* Scratch space is used for register spilling */ - if (c->last_scratch) { - c->prog_data.total_scratch = c->last_scratch + 0x40; - } - else { - c->prog_data.total_scratch = 0; - } - /* Emit GEN4 code. */ brw_wm_emit(c); @@ -193,6 +185,19 @@ static void do_wm_prog( struct brw_context *brw, } } + /* Scratch space is used for register spilling */ + if (c->last_scratch) { + /* Per-thread scratch space is power-of-two sized. */ + for (c->prog_data.total_scratch = 1024; + c->prog_data.total_scratch <= c->last_scratch; + c->prog_data.total_scratch *= 2) { + /* empty */ + } + } + else { + c->prog_data.total_scratch = 0; + } + if (INTEL_DEBUG & DEBUG_WM) fprintf(stderr, "\n"); diff --git a/src/mesa/drivers/dri/i965/brw_wm_emit.c b/src/mesa/drivers/dri/i965/brw_wm_emit.c index cb71c665b4..88bc64e5dd 100644 --- a/src/mesa/drivers/dri/i965/brw_wm_emit.c +++ b/src/mesa/drivers/dri/i965/brw_wm_emit.c @@ -1576,9 +1576,7 @@ static void emit_spill( struct brw_wm_compile *c, mov (1) r0.2<1>:d 0x00000080:d { Align1 NoMask } send (16) null.0<1>:uw m1 r0.0<8;8,1>:uw 0x053003ff:ud { Align1 } */ - brw_dp_WRITE_16(p, - retype(vec16(brw_vec8_grf(0, 0)), BRW_REGISTER_TYPE_UW), - slot); + brw_oword_block_write(p, brw_message_reg(1), 2, slot); } @@ -1603,9 +1601,7 @@ static void emit_unspill( struct brw_wm_compile *c, send (16) r110.0<1>:uw m1 r0.0<8;8,1>:uw 0x041243ff:ud { Align1 } */ - brw_dp_READ_16(p, - retype(vec16(reg), BRW_REGISTER_TYPE_UW), - slot); + brw_oword_block_read(p, vec16(reg), brw_message_reg(1), 2, slot); } diff --git a/src/mesa/drivers/dri/i965/brw_wm_state.c b/src/mesa/drivers/dri/i965/brw_wm_state.c index 411204e704..433ccc66f0 100644 --- a/src/mesa/drivers/dri/i965/brw_wm_state.c +++ b/src/mesa/drivers/dri/i965/brw_wm_state.c @@ -97,7 +97,7 @@ wm_unit_populate_key(struct brw_context *brw, struct brw_wm_unit_key *key) key->urb_entry_read_length = brw->wm.prog_data->urb_read_length; key->curb_entry_read_length = brw->wm.prog_data->curb_read_length; key->dispatch_grf_start_reg = brw->wm.prog_data->first_curbe_grf; - key->total_scratch = ALIGN(brw->wm.prog_data->total_scratch, 1024); + key->total_scratch = brw->wm.prog_data->total_scratch; /* BRW_NEW_URB_FENCE */ key->urb_size = brw->urb.vsize; @@ -184,7 +184,7 @@ wm_unit_create_from_key(struct brw_context *brw, struct brw_wm_unit_key *key, if (key->total_scratch != 0) { wm.thread2.scratch_space_base_pointer = brw->wm.scratch_bo->offset >> 10; /* reloc */ - wm.thread2.per_thread_scratch_space = key->total_scratch / 1024 - 1; + wm.thread2.per_thread_scratch_space = ffs(key->total_scratch) - 11; } else { wm.thread2.scratch_space_base_pointer = 0; wm.thread2.per_thread_scratch_space = 0; @@ -293,7 +293,6 @@ static void upload_wm_unit( struct brw_context *brw ) * bother reducing the allocation later, since we use scratch so * rarely. */ - assert(key.total_scratch <= 12 * 1024); if (key.total_scratch) { GLuint total = key.total_scratch * brw->wm_max_threads; diff --git a/src/mesa/main/framebuffer.c b/src/mesa/main/framebuffer.c index af3b5dfcf9..7d55a84dfc 100644 --- a/src/mesa/main/framebuffer.c +++ b/src/mesa/main/framebuffer.c @@ -868,6 +868,7 @@ _mesa_source_buffer_exists(struct gl_context *ctx, GLenum format) case GL_LUMINANCE: case GL_LUMINANCE_ALPHA: case GL_INTENSITY: + case GL_RG: case GL_RGB: case GL_BGR: case GL_RGBA: diff --git a/src/mesa/main/image.c b/src/mesa/main/image.c index 2c3af332c0..4b5d7dadc9 100644 --- a/src/mesa/main/image.c +++ b/src/mesa/main/image.c @@ -2107,6 +2107,12 @@ _mesa_pack_rgba_span_float(struct gl_context *ctx, GLuint n, GLfloat rgba[][4], dst[i*2+1] = FLOAT_TO_UBYTE(rgba[i][ACOMP]); } break; + case GL_RG: + for (i=0;i<n;i++) { + dst[i*2+0] = FLOAT_TO_UBYTE(rgba[i][RCOMP]); + dst[i*2+1] = FLOAT_TO_UBYTE(rgba[i][GCOMP]); + } + break; case GL_RGB: for (i=0;i<n;i++) { dst[i*3+0] = FLOAT_TO_UBYTE(rgba[i][RCOMP]); @@ -2187,6 +2193,12 @@ _mesa_pack_rgba_span_float(struct gl_context *ctx, GLuint n, GLfloat rgba[][4], dst[i*2+1] = FLOAT_TO_BYTE(rgba[i][ACOMP]); } break; + case GL_RG: + for (i=0;i<n;i++) { + dst[i*2+0] = FLOAT_TO_BYTE(rgba[i][RCOMP]); + dst[i*2+1] = FLOAT_TO_BYTE(rgba[i][GCOMP]); + } + break; case GL_RGB: for (i=0;i<n;i++) { dst[i*3+0] = FLOAT_TO_BYTE(rgba[i][RCOMP]); @@ -2267,6 +2279,12 @@ _mesa_pack_rgba_span_float(struct gl_context *ctx, GLuint n, GLfloat rgba[][4], CLAMPED_FLOAT_TO_USHORT(dst[i*2+1], rgba[i][ACOMP]); } break; + case GL_RG: + for (i=0;i<n;i++) { + CLAMPED_FLOAT_TO_USHORT(dst[i*2+0], rgba[i][RCOMP]); + CLAMPED_FLOAT_TO_USHORT(dst[i*2+1], rgba[i][GCOMP]); + } + break; case GL_RGB: for (i=0;i<n;i++) { CLAMPED_FLOAT_TO_USHORT(dst[i*3+0], rgba[i][RCOMP]); @@ -2347,6 +2365,12 @@ _mesa_pack_rgba_span_float(struct gl_context *ctx, GLuint n, GLfloat rgba[][4], dst[i*2+1] = FLOAT_TO_SHORT(rgba[i][ACOMP]); } break; + case GL_RG: + for (i=0;i<n;i++) { + dst[i*2+0] = FLOAT_TO_SHORT(rgba[i][RCOMP]); + dst[i*2+1] = FLOAT_TO_SHORT(rgba[i][GCOMP]); + } + break; case GL_RGB: for (i=0;i<n;i++) { dst[i*3+0] = FLOAT_TO_SHORT(rgba[i][RCOMP]); @@ -2427,6 +2451,12 @@ _mesa_pack_rgba_span_float(struct gl_context *ctx, GLuint n, GLfloat rgba[][4], dst[i*2+1] = FLOAT_TO_UINT(rgba[i][ACOMP]); } break; + case GL_RG: + for (i=0;i<n;i++) { + dst[i*2+0] = FLOAT_TO_UINT(rgba[i][RCOMP]); + dst[i*2+1] = FLOAT_TO_UINT(rgba[i][GCOMP]); + } + break; case GL_RGB: for (i=0;i<n;i++) { dst[i*3+0] = FLOAT_TO_UINT(rgba[i][RCOMP]); @@ -2507,6 +2537,12 @@ _mesa_pack_rgba_span_float(struct gl_context *ctx, GLuint n, GLfloat rgba[][4], dst[i*2+1] = FLOAT_TO_INT(rgba[i][ACOMP]); } break; + case GL_RG: + for (i=0;i<n;i++) { + dst[i*2+0] = FLOAT_TO_INT(rgba[i][RCOMP]); + dst[i*2+1] = FLOAT_TO_INT(rgba[i][GCOMP]); + } + break; case GL_RGB: for (i=0;i<n;i++) { dst[i*3+0] = FLOAT_TO_INT(rgba[i][RCOMP]); @@ -2587,6 +2623,12 @@ _mesa_pack_rgba_span_float(struct gl_context *ctx, GLuint n, GLfloat rgba[][4], dst[i*2+1] = rgba[i][ACOMP]; } break; + case GL_RG: + for (i=0;i<n;i++) { + dst[i*2+0] = rgba[i][RCOMP]; + dst[i*2+1] = rgba[i][GCOMP]; + } + break; case GL_RGB: for (i=0;i<n;i++) { dst[i*3+0] = rgba[i][RCOMP]; @@ -2667,6 +2709,12 @@ _mesa_pack_rgba_span_float(struct gl_context *ctx, GLuint n, GLfloat rgba[][4], dst[i*2+1] = _mesa_float_to_half(rgba[i][ACOMP]); } break; + case GL_RG: + for (i=0;i<n;i++) { + dst[i*2+0] = _mesa_float_to_half(rgba[i][RCOMP]); + dst[i*2+1] = _mesa_float_to_half(rgba[i][GCOMP]); + } + break; case GL_RGB: for (i=0;i<n;i++) { dst[i*3+0] = _mesa_float_to_half(rgba[i][RCOMP]); diff --git a/src/mesa/program/register_allocate.c b/src/mesa/program/register_allocate.c index 03f04697bf..a6aa7f39cb 100644 --- a/src/mesa/program/register_allocate.c +++ b/src/mesa/program/register_allocate.c @@ -72,6 +72,7 @@ struct ra_node { unsigned int adjacency_count; unsigned int reg; GLboolean in_stack; + float spill_cost; }; struct ra_graph { @@ -359,3 +360,65 @@ ra_get_node_reg(struct ra_graph *g, unsigned int n) { return g->nodes[n].reg; } + +static float +ra_get_spill_benefit(struct ra_graph *g, unsigned int n) +{ + int j; + float benefit = 0; + int n_class = g->nodes[n].class; + + /* Define the benefit of eliminating an interference between n, j + * through spilling as q(C, B) / p(C). This is similar to the + * "count number of edges" approach of traditional graph coloring, + * but takes classes into account. + */ + for (j = 0; j < g->count; j++) { + if (j != n && g->nodes[n].adjacency[j]) { + unsigned int j_class = g->nodes[j].class; + benefit += ((float)g->regs->classes[n_class]->q[j_class] / + g->regs->classes[n_class]->p); + break; + } + } + + return benefit; +} + +/** + * Returns a node number to be spilled according to the cost/benefit using + * the pq test, or -1 if there are no spillable nodes. + */ +int +ra_get_best_spill_node(struct ra_graph *g) +{ + unsigned int best_node = -1; + unsigned int best_benefit = 0.0; + unsigned int n; + + for (n = 0; n < g->count; n++) { + float cost = g->nodes[n].spill_cost; + + if (cost <= 0.0) + continue; + + float benefit = ra_get_spill_benefit(g, n); + + if (benefit / cost > best_benefit) { + best_benefit = benefit / cost; + best_node = n; + } + } + + return best_node; +} + +/** + * Only nodes with a spill cost set (cost != 0.0) will be considered + * for register spilling. + */ +void +ra_set_node_spill_cost(struct ra_graph *g, unsigned int n, float cost) +{ + g->nodes[n].spill_cost = cost; +} diff --git a/src/mesa/program/register_allocate.h b/src/mesa/program/register_allocate.h index 42647b50b8..198b89f2d7 100644 --- a/src/mesa/program/register_allocate.h +++ b/src/mesa/program/register_allocate.h @@ -65,5 +65,7 @@ GLboolean ra_select(struct ra_graph *g); GLboolean ra_allocate_no_spills(struct ra_graph *g); unsigned int ra_get_node_reg(struct ra_graph *g, unsigned int n); +void ra_set_node_spill_cost(struct ra_graph *g, unsigned int n, float cost); +int ra_get_best_spill_node(struct ra_graph *g); /** @} */ diff --git a/src/mesa/state_tracker/st_format.c b/src/mesa/state_tracker/st_format.c index 4e90bd01a3..7ad48ecea0 100644 --- a/src/mesa/state_tracker/st_format.c +++ b/src/mesa/state_tracker/st_format.c @@ -547,6 +547,8 @@ st_choose_format(struct pipe_screen *screen, GLenum internalFormat, } return PIPE_FORMAT_NONE; + case GL_COMPRESSED_RED: + case GL_COMPRESSED_RG: case GL_COMPRESSED_RGB: /* can only sample from compressed formats */ if (bindings & ~PIPE_BIND_SAMPLER_VIEW) |