diff options
Diffstat (limited to 'src/mesa/drivers/dri/i965')
34 files changed, 1092 insertions, 256 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_clip.c b/src/mesa/drivers/dri/i965/brw_clip.c index 228ee3f3be..a1e9dae915 100644 --- a/src/mesa/drivers/dri/i965/brw_clip.c +++ b/src/mesa/drivers/dri/i965/brw_clip.c @@ -55,6 +55,7 @@ static void compile_clip_prog( struct brw_context *brw, GLuint program_size; GLuint delta; GLuint i; + GLuint header_regs; memset(&c, 0, sizeof(c)); @@ -72,22 +73,28 @@ static void compile_clip_prog( struct brw_context *brw, c.header_position_offset = ATTR_SIZE; if (intel->gen == 5) - delta = 3 * REG_SIZE; + header_regs = 3; else - delta = REG_SIZE; + header_regs = 1; - for (i = 0; i < VERT_RESULT_MAX; i++) + delta = header_regs * REG_SIZE; + + for (i = 0; i < VERT_RESULT_MAX; i++) { if (c.key.attrs & BITFIELD64_BIT(i)) { c.offset[i] = delta; delta += ATTR_SIZE; + + c.idx_to_attr[c.nr_attrs] = i; + c.nr_attrs++; } + } - c.nr_attrs = brw_count_bits(c.key.attrs); - - if (intel->gen == 5) - c.nr_regs = (c.nr_attrs + 1) / 2 + 3; /* are vertices packed, or reg-aligned? */ - else - c.nr_regs = (c.nr_attrs + 1) / 2 + 1; /* are vertices packed, or reg-aligned? */ + /* The vertex attributes start at a URB row-aligned offset after + * the 8-20 dword vertex header, and continue for a URB row-aligned + * length. nr_regs determines the urb_read_length from the start + * of the header to the end of the vertex data. + */ + c.nr_regs = header_regs + (c.nr_attrs + 1) / 2; c.nr_bytes = c.nr_regs * REG_SIZE; diff --git a/src/mesa/drivers/dri/i965/brw_clip.h b/src/mesa/drivers/dri/i965/brw_clip.h index 68222c6c27..3a8cd7bf39 100644 --- a/src/mesa/drivers/dri/i965/brw_clip.h +++ b/src/mesa/drivers/dri/i965/brw_clip.h @@ -115,7 +115,10 @@ struct brw_clip_compile { GLboolean need_direction; GLuint header_position_offset; - GLuint offset[VERT_ATTRIB_MAX]; + /** Mapping from VERT_RESULT_* to offset within the VUE. */ + GLuint offset[VERT_RESULT_MAX]; + /** Mapping from attribute index to VERT_RESULT_* */ + GLuint idx_to_attr[VERT_RESULT_MAX]; }; #define ATTR_SIZE (4*4) diff --git a/src/mesa/drivers/dri/i965/brw_clip_line.c b/src/mesa/drivers/dri/i965/brw_clip_line.c index ceb62a3116..4b9117bb0b 100644 --- a/src/mesa/drivers/dri/i965/brw_clip_line.c +++ b/src/mesa/drivers/dri/i965/brw_clip_line.c @@ -32,7 +32,7 @@ #include "main/glheader.h" #include "main/macros.h" #include "main/enums.h" -#include "shader/program.h" +#include "program/program.h" #include "intel_batchbuffer.h" diff --git a/src/mesa/drivers/dri/i965/brw_clip_point.c b/src/mesa/drivers/dri/i965/brw_clip_point.c index 7f47634dca..b994a32bc3 100644 --- a/src/mesa/drivers/dri/i965/brw_clip_point.c +++ b/src/mesa/drivers/dri/i965/brw_clip_point.c @@ -32,7 +32,7 @@ #include "main/glheader.h" #include "main/macros.h" #include "main/enums.h" -#include "shader/program.h" +#include "program/program.h" #include "intel_batchbuffer.h" diff --git a/src/mesa/drivers/dri/i965/brw_clip_tri.c b/src/mesa/drivers/dri/i965/brw_clip_tri.c index 916a99ea00..cb58d1da9f 100644 --- a/src/mesa/drivers/dri/i965/brw_clip_tri.c +++ b/src/mesa/drivers/dri/i965/brw_clip_tri.c @@ -32,7 +32,7 @@ #include "main/glheader.h" #include "main/macros.h" #include "main/enums.h" -#include "shader/program.h" +#include "program/program.h" #include "intel_batchbuffer.h" @@ -76,10 +76,7 @@ void brw_clip_tri_alloc_regs( struct brw_clip_compile *c, if (c->nr_attrs & 1) { for (j = 0; j < 3; j++) { - GLuint delta = c->nr_attrs*16 + 32; - - if (intel->gen == 5) - delta = c->nr_attrs * 16 + 32 * 3; + GLuint delta = c->offset[c->idx_to_attr[c->nr_attrs - 1]] + ATTR_SIZE; brw_MOV(&c->func, byte_offset(c->reg.vertex[j], delta), brw_imm_f(0)); } diff --git a/src/mesa/drivers/dri/i965/brw_clip_unfilled.c b/src/mesa/drivers/dri/i965/brw_clip_unfilled.c index f36d22fdbf..afd93f8be0 100644 --- a/src/mesa/drivers/dri/i965/brw_clip_unfilled.c +++ b/src/mesa/drivers/dri/i965/brw_clip_unfilled.c @@ -32,7 +32,7 @@ #include "main/glheader.h" #include "main/macros.h" #include "main/enums.h" -#include "shader/program.h" +#include "program/program.h" #include "intel_batchbuffer.h" diff --git a/src/mesa/drivers/dri/i965/brw_clip_util.c b/src/mesa/drivers/dri/i965/brw_clip_util.c index 2148bc8244..d2ac1235e4 100644 --- a/src/mesa/drivers/dri/i965/brw_clip_util.c +++ b/src/mesa/drivers/dri/i965/brw_clip_util.c @@ -33,7 +33,7 @@ #include "main/glheader.h" #include "main/macros.h" #include "main/enums.h" -#include "shader/program.h" +#include "program/program.h" #include "intel_batchbuffer.h" @@ -134,7 +134,6 @@ void brw_clip_interp_vertex( struct brw_clip_compile *c, GLboolean force_edgeflag) { struct brw_compile *p = &c->func; - struct intel_context *intel = &p->brw->intel; struct brw_reg tmp = get_tmp(c); GLuint i; @@ -149,12 +148,9 @@ void brw_clip_interp_vertex( struct brw_clip_compile *c, /* Iterate over each attribute (could be done in pairs?) */ for (i = 0; i < c->nr_attrs; i++) { - GLuint delta = i*16 + 32; + GLuint delta = c->offset[c->idx_to_attr[i]]; - if (intel->gen == 5) - delta = i * 16 + 32 * 3; - - if (delta == c->offset[VERT_RESULT_EDGE]) { + if (c->idx_to_attr[i] == VERT_RESULT_EDGE) { if (force_edgeflag) brw_MOV(p, deref_4f(dest_ptr, delta), brw_imm_f(1)); else @@ -183,10 +179,7 @@ void brw_clip_interp_vertex( struct brw_clip_compile *c, } if (i & 1) { - GLuint delta = i*16 + 32; - - if (intel->gen == 5) - delta = i * 16 + 32 * 3; + GLuint delta = c->offset[c->idx_to_attr[c->nr_attrs - 1]] + ATTR_SIZE; brw_MOV(p, deref_4f(dest_ptr, delta), brw_imm_f(0)); } @@ -199,11 +192,6 @@ void brw_clip_interp_vertex( struct brw_clip_compile *c, brw_clip_project_vertex(c, dest_ptr ); } - - - -#define MAX_MRF 16 - void brw_clip_emit_vue(struct brw_clip_compile *c, struct brw_indirect vert, GLboolean allocate, diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c index d13b9ae298..6d064b822e 100644 --- a/src/mesa/drivers/dri/i965/brw_context.c +++ b/src/mesa/drivers/dri/i965/brw_context.c @@ -34,7 +34,6 @@ #include "main/api_noop.h" #include "main/macros.h" #include "main/simple_list.h" - #include "brw_context.h" #include "brw_defines.h" #include "brw_draw.h" diff --git a/src/mesa/drivers/dri/i965/brw_curbe.c b/src/mesa/drivers/dri/i965/brw_curbe.c index 6c0b79f724..8196d8ca62 100644 --- a/src/mesa/drivers/dri/i965/brw_curbe.c +++ b/src/mesa/drivers/dri/i965/brw_curbe.c @@ -35,9 +35,9 @@ #include "main/context.h" #include "main/macros.h" #include "main/enums.h" -#include "shader/prog_parameter.h" -#include "shader/prog_print.h" -#include "shader/prog_statevars.h" +#include "program/prog_parameter.h" +#include "program/prog_print.h" +#include "program/prog_statevars.h" #include "intel_batchbuffer.h" #include "intel_regions.h" #include "brw_context.h" diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h index 39bf5b63fc..f7a68cead7 100644 --- a/src/mesa/drivers/dri/i965/brw_defines.h +++ b/src/mesa/drivers/dri/i965/brw_defines.h @@ -501,6 +501,10 @@ #define BRW_MASK_ENABLE 0 #define BRW_MASK_DISABLE 1 +/* Sandybridge is WECtrl (Write enable control) */ +#define BRW_WE_NORMAL 0 +#define BRW_WE_KILL_PRED 1 + #define BRW_OPCODE_MOV 1 #define BRW_OPCODE_SEL 2 #define BRW_OPCODE_NOT 4 @@ -600,6 +604,8 @@ #define BRW_ARF_NOTIFICATION_COUNT 0x90 #define BRW_ARF_IP 0xA0 +#define BRW_MRF_COMPR4 (1 << 7) + #define BRW_AMASK 0 #define BRW_IMASK 1 #define BRW_LMASK 2 @@ -646,13 +652,14 @@ #define BRW_POLYGON_FACING_BACK 1 #define BRW_MESSAGE_TARGET_NULL 0 -#define BRW_MESSAGE_TARGET_MATH 1 +#define BRW_MESSAGE_TARGET_MATH 1 /* reserved on GEN6 */ #define BRW_MESSAGE_TARGET_SAMPLER 2 #define BRW_MESSAGE_TARGET_GATEWAY 3 -#define BRW_MESSAGE_TARGET_DATAPORT_READ 4 -#define BRW_MESSAGE_TARGET_DATAPORT_WRITE 5 +#define BRW_MESSAGE_TARGET_DATAPORT_READ 4 /* sampler cache on GEN6 */ +#define BRW_MESSAGE_TARGET_DATAPORT_WRITE 5 /* render cache on Gen6 */ #define BRW_MESSAGE_TARGET_URB 6 #define BRW_MESSAGE_TARGET_THREAD_SPAWNER 7 +#define BRW_MESSAGE_TARGET_CONST_CACHE 9 /* GEN6 */ #define BRW_SAMPLER_RETURN_FORMAT_FLOAT32 0 #define BRW_SAMPLER_RETURN_FORMAT_UINT32 2 @@ -698,10 +705,24 @@ #define BRW_DATAPORT_DWORD_SCATTERED_BLOCK_8DWORDS 2 #define BRW_DATAPORT_DWORD_SCATTERED_BLOCK_16DWORDS 3 +/* This one stays the same across generations. */ #define BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ 0 +/* GEN4 */ #define BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ 1 -#define BRW_DATAPORT_READ_MESSAGE_DWORD_BLOCK_READ 2 +#define BRW_DATAPORT_READ_MESSAGE_MEDIA_BLOCK_READ 2 #define BRW_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ 3 +/* G45, GEN5 */ +#define G45_DATAPORT_READ_MESSAGE_RENDER_UNORM_READ 1 +#define G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ 2 +#define G45_DATAPORT_READ_MESSAGE_AVC_LOOP_FILTER_READ 3 +#define G45_DATAPORT_READ_MESSAGE_MEDIA_BLOCK_READ 4 +#define G45_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ 6 +/* GEN6 */ +#define GEN6_DATAPORT_READ_MESSAGE_RENDER_UNORM_READ 1 +#define GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ 2 +#define GEN6_DATAPORT_READ_MESSAGE_MEDIA_BLOCK_READ 4 +#define GEN6_DATAPORT_READ_MESSAGE_OWORD_UNALIGN_BLOCK_READ 5 +#define GEN6_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ 6 #define BRW_DATAPORT_READ_TARGET_DATA_CACHE 0 #define BRW_DATAPORT_READ_TARGET_RENDER_CACHE 1 @@ -721,6 +742,16 @@ #define BRW_DATAPORT_WRITE_MESSAGE_STREAMED_VERTEX_BUFFER_WRITE 5 #define BRW_DATAPORT_WRITE_MESSAGE_FLUSH_RENDER_CACHE 7 +/* GEN6 */ +#define BRW_DATAPORT_WRITE_MESSAGE_DWORD_ATOMIC_WRITE_GEN6 7 +#define BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE_GEN6 8 +#define BRW_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE_GEN6 9 +#define BRW_DATAPORT_WRITE_MESSAGE_MEDIA_BLOCK_WRITE_GEN6 10 +#define BRW_DATAPORT_WRITE_MESSAGE_DWORLD_SCATTERED_WRITE_GEN6 11 +#define BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE_GEN6 12 +#define BRW_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE_GEN6 13 +#define BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_UNORM_WRITE_GEN6 14 + #define BRW_MATH_FUNCTION_INV 1 #define BRW_MATH_FUNCTION_LOG 2 #define BRW_MATH_FUNCTION_EXP 3 diff --git a/src/mesa/drivers/dri/i965/brw_disasm.c b/src/mesa/drivers/dri/i965/brw_disasm.c index ff12daf497..d230714536 100644 --- a/src/mesa/drivers/dri/i965/brw_disasm.c +++ b/src/mesa/drivers/dri/i965/brw_disasm.c @@ -598,7 +598,7 @@ static int src_da16 (FILE *file, format (file, ".%d", _subreg_nr); string (file, "<"); err |= control (file, "vert stride", vert_stride, _vert_stride, NULL); - string (file, ",1,1>"); + string (file, ",4,1>"); err |= control (file, "src da16 reg type", reg_encoding, _reg_type, NULL); /* * Three kinds of swizzle display: @@ -836,10 +836,12 @@ int brw_disasm (FILE *file, struct brw_instruction *inst, int gen) if (inst->header.opcode == BRW_OPCODE_SEND) { int target; - if (gen >= 5) - target = inst->bits2.send_gen5.sfid; + if (gen >= 6) + target = inst->header.destreg__conditionalmod; + else if (gen == 5) + target = inst->bits2.send_gen5.sfid; else - target = inst->bits3.generic.msg_target; + target = inst->bits3.generic.msg_target; newline (file); pad (file, 16); @@ -868,13 +870,44 @@ int brw_disasm (FILE *file, struct brw_instruction *inst, int gen) inst->bits3.sampler.return_format, NULL); string (file, ")"); break; + case BRW_MESSAGE_TARGET_DATAPORT_READ: + if (gen >= 6) { + format (file, " (%d, %d, %d, %d, %d, %d)", + inst->bits3.dp_render_cache.binding_table_index, + inst->bits3.dp_render_cache.msg_control, + inst->bits3.dp_render_cache.msg_type, + inst->bits3.dp_render_cache.send_commit_msg, + inst->bits3.dp_render_cache.msg_length, + inst->bits3.dp_render_cache.response_length); + } else if (gen >= 5) { + format (file, " (%d, %d, %d)", + inst->bits3.dp_read_gen5.binding_table_index, + inst->bits3.dp_read_gen5.msg_control, + inst->bits3.dp_read_gen5.msg_type); + } else { + format (file, " (%d, %d, %d)", + inst->bits3.dp_read.binding_table_index, + inst->bits3.dp_read.msg_control, + inst->bits3.dp_read.msg_type); + } + break; case BRW_MESSAGE_TARGET_DATAPORT_WRITE: - format (file, " (%d, %d, %d, %d)", - inst->bits3.dp_write.binding_table_index, - (inst->bits3.dp_write.pixel_scoreboard_clear << 3) | - inst->bits3.dp_write.msg_control, - inst->bits3.dp_write.msg_type, - inst->bits3.dp_write.send_commit_msg); + if (gen >= 6) { + format (file, " (%d, %d, %d, %d, %d, %d)", + inst->bits3.dp_render_cache.binding_table_index, + inst->bits3.dp_render_cache.msg_control, + inst->bits3.dp_render_cache.msg_type, + inst->bits3.dp_render_cache.send_commit_msg, + inst->bits3.dp_render_cache.msg_length, + inst->bits3.dp_render_cache.response_length); + } else { + format (file, " (%d, %d, %d, %d)", + inst->bits3.dp_write.binding_table_index, + (inst->bits3.dp_write.pixel_scoreboard_clear << 3) | + inst->bits3.dp_write.msg_control, + inst->bits3.dp_write.msg_type, + inst->bits3.dp_write.send_commit_msg); + } break; case BRW_MESSAGE_TARGET_URB: if (gen >= 5) { @@ -900,15 +933,22 @@ int brw_disasm (FILE *file, struct brw_instruction *inst, int gen) case BRW_MESSAGE_TARGET_THREAD_SPAWNER: break; default: - format (file, "unsupported target %d", inst->bits3.generic.msg_target); + format (file, "unsupported target %d", target); break; } if (space) string (file, " "); - format (file, "mlen %d", - inst->bits3.generic.msg_length); - format (file, " rlen %d", - inst->bits3.generic.response_length); + if (gen >= 5) { + format (file, "mlen %d", + inst->bits3.generic_gen5.msg_length); + format (file, " rlen %d", + inst->bits3.generic_gen5.response_length); + } else { + format (file, "mlen %d", + inst->bits3.generic.msg_length); + format (file, " rlen %d", + inst->bits3.generic.response_length); + } } pad (file, 64); if (inst->header.opcode != BRW_OPCODE_NOP) { diff --git a/src/mesa/drivers/dri/i965/brw_eu.h b/src/mesa/drivers/dri/i965/brw_eu.h index 3a32ad26c1..ffdddd0a38 100644 --- a/src/mesa/drivers/dri/i965/brw_eu.h +++ b/src/mesa/drivers/dri/i965/brw_eu.h @@ -35,7 +35,7 @@ #include "brw_structs.h" #include "brw_defines.h" -#include "shader/prog_instruction.h" +#include "program/prog_instruction.h" #define BRW_SWIZZLE4(a,b,c,d) (((a)<<0) | ((b)<<2) | ((c)<<4) | ((d)<<6)) #define BRW_GET_SWZ(swz, idx) (((swz) >> ((idx)*2)) & 0x3) @@ -520,6 +520,20 @@ static INLINE struct brw_reg brw_acc_reg( void ) 0); } +static INLINE struct brw_reg brw_notification_1_reg(void) +{ + + return brw_reg(BRW_ARCHITECTURE_REGISTER_FILE, + BRW_ARF_NOTIFICATION_COUNT, + 1, + BRW_REGISTER_TYPE_UD, + BRW_VERTICAL_STRIDE_0, + BRW_WIDTH_1, + BRW_HORIZONTAL_STRIDE_0, + BRW_SWIZZLE_XXXX, + WRITEMASK_X); +} + static INLINE struct brw_reg brw_flag_reg( void ) { @@ -877,12 +891,15 @@ void brw_dp_READ_4( struct brw_compile *p, void brw_dp_READ_4_vs( struct brw_compile *p, struct brw_reg dest, - GLuint oword, - GLboolean relAddr, - struct brw_reg addrReg, GLuint location, GLuint bind_table_index ); +void brw_dp_READ_4_vs_relative(struct brw_compile *p, + struct brw_reg dest, + struct brw_reg addrReg, + GLuint offset, + GLuint bind_table_index); + void brw_dp_WRITE_16( struct brw_compile *p, struct brw_reg src, GLuint scratch_offset ); @@ -919,6 +936,8 @@ void brw_land_fwd_jump(struct brw_compile *p, void brw_NOP(struct brw_compile *p); +void brw_WAIT(struct brw_compile *p); + /* Special case: there is never a destination, execution size will be * taken from src0: */ @@ -965,5 +984,7 @@ void brw_set_src1( struct brw_instruction *insn, /* brw_optimize.c */ void brw_optimize(struct brw_compile *p); +void brw_remove_duplicate_mrf_moves(struct brw_compile *p); +void brw_remove_grf_to_mrf_moves(struct brw_compile *p); #endif diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c index 34dfe10cb9..0d5d17f501 100644 --- a/src/mesa/drivers/dri/i965/brw_eu_emit.c +++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c @@ -364,7 +364,8 @@ static void brw_set_dp_write_message( struct brw_context *brw, GLuint msg_length, GLuint pixel_scoreboard_clear, GLuint response_length, - GLuint end_of_thread ) + GLuint end_of_thread, + GLuint send_commit_msg) { struct intel_context *intel = &brw->intel; brw_set_src1(insn, brw_imm_d(0)); @@ -374,7 +375,7 @@ static void brw_set_dp_write_message( struct brw_context *brw, insn->bits3.dp_write_gen5.msg_control = msg_control; insn->bits3.dp_write_gen5.pixel_scoreboard_clear = pixel_scoreboard_clear; insn->bits3.dp_write_gen5.msg_type = msg_type; - insn->bits3.dp_write_gen5.send_commit_msg = 0; + insn->bits3.dp_write_gen5.send_commit_msg = send_commit_msg; insn->bits3.dp_write_gen5.header_present = 1; insn->bits3.dp_write_gen5.response_length = response_length; insn->bits3.dp_write_gen5.msg_length = msg_length; @@ -386,7 +387,7 @@ static void brw_set_dp_write_message( struct brw_context *brw, insn->bits3.dp_write.msg_control = msg_control; insn->bits3.dp_write.pixel_scoreboard_clear = pixel_scoreboard_clear; insn->bits3.dp_write.msg_type = msg_type; - insn->bits3.dp_write.send_commit_msg = 0; + insn->bits3.dp_write.send_commit_msg = send_commit_msg; insn->bits3.dp_write.response_length = response_length; insn->bits3.dp_write.msg_length = msg_length; insn->bits3.dp_write.msg_target = BRW_MESSAGE_TARGET_DATAPORT_WRITE; @@ -906,6 +907,20 @@ void brw_CMP(struct brw_compile *p, } } +/* Issue 'wait' instruction for n1, host could program MMIO + to wake up thread. */ +void brw_WAIT (struct brw_compile *p) +{ + struct brw_instruction *insn = next_insn(p, BRW_OPCODE_WAIT); + struct brw_reg src = brw_notification_1_reg(); + + brw_set_dest(insn, src); + brw_set_src0(insn, src); + brw_set_src1(insn, brw_null_reg()); + insn->header.execution_size = 0; /* must */ + insn->header.predicate_control = 0; + insn->header.compression_control = 0; +} /*********************************************************************** @@ -1040,6 +1055,7 @@ void brw_dp_WRITE_16( struct brw_compile *p, struct brw_reg src, GLuint scratch_offset ) { + struct intel_context *intel = &p->brw->intel; GLuint msg_reg_nr = 1; { brw_push_insn_state(p); @@ -1056,13 +1072,32 @@ void brw_dp_WRITE_16( struct brw_compile *p, { GLuint msg_length = 3; - struct brw_reg dest = retype(brw_null_reg(), BRW_REGISTER_TYPE_UW); + struct brw_reg dest; struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND); - + int send_commit_msg; + insn->header.predicate_control = 0; /* XXX */ insn->header.compression_control = BRW_COMPRESSION_NONE; insn->header.destreg__conditionalmod = msg_reg_nr; - + + /* Until gen6, writes followed by reads from the same location + * are not guaranteed to be ordered unless write_commit is set. + * If set, then a no-op write is issued to the destination + * register to set a dependency, and a read from the destination + * can be used to ensure the ordering. + * + * For gen6, only writes between different threads need ordering + * protection. Our use of DP writes is all about register + * spilling within a thread. + */ + if (intel->gen >= 6) { + dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW); + send_commit_msg = 0; + } else { + dest = brw_uw16_grf(0, 0); + send_commit_msg = 1; + } + brw_set_dest(insn, dest); brw_set_src0(insn, src); @@ -1073,8 +1108,9 @@ void brw_dp_WRITE_16( struct brw_compile *p, BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE, /* msg_type */ msg_length, 0, /* pixel scoreboard */ - 0, /* response_length */ - 0); /* eot */ + send_commit_msg, /* response_length */ + 0, /* eot */ + send_commit_msg); } } @@ -1115,7 +1151,7 @@ void brw_dp_READ_16( struct brw_compile *p, brw_set_dp_read_message(p->brw, insn, 255, /* binding table index (255=stateless) */ - 3, /* msg_control (3 means 4 Owords) */ + BRW_DATAPORT_OWORD_BLOCK_4_OWORDS, BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */ 1, /* target cache (render/scratch) */ 1, /* msg_length */ @@ -1190,68 +1226,107 @@ void brw_dp_READ_4( struct brw_compile *p, */ void brw_dp_READ_4_vs(struct brw_compile *p, struct brw_reg dest, - GLuint oword, - GLboolean relAddr, - struct brw_reg addrReg, GLuint location, GLuint bind_table_index) { + struct brw_instruction *insn; GLuint msg_reg_nr = 1; + struct brw_reg b; - assert(oword < 2); /* printf("vs const read msg, location %u, msg_reg_nr %d\n", location, msg_reg_nr); */ /* Setup MRF[1] with location/offset into const buffer */ - { - struct brw_reg b; + brw_push_insn_state(p); + brw_set_compression_control(p, BRW_COMPRESSION_NONE); + brw_set_mask_control(p, BRW_MASK_DISABLE); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); - brw_push_insn_state(p); - brw_set_compression_control(p, BRW_COMPRESSION_NONE); - brw_set_mask_control(p, BRW_MASK_DISABLE); - brw_set_predicate_control(p, BRW_PREDICATE_NONE); - /*brw_set_access_mode(p, BRW_ALIGN_16);*/ + /* XXX I think we're setting all the dwords of MRF[1] to 'location'. + * when the docs say only dword[2] should be set. Hmmm. But it works. + */ + b = brw_message_reg(msg_reg_nr); + b = retype(b, BRW_REGISTER_TYPE_UD); + /*b = get_element_ud(b, 2);*/ + brw_MOV(p, b, brw_imm_ud(location)); - /* XXX I think we're setting all the dwords of MRF[1] to 'location'. - * when the docs say only dword[2] should be set. Hmmm. But it works. - */ - b = brw_message_reg(msg_reg_nr); - b = retype(b, BRW_REGISTER_TYPE_UD); - /*b = get_element_ud(b, 2);*/ - if (relAddr) { - brw_ADD(p, b, addrReg, brw_imm_ud(location)); - } - else { - brw_MOV(p, b, brw_imm_ud(location)); - } + brw_pop_insn_state(p); - brw_pop_insn_state(p); - } + insn = next_insn(p, BRW_OPCODE_SEND); - { - struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND); - - insn->header.predicate_control = BRW_PREDICATE_NONE; - insn->header.compression_control = BRW_COMPRESSION_NONE; - insn->header.destreg__conditionalmod = msg_reg_nr; - insn->header.mask_control = BRW_MASK_DISABLE; - /*insn->header.access_mode = BRW_ALIGN_16;*/ - - brw_set_dest(insn, dest); - brw_set_src0(insn, brw_null_reg()); + insn->header.predicate_control = BRW_PREDICATE_NONE; + insn->header.compression_control = BRW_COMPRESSION_NONE; + insn->header.destreg__conditionalmod = msg_reg_nr; + insn->header.mask_control = BRW_MASK_DISABLE; - brw_set_dp_read_message(p->brw, - insn, - bind_table_index, - oword, /* 0 = lower Oword, 1 = upper Oword */ - BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */ - 0, /* source cache = data cache */ - 1, /* msg_length */ - 1, /* response_length (1 Oword) */ - 0); /* eot */ - } + brw_set_dest(insn, dest); + brw_set_src0(insn, brw_null_reg()); + + brw_set_dp_read_message(p->brw, + insn, + bind_table_index, + 0, + BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */ + 0, /* source cache = data cache */ + 1, /* msg_length */ + 1, /* response_length (1 Oword) */ + 0); /* eot */ +} + +/** + * Read a float[4] constant per vertex from VS constant buffer, with + * relative addressing. + */ +void brw_dp_READ_4_vs_relative(struct brw_compile *p, + struct brw_reg dest, + struct brw_reg addr_reg, + GLuint offset, + GLuint bind_table_index) +{ + struct intel_context *intel = &p->brw->intel; + int msg_type; + + /* Setup MRF[1] with offset into const buffer */ + brw_push_insn_state(p); + brw_set_compression_control(p, BRW_COMPRESSION_NONE); + brw_set_mask_control(p, BRW_MASK_DISABLE); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + + /* M1.0 is block offset 0, M1.4 is block offset 1, all other + * fields ignored. + */ + brw_ADD(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), + addr_reg, brw_imm_d(offset)); + brw_pop_insn_state(p); + + struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND); + + insn->header.predicate_control = BRW_PREDICATE_NONE; + insn->header.compression_control = BRW_COMPRESSION_NONE; + insn->header.destreg__conditionalmod = 0; + insn->header.mask_control = BRW_MASK_DISABLE; + + brw_set_dest(insn, dest); + brw_set_src0(insn, brw_vec8_grf(0, 0)); + + if (intel->gen == 6) + msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ; + else if (intel->gen == 5 || intel->is_g4x) + msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ; + else + msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ; + + brw_set_dp_read_message(p->brw, + insn, + bind_table_index, + BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD, + msg_type, + 0, /* source cache = data cache */ + 2, /* msg_length */ + 1, /* response_length */ + 0); /* eot */ } @@ -1281,7 +1356,8 @@ void brw_fb_WRITE(struct brw_compile *p, msg_length, 1, /* pixel scoreboard */ response_length, - eot); + eot, + 0 /* send_commit_msg */); } diff --git a/src/mesa/drivers/dri/i965/brw_gs_emit.c b/src/mesa/drivers/dri/i965/brw_gs_emit.c index 99a6f6be11..a01d5576f8 100644 --- a/src/mesa/drivers/dri/i965/brw_gs_emit.c +++ b/src/mesa/drivers/dri/i965/brw_gs_emit.c @@ -34,7 +34,7 @@ #include "main/macros.h" #include "main/enums.h" -#include "shader/program.h" +#include "program/program.h" #include "intel_batchbuffer.h" #include "brw_defines.h" diff --git a/src/mesa/drivers/dri/i965/brw_optimize.c b/src/mesa/drivers/dri/i965/brw_optimize.c index e79b3ddea3..8aa6fb6cc6 100644 --- a/src/mesa/drivers/dri/i965/brw_optimize.c +++ b/src/mesa/drivers/dri/i965/brw_optimize.c @@ -26,12 +26,600 @@ */ #include "main/macros.h" -#include "shader/program.h" -#include "shader/prog_print.h" +#include "program/program.h" +#include "program/prog_print.h" #include "brw_context.h" #include "brw_defines.h" #include "brw_eu.h" +static const struct { + char *name; + int nsrc; + int ndst; + GLboolean is_arith; +} inst_opcode[128] = { + [BRW_OPCODE_MOV] = { .name = "mov", .nsrc = 1, .ndst = 1, .is_arith = 1 }, + [BRW_OPCODE_FRC] = { .name = "frc", .nsrc = 1, .ndst = 1, .is_arith = 1 }, + [BRW_OPCODE_RNDU] = { .name = "rndu", .nsrc = 1, .ndst = 1, .is_arith = 1 }, + [BRW_OPCODE_RNDD] = { .name = "rndd", .nsrc = 1, .ndst = 1, .is_arith = 1 }, + [BRW_OPCODE_RNDE] = { .name = "rnde", .nsrc = 1, .ndst = 1, .is_arith = 1 }, + [BRW_OPCODE_RNDZ] = { .name = "rndz", .nsrc = 1, .ndst = 1, .is_arith = 1 }, + [BRW_OPCODE_NOT] = { .name = "not", .nsrc = 1, .ndst = 1, .is_arith = 1 }, + [BRW_OPCODE_LZD] = { .name = "lzd", .nsrc = 1, .ndst = 1 }, + + [BRW_OPCODE_MUL] = { .name = "mul", .nsrc = 2, .ndst = 1, .is_arith = 1 }, + [BRW_OPCODE_MAC] = { .name = "mac", .nsrc = 2, .ndst = 1, .is_arith = 1 }, + [BRW_OPCODE_MACH] = { .name = "mach", .nsrc = 2, .ndst = 1, .is_arith = 1 }, + [BRW_OPCODE_LINE] = { .name = "line", .nsrc = 2, .ndst = 1, .is_arith = 1 }, + [BRW_OPCODE_PLN] = { .name = "pln", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_SAD2] = { .name = "sad2", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_SADA2] = { .name = "sada2", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_DP4] = { .name = "dp4", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_DPH] = { .name = "dph", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_DP3] = { .name = "dp3", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_DP2] = { .name = "dp2", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_MATH] = { .name = "math", .nsrc = 2, .ndst = 1 }, + + [BRW_OPCODE_AVG] = { .name = "avg", .nsrc = 2, .ndst = 1, .is_arith = 1 }, + [BRW_OPCODE_ADD] = { .name = "add", .nsrc = 2, .ndst = 1, .is_arith = 1 }, + [BRW_OPCODE_SEL] = { .name = "sel", .nsrc = 2, .ndst = 1, .is_arith = 1 }, + [BRW_OPCODE_AND] = { .name = "and", .nsrc = 2, .ndst = 1, .is_arith = 1 }, + [BRW_OPCODE_OR] = { .name = "or", .nsrc = 2, .ndst = 1, .is_arith = 1 }, + [BRW_OPCODE_XOR] = { .name = "xor", .nsrc = 2, .ndst = 1, .is_arith = 1 }, + [BRW_OPCODE_SHR] = { .name = "shr", .nsrc = 2, .ndst = 1, .is_arith = 1 }, + [BRW_OPCODE_SHL] = { .name = "shl", .nsrc = 2, .ndst = 1, .is_arith = 1 }, + [BRW_OPCODE_ASR] = { .name = "asr", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_CMP] = { .name = "cmp", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_CMPN] = { .name = "cmpn", .nsrc = 2, .ndst = 1 }, + + [BRW_OPCODE_SEND] = { .name = "send", .nsrc = 1, .ndst = 1 }, + [BRW_OPCODE_NOP] = { .name = "nop", .nsrc = 0, .ndst = 0 }, + [BRW_OPCODE_JMPI] = { .name = "jmpi", .nsrc = 1, .ndst = 0 }, + [BRW_OPCODE_IF] = { .name = "if", .nsrc = 2, .ndst = 0 }, + [BRW_OPCODE_IFF] = { .name = "iff", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_WHILE] = { .name = "while", .nsrc = 2, .ndst = 0 }, + [BRW_OPCODE_ELSE] = { .name = "else", .nsrc = 2, .ndst = 0 }, + [BRW_OPCODE_BREAK] = { .name = "break", .nsrc = 2, .ndst = 0 }, + [BRW_OPCODE_CONTINUE] = { .name = "cont", .nsrc = 1, .ndst = 0 }, + [BRW_OPCODE_HALT] = { .name = "halt", .nsrc = 1, .ndst = 0 }, + [BRW_OPCODE_MSAVE] = { .name = "msave", .nsrc = 1, .ndst = 1 }, + [BRW_OPCODE_PUSH] = { .name = "push", .nsrc = 1, .ndst = 1 }, + [BRW_OPCODE_MRESTORE] = { .name = "mrest", .nsrc = 1, .ndst = 1 }, + [BRW_OPCODE_POP] = { .name = "pop", .nsrc = 2, .ndst = 0 }, + [BRW_OPCODE_WAIT] = { .name = "wait", .nsrc = 1, .ndst = 0 }, + [BRW_OPCODE_DO] = { .name = "do", .nsrc = 0, .ndst = 0 }, + [BRW_OPCODE_ENDIF] = { .name = "endif", .nsrc = 2, .ndst = 0 }, +}; + +static INLINE +GLboolean brw_is_arithmetic_inst(const struct brw_instruction *inst) +{ + return inst_opcode[inst->header.opcode].is_arith; +} + +static const GLuint inst_stride[7] = { + [0] = 0, + [1] = 1, + [2] = 2, + [3] = 4, + [4] = 8, + [5] = 16, + [6] = 32 +}; + +static const GLuint inst_type_size[8] = { + [BRW_REGISTER_TYPE_UD] = 4, + [BRW_REGISTER_TYPE_D] = 4, + [BRW_REGISTER_TYPE_UW] = 2, + [BRW_REGISTER_TYPE_W] = 2, + [BRW_REGISTER_TYPE_UB] = 1, + [BRW_REGISTER_TYPE_B] = 1, + [BRW_REGISTER_TYPE_F] = 4 +}; + +static INLINE GLboolean +brw_is_grf_written(const struct brw_instruction *inst, + int reg_index, int size, + int gen) +{ + if (inst_opcode[inst->header.opcode].ndst == 0) + return GL_FALSE; + + if (inst->bits1.da1.dest_address_mode != BRW_ADDRESS_DIRECT) + if (inst->bits1.ia1.dest_reg_file == BRW_GENERAL_REGISTER_FILE) + return GL_TRUE; + + if (inst->bits1.da1.dest_reg_file != BRW_GENERAL_REGISTER_FILE) + return GL_FALSE; + + const int reg_start = reg_index * REG_SIZE; + const int reg_end = reg_start + size; + + const int type_size = inst_type_size[inst->bits1.da1.dest_reg_type]; + const int write_start = inst->bits1.da1.dest_reg_nr*REG_SIZE + + inst->bits1.da1.dest_subreg_nr; + int length, write_end; + + /* SEND is specific */ + if (inst->header.opcode == BRW_OPCODE_SEND) { + if (gen >= 5) + length = inst->bits3.generic_gen5.response_length*REG_SIZE; + else + length = inst->bits3.generic.response_length*REG_SIZE; + } + else { + length = 1 << inst->header.execution_size; + length *= type_size; + length *= inst->bits1.da1.dest_horiz_stride; + } + + /* If the two intervals intersect, we overwrite the register */ + write_end = write_start + length; + const int left = MAX2(write_start, reg_start); + const int right = MIN2(write_end, reg_end); + + return left < right; +} + +/* Specific path for message register since we need to handle the compr4 case */ +static INLINE GLboolean +brw_is_mrf_written(const struct brw_instruction *inst, int reg_index, int size) +{ + if (inst_opcode[inst->header.opcode].ndst == 0) + return GL_FALSE; + + if (inst->bits1.da1.dest_address_mode != BRW_ADDRESS_DIRECT) + if (inst->bits1.ia1.dest_reg_file == BRW_MESSAGE_REGISTER_FILE) + return GL_TRUE; + + if (inst->bits1.da1.dest_reg_file != BRW_MESSAGE_REGISTER_FILE) + return GL_FALSE; + + const int reg_start = reg_index * REG_SIZE; + const int reg_end = reg_start + size; + + const int mrf_index = inst->bits1.da1.dest_reg_nr & 0x0f; + const int is_compr4 = inst->bits1.da1.dest_reg_nr & BRW_MRF_COMPR4; + const int type_size = inst_type_size[inst->bits1.da1.dest_reg_type]; + + /* We use compr4 with a size != 16 elements. Strange, we conservatively + * consider that we are writing the register. + */ + if (is_compr4 && inst->header.execution_size != BRW_EXECUTE_16) + return GL_TRUE; + + GLboolean is_written = GL_FALSE; + + /* Here we write mrf_{i} and mrf_{i+4}. So we read two times 8 elements */ + if (is_compr4) { + const int length = 8 * type_size * inst->bits1.da1.dest_horiz_stride; + + /* First 8-way register */ + const int write_start0 = mrf_index*REG_SIZE + + inst->bits1.da1.dest_subreg_nr; + const int write_end0 = write_start0 + length; + + /* Second 8-way register */ + const int write_start1 = (mrf_index+4)*REG_SIZE + + inst->bits1.da1.dest_subreg_nr; + const int write_end1 = write_start1 + length; + + /* If the two intervals intersect, we overwrite the register */ + const int left0 = MAX2(write_start0, reg_start); + const int right0 = MIN2(write_end0, reg_end); + const int left1 = MAX2(write_start1, reg_start); + const int right1 = MIN2(write_end1, reg_end); + + is_written = left0 < right0 || left1 < right1; + } + else { + int length; + length = 1 << inst->header.execution_size; + length *= type_size; + length *= inst->bits1.da1.dest_horiz_stride; + + /* If the two intervals intersect, we write into the register */ + const int write_start = inst->bits1.da1.dest_reg_nr*REG_SIZE + + inst->bits1.da1.dest_subreg_nr; + const int write_end = write_start + length; + const int left = MAX2(write_start, reg_start); + const int right = MIN2(write_end, reg_end);; + + is_written = left < right; + } + + /* SEND may perform an implicit mov to a mrf register */ + if (is_written == GL_FALSE && + inst->header.opcode == BRW_OPCODE_SEND && + inst->bits1.da1.src0_reg_file != 0) { + + const int mrf_start = inst->header.destreg__conditionalmod; + const int write_start = mrf_start * REG_SIZE; + const int write_end = write_start + REG_SIZE; + const int left = MAX2(write_start, reg_start); + const int right = MIN2(write_end, reg_end);; + is_written = left < right; + } + + return is_written; +} + +static INLINE GLboolean +brw_is_mrf_read(const struct brw_instruction *inst, + int reg_index, int size, int gen) +{ + if (inst->header.opcode != BRW_OPCODE_SEND) + return GL_FALSE; + if (inst->bits2.da1.src0_address_mode != BRW_ADDRESS_DIRECT) + return GL_TRUE; + + const int reg_start = reg_index*REG_SIZE; + const int reg_end = reg_start + size; + + int length, read_start, read_end; + if (gen >= 5) + length = inst->bits3.generic_gen5.msg_length*REG_SIZE; + else + length = inst->bits3.generic.msg_length*REG_SIZE; + + /* Look if SEND uses an implicit mov. In that case, we read one less register + * (but we write it) + */ + if (inst->bits1.da1.src0_reg_file != 0) + read_start = inst->header.destreg__conditionalmod; + else { + length--; + read_start = inst->header.destreg__conditionalmod + 1; + } + read_start *= REG_SIZE; + read_end = read_start + length; + + const int left = MAX2(read_start, reg_start); + const int right = MIN2(read_end, reg_end); + + return left < right; +} + +static INLINE GLboolean +brw_is_grf_read(const struct brw_instruction *inst, int reg_index, int size) +{ + int i, j; + if (inst_opcode[inst->header.opcode].nsrc == 0) + return GL_FALSE; + + /* Look at first source. We must take into account register regions to + * monitor carefully the read. Note that we are a bit too conservative here + * since we do not take into account the fact that some complete registers + * may be skipped + */ + if (inst_opcode[inst->header.opcode].nsrc >= 1) { + + if (inst->bits2.da1.src0_address_mode != BRW_ADDRESS_DIRECT) + if (inst->bits1.ia1.src0_reg_file == BRW_GENERAL_REGISTER_FILE) + return GL_TRUE; + if (inst->bits1.da1.src0_reg_file != BRW_GENERAL_REGISTER_FILE) + return GL_FALSE; + + const int reg_start = reg_index*REG_SIZE; + const int reg_end = reg_start + size; + + /* See if at least one of this element intersects the interval */ + const int type_size = inst_type_size[inst->bits1.da1.src0_reg_type]; + const int elem_num = 1 << inst->header.execution_size; + const int width = 1 << inst->bits2.da1.src0_width; + const int row_num = elem_num >> inst->bits2.da1.src0_width; + const int hs = type_size*inst_stride[inst->bits2.da1.src0_horiz_stride]; + const int vs = type_size*inst_stride[inst->bits2.da1.src0_vert_stride]; + int row_start = inst->bits2.da1.src0_reg_nr*REG_SIZE + + inst->bits2.da1.src0_subreg_nr; + for (j = 0; j < row_num; ++j) { + int write_start = row_start; + for (i = 0; i < width; ++i) { + const int write_end = write_start + type_size; + const int left = write_start > reg_start ? write_start : reg_start; + const int right = write_end < reg_end ? write_end : reg_end; + if (left < right) + return GL_TRUE; + write_start += hs; + } + row_start += vs; + } + } + + /* Second src register */ + if (inst_opcode[inst->header.opcode].nsrc >= 2) { + + if (inst->bits3.da1.src1_address_mode != BRW_ADDRESS_DIRECT) + if (inst->bits1.ia1.src1_reg_file == BRW_GENERAL_REGISTER_FILE) + return GL_TRUE; + if (inst->bits1.da1.src1_reg_file != BRW_GENERAL_REGISTER_FILE) + return GL_FALSE; + + const int reg_start = reg_index*REG_SIZE; + const int reg_end = reg_start + size; + + /* See if at least one of this element intersects the interval */ + const int type_size = inst_type_size[inst->bits1.da1.src1_reg_type]; + const int elem_num = 1 << inst->header.execution_size; + const int width = 1 << inst->bits3.da1.src1_width; + const int row_num = elem_num >> inst->bits3.da1.src1_width; + const int hs = type_size*inst_stride[inst->bits3.da1.src1_horiz_stride]; + const int vs = type_size*inst_stride[inst->bits3.da1.src1_vert_stride]; + int row_start = inst->bits3.da1.src1_reg_nr*REG_SIZE + + inst->bits3.da1.src1_subreg_nr; + for (j = 0; j < row_num; ++j) { + int write_start = row_start; + for (i = 0; i < width; ++i) { + const int write_end = write_start + type_size; + const int left = write_start > reg_start ? write_start : reg_start; + const int right = write_end < reg_end ? write_end : reg_end; + if (left < right) + return GL_TRUE; + write_start += hs; + } + row_start += vs; + } + } + + return GL_FALSE; +} + +static INLINE GLboolean +brw_is_control_done(const struct brw_instruction *mov) { + return + mov->header.dependency_control != 0 || + mov->header.thread_control != 0 || + mov->header.mask_control != 0 || + mov->header.saturate != 0 || + mov->header.debug_control != 0; +} + +static INLINE GLboolean +brw_is_predicated(const struct brw_instruction *mov) { + return mov->header.predicate_control != 0; +} + +static INLINE GLboolean +brw_is_grf_to_mrf_mov(const struct brw_instruction *mov, + int *mrf_index, + int *grf_index, + GLboolean *is_compr4) +{ + if (brw_is_predicated(mov) || + brw_is_control_done(mov) || + mov->header.debug_control != 0) + return GL_FALSE; + + if (mov->bits1.da1.dest_address_mode != BRW_ADDRESS_DIRECT || + mov->bits1.da1.dest_reg_file != BRW_MESSAGE_REGISTER_FILE || + mov->bits1.da1.dest_reg_type != BRW_REGISTER_TYPE_F || + mov->bits1.da1.dest_horiz_stride != BRW_HORIZONTAL_STRIDE_1 || + mov->bits1.da1.dest_subreg_nr != 0) + return GL_FALSE; + + if (mov->bits2.da1.src0_address_mode != BRW_ADDRESS_DIRECT || + mov->bits1.da1.src0_reg_file != BRW_GENERAL_REGISTER_FILE || + mov->bits1.da1.src0_reg_type != BRW_REGISTER_TYPE_F || + mov->bits2.da1.src0_width != BRW_WIDTH_8 || + mov->bits2.da1.src0_horiz_stride != BRW_HORIZONTAL_STRIDE_1 || + mov->bits2.da1.src0_vert_stride != BRW_VERTICAL_STRIDE_8 || + mov->bits2.da1.src0_subreg_nr != 0 || + mov->bits2.da1.src0_abs != 0 || + mov->bits2.da1.src0_negate != 0) + return GL_FALSE; + + *grf_index = mov->bits2.da1.src0_reg_nr; + *mrf_index = mov->bits1.da1.dest_reg_nr & 0x0f; + *is_compr4 = (mov->bits1.da1.dest_reg_nr & BRW_MRF_COMPR4) != 0; + return GL_TRUE; +} + +static INLINE GLboolean +brw_is_grf_straight_write(const struct brw_instruction *inst, int grf_index) +{ + /* remark: no problem to predicate a SEL instruction */ + if ((!brw_is_predicated(inst) || inst->header.opcode == BRW_OPCODE_SEL) && + brw_is_control_done(inst) == GL_FALSE && + inst->header.execution_size == 4 && + inst->header.access_mode == BRW_ALIGN_1 && + inst->bits1.da1.dest_address_mode == BRW_ADDRESS_DIRECT && + inst->bits1.da1.dest_reg_file == BRW_GENERAL_REGISTER_FILE && + inst->bits1.da1.dest_reg_type == BRW_REGISTER_TYPE_F && + inst->bits1.da1.dest_horiz_stride == BRW_HORIZONTAL_STRIDE_1 && + inst->bits1.da1.dest_reg_nr == grf_index && + inst->bits1.da1.dest_subreg_nr == 0 && + brw_is_arithmetic_inst(inst)) + return GL_TRUE; + + return GL_FALSE; +} + +static INLINE GLboolean +brw_inst_are_equal(const struct brw_instruction *src0, + const struct brw_instruction *src1) +{ + const GLuint *field0 = (GLuint *) src0; + const GLuint *field1 = (GLuint *) src1; + return field0[0] == field1[0] && + field0[1] == field1[1] && + field0[2] == field1[2] && + field0[3] == field1[3]; +} + +static INLINE void +brw_inst_copy(struct brw_instruction *dst, + const struct brw_instruction *src) +{ + GLuint *field_dst = (GLuint *) dst; + const GLuint *field_src = (GLuint *) src; + field_dst[0] = field_src[0]; + field_dst[1] = field_src[1]; + field_dst[2] = field_src[2]; + field_dst[3] = field_src[3]; +} + +static void brw_remove_inst(struct brw_compile *p, const GLboolean *removeInst) +{ + int i, nr_insn = 0, to = 0, from = 0; + + for (from = 0; from < p->nr_insn; ++from) { + if (removeInst[from]) + continue; + if(to != from) + brw_inst_copy(p->store + to, p->store + from); + to++; + } + + for (i = 0; i < p->nr_insn; ++i) + if (removeInst[i] == GL_FALSE) + nr_insn++; + p->nr_insn = nr_insn; +} + +/* The gen code emitter generates a lot of duplications in the + * grf-to-mrf moves, for example when texture sampling with the same + * coordinates from multiple textures.. Here, we monitor same mov + * grf-to-mrf instrutions and remove repeated ones where the operands + * and dst ahven't changed in between. + */ +void brw_remove_duplicate_mrf_moves(struct brw_compile *p) +{ + const int gen = p->brw->intel.gen; + int i, j; + + GLboolean *removeInst = calloc(sizeof(GLboolean), p->nr_insn); + for (i = 0; i < p->nr_insn; i++) { + if (removeInst[i]) + continue; + + const struct brw_instruction *mov = p->store + i; + int mrf_index, grf_index; + GLboolean is_compr4; + + /* Only consider _straight_ grf-to-mrf moves */ + if (!brw_is_grf_to_mrf_mov(mov, &mrf_index, &grf_index, &is_compr4)) + continue; + + const int mrf_index0 = mrf_index; + const int mrf_index1 = is_compr4 ? mrf_index0+4 : mrf_index0+1; + const int simd16_size = 2 * REG_SIZE; + + for (j = i + 1; j < p->nr_insn; j++) { + const struct brw_instruction *inst = p->store + j; + + if (brw_inst_are_equal(mov, inst)) { + removeInst[j] = GL_TRUE; + continue; + } + + if (brw_is_grf_written(inst, grf_index, simd16_size, gen) || + brw_is_mrf_written(inst, mrf_index0, REG_SIZE) || + brw_is_mrf_written(inst, mrf_index1, REG_SIZE)) + break; + } + } + + brw_remove_inst(p, removeInst); + free(removeInst); +} + +/* Replace moves to MRFs where the value moved is the result of a + * normal arithmetic operation with computation right into the MRF. + */ +void brw_remove_grf_to_mrf_moves(struct brw_compile *p) +{ + int i, j, prev; + struct brw_context *brw = p->brw; + const int gen = brw->intel.gen; + const int simd16_size = 2*REG_SIZE; + + GLboolean *removeInst = calloc(sizeof(GLboolean), p->nr_insn); + assert(removeInst); + + for (i = 0; i < p->nr_insn; i++) { + if (removeInst[i]) + continue; + + struct brw_instruction *grf_inst = NULL; + const struct brw_instruction *mov = p->store + i; + int mrf_index, grf_index; + GLboolean is_compr4; + + /* Only consider _straight_ grf-to-mrf moves */ + if (!brw_is_grf_to_mrf_mov(mov, &mrf_index, &grf_index, &is_compr4)) + continue; + + /* Using comp4 enables a stride of 4 for this instruction */ + const int mrf_index0 = mrf_index; + const int mrf_index1 = is_compr4 ? mrf_index+4 : mrf_index+1; + + /* Look where the register has been set */ + prev = i; + GLboolean potential_remove = GL_FALSE; + while (prev--) { + + /* If _one_ instruction writes the grf, we try to remove the mov */ + struct brw_instruction *inst = p->store + prev; + if (brw_is_grf_straight_write(inst, grf_index)) { + potential_remove = GL_TRUE; + grf_inst = inst; + break; + } + + } + + if (potential_remove == GL_FALSE) + continue; + removeInst[i] = GL_TRUE; + + /* Monitor first the section of code between the grf computation and the + * mov. Here we cannot read or write both mrf and grf register + */ + for (j = prev + 1; j < i; ++j) { + struct brw_instruction *inst = p->store + j; + if (removeInst[j]) + continue; + if (brw_is_grf_written(inst, grf_index, simd16_size, gen) || + brw_is_grf_read(inst, grf_index, simd16_size) || + brw_is_mrf_written(inst, mrf_index0, REG_SIZE) || + brw_is_mrf_written(inst, mrf_index1, REG_SIZE) || + brw_is_mrf_read(inst, mrf_index0, REG_SIZE, gen) || + brw_is_mrf_read(inst, mrf_index1, REG_SIZE, gen)) { + removeInst[i] = GL_FALSE; + break; + } + } + + /* After the mov, we can read or write the mrf. If the grf is overwritten, + * we are done + */ + for (j = i + 1; j < p->nr_insn; ++j) { + struct brw_instruction *inst = p->store + j; + if (removeInst[j]) + continue; + + if (brw_is_grf_read(inst, grf_index, simd16_size)) { + removeInst[i] = GL_FALSE; + break; + } + + if (brw_is_grf_straight_write(inst, grf_index)) + break; + } + + /* Note that with the top down traversal, we can safely pacth the mov + * instruction + */ + if (removeInst[i]) { + grf_inst->bits1.da1.dest_reg_file = mov->bits1.da1.dest_reg_file; + grf_inst->bits1.da1.dest_reg_nr = mov->bits1.da1.dest_reg_nr; + } + } + + brw_remove_inst(p, removeInst); + free(removeInst); +} + static GLboolean is_single_channel_dp4(struct brw_instruction *insn) { diff --git a/src/mesa/drivers/dri/i965/brw_program.c b/src/mesa/drivers/dri/i965/brw_program.c index bd560acdad..4b08d2599b 100644 --- a/src/mesa/drivers/dri/i965/brw_program.c +++ b/src/mesa/drivers/dri/i965/brw_program.c @@ -31,10 +31,10 @@ #include "main/imports.h" #include "main/enums.h" -#include "shader/prog_parameter.h" -#include "shader/program.h" -#include "shader/programopt.h" -#include "shader/shader_api.h" +#include "main/shaderobj.h" +#include "program/prog_parameter.h" +#include "program/program.h" +#include "program/programopt.h" #include "tnl/tnl.h" #include "brw_context.h" @@ -174,9 +174,36 @@ static GLboolean brwProgramStringNotify( GLcontext *ctx, shader_error(ctx, prog, "i965 driver doesn't yet support uninlined function " "calls. Move to using a single return statement at " - "the end of the function to work around it."); + "the end of the function to work around it.\n"); return GL_FALSE; } + if (prog->Instructions[i].DstReg.RelAddr && + prog->Instructions[i].DstReg.File == PROGRAM_INPUT) { + shader_error(ctx, prog, + "Variable indexing of shader inputs unsupported\n"); + return GL_FALSE; + } + if (prog->Instructions[i].DstReg.RelAddr && + prog->Instructions[i].DstReg.File == PROGRAM_OUTPUT) { + shader_error(ctx, prog, + "Variable indexing of shader outputs unsupported\n"); + return GL_FALSE; + } + if (target == GL_FRAGMENT_PROGRAM_ARB) { + if ((prog->Instructions[i].DstReg.RelAddr && + prog->Instructions[i].DstReg.File == PROGRAM_TEMPORARY) || + (prog->Instructions[i].SrcReg[0].RelAddr && + prog->Instructions[i].SrcReg[0].File == PROGRAM_TEMPORARY) || + (prog->Instructions[i].SrcReg[1].RelAddr && + prog->Instructions[i].SrcReg[1].File == PROGRAM_TEMPORARY) || + (prog->Instructions[i].SrcReg[2].RelAddr && + prog->Instructions[i].SrcReg[2].File == PROGRAM_TEMPORARY)) { + shader_error(ctx, prog, + "Variable indexing of variable arrays in the FS " + "unsupported\n"); + return GL_FALSE; + } + } } return GL_TRUE; diff --git a/src/mesa/drivers/dri/i965/brw_sf.h b/src/mesa/drivers/dri/i965/brw_sf.h index a0680a56f2..e525c730d3 100644 --- a/src/mesa/drivers/dri/i965/brw_sf.h +++ b/src/mesa/drivers/dri/i965/brw_sf.h @@ -34,7 +34,7 @@ #define BRW_SF_H -#include "shader/program.h" +#include "program/program.h" #include "brw_context.h" #include "brw_eu.h" diff --git a/src/mesa/drivers/dri/i965/brw_sf_state.c b/src/mesa/drivers/dri/i965/brw_sf_state.c index e290ca92f6..914f275cc6 100644 --- a/src/mesa/drivers/dri/i965/brw_sf_state.c +++ b/src/mesa/drivers/dri/i965/brw_sf_state.c @@ -130,7 +130,7 @@ struct brw_sf_unit_key { unsigned scissor:1; unsigned line_smooth:1; unsigned point_sprite:1; - unsigned point_attenuated:1; + unsigned use_vs_point_size:1; unsigned render_to_fbo:1; float line_width; float point_size; @@ -164,7 +164,8 @@ sf_unit_populate_key(struct brw_context *brw, struct brw_sf_unit_key *key) key->point_sprite = ctx->Point.PointSprite; key->point_size = CLAMP(ctx->Point.Size, ctx->Point.MinSize, ctx->Point.MaxSize); - key->point_attenuated = ctx->Point._Attenuated; + key->use_vs_point_size = (ctx->VertexProgram.PointSizeEnabled || + ctx->Point._Attenuated); /* _NEW_LIGHT */ key->pv_first = (ctx->Light.ProvokingVertex == GL_FIRST_VERTEX_CONVENTION); @@ -296,7 +297,7 @@ sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key, /* _NEW_POINT */ sf.sf7.sprite_point = key->point_sprite; sf.sf7.point_size = CLAMP(rint(key->point_size), 1, 255) * (1<<3); - sf.sf7.use_point_size_state = !key->point_attenuated; + sf.sf7.use_point_size_state = !key->use_vs_point_size; sf.sf7.aa_line_distance_mode = 0; /* might be BRW_NEW_PRIMITIVE if we have to adjust pv for polygons: diff --git a/src/mesa/drivers/dri/i965/brw_structs.h b/src/mesa/drivers/dri/i965/brw_structs.h index 2a7fa5b699..2fde42a706 100644 --- a/src/mesa/drivers/dri/i965/brw_structs.h +++ b/src/mesa/drivers/dri/i965/brw_structs.h @@ -1657,8 +1657,36 @@ struct brw_instruction GLuint end_of_thread:1; } dp_write_gen5; + /* Sandybridge DP for sample cache, constant cache, render cache */ struct { - GLuint pad:16; + GLuint binding_table_index:8; + GLuint msg_control:5; + GLuint msg_type:3; + GLuint pad0:3; + GLuint header_present:1; + GLuint response_length:5; + GLuint msg_length:4; + GLuint pad1:2; + GLuint end_of_thread:1; + } dp_sampler_const_cache; + + struct { + GLuint binding_table_index:8; + GLuint msg_control:3; + GLuint slot_group_select:1; + GLuint pixel_scoreboard_clear:1; + GLuint msg_type:4; + GLuint send_commit_msg:1; + GLuint pad0:1; + GLuint header_present:1; + GLuint response_length:5; + GLuint msg_length:4; + GLuint pad1:2; + GLuint end_of_thread:1; + } dp_render_cache; + + struct { + GLuint function_control:16; GLuint response_length:4; GLuint msg_length:4; GLuint msg_target:4; @@ -1666,8 +1694,9 @@ struct brw_instruction GLuint end_of_thread:1; } generic; + /* Of this struct, only end_of_thread is not present for gen6. */ struct { - GLuint pad:19; + GLuint function_control:19; GLuint header_present:1; GLuint response_length:5; GLuint msg_length:4; diff --git a/src/mesa/drivers/dri/i965/brw_util.c b/src/mesa/drivers/dri/i965/brw_util.c index bba9249d1b..1db2a210d4 100644 --- a/src/mesa/drivers/dri/i965/brw_util.c +++ b/src/mesa/drivers/dri/i965/brw_util.c @@ -31,7 +31,7 @@ #include "main/mtypes.h" -#include "shader/prog_parameter.h" +#include "program/prog_parameter.h" #include "brw_util.h" #include "brw_defines.h" diff --git a/src/mesa/drivers/dri/i965/brw_vs.c b/src/mesa/drivers/dri/i965/brw_vs.c index 3c12f11ea7..9a832af9a9 100644 --- a/src/mesa/drivers/dri/i965/brw_vs.c +++ b/src/mesa/drivers/dri/i965/brw_vs.c @@ -34,8 +34,8 @@ #include "brw_vs.h" #include "brw_util.h" #include "brw_state.h" -#include "shader/prog_print.h" -#include "shader/prog_parameter.h" +#include "program/prog_print.h" +#include "program/prog_parameter.h" diff --git a/src/mesa/drivers/dri/i965/brw_vs.h b/src/mesa/drivers/dri/i965/brw_vs.h index 6493744f3e..9338a6b7db 100644 --- a/src/mesa/drivers/dri/i965/brw_vs.h +++ b/src/mesa/drivers/dri/i965/brw_vs.h @@ -36,7 +36,7 @@ #include "brw_context.h" #include "brw_eu.h" -#include "shader/program.h" +#include "program/program.h" struct brw_vs_prog_key { diff --git a/src/mesa/drivers/dri/i965/brw_vs_emit.c b/src/mesa/drivers/dri/i965/brw_vs_emit.c index 128987d78a..c1d6525e9b 100644 --- a/src/mesa/drivers/dri/i965/brw_vs_emit.c +++ b/src/mesa/drivers/dri/i965/brw_vs_emit.c @@ -31,9 +31,9 @@ #include "main/macros.h" -#include "shader/program.h" -#include "shader/prog_parameter.h" -#include "shader/prog_print.h" +#include "program/program.h" +#include "program/prog_parameter.h" +#include "program/prog_print.h" #include "brw_context.h" #include "brw_vs.h" @@ -44,6 +44,7 @@ static GLboolean brw_vs_arg_can_be_immediate(enum prog_opcode opcode, int arg) { int opcode_array[] = { + [OPCODE_MOV] = 1, [OPCODE_ADD] = 2, [OPCODE_CMP] = 3, [OPCODE_DP3] = 2, @@ -218,7 +219,7 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c ) c->first_overflow_output = 0; if (intel->gen >= 6) - mrf = 6; + mrf = 4; else if (intel->gen == 5) mrf = 8; else @@ -238,12 +239,25 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c ) mrf++; /* just a placeholder? XXX fix later stages & remove this */ } else { - if (mrf < 16) { + /* Two restrictions on our compute-to-MRF here. The + * message length for all SEND messages is restricted to + * [1,15], so we can't use mrf 15, as that means a length + * of 16. + * + * Additionally, URB writes are aligned to URB rows, so we + * need to put an even number of registers of URB data in + * each URB write so that the later write is aligned. A + * message length of 15 means 1 message header reg plus 14 + * regs of URB data. + * + * For attributes beyond the compute-to-MRF, we compute to + * GRFs and they will be written in the second URB_WRITE. + */ + if (mrf < 15) { c->regs[PROGRAM_OUTPUT][i] = brw_message_reg(mrf); mrf++; } else { - /* too many vertex results to fit in MRF, use GRF for overflow */ if (!c->first_overflow_output) c->first_overflow_output = i; c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0); @@ -318,8 +332,11 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c ) */ attributes_in_vue = MAX2(c->nr_outputs, c->nr_inputs); + /* See emit_vertex_write() for where the VUE's overhead on top of the + * attributes comes from. + */ if (intel->gen >= 6) - c->prog_data.urb_entry_size = (attributes_in_vue + 4 + 7) / 8; + c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 7) / 8; else if (intel->gen == 5) c->prog_data.urb_entry_size = (attributes_in_vue + 6 + 3) / 4; else @@ -869,8 +886,6 @@ get_constant(struct brw_vs_compile *c, assert(argIndex < 3); if (c->current_const[argIndex].index != src->Index) { - struct brw_reg addrReg = c->regs[PROGRAM_ADDRESS][0]; - /* Keep track of the last constant loaded in this slot, for reuse. */ c->current_const[argIndex].index = src->Index; @@ -881,9 +896,6 @@ get_constant(struct brw_vs_compile *c, /* need to fetch the constant now */ brw_dp_READ_4_vs(p, const_reg, /* writeback dest */ - 0, /* oword */ - 0, /* relative indexing? */ - addrReg, /* address register */ 16 * src->Index, /* byte offset */ SURF_INDEX_VERT_CONST_BUFFER /* binding table index */ ); @@ -904,8 +916,8 @@ get_reladdr_constant(struct brw_vs_compile *c, const struct prog_src_register *src = &inst->SrcReg[argIndex]; struct brw_compile *p = &c->func; struct brw_reg const_reg = c->current_const[argIndex].reg; - struct brw_reg const2_reg; struct brw_reg addrReg = c->regs[PROGRAM_ADDRESS][0]; + struct brw_reg byte_addr_reg = get_tmp(c); assert(argIndex < 3); @@ -917,37 +929,15 @@ get_reladdr_constant(struct brw_vs_compile *c, src->Index, argIndex, c->current_const[argIndex].reg.nr); #endif + brw_MUL(p, byte_addr_reg, addrReg, brw_imm_ud(16)); + /* fetch the first vec4 */ - brw_dp_READ_4_vs(p, - const_reg, /* writeback dest */ - 0, /* oword */ - 1, /* relative indexing? */ - addrReg, /* address register */ - 16 * src->Index, /* byte offset */ - SURF_INDEX_VERT_CONST_BUFFER /* binding table index */ - ); - /* second vec4 */ - const2_reg = get_tmp(c); - - /* use upper half of address reg for second read */ - addrReg = stride(addrReg, 0, 4, 0); - addrReg.subnr = 16; - - brw_dp_READ_4_vs(p, - const2_reg, /* writeback dest */ - 1, /* oword */ - 1, /* relative indexing? */ - addrReg, /* address register */ - 16 * src->Index, /* byte offset */ - SURF_INDEX_VERT_CONST_BUFFER - ); - - /* merge the two Owords into the constant register */ - /* const_reg[7..4] = const2_reg[7..4] */ - brw_MOV(p, - suboffset(stride(const_reg, 0, 4, 1), 4), - suboffset(stride(const2_reg, 0, 4, 1), 4)); - release_tmp(c, const2_reg); + brw_dp_READ_4_vs_relative(p, + const_reg, /* writeback dest */ + byte_addr_reg, /* address register */ + 16 * src->Index, /* byte offset */ + SURF_INDEX_VERT_CONST_BUFFER /* binding table index */ + ); return const_reg; } @@ -993,36 +983,71 @@ static struct brw_reg get_reg( struct brw_vs_compile *c, */ static struct brw_reg deref( struct brw_vs_compile *c, struct brw_reg arg, - GLint offset) + GLint offset, + GLuint reg_size ) { struct brw_compile *p = &c->func; - struct brw_reg tmp = vec4(get_tmp(c)); + struct brw_reg tmp = get_tmp(c); struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0]; - struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_UW); - GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * 16; + struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_D); + GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * reg_size; struct brw_reg indirect = brw_vec4_indirect(0,0); + struct brw_reg acc = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UW); + + /* Set the vertical stride on the register access so that the first + * 4 components come from a0.0 and the second 4 from a0.1. + */ + indirect.vstride = BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL; { brw_push_insn_state(p); brw_set_access_mode(p, BRW_ALIGN_1); - /* This is pretty clunky - load the address register twice and - * fetch each 4-dword value in turn. There must be a way to do - * this in a single pass, but I couldn't get it to work. - */ - brw_ADD(p, brw_address_reg(0), vp_address, brw_imm_d(byte_offset)); - brw_MOV(p, tmp, indirect); + brw_MUL(p, acc, vp_address, brw_imm_uw(reg_size)); + brw_ADD(p, brw_address_reg(0), acc, brw_imm_uw(byte_offset)); + + brw_MUL(p, acc, suboffset(vp_address, 4), brw_imm_uw(reg_size)); + brw_ADD(p, brw_address_reg(1), acc, brw_imm_uw(byte_offset)); - brw_ADD(p, brw_address_reg(0), suboffset(vp_address, 8), brw_imm_d(byte_offset)); - brw_MOV(p, suboffset(tmp, 4), indirect); + brw_MOV(p, tmp, indirect); brw_pop_insn_state(p); } - + /* NOTE: tmp not released */ - return vec8(tmp); + return tmp; } +static void +move_to_reladdr_dst(struct brw_vs_compile *c, + const struct prog_instruction *inst, + struct brw_reg val) +{ + struct brw_compile *p = &c->func; + int reg_size = 32; + struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0]; + struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_D); + struct brw_reg temp_base = c->regs[inst->DstReg.File][0]; + GLuint byte_offset = temp_base.nr * 32 + temp_base.subnr; + struct brw_reg indirect = brw_vec4_indirect(0,0); + struct brw_reg acc = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UW); + + byte_offset += inst->DstReg.Index * reg_size; + + brw_push_insn_state(p); + brw_set_access_mode(p, BRW_ALIGN_1); + + brw_MUL(p, acc, vp_address, brw_imm_uw(reg_size)); + brw_ADD(p, brw_address_reg(0), acc, brw_imm_uw(byte_offset)); + brw_MOV(p, indirect, val); + + brw_MUL(p, acc, suboffset(vp_address, 4), brw_imm_uw(reg_size)); + brw_ADD(p, brw_address_reg(0), acc, + brw_imm_uw(byte_offset + reg_size / 2)); + brw_MOV(p, indirect, suboffset(val, 4)); + + brw_pop_insn_state(p); +} /** * Get brw reg corresponding to the instruction's [argIndex] src reg. @@ -1091,7 +1116,7 @@ get_src_reg( struct brw_vs_compile *c, case PROGRAM_INPUT: case PROGRAM_OUTPUT: if (relAddr) { - return deref(c, c->regs[file][0], index); + return deref(c, c->regs[file][0], index, 32); } else { assert(c->regs[file][index].nr != 0); @@ -1113,7 +1138,7 @@ get_src_reg( struct brw_vs_compile *c, return get_constant(c, inst, argIndex); } else if (relAddr) { - return deref(c, c->regs[PROGRAM_STATE_VAR][0], index); + return deref(c, c->regs[PROGRAM_STATE_VAR][0], index, 16); } else { assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0); @@ -1134,26 +1159,6 @@ get_src_reg( struct brw_vs_compile *c, } } - -static void emit_arl( struct brw_vs_compile *c, - struct brw_reg dst, - struct brw_reg arg0 ) -{ - struct brw_compile *p = &c->func; - struct brw_reg tmp = dst; - GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE); - - if (need_tmp) - tmp = get_tmp(c); - - brw_RNDD(p, tmp, arg0); /* tmp = round(arg0) */ - brw_MUL(p, dst, tmp, brw_imm_d(16)); /* dst = tmp * 16 */ - - if (need_tmp) - release_tmp(c, tmp); -} - - /** * Return the brw reg for the given instruction's src argument. * Will return mangled results for SWZ op. The emit_swz() function @@ -1198,8 +1203,17 @@ static struct brw_reg get_dst( struct brw_vs_compile *c, switch (dst.File) { case PROGRAM_TEMPORARY: case PROGRAM_OUTPUT: - assert(c->regs[dst.File][dst.Index].nr != 0); - reg = c->regs[dst.File][dst.Index]; + /* register-indirect addressing is only 1x1, not VxH, for + * destination regs. So, for RelAddr we'll return a temporary + * for the dest and do a move of the result to the RelAddr + * register after the instruction emit. + */ + if (dst.RelAddr) { + reg = get_tmp(c); + } else { + assert(c->regs[dst.File][dst.Index].nr != 0); + reg = c->regs[dst.File][dst.Index]; + } break; case PROGRAM_ADDRESS: assert(dst.Index == 0); @@ -1298,7 +1312,6 @@ static void emit_vertex_write( struct brw_vs_compile *c) struct brw_compile *p = &c->func; struct brw_context *brw = p->brw; struct intel_context *intel = &brw->intel; - struct brw_reg m0 = brw_message_reg(0); struct brw_reg pos = c->regs[PROGRAM_OUTPUT][VERT_RESULT_HPOS]; struct brw_reg ndc; int eot; @@ -1381,16 +1394,19 @@ static void emit_vertex_write( struct brw_vs_compile *c) */ brw_set_access_mode(p, BRW_ALIGN_1); + /* The VUE layout is documented in Volume 2a. */ if (intel->gen >= 6) { - /* There are 16 DWs (D0-D15) in VUE header on Sandybridge: + /* There are 8 or 16 DWs (D0-D15) in VUE header on Sandybridge: * dword 0-3 (m1) of the header is indices, point width, clip flags. * dword 4-7 (m2) is the 4D space position - * dword 8-15 (m3,m4) of the vertex header is the user clip distance. - * m5 is the first vertex data we fill, which is the vertex position. + * dword 8-15 (m3,m4) of the vertex header is the user clip distance if + * enabled. We don't use it, so skip it. + * m3 is the first vertex element data we fill, which is the vertex + * position. */ - brw_MOV(p, offset(m0, 2), pos); - brw_MOV(p, offset(m0, 5), pos); - len_vertex_header = 4; + brw_MOV(p, brw_message_reg(2), pos); + brw_MOV(p, brw_message_reg(3), pos); + len_vertex_header = 2; } else if (intel->gen == 5) { /* There are 20 DWs (D0-D19) in VUE header on Ironlake: * dword 0-3 (m1) of the header is indices, point width, clip flags. @@ -1400,9 +1416,9 @@ static void emit_vertex_write( struct brw_vs_compile *c) * m6 is a pad so that the vertex element data is aligned * m7 is the first vertex data we fill, which is the vertex position. */ - brw_MOV(p, offset(m0, 2), ndc); - brw_MOV(p, offset(m0, 3), pos); - brw_MOV(p, offset(m0, 7), pos); + brw_MOV(p, brw_message_reg(2), ndc); + brw_MOV(p, brw_message_reg(3), pos); + brw_MOV(p, brw_message_reg(7), pos); len_vertex_header = 6; } else { /* There are 8 dwords in VUE header pre-Ironlake: @@ -1412,8 +1428,8 @@ static void emit_vertex_write( struct brw_vs_compile *c) * dword 8-11 (m3) is the first vertex data, which we always have be the * vertex position. */ - brw_MOV(p, offset(m0, 2), ndc); - brw_MOV(p, offset(m0, 3), pos); + brw_MOV(p, brw_message_reg(2), ndc); + brw_MOV(p, brw_message_reg(3), pos); len_vertex_header = 2; } @@ -1437,29 +1453,26 @@ static void emit_vertex_write( struct brw_vs_compile *c) * Move the overflowed attributes from the GRF to the MRF and * issue another brw_urb_WRITE(). */ - /* XXX I'm not 100% sure about which MRF regs to use here. Starting - * at mrf[4] atm... - */ - GLuint i, mrf = 0; + GLuint i, mrf = 1; for (i = c->first_overflow_output; i < VERT_RESULT_MAX; i++) { if (c->prog_data.outputs_written & BITFIELD64_BIT(i)) { /* move from GRF to MRF */ - brw_MOV(p, brw_message_reg(4+mrf), c->regs[PROGRAM_OUTPUT][i]); + brw_MOV(p, brw_message_reg(mrf), c->regs[PROGRAM_OUTPUT][i]); mrf++; } } brw_urb_WRITE(p, brw_null_reg(), /* dest */ - 4, /* starting mrf reg nr */ + 0, /* starting mrf reg nr */ c->r0, /* src */ 0, /* allocate */ 1, /* used */ - mrf+1, /* msg len */ + mrf, /* msg len */ 0, /* response len */ 1, /* eot */ 1, /* writes complete */ - BRW_MAX_MRF-1, /* urb destination offset */ + 14 / 2, /* urb destination offset */ BRW_URB_SWIZZLE_INTERLEAVE); } } @@ -1665,7 +1678,7 @@ void brw_vs_emit(struct brw_vs_compile *c ) emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL); break; case OPCODE_ARL: - emit_arl(c, dst, args[0]); + brw_RNDD(p, dst, args[0]); break; case OPCODE_FLR: brw_RNDD(p, dst, args[0]); @@ -1890,6 +1903,14 @@ void brw_vs_emit(struct brw_vs_compile *c ) } } + if (inst->DstReg.RelAddr && inst->DstReg.File == PROGRAM_TEMPORARY) { + /* We don't do RelAddr of PROGRAM_OUTPUT yet, because of the + * compute-to-mrf and the fact that we are allocating + * registers for only the used PROGRAM_OUTPUTs. + */ + move_to_reladdr_dst(c, inst, dst); + } + release_tmps(c); } diff --git a/src/mesa/drivers/dri/i965/brw_vs_surface_state.c b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c index be9e415cb0..0250a68d29 100644 --- a/src/mesa/drivers/dri/i965/brw_vs_surface_state.c +++ b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c @@ -31,7 +31,7 @@ #include "main/mtypes.h" #include "main/texstore.h" -#include "shader/prog_parameter.h" +#include "program/prog_parameter.h" #include "brw_context.h" #include "brw_state.h" diff --git a/src/mesa/drivers/dri/i965/brw_wm.h b/src/mesa/drivers/dri/i965/brw_wm.h index 197b875434..40f51c21c9 100644 --- a/src/mesa/drivers/dri/i965/brw_wm.h +++ b/src/mesa/drivers/dri/i965/brw_wm.h @@ -34,7 +34,7 @@ #define BRW_WM_H -#include "shader/prog_instruction.h" +#include "program/prog_instruction.h" #include "brw_context.h" #include "brw_eu.h" diff --git a/src/mesa/drivers/dri/i965/brw_wm_emit.c b/src/mesa/drivers/dri/i965/brw_wm_emit.c index a90a2d3cf2..0c625a4cd0 100644 --- a/src/mesa/drivers/dri/i965/brw_wm_emit.c +++ b/src/mesa/drivers/dri/i965/brw_wm_emit.c @@ -1326,7 +1326,7 @@ void emit_fb_write(struct brw_wm_compile *c, * + 1 for the second half we get destination + 4. */ brw_MOV(p, - brw_message_reg(nr + channel + (1 << 7)), + brw_message_reg(nr + channel + BRW_MRF_COMPR4), arg0[channel]); } else { /* mov (8) m2.0<1>:ud r28.0<8;8,1>:ud { Align1 } */ @@ -1763,12 +1763,20 @@ void brw_wm_emit( struct brw_wm_compile *c ) inst->dst[i]->spill_slot); } + /* Only properly tested on ILK */ + if (p->brw->intel.gen == 5) { + brw_remove_duplicate_mrf_moves(p); + if (c->dispatch_width == 16) + brw_remove_grf_to_mrf_moves(p); + } + if (INTEL_DEBUG & DEBUG_WM) { int i; - printf("wm-native:\n"); - for (i = 0; i < p->nr_insn; i++) + printf("wm-native:\n"); + for (i = 0; i < p->nr_insn; i++) brw_disasm(stderr, &p->store[i], p->brw->intel.gen); printf("\n"); } } + diff --git a/src/mesa/drivers/dri/i965/brw_wm_fp.c b/src/mesa/drivers/dri/i965/brw_wm_fp.c index d73c391582..0bef874b88 100644 --- a/src/mesa/drivers/dri/i965/brw_wm_fp.c +++ b/src/mesa/drivers/dri/i965/brw_wm_fp.c @@ -37,9 +37,9 @@ #include "brw_wm.h" #include "brw_util.h" -#include "shader/prog_parameter.h" -#include "shader/prog_print.h" -#include "shader/prog_statevars.h" +#include "program/prog_parameter.h" +#include "program/prog_print.h" +#include "program/prog_statevars.h" /** An invalid texture target */ diff --git a/src/mesa/drivers/dri/i965/brw_wm_glsl.c b/src/mesa/drivers/dri/i965/brw_wm_glsl.c index 57be08a8d1..2dd346d6dd 100644 --- a/src/mesa/drivers/dri/i965/brw_wm_glsl.c +++ b/src/mesa/drivers/dri/i965/brw_wm_glsl.c @@ -1,7 +1,7 @@ #include "main/macros.h" -#include "shader/prog_parameter.h" -#include "shader/prog_print.h" -#include "shader/prog_optimize.h" +#include "program/prog_parameter.h" +#include "program/prog_print.h" +#include "program/prog_optimize.h" #include "brw_context.h" #include "brw_eu.h" #include "brw_wm.h" diff --git a/src/mesa/drivers/dri/i965/brw_wm_pass0.c b/src/mesa/drivers/dri/i965/brw_wm_pass0.c index 60bd92ed22..05de85a957 100644 --- a/src/mesa/drivers/dri/i965/brw_wm_pass0.c +++ b/src/mesa/drivers/dri/i965/brw_wm_pass0.c @@ -32,7 +32,7 @@ #include "brw_context.h" #include "brw_wm.h" -#include "shader/prog_parameter.h" +#include "program/prog_parameter.h" diff --git a/src/mesa/drivers/dri/i965/brw_wm_state.c b/src/mesa/drivers/dri/i965/brw_wm_state.c index 1789b21451..c1cf4db1ca 100644 --- a/src/mesa/drivers/dri/i965/brw_wm_state.c +++ b/src/mesa/drivers/dri/i965/brw_wm_state.c @@ -222,7 +222,7 @@ wm_unit_create_from_key(struct brw_context *brw, struct brw_wm_unit_key *key, drm_intel_bo_emit_reloc(bo, offsetof(struct brw_wm_unit_state, thread2), brw->wm.scratch_bo, wm.thread2.per_thread_scratch_space, - 0, 0); + I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER); } /* Emit sampler state relocation */ diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c index 77898dbbe7..17b016b569 100644 --- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c +++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c @@ -32,7 +32,7 @@ #include "main/mtypes.h" #include "main/texstore.h" -#include "shader/prog_parameter.h" +#include "program/prog_parameter.h" #include "intel_mipmap_tree.h" #include "intel_batchbuffer.h" diff --git a/src/mesa/drivers/dri/i965/gen6_sf_state.c b/src/mesa/drivers/dri/i965/gen6_sf_state.c index 51940efb44..6820ca3abf 100644 --- a/src/mesa/drivers/dri/i965/gen6_sf_state.c +++ b/src/mesa/drivers/dri/i965/gen6_sf_state.c @@ -69,7 +69,7 @@ upload_sf_state(struct brw_context *brw) dw1 = num_outputs << GEN6_SF_NUM_OUTPUTS_SHIFT | (num_inputs + 1) / 2 << GEN6_SF_URB_ENTRY_READ_LENGTH_SHIFT | - 3 << GEN6_SF_URB_ENTRY_READ_OFFSET_SHIFT; + 1 << GEN6_SF_URB_ENTRY_READ_OFFSET_SHIFT; dw2 = GEN6_SF_VIEWPORT_TRANSFORM_ENABLE | GEN6_SF_STATISTICS_ENABLE; dw3 = 0; diff --git a/src/mesa/drivers/dri/i965/gen6_vs_state.c b/src/mesa/drivers/dri/i965/gen6_vs_state.c index 5916a13994..4080a9dedf 100644 --- a/src/mesa/drivers/dri/i965/gen6_vs_state.c +++ b/src/mesa/drivers/dri/i965/gen6_vs_state.c @@ -29,8 +29,8 @@ #include "brw_state.h" #include "brw_defines.h" #include "brw_util.h" -#include "shader/prog_parameter.h" -#include "shader/prog_statevars.h" +#include "program/prog_parameter.h" +#include "program/prog_statevars.h" #include "intel_batchbuffer.h" static void diff --git a/src/mesa/drivers/dri/i965/gen6_wm_state.c b/src/mesa/drivers/dri/i965/gen6_wm_state.c index ed1a72f03b..863c85449d 100644 --- a/src/mesa/drivers/dri/i965/gen6_wm_state.c +++ b/src/mesa/drivers/dri/i965/gen6_wm_state.c @@ -29,8 +29,8 @@ #include "brw_state.h" #include "brw_defines.h" #include "brw_util.h" -#include "shader/prog_parameter.h" -#include "shader/prog_statevars.h" +#include "program/prog_parameter.h" +#include "program/prog_statevars.h" #include "intel_batchbuffer.h" static void |