summaryrefslogtreecommitdiff
path: root/src/mesa/drivers/dri/i965/brw_vs_emit.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/mesa/drivers/dri/i965/brw_vs_emit.c')
-rw-r--r--src/mesa/drivers/dri/i965/brw_vs_emit.c666
1 files changed, 531 insertions, 135 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_vs_emit.c b/src/mesa/drivers/dri/i965/brw_vs_emit.c
index 174331a765..1638ef8111 100644
--- a/src/mesa/drivers/dri/i965/brw_vs_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_vs_emit.c
@@ -38,18 +38,55 @@
#include "brw_vs.h"
+static struct brw_reg get_tmp( struct brw_vs_compile *c )
+{
+ struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0);
+
+ if (++c->last_tmp > c->prog_data.total_grf)
+ c->prog_data.total_grf = c->last_tmp;
+
+ return tmp;
+}
+
+static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
+{
+ if (tmp.nr == c->last_tmp-1)
+ c->last_tmp--;
+}
+
+static void release_tmps( struct brw_vs_compile *c )
+{
+ c->last_tmp = c->first_tmp;
+}
+
-/* Do things as simply as possible. Allocate and populate all regs
+/**
+ * Preallocate GRF register before code emit.
+ * Do things as simply as possible. Allocate and populate all regs
* ahead of time.
*/
static void brw_vs_alloc_regs( struct brw_vs_compile *c )
{
GLuint i, reg = 0, mrf;
- GLuint nr_params;
+ int attributes_in_vue;
+
+ /* Determine whether to use a real constant buffer or use a block
+ * of GRF registers for constants. The later is faster but only
+ * works if everything fits in the GRF.
+ * XXX this heuristic/check may need some fine tuning...
+ */
+ if (c->vp->program.Base.Parameters->NumParameters +
+ c->vp->program.Base.NumTemporaries + 20 > BRW_MAX_GRF)
+ c->vp->use_const_buffer = GL_TRUE;
+ else
+ c->vp->use_const_buffer = GL_FALSE;
+
+ /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/
/* r0 -- reserved as usual
*/
- c->r0 = brw_vec8_grf(reg, 0); reg++;
+ c->r0 = brw_vec8_grf(reg, 0);
+ reg++;
/* User clip planes from curbe:
*/
@@ -60,39 +97,59 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
/* Deal with curbe alignment:
*/
- reg += ((6+c->key.nr_userclip+3)/4)*2;
+ reg += ((6 + c->key.nr_userclip + 3) / 4) * 2;
}
/* Vertex program parameters from curbe:
*/
- nr_params = c->vp->program.Base.Parameters->NumParameters;
- for (i = 0; i < nr_params; i++) {
- c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2, (i%2) * 4), 0, 4, 1);
- }
- reg += (nr_params+1)/2;
+ if (c->vp->use_const_buffer) {
+ /* get constants from a real constant buffer */
+ c->prog_data.curb_read_length = 0;
+ c->prog_data.nr_params = 4; /* XXX 0 causes a bug elsewhere... */
+ }
+ else {
+ /* use a section of the GRF for constants */
+ GLuint nr_params = c->vp->program.Base.Parameters->NumParameters;
+ for (i = 0; i < nr_params; i++) {
+ c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2, (i%2) * 4), 0, 4, 1);
+ }
+ reg += (nr_params + 1) / 2;
+ c->prog_data.curb_read_length = reg - 1;
- c->prog_data.curb_read_length = reg - 1;
+ c->prog_data.nr_params = nr_params * 4;
+ }
/* Allocate input regs:
*/
c->nr_inputs = 0;
for (i = 0; i < VERT_ATTRIB_MAX; i++) {
- if (c->prog_data.inputs_read & (1<<i)) {
+ if (c->prog_data.inputs_read & (1 << i)) {
c->nr_inputs++;
c->regs[PROGRAM_INPUT][i] = brw_vec8_grf(reg, 0);
reg++;
}
}
+ /* If there are no inputs, we'll still be reading one attribute's worth
+ * because it's required -- see urb_read_length setting.
+ */
+ if (c->nr_inputs == 0)
+ reg++;
- /* Allocate outputs: TODO: could organize the non-position outputs
- * to go straight into message regs.
+ /* Allocate outputs. The non-position outputs go straight into message regs.
*/
c->nr_outputs = 0;
c->first_output = reg;
- mrf = 4;
+ c->first_overflow_output = 0;
+
+ if (BRW_IS_IGDNG(c->func.brw))
+ mrf = 8;
+ else
+ mrf = 4;
+
for (i = 0; i < VERT_RESULT_MAX; i++) {
- if (c->prog_data.outputs_written & (1<<i)) {
+ if (c->prog_data.outputs_written & (1 << i)) {
c->nr_outputs++;
+ assert(i < Elements(c->regs[PROGRAM_OUTPUT]));
if (i == VERT_RESULT_HPOS) {
c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
reg++;
@@ -103,8 +160,17 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
mrf++; /* just a placeholder? XXX fix later stages & remove this */
}
else {
- c->regs[PROGRAM_OUTPUT][i] = brw_message_reg(mrf);
- mrf++;
+ if (mrf < 16) {
+ c->regs[PROGRAM_OUTPUT][i] = brw_message_reg(mrf);
+ mrf++;
+ }
+ else {
+ /* too many vertex results to fit in MRF, use GRF for overflow */
+ if (!c->first_overflow_output)
+ c->first_overflow_output = i;
+ c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
+ reg++;
+ }
}
}
}
@@ -132,17 +198,24 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
reg++;
}
+ if (c->vp->use_const_buffer) {
+ for (i = 0; i < 3; i++) {
+ c->current_const[i].index = -1;
+ c->current_const[i].reg = brw_vec8_grf(reg, 0);
+ reg++;
+ }
+ }
+
for (i = 0; i < 128; i++) {
- if (c->output_regs[i].used_in_src) {
- c->output_regs[i].reg = brw_vec8_grf(reg, 0);
- reg++;
- }
+ if (c->output_regs[i].used_in_src) {
+ c->output_regs[i].reg = brw_vec8_grf(reg, 0);
+ reg++;
+ }
}
c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0);
reg += 2;
-
-
+
/* Some opcodes need an internal temporary:
*/
c->first_tmp = reg;
@@ -152,35 +225,38 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
* urb_read_length is the number of registers read from *each*
* vertex urb, so is half the amount:
*/
- c->prog_data.urb_read_length = (c->nr_inputs+1)/2;
-
- c->prog_data.urb_entry_size = (c->nr_outputs+2+3)/4;
- c->prog_data.total_grf = reg;
-}
-
+ c->prog_data.urb_read_length = (c->nr_inputs + 1) / 2;
+ /* Setting this field to 0 leads to undefined behavior according to the
+ * the VS_STATE docs. Our VUEs will always have at least one attribute
+ * sitting in them, even if it's padding.
+ */
+ if (c->prog_data.urb_read_length == 0)
+ c->prog_data.urb_read_length = 1;
-static struct brw_reg get_tmp( struct brw_vs_compile *c )
-{
- struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0);
+ /* The VS VUEs are shared by VF (outputting our inputs) and VS, so size
+ * them to fit the biggest thing they need to.
+ */
+ attributes_in_vue = MAX2(c->nr_outputs, c->nr_inputs);
- if (++c->last_tmp > c->prog_data.total_grf)
- c->prog_data.total_grf = c->last_tmp;
+ if (BRW_IS_IGDNG(c->func.brw))
+ c->prog_data.urb_entry_size = (attributes_in_vue + 6 + 3) / 4;
+ else
+ c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 3) / 4;
- return tmp;
-}
+ c->prog_data.total_grf = reg;
-static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
-{
- if (tmp.nr == c->last_tmp-1)
- c->last_tmp--;
-}
-
-static void release_tmps( struct brw_vs_compile *c )
-{
- c->last_tmp = c->first_tmp;
+ if (INTEL_DEBUG & DEBUG_VS) {
+ _mesa_printf("%s NumAddrRegs %d\n", __FUNCTION__, c->vp->program.Base.NumAddressRegs);
+ _mesa_printf("%s NumTemps %d\n", __FUNCTION__, c->vp->program.Base.NumTemporaries);
+ _mesa_printf("%s reg = %d\n", __FUNCTION__, reg);
+ }
}
+/**
+ * If an instruction uses a temp reg both as a src and the dest, we
+ * sometimes need to allocate an intermediate temporary.
+ */
static void unalias1( struct brw_vs_compile *c,
struct brw_reg dst,
struct brw_reg arg0,
@@ -200,6 +276,10 @@ static void unalias1( struct brw_vs_compile *c,
}
}
+/**
+ * \sa unalias2
+ * Checkes if 2-operand instruction needs an intermediate temporary.
+ */
static void unalias2( struct brw_vs_compile *c,
struct brw_reg dst,
struct brw_reg arg0,
@@ -222,6 +302,10 @@ static void unalias2( struct brw_vs_compile *c,
}
}
+/**
+ * \sa unalias2
+ * Checkes if 3-operand instruction needs an intermediate temporary.
+ */
static void unalias3( struct brw_vs_compile *c,
struct brw_reg dst,
struct brw_reg arg0,
@@ -615,6 +699,8 @@ static void emit_lit_noalias( struct brw_vs_compile *c,
}
brw_ENDIF(p, if_insn);
+
+ release_tmp(c, tmp);
}
static void emit_lrp_noalias(struct brw_vs_compile *c,
@@ -655,13 +741,84 @@ static void emit_nrm( struct brw_vs_compile *c,
}
+static struct brw_reg
+get_constant(struct brw_vs_compile *c,
+ const struct prog_instruction *inst,
+ GLuint argIndex)
+{
+ const struct prog_src_register *src = &inst->SrcReg[argIndex];
+ struct brw_compile *p = &c->func;
+ struct brw_reg const_reg;
+ struct brw_reg const2_reg;
+ const GLboolean relAddr = src->RelAddr;
+
+ assert(argIndex < 3);
+
+ if (c->current_const[argIndex].index != src->Index || relAddr) {
+ struct brw_reg addrReg = c->regs[PROGRAM_ADDRESS][0];
+
+ c->current_const[argIndex].index = src->Index;
+
+#if 0
+ printf(" fetch const[%d] for arg %d into reg %d\n",
+ src->Index, argIndex, c->current_const[argIndex].reg.nr);
+#endif
+ /* need to fetch the constant now */
+ brw_dp_READ_4_vs(p,
+ c->current_const[argIndex].reg,/* writeback dest */
+ 0, /* oword */
+ relAddr, /* relative indexing? */
+ addrReg, /* address register */
+ 16 * src->Index, /* byte offset */
+ SURF_INDEX_VERT_CONST_BUFFER /* binding table index */
+ );
+
+ if (relAddr) {
+ /* second read */
+ const2_reg = get_tmp(c);
+
+ /* use upper half of address reg for second read */
+ addrReg = stride(addrReg, 0, 4, 0);
+ addrReg.subnr = 16;
+
+ brw_dp_READ_4_vs(p,
+ const2_reg, /* writeback dest */
+ 1, /* oword */
+ relAddr, /* relative indexing? */
+ addrReg, /* address register */
+ 16 * src->Index, /* byte offset */
+ SURF_INDEX_VERT_CONST_BUFFER
+ );
+ }
+ }
+
+ const_reg = c->current_const[argIndex].reg;
+
+ if (relAddr) {
+ /* merge the two Owords into the constant register */
+ /* const_reg[7..4] = const2_reg[7..4] */
+ brw_MOV(p,
+ suboffset(stride(const_reg, 0, 4, 1), 4),
+ suboffset(stride(const2_reg, 0, 4, 1), 4));
+ release_tmp(c, const2_reg);
+ }
+ else {
+ /* replicate lower four floats into upper half (to get XYZWXYZW) */
+ const_reg = stride(const_reg, 0, 4, 0);
+ const_reg.subnr = 0;
+ }
+
+ return const_reg;
+}
+
+
+
/* TODO: relative addressing!
*/
static struct brw_reg get_reg( struct brw_vs_compile *c,
- GLuint file,
+ gl_register_file file,
GLuint index )
{
-
switch (file) {
case PROGRAM_TEMPORARY:
case PROGRAM_INPUT:
@@ -690,13 +847,17 @@ static struct brw_reg get_reg( struct brw_vs_compile *c,
}
+/**
+ * Indirect addressing: get reg[[arg] + offset].
+ */
static struct brw_reg deref( struct brw_vs_compile *c,
struct brw_reg arg,
GLint offset)
{
struct brw_compile *p = &c->func;
struct brw_reg tmp = vec4(get_tmp(c));
- struct brw_reg vp_address = retype(vec1(get_reg(c, PROGRAM_ADDRESS, 0)), BRW_REGISTER_TYPE_UW);
+ struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
+ struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_UW);
GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * 16;
struct brw_reg indirect = brw_vec4_indirect(0,0);
@@ -717,10 +878,67 @@ static struct brw_reg deref( struct brw_vs_compile *c,
brw_pop_insn_state(p);
}
+ /* NOTE: tmp not released */
return vec8(tmp);
}
+/**
+ * Get brw reg corresponding to the instruction's [argIndex] src reg.
+ * TODO: relative addressing!
+ */
+static struct brw_reg
+get_src_reg( struct brw_vs_compile *c,
+ const struct prog_instruction *inst,
+ GLuint argIndex )
+{
+ const GLuint file = inst->SrcReg[argIndex].File;
+ const GLint index = inst->SrcReg[argIndex].Index;
+ const GLboolean relAddr = inst->SrcReg[argIndex].RelAddr;
+
+ switch (file) {
+ case PROGRAM_TEMPORARY:
+ case PROGRAM_INPUT:
+ case PROGRAM_OUTPUT:
+ if (relAddr) {
+ return deref(c, c->regs[file][0], index);
+ }
+ else {
+ assert(c->regs[file][index].nr != 0);
+ return c->regs[file][index];
+ }
+
+ case PROGRAM_STATE_VAR:
+ case PROGRAM_CONSTANT:
+ case PROGRAM_UNIFORM:
+ case PROGRAM_ENV_PARAM:
+ if (c->vp->use_const_buffer) {
+ return get_constant(c, inst, argIndex);
+ }
+ else if (relAddr) {
+ return deref(c, c->regs[PROGRAM_STATE_VAR][0], index);
+ }
+ else {
+ assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
+ return c->regs[PROGRAM_STATE_VAR][index];
+ }
+ case PROGRAM_ADDRESS:
+ assert(index == 0);
+ return c->regs[file][index];
+
+ case PROGRAM_UNDEFINED:
+ /* this is a normal case since we loop over all three src args */
+ return brw_null_reg();
+
+ case PROGRAM_LOCAL_PARAM:
+ case PROGRAM_WRITE_ONLY:
+ default:
+ assert(0);
+ return brw_null_reg();
+ }
+}
+
+
static void emit_arl( struct brw_vs_compile *c,
struct brw_reg dst,
struct brw_reg arg0 )
@@ -732,30 +950,31 @@ static void emit_arl( struct brw_vs_compile *c,
if (need_tmp)
tmp = get_tmp(c);
- brw_RNDD(p, tmp, arg0);
- brw_MUL(p, dst, tmp, brw_imm_d(16));
+ brw_RNDD(p, tmp, arg0); /* tmp = round(arg0) */
+ brw_MUL(p, dst, tmp, brw_imm_d(16)); /* dst = tmp * 16 */
if (need_tmp)
release_tmp(c, tmp);
}
-/* Will return mangled results for SWZ op. The emit_swz() function
+/**
+ * Return the brw reg for the given instruction's src argument.
+ * Will return mangled results for SWZ op. The emit_swz() function
* ignores this result and recalculates taking extended swizzles into
* account.
*/
static struct brw_reg get_arg( struct brw_vs_compile *c,
- struct prog_src_register *src )
+ const struct prog_instruction *inst,
+ GLuint argIndex )
{
+ const struct prog_src_register *src = &inst->SrcReg[argIndex];
struct brw_reg reg;
if (src->File == PROGRAM_UNDEFINED)
return brw_null_reg();
- if (src->RelAddr)
- reg = deref(c, c->regs[PROGRAM_STATE_VAR][0], src->Index);
- else
- reg = get_reg(c, src->File, src->Index);
+ reg = get_src_reg(c, inst, argIndex);
/* Convert 3-bit swizzle to 2-bit.
*/
@@ -766,16 +985,38 @@ static struct brw_reg get_arg( struct brw_vs_compile *c,
/* Note this is ok for non-swizzle instructions:
*/
- reg.negate = src->NegateBase ? 1 : 0;
+ reg.negate = src->Negate ? 1 : 0;
return reg;
}
+/**
+ * Get brw register for the given program dest register.
+ */
static struct brw_reg get_dst( struct brw_vs_compile *c,
struct prog_dst_register dst )
{
- struct brw_reg reg = get_reg(c, dst.File, dst.Index);
+ struct brw_reg reg;
+
+ switch (dst.File) {
+ case PROGRAM_TEMPORARY:
+ case PROGRAM_OUTPUT:
+ assert(c->regs[dst.File][dst.Index].nr != 0);
+ reg = c->regs[dst.File][dst.Index];
+ break;
+ case PROGRAM_ADDRESS:
+ assert(dst.Index == 0);
+ reg = c->regs[dst.File][dst.Index];
+ break;
+ case PROGRAM_UNDEFINED:
+ /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
+ reg = brw_null_reg();
+ break;
+ default:
+ assert(0);
+ reg = brw_null_reg();
+ }
reg.dw1.bits.writemask = dst.WriteMask;
@@ -785,14 +1026,16 @@ static struct brw_reg get_dst( struct brw_vs_compile *c,
static void emit_swz( struct brw_vs_compile *c,
struct brw_reg dst,
- struct prog_src_register src )
+ const struct prog_instruction *inst)
{
+ const GLuint argIndex = 0;
+ const struct prog_src_register src = inst->SrcReg[argIndex];
struct brw_compile *p = &c->func;
GLuint zeros_mask = 0;
GLuint ones_mask = 0;
GLuint src_mask = 0;
GLubyte src_swz[4];
- GLboolean need_tmp = (src.NegateBase &&
+ GLboolean need_tmp = (src.Negate &&
dst.file != BRW_GENERAL_REGISTER_FILE);
struct brw_reg tmp = dst;
GLuint i;
@@ -826,10 +1069,7 @@ static void emit_swz( struct brw_vs_compile *c,
if (src_mask) {
struct brw_reg arg0;
- if (src.RelAddr)
- arg0 = deref(c, c->regs[PROGRAM_STATE_VAR][0], src.Index);
- else
- arg0 = get_reg(c, src.File, src.Index);
+ arg0 = get_src_reg(c, inst, argIndex);
arg0 = brw_swizzle(arg0,
src_swz[0], src_swz[1],
@@ -844,8 +1084,8 @@ static void emit_swz( struct brw_vs_compile *c,
if (ones_mask)
brw_MOV(p, brw_writemask(tmp, ones_mask), brw_imm_f(1));
- if (src.NegateBase)
- brw_MOV(p, brw_writemask(tmp, src.NegateBase), negate(tmp));
+ if (src.Negate)
+ brw_MOV(p, brw_writemask(tmp, src.Negate), negate(tmp));
if (need_tmp) {
brw_MOV(p, dst, tmp);
@@ -863,6 +1103,8 @@ static void emit_vertex_write( struct brw_vs_compile *c)
struct brw_reg m0 = brw_message_reg(0);
struct brw_reg pos = c->regs[PROGRAM_OUTPUT][VERT_RESULT_HPOS];
struct brw_reg ndc;
+ int eot;
+ GLuint len_vertext_header = 2;
if (c->key.copy_edgeflag) {
brw_MOV(p,
@@ -871,21 +1113,17 @@ static void emit_vertex_write( struct brw_vs_compile *c)
}
/* Build ndc coords */
- if (!c->key.know_w_is_one) {
- ndc = get_tmp(c);
- emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
- brw_MUL(p, brw_writemask(ndc, WRITEMASK_XYZ), pos, ndc);
- }
- else {
- ndc = pos;
- }
+ ndc = get_tmp(c);
+ /* ndc = 1.0 / pos.w */
+ emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
+ /* ndc.xyz = pos * ndc */
+ brw_MUL(p, brw_writemask(ndc, WRITEMASK_XYZ), pos, ndc);
/* Update the header for point size, user clipping flags, and -ve rhw
* workaround.
*/
if ((c->prog_data.outputs_written & (1<<VERT_RESULT_PSIZ)) ||
- c->key.nr_userclip ||
- (!BRW_IS_G4X(p->brw) && !c->key.know_w_is_one))
+ c->key.nr_userclip || BRW_IS_965(p->brw))
{
struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
GLuint i;
@@ -916,7 +1154,7 @@ static void emit_vertex_write( struct brw_vs_compile *c)
* Later, clipping will detect ucp[6] and ensure the primitive is
* clipped against all fixed planes.
*/
- if (!BRW_IS_G4X(p->brw) && !c->key.know_w_is_one) {
+ if (BRW_IS_965(p->brw)) {
brw_CMP(p,
vec8(brw_null_reg()),
BRW_CONDITIONAL_L,
@@ -943,7 +1181,23 @@ static void emit_vertex_write( struct brw_vs_compile *c)
*/
brw_set_access_mode(p, BRW_ALIGN_1);
brw_MOV(p, offset(m0, 2), ndc);
- brw_MOV(p, offset(m0, 3), pos);
+
+ if (BRW_IS_IGDNG(p->brw)) {
+ /* There are 20 DWs (D0-D19) in VUE vertex header on IGDNG */
+ brw_MOV(p, offset(m0, 3), pos); /* a portion of vertex header */
+ /* m4, m5 contain the distances from vertex to the user clip planeXXX.
+ * Seems it is useless for us.
+ * m6 is used for aligning, so that the remainder of vertex element is
+ * reg-aligned.
+ */
+ brw_MOV(p, offset(m0, 7), pos); /* the remainder of vertex element */
+ len_vertext_header = 6;
+ } else {
+ brw_MOV(p, offset(m0, 3), pos);
+ len_vertext_header = 2;
+ }
+
+ eot = (c->first_overflow_output == 0);
brw_urb_WRITE(p,
brw_null_reg(), /* dest */
@@ -951,62 +1205,126 @@ static void emit_vertex_write( struct brw_vs_compile *c)
c->r0, /* src */
0, /* allocate */
1, /* used */
- c->nr_outputs + 3, /* msg len */
+ MIN2(c->nr_outputs + 1 + len_vertext_header, (BRW_MAX_MRF-1)), /* msg len */
0, /* response len */
- 1, /* eot */
- 1, /* writes complete */
+ eot, /* eot */
+ eot, /* writes complete */
0, /* urb destination offset */
BRW_URB_SWIZZLE_INTERLEAVE);
+
+ if (c->first_overflow_output > 0) {
+ /* Not all of the vertex outputs/results fit into the MRF.
+ * Move the overflowed attributes from the GRF to the MRF and
+ * issue another brw_urb_WRITE().
+ */
+ /* XXX I'm not 100% sure about which MRF regs to use here. Starting
+ * at mrf[4] atm...
+ */
+ GLuint i, mrf = 0;
+ for (i = c->first_overflow_output; i < VERT_RESULT_MAX; i++) {
+ if (c->prog_data.outputs_written & (1 << i)) {
+ /* move from GRF to MRF */
+ brw_MOV(p, brw_message_reg(4+mrf), c->regs[PROGRAM_OUTPUT][i]);
+ mrf++;
+ }
+ }
+
+ brw_urb_WRITE(p,
+ brw_null_reg(), /* dest */
+ 4, /* starting mrf reg nr */
+ c->r0, /* src */
+ 0, /* allocate */
+ 1, /* used */
+ mrf+1, /* msg len */
+ 0, /* response len */
+ 1, /* eot */
+ 1, /* writes complete */
+ BRW_MAX_MRF-1, /* urb destination offset */
+ BRW_URB_SWIZZLE_INTERLEAVE);
+ }
}
+/**
+ * Called after code generation to resolve subroutine calls and the
+ * END instruction.
+ * \param end_inst points to brw code for END instruction
+ * \param last_inst points to last instruction emitted before vertex write
+ */
static void
-post_vs_emit( struct brw_vs_compile *c, struct brw_instruction *end_inst )
+post_vs_emit( struct brw_vs_compile *c,
+ struct brw_instruction *end_inst,
+ struct brw_instruction *last_inst )
{
- GLuint nr_insns = c->vp->program.Base.NumInstructions;
- GLuint insn, target_insn;
- struct prog_instruction *inst1, *inst2;
- struct brw_instruction *brw_inst1, *brw_inst2;
- int offset;
- for (insn = 0; insn < nr_insns; insn++) {
- inst1 = &c->vp->program.Base.Instructions[insn];
- brw_inst1 = inst1->Data;
- switch (inst1->Opcode) {
- case OPCODE_CAL:
- case OPCODE_BRA:
- target_insn = inst1->BranchTarget;
- inst2 = &c->vp->program.Base.Instructions[target_insn];
- brw_inst2 = inst2->Data;
- offset = brw_inst2 - brw_inst1;
- brw_set_src1(brw_inst1, brw_imm_d(offset*16));
- break;
- case OPCODE_END:
- offset = end_inst - brw_inst1;
- brw_set_src1(brw_inst1, brw_imm_d(offset*16));
- break;
- default:
- break;
- }
+ GLint offset;
+
+ brw_resolve_cals(&c->func);
+
+ /* patch up the END code to jump past subroutines, etc */
+ offset = last_inst - end_inst;
+ if (offset > 1) {
+ brw_set_src1(end_inst, brw_imm_d(offset * 16));
+ } else {
+ end_inst->header.opcode = BRW_OPCODE_NOP;
+ }
+}
+
+static uint32_t
+get_predicate(const struct prog_instruction *inst)
+{
+ if (inst->DstReg.CondMask == COND_TR)
+ return BRW_PREDICATE_NONE;
+
+ /* All of GLSL only produces predicates for COND_NE and one channel per
+ * vector. Fail badly if someone starts doing something else, as it might
+ * mean infinite looping or something.
+ *
+ * We'd like to support all the condition codes, but our hardware doesn't
+ * quite match the Mesa IR, which is modeled after the NV extensions. For
+ * those, the instruction may update the condition codes or not, then any
+ * later instruction may use one of those condition codes. For gen4, the
+ * instruction may update the flags register based on one of the condition
+ * codes output by the instruction, and then further instructions may
+ * predicate on that. We can probably support this, but it won't
+ * necessarily be easy.
+ */
+ assert(inst->DstReg.CondMask == COND_NE);
+
+ switch (inst->DstReg.CondSwizzle) {
+ case SWIZZLE_XXXX:
+ return BRW_PREDICATE_ALIGN16_REPLICATE_X;
+ case SWIZZLE_YYYY:
+ return BRW_PREDICATE_ALIGN16_REPLICATE_Y;
+ case SWIZZLE_ZZZZ:
+ return BRW_PREDICATE_ALIGN16_REPLICATE_Z;
+ case SWIZZLE_WWWW:
+ return BRW_PREDICATE_ALIGN16_REPLICATE_W;
+ default:
+ _mesa_problem(NULL, "Unexpected predicate: 0x%08x\n",
+ inst->DstReg.CondMask);
+ return BRW_PREDICATE_NORMAL;
}
}
-/* Emit the fragment program instructions here.
+/* Emit the vertex program instructions here.
*/
void brw_vs_emit(struct brw_vs_compile *c )
{
-#define MAX_IFSN 32
+#define MAX_IF_DEPTH 32
+#define MAX_LOOP_DEPTH 32
struct brw_compile *p = &c->func;
- GLuint nr_insns = c->vp->program.Base.NumInstructions;
- GLuint insn, if_insn = 0;
- struct brw_instruction *end_inst;
- struct brw_instruction *if_inst[MAX_IFSN];
- struct brw_indirect stack_index = brw_indirect(0, 0);
-
+ struct brw_context *brw = p->brw;
+ const GLuint nr_insns = c->vp->program.Base.NumInstructions;
+ GLuint insn, if_depth = 0, loop_depth = 0;
+ GLuint end_offset = 0;
+ struct brw_instruction *end_inst, *last_inst;
+ struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH];
+ const struct brw_indirect stack_index = brw_indirect(0, 0);
GLuint index;
GLuint file;
if (INTEL_DEBUG & DEBUG_VS) {
- _mesa_printf("vs-emit:\n");
+ _mesa_printf("vs-mesa:\n");
_mesa_print_program(&c->vp->program.Base);
_mesa_printf("\n");
}
@@ -1035,22 +1353,26 @@ void brw_vs_emit(struct brw_vs_compile *c )
for (insn = 0; insn < nr_insns; insn++) {
- struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
+ const struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
struct brw_reg args[3], dst;
GLuint i;
+#if 0
+ printf("%d: ", insn);
+ _mesa_print_instruction(inst);
+#endif
+
/* Get argument regs. SWZ is special and does this itself.
*/
- inst->Data = &p->store[p->nr_insn];
if (inst->Opcode != OPCODE_SWZ)
for (i = 0; i < 3; i++) {
- struct prog_src_register *src = &inst->SrcReg[i];
+ const struct prog_src_register *src = &inst->SrcReg[i];
index = src->Index;
file = src->File;
- if (file == PROGRAM_OUTPUT&&c->output_regs[index].used_in_src)
+ if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
args[i] = c->output_regs[index].reg;
else
- args[i] = get_arg(c, src);
+ args[i] = get_arg(c, inst, i);
}
/* Get dest regs. Note that it is possible for a reg to be both
@@ -1178,7 +1500,7 @@ void brw_vs_emit(struct brw_vs_compile *c )
/* The args[0] value can't be used here as it won't have
* correctly encoded the full swizzle:
*/
- emit_swz(c, dst, inst->SrcReg[0] );
+ emit_swz(c, dst, inst);
break;
case OPCODE_TRUNC:
/* round toward zero */
@@ -1188,20 +1510,61 @@ void brw_vs_emit(struct brw_vs_compile *c )
emit_xpd(p, dst, args[0], args[1]);
break;
case OPCODE_IF:
- assert(if_insn < MAX_IFSN);
- if_inst[if_insn++] = brw_IF(p, BRW_EXECUTE_8);
+ assert(if_depth < MAX_IF_DEPTH);
+ if_inst[if_depth] = brw_IF(p, BRW_EXECUTE_8);
+ /* Note that brw_IF smashes the predicate_control field. */
+ if_inst[if_depth]->header.predicate_control = get_predicate(inst);
+ if_depth++;
break;
case OPCODE_ELSE:
- if_inst[if_insn-1] = brw_ELSE(p, if_inst[if_insn-1]);
+ if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
break;
case OPCODE_ENDIF:
- assert(if_insn > 0);
- brw_ENDIF(p, if_inst[--if_insn]);
+ assert(if_depth > 0);
+ brw_ENDIF(p, if_inst[--if_depth]);
break;
+ case OPCODE_BGNLOOP:
+ loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
+ break;
+ case OPCODE_BRK:
+ brw_set_predicate_control(p, get_predicate(inst));
+ brw_BREAK(p);
+ brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+ break;
+ case OPCODE_CONT:
+ brw_set_predicate_control(p, get_predicate(inst));
+ brw_CONT(p);
+ brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+ break;
+ case OPCODE_ENDLOOP:
+ {
+ struct brw_instruction *inst0, *inst1;
+ GLuint br = 1;
+
+ loop_depth--;
+
+ if (BRW_IS_IGDNG(brw))
+ br = 2;
+
+ inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
+ /* patch all the BREAK/CONT instructions from last BEGINLOOP */
+ while (inst0 > loop_inst[loop_depth]) {
+ inst0--;
+ if (inst0->header.opcode == BRW_OPCODE_BREAK) {
+ inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
+ inst0->bits3.if_else.pop_count = 0;
+ }
+ else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) {
+ inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
+ inst0->bits3.if_else.pop_count = 0;
+ }
+ }
+ }
+ break;
case OPCODE_BRA:
- brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
+ brw_set_predicate_control(p, get_predicate(inst));
brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
- brw_set_predicate_control_flag_value(p, 0xff);
+ brw_set_predicate_control(p, BRW_PREDICATE_NONE);
break;
case OPCODE_CAL:
brw_set_access_mode(p, BRW_ALIGN_1);
@@ -1209,7 +1572,7 @@ void brw_vs_emit(struct brw_vs_compile *c )
brw_set_access_mode(p, BRW_ALIGN_16);
brw_ADD(p, get_addr_reg(stack_index),
get_addr_reg(stack_index), brw_imm_d(4));
- inst->Data = &p->store[p->nr_insn];
+ brw_save_call(p, inst->Comment, p->nr_insn);
brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
break;
case OPCODE_RET:
@@ -1218,14 +1581,23 @@ void brw_vs_emit(struct brw_vs_compile *c )
brw_set_access_mode(p, BRW_ALIGN_1);
brw_MOV(p, brw_ip_reg(), deref_1d(stack_index, 0));
brw_set_access_mode(p, BRW_ALIGN_16);
+ break;
case OPCODE_END:
+ end_offset = p->nr_insn;
+ /* this instruction will get patched later to jump past subroutine
+ * code, etc.
+ */
brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
break;
case OPCODE_PRINT:
+ /* no-op */
+ break;
case OPCODE_BGNSUB:
+ brw_save_label(p, inst->Comment, p->nr_insn);
+ break;
case OPCODE_ENDSUB:
- /* no-op instructions */
- break;
+ /* no-op */
+ break;
default:
_mesa_problem(NULL, "Unsupported opcode %i (%s) in vertex shader",
inst->Opcode, inst->Opcode < MAX_OPCODE ?
@@ -1233,6 +1605,19 @@ void brw_vs_emit(struct brw_vs_compile *c )
"unknown");
}
+ /* Set the predication update on the last instruction of the native
+ * instruction sequence.
+ *
+ * This would be problematic if it was set on a math instruction,
+ * but that shouldn't be the case with the current GLSL compiler.
+ */
+ if (inst->CondUpdate) {
+ struct brw_instruction *hw_insn = &p->store[p->nr_insn - 1];
+
+ assert(hw_insn->header.destreg__conditionalmod == 0);
+ hw_insn->header.destreg__conditionalmod = BRW_CONDITIONAL_NZ;
+ }
+
if ((inst->DstReg.File == PROGRAM_OUTPUT)
&& (inst->DstReg.Index != VERT_RESULT_HPOS)
&& c->output_regs[inst->DstReg.Index].used_in_src) {
@@ -1263,9 +1648,20 @@ void brw_vs_emit(struct brw_vs_compile *c )
release_tmps(c);
}
- end_inst = &p->store[p->nr_insn];
+ end_inst = &p->store[end_offset];
+ last_inst = &p->store[p->nr_insn];
+
+ /* The END instruction will be patched to jump to this code */
emit_vertex_write(c);
- post_vs_emit(c, end_inst);
- for (insn = 0; insn < nr_insns; insn++)
- c->vp->program.Base.Instructions[insn].Data = NULL;
+
+ post_vs_emit(c, end_inst, last_inst);
+
+ if (INTEL_DEBUG & DEBUG_VS) {
+ int i;
+
+ _mesa_printf("vs-native:\n");
+ for (i = 0; i < p->nr_insn; i++)
+ brw_disasm(stderr, &p->store[i]);
+ _mesa_printf("\n");
+ }
}