From 96bbc627f369c0100b950f81531b1fe9ef586c34 Mon Sep 17 00:00:00 2001 From: Christian König Date: Mon, 28 Feb 2011 02:00:01 +0100 Subject: r600g: implement instanced drawing support --- src/gallium/drivers/r600/eg_asm.c | 26 +-- src/gallium/drivers/r600/r600_asm.c | 230 ++++++++++++++++++--------- src/gallium/drivers/r600/r600_asm.h | 3 +- src/gallium/drivers/r600/r600_pipe.c | 2 +- src/gallium/drivers/r600/r600_shader.c | 31 +++- src/gallium/drivers/r600/r600_state_common.c | 2 +- 6 files changed, 191 insertions(+), 103 deletions(-) (limited to 'src/gallium') diff --git a/src/gallium/drivers/r600/eg_asm.c b/src/gallium/drivers/r600/eg_asm.c index 80c5de3975..8190df725d 100644 --- a/src/gallium/drivers/r600/eg_asm.c +++ b/src/gallium/drivers/r600/eg_asm.c @@ -94,31 +94,9 @@ int eg_bc_cf_build(struct r600_bc *bc, struct r600_bc_cf *cf) return 0; } -void eg_cf_vtx(struct r600_vertex_element *ve, u32 *bytecode, unsigned count) +void eg_cf_vtx(struct r600_vertex_element *ve) { - struct r600_pipe_state *rstate; - unsigned i = 0; - - if (count > 8) { - bytecode[i++] = S_SQ_CF_WORD0_ADDR(8 >> 1); - bytecode[i++] = S_SQ_CF_WORD1_CF_INST(EG_V_SQ_CF_WORD1_SQ_CF_INST_VTX) | - S_SQ_CF_WORD1_BARRIER(1) | - S_SQ_CF_WORD1_COUNT(8 - 1); - bytecode[i++] = S_SQ_CF_WORD0_ADDR(40 >> 1); - bytecode[i++] = S_SQ_CF_WORD1_CF_INST(EG_V_SQ_CF_WORD1_SQ_CF_INST_VTX) | - S_SQ_CF_WORD1_BARRIER(1) | - S_SQ_CF_WORD1_COUNT(count - 8 - 1); - } else { - bytecode[i++] = S_SQ_CF_WORD0_ADDR(8 >> 1); - bytecode[i++] = S_SQ_CF_WORD1_CF_INST(EG_V_SQ_CF_WORD1_SQ_CF_INST_VTX) | - S_SQ_CF_WORD1_BARRIER(1) | - S_SQ_CF_WORD1_COUNT(count - 1); - } - bytecode[i++] = S_SQ_CF_WORD0_ADDR(0); - bytecode[i++] = S_SQ_CF_WORD1_CF_INST(EG_V_SQ_CF_WORD1_SQ_CF_INST_RETURN) | - S_SQ_CF_WORD1_BARRIER(1); - - rstate = &ve->rstate; + struct r600_pipe_state *rstate = &ve->rstate; rstate->id = R600_PIPE_STATE_FETCH_SHADER; rstate->nregs = 0; r600_pipe_state_add_reg(rstate, R_0288A8_SQ_PGM_RESOURCES_FS, diff --git 
a/src/gallium/drivers/r600/r600_asm.c b/src/gallium/drivers/r600/r600_asm.c index de796188fd..5d59356bf7 100644 --- a/src/gallium/drivers/r600/r600_asm.c +++ b/src/gallium/drivers/r600/r600_asm.c @@ -83,6 +83,7 @@ static inline unsigned int r600_bc_get_num_operands(struct r600_bc *bc, struct r case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_CLAMPED: case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_IEEE: case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT: + case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT: case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN: case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS: return 1; @@ -1374,7 +1375,8 @@ static int r600_bc_vtx_build(struct r600_bc *bc, struct r600_bc_vtx *vtx, unsign S_SQ_VTX_WORD1_FORMAT_COMP_ALL(vtx->format_comp_all) | S_SQ_VTX_WORD1_SRF_MODE_ALL(vtx->srf_mode_all) | S_SQ_VTX_WORD1_GPR_DST_GPR(vtx->dst_gpr); - bc->bytecode[id++] = S_SQ_VTX_WORD2_MEGA_FETCH(1); + bc->bytecode[id++] = S_SQ_VTX_WORD2_OFFSET(vtx->offset) | + S_SQ_VTX_WORD2_MEGA_FETCH(1); bc->bytecode[id++] = 0; return 0; } @@ -1894,12 +1896,13 @@ void r600_bc_dump(struct r600_bc *bc) fprintf(stderr, "SEL_Z:%d ", vtx->dst_sel_z); fprintf(stderr, "SEL_W:%d) ", vtx->dst_sel_w); fprintf(stderr, "USE_CONST_FIELDS:%d ", vtx->use_const_fields); - fprintf(stderr, "DATA_FORMAT:%d ", vtx->data_format); - fprintf(stderr, "NUM_FORMAT_ALL:%d ", vtx->num_format_all); - fprintf(stderr, "FORMAT_COMP_ALL:%d ", vtx->format_comp_all); - fprintf(stderr, "SRF_MODE_ALL:%d\n", vtx->srf_mode_all); + fprintf(stderr, "FORMAT(DATA:%d ", vtx->data_format); + fprintf(stderr, "NUM:%d ", vtx->num_format_all); + fprintf(stderr, "COMP:%d ", vtx->format_comp_all); + fprintf(stderr, "MODE:%d)\n", vtx->srf_mode_all); id++; - fprintf(stderr, "%04d %08X \n", id, bc->bytecode[id]); + fprintf(stderr, "%04d %08X ", id, bc->bytecode[id]); + fprintf(stderr, "OFFSET:%d\n", vtx->offset); //TODO id++; fprintf(stderr, "%04d %08X \n", id, bc->bytecode[id]); @@ -1910,29 +1913,9 @@ void r600_bc_dump(struct r600_bc *bc) 
fprintf(stderr, "--------------------------------------\n"); } -static void r600_cf_vtx(struct r600_vertex_element *ve, u32 *bytecode, unsigned count) +static void r600_cf_vtx(struct r600_vertex_element *ve) { struct r600_pipe_state *rstate; - unsigned i = 0; - - if (count > 8) { - bytecode[i++] = S_SQ_CF_WORD0_ADDR(8 >> 1); - bytecode[i++] = S_SQ_CF_WORD1_CF_INST(V_SQ_CF_WORD1_SQ_CF_INST_VTX) | - S_SQ_CF_WORD1_BARRIER(1) | - S_SQ_CF_WORD1_COUNT(8 - 1); - bytecode[i++] = S_SQ_CF_WORD0_ADDR(40 >> 1); - bytecode[i++] = S_SQ_CF_WORD1_CF_INST(V_SQ_CF_WORD1_SQ_CF_INST_VTX) | - S_SQ_CF_WORD1_BARRIER(1) | - S_SQ_CF_WORD1_COUNT(count - 8 - 1); - } else { - bytecode[i++] = S_SQ_CF_WORD0_ADDR(8 >> 1); - bytecode[i++] = S_SQ_CF_WORD1_CF_INST(V_SQ_CF_WORD1_SQ_CF_INST_VTX) | - S_SQ_CF_WORD1_BARRIER(1) | - S_SQ_CF_WORD1_COUNT(count - 1); - } - bytecode[i++] = S_SQ_CF_WORD0_ADDR(0); - bytecode[i++] = S_SQ_CF_WORD1_CF_INST(V_SQ_CF_WORD1_SQ_CF_INST_RETURN) | - S_SQ_CF_WORD1_BARRIER(1); rstate = &ve->rstate; rstate->id = R600_PIPE_STATE_FETCH_SHADER; @@ -2078,37 +2061,19 @@ out_unknown: int r600_vertex_elements_build_fetch_shader(struct r600_pipe_context *rctx, struct r600_vertex_element *ve) { - unsigned ndw, i; - u32 *bytecode; - unsigned fetch_resource_start = 0, format, num_format, format_comp; + static int dump_shaders = -1; + + struct r600_bc bc; + struct r600_bc_vtx vtx; struct pipe_vertex_element *elements = ve->elements; const struct util_format_description *desc; - - /* 2 dwords for cf aligned to 4 + 4 dwords per input */ - ndw = 8 + ve->count * 4; - ve->fs_size = ndw * 4; - - /* use PIPE_BIND_VERTEX_BUFFER so we use the cache buffer manager */ - ve->fetch_shader = r600_bo(rctx->radeon, ndw*4, 256, PIPE_BIND_VERTEX_BUFFER, 0); - if (ve->fetch_shader == NULL) { - return -ENOMEM; - } - - bytecode = r600_bo_map(rctx->radeon, ve->fetch_shader, 0, NULL); - if (bytecode == NULL) { - r600_bo_reference(rctx->radeon, &ve->fetch_shader, NULL); - return -ENOMEM; - } - - if 
(rctx->family >= CHIP_CEDAR) { - eg_cf_vtx(ve, &bytecode[0], (ndw - 8) / 4); - } else { - r600_cf_vtx(ve, &bytecode[0], (ndw - 8) / 4); - fetch_resource_start = 160; - } + unsigned fetch_resource_start = rctx->family >= CHIP_CEDAR ? 0 : 160; + unsigned format, num_format, format_comp; + u32 *bytecode; + int i, r; /* vertex elements offset need special handling, if offset is bigger - * than what we can put in fetch instruction then we need to alterate + + * than what we can put in fetch instruction then we need to alterate * the vertex resource offset. In such case in order to simplify code * we will bound one resource per elements. It's a worst case scenario. */ @@ -2119,40 +2084,155 @@ int r600_vertex_elements_build_fetch_shader(struct r600_pipe_context *rctx, stru } } + memset(&bc, 0, sizeof(bc)); + r = r600_bc_init(&bc, r600_get_family(rctx->radeon)); + if (r) + return r; + + for (i = 0; i < ve->count; i++) { + if (elements[i].instance_divisor > 1) { + struct r600_bc_alu alu; + + memset(&alu, 0, sizeof(alu)); + alu.inst = BC_INST(&bc, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT); + alu.src[0].sel = 0; + alu.src[0].chan = 3; + + alu.dst.sel = i + 1; + alu.dst.chan = 3; + alu.dst.write = 1; + alu.last = 1; + + if ((r = r600_bc_add_alu(&bc, &alu))) { + r600_bc_clear(&bc); + return r; + } + + memset(&alu, 0, sizeof(alu)); + alu.inst = BC_INST(&bc, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL); + alu.src[0].sel = i + 1; + alu.src[0].chan = 3; + + alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; + alu.src[1].value = fui(1.0f / (float)elements[i].instance_divisor); + + alu.dst.sel = i + 1; + alu.dst.chan = 3; + alu.dst.write = 1; + alu.last = 1; + + if ((r = r600_bc_add_alu(&bc, &alu))) { + r600_bc_clear(&bc); + return r; + } + + memset(&alu, 0, sizeof(alu)); + alu.inst = BC_INST(&bc, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC); + alu.src[0].sel = i + 1; + alu.src[0].chan = 3; + + alu.dst.sel = i + 1; + alu.dst.chan = 3; + alu.dst.write = 1; + alu.last = 1; + + if ((r = r600_bc_add_alu(&bc, 
&alu))) { + r600_bc_clear(&bc); + return r; + } + + memset(&alu, 0, sizeof(alu)); + alu.inst = BC_INST(&bc, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT); + alu.src[0].sel = i + 1; + alu.src[0].chan = 3; + + alu.dst.sel = i + 1; + alu.dst.chan = 3; + alu.dst.write = 1; + alu.last = 1; + + if ((r = r600_bc_add_alu(&bc, &alu))) { + r600_bc_clear(&bc); + return r; + } + } + } + for (i = 0; i < ve->count; i++) { unsigned vbuffer_index; r600_vertex_data_type(ve->elements[i].src_format, &format, &num_format, &format_comp); desc = util_format_description(ve->elements[i].src_format); if (desc == NULL) { + r600_bc_clear(&bc); R600_ERR("unknown format %d\n", ve->elements[i].src_format); - r600_bo_reference(rctx->radeon, &ve->fetch_shader, NULL); return -EINVAL; } /* see above for vbuffer_need_offset explanation */ vbuffer_index = elements[i].vertex_buffer_index; - if (ve->vbuffer_need_offset) { - bytecode[8 + i * 4 + 0] = S_SQ_VTX_WORD0_BUFFER_ID(i + fetch_resource_start); - } else { - bytecode[8 + i * 4 + 0] = S_SQ_VTX_WORD0_BUFFER_ID(vbuffer_index + fetch_resource_start); + memset(&vtx, 0, sizeof(vtx)); + vtx.buffer_id = (ve->vbuffer_need_offset ? i : vbuffer_index) + fetch_resource_start; + vtx.fetch_type = elements[i].instance_divisor ? 1 : 0; + vtx.src_gpr = elements[i].instance_divisor > 1 ? i + 1 : 0; + vtx.src_sel_x = elements[i].instance_divisor ? 
3 : 0; + vtx.mega_fetch_count = 16; + vtx.dst_gpr = i + 1; + vtx.dst_sel_x = desc->swizzle[0]; + vtx.dst_sel_y = desc->swizzle[1]; + vtx.dst_sel_z = desc->swizzle[2]; + vtx.dst_sel_w = desc->swizzle[3]; + vtx.data_format = format; + vtx.num_format_all = num_format; + vtx.format_comp_all = format_comp; + vtx.srf_mode_all = 1; + vtx.offset = elements[i].src_offset; + + if ((r = r600_bc_add_vtx(&bc, &vtx))) { + r600_bc_clear(&bc); + return r; } - bytecode[8 + i * 4 + 0] |= S_SQ_VTX_WORD0_SRC_GPR(0) | - S_SQ_VTX_WORD0_SRC_SEL_X(0) | - S_SQ_VTX_WORD0_MEGA_FETCH_COUNT(0x1F); - bytecode[8 + i * 4 + 1] = S_SQ_VTX_WORD1_DST_SEL_X(desc->swizzle[0]) | - S_SQ_VTX_WORD1_DST_SEL_Y(desc->swizzle[1]) | - S_SQ_VTX_WORD1_DST_SEL_Z(desc->swizzle[2]) | - S_SQ_VTX_WORD1_DST_SEL_W(desc->swizzle[3]) | - S_SQ_VTX_WORD1_USE_CONST_FIELDS(0) | - S_SQ_VTX_WORD1_DATA_FORMAT(format) | - S_SQ_VTX_WORD1_NUM_FORMAT_ALL(num_format) | - S_SQ_VTX_WORD1_FORMAT_COMP_ALL(format_comp) | - S_SQ_VTX_WORD1_SRF_MODE_ALL(1) | - S_SQ_VTX_WORD1_GPR_DST_GPR(i + 1); - bytecode[8 + i * 4 + 2] = S_SQ_VTX_WORD2_OFFSET(elements[i].src_offset) | - S_SQ_VTX_WORD2_MEGA_FETCH(1); - bytecode[8 + i * 4 + 3] = 0; } + + r600_bc_add_cfinst(&bc, BC_INST(&bc, V_SQ_CF_WORD1_SQ_CF_INST_RETURN)); + + /* use PIPE_BIND_VERTEX_BUFFER so we use the cache buffer manager */ + ve->fetch_shader = r600_bo(rctx->radeon, bc.ndw*4, 256, PIPE_BIND_VERTEX_BUFFER, 0); + if (ve->fetch_shader == NULL) { + r600_bc_clear(&bc); + return -ENOMEM; + } + + ve->fs_size = bc.ndw*4; + if ((r = r600_bc_build(&bc))) { + r600_bc_clear(&bc); + return r; + } + + if (dump_shaders == -1) + dump_shaders = debug_get_bool_option("R600_DUMP_SHADERS", FALSE); + + if (dump_shaders) { + fprintf(stderr, "--------------------------------------------------------------\n"); + r600_bc_dump(&bc); + fprintf(stderr, "______________________________________________________________\n"); + } + + bytecode = r600_bo_map(rctx->radeon, ve->fetch_shader, 0, NULL); + if (bytecode == 
NULL) { + r600_bc_clear(&bc); + r600_bo_reference(rctx->radeon, &ve->fetch_shader, NULL); + return -ENOMEM; + } + + memcpy(bytecode, bc.bytecode, ve->fs_size); + r600_bo_unmap(rctx->radeon, ve->fetch_shader); + r600_bc_clear(&bc); + + if (rctx->family >= CHIP_CEDAR) + eg_cf_vtx(ve); + else + r600_cf_vtx(ve); + return 0; } diff --git a/src/gallium/drivers/r600/r600_asm.h b/src/gallium/drivers/r600/r600_asm.h index 921d0d9845..b22c21d1e2 100644 --- a/src/gallium/drivers/r600/r600_asm.h +++ b/src/gallium/drivers/r600/r600_asm.h @@ -103,6 +103,7 @@ struct r600_bc_vtx { unsigned num_format_all; unsigned format_comp_all; unsigned srf_mode_all; + unsigned offset; }; struct r600_bc_output { @@ -189,7 +190,7 @@ struct r600_bc { /* eg_asm.c */ int eg_bc_cf_build(struct r600_bc *bc, struct r600_bc_cf *cf); -void eg_cf_vtx(struct r600_vertex_element *ve, u32 *bytecode, unsigned count); +void eg_cf_vtx(struct r600_vertex_element *ve); /* r600_asm.c */ int r600_bc_init(struct r600_bc *bc, enum radeon_family family); diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c index 62d108f351..adcd74aec7 100644 --- a/src/gallium/drivers/r600/r600_pipe.c +++ b/src/gallium/drivers/r600/r600_pipe.c @@ -285,13 +285,13 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param) case PIPE_CAP_DEPTHSTENCIL_CLEAR_SEPARATE: case PIPE_CAP_DEPTH_CLAMP: case PIPE_CAP_SHADER_STENCIL_EXPORT: + case PIPE_CAP_INSTANCED_DRAWING: return 1; /* Unsupported features (boolean caps). 
*/ case PIPE_CAP_STREAM_OUTPUT: case PIPE_CAP_PRIMITIVE_RESTART: case PIPE_CAP_INDEP_BLEND_FUNC: /* FIXME allow this */ - case PIPE_CAP_INSTANCED_DRAWING: return 0; case PIPE_CAP_ARRAY_TEXTURES: diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c index 9fcb1d75f0..65923fb964 100644 --- a/src/gallium/drivers/r600/r600_shader.c +++ b/src/gallium/drivers/r600/r600_shader.c @@ -420,6 +420,7 @@ static int tgsi_declaration(struct r600_shader_ctx *ctx) { struct tgsi_full_declaration *d = &ctx->parse.FullToken.FullDeclaration; unsigned i; + int r; switch (d->Declaration.File) { case TGSI_FILE_INPUT: @@ -451,6 +452,26 @@ static int tgsi_declaration(struct r600_shader_ctx *ctx) case TGSI_FILE_SAMPLER: case TGSI_FILE_ADDRESS: break; + + case TGSI_FILE_SYSTEM_VALUE: + if (d->Semantic.Name == TGSI_SEMANTIC_INSTANCEID) { + struct r600_bc_alu alu; + memset(&alu, 0, sizeof(struct r600_bc_alu)); + + alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT); + alu.src[0].sel = 0; + alu.src[0].chan = 3; + + alu.dst.sel = 0; + alu.dst.chan = 3; + alu.dst.write = 1; + alu.last = 1; + + if ((r = r600_bc_add_alu(ctx->bc, &alu))) + return r; + break; + } + default: R600_ERR("unsupported file %d declaration\n", d->Declaration.File); return -EINVAL; @@ -521,6 +542,7 @@ static void tgsi_src(struct r600_shader_ctx *ctx, r600_src->swizzle[3] = tgsi_src->Register.SwizzleW; r600_src->neg = tgsi_src->Register.Negate; r600_src->abs = tgsi_src->Register.Absolute; + if (tgsi_src->Register.File == TGSI_FILE_IMMEDIATE) { int index; if ((tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleY) && @@ -535,7 +557,14 @@ static void tgsi_src(struct r600_shader_ctx *ctx, index = tgsi_src->Register.Index; r600_src->sel = V_SQ_ALU_SRC_LITERAL; memcpy(r600_src->value, ctx->literals + index * 4, sizeof(r600_src->value)); - } else { + } else if (tgsi_src->Register.File == TGSI_FILE_SYSTEM_VALUE) { + /* assume we want TGSI_SEMANTIC_INSTANCEID here */ + 
r600_src->swizzle[0] = 3; + r600_src->swizzle[1] = 3; + r600_src->swizzle[2] = 3; + r600_src->swizzle[3] = 3; + r600_src->sel = 0; + } else { if (tgsi_src->Register.Indirect) r600_src->rel = V_SQ_REL_RELATIVE; r600_src->sel = tgsi_src->Register.Index; diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c index 72707fbd8b..677e220934 100644 --- a/src/gallium/drivers/r600/r600_state_common.c +++ b/src/gallium/drivers/r600/r600_state_common.c @@ -520,7 +520,7 @@ void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) r600_context_pipe_state_set(&rctx->ctx, &vgt); rdraw.vgt_num_indices = draw.info.count; - rdraw.vgt_num_instances = 1; + rdraw.vgt_num_instances = draw.info.instance_count; rdraw.vgt_index_type = vgt_dma_index_type; rdraw.vgt_draw_initiator = vgt_draw_initiator; rdraw.indices = NULL; -- cgit v1.2.3