diff options
Diffstat (limited to 'src/gallium/drivers/cell/ppu/cell_gen_fp.c')
-rw-r--r-- | src/gallium/drivers/cell/ppu/cell_gen_fp.c | 704 |
1 files changed, 485 insertions, 219 deletions
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c index 8d2d4f2a0f..3dfd5f673d 100644 --- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c +++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c @@ -77,13 +77,15 @@ struct codegen /** Per-instruction temps / intermediate temps */ int num_itemps; - int itemps[10]; + int itemps[12]; /** Current IF/ELSE/ENDIF nesting level */ int if_nesting; /** Index of execution mask register */ int exec_mask_reg; + int frame_size; /**< Stack frame size, in words */ + struct spe_function *f; boolean error; }; @@ -165,6 +167,37 @@ get_exec_mask_reg(struct codegen *gen) } +static boolean +is_register_src(struct codegen *gen, int channel, + const struct tgsi_full_src_register *src) +{ + int swizzle = tgsi_util_get_full_src_register_extswizzle(src, channel); + int sign_op = tgsi_util_get_full_src_register_sign_mode(src, channel); + + if (swizzle > TGSI_SWIZZLE_W || sign_op != TGSI_UTIL_SIGN_KEEP) { + return FALSE; + } + if (src->SrcRegister.File == TGSI_FILE_TEMPORARY || + src->SrcRegister.File == TGSI_FILE_IMMEDIATE) { + return TRUE; + } + return FALSE; +} + + +static boolean +is_memory_dst(struct codegen *gen, int channel, + const struct tgsi_full_dst_register *dst) +{ + if (dst->DstRegister.File == TGSI_FILE_OUTPUT) { + return TRUE; + } + else { + return FALSE; + } +} + + /** * Return the index of the SPU temporary containing the named TGSI * source register. If the TGSI register is a TGSI_FILE_TEMPORARY we @@ -185,41 +218,48 @@ get_src_reg(struct codegen *gen, assert(swizzle >= TGSI_SWIZZLE_X); assert(swizzle <= TGSI_EXTSWIZZLE_ONE); - switch (src->SrcRegister.File) { - case TGSI_FILE_TEMPORARY: - reg = gen->temp_regs[src->SrcRegister.Index][swizzle]; - break; - case TGSI_FILE_INPUT: - { - if(swizzle == TGSI_EXTSWIZZLE_ONE) - { - /* Load const one float and early out */ - reg = get_const_one_reg(gen); - } - else if(swizzle == TGSI_EXTSWIZZLE_ZERO) + if (swizzle == TGSI_EXTSWIZZLE_ONE) { + /* Load const one float and early out */ + reg = get_const_one_reg(gen); + } + else if (swizzle == TGSI_EXTSWIZZLE_ZERO) { + /* Load const zero float and early out */ + reg = get_itemp(gen); + spe_xor(gen->f, reg, reg, reg); + } + else { + assert(swizzle < 4); + + switch (src->SrcRegister.File) { + case TGSI_FILE_TEMPORARY: + reg = gen->temp_regs[src->SrcRegister.Index][swizzle]; + break; + case TGSI_FILE_INPUT: { - /* Load const zero float and early out */ + /* offset is measured in quadwords, not bytes */ + int offset = src->SrcRegister.Index * 4 + swizzle; reg = get_itemp(gen); - spe_xor(gen->f, reg, reg, reg); + reg_is_itemp = TRUE; + /* Load: reg = memory[(machine_reg) + offset] */ + spe_lqd(gen->f, reg, gen->inputs_reg, offset * 16); } - else + break; + case TGSI_FILE_IMMEDIATE: + reg = gen->imm_regs[src->SrcRegister.Index][swizzle]; + break; + case TGSI_FILE_CONSTANT: { /* offset is measured in quadwords, not bytes */ int offset = src->SrcRegister.Index * 4 + swizzle; reg = get_itemp(gen); reg_is_itemp = TRUE; /* Load: reg = memory[(machine_reg) + offset] */ - spe_lqd(gen->f, reg, gen->inputs_reg, offset); + spe_lqd(gen->f, reg, gen->constants_reg, offset * 16); } + break; + default: + assert(0); } - break; - case TGSI_FILE_IMMEDIATE: - reg = gen->imm_regs[src->SrcRegister.Index][swizzle]; - break; - case TGSI_FILE_CONSTANT: - /* xxx fall-through for now / fix */ - default: - assert(0); } /* @@ -243,7 +283,7 @@ get_src_reg(struct codegen *gen, } /* mask with bit 31 set, the rest cleared */ - spe_load_int(gen->f, bit31mask_reg, (1 << 31)); + spe_load_uint(gen->f, bit31mask_reg, (1 << 31)); if (sign_op == TGSI_UTIL_SIGN_CLEAR) { spe_andc(gen->f, result_reg, reg, bit31mask_reg); @@ -318,6 +358,7 @@ store_dest_reg(struct codegen *gen, } else { /* we're not inside a condition or loop: do nothing special */ + } break; case TGSI_FILE_OUTPUT: @@ -330,17 +371,17 @@ store_dest_reg(struct codegen *gen, /* First read the current value from memory: * Load: curval = memory[(machine_reg) + offset] */ - spe_lqd(gen->f, curval_reg, gen->outputs_reg, offset); + spe_lqd(gen->f, curval_reg, gen->outputs_reg, offset * 16); /* Mix curval with newvalue according to exec mask: * d[i] = mask_reg[i] ? value_reg : d_reg */ spe_selb(gen->f, curval_reg, curval_reg, value_reg, exec_reg); /* Store: memory[(machine_reg) + offset] = curval */ - spe_stqd(gen->f, curval_reg, gen->outputs_reg, offset); + spe_stqd(gen->f, curval_reg, gen->outputs_reg, offset * 16); } else { /* Store: memory[(machine_reg) + offset] = reg */ - spe_stqd(gen->f, value_reg, gen->outputs_reg, offset); + spe_stqd(gen->f, value_reg, gen->outputs_reg, offset * 16); } } break; @@ -350,18 +391,95 @@ store_dest_reg(struct codegen *gen, } + +static void +emit_prologue(struct codegen *gen) +{ + gen->frame_size = 1024; /* XXX temporary, should be dynamic */ + + spe_comment(gen->f, -4, "Function prologue:"); + + /* save $lr on stack # stqd $lr,16($sp) */ + spe_stqd(gen->f, SPE_REG_RA, SPE_REG_SP, 16); + + if (gen->frame_size >= 512) { + /* offset is too large for ai instruction */ + int offset_reg = spe_allocate_available_register(gen->f); + int sp_reg = spe_allocate_available_register(gen->f); + /* offset = -framesize */ + spe_load_int(gen->f, offset_reg, -gen->frame_size); + /* sp = $sp */ + spe_move(gen->f, sp_reg, SPE_REG_SP); + /* $sp = $sp + offset_reg */ + spe_a(gen->f, SPE_REG_SP, SPE_REG_SP, offset_reg); + /* save $sp in stack frame */ + spe_stqd(gen->f, sp_reg, SPE_REG_SP, 0); + /* clean up */ + spe_release_register(gen->f, offset_reg); + spe_release_register(gen->f, sp_reg); + } + else { + /* save stack pointer # stqd $sp,-frameSize($sp) */ + spe_stqd(gen->f, SPE_REG_SP, SPE_REG_SP, -gen->frame_size); + + /* adjust stack pointer # ai $sp,$sp,-frameSize */ + spe_ai(gen->f, SPE_REG_SP, SPE_REG_SP, -gen->frame_size); + } +} + + +static void +emit_epilogue(struct codegen *gen) +{ + spe_comment(gen->f, -4, "Function epilogue:"); + + if (gen->frame_size >= 512) { + /* offset is too large for ai instruction */ + int offset_reg = spe_allocate_available_register(gen->f); + /* offset = framesize */ + spe_load_int(gen->f, offset_reg, gen->frame_size); + /* $sp = $sp + offset */ + spe_a(gen->f, SPE_REG_SP, SPE_REG_SP, offset_reg); + /* clean up */ + spe_release_register(gen->f, offset_reg); + } + else { + /* restore stack pointer # ai $sp,$sp,frameSize */ + spe_ai(gen->f, SPE_REG_SP, SPE_REG_SP, gen->frame_size); + } + + /* restore $lr # lqd $lr,16($sp) */ + spe_lqd(gen->f, SPE_REG_RA, SPE_REG_SP, 16); + + /* return from function call */ + spe_bi(gen->f, SPE_REG_RA, 0, 0); +} + + static boolean emit_MOV(struct codegen *gen, const struct tgsi_full_instruction *inst) { - int ch; + int ch, src_reg[4], dst_reg[4]; + spe_comment(gen->f, -4, "MOV:"); for (ch = 0; ch < 4; ch++) { if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { - int src_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); - int dst_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); - /* XXX we don't always need to actually emit a mov instruction here */ - spe_move(gen->f, dst_reg, src_reg); - store_dest_reg(gen, dst_reg, ch, &inst->FullDstRegisters[0]); + src_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + dst_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + } + } + + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + if (is_register_src(gen, ch, &inst->FullSrcRegisters[0]) && + is_memory_dst(gen, ch, &inst->FullDstRegisters[0])) { + /* special-case: register to memory store */ + store_dest_reg(gen, src_reg[ch], ch, &inst->FullDstRegisters[0]); + } + else { + spe_move(gen->f, dst_reg[ch], src_reg[ch]); + store_dest_reg(gen, dst_reg[ch], ch, &inst->FullDstRegisters[0]); + } free_itemps(gen); } } @@ -376,22 +494,25 @@ emit_MOV(struct codegen *gen, const struct tgsi_full_instruction *inst) static boolean emit_ADD(struct codegen *gen, const struct tgsi_full_instruction *inst) { - int ch; + int ch, s1_reg[4], s2_reg[4], d_reg[4]; + spe_comment(gen->f, -4, "ADD:"); - /* Loop over Red/Green/Blue/Alpha channels */ + /* Loop over Red/Green/Blue/Alpha channels, fetch src operands */ for (ch = 0; ch < 4; ch++) { /* If the dest R, G, B or A writemask is enabled... */ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { - /* get indexes of the two src, one dest SPE registers */ - int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); - int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); - int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); - + s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + } + } + /* Loop over Red/Green/Blue/Alpha channels, do the add, store results */ + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { /* Emit actual SPE instruction: d = s1 + s2 */ - spe_fa(gen->f, d_reg, s1_reg, s2_reg); - + spe_fa(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]); /* Store the result (a no-op for TGSI_FILE_TEMPORARY dests) */ - store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]); /* Free any intermediate temps we allocated */ free_itemps(gen); } @@ -405,23 +526,20 @@ emit_ADD(struct codegen *gen, const struct tgsi_full_instruction *inst) static boolean emit_SUB(struct codegen *gen, const struct tgsi_full_instruction *inst) { - int ch; + int ch, s1_reg[4], s2_reg[4], d_reg[4]; spe_comment(gen->f, -4, "SUB:"); - /* Loop over Red/Green/Blue/Alpha channels */ for (ch = 0; ch < 4; ch++) { - /* If the dest R, G, B or A writemask is enabled... */ if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { - /* get indexes of the two src, one dest SPE registers */ - int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); - int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); - int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); - - /* Emit actual SPE instruction: d = s1 - s2 */ - spe_fs(gen->f, d_reg, s1_reg, s2_reg); - - /* Store the result (a no-op for TGSI_FILE_TEMPORARY dests) */ - store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); - /* Free any intermediate temps we allocated */ + s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + } + } + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + /* d = s1 - s2 */ + spe_fs(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]); + store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]); free_itemps(gen); } } @@ -434,17 +552,21 @@ emit_SUB(struct codegen *gen, const struct tgsi_full_instruction *inst) static boolean emit_MAD(struct codegen *gen, const struct tgsi_full_instruction *inst) { - int ch; + int ch, s1_reg[4], s2_reg[4], s3_reg[4], d_reg[4]; spe_comment(gen->f, -4, "MAD:"); for (ch = 0; ch < 4; ch++) { if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { - int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); - int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); - int s3_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]); - int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + s3_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]); + d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + } + } + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { /* d = s1 * s2 + s3 */ - spe_fma(gen->f, d_reg, s1_reg, s2_reg, s3_reg); - store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + spe_fma(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch], s3_reg[ch]); + store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]); free_itemps(gen); } } @@ -458,21 +580,37 @@ emit_MAD(struct codegen *gen, const struct tgsi_full_instruction *inst) static boolean emit_LERP(struct codegen *gen, const struct tgsi_full_instruction *inst) { - int ch; + int ch, s1_reg[4], s2_reg[4], s3_reg[4], d_reg[4], tmp_reg[4]; spe_comment(gen->f, -4, "LERP:"); + /* setup/get src/dst/temp regs */ for (ch = 0; ch < 4; ch++) { if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { - int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); - int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); - int s3_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]); - int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); - /* d = s3 + s1(s2 - s3) */ - spe_fs(gen->f, d_reg, s2_reg, s3_reg); - spe_fma(gen->f, d_reg, d_reg, s1_reg, s3_reg); - store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); - free_itemps(gen); + s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + s3_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]); + d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + tmp_reg[ch] = get_itemp(gen); + } + } + + /* d = s3 + s1(s2 - s3) */ + /* do all subtracts, then all fma, then all stores to better pipeline */ + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + spe_fs(gen->f, tmp_reg[ch], s2_reg[ch], s3_reg[ch]); + } + } + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + spe_fma(gen->f, d_reg[ch], tmp_reg[ch], s1_reg[ch], s3_reg[ch]); } } + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]); + } + } + free_itemps(gen); return true; } @@ -482,16 +620,20 @@ emit_LERP(struct codegen *gen, const struct tgsi_full_instruction *inst) static boolean emit_MUL(struct codegen *gen, const struct tgsi_full_instruction *inst) { - int ch; + int ch, s1_reg[4], s2_reg[4], d_reg[4]; spe_comment(gen->f, -4, "MUL:"); for (ch = 0; ch < 4; ch++) { if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { - int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); - int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); - int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + } + } + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { /* d = s1 * s2 */ - spe_fm(gen->f, d_reg, s1_reg, s2_reg); - store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + spe_fm(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]); + store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]); free_itemps(gen); } } @@ -557,7 +699,7 @@ emit_ABS(struct codegen *gen, const struct tgsi_full_instruction *inst) const int bit31mask_reg = get_itemp(gen); /* mask with bit 31 set, the rest cleared */ - spe_load_int(gen->f, bit31mask_reg, (1 << 31)); + spe_load_uint(gen->f, bit31mask_reg, (1 << 31)); /* d = sign bit cleared in s1 */ spe_andc(gen->f, d_reg, s1_reg, bit31mask_reg); @@ -576,27 +718,36 @@ static boolean emit_DP3(struct codegen *gen, const struct tgsi_full_instruction *inst) { int ch; + int s1x_reg, s1y_reg, s1z_reg; + int s2x_reg, s2y_reg, s2z_reg; + int t0_reg = get_itemp(gen), t1_reg = get_itemp(gen); + spe_comment(gen->f, -4, "DP3:"); - int s1_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]); - int s2_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]); - int tmp_reg = get_itemp(gen); - /* t = x0 * x1 */ - spe_fm(gen->f, tmp_reg, s1_reg, s2_reg); + s1x_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]); + s2x_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]); + s1y_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]); + s2y_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]); + s1z_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]); + s2z_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]); - s1_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]); - s2_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]); - /* t = y0 * y1 + t */ - spe_fma(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg); + /* t0 = x0 * x1 */ + spe_fm(gen->f, t0_reg, s1x_reg, s2x_reg); - s1_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]); - s2_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]); - /* t = z0 * z1 + t */ - spe_fma(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg); + /* t1 = y0 * y1 */ + spe_fm(gen->f, t1_reg, s1y_reg, s2y_reg); + + /* t0 = z0 * z1 + t0 */ + spe_fma(gen->f, t0_reg, s1z_reg, s2z_reg, t0_reg); + + /* t0 = t0 + t1 */ + spe_fa(gen->f, t0_reg, t0_reg, t1_reg); for (ch = 0; ch < 4; ch++) { if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { - store_dest_reg(gen, tmp_reg, ch, &inst->FullDstRegisters[0]); + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + spe_move(gen->f, d_reg, t0_reg); + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); } } @@ -611,32 +762,41 @@ static boolean emit_DP4(struct codegen *gen, const struct tgsi_full_instruction *inst) { int ch; + int s0x_reg, s0y_reg, s0z_reg, s0w_reg; + int s1x_reg, s1y_reg, s1z_reg, s1w_reg; + int t0_reg = get_itemp(gen), t1_reg = get_itemp(gen); + spe_comment(gen->f, -4, "DP4:"); - int s1_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]); - int s2_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]); - int tmp_reg = get_itemp(gen); - /* t = x0 * x1 */ - spe_fm(gen->f, tmp_reg, s1_reg, s2_reg); + s0x_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]); + s1x_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]); + s0y_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]); + s1y_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]); + s0z_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]); + s1z_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]); + s0w_reg = get_src_reg(gen, CHAN_W, &inst->FullSrcRegisters[0]); + s1w_reg = get_src_reg(gen, CHAN_W, &inst->FullSrcRegisters[1]); - s1_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]); - s2_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]); - /* t = y0 * y1 + t */ - spe_fma(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg); + /* t0 = x0 * x1 */ + spe_fm(gen->f, t0_reg, s0x_reg, s1x_reg); - s1_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]); - s2_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]); - /* t = z0 * z1 + t */ - spe_fma(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg); + /* t1 = y0 * y1 */ + spe_fm(gen->f, t1_reg, s0y_reg, s1y_reg); - s1_reg = get_src_reg(gen, CHAN_W, &inst->FullSrcRegisters[0]); - s2_reg = get_src_reg(gen, CHAN_W, &inst->FullSrcRegisters[1]); - /* t = w0 * w1 + t */ - spe_fma(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg); + /* t0 = z0 * z1 + t0 */ + spe_fma(gen->f, t0_reg, s0z_reg, s1z_reg, t0_reg); + + /* t1 = w0 * w1 + t1 */ + spe_fma(gen->f, t1_reg, s0w_reg, s1w_reg, t1_reg); + + /* t0 = t0 + t1 */ + spe_fa(gen->f, t0_reg, t0_reg, t1_reg); for (ch = 0; ch < 4; ch++) { if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { - store_dest_reg(gen, tmp_reg, ch, &inst->FullDstRegisters[0]); + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + spe_move(gen->f, d_reg, t0_reg); + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); } } @@ -650,6 +810,7 @@ emit_DP4(struct codegen *gen, const struct tgsi_full_instruction *inst) static boolean emit_DPH(struct codegen *gen, const struct tgsi_full_instruction *inst) { + /* XXX rewrite this function to look more like DP3/DP4 */ int ch; spe_comment(gen->f, -4, "DPH:"); @@ -676,6 +837,8 @@ emit_DPH(struct codegen *gen, const struct tgsi_full_instruction *inst) for (ch = 0; ch < 4; ch++) { if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + spe_move(gen->f, d_reg, tmp_reg); store_dest_reg(gen, tmp_reg, ch, &inst->FullDstRegisters[0]); } } @@ -1016,15 +1179,15 @@ emit_FLR(struct codegen *gen, const struct tgsi_full_instruction *inst) int tmp_reg = get_itemp(gen); /* If negative, subtract 1.0 */ - spe_fcgt(gen->f, d_reg, zero_reg, s1_reg); - spe_selb(gen->f, tmp_reg, zero_reg, get_const_one_reg(gen), d_reg); - spe_fs(gen->f, d_reg, s1_reg, tmp_reg); + spe_fcgt(gen->f, tmp_reg, zero_reg, s1_reg); + spe_selb(gen->f, tmp_reg, zero_reg, get_const_one_reg(gen), tmp_reg); + spe_fs(gen->f, tmp_reg, s1_reg, tmp_reg); /* Convert float to int */ - spe_cflts(gen->f, d_reg, d_reg, 0); + spe_cflts(gen->f, tmp_reg, tmp_reg, 0); /* Convert int to float */ - spe_csflt(gen->f, d_reg, d_reg, 0); + spe_csflt(gen->f, d_reg, tmp_reg, 0); store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); free_itemps(gen); @@ -1035,15 +1198,14 @@ emit_FLR(struct codegen *gen, const struct tgsi_full_instruction *inst) } /** - * Emit frac. - * Input - FLR(Input) + * Compute frac = Input - FLR(Input) */ static boolean emit_FRC(struct codegen *gen, const struct tgsi_full_instruction *inst) { int ch; - spe_comment(gen->f, -4, "FLR:"); + spe_comment(gen->f, -4, "FRC:"); int zero_reg = get_itemp(gen); spe_xor(gen->f, zero_reg, zero_reg, zero_reg); @@ -1055,18 +1217,18 @@ emit_FRC(struct codegen *gen, const struct tgsi_full_instruction *inst) int tmp_reg = get_itemp(gen); /* If negative, subtract 1.0 */ - spe_fcgt(gen->f, d_reg, zero_reg, s1_reg); - spe_selb(gen->f, tmp_reg, zero_reg, get_const_one_reg(gen), d_reg); - spe_fs(gen->f, d_reg, s1_reg, tmp_reg); + spe_fcgt(gen->f, tmp_reg, zero_reg, s1_reg); + spe_selb(gen->f, tmp_reg, zero_reg, get_const_one_reg(gen), tmp_reg); + spe_fs(gen->f, tmp_reg, s1_reg, tmp_reg); /* Convert float to int */ - spe_cflts(gen->f, d_reg, d_reg, 0); + spe_cflts(gen->f, tmp_reg, tmp_reg, 0); /* Convert int to float */ - spe_csflt(gen->f, d_reg, d_reg, 0); + spe_csflt(gen->f, tmp_reg, tmp_reg, 0); /* d = s1 - FLR(s1) */ - spe_fs(gen->f, d_reg, s1_reg, d_reg); + spe_fs(gen->f, d_reg, s1_reg, tmp_reg); store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); free_itemps(gen); @@ -1091,6 +1253,21 @@ print_functions(struct cell_context *cell) #endif +static uint +lookup_function(struct cell_context *cell, const char *funcname) +{ + const struct cell_spu_function_info *funcs = &cell->spu_functions; + uint i, addr = 0; + for (i = 0; i < funcs->num; i++) { + if (strcmp(funcs->names[i], funcname) == 0) { + addr = funcs->addrs[i]; + } + } + assert(addr && "spu function not found"); + return addr / 4; /* discard 2 least significant bits */ +} + + /** * Emit code to call a SPU function. * Used to implement instructions like SIN/COS/POW/TEX/etc. @@ -1100,77 +1277,56 @@ emit_function_call(struct codegen *gen, const struct tgsi_full_instruction *inst, char *funcname, uint num_args) { - const struct cell_spu_function_info *funcs = &gen->cell->spu_functions; + const uint addr = lookup_function(gen->cell, funcname); char comment[100]; - uint addr; int ch; - /* XXX temporary value */ - const int frameSize = 64; /* stack frame (activation record) size */ - assert(num_args <= 3); - /* lookup function address */ - { - uint i; - addr = 0; - for (i = 0; i < funcs->num; i++) { - if (strcmp(funcs->names[i], funcname) == 0) { - addr = funcs->addrs[i]; - } - } - assert(addr && "spu function not found"); - } - - addr /= 4; /* discard 2 least significant bits */ - snprintf(comment, sizeof(comment), "CALL %s:", funcname); spe_comment(gen->f, -4, comment); for (ch = 0; ch < 4; ch++) { if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { - int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); - int s_regs[3]; - uint a; + int s_regs[3], d_reg; + ubyte usedRegs[SPE_NUM_REGS]; + uint a, i, numUsed; + for (a = 0; a < num_args; a++) { s_regs[a] = get_src_reg(gen, ch, &inst->FullSrcRegisters[a]); } + d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); - /* Basically: - * save registers on stack - * move parameters to registers 3, 4, 5... - * call function - * save return value (reg 3) - * restore registers from stack - */ - - /* XXX hack: load first function param */ - spe_move(gen->f, 3, s_regs[0]); + numUsed = spe_get_registers_used(gen->f, usedRegs); + assert(numUsed < gen->frame_size / 16 - 2); - /* save $lr on stack # stqd $lr,16($sp) */ - spe_stqd(gen->f, SPE_REG_RA, SPE_REG_SP, 16); - /* save stack pointer # stqd $sp,-frameSize($sp) */ - spe_stqd(gen->f, SPE_REG_SP, SPE_REG_SP, -frameSize); - - /* XXX save registers to stack here */ + /* save registers to stack */ + for (i = 0; i < numUsed; i++) { + uint reg = usedRegs[i]; + int offset = 2 + i; + spe_stqd(gen->f, reg, SPE_REG_SP, 16 * offset); + } - /* adjust stack pointer # ai $sp,$sp,-frameSize */ - spe_ai(gen->f, SPE_REG_SP, SPE_REG_SP, -frameSize); + /* setup function arguments */ + for (a = 0; a < num_args; a++) { + spe_move(gen->f, 3 + a, s_regs[a]); + } /* branch to function, save return addr */ spe_brasl(gen->f, SPE_REG_RA, addr); - /* restore stack pointer # ai $sp,$sp,frameSize */ - spe_ai(gen->f, SPE_REG_SP, SPE_REG_SP, frameSize); - - /* XXX restore registers from stack here */ - - /* restore $lr # lqd $lr,16($sp) */ - spe_lqd(gen->f, SPE_REG_RA, SPE_REG_SP, 16); - - /* XXX hack: save function's return value */ + /* save function's return value */ spe_move(gen->f, d_reg, 3); + /* restore registers from stack */ + for (i = 0; i < numUsed; i++) { + uint reg = usedRegs[i]; + if (reg != d_reg) { + int offset = 2 + i; + spe_lqd(gen->f, reg, SPE_REG_SP, 16 * offset); + } + } + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); free_itemps(gen); } @@ -1180,31 +1336,114 @@ emit_function_call(struct codegen *gen, } +static boolean +emit_TXP(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + const uint addr = lookup_function(gen->cell, "spu_txp"); + const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index; + int ch; + int coord_regs[4], d_regs[4]; + + assert(inst->FullSrcRegisters[1].SrcRegister.File == TGSI_FILE_SAMPLER); + + spe_comment(gen->f, -4, "CALL txp:"); + + /* get src/dst reg info */ + for (ch = 0; ch < 4; ch++) { + coord_regs[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + d_regs[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + } + + { + ubyte usedRegs[SPE_NUM_REGS]; + uint i, numUsed; + + numUsed = spe_get_registers_used(gen->f, usedRegs); + assert(numUsed < gen->frame_size / 16 - 2); + + /* save registers to stack */ + for (i = 0; i < numUsed; i++) { + uint reg = usedRegs[i]; + int offset = 2 + i; + spe_stqd(gen->f, reg, SPE_REG_SP, 16 * offset); + } + + /* setup function arguments */ + for (i = 0; i < 4; i++) { + spe_move(gen->f, 3 + i, coord_regs[i]); + } + spe_load_uint(gen->f, 7, unit); /* sampler unit */ + + /* branch to function, save return addr */ + spe_brasl(gen->f, SPE_REG_RA, addr); + + /* save function's return values (four pixel's colors) */ + for (i = 0; i < 4; i++) { + spe_move(gen->f, d_regs[i], 3 + i); + } + + /* restore registers from stack */ + for (i = 0; i < numUsed; i++) { + uint reg = usedRegs[i]; + if (reg != d_regs[0] && + reg != d_regs[1] && + reg != d_regs[2] && + reg != d_regs[3]) { + int offset = 2 + i; + spe_lqd(gen->f, reg, SPE_REG_SP, 16 * offset); + } + } + } + + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + store_dest_reg(gen, d_regs[ch], ch, &inst->FullDstRegisters[0]); + free_itemps(gen); + } + } + + return TRUE; +} + + /** * Emit max. See emit_SGT for comments. */ static boolean emit_MAX(struct codegen *gen, const struct tgsi_full_instruction *inst) { - int ch; + int ch, s0_reg[4], s1_reg[4], d_reg[4], tmp_reg[4]; spe_comment(gen->f, -4, "MAX:"); for (ch = 0; ch < 4; ch++) { if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { - int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); - int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); - int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + s0_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + tmp_reg[ch] = get_itemp(gen); + } + } - /* d = (s1 > s2) ? s1 : s2 */ - spe_fcgt(gen->f, d_reg, s1_reg, s2_reg); - spe_selb(gen->f, d_reg, s2_reg, s1_reg, d_reg); + /* d = (s0 > s1) ? s0 : s1 */ + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + spe_fcgt(gen->f, tmp_reg[ch], s0_reg[ch], s1_reg[ch]); + } + } + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + spe_selb(gen->f, d_reg[ch], s1_reg[ch], s0_reg[ch], tmp_reg[ch]); + } + } - store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); - free_itemps(gen); + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]); } } + free_itemps(gen); return true; } @@ -1214,25 +1453,38 @@ emit_MAX(struct codegen *gen, const struct tgsi_full_instruction *inst) static boolean emit_MIN(struct codegen *gen, const struct tgsi_full_instruction *inst) { - int ch; + int ch, s0_reg[4], s1_reg[4], d_reg[4], tmp_reg[4]; spe_comment(gen->f, -4, "MIN:"); for (ch = 0; ch < 4; ch++) { if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { - int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); - int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); - int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + s0_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + tmp_reg[ch] = get_itemp(gen); + } + } - /* d = (s2 > s1) ? s1 : s2 */ - spe_fcgt(gen->f, d_reg, s2_reg, s1_reg); - spe_selb(gen->f, d_reg, s2_reg, s1_reg, d_reg); + /* d = (s1 > s0) ? s0 : s1 */ + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + spe_fcgt(gen->f, tmp_reg[ch], s1_reg[ch], s0_reg[ch]); + } + } + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + spe_selb(gen->f, d_reg[ch], s1_reg[ch], s0_reg[ch], tmp_reg[ch]); + } + } - store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); - free_itemps(gen); + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]); } } + free_itemps(gen); return true; } @@ -1339,8 +1591,7 @@ static boolean emit_END(struct codegen *gen) { spe_comment(gen->f, -4, "END:"); - /* return from function call */ - spe_bi(gen->f, SPE_REG_RA, 0, 0); + emit_epilogue(gen); return true; } @@ -1413,6 +1664,18 @@ emit_instruction(struct codegen *gen, return emit_function_call(gen, inst, "spu_sin", 1); case TGSI_OPCODE_POW: return emit_function_call(gen, inst, "spu_pow", 2); + case TGSI_OPCODE_EXPBASE2: + return emit_function_call(gen, inst, "spu_exp2", 1); + case TGSI_OPCODE_LOGBASE2: + return emit_function_call(gen, inst, "spu_log2", 1); + case TGSI_OPCODE_TEX: + /* fall-through for now */ + case TGSI_OPCODE_TXD: + /* fall-through for now */ + case TGSI_OPCODE_TXB: + /* fall-through for now */ + case TGSI_OPCODE_TXP: + return emit_TXP(gen, inst); case TGSI_OPCODE_IF: return emit_IF(gen, inst); @@ -1456,16 +1719,23 @@ emit_immediate(struct codegen *gen, const struct tgsi_full_immediate *immed) for (ch = 0; ch < 4; ch++) { float val = immed->u.ImmediateFloat32[ch].Float; - int reg = spe_allocate_available_register(gen->f); - if (reg < 0) - return false; + if (ch > 0 && val == immed->u.ImmediateFloat32[ch - 1].Float) { + /* re-use previous register */ + gen->imm_regs[gen->num_imm][ch] = gen->imm_regs[gen->num_imm][ch - 1]; + } + else { + int reg = spe_allocate_available_register(gen->f); + + if (reg < 0) + return false; - /* update immediate map */ - gen->imm_regs[gen->num_imm][ch] = reg; + /* update immediate map */ + gen->imm_regs[gen->num_imm][ch] = reg; - /* emit initializer instruction */ - spe_load_float(gen->f, reg, val); + /* emit initializer instruction */ + spe_load_float(gen->f, reg, val); + } } gen->num_imm++; @@ -1488,12 +1758,6 @@ emit_declaration(struct cell_context *cell, switch (decl->Declaration.File) { case TGSI_FILE_TEMPORARY: - if (cell->debug_flags & CELL_DEBUG_ASM) { - printf("Declare temp reg %d .. %d\n", - decl->DeclarationRange.First, - decl->DeclarationRange.Last); - } - for (i = decl->DeclarationRange.First; i <= decl->DeclarationRange.Last; i++) { @@ -1508,12 +1772,12 @@ emit_declaration(struct cell_context *cell, * to SPU memory. someday... */ - if (cell->debug_flags & CELL_DEBUG_ASM) { - printf(" SPE regs: %d %d %d %d\n", - gen->temp_regs[i][0], - gen->temp_regs[i][1], - gen->temp_regs[i][2], - gen->temp_regs[i][3]); + { + char buf[100]; + sprintf(buf, "TGSI temp[%d] maps to SPU regs [$%d $%d $%d $%d]", i, + gen->temp_regs[i][0], gen->temp_regs[i][1], + gen->temp_regs[i][2], gen->temp_regs[i][3]); + spe_comment(gen->f, -4, buf); } } break; @@ -1525,6 +1789,7 @@ emit_declaration(struct cell_context *cell, } + /** * Translate TGSI shader code to SPE instructions. This is done when * the state tracker gives us a new shader (via pipe->create_fs_state()). @@ -1564,12 +1829,14 @@ cell_gen_fragment_program(struct cell_context *cell, tgsi_parse_init(&parse, tokens); + emit_prologue(&gen); + while (!tgsi_parse_end_of_tokens(&parse) && !gen.error) { tgsi_parse_token(&parse); switch (parse.FullToken.Token.Type) { case TGSI_TOKEN_TYPE_IMMEDIATE: - if (!emit_immediate(&gen, &parse.FullToken.FullImmediate)) + if (!emit_immediate(&gen, &parse.FullToken.FullImmediate)) gen.error = true; break; @@ -1588,7 +1855,6 @@ cell_gen_fragment_program(struct cell_context *cell, } } - if (gen.error) { /* terminate the SPE code */ return emit_END(&gen); |