/************************************************************************** * * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. * All Rights Reserved. * Copyright 2009 VMware, Inc. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the * "Software"), to deal in the Software without restriction, including * without limitation the rights to use, copy, modify, merge, publish, * distribute, sub license, and/or sell copies of the Software, and to * permit persons to whom the Software is furnished to do so, subject to * the following conditions: * * The above copyright notice and this permission notice (including the * next paragraph) shall be included in all copies or substantial portions * of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. * **************************************************************************/ /** * Generate SPU fragment program/shader code. * * Note that we generate SOA-style code here. So each TGSI instruction * operates on four pixels (and is translated into four SPU instructions, * generally speaking). 
* * \author Brian Paul */ #include #include "pipe/p_defines.h" #include "pipe/p_state.h" #include "pipe/p_shader_tokens.h" #include "tgsi/tgsi_parse.h" #include "tgsi/tgsi_util.h" #include "tgsi/tgsi_exec.h" #include "tgsi/tgsi_dump.h" #include "rtasm/rtasm_ppc_spe.h" #include "util/u_memory.h" #include "cell_context.h" #include "cell_gen_fp.h" #define MAX_TEMPS 16 #define MAX_IMMED 8 #define CHAN_X 0 #define CHAN_Y 1 #define CHAN_Z 2 #define CHAN_W 3 /** * Context needed during code generation. */ struct codegen { struct cell_context *cell; int inputs_reg; /**< 1st function parameter */ int outputs_reg; /**< 2nd function parameter */ int constants_reg; /**< 3rd function parameter */ int temp_regs[MAX_TEMPS][4]; /**< maps TGSI temps to SPE registers */ int imm_regs[MAX_IMMED][4]; /**< maps TGSI immediates to SPE registers */ int num_imm; /**< number of immediates */ int one_reg; /**< register containing {1.0, 1.0, 1.0, 1.0} */ int addr_reg; /**< address register, integer values */ /** Per-instruction temps / intermediate temps */ int num_itemps; int itemps[12]; /** Current IF/ELSE/ENDIF nesting level */ int if_nesting; /** Current BGNLOOP/ENDLOOP nesting level */ int loop_nesting; /** Location of start of current loop */ int loop_start; /** Index of if/conditional mask register */ int cond_mask_reg; /** Index of loop mask register */ int loop_mask_reg; /** Index of master execution mask register */ int exec_mask_reg; /** KIL mask: indicates which fragments have been killed */ int kill_mask_reg; int frame_size; /**< Stack frame size, in words */ struct spe_function *f; boolean error; }; /** * Allocate an intermediate temporary register. */ static int get_itemp(struct codegen *gen) { int t = spe_allocate_available_register(gen->f); assert(gen->num_itemps < Elements(gen->itemps)); gen->itemps[gen->num_itemps++] = t; return t; } /** * Free all intermediate temporary registers. To be called after each * instruction has been emitted. 
*/ static void free_itemps(struct codegen *gen) { int i; for (i = 0; i < gen->num_itemps; i++) { spe_release_register(gen->f, gen->itemps[i]); } gen->num_itemps = 0; } /** * Return index of an SPE register containing {1.0, 1.0, 1.0, 1.0}. * The register is allocated and initialized upon the first call. */ static int get_const_one_reg(struct codegen *gen) { if (gen->one_reg <= 0) { gen->one_reg = spe_allocate_available_register(gen->f); spe_indent(gen->f, 4); spe_comment(gen->f, -4, "init constant reg = 1.0:"); /* one = {1.0, 1.0, 1.0, 1.0} */ spe_load_float(gen->f, gen->one_reg, 1.0f); spe_indent(gen->f, -4); } return gen->one_reg; } /** * Return index of the address register. * Used for indirect register loads/stores. */ static int get_address_reg(struct codegen *gen) { if (gen->addr_reg <= 0) { gen->addr_reg = spe_allocate_available_register(gen->f); spe_indent(gen->f, 4); spe_comment(gen->f, -4, "init address reg = 0:"); /* init addr = {0, 0, 0, 0} */ spe_zero(gen->f, gen->addr_reg); spe_indent(gen->f, -4); } return gen->addr_reg; } /** * Return index of the master execution mask. * The register is allocated an initialized upon the first call. * * The master execution mask controls which pixels in a quad are * modified, according to surrounding conditionals, loops, etc. 
*/ static int get_exec_mask_reg(struct codegen *gen) { if (gen->exec_mask_reg <= 0) { gen->exec_mask_reg = spe_allocate_available_register(gen->f); /* XXX this may not be needed */ spe_comment(gen->f, 0*-4, "initialize master execution mask = ~0"); spe_load_int(gen->f, gen->exec_mask_reg, ~0); } return gen->exec_mask_reg; } /** Return index of the conditional (if/else) execution mask register */ static int get_cond_mask_reg(struct codegen *gen) { if (gen->cond_mask_reg <= 0) { gen->cond_mask_reg = spe_allocate_available_register(gen->f); } return gen->cond_mask_reg; } /** Return index of the loop execution mask register */ static int get_loop_mask_reg(struct codegen *gen) { if (gen->loop_mask_reg <= 0) { gen->loop_mask_reg = spe_allocate_available_register(gen->f); } return gen->loop_mask_reg; } static boolean is_register_src(struct codegen *gen, int channel, const struct tgsi_full_src_register *src) { int swizzle = tgsi_util_get_full_src_register_swizzle(src, channel); int sign_op = tgsi_util_get_full_src_register_sign_mode(src, channel); if (swizzle > TGSI_SWIZZLE_W || sign_op != TGSI_UTIL_SIGN_KEEP) { return FALSE; } if (src->Register.File == TGSI_FILE_TEMPORARY || src->Register.File == TGSI_FILE_IMMEDIATE) { return TRUE; } return FALSE; } static boolean is_memory_dst(struct codegen *gen, int channel, const struct tgsi_full_dst_register *dst) { if (dst->Register.File == TGSI_FILE_OUTPUT) { return TRUE; } else { return FALSE; } } /** * Return the index of the SPU temporary containing the named TGSI * source register. If the TGSI register is a TGSI_FILE_TEMPORARY we * just return the corresponding SPE register. If the TGIS register * is TGSI_FILE_INPUT/CONSTANT/IMMEDIATE we allocate a new SPE register * and emit an SPE load instruction. 
*/ static int get_src_reg(struct codegen *gen, int channel, const struct tgsi_full_src_register *src) { int reg = -1; int swizzle = tgsi_util_get_full_src_register_swizzle(src, channel); boolean reg_is_itemp = FALSE; uint sign_op; assert(swizzle >= TGSI_SWIZZLE_X); assert(swizzle <= TGSI_SWIZZLE_W); { int index = src->Register.Index; assert(swizzle < 4); if (src->Register.Indirect) { /* XXX unfinished */ } switch (src->Register.File) { case TGSI_FILE_TEMPORARY: reg = gen->temp_regs[index][swizzle]; break; case TGSI_FILE_INPUT: { /* offset is measured in quadwords, not bytes */ int offset = index * 4 + swizzle; reg = get_itemp(gen); reg_is_itemp = TRUE; /* Load: reg = memory[(machine_reg) + offset] */ spe_lqd(gen->f, reg, gen->inputs_reg, offset * 16); } break; case TGSI_FILE_IMMEDIATE: reg = gen->imm_regs[index][swizzle]; break; case TGSI_FILE_CONSTANT: { /* offset is measured in quadwords, not bytes */ int offset = index * 4 + swizzle; reg = get_itemp(gen); reg_is_itemp = TRUE; /* Load: reg = memory[(machine_reg) + offset] */ spe_lqd(gen->f, reg, gen->constants_reg, offset * 16); } break; default: assert(0); } } /* * Handle absolute value, negate or set-negative of src register. */ sign_op = tgsi_util_get_full_src_register_sign_mode(src, channel); if (sign_op != TGSI_UTIL_SIGN_KEEP) { /* * All sign ops are done by manipulating bit 31, the IEEE float sign bit. 
*/ const int bit31mask_reg = get_itemp(gen); int result_reg; if (reg_is_itemp) { /* re-use 'reg' for the result */ result_reg = reg; } else { /* alloc a new reg for the result */ result_reg = get_itemp(gen); } /* mask with bit 31 set, the rest cleared */ spe_load_uint(gen->f, bit31mask_reg, (1 << 31)); if (sign_op == TGSI_UTIL_SIGN_CLEAR) { spe_andc(gen->f, result_reg, reg, bit31mask_reg); } else if (sign_op == TGSI_UTIL_SIGN_SET) { spe_and(gen->f, result_reg, reg, bit31mask_reg); } else { assert(sign_op == TGSI_UTIL_SIGN_TOGGLE); spe_xor(gen->f, result_reg, reg, bit31mask_reg); } reg = result_reg; } return reg; } /** * Return the index of an SPE register to use for the given TGSI register. * If the TGSI register is TGSI_FILE_TEMPORARAY, the index of the * corresponding SPE register is returned. If the TGSI register is * TGSI_FILE_OUTPUT we allocate an intermediate temporary register. * See store_dest_reg() below... */ static int get_dst_reg(struct codegen *gen, int channel, const struct tgsi_full_dst_register *dest) { int reg = -1; switch (dest->Register.File) { case TGSI_FILE_TEMPORARY: if (gen->if_nesting > 0 || gen->loop_nesting > 0) reg = get_itemp(gen); else reg = gen->temp_regs[dest->Register.Index][channel]; break; case TGSI_FILE_OUTPUT: reg = get_itemp(gen); break; default: assert(0); } return reg; } /** * When a TGSI instruction is writing to an output register, this * function emits the SPE store instruction to store the value_reg. * \param value_reg the SPE register containing the value to store. * This would have been returned by get_dst_reg(). 
*/ static void store_dest_reg(struct codegen *gen, int value_reg, int channel, const struct tgsi_full_dst_register *dest) { /* * XXX need to implement dst reg clamping/saturation */ #if 0 switch (inst->Instruction.Saturate) { case TGSI_SAT_NONE: break; case TGSI_SAT_ZERO_ONE: break; case TGSI_SAT_MINUS_PLUS_ONE: break; default: assert( 0 ); } #endif switch (dest->Register.File) { case TGSI_FILE_TEMPORARY: if (gen->if_nesting > 0 || gen->loop_nesting > 0) { int d_reg = gen->temp_regs[dest->Register.Index][channel]; int exec_reg = get_exec_mask_reg(gen); /* Mix d with new value according to exec mask: * d[i] = mask_reg[i] ? value_reg : d_reg */ spe_selb(gen->f, d_reg, d_reg, value_reg, exec_reg); } else { /* we're not inside a condition or loop: do nothing special */ } break; case TGSI_FILE_OUTPUT: { /* offset is measured in quadwords, not bytes */ int offset = dest->Register.Index * 4 + channel; if (gen->if_nesting > 0 || gen->loop_nesting > 0) { int exec_reg = get_exec_mask_reg(gen); int curval_reg = get_itemp(gen); /* First read the current value from memory: * Load: curval = memory[(machine_reg) + offset] */ spe_lqd(gen->f, curval_reg, gen->outputs_reg, offset * 16); /* Mix curval with newvalue according to exec mask: * d[i] = mask_reg[i] ? 
value_reg : d_reg */ spe_selb(gen->f, curval_reg, curval_reg, value_reg, exec_reg); /* Store: memory[(machine_reg) + offset] = curval */ spe_stqd(gen->f, curval_reg, gen->outputs_reg, offset * 16); } else { /* Store: memory[(machine_reg) + offset] = reg */ spe_stqd(gen->f, value_reg, gen->outputs_reg, offset * 16); } } break; default: assert(0); } } static void emit_prologue(struct codegen *gen) { gen->frame_size = 1024; /* XXX temporary, should be dynamic */ spe_comment(gen->f, 0, "Function prologue:"); /* save $lr on stack # stqd $lr,16($sp) */ spe_stqd(gen->f, SPE_REG_RA, SPE_REG_SP, 16); if (gen->frame_size >= 512) { /* offset is too large for ai instruction */ int offset_reg = spe_allocate_available_register(gen->f); int sp_reg = spe_allocate_available_register(gen->f); /* offset = -framesize */ spe_load_int(gen->f, offset_reg, -gen->frame_size); /* sp = $sp */ spe_move(gen->f, sp_reg, SPE_REG_SP); /* $sp = $sp + offset_reg */ spe_a(gen->f, SPE_REG_SP, SPE_REG_SP, offset_reg); /* save $sp in stack frame */ spe_stqd(gen->f, sp_reg, SPE_REG_SP, 0); /* clean up */ spe_release_register(gen->f, offset_reg); spe_release_register(gen->f, sp_reg); } else { /* save stack pointer # stqd $sp,-frameSize($sp) */ spe_stqd(gen->f, SPE_REG_SP, SPE_REG_SP, -gen->frame_size); /* adjust stack pointer # ai $sp,$sp,-frameSize */ spe_ai(gen->f, SPE_REG_SP, SPE_REG_SP, -gen->frame_size); } } static void emit_epilogue(struct codegen *gen) { const int return_reg = 3; spe_comment(gen->f, 0, "Function epilogue:"); spe_comment(gen->f, 0, "return the killed mask"); if (gen->kill_mask_reg > 0) { /* shader called KIL, return the "alive" mask */ spe_move(gen->f, return_reg, gen->kill_mask_reg); } else { /* return {0,0,0,0} */ spe_load_uint(gen->f, return_reg, 0); } spe_comment(gen->f, 0, "restore stack and return"); if (gen->frame_size >= 512) { /* offset is too large for ai instruction */ int offset_reg = spe_allocate_available_register(gen->f); /* offset = framesize */ spe_load_int(gen->f, 
offset_reg, gen->frame_size); /* $sp = $sp + offset */ spe_a(gen->f, SPE_REG_SP, SPE_REG_SP, offset_reg); /* clean up */ spe_release_register(gen->f, offset_reg); } else { /* restore stack pointer # ai $sp,$sp,frameSize */ spe_ai(gen->f, SPE_REG_SP, SPE_REG_SP, gen->frame_size); } /* restore $lr # lqd $lr,16($sp) */ spe_lqd(gen->f, SPE_REG_RA, SPE_REG_SP, 16); /* return from function call */ spe_bi(gen->f, SPE_REG_RA, 0, 0); } #define FOR_EACH_ENABLED_CHANNEL(inst, ch) \ for (ch = 0; ch < 4; ch++) \ if (inst->Dst[0].Register.WriteMask & (1 << ch)) static boolean emit_ARL(struct codegen *gen, const struct tgsi_full_instruction *inst) { int ch = 0, src_reg, addr_reg; src_reg = get_src_reg(gen, ch, &inst->Src[0]); addr_reg = get_address_reg(gen); /* convert float to int */ spe_cflts(gen->f, addr_reg, src_reg, 0); free_itemps(gen); return TRUE; } static boolean emit_MOV(struct codegen *gen, const struct tgsi_full_instruction *inst) { int ch, src_reg[4], dst_reg[4]; FOR_EACH_ENABLED_CHANNEL(inst, ch) { src_reg[ch] = get_src_reg(gen, ch, &inst->Src[0]); dst_reg[ch] = get_dst_reg(gen, ch, &inst->Dst[0]); } FOR_EACH_ENABLED_CHANNEL(inst, ch) { if (is_register_src(gen, ch, &inst->Src[0]) && is_memory_dst(gen, ch, &inst->Dst[0])) { /* special-case: register to memory store */ store_dest_reg(gen, src_reg[ch], ch, &inst->Dst[0]); } else { spe_move(gen->f, dst_reg[ch], src_reg[ch]); store_dest_reg(gen, dst_reg[ch], ch, &inst->Dst[0]); } } free_itemps(gen); return TRUE; } /** * Emit binary operation */ static boolean emit_binop(struct codegen *gen, const struct tgsi_full_instruction *inst) { int ch, s1_reg[4], s2_reg[4], d_reg[4]; /* Loop over Red/Green/Blue/Alpha channels, fetch src operands */ FOR_EACH_ENABLED_CHANNEL(inst, ch) { s1_reg[ch] = get_src_reg(gen, ch, &inst->Src[0]); s2_reg[ch] = get_src_reg(gen, ch, &inst->Src[1]); d_reg[ch] = get_dst_reg(gen, ch, &inst->Dst[0]); } /* Loop over Red/Green/Blue/Alpha channels, do the op, store results */ 
FOR_EACH_ENABLED_CHANNEL(inst, ch) { /* Emit actual SPE instruction: d = s1 + s2 */ switch (inst->Instruction.Opcode) { case TGSI_OPCODE_ADD: spe_fa(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]); break; case TGSI_OPCODE_SUB: spe_fs(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]); break; case TGSI_OPCODE_MUL: spe_fm(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]); break; default: ; } } /* Store the result (a no-op for TGSI_FILE_TEMPORARY dests) */ FOR_EACH_ENABLED_CHANNEL(inst, ch) { store_dest_reg(gen, d_reg[ch], ch, &inst->Dst[0]); } /* Free any intermediate temps we allocated */ free_itemps(gen); return TRUE; } /** * Emit multiply add. See emit_ADD for comments. */ static boolean emit_MAD(struct codegen *gen, const struct tgsi_full_instruction *inst) { int ch, s1_reg[4], s2_reg[4], s3_reg[4], d_reg[4]; FOR_EACH_ENABLED_CHANNEL(inst, ch) { s1_reg[ch] = get_src_reg(gen, ch, &inst->Src[0]); s2_reg[ch] = get_src_reg(gen, ch, &inst->Src[1]); s3_reg[ch] = get_src_reg(gen, ch, &inst->Src[2]); d_reg[ch] = get_dst_reg(gen, ch, &inst->Dst[0]); } FOR_EACH_ENABLED_CHANNEL(inst, ch) { spe_fma(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch], s3_reg[ch]); } FOR_EACH_ENABLED_CHANNEL(inst, ch) { store_dest_reg(gen, d_reg[ch], ch, &inst->Dst[0]); } free_itemps(gen); return TRUE; } /** * Emit linear interpolate. See emit_ADD for comments. 
*/ static boolean emit_LRP(struct codegen *gen, const struct tgsi_full_instruction *inst) { int ch, s1_reg[4], s2_reg[4], s3_reg[4], d_reg[4], tmp_reg[4]; /* setup/get src/dst/temp regs */ FOR_EACH_ENABLED_CHANNEL(inst, ch) { s1_reg[ch] = get_src_reg(gen, ch, &inst->Src[0]); s2_reg[ch] = get_src_reg(gen, ch, &inst->Src[1]); s3_reg[ch] = get_src_reg(gen, ch, &inst->Src[2]); d_reg[ch] = get_dst_reg(gen, ch, &inst->Dst[0]); tmp_reg[ch] = get_itemp(gen); } /* d = s3 + s1(s2 - s3) */ /* do all subtracts, then all fma, then all stores to better pipeline */ FOR_EACH_ENABLED_CHANNEL(inst, ch) { spe_fs(gen->f, tmp_reg[ch], s2_reg[ch], s3_reg[ch]); } FOR_EACH_ENABLED_CHANNEL(inst, ch) { spe_fma(gen->f, d_reg[ch], tmp_reg[ch], s1_reg[ch], s3_reg[ch]); } FOR_EACH_ENABLED_CHANNEL(inst, ch) { store_dest_reg(gen, d_reg[ch], ch, &inst->Dst[0]); } free_itemps(gen); return TRUE; } /** * Emit reciprocal or recip sqrt. */ static boolean emit_RCP_RSQ(struct codegen *gen, const struct tgsi_full_instruction *inst) { int ch, s1_reg[4], d_reg[4], tmp_reg[4]; FOR_EACH_ENABLED_CHANNEL(inst, ch) { s1_reg[ch] = get_src_reg(gen, ch, &inst->Src[0]); d_reg[ch] = get_dst_reg(gen, ch, &inst->Dst[0]); tmp_reg[ch] = get_itemp(gen); } FOR_EACH_ENABLED_CHANNEL(inst, ch) { if (inst->Instruction.Opcode == TGSI_OPCODE_RCP) { /* tmp = 1/s1 */ spe_frest(gen->f, tmp_reg[ch], s1_reg[ch]); } else { /* tmp = 1/sqrt(s1) */ spe_frsqest(gen->f, tmp_reg[ch], s1_reg[ch]); } } FOR_EACH_ENABLED_CHANNEL(inst, ch) { /* d = float_interp(s1, tmp) */ spe_fi(gen->f, d_reg[ch], s1_reg[ch], tmp_reg[ch]); } FOR_EACH_ENABLED_CHANNEL(inst, ch) { store_dest_reg(gen, d_reg[ch], ch, &inst->Dst[0]); } free_itemps(gen); return TRUE; } /** * Emit absolute value. See emit_ADD for comments. 
*/ static boolean emit_ABS(struct codegen *gen, const struct tgsi_full_instruction *inst) { int ch, s1_reg[4], d_reg[4]; const int bit31mask_reg = get_itemp(gen); /* mask with bit 31 set, the rest cleared */ spe_load_uint(gen->f, bit31mask_reg, (1 << 31)); FOR_EACH_ENABLED_CHANNEL(inst, ch) { s1_reg[ch] = get_src_reg(gen, ch, &inst->Src[0]); d_reg[ch] = get_dst_reg(gen, ch, &inst->Dst[0]); } /* d = sign bit cleared in s1 */ FOR_EACH_ENABLED_CHANNEL(inst, ch) { spe_andc(gen->f, d_reg[ch], s1_reg[ch], bit31mask_reg); } FOR_EACH_ENABLED_CHANNEL(inst, ch) { store_dest_reg(gen, d_reg[ch], ch, &inst->Dst[0]); } free_itemps(gen); return TRUE; } /** * Emit 3 component dot product. See emit_ADD for comments. */ static boolean emit_DP3(struct codegen *gen, const struct tgsi_full_instruction *inst) { int ch; int s1x_reg, s1y_reg, s1z_reg; int s2x_reg, s2y_reg, s2z_reg; int t0_reg = get_itemp(gen), t1_reg = get_itemp(gen); s1x_reg = get_src_reg(gen, CHAN_X, &inst->Src[0]); s2x_reg = get_src_reg(gen, CHAN_X, &inst->Src[1]); s1y_reg = get_src_reg(gen, CHAN_Y, &inst->Src[0]); s2y_reg = get_src_reg(gen, CHAN_Y, &inst->Src[1]); s1z_reg = get_src_reg(gen, CHAN_Z, &inst->Src[0]); s2z_reg = get_src_reg(gen, CHAN_Z, &inst->Src[1]); /* t0 = x0 * x1 */ spe_fm(gen->f, t0_reg, s1x_reg, s2x_reg); /* t1 = y0 * y1 */ spe_fm(gen->f, t1_reg, s1y_reg, s2y_reg); /* t0 = z0 * z1 + t0 */ spe_fma(gen->f, t0_reg, s1z_reg, s2z_reg, t0_reg); /* t0 = t0 + t1 */ spe_fa(gen->f, t0_reg, t0_reg, t1_reg); FOR_EACH_ENABLED_CHANNEL(inst, ch) { int d_reg = get_dst_reg(gen, ch, &inst->Dst[0]); spe_move(gen->f, d_reg, t0_reg); store_dest_reg(gen, d_reg, ch, &inst->Dst[0]); } free_itemps(gen); return TRUE; } /** * Emit 4 component dot product. See emit_ADD for comments. 
*/
static boolean
emit_DP4(struct codegen *gen, const struct tgsi_full_instruction *inst)
{
   int ch;
   int a_reg[4], b_reg[4];   /* X/Y/Z/W of Src[0] and Src[1] */
   int t0_reg = get_itemp(gen), t1_reg = get_itemp(gen);

   /* fetch the operands pairwise: x0, x1, y0, y1, z0, z1, w0, w1 */
   for (ch = 0; ch < 4; ch++) {
      a_reg[ch] = get_src_reg(gen, ch, &inst->Src[0]);
      b_reg[ch] = get_src_reg(gen, ch, &inst->Src[1]);
   }

   /* t0 = x0 * x1 */
   spe_fm(gen->f, t0_reg, a_reg[0], b_reg[0]);

   /* t1 = y0 * y1 */
   spe_fm(gen->f, t1_reg, a_reg[1], b_reg[1]);

   /* t0 = z0 * z1 + t0 */
   spe_fma(gen->f, t0_reg, a_reg[2], b_reg[2], t0_reg);

   /* t1 = w0 * w1 + t1 */
   spe_fma(gen->f, t1_reg, a_reg[3], b_reg[3], t1_reg);

   /* t0 = t0 + t1 */
   spe_fa(gen->f, t0_reg, t0_reg, t1_reg);

   /* replicate the scalar result to all enabled channels */
   FOR_EACH_ENABLED_CHANNEL(inst, ch) {
      const int d_reg = get_dst_reg(gen, ch, &inst->Dst[0]);
      spe_move(gen->f, d_reg, t0_reg);
      store_dest_reg(gen, d_reg, ch, &inst->Dst[0]);
   }

   free_itemps(gen);

   return TRUE;
}


/**
 * Emit homogeneous dot product:  x0*x1 + y0*y1 + z0*z1 + w1.
 * The scalar result is replicated to every write-enabled channel.
*/
static boolean
emit_DPH(struct codegen *gen, const struct tgsi_full_instruction *inst)
{
   /* XXX rewrite this function to look more like DP3/DP4 */
   int ch;
   int s1_reg = get_src_reg(gen, CHAN_X, &inst->Src[0]);
   int s2_reg = get_src_reg(gen, CHAN_X, &inst->Src[1]);
   int tmp_reg = get_itemp(gen);

   /* t = x0 * x1 */
   spe_fm(gen->f, tmp_reg, s1_reg, s2_reg);

   s1_reg = get_src_reg(gen, CHAN_Y, &inst->Src[0]);
   s2_reg = get_src_reg(gen, CHAN_Y, &inst->Src[1]);
   /* t = y0 * y1 + t */
   spe_fma(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg);

   s1_reg = get_src_reg(gen, CHAN_Z, &inst->Src[0]);
   s2_reg = get_src_reg(gen, CHAN_Z, &inst->Src[1]);
   /* t = z0 * z1 + t */
   spe_fma(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg);

   /* the W term uses only the second operand */
   s2_reg = get_src_reg(gen, CHAN_W, &inst->Src[1]);
   /* t = w1 + t */
   spe_fa(gen->f, tmp_reg, s2_reg, tmp_reg);

   /* replicate the scalar result to all enabled channels */
   FOR_EACH_ENABLED_CHANNEL(inst, ch) {
      int d_reg = get_dst_reg(gen, ch, &inst->Dst[0]);
      spe_move(gen->f, d_reg, tmp_reg);
      /* store the register we just filled, as DP3/DP4 do */
      store_dest_reg(gen, d_reg, ch, &inst->Dst[0]);
   }

   free_itemps(gen);

   return TRUE;
}


/**
 * Emit 3-component vector normalize.
*/ static boolean emit_NRM3(struct codegen *gen, const struct tgsi_full_instruction *inst) { int ch; int src_reg[3]; int t0_reg = get_itemp(gen), t1_reg = get_itemp(gen); src_reg[0] = get_src_reg(gen, CHAN_X, &inst->Src[0]); src_reg[1] = get_src_reg(gen, CHAN_Y, &inst->Src[0]); src_reg[2] = get_src_reg(gen, CHAN_Z, &inst->Src[0]); /* t0 = x * x */ spe_fm(gen->f, t0_reg, src_reg[0], src_reg[0]); /* t1 = y * y */ spe_fm(gen->f, t1_reg, src_reg[1], src_reg[1]); /* t0 = z * z + t0 */ spe_fma(gen->f, t0_reg, src_reg[2], src_reg[2], t0_reg); /* t0 = t0 + t1 */ spe_fa(gen->f, t0_reg, t0_reg, t1_reg); /* t1 = 1.0 / sqrt(t0) */ spe_frsqest(gen->f, t1_reg, t0_reg); spe_fi(gen->f, t1_reg, t0_reg, t1_reg); FOR_EACH_ENABLED_CHANNEL(inst, ch) { int d_reg = get_dst_reg(gen, ch, &inst->Dst[0]); /* dst = src[ch] * t1 */ spe_fm(gen->f, d_reg, src_reg[ch], t1_reg); store_dest_reg(gen, d_reg, ch, &inst->Dst[0]); } free_itemps(gen); return TRUE; } /** * Emit cross product. See emit_ADD for comments. */ static boolean emit_XPD(struct codegen *gen, const struct tgsi_full_instruction *inst) { int s1_reg = get_src_reg(gen, CHAN_Z, &inst->Src[0]); int s2_reg = get_src_reg(gen, CHAN_Y, &inst->Src[1]); int tmp_reg = get_itemp(gen); /* t = z0 * y1 */ spe_fm(gen->f, tmp_reg, s1_reg, s2_reg); s1_reg = get_src_reg(gen, CHAN_Y, &inst->Src[0]); s2_reg = get_src_reg(gen, CHAN_Z, &inst->Src[1]); /* t = y0 * z1 - t */ spe_fms(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg); if (inst->Dst[0].Register.WriteMask & (1 << CHAN_X)) { store_dest_reg(gen, tmp_reg, CHAN_X, &inst->Dst[0]); } s1_reg = get_src_reg(gen, CHAN_X, &inst->Src[0]); s2_reg = get_src_reg(gen, CHAN_Z, &inst->Src[1]); /* t = x0 * z1 */ spe_fm(gen->f, tmp_reg, s1_reg, s2_reg); s1_reg = get_src_reg(gen, CHAN_Z, &inst->Src[0]); s2_reg = get_src_reg(gen, CHAN_X, &inst->Src[1]); /* t = z0 * x1 - t */ spe_fms(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg); if (inst->Dst[0].Register.WriteMask & (1 << CHAN_Y)) { store_dest_reg(gen, tmp_reg, CHAN_Y, 
&inst->Dst[0]); } s1_reg = get_src_reg(gen, CHAN_Y, &inst->Src[0]); s2_reg = get_src_reg(gen, CHAN_X, &inst->Src[1]); /* t = y0 * x1 */ spe_fm(gen->f, tmp_reg, s1_reg, s2_reg); s1_reg = get_src_reg(gen, CHAN_X, &inst->Src[0]); s2_reg = get_src_reg(gen, CHAN_Y, &inst->Src[1]); /* t = x0 * y1 - t */ spe_fms(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg); if (inst->Dst[0].Register.WriteMask & (1 << CHAN_Z)) { store_dest_reg(gen, tmp_reg, CHAN_Z, &inst->Dst[0]); } free_itemps(gen); return TRUE; } /** * Emit inequality instruction. * Note that the SPE fcgt instruction produces 0x0 and 0xffffffff as * the result but OpenGL/TGSI needs 0.0 and 1.0 results. * We can easily convert 0x0/0xffffffff to 0.0/1.0 with a bitwise AND. */ static boolean emit_inequality(struct codegen *gen, const struct tgsi_full_instruction *inst) { int ch, s1_reg[4], s2_reg[4], d_reg[4], one_reg; boolean complement = FALSE; one_reg = get_const_one_reg(gen); FOR_EACH_ENABLED_CHANNEL(inst, ch) { s1_reg[ch] = get_src_reg(gen, ch, &inst->Src[0]); s2_reg[ch] = get_src_reg(gen, ch, &inst->Src[1]); d_reg[ch] = get_dst_reg(gen, ch, &inst->Dst[0]); } FOR_EACH_ENABLED_CHANNEL(inst, ch) { switch (inst->Instruction.Opcode) { case TGSI_OPCODE_SGT: spe_fcgt(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]); break; case TGSI_OPCODE_SLT: spe_fcgt(gen->f, d_reg[ch], s2_reg[ch], s1_reg[ch]); break; case TGSI_OPCODE_SGE: spe_fcgt(gen->f, d_reg[ch], s2_reg[ch], s1_reg[ch]); complement = TRUE; break; case TGSI_OPCODE_SLE: spe_fcgt(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]); complement = TRUE; break; case TGSI_OPCODE_SEQ: spe_fceq(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]); break; case TGSI_OPCODE_SNE: spe_fceq(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]); complement = TRUE; break; default: assert(0); } } /* convert d from 0x0/0xffffffff to 0.0/1.0 */ FOR_EACH_ENABLED_CHANNEL(inst, ch) { /* d = d & one_reg */ if (complement) spe_andc(gen->f, d_reg[ch], one_reg, d_reg[ch]); else spe_and(gen->f, d_reg[ch], one_reg, d_reg[ch]); } 
FOR_EACH_ENABLED_CHANNEL(inst, ch) { store_dest_reg(gen, d_reg[ch], ch, &inst->Dst[0]); } free_itemps(gen); return TRUE; } /** * Emit compare. */ static boolean emit_CMP(struct codegen *gen, const struct tgsi_full_instruction *inst) { int ch; FOR_EACH_ENABLED_CHANNEL(inst, ch) { int s1_reg = get_src_reg(gen, ch, &inst->Src[0]); int s2_reg = get_src_reg(gen, ch, &inst->Src[1]); int s3_reg = get_src_reg(gen, ch, &inst->Src[2]); int d_reg = get_dst_reg(gen, ch, &inst->Dst[0]); int zero_reg = get_itemp(gen); spe_zero(gen->f, zero_reg); /* d = (s1 < 0) ? s2 : s3 */ spe_fcgt(gen->f, d_reg, zero_reg, s1_reg); spe_selb(gen->f, d_reg, s3_reg, s2_reg, d_reg); store_dest_reg(gen, d_reg, ch, &inst->Dst[0]); free_itemps(gen); } return TRUE; } /** * Emit trunc. * Convert float to signed int * Convert signed int to float */ static boolean emit_TRUNC(struct codegen *gen, const struct tgsi_full_instruction *inst) { int ch, s1_reg[4], d_reg[4]; FOR_EACH_ENABLED_CHANNEL(inst, ch) { s1_reg[ch] = get_src_reg(gen, ch, &inst->Src[0]); d_reg[ch] = get_dst_reg(gen, ch, &inst->Dst[0]); } /* Convert float to int */ FOR_EACH_ENABLED_CHANNEL(inst, ch) { spe_cflts(gen->f, d_reg[ch], s1_reg[ch], 0); } /* Convert int to float */ FOR_EACH_ENABLED_CHANNEL(inst, ch) { spe_csflt(gen->f, d_reg[ch], d_reg[ch], 0); } FOR_EACH_ENABLED_CHANNEL(inst, ch) { store_dest_reg(gen, d_reg[ch], ch, &inst->Dst[0]); } free_itemps(gen); return TRUE; } /** * Emit floor. 
* If negative int subtract one * Convert float to signed int * Convert signed int to float */ static boolean emit_FLR(struct codegen *gen, const struct tgsi_full_instruction *inst) { int ch, s1_reg[4], d_reg[4], tmp_reg[4], zero_reg, one_reg; zero_reg = get_itemp(gen); spe_zero(gen->f, zero_reg); one_reg = get_const_one_reg(gen); FOR_EACH_ENABLED_CHANNEL(inst, ch) { s1_reg[ch] = get_src_reg(gen, ch, &inst->Src[0]); d_reg[ch] = get_dst_reg(gen, ch, &inst->Dst[0]); tmp_reg[ch] = get_itemp(gen); } /* If negative, subtract 1.0 */ FOR_EACH_ENABLED_CHANNEL(inst, ch) { spe_fcgt(gen->f, tmp_reg[ch], zero_reg, s1_reg[ch]); } FOR_EACH_ENABLED_CHANNEL(inst, ch) { spe_selb(gen->f, tmp_reg[ch], zero_reg, one_reg, tmp_reg[ch]); } FOR_EACH_ENABLED_CHANNEL(inst, ch) { spe_fs(gen->f, tmp_reg[ch], s1_reg[ch], tmp_reg[ch]); } /* Convert float to int */ FOR_EACH_ENABLED_CHANNEL(inst, ch) { spe_cflts(gen->f, tmp_reg[ch], tmp_reg[ch], 0); } /* Convert int to float */ FOR_EACH_ENABLED_CHANNEL(inst, ch) { spe_csflt(gen->f, d_reg[ch], tmp_reg[ch], 0); } FOR_EACH_ENABLED_CHANNEL(inst, ch) { store_dest_reg(gen, d_reg[ch], ch, &inst->Dst[0]); } free_itemps(gen); return TRUE; } /** * Compute frac = Input - FLR(Input) */ static boolean emit_FRC(struct codegen *gen, const struct tgsi_full_instruction *inst) { int ch, s1_reg[4], d_reg[4], tmp_reg[4], zero_reg, one_reg; zero_reg = get_itemp(gen); spe_zero(gen->f, zero_reg); one_reg = get_const_one_reg(gen); FOR_EACH_ENABLED_CHANNEL(inst, ch) { s1_reg[ch] = get_src_reg(gen, ch, &inst->Src[0]); d_reg[ch] = get_dst_reg(gen, ch, &inst->Dst[0]); tmp_reg[ch] = get_itemp(gen); } /* If negative, subtract 1.0 */ FOR_EACH_ENABLED_CHANNEL(inst, ch) { spe_fcgt(gen->f, tmp_reg[ch], zero_reg, s1_reg[ch]); } FOR_EACH_ENABLED_CHANNEL(inst, ch) { spe_selb(gen->f, tmp_reg[ch], zero_reg, one_reg, tmp_reg[ch]); } FOR_EACH_ENABLED_CHANNEL(inst, ch) { spe_fs(gen->f, tmp_reg[ch], s1_reg[ch], tmp_reg[ch]); } /* Convert float to int */ FOR_EACH_ENABLED_CHANNEL(inst, ch) { 
spe_cflts(gen->f, tmp_reg[ch], tmp_reg[ch], 0);
   }

   /* Convert int to float */
   FOR_EACH_ENABLED_CHANNEL(inst, ch) {
      spe_csflt(gen->f, tmp_reg[ch], tmp_reg[ch], 0);
   }

   /* d = s1 - FLR(s1) */
   FOR_EACH_ENABLED_CHANNEL(inst, ch) {
      spe_fs(gen->f, d_reg[ch], s1_reg[ch], tmp_reg[ch]);
   }

   /* store result */
   FOR_EACH_ENABLED_CHANNEL(inst, ch) {
      store_dest_reg(gen, d_reg[ch], ch, &inst->Dst[0]);
   }

   free_itemps(gen);
   return TRUE;
}


#if 0
/**
 * Debug helper (compiled out): print the index, name and address of
 * every entry in cell->spu_functions.
 */
static void
print_functions(struct cell_context *cell)
{
   struct cell_spu_function_info *funcs = &cell->spu_functions;
   uint i;
   for (i = 0; i < funcs->num; i++) {
      printf("SPU func %u: %s at %u\n",
             i, funcs->names[i], funcs->addrs[i]);
   }
}
#endif


/**
 * Look up the address of a named SPU function in cell->spu_functions.
 *
 * \param cell      context holding the function table
 * \param funcname  name to match with strcmp()
 * \return the recorded address divided by 4
 *
 * The loop does not stop at the first match, so if a name appears more
 * than once the last entry wins.  A missing function is only caught by
 * the assert; with asserts disabled this returns 0.
 */
static uint
lookup_function(struct cell_context *cell, const char *funcname)
{
   const struct cell_spu_function_info *funcs = &cell->spu_functions;
   uint i, addr = 0;
   for (i = 0; i < funcs->num; i++) {
      if (strcmp(funcs->names[i], funcname) == 0) {
         addr = funcs->addrs[i];
      }
   }
   assert(addr && "spu function not found");
   return addr / 4;  /* discard 2 least significant bits */
}


/**
 * Emit code to call a SPU function.
 * Used to implement instructions like SIN/COS/POW/TEX/etc.
 * If scalar, only the X components of the src regs are used, and the
 * result is replicated across the dest register's XYZW components.
 *
 * \param gen       code generation context
 * \param inst      the TGSI instruction being translated
 * \param funcname  name of the SPU function (resolved with lookup_function)
 * \param num_args  number of source operands passed (at most 3)
 * \param scalar    if TRUE, call once and replicate the result
 * \return always TRUE
 *
 * Arguments are moved into registers 3, 4, 5 and the result is read back
 * from register 3.  All registers reported by spe_get_registers_used()
 * are spilled to the stack around the call.
 *
 * NOTE(review): funcname is only read here and callers pass string
 * literals; it could probably be declared const char * -- confirm.
 */
static boolean
emit_function_call(struct codegen *gen,
                   const struct tgsi_full_instruction *inst,
                   char *funcname, uint num_args, boolean scalar)
{
   const uint addr = lookup_function(gen->cell, funcname);
   char comment[100];
   int s_regs[3];
   int func_called = FALSE;
   uint a, ch;
   int retval_reg = -1;

   assert(num_args <= 3);

   snprintf(comment, sizeof(comment), "CALL %s:", funcname);
   spe_comment(gen->f, -4, comment);

   if (scalar) {
      /* scalar case: fetch the X channel of each source once, up front */
      for (a = 0; a < num_args; a++) {
         s_regs[a] = get_src_reg(gen, CHAN_X, &inst->Src[a]);
      }
      /* we'll call the function, put the return value in this register,
       * then replicate it across all write-enabled components in d_reg.
       */
      retval_reg = spe_allocate_available_register(gen->f);
   }

   FOR_EACH_ENABLED_CHANNEL(inst, ch) {
      int d_reg;
      ubyte usedRegs[SPE_NUM_REGS];
      uint i, numUsed;

      if (!scalar) {
         /* per-channel case: fetch this channel of each source */
         for (a = 0; a < num_args; a++) {
            s_regs[a] = get_src_reg(gen, ch, &inst->Src[a]);
         }
      }

      d_reg = get_dst_reg(gen, ch, &inst->Dst[0]);

      if (!scalar || !func_called) {
         /* for a scalar function, we'll really only call the function once */

         numUsed = spe_get_registers_used(gen->f, usedRegs);
         /* NOTE(review): mixes unsigned numUsed with signed frame_size;
          * presumably frame_size is always large enough -- confirm.
          */
         assert(numUsed < gen->frame_size / 16 - 2);

         /* save registers to stack (slots start at quadword 2) */
         for (i = 0; i < numUsed; i++) {
            uint reg = usedRegs[i];
            int offset = 2 + i;
            spe_stqd(gen->f, reg, SPE_REG_SP, 16 * offset);
         }

         /* setup function arguments */
         for (a = 0; a < num_args; a++) {
            spe_move(gen->f, 3 + a, s_regs[a]);
         }

         /* branch to function, save return addr */
         spe_brasl(gen->f, SPE_REG_RA, addr);

         /* save function's return value */
         if (scalar)
            spe_move(gen->f, retval_reg, 3);
         else
            spe_move(gen->f, d_reg, 3);

         /* restore registers from stack, skipping the registers that now
          * hold the call's result so they are not overwritten
          */
         for (i = 0; i < numUsed; i++) {
            uint reg = usedRegs[i];
            if (reg != d_reg && reg != retval_reg) {
               int offset = 2 + i;
               spe_lqd(gen->f, reg, SPE_REG_SP, 16 * offset);
            }
         }

         func_called = TRUE;
      }

      if (scalar) {
         /* replicate the single result into this channel's dest reg */
         spe_move(gen->f, d_reg, retval_reg);
      }

      store_dest_reg(gen, d_reg, ch, &inst->Dst[0]);
      free_itemps(gen);
   }

   if (scalar) {
      spe_release_register(gen->f, retval_reg);
   }

   return TRUE;
}


/**
 * Emit code for a texture lookup by calling an SPU texture function.
 *
 * The function is chosen from the texture target: 1D and 2D use
 * "spu_tex_2d", 3D uses "spu_tex_3d" and CUBE uses "spu_tex_cube".
 * The four coordinate channels of Src[0] are passed in registers 3..6,
 * the sampler unit (Src[1].Register.Index) in register 7, and the four
 * result channels are read back from registers 3..6.
 *
 * \return TRUE on success, FALSE for an unsupported texture target
 */
static boolean
emit_TEX(struct codegen *gen, const struct tgsi_full_instruction *inst)
{
   const uint target = inst->Texture.Texture;
   const uint unit = inst->Src[1].Register.Index;
   uint addr;
   int ch;
   int coord_regs[4], d_regs[4];

   switch (target) {
   case TGSI_TEXTURE_1D:
   case TGSI_TEXTURE_2D:
      addr = lookup_function(gen->cell, "spu_tex_2d");
      break;
   case TGSI_TEXTURE_3D:
      addr = lookup_function(gen->cell, "spu_tex_3d");
      break;
   case TGSI_TEXTURE_CUBE:
      addr = lookup_function(gen->cell, "spu_tex_cube");
      break;
   default:
      ASSERT(0 && "unsupported texture target");
      return FALSE;
   }

   assert(inst->Src[1].Register.File == TGSI_FILE_SAMPLER);

   spe_comment(gen->f, -4, "CALL tex:");

   /* get src/dst reg info (all four channels, regardless of writemask) */
   for (ch = 0; ch < 4; ch++) {
      coord_regs[ch] = get_src_reg(gen, ch, &inst->Src[0]);
      d_regs[ch] = get_dst_reg(gen, ch, &inst->Dst[0]);
   }

   {
      ubyte usedRegs[SPE_NUM_REGS];
      uint i, numUsed;

      numUsed = spe_get_registers_used(gen->f, usedRegs);
      assert(numUsed < gen->frame_size / 16 - 2);

      /* save registers to stack */
      for (i = 0; i < numUsed; i++) {
         uint reg = usedRegs[i];
         int offset = 2 + i;
         spe_stqd(gen->f, reg, SPE_REG_SP, 16 * offset);
      }

      /* setup function arguments (XXX depends on target) */
      for (i = 0; i < 4; i++) {
         spe_move(gen->f, 3 + i, coord_regs[i]);
      }
      spe_load_uint(gen->f, 7, unit); /* sampler unit */

      /* branch to function, save return addr */
      spe_brasl(gen->f, SPE_REG_RA, addr);

      /* save function's return values (four pixel's colors) */
      for (i = 0; i < 4; i++) {
         spe_move(gen->f, d_regs[i], 3 + i);
      }

      /* restore registers from stack, except the four dest registers
       * which now hold the results
       */
      for (i = 0; i < numUsed; i++) {
         uint reg = usedRegs[i];
         if (reg != d_regs[0] &&
             reg != d_regs[1] &&
             reg != d_regs[2] &&
             reg != d_regs[3]) {
            int offset = 2 + i;
            spe_lqd(gen->f, reg, SPE_REG_SP, 16 * offset);
         }
      }
   }

   /* NOTE(review): free_itemps() runs inside this loop, i.e. after the
    * first enabled channel is stored, while later d_regs[] entries are
    * still to be stored.  Other emitters free after the loop -- confirm
    * this is safe if get_dst_reg() can hand out intermediate temps.
    */
   FOR_EACH_ENABLED_CHANNEL(inst, ch) {
      store_dest_reg(gen, d_regs[ch], ch, &inst->Dst[0]);
      free_itemps(gen);
   }

   return TRUE;
}


/**
 * KILL if any of src reg values are less than zero.
 *
 * The per-channel "src < 0" comparison results are OR'd together, masked
 * with the execution mask when inside an IF or loop, and then merged
 * into gen->kill_mask_reg (which is allocated on first use).
 *
 * NOTE(review): if no channel is enabled, kil_reg stays -1 and is still
 * passed to the emit calls below -- confirm this cannot happen.
 */
static boolean
emit_KIL(struct codegen *gen, const struct tgsi_full_instruction *inst)
{
   int ch;
   int s_regs[4], kil_reg = -1, cmp_reg, zero_reg;

   spe_comment(gen->f, -4, "CALL kil:");

   /* zero = {0,0,0,0} */
   zero_reg = get_itemp(gen);
   spe_zero(gen->f, zero_reg);

   cmp_reg = get_itemp(gen);

   /* get src regs */
   FOR_EACH_ENABLED_CHANNEL(inst, ch) {
      s_regs[ch] = get_src_reg(gen, ch, &inst->Src[0]);
   }

   /* test if any src regs are < 0 */
   FOR_EACH_ENABLED_CHANNEL(inst, ch) {
      if (kil_reg >= 0) {
         /* cmp = (0 > src) ? ~0 : 0 */
         spe_fcgt(gen->f, cmp_reg, zero_reg, s_regs[ch]);
         /* kil = kil | cmp */
         spe_or(gen->f, kil_reg, kil_reg, cmp_reg);
      }
      else {
         /* first enabled channel: allocate kil and compare directly into it */
         kil_reg = get_itemp(gen);
         /* kil = (0 > src) ? ~0 : 0 */
         spe_fcgt(gen->f, kil_reg, zero_reg, s_regs[ch]);
      }
   }

   if (gen->if_nesting || gen->loop_nesting) {
      /* may have been a conditional kil */
      spe_and(gen->f, kil_reg, kil_reg, gen->exec_mask_reg);
   }

   /* allocate the kill mask reg if needed (a value <= 0 means "not yet") */
   if (gen->kill_mask_reg <= 0) {
      gen->kill_mask_reg = spe_allocate_available_register(gen->f);
      spe_move(gen->f, gen->kill_mask_reg, kil_reg);
   }
   else {
      spe_or(gen->f, gen->kill_mask_reg, gen->kill_mask_reg, kil_reg);
   }

   free_itemps(gen);

   return TRUE;
}



/**
 * Emit min or max.
 *
 * For each enabled channel a greater-than compare writes a mask into a
 * temp, and spe_selb() then picks between s0 and s1 using that mask.
 * MIN differs from MAX only in the operand order of the compare.
 */
static boolean
emit_MIN_MAX(struct codegen *gen, const struct tgsi_full_instruction *inst)
{
   int ch, s0_reg[4], s1_reg[4], d_reg[4], tmp_reg[4];

   FOR_EACH_ENABLED_CHANNEL(inst, ch) {
      s0_reg[ch] = get_src_reg(gen, ch, &inst->Src[0]);
      s1_reg[ch] = get_src_reg(gen, ch, &inst->Src[1]);
      d_reg[ch] = get_dst_reg(gen, ch, &inst->Dst[0]);
      tmp_reg[ch] = get_itemp(gen);
   }

   /* MAX: d = (s0 > s1) ? s0 : s1;  MIN: d = (s1 > s0) ? s0 : s1 */
   FOR_EACH_ENABLED_CHANNEL(inst, ch) {
      if (inst->Instruction.Opcode == TGSI_OPCODE_MAX)
         spe_fcgt(gen->f, tmp_reg[ch], s0_reg[ch], s1_reg[ch]);
      else
         spe_fcgt(gen->f, tmp_reg[ch], s1_reg[ch], s0_reg[ch]);
   }
   FOR_EACH_ENABLED_CHANNEL(inst, ch) {
      spe_selb(gen->f, d_reg[ch], s1_reg[ch], s0_reg[ch], tmp_reg[ch]);
   }

   FOR_EACH_ENABLED_CHANNEL(inst, ch) {
      store_dest_reg(gen, d_reg[ch], ch, &inst->Dst[0]);
   }

   free_itemps(gen);
   return TRUE;
}


/**
 * Emit code to update the execution mask.
 * This needs to be done whenever the execution status of a conditional
 * or loop is changed.
*/ static void emit_update_exec_mask(struct codegen *gen) { const int exec_reg = get_exec_mask_reg(gen); const int cond_reg = gen->cond_mask_reg; const int loop_reg = gen->loop_mask_reg; spe_comment(gen->f, 0, "Update master execution mask"); if (gen->if_nesting > 0 && gen->loop_nesting > 0) { /* exec_mask = cond_mask & loop_mask */ assert(cond_reg > 0); assert(loop_reg > 0); spe_and(gen->f, exec_reg, cond_reg, loop_reg); } else if (gen->if_nesting > 0) { assert(cond_reg > 0); spe_move(gen->f, exec_reg, cond_reg); } else if (gen->loop_nesting > 0) { assert(loop_reg > 0); spe_move(gen->f, exec_reg, loop_reg); } else { spe_load_int(gen->f, exec_reg, ~0x0); } } static boolean emit_IF(struct codegen *gen, const struct tgsi_full_instruction *inst) { const int channel = 0; int cond_reg; cond_reg = get_cond_mask_reg(gen); /* XXX push cond exec mask */ spe_comment(gen->f, 0, "init conditional exec mask = ~0:"); spe_load_int(gen->f, cond_reg, ~0); /* update conditional execution mask with the predicate register */ int tmp_reg = get_itemp(gen); int s1_reg = get_src_reg(gen, channel, &inst->Src[0]); /* tmp = (s1_reg == 0) */ spe_ceqi(gen->f, tmp_reg, s1_reg, 0); /* tmp = !tmp */ spe_complement(gen->f, tmp_reg, tmp_reg); /* cond_mask = cond_mask & tmp */ spe_and(gen->f, cond_reg, cond_reg, tmp_reg); gen->if_nesting++; /* update the master execution mask */ emit_update_exec_mask(gen); free_itemps(gen); return TRUE; } static boolean emit_ELSE(struct codegen *gen, const struct tgsi_full_instruction *inst) { const int cond_reg = get_cond_mask_reg(gen); spe_comment(gen->f, 0, "cond exec mask = !cond exec mask"); spe_complement(gen->f, cond_reg, cond_reg); emit_update_exec_mask(gen); return TRUE; } static boolean emit_ENDIF(struct codegen *gen, const struct tgsi_full_instruction *inst) { /* XXX todo: pop cond exec mask */ gen->if_nesting--; emit_update_exec_mask(gen); return TRUE; } static boolean emit_BGNLOOP(struct codegen *gen, const struct tgsi_full_instruction *inst) { int 
exec_reg, loop_reg; exec_reg = get_exec_mask_reg(gen); loop_reg = get_loop_mask_reg(gen); /* XXX push loop_exec mask */ spe_comment(gen->f, 0*-4, "initialize loop exec mask = ~0"); spe_load_int(gen->f, loop_reg, ~0x0); gen->loop_nesting++; gen->loop_start = spe_code_size(gen->f); /* in bytes */ return TRUE; } static boolean emit_ENDLOOP(struct codegen *gen, const struct tgsi_full_instruction *inst) { const int loop_reg = get_loop_mask_reg(gen); const int tmp_reg = get_itemp(gen); int offset; /* tmp_reg = exec[0] | exec[1] | exec[2] | exec[3] */ spe_orx(gen->f, tmp_reg, loop_reg); offset = gen->loop_start - spe_code_size(gen->f); /* in bytes */ /* branch back to top of loop if tmp_reg != 0 */ spe_brnz(gen->f, tmp_reg, offset / 4); /* XXX pop loop_exec mask */ gen->loop_nesting--; emit_update_exec_mask(gen); return TRUE; } static boolean emit_BRK(struct codegen *gen, const struct tgsi_full_instruction *inst) { const int exec_reg = get_exec_mask_reg(gen); const int loop_reg = get_loop_mask_reg(gen); assert(gen->loop_nesting > 0); spe_comment(gen->f, 0, "loop exec mask &= ~master exec mask"); spe_andc(gen->f, loop_reg, loop_reg, exec_reg); emit_update_exec_mask(gen); return TRUE; } static boolean emit_CONT(struct codegen *gen, const struct tgsi_full_instruction *inst) { assert(gen->loop_nesting > 0); return TRUE; } static boolean emit_DDX_DDY(struct codegen *gen, const struct tgsi_full_instruction *inst, boolean ddx) { int ch; FOR_EACH_ENABLED_CHANNEL(inst, ch) { int s_reg = get_src_reg(gen, ch, &inst->Src[0]); int d_reg = get_dst_reg(gen, ch, &inst->Dst[0]); int t1_reg = get_itemp(gen); int t2_reg = get_itemp(gen); spe_splat_word(gen->f, t1_reg, s_reg, 0); /* upper-left pixel */ if (ddx) { spe_splat_word(gen->f, t2_reg, s_reg, 1); /* upper-right pixel */ } else { spe_splat_word(gen->f, t2_reg, s_reg, 2); /* lower-left pixel */ } spe_fs(gen->f, d_reg, t2_reg, t1_reg); free_itemps(gen); } return TRUE; } /** * Emit END instruction. 
* We just return from the shader function at this point. * * Note that there may be more code after this that would be * called by TGSI_OPCODE_CALL. */ static boolean emit_END(struct codegen *gen) { emit_epilogue(gen); return TRUE; } /** * Emit code for the given instruction. Just a big switch stmt. */ static boolean emit_instruction(struct codegen *gen, const struct tgsi_full_instruction *inst) { switch (inst->Instruction.Opcode) { case TGSI_OPCODE_ARL: return emit_ARL(gen, inst); case TGSI_OPCODE_MOV: return emit_MOV(gen, inst); case TGSI_OPCODE_ADD: case TGSI_OPCODE_SUB: case TGSI_OPCODE_MUL: return emit_binop(gen, inst); case TGSI_OPCODE_MAD: return emit_MAD(gen, inst); case TGSI_OPCODE_LRP: return emit_LRP(gen, inst); case TGSI_OPCODE_DP3: return emit_DP3(gen, inst); case TGSI_OPCODE_DP4: return emit_DP4(gen, inst); case TGSI_OPCODE_DPH: return emit_DPH(gen, inst); case TGSI_OPCODE_NRM: return emit_NRM3(gen, inst); case TGSI_OPCODE_XPD: return emit_XPD(gen, inst); case TGSI_OPCODE_RCP: case TGSI_OPCODE_RSQ: return emit_RCP_RSQ(gen, inst); case TGSI_OPCODE_ABS: return emit_ABS(gen, inst); case TGSI_OPCODE_SGT: case TGSI_OPCODE_SLT: case TGSI_OPCODE_SGE: case TGSI_OPCODE_SLE: case TGSI_OPCODE_SEQ: case TGSI_OPCODE_SNE: return emit_inequality(gen, inst); case TGSI_OPCODE_CMP: return emit_CMP(gen, inst); case TGSI_OPCODE_MIN: case TGSI_OPCODE_MAX: return emit_MIN_MAX(gen, inst); case TGSI_OPCODE_TRUNC: return emit_TRUNC(gen, inst); case TGSI_OPCODE_FLR: return emit_FLR(gen, inst); case TGSI_OPCODE_FRC: return emit_FRC(gen, inst); case TGSI_OPCODE_END: return emit_END(gen); case TGSI_OPCODE_COS: return emit_function_call(gen, inst, "spu_cos", 1, TRUE); case TGSI_OPCODE_SIN: return emit_function_call(gen, inst, "spu_sin", 1, TRUE); case TGSI_OPCODE_POW: return emit_function_call(gen, inst, "spu_pow", 2, TRUE); case TGSI_OPCODE_EX2: return emit_function_call(gen, inst, "spu_exp2", 1, TRUE); case TGSI_OPCODE_LG2: return emit_function_call(gen, inst, "spu_log2", 1, 
TRUE); case TGSI_OPCODE_TEX: /* fall-through for now */ case TGSI_OPCODE_TXD: /* fall-through for now */ case TGSI_OPCODE_TXB: /* fall-through for now */ case TGSI_OPCODE_TXL: /* fall-through for now */ case TGSI_OPCODE_TXP: return emit_TEX(gen, inst); case TGSI_OPCODE_KIL: return emit_KIL(gen, inst); case TGSI_OPCODE_IF: return emit_IF(gen, inst); case TGSI_OPCODE_ELSE: return emit_ELSE(gen, inst); case TGSI_OPCODE_ENDIF: return emit_ENDIF(gen, inst); case TGSI_OPCODE_BGNLOOP: return emit_BGNLOOP(gen, inst); case TGSI_OPCODE_ENDLOOP: return emit_ENDLOOP(gen, inst); case TGSI_OPCODE_BRK: return emit_BRK(gen, inst); case TGSI_OPCODE_CONT: return emit_CONT(gen, inst); case TGSI_OPCODE_DDX: return emit_DDX_DDY(gen, inst, TRUE); case TGSI_OPCODE_DDY: return emit_DDX_DDY(gen, inst, FALSE); /* XXX lots more cases to do... */ default: fprintf(stderr, "Cell: unimplemented TGSI instruction %d!\n", inst->Instruction.Opcode); return FALSE; } return TRUE; } /** * Emit code for a TGSI immediate value (vector of four floats). * This involves register allocation and initialization. * XXX the initialization should be done by a "prepare" stage, not * per quad execution! */ static boolean emit_immediate(struct codegen *gen, const struct tgsi_full_immediate *immed) { int ch; assert(gen->num_imm < MAX_TEMPS); for (ch = 0; ch < 4; ch++) { float val = immed->u[ch].Float; if (ch > 0 && val == immed->u[ch - 1].Float) { /* re-use previous register */ gen->imm_regs[gen->num_imm][ch] = gen->imm_regs[gen->num_imm][ch - 1]; } else { char str[100]; int reg = spe_allocate_available_register(gen->f); if (reg < 0) return FALSE; sprintf(str, "init $%d = %f", reg, val); spe_comment(gen->f, 0, str); /* update immediate map */ gen->imm_regs[gen->num_imm][ch] = reg; /* emit initializer instruction */ spe_load_float(gen->f, reg, val); } } gen->num_imm++; return TRUE; } /** * Emit "code" for a TGSI declaration. * We only care about TGSI TEMPORARY register declarations at this time. 
* For each TGSI TEMPORARY we allocate four SPE registers. */ static boolean emit_declaration(struct cell_context *cell, struct codegen *gen, const struct tgsi_full_declaration *decl) { int i, ch; switch (decl->Declaration.File) { case TGSI_FILE_TEMPORARY: for (i = decl->Range.First; i <= decl->Range.Last; i++) { assert(i < MAX_TEMPS); for (ch = 0; ch < 4; ch++) { gen->temp_regs[i][ch] = spe_allocate_available_register(gen->f); if (gen->temp_regs[i][ch] < 0) return FALSE; /* out of regs */ } /* XXX if we run out of SPE registers, we need to spill * to SPU memory. someday... */ { char buf[100]; sprintf(buf, "TGSI temp[%d] maps to SPU regs [$%d $%d $%d $%d]", i, gen->temp_regs[i][0], gen->temp_regs[i][1], gen->temp_regs[i][2], gen->temp_regs[i][3]); spe_comment(gen->f, 0, buf); } } break; default: ; /* ignore */ } return TRUE; } /** * Translate TGSI shader code to SPE instructions. This is done when * the state tracker gives us a new shader (via pipe->create_fs_state()). * * \param cell the rendering context (in) * \param tokens the TGSI shader (in) * \param f the generated function (out) */ boolean cell_gen_fragment_program(struct cell_context *cell, const struct tgsi_token *tokens, struct spe_function *f) { struct tgsi_parse_context parse; struct codegen gen; uint ic = 0; memset(&gen, 0, sizeof(gen)); gen.cell = cell; gen.f = f; /* For SPE function calls: reg $3 = first param, $4 = second param, etc. 
*/ gen.inputs_reg = 3; /* pointer to inputs array */ gen.outputs_reg = 4; /* pointer to outputs array */ gen.constants_reg = 5; /* pointer to constants array */ spe_init_func(f, SPU_MAX_FRAGMENT_PROGRAM_INSTS * SPE_INST_SIZE); spe_allocate_register(f, gen.inputs_reg); spe_allocate_register(f, gen.outputs_reg); spe_allocate_register(f, gen.constants_reg); if (cell->debug_flags & CELL_DEBUG_ASM) { spe_print_code(f, TRUE); spe_indent(f, 2*8); printf("Begin %s\n", __FUNCTION__); tgsi_dump(tokens, 0); } tgsi_parse_init(&parse, tokens); emit_prologue(&gen); while (!tgsi_parse_end_of_tokens(&parse) && !gen.error) { tgsi_parse_token(&parse); switch (parse.FullToken.Token.Type) { case TGSI_TOKEN_TYPE_IMMEDIATE: if (f->print) { _debug_printf(" # "); tgsi_dump_immediate(&parse.FullToken.FullImmediate); } if (!emit_immediate(&gen, &parse.FullToken.FullImmediate)) gen.error = TRUE; break; case TGSI_TOKEN_TYPE_DECLARATION: if (f->print) { _debug_printf(" # "); tgsi_dump_declaration(&parse.FullToken.FullDeclaration); } if (!emit_declaration(cell, &gen, &parse.FullToken.FullDeclaration)) gen.error = TRUE; break; case TGSI_TOKEN_TYPE_INSTRUCTION: if (f->print) { _debug_printf(" # "); ic++; tgsi_dump_instruction(&parse.FullToken.FullInstruction, ic); } if (!emit_instruction(&gen, &parse.FullToken.FullInstruction)) gen.error = TRUE; break; default: assert(0); } } if (gen.error) { /* terminate the SPE code */ return emit_END(&gen); } if (cell->debug_flags & CELL_DEBUG_ASM) { printf("cell_gen_fragment_program nr instructions: %d\n", f->num_inst); printf("End %s\n", __FUNCTION__); } tgsi_parse_free( &parse ); return !gen.error; }