/* * (C) Copyright IBM Corporation 2008 * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * on the rights to use, copy, modify, merge, publish, distribute, sub * license, and/or sell copies of the Software, and to permit persons to whom * the Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including the next * paragraph) shall be included in all copies or substantial portions of the * Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE * USE OR OTHER DEALINGS IN THE SOFTWARE. */ /** * \file * Generate code to perform all per-fragment operations. * * Code generated by these functions perform both alpha, depth, and stencil * testing as well as alpha blending. * * \note * Occlusion query is not supported, but this is the right place to add that * support. * * \author Ian Romanick */ #include "pipe/p_defines.h" #include "pipe/p_state.h" #include "cell_context.h" #include "rtasm/rtasm_ppc_spe.h" /** * Generate code to perform alpha testing. * * The code generated by this function uses the register specificed by * \c mask as both an input and an output. * * \param dsa Current alpha-test state * \param f Function to which code should be appended * \param mask Index of register containing active fragment mask * \param alphas Index of register containing per-fragment alpha values * * \note Emits a maximum of 6 instructions. */ static void emit_alpha_test(struct pipe_depth_stencil_alpha_state *dsa, struct spe_function *f, int mask, int alphas) { /* If the alpha function is either NEVER or ALWAYS, there is no need to * load the reference value into a register. ALWAYS is a fairly common * case, and this optimization saves 2 instructions. */ if (dsa->alpha.enabled && (dsa->alpha.func != PIPE_FUNC_NEVER) && (dsa->alpha.func != PIPE_FUNC_ALWAYS)) { int ref = spe_allocate_available_register(f); int tmp_a = spe_allocate_available_register(f); int tmp_b = spe_allocate_available_register(f); union { float f; unsigned u; } ref_val; boolean complement = FALSE; ref_val.f = dsa->alpha.ref; spe_il(f, ref, ref_val.u & 0x0000ffff); spe_ilh(f, ref, ref_val.u >> 16); switch (dsa->alpha.func) { case PIPE_FUNC_NOTEQUAL: complement = TRUE; /* FALLTHROUGH */ case PIPE_FUNC_EQUAL: spe_fceq(f, tmp_a, ref, alphas); break; case PIPE_FUNC_LEQUAL: complement = TRUE; /* FALLTHROUGH */ case PIPE_FUNC_GREATER: spe_fcgt(f, tmp_a, ref, alphas); break; case PIPE_FUNC_LESS: complement = TRUE; /* FALLTHROUGH */ case PIPE_FUNC_GEQUAL: spe_fcgt(f, tmp_a, ref, alphas); spe_fceq(f, tmp_b, ref, alphas); spe_or(f, tmp_a, tmp_b, tmp_a); break; case PIPE_FUNC_ALWAYS: case PIPE_FUNC_NEVER: default: assert(0); break; } if (complement) { spe_andc(f, mask, mask, tmp_a); } else { spe_and(f, mask, mask, tmp_a); } spe_release_register(f, ref); spe_release_register(f, tmp_a); spe_release_register(f, tmp_b); } else if (dsa->alpha.enabled && (dsa->alpha.func == PIPE_FUNC_NEVER)) { spe_il(f, mask, 0); } } /** * Generate code to perform Z testing. Four Z values are tested at once. * \param dsa Current depth-test state * \param f Function to which code should be appended * \param mask Index of register to contain depth-pass mask * \param stored Index of register containing values from depth buffer * \param calculated Index of register containing per-fragment depth values * * \return * If the calculated depth comparison mask is the actual mask, \c FALSE is * returned. If the calculated depth comparison mask is the compliment of * the actual mask, \c TRUE is returned. * * \note Emits a maximum of 3 instructions. */ static boolean emit_depth_test(struct pipe_depth_stencil_alpha_state *dsa, struct spe_function *f, int mask, int stored, int calculated) { unsigned func = (dsa->depth.enabled) ? dsa->depth.func : PIPE_FUNC_ALWAYS; int tmp = spe_allocate_available_register(f); boolean compliment = FALSE; switch (func) { case PIPE_FUNC_NEVER: spe_il(f, mask, 0); break; case PIPE_FUNC_NOTEQUAL: compliment = TRUE; /* FALLTHROUGH */ case PIPE_FUNC_EQUAL: spe_ceq(f, mask, calculated, stored); break; case PIPE_FUNC_LEQUAL: compliment = TRUE; /* FALLTHROUGH */ case PIPE_FUNC_GREATER: spe_clgt(f, mask, calculated, stored); break; case PIPE_FUNC_LESS: compliment = TRUE; /* FALLTHROUGH */ case PIPE_FUNC_GEQUAL: spe_clgt(f, mask, calculated, stored); spe_ceq(f, tmp, calculated, stored); spe_or(f, mask, mask, tmp); break; case PIPE_FUNC_ALWAYS: spe_il(f, mask, ~0); break; default: assert(0); break; } spe_release_register(f, tmp); return compliment; } /** * Generate code to apply the stencil operation (after testing). * \note Emits a maximum of 5 instructions. * * \warning * Since \c out and \c in might be the same register, this routine cannot * generate code that uses \c out as a temporary. */ static void emit_stencil_op(struct spe_function *f, int out, int in, int mask, unsigned op, unsigned ref) { const int clamp = spe_allocate_available_register(f); const int clamp_mask = spe_allocate_available_register(f); const int result = spe_allocate_available_register(f); switch(op) { case PIPE_STENCIL_OP_KEEP: assert(0); case PIPE_STENCIL_OP_ZERO: spe_il(f, result, 0); break; case PIPE_STENCIL_OP_REPLACE: spe_il(f, result, ref); break; case PIPE_STENCIL_OP_INCR: /* clamp = [0xff, 0xff, 0xff, 0xff] */ spe_il(f, clamp, 0x0ff); /* result[i] = in[i] + 1 */ spe_ai(f, result, in, 1); /* clamp_mask[i] = (result[i] > 0xff) */ spe_clgti(f, clamp_mask, result, 0x0ff); /* result[i] = clamp_mask[i] ? clamp[i] : result[i] */ spe_selb(f, result, result, clamp, clamp_mask); break; case PIPE_STENCIL_OP_DECR: spe_il(f, clamp, 0); spe_ai(f, result, in, -1); /* If "(s-1) < 0" in signed arithemtic, then "(s-1) > MAX" in unsigned * arithmetic. */ spe_clgti(f, clamp_mask, result, 0x0ff); spe_selb(f, result, result, clamp, clamp_mask); break; case PIPE_STENCIL_OP_INCR_WRAP: spe_ai(f, result, in, 1); break; case PIPE_STENCIL_OP_DECR_WRAP: spe_ai(f, result, in, -1); break; case PIPE_STENCIL_OP_INVERT: spe_nor(f, result, in, in); break; default: assert(0); } spe_selb(f, out, in, result, mask); spe_release_register(f, result); spe_release_register(f, clamp_mask); spe_release_register(f, clamp); } /** * Generate code to do stencil test. Four pixels are tested at once. * \param dsa Depth / stencil test state * \param face 0 for front face, 1 for back face * \param f Function to append instructions to * \param mask Register containing mask of fragments passing the * alpha test * \param depth_mask Register containing mask of fragments passing the * depth test * \param depth_compliment Is \c depth_mask the compliment of the actual mask? * \param stencil Register containing values from stencil buffer * \param depth_pass Register to store mask of fragments passing stencil test * and depth test * * \note * Emits a maximum of 10 + (3 * 5) = 25 instructions. */ static int emit_stencil_test(struct pipe_depth_stencil_alpha_state *dsa, struct pipe_stencil_ref *sr, unsigned face, struct spe_function *f, int mask, int depth_mask, boolean depth_complement, int stencil, int depth_pass) { int stencil_fail = spe_allocate_available_register(f); int depth_fail = spe_allocate_available_register(f); int stencil_mask = spe_allocate_available_register(f); int stencil_pass = spe_allocate_available_register(f); int face_stencil = spe_allocate_available_register(f); int stencil_src = stencil; const unsigned ref = (sr->ref_value[face] & dsa->stencil[face].valuemask); boolean complement = FALSE; int stored; int tmp = spe_allocate_available_register(f); if ((dsa->stencil[face].func != PIPE_FUNC_NEVER) && (dsa->stencil[face].func != PIPE_FUNC_ALWAYS) && (dsa->stencil[face].valuemask != 0x0ff)) { stored = spe_allocate_available_register(f); spe_andi(f, stored, stencil, dsa->stencil[face].valuemask); } else { stored = stencil; } switch (dsa->stencil[face].func) { case PIPE_FUNC_NEVER: spe_il(f, stencil_mask, 0); /* stencil_mask[0..3] = [0,0,0,0] */ break; case PIPE_FUNC_NOTEQUAL: complement = TRUE; /* FALLTHROUGH */ case PIPE_FUNC_EQUAL: /* stencil_mask[i] = (stored[i] == ref) */ spe_ceqi(f, stencil_mask, stored, ref); break; case PIPE_FUNC_LEQUAL: complement = TRUE; /* FALLTHROUGH */ case PIPE_FUNC_GREATER: complement = TRUE; /* stencil_mask[i] = (stored[i] > ref) */ spe_clgti(f, stencil_mask, stored, ref); break; case PIPE_FUNC_LESS: complement = TRUE; /* FALLTHROUGH */ case PIPE_FUNC_GEQUAL: /* stencil_mask[i] = (stored[i] > ref) */ spe_clgti(f, stencil_mask, stored, ref); /* tmp[i] = (stored[i] == ref) */ spe_ceqi(f, tmp, stored, ref); /* stencil_mask[i] = stencil_mask[i] | tmp[i] */ spe_or(f, stencil_mask, stencil_mask, tmp); break; case PIPE_FUNC_ALWAYS: /* See comment below. */ break; default: assert(0); break; } if (stored != stencil) { spe_release_register(f, stored); } spe_release_register(f, tmp); /* ALWAYS is a very common stencil-test, so some effort is applied to * optimize that case. The stencil-pass mask is the same as the input * fragment mask. This makes the stencil-test (above) a no-op, and the * input fragment mask can be "renamed" the stencil-pass mask. */ if (dsa->stencil[face].func == PIPE_FUNC_ALWAYS) { spe_release_register(f, stencil_pass); stencil_pass = mask; } else { if (complement) { spe_andc(f, stencil_pass, mask, stencil_mask); } else { spe_and(f, stencil_pass, mask, stencil_mask); } } if (depth_complement) { spe_andc(f, depth_pass, stencil_pass, depth_mask); } else { spe_and(f, depth_pass, stencil_pass, depth_mask); } /* Conditionally emit code to update the stencil value under various * condititons. Note that there is no need to generate code under the * following circumstances: * * - Stencil write mask is zero. * - For stencil-fail if the stencil test is ALWAYS * - For depth-fail if the stencil test is NEVER * - For depth-pass if the stencil test is NEVER * - Any of the 3 conditions if the operation is KEEP */ if (dsa->stencil[face].writemask != 0) { if ((dsa->stencil[face].func != PIPE_FUNC_ALWAYS) && (dsa->stencil[face].fail_op != PIPE_STENCIL_OP_KEEP)) { if (complement) { spe_and(f, stencil_fail, mask, stencil_mask); } else { spe_andc(f, stencil_fail, mask, stencil_mask); } emit_stencil_op(f, face_stencil, stencil_src, stencil_fail, dsa->stencil[face].fail_op, sr->ref_value[face]); stencil_src = face_stencil; } if ((dsa->stencil[face].func != PIPE_FUNC_NEVER) && (dsa->stencil[face].zfail_op != PIPE_STENCIL_OP_KEEP)) { if (depth_complement) { spe_and(f, depth_fail, stencil_pass, depth_mask); } else { spe_andc(f, depth_fail, stencil_pass, depth_mask); } emit_stencil_op(f, face_stencil, stencil_src, depth_fail, dsa->stencil[face].zfail_op, sr->ref_value[face]); stencil_src = face_stencil; } if ((dsa->stencil[face].func != PIPE_FUNC_NEVER) && (dsa->stencil[face].zpass_op != PIPE_STENCIL_OP_KEEP)) { emit_stencil_op(f, face_stencil, stencil_src, depth_pass, dsa->stencil[face].zpass_op, sr->ref_value[face]); stencil_src = face_stencil; } } spe_release_register(f, stencil_fail); spe_release_register(f, depth_fail); spe_release_register(f, stencil_mask); if (stencil_pass != mask) { spe_release_register(f, stencil_pass); } /* If all of the stencil operations were KEEP or the stencil write mask was * zero, "stencil_src" will still be set to "stencil". In this case * release the "face_stencil" register. Otherwise apply the stencil write * mask to select bits from the calculated stencil value and the previous * stencil value. */ if (stencil_src == stencil) { spe_release_register(f, face_stencil); } else if (dsa->stencil[face].writemask != 0x0ff) { int tmp = spe_allocate_available_register(f); spe_il(f, tmp, dsa->stencil[face].writemask); spe_selb(f, stencil_src, stencil, stencil_src, tmp); spe_release_register(f, tmp); } return stencil_src; } void cell_generate_depth_stencil_test(struct cell_depth_stencil_alpha_state *cdsa, struct pipe_stencil_ref *sr) { struct pipe_depth_stencil_alpha_state *const dsa = &cdsa->base; struct spe_function *const f = &cdsa->code; /* This code generates a maximum of 6 (alpha test) + 3 (depth test) * + 25 (front stencil) + 25 (back stencil) + 4 = 63 instructions. Round * up to 64 to make it a happy power-of-two. */ spe_init_func(f, SPE_INST_SIZE * 64); /* Allocate registers for the function's input parameters. Cleverly (and * clever code is usually dangerous, but I couldn't resist) the generated * function returns a structure. Returned structures start with register * 3, and the structure fields are ordered to match up exactly with the * input parameters. */ int mask = spe_allocate_register(f, 3); int depth = spe_allocate_register(f, 4); int stencil = spe_allocate_register(f, 5); int zvals = spe_allocate_register(f, 6); int frag_a = spe_allocate_register(f, 7); int facing = spe_allocate_register(f, 8); int depth_mask = spe_allocate_available_register(f); boolean depth_complement; emit_alpha_test(dsa, f, mask, frag_a); depth_complement = emit_depth_test(dsa, f, depth_mask, depth, zvals); if (dsa->stencil[0].enabled) { const int front_depth_pass = spe_allocate_available_register(f); int front_stencil = emit_stencil_test(dsa, sr, 0, f, mask, depth_mask, depth_complement, stencil, front_depth_pass); if (dsa->stencil[1].enabled) { const int back_depth_pass = spe_allocate_available_register(f); int back_stencil = emit_stencil_test(dsa, sr, 1, f, mask, depth_mask, depth_complement, stencil, back_depth_pass); /* If the front facing stencil value and the back facing stencil * value are stored in the same register, there is no need to select * a value based on the facing. This can happen if the stencil value * was not modified due to the write masks being zero, the stencil * operations being KEEP, etc. */ if (front_stencil != back_stencil) { spe_selb(f, stencil, back_stencil, front_stencil, facing); } if (back_stencil != stencil) { spe_release_register(f, back_stencil); } if (front_stencil != stencil) { spe_release_register(f, front_stencil); } spe_selb(f, mask, back_depth_pass, front_depth_pass, facing); spe_release_register(f, back_depth_pass); } else { if (front_stencil != stencil) { spe_or(f, stencil, front_stencil, front_stencil); spe_release_register(f, front_stencil); } spe_or(f, mask, front_depth_pass, front_depth_pass); } spe_release_register(f, front_depth_pass); } else if (dsa->depth.enabled) { if (depth_complement) { spe_andc(f, mask, mask, depth_mask); } else { spe_and(f, mask, mask, depth_mask); } } if (dsa->depth.writemask) { spe_selb(f, depth, depth, zvals, mask); } spe_bi(f, 0, 0, 0); /* return from function call */ #if 0 { const uint32_t *p = f->store; unsigned i; printf("# alpha (%sabled)\n", (dsa->alpha.enabled) ? "en" : "dis"); printf("# func: %u\n", dsa->alpha.func); printf("# ref: %.2f\n", dsa->alpha.ref); printf("# depth (%sabled)\n", (dsa->depth.enabled) ? "en" : "dis"); printf("# func: %u\n", dsa->depth.func); for (i = 0; i < 2; i++) { printf("# %s stencil (%sabled)\n", (i == 0) ? "front" : "back", (dsa->stencil[i].enabled) ? "en" : "dis"); printf("# func: %u\n", dsa->stencil[i].func); printf("# op (sf, zf, zp): %u %u %u\n", dsa->stencil[i].fail_op, dsa->stencil[i].zfail_op, dsa->stencil[i].zpass_op); printf("# ref value / value mask / write mask: %02x %02x %02x\n", sr->ref_value[i], dsa->stencil[i].valuemask, dsa->stencil[i].writemask); } printf("\t.text\n"); for (/* empty */; p < f->csr; p++) { printf("\t.long\t0x%04x\n", *p); } fflush(stdout); } #endif } /** * \note Emits a maximum of 3 instructions */ static int emit_alpha_factor_calculation(struct spe_function *f, unsigned factor, int src_alpha, int dst_alpha, int const_alpha) { int factor_reg; int tmp; switch (factor) { case PIPE_BLENDFACTOR_ONE: factor_reg = -1; break; case PIPE_BLENDFACTOR_SRC_ALPHA: factor_reg = spe_allocate_available_register(f); spe_or(f, factor_reg, src_alpha, src_alpha); break; case PIPE_BLENDFACTOR_DST_ALPHA: factor_reg = dst_alpha; break; case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: factor_reg = -1; break; case PIPE_BLENDFACTOR_INV_CONST_ALPHA: factor_reg = spe_allocate_available_register(f); tmp = spe_allocate_available_register(f); spe_il(f, tmp, 1); spe_cuflt(f, tmp, tmp, 0); spe_fs(f, factor_reg, tmp, const_alpha); spe_release_register(f, tmp); break; case PIPE_BLENDFACTOR_CONST_ALPHA: factor_reg = const_alpha; break; case PIPE_BLENDFACTOR_ZERO: factor_reg = -1; break; case PIPE_BLENDFACTOR_INV_SRC_ALPHA: tmp = spe_allocate_available_register(f); factor_reg = spe_allocate_available_register(f); spe_il(f, tmp, 1); spe_cuflt(f, tmp, tmp, 0); spe_fs(f, factor_reg, tmp, src_alpha); spe_release_register(f, tmp); break; case PIPE_BLENDFACTOR_INV_DST_ALPHA: tmp = spe_allocate_available_register(f); factor_reg = spe_allocate_available_register(f); spe_il(f, tmp, 1); spe_cuflt(f, tmp, tmp, 0); spe_fs(f, factor_reg, tmp, dst_alpha); spe_release_register(f, tmp); break; case PIPE_BLENDFACTOR_SRC1_ALPHA: case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: default: assert(0); factor_reg = -1; break; } return factor_reg; } /** * \note Emits a maximum of 6 instructions */ static void emit_color_factor_calculation(struct spe_function *f, unsigned sF, unsigned mask, const int *src, const int *dst, const int *const_color, int *factor) { int tmp; unsigned i; factor[0] = -1; factor[1] = -1; factor[2] = -1; factor[3] = -1; switch (sF) { case PIPE_BLENDFACTOR_ONE: break; case PIPE_BLENDFACTOR_SRC_COLOR: for (i = 0; i < 3; ++i) { if ((mask & (1U << i)) != 0) { factor[i] = spe_allocate_available_register(f); spe_or(f, factor[i], src[i], src[i]); } } break; case PIPE_BLENDFACTOR_SRC_ALPHA: factor[0] = spe_allocate_available_register(f); factor[1] = factor[0]; factor[2] = factor[0]; spe_or(f, factor[0], src[3], src[3]); break; case PIPE_BLENDFACTOR_DST_ALPHA: factor[0] = dst[3]; factor[1] = dst[3]; factor[2] = dst[3]; break; case PIPE_BLENDFACTOR_DST_COLOR: factor[0] = dst[0]; factor[1] = dst[1]; factor[2] = dst[2]; break; case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: tmp = spe_allocate_available_register(f); factor[0] = spe_allocate_available_register(f); factor[1] = factor[0]; factor[2] = factor[0]; /* Alpha saturate means min(As, 1-Ad). */ spe_il(f, tmp, 1); spe_cuflt(f, tmp, tmp, 0); spe_fs(f, tmp, tmp, dst[3]); spe_fcgt(f, factor[0], tmp, src[3]); spe_selb(f, factor[0], src[3], tmp, factor[0]); spe_release_register(f, tmp); break; case PIPE_BLENDFACTOR_INV_CONST_COLOR: tmp = spe_allocate_available_register(f); spe_il(f, tmp, 1); spe_cuflt(f, tmp, tmp, 0); for (i = 0; i < 3; i++) { factor[i] = spe_allocate_available_register(f); spe_fs(f, factor[i], tmp, const_color[i]); } spe_release_register(f, tmp); break; case PIPE_BLENDFACTOR_CONST_COLOR: for (i = 0; i < 3; i++) { factor[i] = const_color[i]; } break; case PIPE_BLENDFACTOR_INV_CONST_ALPHA: factor[0] = spe_allocate_available_register(f); factor[1] = factor[0]; factor[2] = factor[0]; tmp = spe_allocate_available_register(f); spe_il(f, tmp, 1); spe_cuflt(f, tmp, tmp, 0); spe_fs(f, factor[0], tmp, const_color[3]); spe_release_register(f, tmp); break; case PIPE_BLENDFACTOR_CONST_ALPHA: factor[0] = const_color[3]; factor[1] = factor[0]; factor[2] = factor[0]; break; case PIPE_BLENDFACTOR_ZERO: break; case PIPE_BLENDFACTOR_INV_SRC_COLOR: tmp = spe_allocate_available_register(f); spe_il(f, tmp, 1); spe_cuflt(f, tmp, tmp, 0); for (i = 0; i < 3; ++i) { if ((mask & (1U << i)) != 0) { factor[i] = spe_allocate_available_register(f); spe_fs(f, factor[i], tmp, src[i]); } } spe_release_register(f, tmp); break; case PIPE_BLENDFACTOR_INV_SRC_ALPHA: tmp = spe_allocate_available_register(f); factor[0] = spe_allocate_available_register(f); factor[1] = factor[0]; factor[2] = factor[0]; spe_il(f, tmp, 1); spe_cuflt(f, tmp, tmp, 0); spe_fs(f, factor[0], tmp, src[3]); spe_release_register(f, tmp); break; case PIPE_BLENDFACTOR_INV_DST_ALPHA: tmp = spe_allocate_available_register(f); factor[0] = spe_allocate_available_register(f); factor[1] = factor[0]; factor[2] = factor[0]; spe_il(f, tmp, 1); spe_cuflt(f, tmp, tmp, 0); spe_fs(f, factor[0], tmp, dst[3]); spe_release_register(f, tmp); break; case PIPE_BLENDFACTOR_INV_DST_COLOR: tmp = spe_allocate_available_register(f); spe_il(f, tmp, 1); spe_cuflt(f, tmp, tmp, 0); for (i = 0; i < 3; ++i) { if ((mask & (1U << i)) != 0) { factor[i] = spe_allocate_available_register(f); spe_fs(f, factor[i], tmp, dst[i]); } } spe_release_register(f, tmp); break; case PIPE_BLENDFACTOR_SRC1_COLOR: case PIPE_BLENDFACTOR_SRC1_ALPHA: case PIPE_BLENDFACTOR_INV_SRC1_COLOR: case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: default: assert(0); } } static void emit_blend_calculation(struct spe_function *f, unsigned func, unsigned sF, unsigned dF, int src, int src_factor, int dst, int dst_factor) { int tmp = spe_allocate_available_register(f); switch (func) { case PIPE_BLEND_ADD: if (sF == PIPE_BLENDFACTOR_ONE) { if (dF == PIPE_BLENDFACTOR_ZERO) { /* Do nothing. */ } else if (dF == PIPE_BLENDFACTOR_ONE) { spe_fa(f, src, src, dst); } } else if (sF == PIPE_BLENDFACTOR_ZERO) { if (dF == PIPE_BLENDFACTOR_ZERO) { spe_il(f, src, 0); } else if (dF == PIPE_BLENDFACTOR_ONE) { spe_or(f, src, dst, dst); } else { spe_fm(f, src, dst, dst_factor); } } else if (dF == PIPE_BLENDFACTOR_ZERO) { spe_fm(f, src, src, src_factor); } else { spe_fm(f, tmp, dst, dst_factor); spe_fma(f, src, src, src_factor, tmp); } break; case PIPE_BLEND_SUBTRACT: if (sF == PIPE_BLENDFACTOR_ONE) { if (dF == PIPE_BLENDFACTOR_ZERO) { /* Do nothing. */ } else if (dF == PIPE_BLENDFACTOR_ONE) { spe_fs(f, src, src, dst); } } else if (sF == PIPE_BLENDFACTOR_ZERO) { if (dF == PIPE_BLENDFACTOR_ZERO) { spe_il(f, src, 0); } else if (dF == PIPE_BLENDFACTOR_ONE) { spe_il(f, tmp, 0); spe_fs(f, src, tmp, dst); } else { spe_fm(f, src, dst, dst_factor); } } else if (dF == PIPE_BLENDFACTOR_ZERO) { spe_fm(f, src, src, src_factor); } else { spe_fm(f, tmp, dst, dst_factor); spe_fms(f, src, src, src_factor, tmp); } break; case PIPE_BLEND_REVERSE_SUBTRACT: if (sF == PIPE_BLENDFACTOR_ONE) { if (dF == PIPE_BLENDFACTOR_ZERO) { spe_il(f, tmp, 0); spe_fs(f, src, tmp, src); } else if (dF == PIPE_BLENDFACTOR_ONE) { spe_fs(f, src, dst, src); } } else if (sF == PIPE_BLENDFACTOR_ZERO) { if (dF == PIPE_BLENDFACTOR_ZERO) { spe_il(f, src, 0); } else if (dF == PIPE_BLENDFACTOR_ONE) { spe_or(f, src, dst, dst); } else { spe_fm(f, src, dst, dst_factor); } } else if (dF == PIPE_BLENDFACTOR_ZERO) { spe_fm(f, src, src, src_factor); } else { spe_fm(f, tmp, src, src_factor); spe_fms(f, src, src, dst_factor, tmp); } break; case PIPE_BLEND_MIN: spe_cgt(f, tmp, src, dst); spe_selb(f, src, src, dst, tmp); break; case PIPE_BLEND_MAX: spe_cgt(f, tmp, src, dst); spe_selb(f, src, dst, src, tmp); break; default: assert(0); } spe_release_register(f, tmp); } /** * Generate code to perform alpha blending on the SPE */ void cell_generate_alpha_blend(struct cell_blend_state *cb) { struct pipe_blend_state *const b = &cb->base; struct spe_function *const f = &cb->code; /* This code generates a maximum of 3 (source alpha factor) * + 3 (destination alpha factor) + (3 * 6) (source color factor) * + (3 * 6) (destination color factor) + (4 * 2) (blend equation) * + 4 (fragment mask) + 1 (return) = 55 instlructions. Round up to 64 to * make it a happy power-of-two. */ spe_init_func(f, SPE_INST_SIZE * 64); const int frag[4] = { spe_allocate_register(f, 3), spe_allocate_register(f, 4), spe_allocate_register(f, 5), spe_allocate_register(f, 6), }; const int pixel[4] = { spe_allocate_register(f, 7), spe_allocate_register(f, 8), spe_allocate_register(f, 9), spe_allocate_register(f, 10), }; const int const_color[4] = { spe_allocate_register(f, 11), spe_allocate_register(f, 12), spe_allocate_register(f, 13), spe_allocate_register(f, 14), }; unsigned func[4]; unsigned sF[4]; unsigned dF[4]; unsigned i; int src_factor[4]; int dst_factor[4]; /* Does the selected blend mode make use of the source / destination * color (RGB) blend factors? */ boolean need_color_factor = b->rt[0].blend_enable && (b->rt[0].rgb_func != PIPE_BLEND_MIN) && (b->rt[0].rgb_func != PIPE_BLEND_MAX); /* Does the selected blend mode make use of the source / destination * alpha blend factors? */ boolean need_alpha_factor = b->rt[0].blend_enable && (b->rt[0].alpha_func != PIPE_BLEND_MIN) && (b->rt[0].alpha_func != PIPE_BLEND_MAX); if (b->rt[0].blend_enable) { sF[0] = b->rt[0].rgb_src_factor; sF[1] = sF[0]; sF[2] = sF[0]; switch (b->rt[0].alpha_src_factor & 0x0f) { case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: sF[3] = PIPE_BLENDFACTOR_ONE; break; case PIPE_BLENDFACTOR_SRC_COLOR: case PIPE_BLENDFACTOR_DST_COLOR: case PIPE_BLENDFACTOR_CONST_COLOR: case PIPE_BLENDFACTOR_SRC1_COLOR: sF[3] = b->rt[0].alpha_src_factor + 1; break; default: sF[3] = b->rt[0].alpha_src_factor; } dF[0] = b->rt[0].rgb_dst_factor; dF[1] = dF[0]; dF[2] = dF[0]; switch (b->rt[0].alpha_dst_factor & 0x0f) { case PIPE_BLENDFACTOR_SRC_COLOR: case PIPE_BLENDFACTOR_DST_COLOR: case PIPE_BLENDFACTOR_CONST_COLOR: case PIPE_BLENDFACTOR_SRC1_COLOR: dF[3] = b->rt[0].alpha_dst_factor + 1; break; default: dF[3] = b->rt[0].alpha_dst_factor; } func[0] = b->rt[0].rgb_func; func[1] = func[0]; func[2] = func[0]; func[3] = b->rt[0].alpha_func; } else { sF[0] = PIPE_BLENDFACTOR_ONE; sF[1] = PIPE_BLENDFACTOR_ONE; sF[2] = PIPE_BLENDFACTOR_ONE; sF[3] = PIPE_BLENDFACTOR_ONE; dF[0] = PIPE_BLENDFACTOR_ZERO; dF[1] = PIPE_BLENDFACTOR_ZERO; dF[2] = PIPE_BLENDFACTOR_ZERO; dF[3] = PIPE_BLENDFACTOR_ZERO; func[0] = PIPE_BLEND_ADD; func[1] = PIPE_BLEND_ADD; func[2] = PIPE_BLEND_ADD; func[3] = PIPE_BLEND_ADD; } /* If alpha writing is enabled and the alpha blend mode requires use of * the alpha factor, calculate the alpha factor. */ if (((b->rt[0].colormask & 8) != 0) && need_alpha_factor) { src_factor[3] = emit_alpha_factor_calculation(f, sF[3], const_color[3], frag[3], pixel[3]); /* If the alpha destination blend factor is the same as the alpha source * blend factor, re-use the previously calculated value. */ dst_factor[3] = (dF[3] == sF[3]) ? src_factor[3] : emit_alpha_factor_calculation(f, dF[3], const_color[3], frag[3], pixel[3]); } if (sF[0] == sF[3]) { src_factor[0] = src_factor[3]; src_factor[1] = src_factor[3]; src_factor[2] = src_factor[3]; } else if (sF[0] == dF[3]) { src_factor[0] = dst_factor[3]; src_factor[1] = dst_factor[3]; src_factor[2] = dst_factor[3]; } else if (need_color_factor) { emit_color_factor_calculation(f, b->rt[0].rgb_src_factor, b->rt[0].colormask, frag, pixel, const_color, src_factor); } if (dF[0] == sF[3]) { dst_factor[0] = src_factor[3]; dst_factor[1] = src_factor[3]; dst_factor[2] = src_factor[3]; } else if (dF[0] == dF[3]) { dst_factor[0] = dst_factor[3]; dst_factor[1] = dst_factor[3]; dst_factor[2] = dst_factor[3]; } else if (dF[0] == sF[0]) { dst_factor[0] = src_factor[0]; dst_factor[1] = src_factor[1]; dst_factor[2] = src_factor[2]; } else if (need_color_factor) { emit_color_factor_calculation(f, b->rt[0].rgb_dst_factor, b->rt[0].colormask, frag, pixel, const_color, dst_factor); } for (i = 0; i < 4; ++i) { if ((b->rt[0].colormask & (1U << i)) != 0) { emit_blend_calculation(f, func[i], sF[i], dF[i], frag[i], src_factor[i], pixel[i], dst_factor[i]); } } spe_bi(f, 0, 0, 0); #if 0 { const uint32_t *p = f->store; printf("# %u instructions\n", f->csr - f->store); printf("# blend (%sabled)\n", (cb->base.blend_enable) ? "en" : "dis"); printf("# RGB func / sf / df: %u %u %u\n", cb->base.rgb_func, cb->base.rgb_src_factor, cb->base.rgb_dst_factor); printf("# ALP func / sf / df: %u %u %u\n", cb->base.alpha_func, cb->base.alpha_src_factor, cb->base.alpha_dst_factor); printf("\t.text\n"); for (/* empty */; p < f->csr; p++) { printf("\t.long\t0x%04x\n", *p); } fflush(stdout); } #endif } static int PC_OFFSET(const struct spe_function *f, const void *d) { const intptr_t pc = (intptr_t) &f->store[f->num_inst]; const intptr_t ea = ~0x0f & (intptr_t) d; return (ea - pc) >> 2; } /** * Generate code to perform color conversion and logic op * * \bug * The code generated by this function should also perform dithering. * * \bug * The code generated by this function should also perform color-write * masking. * * \bug * Only two framebuffer formats are supported at this time. */ void cell_generate_logic_op(struct spe_function *f, const struct pipe_blend_state *blend, struct pipe_surface *surf) { const unsigned logic_op = (blend->logicop_enable) ? blend->logicop_func : PIPE_LOGICOP_COPY; /* This code generates a maximum of 37 instructions. An additional 32 * bytes (equiv. to 8 instructions) are needed for data storage. Round up * to 64 to make it a happy power-of-two. */ spe_init_func(f, SPE_INST_SIZE * 64); /* Pixel colors in framebuffer format in AoS layout. */ const int pixel[4] = { spe_allocate_register(f, 3), spe_allocate_register(f, 4), spe_allocate_register(f, 5), spe_allocate_register(f, 6), }; /* Fragment colors stored as floats in SoA layout. */ const int frag[4] = { spe_allocate_register(f, 7), spe_allocate_register(f, 8), spe_allocate_register(f, 9), spe_allocate_register(f, 10), }; const int mask = spe_allocate_register(f, 11); /* Short-circuit the noop and invert cases. */ if ((logic_op == PIPE_LOGICOP_NOOP) || (blend->rt[0].colormask == 0)) { spe_bi(f, 0, 0, 0); return; } else if (logic_op == PIPE_LOGICOP_INVERT) { spe_nor(f, pixel[0], pixel[0], pixel[0]); spe_nor(f, pixel[1], pixel[1], pixel[1]); spe_nor(f, pixel[2], pixel[2], pixel[2]); spe_nor(f, pixel[3], pixel[3], pixel[3]); spe_bi(f, 0, 0, 0); return; } const int tmp[4] = { spe_allocate_available_register(f), spe_allocate_available_register(f), spe_allocate_available_register(f), spe_allocate_available_register(f), }; const int shuf_xpose_hi = spe_allocate_available_register(f); const int shuf_xpose_lo = spe_allocate_available_register(f); const int shuf_color = spe_allocate_available_register(f); /* Pointer to the begining of the function's private data area. */ uint32_t *const data = ((uint32_t *) f->store) + (64 - 8); /* Convert fragment colors to framebuffer format in AoS layout. */ switch (surf->format) { case PIPE_FORMAT_B8G8R8A8_UNORM: data[0] = 0x00010203; data[1] = 0x10111213; data[2] = 0x04050607; data[3] = 0x14151617; data[4] = 0x0c000408; data[5] = 0x80808080; data[6] = 0x80808080; data[7] = 0x80808080; break; case PIPE_FORMAT_A8R8G8B8_UNORM: data[0] = 0x03020100; data[1] = 0x13121110; data[2] = 0x07060504; data[3] = 0x17161514; data[4] = 0x0804000c; data[5] = 0x80808080; data[6] = 0x80808080; data[7] = 0x80808080; break; default: fprintf(stderr, "CELL: Bad pixel format in cell_generate_logic_op()"); ASSERT(0); } spe_ilh(f, tmp[0], 0x0808); spe_lqr(f, shuf_xpose_hi, PC_OFFSET(f, data+0)); spe_lqr(f, shuf_color, PC_OFFSET(f, data+4)); spe_a(f, shuf_xpose_lo, shuf_xpose_hi, tmp[0]); spe_shufb(f, tmp[0], frag[0], frag[2], shuf_xpose_hi); spe_shufb(f, tmp[1], frag[0], frag[2], shuf_xpose_lo); spe_shufb(f, tmp[2], frag[1], frag[3], shuf_xpose_hi); spe_shufb(f, tmp[3], frag[1], frag[3], shuf_xpose_lo); spe_shufb(f, frag[0], tmp[0], tmp[2], shuf_xpose_hi); spe_shufb(f, frag[1], tmp[0], tmp[2], shuf_xpose_lo); spe_shufb(f, frag[2], tmp[1], tmp[3], shuf_xpose_hi); spe_shufb(f, frag[3], tmp[1], tmp[3], shuf_xpose_lo); spe_cfltu(f, frag[0], frag[0], 32); spe_cfltu(f, frag[1], frag[1], 32); spe_cfltu(f, frag[2], frag[2], 32); spe_cfltu(f, frag[3], frag[3], 32); spe_shufb(f, frag[0], frag[0], pixel[0], shuf_color); spe_shufb(f, frag[1], frag[1], pixel[1], shuf_color); spe_shufb(f, frag[2], frag[2], pixel[2], shuf_color); spe_shufb(f, frag[3], frag[3], pixel[3], shuf_color); /* If logic op is enabled, perform the requested logical operation on the * converted fragment colors and the pixel colors. */ switch (logic_op) { case PIPE_LOGICOP_CLEAR: spe_il(f, frag[0], 0); spe_il(f, frag[1], 0); spe_il(f, frag[2], 0); spe_il(f, frag[3], 0); break; case PIPE_LOGICOP_NOR: spe_nor(f, frag[0], frag[0], pixel[0]); spe_nor(f, frag[1], frag[1], pixel[1]); spe_nor(f, frag[2], frag[2], pixel[2]); spe_nor(f, frag[3], frag[3], pixel[3]); break; case PIPE_LOGICOP_AND_INVERTED: spe_andc(f, frag[0], pixel[0], frag[0]); spe_andc(f, frag[1], pixel[1], frag[1]); spe_andc(f, frag[2], pixel[2], frag[2]); spe_andc(f, frag[3], pixel[3], frag[3]); break; case PIPE_LOGICOP_COPY_INVERTED: spe_nor(f, frag[0], frag[0], frag[0]); spe_nor(f, frag[1], frag[1], frag[1]); spe_nor(f, frag[2], frag[2], frag[2]); spe_nor(f, frag[3], frag[3], frag[3]); break; case PIPE_LOGICOP_AND_REVERSE: spe_andc(f, frag[0], frag[0], pixel[0]); spe_andc(f, frag[1], frag[1], pixel[1]); spe_andc(f, frag[2], frag[2], pixel[2]); spe_andc(f, frag[3], frag[3], pixel[3]); break; case PIPE_LOGICOP_XOR: spe_xor(f, frag[0], frag[0], pixel[0]); spe_xor(f, frag[1], frag[1], pixel[1]); spe_xor(f, frag[2], frag[2], pixel[2]); spe_xor(f, frag[3], frag[3], pixel[3]); break; case PIPE_LOGICOP_NAND: spe_nand(f, frag[0], frag[0], pixel[0]); spe_nand(f, frag[1], frag[1], pixel[1]); spe_nand(f, frag[2], frag[2], pixel[2]); spe_nand(f, frag[3], frag[3], pixel[3]); break; case PIPE_LOGICOP_AND: spe_and(f, frag[0], frag[0], pixel[0]); spe_and(f, frag[1], frag[1], pixel[1]); spe_and(f, frag[2], frag[2], pixel[2]); spe_and(f, frag[3], frag[3], pixel[3]); break; case PIPE_LOGICOP_EQUIV: spe_eqv(f, frag[0], frag[0], pixel[0]); spe_eqv(f, frag[1], frag[1], pixel[1]); spe_eqv(f, frag[2], frag[2], pixel[2]); spe_eqv(f, frag[3], frag[3], pixel[3]); break; case PIPE_LOGICOP_OR_INVERTED: spe_orc(f, frag[0], pixel[0], frag[0]); spe_orc(f, frag[1], pixel[1], frag[1]); spe_orc(f, frag[2], pixel[2], frag[2]); spe_orc(f, frag[3], pixel[3], frag[3]); break; case PIPE_LOGICOP_COPY: break; case PIPE_LOGICOP_OR_REVERSE: spe_orc(f, frag[0], frag[0], pixel[0]); spe_orc(f, frag[1], frag[1], pixel[1]); spe_orc(f, frag[2], frag[2], pixel[2]); spe_orc(f, frag[3], frag[3], pixel[3]); break; case PIPE_LOGICOP_OR: spe_or(f, frag[0], frag[0], pixel[0]); spe_or(f, frag[1], frag[1], pixel[1]); spe_or(f, frag[2], frag[2], pixel[2]); spe_or(f, frag[3], frag[3], pixel[3]); break; case PIPE_LOGICOP_SET: spe_il(f, frag[0], ~0); spe_il(f, frag[1], ~0); spe_il(f, frag[2], ~0); spe_il(f, frag[3], ~0); break; /* These two cases are short-circuited above. */ case PIPE_LOGICOP_INVERT: case PIPE_LOGICOP_NOOP: default: assert(0); } /* Apply fragment mask. */ spe_ilh(f, tmp[0], 0x0000); spe_ilh(f, tmp[1], 0x0404); spe_ilh(f, tmp[2], 0x0808); spe_ilh(f, tmp[3], 0x0c0c); spe_shufb(f, tmp[0], mask, mask, tmp[0]); spe_shufb(f, tmp[1], mask, mask, tmp[1]); spe_shufb(f, tmp[2], mask, mask, tmp[2]); spe_shufb(f, tmp[3], mask, mask, tmp[3]); spe_selb(f, pixel[0], pixel[0], frag[0], tmp[0]); spe_selb(f, pixel[1], pixel[1], frag[1], tmp[1]); spe_selb(f, pixel[2], pixel[2], frag[2], tmp[2]); spe_selb(f, pixel[3], pixel[3], frag[3], tmp[3]); spe_bi(f, 0, 0, 0); #if 0 { const uint32_t *p = f->store; unsigned i; printf("# %u instructions\n", f->csr - f->store); printf("\t.text\n"); for (i = 0; i < 64; i++) { printf("\t.long\t0x%04x\n", p[i]); } fflush(stdout); } #endif }