/************************************************************************** * * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the * "Software"), to deal in the Software without restriction, including * without limitation the rights to use, copy, modify, merge, publish, * distribute, sub license, and/or sell copies of the Software, and to * permit persons to whom the Software is furnished to do so, subject to * the following conditions: * * The above copyright notice and this permission notice (including the * next paragraph) shall be included in all copies or substantial portions * of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. * **************************************************************************/ /** * TGSI interpretor/executor. * * Flow control information: * * Since we operate on 'quads' (4 pixels or 4 vertices in parallel) * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special * care since a condition may be true for some quad components but false * for other components. * * We basically execute all statements (even if they're in the part of * an IF/ELSE clause that's "not taken") and use a special mask to * control writing to destination registers. This is the ExecMask. * See store_dest(). * * The ExecMask is computed from three other masks (CondMask, LoopMask and * ContMask) which are controlled by the flow control instructions (namely: * (IF/ELSE/ENDIF, LOOP/ENDLOOP and CONT). * * * Authors: * Michal Krol * Brian Paul */ #include <transpose_matrix4x4.h> #include <simdmath/ceilf4.h> #include <simdmath/cosf4.h> #include <simdmath/divf4.h> #include <simdmath/floorf4.h> #include <simdmath/log2f4.h> #include <simdmath/powf4.h> #include <simdmath/sinf4.h> #include <simdmath/sqrtf4.h> #include <simdmath/truncf4.h> #include "pipe/p_compiler.h" #include "pipe/p_state.h" #include "pipe/p_shader_tokens.h" #include "tgsi/tgsi_parse.h" #include "tgsi/tgsi_util.h" #include "spu_exec.h" #include "spu_main.h" #include "spu_vertex_shader.h" #include "spu_dcache.h" #include "cell/common.h" #define TILE_TOP_LEFT 0 #define TILE_TOP_RIGHT 1 #define TILE_BOTTOM_LEFT 2 #define TILE_BOTTOM_RIGHT 3 /* * Shorthand locations of various utility registers (_I = Index, _C = Channel) */ #define TEMP_0_I TGSI_EXEC_TEMP_00000000_I #define TEMP_0_C TGSI_EXEC_TEMP_00000000_C #define TEMP_7F_I TGSI_EXEC_TEMP_7FFFFFFF_I #define TEMP_7F_C TGSI_EXEC_TEMP_7FFFFFFF_C #define TEMP_80_I TGSI_EXEC_TEMP_80000000_I #define TEMP_80_C TGSI_EXEC_TEMP_80000000_C #define TEMP_FF_I TGSI_EXEC_TEMP_FFFFFFFF_I #define TEMP_FF_C TGSI_EXEC_TEMP_FFFFFFFF_C #define TEMP_1_I TGSI_EXEC_TEMP_ONE_I #define TEMP_1_C TGSI_EXEC_TEMP_ONE_C #define TEMP_2_I TGSI_EXEC_TEMP_TWO_I #define TEMP_2_C TGSI_EXEC_TEMP_TWO_C #define TEMP_128_I TGSI_EXEC_TEMP_128_I #define TEMP_128_C TGSI_EXEC_TEMP_128_C #define TEMP_M128_I TGSI_EXEC_TEMP_MINUS_128_I #define TEMP_M128_C TGSI_EXEC_TEMP_MINUS_128_C #define TEMP_KILMASK_I TGSI_EXEC_TEMP_KILMASK_I #define TEMP_KILMASK_C TGSI_EXEC_TEMP_KILMASK_C #define TEMP_OUTPUT_I TGSI_EXEC_TEMP_OUTPUT_I #define TEMP_OUTPUT_C TGSI_EXEC_TEMP_OUTPUT_C #define TEMP_PRIMITIVE_I TGSI_EXEC_TEMP_PRIMITIVE_I #define TEMP_PRIMITIVE_C TGSI_EXEC_TEMP_PRIMITIVE_C #define TEMP_R0 TGSI_EXEC_TEMP_R0 #define FOR_EACH_CHANNEL(CHAN)\ for (CHAN = 0; CHAN < 4; CHAN++) #define IS_CHANNEL_ENABLED(INST, CHAN)\ ((INST).Dst[0].Register.WriteMask & (1 << (CHAN))) #define IS_CHANNEL_ENABLED2(INST, CHAN)\ ((INST).Dst[1].Register.WriteMask & (1 << (CHAN))) #define FOR_EACH_ENABLED_CHANNEL(INST, CHAN)\ FOR_EACH_CHANNEL( CHAN )\ if (IS_CHANNEL_ENABLED( INST, CHAN )) #define FOR_EACH_ENABLED_CHANNEL2(INST, CHAN)\ FOR_EACH_CHANNEL( CHAN )\ if (IS_CHANNEL_ENABLED2( INST, CHAN )) /** The execution mask depends on the conditional mask and the loop mask */ #define UPDATE_EXEC_MASK(MACH) \ MACH->ExecMask = MACH->CondMask & MACH->LoopMask & MACH->ContMask & MACH->FuncMask #define CHAN_X 0 #define CHAN_Y 1 #define CHAN_Z 2 #define CHAN_W 3 /** * Initialize machine state by expanding tokens to full instructions, * allocating temporary storage, setting up constants, etc. * After this, we can call spu_exec_machine_run() many times. */ void spu_exec_machine_init(struct spu_exec_machine *mach, uint numSamplers, struct spu_sampler *samplers, unsigned processor) { const qword zero = si_il(0); const qword not_zero = si_il(~0); (void) numSamplers; mach->Samplers = samplers; mach->Processor = processor; mach->Addrs = &mach->Temps[TGSI_EXEC_NUM_TEMPS]; /* Setup constants. */ mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q = zero; mach->Temps[TEMP_FF_I].xyzw[TEMP_FF_C].q = not_zero; mach->Temps[TEMP_7F_I].xyzw[TEMP_7F_C].q = si_shli(not_zero, -1); mach->Temps[TEMP_80_I].xyzw[TEMP_80_C].q = si_shli(not_zero, 31); mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q = (qword) spu_splats(1.0f); mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].q = (qword) spu_splats(2.0f); mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].q = (qword) spu_splats(128.0f); mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].q = (qword) spu_splats(-128.0f); } static INLINE qword micro_abs(qword src) { return si_rotmi(si_shli(src, 1), -1); } static INLINE qword micro_ceil(qword src) { return (qword) _ceilf4((vec_float4) src); } static INLINE qword micro_cos(qword src) { return (qword) _cosf4((vec_float4) src); } static const qword br_shuf = { TILE_BOTTOM_RIGHT + 0, TILE_BOTTOM_RIGHT + 1, TILE_BOTTOM_RIGHT + 2, TILE_BOTTOM_RIGHT + 3, TILE_BOTTOM_RIGHT + 0, TILE_BOTTOM_RIGHT + 1, TILE_BOTTOM_RIGHT + 2, TILE_BOTTOM_RIGHT + 3, TILE_BOTTOM_RIGHT + 0, TILE_BOTTOM_RIGHT + 1, TILE_BOTTOM_RIGHT + 2, TILE_BOTTOM_RIGHT + 3, TILE_BOTTOM_RIGHT + 0, TILE_BOTTOM_RIGHT + 1, TILE_BOTTOM_RIGHT + 2, TILE_BOTTOM_RIGHT + 3, }; static const qword bl_shuf = { TILE_BOTTOM_LEFT + 0, TILE_BOTTOM_LEFT + 1, TILE_BOTTOM_LEFT + 2, TILE_BOTTOM_LEFT + 3, TILE_BOTTOM_LEFT + 0, TILE_BOTTOM_LEFT + 1, TILE_BOTTOM_LEFT + 2, TILE_BOTTOM_LEFT + 3, TILE_BOTTOM_LEFT + 0, TILE_BOTTOM_LEFT + 1, TILE_BOTTOM_LEFT + 2, TILE_BOTTOM_LEFT + 3, TILE_BOTTOM_LEFT + 0, TILE_BOTTOM_LEFT + 1, TILE_BOTTOM_LEFT + 2, TILE_BOTTOM_LEFT + 3, }; static const qword tl_shuf = { TILE_TOP_LEFT + 0, TILE_TOP_LEFT + 1, TILE_TOP_LEFT + 2, TILE_TOP_LEFT + 3, TILE_TOP_LEFT + 0, TILE_TOP_LEFT + 1, TILE_TOP_LEFT + 2, TILE_TOP_LEFT + 3, TILE_TOP_LEFT + 0, TILE_TOP_LEFT + 1, TILE_TOP_LEFT + 2, TILE_TOP_LEFT + 3, TILE_TOP_LEFT + 0, TILE_TOP_LEFT + 1, TILE_TOP_LEFT + 2, TILE_TOP_LEFT + 3, }; static qword micro_ddx(qword src) { qword bottom_right = si_shufb(src, src, br_shuf); qword bottom_left = si_shufb(src, src, bl_shuf); return si_fs(bottom_right, bottom_left); } static qword micro_ddy(qword src) { qword top_left = si_shufb(src, src, tl_shuf); qword bottom_left = si_shufb(src, src, bl_shuf); return si_fs(top_left, bottom_left); } static INLINE qword micro_div(qword src0, qword src1) { return (qword) _divf4((vec_float4) src0, (vec_float4) src1); } static qword micro_flr(qword src) { return (qword) _floorf4((vec_float4) src); } static qword micro_frc(qword src) { return si_fs(src, (qword) _floorf4((vec_float4) src)); } static INLINE qword micro_ge(qword src0, qword src1) { return si_or(si_fceq(src0, src1), si_fcgt(src0, src1)); } static qword micro_lg2(qword src) { return (qword) _log2f4((vec_float4) src); } static INLINE qword micro_lt(qword src0, qword src1) { const qword tmp = si_or(si_fceq(src0, src1), si_fcgt(src0, src1)); return si_xori(tmp, 0xff); } static INLINE qword micro_max(qword src0, qword src1) { return si_selb(src1, src0, si_fcgt(src0, src1)); } static INLINE qword micro_min(qword src0, qword src1) { return si_selb(src0, src1, si_fcgt(src0, src1)); } static qword micro_neg(qword src) { return si_xor(src, (qword) spu_splats(0x80000000)); } static qword micro_set_sign(qword src) { return si_or(src, (qword) spu_splats(0x80000000)); } static qword micro_pow(qword src0, qword src1) { return (qword) _powf4((vec_float4) src0, (vec_float4) src1); } static qword micro_rnd(qword src) { const qword half = (qword) spu_splats(0.5f); /* May be able to use _roundf4. There may be some difference, though. */ return (qword) _floorf4((vec_float4) si_fa(src, half)); } static INLINE qword micro_ishr(qword src0, qword src1) { return si_rotma(src0, si_sfi(src1, 0)); } static qword micro_trunc(qword src) { return (qword) _truncf4((vec_float4) src); } static qword micro_sin(qword src) { return (qword) _sinf4((vec_float4) src); } static INLINE qword micro_sqrt(qword src) { return (qword) _sqrtf4((vec_float4) src); } static void fetch_src_file_channel( const struct spu_exec_machine *mach, const uint file, const uint swizzle, const union spu_exec_channel *index, union spu_exec_channel *chan ) { switch( swizzle ) { case TGSI_SWIZZLE_X: case TGSI_SWIZZLE_Y: case TGSI_SWIZZLE_Z: case TGSI_SWIZZLE_W: switch( file ) { case TGSI_FILE_CONSTANT: { unsigned i; for (i = 0; i < 4; i++) { const float *ptr = mach->Consts[index->i[i]]; float tmp[4]; spu_dcache_fetch_unaligned((qword *) tmp, (uintptr_t)(ptr + swizzle), sizeof(float)); chan->f[i] = tmp[0]; } break; } case TGSI_FILE_INPUT: chan->u[0] = mach->Inputs[index->i[0]].xyzw[swizzle].u[0]; chan->u[1] = mach->Inputs[index->i[1]].xyzw[swizzle].u[1]; chan->u[2] = mach->Inputs[index->i[2]].xyzw[swizzle].u[2]; chan->u[3] = mach->Inputs[index->i[3]].xyzw[swizzle].u[3]; break; case TGSI_FILE_TEMPORARY: chan->u[0] = mach->Temps[index->i[0]].xyzw[swizzle].u[0]; chan->u[1] = mach->Temps[index->i[1]].xyzw[swizzle].u[1]; chan->u[2] = mach->Temps[index->i[2]].xyzw[swizzle].u[2]; chan->u[3] = mach->Temps[index->i[3]].xyzw[swizzle].u[3]; break; case TGSI_FILE_IMMEDIATE: ASSERT( index->i[0] < (int) mach->ImmLimit ); ASSERT( index->i[1] < (int) mach->ImmLimit ); ASSERT( index->i[2] < (int) mach->ImmLimit ); ASSERT( index->i[3] < (int) mach->ImmLimit ); chan->f[0] = mach->Imms[index->i[0]][swizzle]; chan->f[1] = mach->Imms[index->i[1]][swizzle]; chan->f[2] = mach->Imms[index->i[2]][swizzle]; chan->f[3] = mach->Imms[index->i[3]][swizzle]; break; case TGSI_FILE_ADDRESS: chan->u[0] = mach->Addrs[index->i[0]].xyzw[swizzle].u[0]; chan->u[1] = mach->Addrs[index->i[1]].xyzw[swizzle].u[1]; chan->u[2] = mach->Addrs[index->i[2]].xyzw[swizzle].u[2]; chan->u[3] = mach->Addrs[index->i[3]].xyzw[swizzle].u[3]; break; case TGSI_FILE_OUTPUT: /* vertex/fragment output vars can be read too */ chan->u[0] = mach->Outputs[index->i[0]].xyzw[swizzle].u[0]; chan->u[1] = mach->Outputs[index->i[1]].xyzw[swizzle].u[1]; chan->u[2] = mach->Outputs[index->i[2]].xyzw[swizzle].u[2]; chan->u[3] = mach->Outputs[index->i[3]].xyzw[swizzle].u[3]; break; default: ASSERT( 0 ); } break; default: ASSERT( 0 ); } } static void fetch_source( const struct spu_exec_machine *mach, union spu_exec_channel *chan, const struct tgsi_full_src_register *reg, const uint chan_index ) { union spu_exec_channel index; uint swizzle; index.i[0] = index.i[1] = index.i[2] = index.i[3] = reg->Register.Index; if (reg->Register.Indirect) { union spu_exec_channel index2; union spu_exec_channel indir_index; index2.i[0] = index2.i[1] = index2.i[2] = index2.i[3] = reg->Indirect.Index; swizzle = tgsi_util_get_src_register_swizzle(®->Indirect, CHAN_X); fetch_src_file_channel( mach, reg->Indirect.File, swizzle, &index2, &indir_index ); index.q = si_a(index.q, indir_index.q); } if( reg->Register.Dimension ) { switch( reg->Register.File ) { case TGSI_FILE_INPUT: index.q = si_mpyi(index.q, 17); break; case TGSI_FILE_CONSTANT: index.q = si_shli(index.q, 12); break; default: ASSERT( 0 ); } index.i[0] += reg->Dimension.Index; index.i[1] += reg->Dimension.Index; index.i[2] += reg->Dimension.Index; index.i[3] += reg->Dimension.Index; if (reg->Dimension.Indirect) { union spu_exec_channel index2; union spu_exec_channel indir_index; index2.i[0] = index2.i[1] = index2.i[2] = index2.i[3] = reg->DimIndirect.Index; swizzle = tgsi_util_get_src_register_swizzle( ®->DimIndirect, CHAN_X ); fetch_src_file_channel( mach, reg->DimIndirect.File, swizzle, &index2, &indir_index ); index.q = si_a(index.q, indir_index.q); } } swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index ); fetch_src_file_channel( mach, reg->Register.File, swizzle, &index, chan ); switch (tgsi_util_get_full_src_register_sign_mode( reg, chan_index )) { case TGSI_UTIL_SIGN_CLEAR: chan->q = micro_abs(chan->q); break; case TGSI_UTIL_SIGN_SET: chan->q = micro_set_sign(chan->q); break; case TGSI_UTIL_SIGN_TOGGLE: chan->q = micro_neg(chan->q); break; case TGSI_UTIL_SIGN_KEEP: break; } if (reg->RegisterExtMod.Complement) { chan->q = si_fs(mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q, chan->q); } } static void store_dest( struct spu_exec_machine *mach, const union spu_exec_channel *chan, const struct tgsi_full_dst_register *reg, const struct tgsi_full_instruction *inst, uint chan_index ) { union spu_exec_channel *dst; switch( reg->Register.File ) { case TGSI_FILE_NULL: return; case TGSI_FILE_OUTPUT: dst = &mach->Outputs[mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] + reg->Register.Index].xyzw[chan_index]; break; case TGSI_FILE_TEMPORARY: dst = &mach->Temps[reg->Register.Index].xyzw[chan_index]; break; case TGSI_FILE_ADDRESS: dst = &mach->Addrs[reg->Register.Index].xyzw[chan_index]; break; default: ASSERT( 0 ); return; } switch (inst->Instruction.Saturate) { case TGSI_SAT_NONE: if (mach->ExecMask & 0x1) dst->i[0] = chan->i[0]; if (mach->ExecMask & 0x2) dst->i[1] = chan->i[1]; if (mach->ExecMask & 0x4) dst->i[2] = chan->i[2]; if (mach->ExecMask & 0x8) dst->i[3] = chan->i[3]; break; case TGSI_SAT_ZERO_ONE: /* XXX need to obey ExecMask here */ dst->q = micro_max(chan->q, mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q); dst->q = micro_min(dst->q, mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q); break; case TGSI_SAT_MINUS_PLUS_ONE: ASSERT( 0 ); break; default: ASSERT( 0 ); } } #define FETCH(VAL,INDEX,CHAN)\ fetch_source (mach, VAL, &inst->Src[INDEX], CHAN) #define STORE(VAL,INDEX,CHAN)\ store_dest (mach, VAL, &inst->Dst[INDEX], inst, CHAN ) /** * Execute ARB-style KIL which is predicated by a src register. * Kill fragment if any of the four values is less than zero. */ static void exec_kil(struct spu_exec_machine *mach, const struct tgsi_full_instruction *inst) { uint uniquemask; uint chan_index; uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */ union spu_exec_channel r[1]; /* This mask stores component bits that were already tested. */ uniquemask = 0; for (chan_index = 0; chan_index < 4; chan_index++) { uint swizzle; uint i; /* unswizzle channel */ swizzle = tgsi_util_get_full_src_register_swizzle ( &inst->Src[0], chan_index); /* check if the component has not been already tested */ if (uniquemask & (1 << swizzle)) continue; uniquemask |= 1 << swizzle; FETCH(&r[0], 0, chan_index); for (i = 0; i < 4; i++) if (r[0].f[i] < 0.0f) kilmask |= 1 << i; } mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask; } /** * Execute NVIDIA-style KIL which is predicated by a condition code. * Kill fragment if the condition code is TRUE. */ static void exec_kilp(struct tgsi_exec_machine *mach, const struct tgsi_full_instruction *inst) { uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */ /* TODO: build kilmask from CC mask */ mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask; } /* * Fetch a texel using STR texture coordinates. */ static void fetch_texel( struct spu_sampler *sampler, const union spu_exec_channel *s, const union spu_exec_channel *t, const union spu_exec_channel *p, float lodbias, /* XXX should be float[4] */ union spu_exec_channel *r, union spu_exec_channel *g, union spu_exec_channel *b, union spu_exec_channel *a ) { qword rgba[4]; qword out[4]; sampler->get_samples(sampler, s->f, t->f, p->f, lodbias, (float (*)[4]) rgba); _transpose_matrix4x4((vec_float4 *) out, (vec_float4 *) rgba); r->q = out[0]; g->q = out[1]; b->q = out[2]; a->q = out[3]; } static void exec_tex(struct spu_exec_machine *mach, const struct tgsi_full_instruction *inst, boolean biasLod, boolean projected) { const uint unit = inst->Src[1].Register.Index; union spu_exec_channel r[8]; uint chan_index; float lodBias; /* printf("Sampler %u unit %u\n", sampler, unit); */ switch (inst->InstructionExtTexture.Texture) { case TGSI_TEXTURE_1D: FETCH(&r[0], 0, CHAN_X); if (projected) { FETCH(&r[1], 0, CHAN_W); r[0].q = micro_div(r[0].q, r[1].q); } if (biasLod) { FETCH(&r[1], 0, CHAN_W); lodBias = r[2].f[0]; } else lodBias = 0.0; fetch_texel(&mach->Samplers[unit], &r[0], NULL, NULL, lodBias, /* S, T, P, BIAS */ &r[0], &r[1], &r[2], &r[3]); /* R, G, B, A */ break; case TGSI_TEXTURE_2D: case TGSI_TEXTURE_RECT: FETCH(&r[0], 0, CHAN_X); FETCH(&r[1], 0, CHAN_Y); FETCH(&r[2], 0, CHAN_Z); if (projected) { FETCH(&r[3], 0, CHAN_W); r[0].q = micro_div(r[0].q, r[3].q); r[1].q = micro_div(r[1].q, r[3].q); r[2].q = micro_div(r[2].q, r[3].q); } if (biasLod) { FETCH(&r[3], 0, CHAN_W); lodBias = r[3].f[0]; } else lodBias = 0.0; fetch_texel(&mach->Samplers[unit], &r[0], &r[1], &r[2], lodBias, /* inputs */ &r[0], &r[1], &r[2], &r[3]); /* outputs */ break; case TGSI_TEXTURE_3D: case TGSI_TEXTURE_CUBE: FETCH(&r[0], 0, CHAN_X); FETCH(&r[1], 0, CHAN_Y); FETCH(&r[2], 0, CHAN_Z); if (projected) { FETCH(&r[3], 0, CHAN_W); r[0].q = micro_div(r[0].q, r[3].q); r[1].q = micro_div(r[1].q, r[3].q); r[2].q = micro_div(r[2].q, r[3].q); } if (biasLod) { FETCH(&r[3], 0, CHAN_W); lodBias = r[3].f[0]; } else lodBias = 0.0; fetch_texel(&mach->Samplers[unit], &r[0], &r[1], &r[2], lodBias, &r[0], &r[1], &r[2], &r[3]); break; default: ASSERT (0); } FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { STORE( &r[chan_index], 0, chan_index ); } } static void constant_interpolation( struct spu_exec_machine *mach, unsigned attrib, unsigned chan ) { unsigned i; for( i = 0; i < QUAD_SIZE; i++ ) { mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan]; } } static void linear_interpolation( struct spu_exec_machine *mach, unsigned attrib, unsigned chan ) { const float x = mach->QuadPos.xyzw[0].f[0]; const float y = mach->QuadPos.xyzw[1].f[0]; const float dadx = mach->InterpCoefs[attrib].dadx[chan]; const float dady = mach->InterpCoefs[attrib].dady[chan]; const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y; mach->Inputs[attrib].xyzw[chan].f[0] = a0; mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx; mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady; mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady; } static void perspective_interpolation( struct spu_exec_machine *mach, unsigned attrib, unsigned chan ) { const float x = mach->QuadPos.xyzw[0].f[0]; const float y = mach->QuadPos.xyzw[1].f[0]; const float dadx = mach->InterpCoefs[attrib].dadx[chan]; const float dady = mach->InterpCoefs[attrib].dady[chan]; const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y; const float *w = mach->QuadPos.xyzw[3].f; /* divide by W here */ mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0]; mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1]; mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2]; mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3]; } typedef void (* interpolation_func)( struct spu_exec_machine *mach, unsigned attrib, unsigned chan ); static void exec_declaration(struct spu_exec_machine *mach, const struct tgsi_full_declaration *decl) { if( mach->Processor == TGSI_PROCESSOR_FRAGMENT ) { if( decl->Declaration.File == TGSI_FILE_INPUT ) { unsigned first, last, mask; interpolation_func interp; first = decl->Range.First; last = decl->Range.Last; mask = decl->Declaration.UsageMask; switch( decl->Declaration.Interpolate ) { case TGSI_INTERPOLATE_CONSTANT: interp = constant_interpolation; break; case TGSI_INTERPOLATE_LINEAR: interp = linear_interpolation; break; case TGSI_INTERPOLATE_PERSPECTIVE: interp = perspective_interpolation; break; default: ASSERT( 0 ); } if( mask == TGSI_WRITEMASK_XYZW ) { unsigned i, j; for( i = first; i <= last; i++ ) { for( j = 0; j < NUM_CHANNELS; j++ ) { interp( mach, i, j ); } } } else { unsigned i, j; for( j = 0; j < NUM_CHANNELS; j++ ) { if( mask & (1 << j) ) { for( i = first; i <= last; i++ ) { interp( mach, i, j ); } } } } } } } static void exec_instruction( struct spu_exec_machine *mach, const struct tgsi_full_instruction *inst, int *pc ) { uint chan_index; union spu_exec_channel r[8]; (*pc)++; switch (inst->Instruction.Opcode) { case TGSI_OPCODE_ARL: FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { FETCH( &r[0], 0, chan_index ); r[0].q = si_cflts(r[0].q, 0); STORE( &r[0], 0, chan_index ); } break; case TGSI_OPCODE_MOV: FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { FETCH( &r[0], 0, chan_index ); STORE( &r[0], 0, chan_index ); } break; case TGSI_OPCODE_LIT: if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) { STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X ); } if (IS_CHANNEL_ENABLED( *inst, CHAN_Y ) || IS_CHANNEL_ENABLED( *inst, CHAN_Z )) { FETCH( &r[0], 0, CHAN_X ); if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) { r[0].q = micro_max(r[0].q, mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q); STORE( &r[0], 0, CHAN_Y ); } if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) { FETCH( &r[1], 0, CHAN_Y ); r[1].q = micro_max(r[1].q, mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q); FETCH( &r[2], 0, CHAN_W ); r[2].q = micro_min(r[2].q, mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].q); r[2].q = micro_max(r[2].q, mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].q); r[1].q = micro_pow(r[1].q, r[2].q); /* r0 = (r0 > 0.0) ? r1 : 0.0 */ r[0].q = si_fcgt(r[0].q, mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q); r[0].q = si_selb(mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q, r[1].q, r[0].q); STORE( &r[0], 0, CHAN_Z ); } } if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) { STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W ); } break; case TGSI_OPCODE_RCP: FETCH( &r[0], 0, CHAN_X ); r[0].q = micro_div(mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q, r[0].q); FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { STORE( &r[0], 0, chan_index ); } break; case TGSI_OPCODE_RSQ: FETCH( &r[0], 0, CHAN_X ); r[0].q = micro_sqrt(r[0].q); r[0].q = micro_div(mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q, r[0].q); FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { STORE( &r[0], 0, chan_index ); } break; case TGSI_OPCODE_EXP: ASSERT (0); break; case TGSI_OPCODE_LOG: ASSERT (0); break; case TGSI_OPCODE_MUL: FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { FETCH(&r[0], 0, chan_index); FETCH(&r[1], 1, chan_index); r[0].q = si_fm(r[0].q, r[1].q); STORE(&r[0], 0, chan_index); } break; case TGSI_OPCODE_ADD: FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { FETCH( &r[0], 0, chan_index ); FETCH( &r[1], 1, chan_index ); r[0].q = si_fa(r[0].q, r[1].q); STORE( &r[0], 0, chan_index ); } break; case TGSI_OPCODE_DP3: /* TGSI_OPCODE_DOT3 */ FETCH( &r[0], 0, CHAN_X ); FETCH( &r[1], 1, CHAN_X ); r[0].q = si_fm(r[0].q, r[1].q); FETCH( &r[1], 0, CHAN_Y ); FETCH( &r[2], 1, CHAN_Y ); r[0].q = si_fma(r[1].q, r[2].q, r[0].q); FETCH( &r[1], 0, CHAN_Z ); FETCH( &r[2], 1, CHAN_Z ); r[0].q = si_fma(r[1].q, r[2].q, r[0].q); FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { STORE( &r[0], 0, chan_index ); } break; case TGSI_OPCODE_DP4: /* TGSI_OPCODE_DOT4 */ FETCH(&r[0], 0, CHAN_X); FETCH(&r[1], 1, CHAN_X); r[0].q = si_fm(r[0].q, r[1].q); FETCH(&r[1], 0, CHAN_Y); FETCH(&r[2], 1, CHAN_Y); r[0].q = si_fma(r[1].q, r[2].q, r[0].q); FETCH(&r[1], 0, CHAN_Z); FETCH(&r[2], 1, CHAN_Z); r[0].q = si_fma(r[1].q, r[2].q, r[0].q); FETCH(&r[1], 0, CHAN_W); FETCH(&r[2], 1, CHAN_W); r[0].q = si_fma(r[1].q, r[2].q, r[0].q); FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { STORE( &r[0], 0, chan_index ); } break; case TGSI_OPCODE_DST: if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) { STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X ); } if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) { FETCH( &r[0], 0, CHAN_Y ); FETCH( &r[1], 1, CHAN_Y); r[0].q = si_fm(r[0].q, r[1].q); STORE( &r[0], 0, CHAN_Y ); } if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) { FETCH( &r[0], 0, CHAN_Z ); STORE( &r[0], 0, CHAN_Z ); } if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) { FETCH( &r[0], 1, CHAN_W ); STORE( &r[0], 0, CHAN_W ); } break; case TGSI_OPCODE_MIN: FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { FETCH(&r[0], 0, chan_index); FETCH(&r[1], 1, chan_index); r[0].q = micro_min(r[0].q, r[1].q); STORE(&r[0], 0, chan_index); } break; case TGSI_OPCODE_MAX: FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { FETCH(&r[0], 0, chan_index); FETCH(&r[1], 1, chan_index); r[0].q = micro_max(r[0].q, r[1].q); STORE(&r[0], 0, chan_index ); } break; case TGSI_OPCODE_SLT: /* TGSI_OPCODE_SETLT */ FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { FETCH( &r[0], 0, chan_index ); FETCH( &r[1], 1, chan_index ); r[0].q = micro_ge(r[0].q, r[1].q); r[0].q = si_xori(r[0].q, 0xff); STORE( &r[0], 0, chan_index ); } break; case TGSI_OPCODE_SGE: /* TGSI_OPCODE_SETGE */ FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { FETCH( &r[0], 0, chan_index ); FETCH( &r[1], 1, chan_index ); r[0].q = micro_ge(r[0].q, r[1].q); STORE( &r[0], 0, chan_index ); } break; case TGSI_OPCODE_MAD: FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { FETCH( &r[0], 0, chan_index ); FETCH( &r[1], 1, chan_index ); FETCH( &r[2], 2, chan_index ); r[0].q = si_fma(r[0].q, r[1].q, r[2].q); STORE( &r[0], 0, chan_index ); } break; case TGSI_OPCODE_SUB: FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { FETCH(&r[0], 0, chan_index); FETCH(&r[1], 1, chan_index); r[0].q = si_fs(r[0].q, r[1].q); STORE(&r[0], 0, chan_index); } break; case TGSI_OPCODE_LRP: FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { FETCH(&r[0], 0, chan_index); FETCH(&r[1], 1, chan_index); FETCH(&r[2], 2, chan_index); r[1].q = si_fs(r[1].q, r[2].q); r[0].q = si_fma(r[0].q, r[1].q, r[2].q); STORE(&r[0], 0, chan_index); } break; case TGSI_OPCODE_CND: ASSERT (0); break; case TGSI_OPCODE_DP2A: ASSERT (0); break; case TGSI_OPCODE_FRC: FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { FETCH( &r[0], 0, chan_index ); r[0].q = micro_frc(r[0].q); STORE( &r[0], 0, chan_index ); } break; case TGSI_OPCODE_CLAMP: ASSERT (0); break; case TGSI_OPCODE_FLR: FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { FETCH( &r[0], 0, chan_index ); r[0].q = micro_flr(r[0].q); STORE( &r[0], 0, chan_index ); } break; case TGSI_OPCODE_ROUND: FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { FETCH( &r[0], 0, chan_index ); r[0].q = micro_rnd(r[0].q); STORE( &r[0], 0, chan_index ); } break; case TGSI_OPCODE_EX2: FETCH(&r[0], 0, CHAN_X); r[0].q = micro_pow(mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].q, r[0].q); FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { STORE( &r[0], 0, chan_index ); } break; case TGSI_OPCODE_LG2: FETCH( &r[0], 0, CHAN_X ); r[0].q = micro_lg2(r[0].q); FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { STORE( &r[0], 0, chan_index ); } break; case TGSI_OPCODE_POW: FETCH(&r[0], 0, CHAN_X); FETCH(&r[1], 1, CHAN_X); r[0].q = micro_pow(r[0].q, r[1].q); FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { STORE( &r[0], 0, chan_index ); } break; case TGSI_OPCODE_XPD: /* TGSI_OPCODE_XPD */ FETCH(&r[0], 0, CHAN_Y); FETCH(&r[1], 1, CHAN_Z); FETCH(&r[3], 0, CHAN_Z); FETCH(&r[4], 1, CHAN_Y); /* r2 = (r0 * r1) - (r3 * r5) */ r[2].q = si_fm(r[3].q, r[5].q); r[2].q = si_fms(r[0].q, r[1].q, r[2].q); if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) { STORE( &r[2], 0, CHAN_X ); } FETCH(&r[2], 1, CHAN_X); FETCH(&r[5], 0, CHAN_X); /* r3 = (r3 * r2) - (r1 * r5) */ r[1].q = si_fm(r[1].q, r[5].q); r[3].q = si_fms(r[3].q, r[2].q, r[1].q); if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) { STORE( &r[3], 0, CHAN_Y ); } /* r5 = (r5 * r4) - (r0 * r2) */ r[0].q = si_fm(r[0].q, r[2].q); r[5].q = si_fms(r[5].q, r[4].q, r[0].q); if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) { STORE( &r[5], 0, CHAN_Z ); } if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) { STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W ); } break; case TGSI_OPCODE_ABS: FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { FETCH(&r[0], 0, chan_index); r[0].q = micro_abs(r[0].q); STORE(&r[0], 0, chan_index); } break; case TGSI_OPCODE_RCC: ASSERT (0); break; case TGSI_OPCODE_DPH: FETCH(&r[0], 0, CHAN_X); FETCH(&r[1], 1, CHAN_X); r[0].q = si_fm(r[0].q, r[1].q); FETCH(&r[1], 0, CHAN_Y); FETCH(&r[2], 1, CHAN_Y); r[0].q = si_fma(r[1].q, r[2].q, r[0].q); FETCH(&r[1], 0, CHAN_Z); FETCH(&r[2], 1, CHAN_Z); r[0].q = si_fma(r[1].q, r[2].q, r[0].q); FETCH(&r[1], 1, CHAN_W); r[0].q = si_fa(r[0].q, r[1].q); FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { STORE( &r[0], 0, chan_index ); } break; case TGSI_OPCODE_COS: FETCH(&r[0], 0, CHAN_X); r[0].q = micro_cos(r[0].q); FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { STORE( &r[0], 0, chan_index ); } break; case TGSI_OPCODE_DDX: FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { FETCH( &r[0], 0, chan_index ); r[0].q = micro_ddx(r[0].q); STORE( &r[0], 0, chan_index ); } break; case TGSI_OPCODE_DDY: FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { FETCH( &r[0], 0, chan_index ); r[0].q = micro_ddy(r[0].q); STORE( &r[0], 0, chan_index ); } break; case TGSI_OPCODE_KILP: exec_kilp (mach, inst); break; case TGSI_OPCODE_KIL: exec_kil (mach, inst); break; case TGSI_OPCODE_PK2H: ASSERT (0); break; case TGSI_OPCODE_PK2US: ASSERT (0); break; case TGSI_OPCODE_PK4B: ASSERT (0); break; case TGSI_OPCODE_PK4UB: ASSERT (0); break; case TGSI_OPCODE_RFL: ASSERT (0); break; case TGSI_OPCODE_SEQ: FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { FETCH( &r[0], 0, chan_index ); FETCH( &r[1], 1, chan_index ); r[0].q = si_fceq(r[0].q, r[1].q); STORE( &r[0], 0, chan_index ); } break; case TGSI_OPCODE_SFL: ASSERT (0); break; case TGSI_OPCODE_SGT: FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { FETCH( &r[0], 0, chan_index ); FETCH( &r[1], 1, chan_index ); r[0].q = si_fcgt(r[0].q, r[1].q); STORE( &r[0], 0, chan_index ); } break; case TGSI_OPCODE_SIN: FETCH( &r[0], 0, CHAN_X ); r[0].q = micro_sin(r[0].q); FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { STORE( &r[0], 0, chan_index ); } break; case TGSI_OPCODE_SLE: FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { FETCH( &r[0], 0, chan_index ); FETCH( &r[1], 1, chan_index ); r[0].q = si_fcgt(r[0].q, r[1].q); r[0].q = si_xori(r[0].q, 0xff); STORE( &r[0], 0, chan_index ); } break; case TGSI_OPCODE_SNE: FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { FETCH( &r[0], 0, chan_index ); FETCH( &r[1], 1, chan_index ); r[0].q = si_fceq(r[0].q, r[1].q); r[0].q = si_xori(r[0].q, 0xff); STORE( &r[0], 0, chan_index ); } break; case TGSI_OPCODE_STR: ASSERT (0); break; case TGSI_OPCODE_TEX: /* simple texture lookup */ /* src[0] = texcoord */ /* src[1] = sampler unit */ exec_tex(mach, inst, FALSE, FALSE); break; case TGSI_OPCODE_TXB: /* Texture lookup with lod bias */ /* src[0] = texcoord (src[0].w = load bias) */ /* src[1] = sampler unit */ exec_tex(mach, inst, TRUE, FALSE); break; case TGSI_OPCODE_TXD: /* Texture lookup with explict partial derivatives */ /* src[0] = texcoord */ /* src[1] = d[strq]/dx */ /* src[2] = d[strq]/dy */ /* src[3] = sampler unit */ ASSERT (0); break; case TGSI_OPCODE_TXL: /* Texture lookup with explit LOD */ /* src[0] = texcoord (src[0].w = load bias) */ /* src[1] = sampler unit */ exec_tex(mach, inst, TRUE, FALSE); break; case TGSI_OPCODE_TXP: /* Texture lookup with projection */ /* src[0] = texcoord (src[0].w = projection) */ /* src[1] = sampler unit */ exec_tex(mach, inst, TRUE, TRUE); break; case TGSI_OPCODE_UP2H: ASSERT (0); break; case TGSI_OPCODE_UP2US: ASSERT (0); break; case TGSI_OPCODE_UP4B: ASSERT (0); break; case TGSI_OPCODE_UP4UB: ASSERT (0); break; case TGSI_OPCODE_X2D: ASSERT (0); break; case TGSI_OPCODE_ARA: ASSERT (0); break; case TGSI_OPCODE_ARR: ASSERT (0); break; case TGSI_OPCODE_BRA: ASSERT (0); break; case TGSI_OPCODE_CAL: /* skip the call if no execution channels are enabled */ if (mach->ExecMask) { /* do the call */ /* push the Cond, Loop, Cont stacks */ ASSERT(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING); mach->CondStack[mach->CondStackTop++] = mach->CondMask; ASSERT(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING); mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask; ASSERT(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING); mach->ContStack[mach->ContStackTop++] = mach->ContMask; ASSERT(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING); mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask; /* note that PC was already incremented above */ mach->CallStack[mach->CallStackTop++] = *pc; *pc = inst->InstructionExtLabel.Label; } break; case TGSI_OPCODE_RET: mach->FuncMask &= ~mach->ExecMask; UPDATE_EXEC_MASK(mach); if (mach->ExecMask == 0x0) { /* really return now (otherwise, keep executing */ if (mach->CallStackTop == 0) { /* returning from main() */ *pc = -1; return; } *pc = mach->CallStack[--mach->CallStackTop]; /* pop the Cond, Loop, Cont stacks */ ASSERT(mach->CondStackTop > 0); mach->CondMask = mach->CondStack[--mach->CondStackTop]; ASSERT(mach->LoopStackTop > 0); mach->LoopMask = mach->LoopStack[--mach->LoopStackTop]; ASSERT(mach->ContStackTop > 0); mach->ContMask = mach->ContStack[--mach->ContStackTop]; ASSERT(mach->FuncStackTop > 0); mach->FuncMask = mach->FuncStack[--mach->FuncStackTop]; UPDATE_EXEC_MASK(mach); } break; case TGSI_OPCODE_SSG: ASSERT (0); break; case TGSI_OPCODE_CMP: FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { FETCH(&r[0], 0, chan_index); FETCH(&r[1], 1, chan_index); FETCH(&r[2], 2, chan_index); /* r0 = (r0 < 0.0) ? r1 : r2 */ r[3].q = si_xor(r[3].q, r[3].q); r[0].q = micro_lt(r[0].q, r[3].q); r[0].q = si_selb(r[1].q, r[2].q, r[0].q); STORE(&r[0], 0, chan_index); } break; case TGSI_OPCODE_SCS: if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) || IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) { FETCH( &r[0], 0, CHAN_X ); } if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) ) { r[1].q = micro_cos(r[0].q); STORE( &r[1], 0, CHAN_X ); } if( IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) { r[1].q = micro_sin(r[0].q); STORE( &r[1], 0, CHAN_Y ); } if( IS_CHANNEL_ENABLED( *inst, CHAN_Z ) ) { STORE( &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, CHAN_Z ); } if( IS_CHANNEL_ENABLED( *inst, CHAN_W ) ) { STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W ); } break; case TGSI_OPCODE_NRM: ASSERT (0); break; case TGSI_OPCODE_DIV: ASSERT( 0 ); break; case TGSI_OPCODE_DP2: FETCH( &r[0], 0, CHAN_X ); FETCH( &r[1], 1, CHAN_X ); r[0].q = si_fm(r[0].q, r[1].q); FETCH( &r[1], 0, CHAN_Y ); FETCH( &r[2], 1, CHAN_Y ); r[0].q = si_fma(r[1].q, r[2].q, r[0].q); FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { STORE( &r[0], 0, chan_index ); } break; case TGSI_OPCODE_IF: /* push CondMask */ ASSERT(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING); mach->CondStack[mach->CondStackTop++] = mach->CondMask; FETCH( &r[0], 0, CHAN_X ); /* update CondMask */ if( ! r[0].u[0] ) { mach->CondMask &= ~0x1; } if( ! r[0].u[1] ) { mach->CondMask &= ~0x2; } if( ! r[0].u[2] ) { mach->CondMask &= ~0x4; } if( ! r[0].u[3] ) { mach->CondMask &= ~0x8; } UPDATE_EXEC_MASK(mach); /* Todo: If CondMask==0, jump to ELSE */ break; case TGSI_OPCODE_ELSE: /* invert CondMask wrt previous mask */ { uint prevMask; ASSERT(mach->CondStackTop > 0); prevMask = mach->CondStack[mach->CondStackTop - 1]; mach->CondMask = ~mach->CondMask & prevMask; UPDATE_EXEC_MASK(mach); /* Todo: If CondMask==0, jump to ENDIF */ } break; case TGSI_OPCODE_ENDIF: /* pop CondMask */ ASSERT(mach->CondStackTop > 0); mach->CondMask = mach->CondStack[--mach->CondStackTop]; UPDATE_EXEC_MASK(mach); break; case TGSI_OPCODE_END: /* halt execution */ *pc = -1; break; case TGSI_OPCODE_REP: ASSERT (0); break; case TGSI_OPCODE_ENDREP: ASSERT (0); break; case TGSI_OPCODE_PUSHA: ASSERT (0); break; case TGSI_OPCODE_POPA: ASSERT (0); break; case TGSI_OPCODE_CEIL: FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { FETCH( &r[0], 0, chan_index ); r[0].q = micro_ceil(r[0].q); STORE( &r[0], 0, chan_index ); } break; case TGSI_OPCODE_I2F: FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { FETCH( &r[0], 0, chan_index ); r[0].q = si_csflt(r[0].q, 0); STORE( &r[0], 0, chan_index ); } break; case TGSI_OPCODE_NOT: FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { FETCH( &r[0], 0, chan_index ); r[0].q = si_xorbi(r[0].q, 0xff); STORE( &r[0], 0, chan_index ); } break; case TGSI_OPCODE_TRUNC: FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { FETCH( &r[0], 0, chan_index ); r[0].q = micro_trunc(r[0].q); STORE( &r[0], 0, chan_index ); } break; case TGSI_OPCODE_SHL: FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { FETCH( &r[0], 0, chan_index ); FETCH( &r[1], 1, chan_index ); r[0].q = si_shl(r[0].q, r[1].q); STORE( &r[0], 0, chan_index ); } break; case TGSI_OPCODE_SHR: FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { FETCH( &r[0], 0, chan_index ); FETCH( &r[1], 1, chan_index ); r[0].q = micro_ishr(r[0].q, r[1].q); STORE( &r[0], 0, chan_index ); } break; case TGSI_OPCODE_AND: FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { FETCH( &r[0], 0, chan_index ); FETCH( &r[1], 1, chan_index ); r[0].q = si_and(r[0].q, r[1].q); STORE( &r[0], 0, chan_index ); } break; case TGSI_OPCODE_OR: FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { FETCH( &r[0], 0, chan_index ); FETCH( &r[1], 1, chan_index ); r[0].q = si_or(r[0].q, r[1].q); STORE( &r[0], 0, chan_index ); } break; case TGSI_OPCODE_MOD: ASSERT (0); break; case TGSI_OPCODE_XOR: FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { FETCH( &r[0], 0, chan_index ); FETCH( &r[1], 1, chan_index ); r[0].q = si_xor(r[0].q, r[1].q); STORE( &r[0], 0, chan_index ); } break; case TGSI_OPCODE_SAD: ASSERT (0); break; case TGSI_OPCODE_TXF: ASSERT (0); break; case TGSI_OPCODE_TXQ: ASSERT (0); break; case TGSI_OPCODE_EMIT: mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += 16; mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]++; break; case TGSI_OPCODE_ENDPRIM: mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]++; mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]] = 0; break; case TGSI_OPCODE_BGNFOR: /* fall-through (for now) */ case TGSI_OPCODE_BGNLOOP: /* push LoopMask and ContMasks */ ASSERT(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING); mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask; ASSERT(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING); mach->ContStack[mach->ContStackTop++] = mach->ContMask; break; case TGSI_OPCODE_ENDFOR: /* fall-through (for now at least) */ case TGSI_OPCODE_ENDLOOP: /* Restore ContMask, but don't pop */ ASSERT(mach->ContStackTop > 0); mach->ContMask = mach->ContStack[mach->ContStackTop - 1]; if (mach->LoopMask) { /* repeat loop: jump to instruction just past BGNLOOP */ *pc = inst->InstructionExtLabel.Label + 1; } else { /* exit loop: pop LoopMask */ ASSERT(mach->LoopStackTop > 0); mach->LoopMask = mach->LoopStack[--mach->LoopStackTop]; /* pop ContMask */ ASSERT(mach->ContStackTop > 0); mach->ContMask = mach->ContStack[--mach->ContStackTop]; } UPDATE_EXEC_MASK(mach); break; case TGSI_OPCODE_BRK: /* turn off loop channels for each enabled exec channel */ mach->LoopMask &= ~mach->ExecMask; /* Todo: if mach->LoopMask == 0, jump to end of loop */ UPDATE_EXEC_MASK(mach); break; case TGSI_OPCODE_CONT: /* turn off cont channels for each enabled exec channel */ mach->ContMask &= ~mach->ExecMask; /* Todo: if mach->LoopMask == 0, jump to end of loop */ UPDATE_EXEC_MASK(mach); break; case TGSI_OPCODE_BGNSUB: /* no-op */ break; case TGSI_OPCODE_ENDSUB: /* no-op */ break; case TGSI_OPCODE_NOP: break; default: ASSERT( 0 ); } } /** * Run TGSI interpreter. * \return bitmask of "alive" quad components */ uint spu_exec_machine_run( struct spu_exec_machine *mach ) { uint i; int pc = 0; mach->CondMask = 0xf; mach->LoopMask = 0xf; mach->ContMask = 0xf; mach->FuncMask = 0xf; mach->ExecMask = 0xf; mach->CondStackTop = 0; /* temporarily subvert this ASSERTion */ ASSERT(mach->CondStackTop == 0); ASSERT(mach->LoopStackTop == 0); ASSERT(mach->ContStackTop == 0); ASSERT(mach->CallStackTop == 0); mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0; mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0; if( mach->Processor == TGSI_PROCESSOR_GEOMETRY ) { mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0] = 0; mach->Primitives[0] = 0; } /* execute declarations (interpolants) */ if( mach->Processor == TGSI_PROCESSOR_FRAGMENT ) { for (i = 0; i < mach->NumDeclarations; i++) { union { struct tgsi_full_declaration decl; qword buffer[ROUNDUP16(sizeof(struct tgsi_full_declaration)) / 16]; } d ALIGN16_ATTRIB; unsigned ea = (unsigned) (mach->Declarations + pc); spu_dcache_fetch_unaligned(d.buffer, ea, sizeof(d.decl)); exec_declaration( mach, &d.decl ); } } /* execute instructions, until pc is set to -1 */ while (pc != -1) { union { struct tgsi_full_instruction inst; qword buffer[ROUNDUP16(sizeof(struct tgsi_full_instruction)) / 16]; } i ALIGN16_ATTRIB; unsigned ea = (unsigned) (mach->Instructions + pc); spu_dcache_fetch_unaligned(i.buffer, ea, sizeof(i.inst)); exec_instruction( mach, & i.inst, &pc ); } #if 0 /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */ if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) { /* * Scale back depth component. */ for (i = 0; i < 4; i++) mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF; } #endif return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0]; }