From 5fae9514c235cc5590f15fd802bd762107fc689d Mon Sep 17 00:00:00 2001 From: Michal Krol Date: Fri, 10 Apr 2009 11:58:49 +0200 Subject: tgsi/sse2: Cleanup NRM/NRM4 implementation. Fix comments. Make sure .w is set to 1.0 for NRM. Optimise for non-.xyzw writemasks. --- src/gallium/auxiliary/tgsi/tgsi_sse2.c | 101 +++++++++++++++++++++++++-------- 1 file changed, 76 insertions(+), 25 deletions(-) (limited to 'src/gallium/auxiliary') diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/tgsi_sse2.c index 31225ba430..4b4e34b29e 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_sse2.c +++ b/src/gallium/auxiliary/tgsi/tgsi_sse2.c @@ -2178,32 +2178,83 @@ emit_instruction( /* 3 or 4-component normalization */ { uint dims = (inst->Instruction.Opcode == TGSI_OPCODE_NRM) ? 3 : 4; - /* note: cannot use xmm regs 2/3 here (see emit_rsqrt() above) */ - FETCH( func, *inst, 4, 0, CHAN_X ); /* xmm4 = src[0].x */ - FETCH( func, *inst, 5, 0, CHAN_Y ); /* xmm5 = src[0].y */ - FETCH( func, *inst, 6, 0, CHAN_Z ); /* xmm6 = src[0].z */ - if (dims == 4) { - FETCH( func, *inst, 7, 0, CHAN_W ); /* xmm7 = src[0].w */ - } - emit_MOV( func, 0, 4 ); /* xmm0 = xmm3 */ - emit_mul( func, 0, 4 ); /* xmm0 *= xmm3 */ - emit_MOV( func, 1, 5 ); /* xmm1 = xmm4 */ - emit_mul( func, 1, 5 ); /* xmm1 *= xmm4 */ - emit_add( func, 0, 1 ); /* xmm0 += xmm1 */ - emit_MOV( func, 1, 6 ); /* xmm1 = xmm5 */ - emit_mul( func, 1, 6 ); /* xmm1 *= xmm5 */ - emit_add( func, 0, 1 ); /* xmm0 += xmm1 */ - if (dims == 4) { - emit_MOV( func, 1, 7 ); /* xmm1 = xmm7 */ - emit_mul( func, 1, 7 ); /* xmm1 *= xmm7 */ - emit_add( func, 0, 0 ); /* xmm0 += xmm1 */ - } - emit_rsqrt( func, 1, 0 ); /* xmm1 = 1/sqrt(xmm0) */ - FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - if (chan_index < dims) { - emit_mul( func, 4+chan_index, 1); /* xmm[4+ch] *= xmm1 */ - STORE( func, *inst, 4+chan_index, 0, chan_index ); + + if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) || + IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y) || + IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z) || + (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 4)) { + + /* NOTE: Cannot use xmm regs 2/3 here (see emit_rsqrt() above). */ + + /* xmm4 = src.x */ + /* xmm0 = src.x * src.x */ + FETCH(func, *inst, 0, 0, CHAN_X); + if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) { + emit_MOV(func, 4, 0); } + emit_mul(func, 0, 0); + + /* xmm5 = src.y */ + /* xmm0 = xmm0 + src.y * src.y */ + FETCH(func, *inst, 1, 0, CHAN_Y); + if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) { + emit_MOV(func, 5, 1); + } + emit_mul(func, 1, 1); + emit_add(func, 0, 1); + + /* xmm6 = src.z */ + /* xmm0 = xmm0 + src.z * src.z */ + FETCH(func, *inst, 1, 0, CHAN_Z); + if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) { + emit_MOV(func, 6, 1); + } + emit_mul(func, 1, 1); + emit_add(func, 0, 1); + + if (dims == 4) { + /* xmm7 = src.w */ + /* xmm0 = xmm0 + src.w * src.w */ + FETCH(func, *inst, 1, 0, CHAN_W); + if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W)) { + emit_MOV(func, 7, 1); + } + emit_mul(func, 1, 1); + emit_add(func, 0, 1); + } + + /* xmm1 = 1 / sqrt(xmm0) */ + emit_rsqrt(func, 1, 0); + + /* dst.x = xmm1 * src.x */ + if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) { + emit_mul(func, 4, 1); + STORE(func, *inst, 4, 0, CHAN_X); + } + + /* dst.y = xmm1 * src.y */ + if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) { + emit_mul(func, 5, 1); + STORE(func, *inst, 5, 0, CHAN_Y); + } + + /* dst.z = xmm1 * src.z */ + if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) { + emit_mul(func, 6, 1); + STORE(func, *inst, 6, 0, CHAN_Z); + } + + /* dst.w = xmm1 * src.w */ + if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) && dims == 4) { + emit_mul(func, 7, 1); + STORE(func, *inst, 7, 0, CHAN_W); + } + } + + /* dst0.w = 1.0 */ + if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 3) { + emit_tempf(func, 0, TEMP_ONE_I, TEMP_ONE_C); + STORE(func, *inst, 0, 0, CHAN_W); } } break; -- cgit v1.2.3