From 1ba10e5ccf5cd0c990922e982e1e9bc6be48a5e4 Mon Sep 17 00:00:00 2001 From: Keith Whitwell Date: Wed, 21 May 2008 09:44:16 +0100 Subject: draw: add aos vertex shader varient --- src/gallium/auxiliary/draw/draw_vs_aos.c | 1739 ++++++++++++++++++++++++++++++ 1 file changed, 1739 insertions(+) create mode 100644 src/gallium/auxiliary/draw/draw_vs_aos.c (limited to 'src/gallium/auxiliary/draw/draw_vs_aos.c') diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c new file mode 100644 index 0000000000..620f5e3592 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_vs_aos.c @@ -0,0 +1,1739 @@ +/* + * Mesa 3-D graphics library + * Version: 6.3 + * + * Copyright (C) 1999-2004 Brian Paul All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * Translate tgsi vertex programs to x86/x87/SSE/SSE2 machine code + * using the rtasm runtime assembler. 
Based on the old + * t_vb_arb_program_sse.c + */ + + +#include "pipe/p_util.h" +#include "pipe/p_shader_tokens.h" +#include "tgsi/util/tgsi_parse.h" +#include "tgsi/util/tgsi_util.h" +#include "tgsi/exec/tgsi_exec.h" +#include "tgsi/util/tgsi_dump.h" + +#include "draw_vs.h" +#include "draw_vs_aos.h" + +#include "rtasm/rtasm_x86sse.h" + +#ifdef PIPE_ARCH_X86 + + +#define DISASSEM 0 + + + + + +static INLINE boolean eq( struct x86_reg a, + struct x86_reg b ) +{ + return (a.file == b.file && + a.idx == b.idx && + a.mod == b.mod && + a.disp == b.disp); +} + + +static struct x86_reg get_reg_ptr(struct aos_compilation *cp, + unsigned file, + unsigned idx ) +{ + struct x86_reg ptr = cp->machine_EDX; + + switch (file) { + case TGSI_FILE_INPUT: + return x86_make_disp(ptr, Offset(struct aos_machine, input[idx])); + + case TGSI_FILE_OUTPUT: + return x86_make_disp(ptr, Offset(struct aos_machine, output[idx])); + + case TGSI_FILE_TEMPORARY: + return x86_make_disp(ptr, Offset(struct aos_machine, temp[idx])); + + case TGSI_FILE_IMMEDIATE: + return x86_make_disp(ptr, Offset(struct aos_machine, immediate[idx])); + + case TGSI_FILE_CONSTANT: + return x86_make_disp(ptr, Offset(struct aos_machine, constant[idx])); + + case AOS_FILE_INTERNAL: + return x86_make_disp(ptr, Offset(struct aos_machine, immediate[idx])); + + default: + ERROR(cp, "unknown reg file"); + return x86_make_reg(0,0); + } +} + + +struct x86_reg aos_get_internal( struct aos_compilation *cp, + unsigned imm ) +{ + return get_reg_ptr( cp, + AOS_FILE_INTERNAL, + imm + 1 ); +} + +static void spill( struct aos_compilation *cp, unsigned idx ) +{ + if (!cp->xmm[idx].dirty || + (cp->xmm[idx].file != TGSI_FILE_INPUT && /* inputs are fetched into xmm & set dirty */ + cp->xmm[idx].file != TGSI_FILE_OUTPUT && + cp->xmm[idx].file != TGSI_FILE_TEMPORARY)) { + ERROR(cp, "invalid spill"); + return; + } + else { + struct x86_reg oldval = get_reg_ptr(cp, + cp->xmm[idx].file, + cp->xmm[idx].idx); + + assert(cp->xmm[idx].dirty); + 
sse_movups(cp->func, oldval, x86_make_reg(file_XMM, idx)); + cp->xmm[idx].dirty = 0; + } +} + +struct x86_reg aos_get_xmm_reg( struct aos_compilation *cp ) +{ + unsigned i; + unsigned oldest = 0; + + for (i = 0; i < 8; i++) + if (cp->xmm[i].last_used < cp->xmm[oldest].last_used) + oldest = i; + + /* Need to write out the old value? + */ + if (cp->xmm[oldest].dirty) + spill(cp, oldest); + + assert(cp->xmm[oldest].last_used != cp->insn_counter); + + cp->xmm[oldest].file = TGSI_FILE_NULL; + cp->xmm[oldest].idx = 0; + cp->xmm[oldest].last_used = cp->insn_counter; + return x86_make_reg(file_XMM, oldest); +} + +void aos_release_xmm_reg( struct aos_compilation *cp, + unsigned idx ) +{ + cp->xmm[idx].file = TGSI_FILE_NULL; + cp->xmm[idx].idx = 0; + cp->xmm[idx].dirty = 0; + cp->xmm[idx].last_used = 0; +} + +static void invalidate_xmm( struct aos_compilation *cp, + unsigned file, unsigned idx ) +{ + unsigned i; + + /* Invalidate any old copy of this register in XMM0-7. + */ + for (i = 0; i < 8; i++) { + if (cp->xmm[i].file == file && cp->xmm[i].idx == idx) { + + if (cp->xmm[i].dirty) + spill(cp, i); + + aos_release_xmm_reg(cp, i); + break; + } + } + + for (; i < 8; i++) { + if (cp->xmm[i].file == file && cp->xmm[i].idx == idx) { + assert(0); + } + } +} + + +void aos_adopt_xmm_reg( struct aos_compilation *cp, + struct x86_reg reg, + unsigned file, + unsigned idx, + unsigned dirty ) +{ + if (reg.file != file_XMM) { + assert(0); + return; + } + + invalidate_xmm(cp, file, idx); + cp->xmm[reg.idx].file = file; + cp->xmm[reg.idx].idx = idx; + cp->xmm[reg.idx].dirty = dirty; +} + + + +static struct x86_reg aos_get_shader_reg_ptr( struct aos_compilation *cp, + unsigned file, + unsigned idx ) +{ + invalidate_xmm( cp, file, idx ); + return get_reg_ptr( cp, file, idx ); +} + + +/* As above, but return a pointer. Note - this pointer may alias + * those returned by get_arg_ptr(). 
+ */ +static struct x86_reg get_dst_ptr( struct aos_compilation *cp, + const struct tgsi_full_dst_register *dst ) +{ + return aos_get_shader_reg_ptr( cp, dst->DstRegister.File, dst->DstRegister.Index ); +} + + + + + +/* Return an XMM reg if the argument is resident, otherwise return a + * base+offset pointer to the saved value. + */ +struct x86_reg aos_get_shader_reg( struct aos_compilation *cp, + unsigned file, + unsigned idx ) +{ + unsigned i; + + for (i = 0; i < 8; i++) { + if (cp->xmm[i].file == file && + cp->xmm[i].idx == idx) + { + cp->xmm[i].last_used = cp->insn_counter; + return x86_make_reg(file_XMM, i); + } + } + + /* If not found in the XMM register file, return an indirect + * reference to the in-memory copy: + */ + return get_reg_ptr( cp, file, idx ); +} + + + + + +/* Emulate pshufd insn in regular SSE, if necessary: + */ +static void emit_pshufd( struct aos_compilation *cp, + struct x86_reg dst, + struct x86_reg arg0, + ubyte shuf ) +{ + if (cp->have_sse2) { + sse2_pshufd(cp->func, dst, arg0, shuf); + } + else { + if (!eq(dst, arg0)) + sse_movups(cp->func, dst, arg0); + + sse_shufps(cp->func, dst, dst, shuf); + } +} + + + + +/* Helper for writemask: + */ +static boolean emit_shuf_copy1( struct aos_compilation *cp, + struct x86_reg dst, + struct x86_reg arg0, + struct x86_reg arg1, + ubyte shuf ) +{ + struct x86_reg tmp = aos_get_xmm_reg(cp); + sse_movups(cp->func, dst, arg1); + emit_pshufd(cp, dst, dst, shuf); + emit_pshufd(cp, tmp, arg0, shuf); + + sse_movss(cp->func, dst, tmp); + + emit_pshufd(cp, dst, dst, shuf); + + aos_release_xmm_reg(cp, tmp.idx); + return TRUE; +} + + +/* Helper for writemask: + */ +static boolean emit_shuf_copy2( struct aos_compilation *cp, + struct x86_reg dst, + struct x86_reg arg0, + struct x86_reg arg1, + ubyte shuf ) +{ + struct x86_reg tmp = aos_get_xmm_reg(cp); + emit_pshufd(cp, dst, arg1, shuf); + emit_pshufd(cp, tmp, arg0, shuf); + + sse_shufps(cp->func, dst, tmp, SHUF(X, Y, Z, W)); + + emit_pshufd(cp, dst, dst, 
shuf); + + aos_release_xmm_reg(cp, tmp.idx); + return TRUE; +} + +#define SSE_SWIZZLE_NOOP ((0<<0) | (1<<2) | (2<<4) | (3<<6)) + + +/* Locate a source register and perform any required (simple) swizzle. + * + * Just fail on complex swizzles at this point. + */ +static struct x86_reg fetch_src( struct aos_compilation *cp, + const struct tgsi_full_src_register *src ) +{ + struct x86_reg arg0 = aos_get_shader_reg(cp, + src->SrcRegister.File, + src->SrcRegister.Index); + unsigned i; + unsigned swz = 0; + unsigned negs = 0; + unsigned abs = 0; + + for (i = 0; i < 4; i++) { + unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( src, i ); + unsigned neg = tgsi_util_get_full_src_register_sign_mode( src, i ); + + switch (swizzle) { + case TGSI_EXTSWIZZLE_ZERO: + case TGSI_EXTSWIZZLE_ONE: + ERROR(cp, "not supporting full swizzles yet in tgsi_aos_sse2"); + break; + + default: + swz |= (swizzle & 0x3) << (i * 2); + break; + } + + switch (neg) { + case TGSI_UTIL_SIGN_TOGGLE: + negs |= (1<func, dst, arg0); + + aos_release_xmm_reg(cp, tmp.idx); + arg0 = dst; + } + + if (abs && abs != 0xf) { + ERROR(cp, "unsupported partial abs"); + } + + if (abs) { + struct x86_reg neg = aos_get_internal(cp, IMM_NEGS); + struct x86_reg tmp = aos_get_xmm_reg(cp); + + sse_movups(cp->func, tmp, arg0); + sse_mulps(cp->func, tmp, neg); + sse_maxps(cp->func, dst, arg0); + + aos_release_xmm_reg(cp, tmp.idx); + arg0 = dst; + } + } + + return arg0; +} + +static void x87_fld_src( struct aos_compilation *cp, + const struct tgsi_full_src_register *src, + unsigned channel ) +{ + struct x86_reg arg0 = aos_get_shader_reg_ptr(cp, + src->SrcRegister.File, + src->SrcRegister.Index); + + unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( src, channel ); + unsigned neg = tgsi_util_get_full_src_register_sign_mode( src, channel ); + + switch (swizzle) { + case TGSI_EXTSWIZZLE_ZERO: + x87_fldz( cp->func ); + break; + + case TGSI_EXTSWIZZLE_ONE: + x87_fld1( cp->func ); + break; + + default: + 
x87_fld( cp->func, x86_make_disp(arg0, (swizzle & 3) * sizeof(float)) );
+      break;
+   }
+
+
+   switch (neg) {
+   case TGSI_UTIL_SIGN_TOGGLE:
+      /* Flip the sign:
+       */
+      x87_fchs( cp->func );
+      break;
+
+   case TGSI_UTIL_SIGN_KEEP:
+      break;
+
+   case TGSI_UTIL_SIGN_CLEAR:
+      x87_fabs( cp->func );
+      break;
+
+   case TGSI_UTIL_SIGN_SET:
+      x87_fabs( cp->func );
+      x87_fchs( cp->func );
+      break;
+
+   default:
+      ERROR(cp, "unsupported sign-mode");
+      break;
+   }
+}
+
+
+
+
+
+
+/* Used to implement write masking.  This and most of the other instructions
+ * here would be easier to implement if there had been a translation
+ * to a 2 argument format (dst/arg0, arg1) at the shader level before
+ * attempting to translate to x86/sse code.
+ *
+ * Merges 'result' into the destination register described by 'reg':
+ * channels enabled in reg->DstRegister.WriteMask are taken from 'result',
+ * all other channels keep the destination's previous value.  The merged
+ * value is left in an XMM register that is adopted for the destination
+ * and marked dirty.
+ *
+ * \param cp      compilation state
+ * \param reg     destination register (file, index and writemask)
+ * \param result  computed value; an XMM register or a memory operand
+ */
+static void store_dest( struct aos_compilation *cp,
+                        const struct tgsi_full_dst_register *reg,
+                        struct x86_reg result )
+{
+   if (reg->DstRegister.WriteMask == 0)
+   {
+      return;
+   }
+   else if (reg->DstRegister.WriteMask == TGSI_WRITEMASK_XYZW)
+   {
+      if (result.file == file_XMM) {
+         aos_adopt_xmm_reg(cp,
+                           result,
+                           reg->DstRegister.File,
+                           reg->DstRegister.Index,
+                           TRUE);
+      }
+      else {
+         struct x86_reg dst = aos_get_xmm_reg(cp);
+         aos_adopt_xmm_reg(cp,
+                           dst,
+                           reg->DstRegister.File,
+                           reg->DstRegister.Index,
+                           TRUE);
+         sse_movups(cp->func, dst, result);
+      }
+   }
+   else
+   {
+      /* Previous value of the dest register:
+       */
+      struct x86_reg old_dst = aos_get_shader_reg(cp,
+                                                  reg->DstRegister.File,
+                                                  reg->DstRegister.Index);
+
+
+      /* Alloc an xmm reg to hold the new value of the dest register:
+       */
+      struct x86_reg dst = aos_get_xmm_reg(cp);
+
+      aos_adopt_xmm_reg(cp,
+                        dst,
+                        reg->DstRegister.File,
+                        reg->DstRegister.Index,
+                        TRUE );
+
+      switch (reg->DstRegister.WriteMask) {
+      case TGSI_WRITEMASK_X:
+         if (result.file == file_XMM) {
+            sse_movups(cp->func, dst, old_dst);
+            sse_movss(cp->func, dst, result);
+         }
+         else {
+            struct x86_reg tmp = aos_get_xmm_reg(cp);
+            sse_movups(cp->func, dst, old_dst);
+            sse_movss(cp->func, tmp, result);
+            sse_movss(cp->func, dst, tmp);
+            aos_release_xmm_reg(cp, tmp.idx);
+         }
+         break;
+
+      /* shufps keeps the low two lanes of its first operand and takes
+       * the high two lanes from its second operand, so the value that
+       * supplies x,y must be loaded into dst first.
+       */
+      case TGSI_WRITEMASK_XY:
+         sse_movups(cp->func, dst, result);
+         sse_shufps(cp->func, dst, old_dst, SHUF(X, Y, Z, W));
+         break;
+
+      case TGSI_WRITEMASK_ZW:
+         sse_movups(cp->func, dst, old_dst);
+         sse_shufps(cp->func, dst, result, SHUF(X, Y, Z, W));
+         break;
+
+      case TGSI_WRITEMASK_YZW:
+         if (old_dst.file == file_XMM) {
+            sse_movups(cp->func, dst, result);
+            sse_movss(cp->func, dst, old_dst);
+         }
+         else {
+            struct x86_reg tmp = aos_get_xmm_reg(cp);
+            sse_movups(cp->func, dst, result);
+            sse_movss(cp->func, tmp, old_dst);
+            sse_movss(cp->func, dst, tmp);
+            aos_release_xmm_reg(cp, tmp.idx);
+         }
+         break;
+
+      /* emit_shuf_copy1 takes the single channel that the shuffle swaps
+       * with X from its first source argument and everything else from
+       * the second.
+       */
+      case TGSI_WRITEMASK_Y:
+         emit_shuf_copy1(cp, dst, result, old_dst, SHUF(Y,X,Z,W));
+         break;
+
+      case TGSI_WRITEMASK_Z:
+         emit_shuf_copy1(cp, dst, result, old_dst, SHUF(Z,Y,X,W));
+         break;
+
+      case TGSI_WRITEMASK_W:
+         emit_shuf_copy1(cp, dst, result, old_dst, SHUF(W,Y,Z,X));
+         break;
+
+      /* emit_shuf_copy2 takes the first two channels named in the
+       * shuffle from its second source argument and the remaining two
+       * from the first, so 'result' has to be passed last for the
+       * channels named in the writemask to come from it.
+       */
+      case TGSI_WRITEMASK_XZ:
+         emit_shuf_copy2(cp, dst, old_dst, result, SHUF(X,Z,Y,W));
+         break;
+
+      case TGSI_WRITEMASK_XW:
+         emit_shuf_copy2(cp, dst, old_dst, result, SHUF(X,W,Z,Y));
+         break;
+
+      case TGSI_WRITEMASK_YZ:
+         emit_shuf_copy2(cp, dst, old_dst, result, SHUF(Z,Y,X,W));
+         break;
+
+      case TGSI_WRITEMASK_YW:
+         emit_shuf_copy2(cp, dst, old_dst, result, SHUF(W,Y,Z,X));
+         break;
+
+      case TGSI_WRITEMASK_XZW:
+         emit_shuf_copy1(cp, dst, old_dst, result, SHUF(Y,X,Z,W));
+         break;
+
+      case TGSI_WRITEMASK_XYW:
+         emit_shuf_copy1(cp, dst, old_dst, result, SHUF(Z,Y,X,W));
+         break;
+
+      case TGSI_WRITEMASK_XYZ:
+         emit_shuf_copy1(cp, dst, old_dst, result, SHUF(W,Y,Z,X));
+         break;
+
+      default:
+         assert(0);             /* not possible */
+         break;
+      }
+   }
+}
+
+
+/* NOTE(review): the bodies of x87_fst_or_nop / x87_fstp_or_pop and the
+ * head of x87_fstp_dest4 were lost when this page was extracted.  They are
+ * reconstructed here from their call sites -- verify against the upstream
+ * file before relying on them.
+ */
+
+/* Store st(0) to the given channel of 'ptr' if that channel is enabled in
+ * 'writemask'.  The x87 stack is left unchanged either way.
+ */
+static void x87_fst_or_nop( struct x86_function *func,
+                            unsigned writemask,
+                            unsigned channel,
+                            struct x86_reg ptr )
+{
+   if (writemask & (1<<channel))
+      x87_fst( func, x86_make_disp(ptr, channel * sizeof(float)) );
+}
+
+/* Pop st(0), storing it to the given channel of 'ptr' first if that
+ * channel is enabled in 'writemask'.
+ */
+static void x87_fstp_or_pop( struct x86_function *func,
+                             unsigned writemask,
+                             unsigned channel,
+                             struct x86_reg ptr )
+{
+   if (writemask & (1<<channel))
+      x87_fstp( func, x86_make_disp(ptr, channel * sizeof(float)) );
+   else
+      x87_fstp( func, x86_make_reg(file_x87, 0) );
+}
+
+/* Write st(0) to every enabled channel of 'dst', then pop it.
+ */
+static void x87_fstp_dest4( struct aos_compilation *cp,
+                            const struct tgsi_full_dst_register *dst )
+{
+   struct x86_reg ptr = get_dst_ptr(cp, dst);
+   unsigned writemask = dst->DstRegister.WriteMask;
+
+   x87_fst_or_nop(cp->func, writemask, 0, ptr);
+   x87_fst_or_nop(cp->func, writemask, 1, ptr);
+   x87_fst_or_nop(cp->func,
writemask, 2, ptr); + x87_fstp_or_pop(cp->func, writemask, 3, ptr); +} + +/* Save current x87 state and put it into single precision mode. + */ +static void save_fpu_state( struct aos_compilation *cp ) +{ +#if 0 + x87_fnstcw( cp->func, x86_make_disp(regEDX, get_offset(m, &m->fpucntl_restore))); + x87_fldcw( cp->func, ); +#endif +} + +static void restore_fpu_state( struct aos_compilation *cp ) +{ +#if 0 + x87_fnclex(cp->func); + x87_fldcw(cp->func, x86_make_disp(regEDX, get_offset(m, &m->fpucntl_restore))); +#endif +} + +static void set_fpu_round_neg_inf( struct aos_compilation *cp ) +{ +#if 0 + if (cp->fpucntl != RND_NEG_FPU) { + struct x86_reg regEDX = x86_make_reg(file_REG32, reg_DX); + struct arb_vp_machine *m = NULL; + + cp->fpucntl = RND_NEG_FPU; + x87_fnclex(cp->func); + x87_fldcw(cp->func, x86_make_disp(regEDX, get_offset(m, &m->fpucntl_rnd_neg))); + } +#endif +} + +static void set_fpu_round_nearest( struct aos_compilation *cp ) +{ +#if 0 +#endif +} + + +static void emit_x87_ex2( struct aos_compilation *cp ) +{ + struct x86_reg st0 = x86_make_reg(file_x87, 0); + struct x86_reg st1 = x86_make_reg(file_x87, 1); + struct x86_reg st3 = x86_make_reg(file_x87, 3); + + set_fpu_round_neg_inf( cp ); + + x87_fld(cp->func, st0); /* a a */ + x87_fprndint( cp->func ); /* int(a) a */ + x87_fld(cp->func, st0); /* int(a) int(a) a */ + x87_fstp(cp->func, st3); /* int(a) a int(a)*/ + x87_fsubp(cp->func, st1); /* frac(a) int(a) */ + x87_f2xm1(cp->func); /* (2^frac(a))-1 int(a)*/ + x87_fld1(cp->func); /* 1 (2^frac(a))-1 int(a)*/ + x87_faddp(cp->func, st1); /* 2^frac(a) int(a) */ + x87_fscale(cp->func); /* 2^a */ +} + + + +/** + * The traditional instructions. All operate on internal registers + * and ignore write masks and swizzling issues. 
+ */ + +static boolean emit_ABS( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg dst = aos_get_xmm_reg(cp); + struct x86_reg neg = aos_get_internal(cp, IMM_NEGS); + + sse_movups(cp->func, dst, arg0); + sse_mulps(cp->func, dst, neg); + sse_maxps(cp->func, dst, arg0); + + store_dest(cp, &op->FullDstRegisters[0], dst); + return TRUE; +} + +static boolean emit_ADD( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); + struct x86_reg dst = aos_get_xmm_reg(cp); + + sse_movups(cp->func, dst, arg0); + sse_addps(cp->func, dst, arg1); + + store_dest(cp, &op->FullDstRegisters[0], dst); + return TRUE; +} + +static boolean emit_COS( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + x87_fld_src(cp, &op->FullSrcRegisters[0], 0); + x87_fcos(cp->func); + x87_fstp_dest4(cp, &op->FullDstRegisters[0]); + return TRUE; +} + + +/* The dotproduct instructions don't really do that well in sse: + */ +static boolean emit_DP3( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); + struct x86_reg dst = aos_get_xmm_reg(cp); + struct x86_reg tmp = aos_get_xmm_reg(cp); + + sse_movups(cp->func, dst, arg0); + sse_mulps(cp->func, dst, arg1); + + /* Now the hard bit: sum the first 3 values: + */ + sse_movhlps(cp->func, tmp, dst); + sse_addss(cp->func, dst, tmp); /* a*x+c*z, b*y, ?, ? 
*/ + emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z)); + sse_addss(cp->func, dst, tmp); + sse_shufps(cp->func, dst, dst, SHUF(X, X, X, X)); + + aos_release_xmm_reg(cp, tmp.idx); + store_dest(cp, &op->FullDstRegisters[0], dst); + return TRUE; +} + + + +static boolean emit_DP4( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); + struct x86_reg dst = aos_get_xmm_reg(cp); + struct x86_reg tmp = aos_get_xmm_reg(cp); + + sse_movups(cp->func, dst, arg0); + sse_mulps(cp->func, dst, arg1); + + /* Now the hard bit: sum the values: + */ + sse_movhlps(cp->func, tmp, dst); + sse_addps(cp->func, dst, tmp); /* a*x+c*z, b*y+d*w, a*x+c*z, b*y+d*w */ + emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z)); + sse_addss(cp->func, dst, tmp); + sse_shufps(cp->func, dst, dst, SHUF(X, X, X, X)); + + aos_release_xmm_reg(cp, tmp.idx); + store_dest(cp, &op->FullDstRegisters[0], dst); + return TRUE; +} + +static boolean emit_DPH( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); + struct x86_reg dst = aos_get_xmm_reg(cp); + struct x86_reg tmp = aos_get_xmm_reg(cp); + + sse_movups(cp->func, dst, arg0); + sse_mulps(cp->func, dst, arg1); + + /* Now the hard bit: sum the values (from DP3): + */ + sse_movhlps(cp->func, tmp, dst); + sse_addss(cp->func, dst, tmp); /* a*x+c*z, b*y, ?, ? 
*/ + emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z)); + sse_addss(cp->func, dst, tmp); + emit_pshufd(cp, tmp, arg1, SHUF(W,W,W,W)); + sse_addss(cp->func, dst, tmp); + sse_shufps(cp->func, dst, dst, SHUF(X, X, X, X)); + + aos_release_xmm_reg(cp, tmp.idx); + store_dest(cp, &op->FullDstRegisters[0], dst); + return TRUE; +} + +static boolean emit_DST( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); + struct x86_reg dst = aos_get_xmm_reg(cp); + struct x86_reg tmp = aos_get_xmm_reg(cp); + struct x86_reg ones = aos_get_internal(cp, IMM_ONES); + +/* dst[0] = 1.0 * 1.0F; */ +/* dst[1] = arg0[1] * arg1[1]; */ +/* dst[2] = arg0[2] * 1.0; */ +/* dst[3] = 1.0 * arg1[3]; */ + + emit_shuf_copy2(cp, dst, arg0, ones, SHUF(X,W,Z,Y)); + emit_shuf_copy2(cp, tmp, arg1, ones, SHUF(X,Z,Y,W)); + sse_mulps(cp->func, dst, tmp); + + aos_release_xmm_reg(cp, tmp.idx); + store_dest(cp, &op->FullDstRegisters[0], dst); + return TRUE; +} + +static boolean emit_LG2( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + x87_fld1(cp->func); /* 1 */ + x87_fld_src(cp, &op->FullSrcRegisters[0], 0); /* a0 1 */ + x87_fyl2x(cp->func); /* log2(a0) */ + x87_fstp_dest4(cp, &op->FullDstRegisters[0]); + return TRUE; +} + + +static boolean emit_EX2( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + x87_fld_src(cp, &op->FullSrcRegisters[0], 0); + + emit_x87_ex2(cp); + + x87_fstp_dest4(cp, &op->FullDstRegisters[0]); + return TRUE; +} + +static boolean emit_EXP( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]); + struct x86_reg st0 = x86_make_reg(file_x87, 0); + struct x86_reg st1 = x86_make_reg(file_x87, 1); + struct x86_reg st3 = x86_make_reg(file_x87, 3); + unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask; + + /* 
CAUTION: dst may alias arg0! + */ + x87_fld_src(cp, &op->FullSrcRegisters[0], 0); /* arg0.x */ + x87_fld(cp->func, st0); /* arg arg */ + + /* by default, fpu is setup to round-to-nearest. We want to + * change this now, and track the state through to the end of the + * generated function so that it isn't repeated unnecessarily. + * Alternately, could subtract .5 to get round to -inf behaviour. + */ + set_fpu_round_neg_inf( cp ); + x87_fprndint( cp->func ); /* flr(a) a */ + x87_fld(cp->func, st0); /* flr(a) flr(a) a */ + x87_fld1(cp->func); /* 1 floor(a) floor(a) a */ + x87_fst_or_nop(cp->func, writemask, 3, dst); /* stack unchanged */ + + x87_fscale(cp->func); /* 2^floor(a) floor(a) a */ + x87_fst(cp->func, st3); /* 2^floor(a) floor(a) a 2^floor(a)*/ + + x87_fstp_or_pop(cp->func, writemask, 0, dst); /* flr(a) a 2^flr(a) */ + + x87_fsubrp(cp->func, st1); /* frac(a) 2^flr(a) */ + + x87_fst_or_nop(cp->func, writemask, 1, dst); /* frac(a) 2^flr(a) */ + + x87_f2xm1(cp->func); /* (2^frac(a))-1 2^flr(a)*/ + x87_fld1(cp->func); /* 1 (2^frac(a))-1 2^flr(a)*/ + x87_faddp(cp->func, st1); /* 2^frac(a) 2^flr(a) */ + x87_fmulp(cp->func, st1); /* 2^a */ + + x87_fstp_or_pop(cp->func, writemask, 2, dst); + +/* dst[0] = 2^floor(tmp); */ +/* dst[1] = frac(tmp); */ +/* dst[2] = 2^floor(tmp) * 2^frac(tmp); */ +/* dst[3] = 1.0F; */ + return TRUE; +} + +static boolean emit_LOG( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]); + struct x86_reg st0 = x86_make_reg(file_x87, 0); + struct x86_reg st1 = x86_make_reg(file_x87, 1); + struct x86_reg st2 = x86_make_reg(file_x87, 2); + unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask; + + /* CAUTION: dst may alias arg0! 
+ */ + x87_fld_src(cp, &op->FullSrcRegisters[0], 0); /* arg0.x */ + x87_fabs(cp->func); /* |arg0.x| */ + x87_fxtract(cp->func); /* mantissa(arg0.x), exponent(arg0.x) */ + x87_fst(cp->func, st2); /* mantissa, exponent, mantissa */ + x87_fld1(cp->func); /* 1, mantissa, exponent, mantissa */ + x87_fyl2x(cp->func); /* log2(mantissa), exponent, mantissa */ + x87_fadd(cp->func, st0, st1); /* e+l2(m), e, m */ + + x87_fstp_or_pop(cp->func, writemask, 2, dst); /* e, m */ + + x87_fld1(cp->func); /* 1, e, m */ + x87_fsub(cp->func, st1, st0); /* 1, e-1, m */ + + x87_fstp_or_pop(cp->func, writemask, 3, dst); /* e-1,m */ + x87_fstp_or_pop(cp->func, writemask, 0, dst); /* m */ + + x87_fadd(cp->func, st0, st0); /* 2m */ + + x87_fstp_or_pop( cp->func, writemask, 1, dst ); + + return TRUE; +} + +static boolean emit_FLR( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]); + unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask; + int i; + + set_fpu_round_neg_inf( cp ); + + /* Load all sources first to avoid aliasing + */ + for (i = 0; i < 4; i++) { + if (writemask & (1<FullSrcRegisters[0], i); + } + } + + for (i = 0; i < 4; i++) { + if (writemask & (1<func ); + x87_fstp(cp->func, x86_make_disp(dst, i*4)); + } + } + + return TRUE; +} + + +static boolean emit_RND( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]); + unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask; + int i; + + set_fpu_round_nearest( cp ); + + /* Load all sources first to avoid aliasing + */ + for (i = 0; i < 4; i++) { + if (writemask & (1<FullSrcRegisters[0], i); + } + } + + for (i = 0; i < 4; i++) { + if (writemask & (1<func ); + x87_fstp(cp->func, x86_make_disp(dst, i*4)); + } + } + + return TRUE; +} + + +static boolean emit_FRC( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct 
x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]); + struct x86_reg st0 = x86_make_reg(file_x87, 0); + struct x86_reg st1 = x86_make_reg(file_x87, 1); + unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask; + int i; + + set_fpu_round_neg_inf( cp ); + + /* suck all the source values onto the stack before writing out any + * dst, which may alias... + */ + for (i = 0; i < 4; i++) { + if (writemask & (1<FullSrcRegisters[0], i); + } + } + + for (i = 0; i < 4; i++) { + if (writemask & (1<func, st0); /* a a */ + x87_fprndint( cp->func ); /* flr(a) a */ + x87_fsubrp(cp->func, st1); /* frc(a) */ + x87_fstp(cp->func, x86_make_disp(dst, i*4)); + } + } + + return TRUE; +} + + + +static boolean emit_LIT( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]); + struct x86_reg st1 = x86_make_reg(file_x87, 1); + unsigned fixup1, fixup2; + unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask; + + + /* Load the interesting parts of arg0: + */ + x87_fld_src(cp, &op->FullSrcRegisters[0], 3); + x87_fld_src(cp, &op->FullSrcRegisters[0], 1); + x87_fld_src(cp, &op->FullSrcRegisters[0], 0); + + + if (writemask & TGSI_WRITEMASK_XW) { + x87_fld1(cp->func); + x87_fst_or_nop(cp->func, writemask, 0, dst); + x87_fstp_or_pop(cp->func, writemask, 3, dst); + } + + if (writemask & TGSI_WRITEMASK_YZ) { + + /* Pre-zero destinations, may be overwritten later... fixme. 
+ */ + x87_fldz(cp->func); + x87_fst_or_nop(cp->func, writemask, 1, dst); + x87_fstp_or_pop(cp->func, writemask, 2, dst); + + + /* Check arg0[0]: + */ + x87_fldz(cp->func); /* 0 a0 a1 a3 */ + x87_fucomp(cp->func, st1); /* a0 a1 a3 */ + x87_fnstsw(cp->func, cp->tmp_EAX); + x86_sahf(cp->func); + fixup1 = x86_jcc_forward(cp->func, cc_AE); + + x87_fstp_or_pop(cp->func, writemask, 1, dst); /* a1 a3 */ + + /* Check arg0[1]: + */ + x87_fldz(cp->func); /* 0 a1 a3 */ + x87_fucomp(cp->func, st1); /* a1 a3 */ + x87_fnstsw(cp->func, cp->tmp_EAX); + x86_sahf(cp->func); + fixup2 = x86_jcc_forward(cp->func, cc_AE); + + /* Compute pow(a1, a3) + */ + x87_fyl2x(cp->func); /* a3*log2(a1) */ + + emit_x87_ex2( cp ); /* 2^(a3*log2(a1)) */ + + x87_fstp_or_pop(cp->func, writemask, 2, dst); + + /* Land jumps: + */ + x86_fixup_fwd_jump(cp->func, fixup1); + x86_fixup_fwd_jump(cp->func, fixup2); + } + + return TRUE; +} + + + +static boolean emit_MAX( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); + struct x86_reg dst = aos_get_xmm_reg(cp); + + sse_movups(cp->func, dst, arg0); + sse_maxps(cp->func, dst, arg1); + + store_dest(cp, &op->FullDstRegisters[0], dst); + return TRUE; +} + + +static boolean emit_MIN( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); + struct x86_reg dst = aos_get_xmm_reg(cp); + + sse_movups(cp->func, dst, arg0); + sse_minps(cp->func, dst, arg1); + + store_dest(cp, &op->FullDstRegisters[0], dst); + return TRUE; +} + +static boolean emit_MOV( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg dst = aos_get_xmm_reg(cp); + + sse_movups(cp->func, dst, arg0); + + 
store_dest(cp, &op->FullDstRegisters[0], dst); + return TRUE; +} + +static boolean emit_MUL( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); + struct x86_reg dst = aos_get_xmm_reg(cp); + + sse_movups(cp->func, dst, arg0); + sse_mulps(cp->func, dst, arg1); + + store_dest(cp, &op->FullDstRegisters[0], dst); + return TRUE; +} + + +static boolean emit_MAD( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); + struct x86_reg arg2 = fetch_src(cp, &op->FullSrcRegisters[2]); + struct x86_reg dst = aos_get_xmm_reg(cp); + + sse_movups(cp->func, dst, arg0); + sse_mulps(cp->func, dst, arg1); + sse_addps(cp->func, dst, arg2); + + store_dest(cp, &op->FullDstRegisters[0], dst); + return TRUE; +} + + +static boolean emit_POW( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + x87_fld_src(cp, &op->FullSrcRegisters[1], 0); /* a1.x */ + x87_fld_src(cp, &op->FullSrcRegisters[0], 0); /* a0.x a1.x */ + x87_fyl2x(cp->func); /* a1*log2(a0) */ + + emit_x87_ex2( cp ); /* 2^(a1*log2(a0)) */ + + x87_fstp_dest4(cp, &op->FullDstRegisters[0]); + return TRUE; +} + + +static boolean emit_RCP( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg dst = aos_get_xmm_reg(cp); + + if (cp->have_sse2) { + sse2_rcpss(cp->func, dst, arg0); + /* extend precision here... 
+ */ + } + else { + struct x86_reg ones = aos_get_internal(cp, IMM_ONES); + sse_movss(cp->func, dst, ones); + sse_divss(cp->func, dst, arg0); + } + + sse_shufps(cp->func, dst, dst, SHUF(X, X, X, X)); + + store_dest(cp, &op->FullDstRegisters[0], dst); + return TRUE; +} + +static boolean emit_RSQ( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg dst = aos_get_xmm_reg(cp); + + sse_rsqrtss(cp->func, dst, arg0); + + /* Extend precision here... + */ + + sse_shufps(cp->func, dst, dst, SHUF(X, X, X, X)); + + store_dest(cp, &op->FullDstRegisters[0], dst); + return TRUE; +} + + +static boolean emit_SGE( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); + struct x86_reg dst = aos_get_xmm_reg(cp); + struct x86_reg ones = aos_get_internal(cp, IMM_ONES); + + sse_movups(cp->func, dst, arg0); + sse_cmpps(cp->func, dst, arg1, cc_NotLessThan); + sse_andps(cp->func, dst, ones); + + store_dest(cp, &op->FullDstRegisters[0], dst); + return TRUE; +} + +static boolean emit_SIN( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + x87_fld_src(cp, &op->FullSrcRegisters[0], 0); + x87_fsin(cp->func); + x87_fstp_dest4(cp, &op->FullDstRegisters[0]); + return TRUE; +} + + + +static boolean emit_SLT( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); + struct x86_reg dst = aos_get_xmm_reg(cp); + struct x86_reg ones = aos_get_internal(cp, IMM_ONES); + + sse_movups(cp->func, dst, arg0); + sse_cmpps(cp->func, dst, arg1, cc_LessThan); + sse_andps(cp->func, dst, ones); + + store_dest(cp, &op->FullDstRegisters[0], dst); + return TRUE; +} + +static boolean emit_SUB( struct 
aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); + struct x86_reg dst = aos_get_xmm_reg(cp); + + sse_movups(cp->func, dst, arg0); + sse_subps(cp->func, dst, arg1); + + store_dest(cp, &op->FullDstRegisters[0], dst); + return TRUE; +} + + +static boolean emit_XPD( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); + struct x86_reg dst = aos_get_xmm_reg(cp); + struct x86_reg tmp0 = aos_get_xmm_reg(cp); + struct x86_reg tmp1 = aos_get_xmm_reg(cp); + + /* Could avoid tmp0, tmp1 if we overwrote arg0, arg1. Need a way + * to invalidate registers. This will come with better analysis + * (liveness analysis) of the incoming program. + */ + emit_pshufd(cp, dst, arg0, SHUF(Y, Z, X, W)); + emit_pshufd(cp, tmp1, arg1, SHUF(Z, X, Y, W)); + sse_mulps(cp->func, dst, tmp1); + emit_pshufd(cp, tmp0, arg0, SHUF(Z, X, Y, W)); + emit_pshufd(cp, tmp1, arg1, SHUF(Y, Z, X, W)); + sse_mulps(cp->func, tmp0, tmp1); + sse_subps(cp->func, dst, tmp0); + +/* dst[0] = arg0[1] * arg1[2] - arg0[2] * arg1[1]; */ +/* dst[1] = arg0[2] * arg1[0] - arg0[0] * arg1[2]; */ +/* dst[2] = arg0[0] * arg1[1] - arg0[1] * arg1[0]; */ +/* dst[3] is undef */ + + + aos_release_xmm_reg(cp, tmp0.idx); + aos_release_xmm_reg(cp, tmp1.idx); + store_dest(cp, &op->FullDstRegisters[0], dst); + return TRUE; +} + + + +static boolean +emit_instruction( struct aos_compilation *cp, + struct tgsi_full_instruction *inst ) +{ + switch( inst->Instruction.Opcode ) { + case TGSI_OPCODE_MOV: + return emit_MOV( cp, inst ); + + case TGSI_OPCODE_LIT: + return emit_LIT(cp, inst); + + case TGSI_OPCODE_RCP: + return emit_RCP(cp, inst); + + case TGSI_OPCODE_RSQ: + return emit_RSQ(cp, inst); + + case TGSI_OPCODE_EXP: + return emit_EXP(cp, inst); + + case 
TGSI_OPCODE_LOG: + return emit_LOG(cp, inst); + + case TGSI_OPCODE_MUL: + return emit_MUL(cp, inst); + + case TGSI_OPCODE_ADD: + return emit_ADD(cp, inst); + + case TGSI_OPCODE_DP3: + return emit_DP3(cp, inst); + + case TGSI_OPCODE_DP4: + return emit_DP4(cp, inst); + + case TGSI_OPCODE_DST: + return emit_DST(cp, inst); + + case TGSI_OPCODE_MIN: + return emit_MIN(cp, inst); + + case TGSI_OPCODE_MAX: + return emit_MAX(cp, inst); + + case TGSI_OPCODE_SLT: + return emit_SLT(cp, inst); + + case TGSI_OPCODE_SGE: + return emit_SGE(cp, inst); + + case TGSI_OPCODE_MAD: + return emit_MAD(cp, inst); + + case TGSI_OPCODE_SUB: + return emit_SUB(cp, inst); + + case TGSI_OPCODE_LERP: +// return emit_LERP(cp, inst); + return FALSE; + + case TGSI_OPCODE_FRAC: + return emit_FRC(cp, inst); + + case TGSI_OPCODE_CLAMP: +// return emit_CLAMP(cp, inst); + return FALSE; + + case TGSI_OPCODE_FLOOR: + return emit_FLR(cp, inst); + + case TGSI_OPCODE_ROUND: + return emit_RND(cp, inst); + + case TGSI_OPCODE_EXPBASE2: + return emit_EX2(cp, inst); + + case TGSI_OPCODE_LOGBASE2: + return emit_LG2(cp, inst); + + case TGSI_OPCODE_POWER: + return emit_POW(cp, inst); + + case TGSI_OPCODE_CROSSPRODUCT: + return emit_XPD(cp, inst); + + case TGSI_OPCODE_ABS: + return emit_ABS(cp, inst); + + case TGSI_OPCODE_DPH: + return emit_DPH(cp, inst); + + case TGSI_OPCODE_COS: + return emit_COS(cp, inst); + + case TGSI_OPCODE_SIN: + return emit_SIN(cp, inst); + + case TGSI_OPCODE_END: + return TRUE; + + default: + return FALSE; + } +} + +static boolean note_immediate( struct aos_compilation *cp, + struct tgsi_full_immediate *imm ) +{ + unsigned pos = cp->num_immediates++; + unsigned j; + + for (j = 0; j < imm->Immediate.Size; j++) { + cp->vaos->machine->immediate[pos][j] = imm->u.ImmediateFloat32[j].Float; + } + + return TRUE; +} + + + + +static void find_last_write_outputs( struct aos_compilation *cp ) +{ + struct tgsi_parse_context parse; + unsigned this_instruction = 0; + unsigned i; + + tgsi_parse_init( 
&parse, cp->vaos->base.vs->state.tokens ); + + while (!tgsi_parse_end_of_tokens( &parse )) { + + tgsi_parse_token( &parse ); + + if (parse.FullToken.Token.Type != TGSI_TOKEN_TYPE_INSTRUCTION) + continue; + + for (i = 0; i < TGSI_FULL_MAX_DST_REGISTERS; i++) { + if (parse.FullToken.FullInstruction.FullDstRegisters[i].DstRegister.File == + TGSI_FILE_OUTPUT) + { + unsigned idx = parse.FullToken.FullInstruction.FullDstRegisters[i].DstRegister.Index; + cp->output_last_write[idx] = this_instruction; + } + } + + this_instruction++; + } + + tgsi_parse_free( &parse ); +} + + +#define ARG_VARIENT 1 +#define ARG_START_ELTS 2 +#define ARG_COUNT 3 +#define ARG_OUTBUF 4 + + +static boolean build_vertex_program( struct draw_vs_varient_aos_sse *varient, + boolean linear ) +{ + struct tgsi_parse_context parse; + struct aos_compilation cp; + unsigned fixup, label; + + tgsi_parse_init( &parse, varient->base.vs->state.tokens ); + + memset(&cp, 0, sizeof(cp)); + + cp.insn_counter = 1; + cp.vaos = varient; + cp.have_sse2 = 1; + cp.func = &varient->func[ linear ? 0 : 1 ]; + + cp.tmp_EAX = x86_make_reg(file_REG32, reg_AX); + cp.idx_EBX = x86_make_reg(file_REG32, reg_BX); + cp.outbuf_ECX = x86_make_reg(file_REG32, reg_CX); + cp.machine_EDX = x86_make_reg(file_REG32, reg_DX); + cp.count_ESI = x86_make_reg(file_REG32, reg_SI); + + x86_init_func(cp.func); + + find_last_write_outputs(&cp); + + x86_push(cp.func, cp.idx_EBX); + x86_push(cp.func, cp.count_ESI); + + + /* Load arguments into regs: + */ + x86_mov(cp.func, cp.machine_EDX, x86_fn_arg(cp.func, ARG_VARIENT)); + x86_mov(cp.func, cp.idx_EBX, x86_fn_arg(cp.func, ARG_START_ELTS)); + x86_mov(cp.func, cp.count_ESI, x86_fn_arg(cp.func, ARG_COUNT)); + x86_mov(cp.func, cp.outbuf_ECX, x86_fn_arg(cp.func, ARG_OUTBUF)); + + + /* Compare count to zero and possibly bail. 
+ */ + x86_xor(cp.func, cp.tmp_EAX, cp.tmp_EAX); + x86_cmp(cp.func, cp.count_ESI, cp.tmp_EAX); + fixup = x86_jcc_forward(cp.func, cc_E); + + /* Dig out the machine pointer from inside the varient arg + */ + x86_mov(cp.func, cp.machine_EDX, + x86_make_disp(cp.machine_EDX, + Offset( struct draw_vs_varient_aos_sse, machine ))); + + save_fpu_state( &cp ); + + /* Note address for loop jump + */ + label = x86_get_label(cp.func); + { + /* Fetch inputs... TODO: fetch lazily... + */ + if (!aos_fetch_inputs( &cp, linear )) + goto fail; + + /* Emit the shader: + */ + while( !tgsi_parse_end_of_tokens( &parse ) && !cp.error ) + { + tgsi_parse_token( &parse ); + + switch (parse.FullToken.Token.Type) { + case TGSI_TOKEN_TYPE_IMMEDIATE: + if (!note_immediate( &cp, &parse.FullToken.FullImmediate )) + goto fail; + break; + + case TGSI_TOKEN_TYPE_INSTRUCTION: + if (!emit_instruction( &cp, &parse.FullToken.FullInstruction )) + goto fail; + break; + } + + cp.insn_counter++; + debug_printf("\n"); + } + + if (cp.error) + goto fail; + + /* Emit output... TODO: do this eagerly after the last write to a + * given output. + */ + if (!aos_emit_outputs( &cp )) + goto fail; + + + /* Next vertex: + */ + x86_lea(cp.func, + cp.outbuf_ECX, + x86_make_disp(cp.outbuf_ECX, + cp.vaos->base.key.output_stride)); + + /* Incr index + */ + if (linear) { + x86_inc(cp.func, cp.idx_EBX); + } + else { + x86_lea(cp.func, cp.idx_EBX, x86_make_disp(cp.idx_EBX, 4)); + } + + } + /* decr count, loop if not zero + */ + x86_dec(cp.func, cp.count_ESI); +/* x86_test(cp.func, cp.count_ESI, cp.count_ESI); */ + x86_jcc(cp.func, cc_NZ, label); + + restore_fpu_state(&cp); + + /* Land forward jump here: + */ + x86_fixup_fwd_jump(cp.func, fixup); + + /* Exit mmx state? 
+ */ + if (cp.func->need_emms) + mmx_emms(cp.func); + + x86_pop(cp.func, cp.count_ESI); + x86_pop(cp.func, cp.idx_EBX); + + x86_ret(cp.func); + + tgsi_parse_free( &parse ); + return !cp.error; + + fail: + tgsi_parse_free( &parse ); + return FALSE; +} + + + +static void vaos_set_buffer( struct draw_vs_varient *varient, + unsigned buf, + const void *ptr, + unsigned stride ) +{ + struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient; + unsigned i; + + for (i = 0; i < vaos->base.vs->info.num_inputs; i++) { + if (vaos->base.key.element[i].in.buffer == buf) { + vaos->machine->attrib[i].input_ptr = ((char *)ptr + + vaos->base.key.element[i].in.offset); + vaos->machine->attrib[i].input_stride = stride; + } + } +} + + +static void vaos_destroy( struct draw_vs_varient *varient ) +{ + struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient; + + if (vaos->machine) + align_free( vaos->machine ); + + x86_release_func( &vaos->func[0] ); + x86_release_func( &vaos->func[1] ); + + FREE(vaos); +} + +static void vaos_run_elts( struct draw_vs_varient *varient, + const unsigned *elts, + unsigned count, + void *output_buffer ) +{ + struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient; + + vaos->gen_run_elts( varient, + elts, + count, + output_buffer ); +} + +static void vaos_run_linear( struct draw_vs_varient *varient, + unsigned start, + unsigned count, + void *output_buffer ) +{ + struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient; + + vaos->gen_run_linear( varient, + start, + count, + output_buffer ); +} + + +static void vaos_set_constants( struct draw_vs_varient *varient, + const float (*constants)[4] ) +{ + struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient; + + memcpy(vaos->machine->constant, + constants, + (vaos->base.vs->info.file_max[TGSI_FILE_CONSTANT] + 1) * 4 * sizeof(float)); +} + + +static struct draw_vs_varient *varient_aos_sse( struct 
draw_vertex_shader *vs, + const struct draw_vs_varient_key *key ) +{ + struct draw_vs_varient_aos_sse *vaos = CALLOC_STRUCT(draw_vs_varient_aos_sse); + + if (!vaos) + goto fail; + + vaos->base.key = *key; + vaos->base.vs = vs; + vaos->base.set_input = vaos_set_buffer; + vaos->base.set_constants = vaos_set_constants; + vaos->base.destroy = vaos_destroy; + vaos->base.run_linear = vaos_run_linear; + vaos->base.run_elts = vaos_run_elts; + + vaos->machine = align_malloc( sizeof(struct aos_machine), 16 ); + if (!vaos->machine) + goto fail; + + memset(vaos->machine, 0, sizeof(struct aos_machine)); + + tgsi_dump(vs->state.tokens, 0); + + if (!build_vertex_program( vaos, TRUE )) + goto fail; + + if (!build_vertex_program( vaos, FALSE )) + goto fail; + + vaos->gen_run_linear = (vsv_run_linear_func)x86_get_func(&vaos->func[0]); + if (!vaos->gen_run_linear) + goto fail; + + vaos->gen_run_elts = (vsv_run_elts_func)x86_get_func(&vaos->func[1]); + if (!vaos->gen_run_elts) + goto fail; + + return &vaos->base; + + fail: + if (vaos->machine) + align_free( vaos->machine ); + + if (vaos) + x86_release_func( &vaos->func[0] ); + + if (vaos) + x86_release_func( &vaos->func[1] ); + + FREE(vaos); + + return NULL; +} + + +struct draw_vs_varient *draw_vs_varient_aos_sse( struct draw_vertex_shader *vs, + const struct draw_vs_varient_key *key ) +{ + struct draw_vs_varient *varient = varient_aos_sse( vs, key ); + + if (varient == NULL) { + assert(0); + varient = draw_vs_varient_generic( vs, key ); + } + + return varient; +} + + + +#endif -- cgit v1.2.3 From 889473b3f5a216bd753c357974d6bae29fe3c41d Mon Sep 17 00:00:00 2001 From: Keith Whitwell Date: Wed, 21 May 2008 20:28:56 +0100 Subject: draw: add viewport to varient state --- .../auxiliary/draw/draw_pt_fetch_shade_emit.c | 9 +++- src/gallium/auxiliary/draw/draw_vs.h | 8 +++- src/gallium/auxiliary/draw/draw_vs_aos.c | 50 ++++++++++++++++++++++ src/gallium/auxiliary/draw/draw_vs_aos.h | 9 +++- src/gallium/auxiliary/draw/draw_vs_sse.c | 4 +- 
src/gallium/auxiliary/draw/draw_vs_varient.c | 10 +++++ 6 files changed, 84 insertions(+), 6 deletions(-) (limited to 'src/gallium/auxiliary/draw/draw_vs_aos.c') diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c index 74945dcfe9..984fbb6767 100644 --- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c +++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c @@ -95,10 +95,14 @@ static void fse_prepare( struct draw_pt_middle_end *middle, + fse->key.output_stride = vinfo->size * 4; fse->key.nr_elements = MAX2(num_vs_outputs, /* outputs - translate to hw format */ num_vs_inputs); /* inputs - fetch from api format */ - fse->key.output_stride = vinfo->size * 4; + fse->key.viewport = 1; + fse->key.clip = 0; + fse->key.pad = 0; + memset(fse->key.element, 0, fse->key.nr_elements * sizeof(fse->key.element[0])); @@ -211,6 +215,9 @@ static void fse_prepare( struct draw_pt_middle_end *middle, fse->active->set_constants( fse->active, (const float (*)[4])draw->pt.user.constants ); + fse->active->set_viewport( fse->active, + &draw->viewport ); + //return TRUE; } diff --git a/src/gallium/auxiliary/draw/draw_vs.h b/src/gallium/auxiliary/draw/draw_vs.h index 5a8d0da06d..ff3e19b2a8 100644 --- a/src/gallium/auxiliary/draw/draw_vs.h +++ b/src/gallium/auxiliary/draw/draw_vs.h @@ -58,7 +58,10 @@ struct draw_vs_element { struct draw_vs_varient_key { unsigned output_stride; - unsigned nr_elements; + unsigned nr_elements:16; + unsigned viewport:1; + unsigned clip:1; + unsigned pad:14; struct draw_vs_element element[PIPE_MAX_ATTRIBS]; }; @@ -88,6 +91,9 @@ struct draw_vs_varient { void (*set_constants)( struct draw_vs_varient *, const float (*constants)[4] ); + void (*set_viewport)( struct draw_vs_varient *, + const struct pipe_viewport_state * ); + void (PIPE_CDECL *run_linear)( struct draw_vs_varient *shader, unsigned start, unsigned count, diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c 
b/src/gallium/auxiliary/draw/draw_vs_aos.c index 620f5e3592..b8e66e8b78 100644 --- a/src/gallium/auxiliary/draw/draw_vs_aos.c +++ b/src/gallium/auxiliary/draw/draw_vs_aos.c @@ -1401,6 +1401,37 @@ emit_instruction( struct aos_compilation *cp, } } + +static boolean emit_viewport( struct aos_compilation *cp ) +{ + struct x86_reg pos = aos_get_shader_reg(cp, + TGSI_FILE_OUTPUT, + 0); + + struct x86_reg scale = x86_make_disp(cp->machine_EDX, + Offset(struct aos_machine, scale)); + + struct x86_reg translate = x86_make_disp(cp->machine_EDX, + Offset(struct aos_machine, translate)); + + if (pos.file != file_XMM) { + struct x86_reg dst = aos_get_xmm_reg(cp); + sse_movups(cp->func, dst, pos); + pos = dst; + } + + sse_mulps(cp->func, pos, scale); + sse_addps(cp->func, pos, translate); + + aos_adopt_xmm_reg( cp, + pos, + TGSI_FILE_OUTPUT, + 0, + TRUE ); + return TRUE; +} + + static boolean note_immediate( struct aos_compilation *cp, struct tgsi_full_immediate *imm ) { @@ -1540,6 +1571,10 @@ static boolean build_vertex_program( struct draw_vs_varient_aos_sse *varient, if (cp.error) goto fail; + if (cp.vaos->base.key.viewport) { + emit_viewport(&cp); + } + /* Emit output... TODO: do this eagerly after the last write to a * given output. 
*/ @@ -1665,11 +1700,25 @@ static void vaos_set_constants( struct draw_vs_varient *varient, } +static void vaos_set_viewport( struct draw_vs_varient *varient, + const struct pipe_viewport_state *viewport ) +{ + struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient; + + memcpy(vaos->machine->scale, viewport->scale, 4 * sizeof(float)); + memcpy(vaos->machine->translate, viewport->translate, 4 * sizeof(float)); +} + + + static struct draw_vs_varient *varient_aos_sse( struct draw_vertex_shader *vs, const struct draw_vs_varient_key *key ) { struct draw_vs_varient_aos_sse *vaos = CALLOC_STRUCT(draw_vs_varient_aos_sse); + if (key->clip) + return NULL; + if (!vaos) goto fail; @@ -1677,6 +1726,7 @@ static struct draw_vs_varient *varient_aos_sse( struct draw_vertex_shader *vs, vaos->base.vs = vs; vaos->base.set_input = vaos_set_buffer; vaos->base.set_constants = vaos_set_constants; + vaos->base.set_viewport = vaos_set_viewport; vaos->base.destroy = vaos_destroy; vaos->base.run_linear = vaos_run_linear; vaos->base.run_elts = vaos_run_elts; diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.h b/src/gallium/auxiliary/draw/draw_vs_aos.h index 1d8a055a90..16fef6451c 100644 --- a/src/gallium/auxiliary/draw/draw_vs_aos.h +++ b/src/gallium/auxiliary/draw/draw_vs_aos.h @@ -68,8 +68,13 @@ struct aos_machine { float immediate[MAX_IMMEDIATES][4]; /* fixme -- should just be a pointer */ float internal [MAX_INTERNALS ][4]; - unsigned fpu_round_nearest; - unsigned fpu_round_neg_inf; + float scale[4]; /* viewport */ + float translate[4]; /* viewport */ + + ushort fpu_round_nearest; + ushort fpu_round_neg_inf; + ushort fpu_restore; + ushort fpucntl; /* one of FPU_* above */ struct { const void *input_ptr; diff --git a/src/gallium/auxiliary/draw/draw_vs_sse.c b/src/gallium/auxiliary/draw/draw_vs_sse.c index 0581c3042f..7781782ae8 100644 --- a/src/gallium/auxiliary/draw/draw_vs_sse.c +++ b/src/gallium/auxiliary/draw/draw_vs_sse.c @@ -156,8 +156,8 @@ 
draw_create_vs_sse(struct draw_context *draw, tgsi_scan_shader(templ->tokens, &vs->base.info); vs->base.draw = draw; - vs->base.create_varient = draw_vs_varient_generic; -// vs->base.create_varient = draw_vs_varient_aos_sse; + vs->base.create_varient = draw_vs_varient_aos_sse; +// vs->base.create_varient = draw_vs_varient_generic; vs->base.prepare = vs_sse_prepare; vs->base.run_linear = vs_sse_run_linear; vs->base.delete = vs_sse_delete; diff --git a/src/gallium/auxiliary/draw/draw_vs_varient.c b/src/gallium/auxiliary/draw/draw_vs_varient.c index d27b0f6187..f6f621a748 100644 --- a/src/gallium/auxiliary/draw/draw_vs_varient.c +++ b/src/gallium/auxiliary/draw/draw_vs_varient.c @@ -167,6 +167,12 @@ static void vsvg_run_linear( struct draw_vs_varient *varient, + +static void vsvg_set_viewport( struct draw_vs_varient *varient, + const struct pipe_viewport_state *viewport ) +{ +} + static void vsvg_destroy( struct draw_vs_varient *varient ) { FREE(varient); @@ -179,6 +185,9 @@ struct draw_vs_varient *draw_vs_varient_generic( struct draw_vertex_shader *vs, unsigned i; struct translate_key fetch, emit; + if (key->viewport || key->clip) + return NULL; + struct draw_vs_varient_generic *vsvg = CALLOC_STRUCT( draw_vs_varient_generic ); if (vsvg == NULL) return NULL; @@ -187,6 +196,7 @@ struct draw_vs_varient *draw_vs_varient_generic( struct draw_vertex_shader *vs, vsvg->base.vs = vs; vsvg->base.set_input = vsvg_set_input; vsvg->base.set_constants = vsvg_set_constants; + vsvg->base.set_viewport = vsvg_set_viewport; vsvg->base.run_elts = vsvg_run_elts; vsvg->base.run_linear = vsvg_run_linear; vsvg->base.destroy = vsvg_destroy; -- cgit v1.2.3 From 194a7be28f6eed502f2475d9a637cb3610ca75f6 Mon Sep 17 00:00:00 2001 From: Keith Whitwell Date: Wed, 21 May 2008 20:31:08 +0100 Subject: draw: fix vs aos internal/machine state --- src/gallium/auxiliary/draw/draw_vs_aos.c | 59 ++++++++++++++++++++++++++++++-- src/gallium/auxiliary/draw/draw_vs_aos.h | 9 +++-- 2 files changed, 63 
insertions(+), 5 deletions(-) (limited to 'src/gallium/auxiliary/draw/draw_vs_aos.c') diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c index b8e66e8b78..67761f881d 100644 --- a/src/gallium/auxiliary/draw/draw_vs_aos.c +++ b/src/gallium/auxiliary/draw/draw_vs_aos.c @@ -83,7 +83,7 @@ static struct x86_reg get_reg_ptr(struct aos_compilation *cp, return x86_make_disp(ptr, Offset(struct aos_machine, constant[idx])); case AOS_FILE_INTERNAL: - return x86_make_disp(ptr, Offset(struct aos_machine, immediate[idx])); + return x86_make_disp(ptr, Offset(struct aos_machine, internal[idx])); default: ERROR(cp, "unknown reg file"); @@ -97,9 +97,63 @@ struct x86_reg aos_get_internal( struct aos_compilation *cp, { return get_reg_ptr( cp, AOS_FILE_INTERNAL, - imm + 1 ); + imm ); +} + +#define X87_CW_EXCEPTION_INV_OP (1<<0) +#define X87_CW_EXCEPTION_DENORM_OP (1<<1) +#define X87_CW_EXCEPTION_ZERO_DIVIDE (1<<2) +#define X87_CW_EXCEPTION_OVERFLOW (1<<3) +#define X87_CW_EXCEPTION_UNDERFLOW (1<<4) +#define X87_CW_EXCEPTION_PRECISION (1<<5) +#define X87_CW_PRECISION_SINGLE (0<<8) +#define X87_CW_PRECISION_RESERVED (1<<8) +#define X87_CW_PRECISION_DOUBLE (2<<8) +#define X87_CW_PRECISION_DOUBLE_EXT (3<<8) +#define X87_CW_PRECISION_MASK (3<<8) +#define X87_CW_ROUND_NEAREST (0<<10) +#define X87_CW_ROUND_DOWN (1<<10) +#define X87_CW_ROUND_UP (2<<10) +#define X87_CW_ROUND_ZERO (3<<10) +#define X87_CW_ROUND_MASK (3<<10) +#define X87_CW_INFINITY (1<<12) + +static void init_internals( struct aos_machine *machine ) +{ + float inv = 1.0f/255.0f; + float f255 = 255.0f; + + ASSIGN_4V(machine->internal[IMM_ONES], 1.0f, 1.0f, 1.0f, 1.0f); + ASSIGN_4V(machine->internal[IMM_NEGS], -1.0f, -1.0f, -1.0f, -1.0f); + ASSIGN_4V(machine->internal[IMM_IDENTITY], 0.0f, 0.0f, 0.0f, 1.0f); + ASSIGN_4V(machine->internal[IMM_INV_255], inv, inv, inv, inv); + ASSIGN_4V(machine->internal[IMM_255], f255, f255, f255, f255); + + + machine->fpu_rnd_nearest = 
(X87_CW_EXCEPTION_INV_OP | + X87_CW_EXCEPTION_DENORM_OP | + X87_CW_EXCEPTION_ZERO_DIVIDE | + X87_CW_EXCEPTION_OVERFLOW | + X87_CW_EXCEPTION_UNDERFLOW | + X87_CW_EXCEPTION_PRECISION | + (1<<6) | + X87_CW_ROUND_NEAREST | + X87_CW_PRECISION_DOUBLE_EXT); + + assert(machine->fpu_rnd_nearest == 0x37f); + + machine->fpu_rnd_neg_inf = (X87_CW_EXCEPTION_INV_OP | + X87_CW_EXCEPTION_DENORM_OP | + X87_CW_EXCEPTION_ZERO_DIVIDE | + X87_CW_EXCEPTION_OVERFLOW | + X87_CW_EXCEPTION_UNDERFLOW | + X87_CW_EXCEPTION_PRECISION | + (1<<6) | + X87_CW_ROUND_DOWN | + X87_CW_PRECISION_DOUBLE_EXT); } + static void spill( struct aos_compilation *cp, unsigned idx ) { if (!cp->xmm[idx].dirty || @@ -1736,6 +1790,7 @@ static struct draw_vs_varient *varient_aos_sse( struct draw_vertex_shader *vs, goto fail; memset(vaos->machine, 0, sizeof(struct aos_machine)); + init_internals(vaos->machine); tgsi_dump(vs->state.tokens, 0); diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.h b/src/gallium/auxiliary/draw/draw_vs_aos.h index 16fef6451c..c2afd4e9a0 100644 --- a/src/gallium/auxiliary/draw/draw_vs_aos.h +++ b/src/gallium/auxiliary/draw/draw_vs_aos.h @@ -52,10 +52,13 @@ struct x86_function; #define MAX_TEMPS PIPE_MAX_ATTRIBS /* say */ #define MAX_CONSTANTS PIPE_MAX_ATTRIBS /* say */ #define MAX_IMMEDIATES PIPE_MAX_ATTRIBS /* say */ -#define MAX_INTERNALS 4 +#define MAX_INTERNALS 8 #define AOS_FILE_INTERNAL TGSI_FILE_COUNT +#define FPU_RND_NEG 1 +#define FPU_RND_NEAREST 2 + /* This is the temporary storage used by all the aos_sse vs varients. * Create one per context and reuse by passing a pointer in at * vs_varient creation?? 
@@ -71,8 +74,8 @@ struct aos_machine { float scale[4]; /* viewport */ float translate[4]; /* viewport */ - ushort fpu_round_nearest; - ushort fpu_round_neg_inf; + ushort fpu_rnd_nearest; + ushort fpu_rnd_neg_inf; ushort fpu_restore; ushort fpucntl; /* one of FPU_* above */ -- cgit v1.2.3 From 2302a5d3c1ea2c682dfc034012a054b8327a81de Mon Sep 17 00:00:00 2001 From: Keith Whitwell Date: Wed, 21 May 2008 20:32:43 +0100 Subject: draw: fix fpu control word manipulations --- src/gallium/auxiliary/draw/draw_vs_aos.c | 33 +++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) (limited to 'src/gallium/auxiliary/draw/draw_vs_aos.c') diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c index 67761f881d..e736990acc 100644 --- a/src/gallium/auxiliary/draw/draw_vs_aos.c +++ b/src/gallium/auxiliary/draw/draw_vs_aos.c @@ -690,41 +690,47 @@ static void x87_fstp_dest4( struct aos_compilation *cp, x87_fstp_or_pop(cp->func, writemask, 3, ptr); } +#define FPU_MANIP 1 /* Save current x87 state and put it into single precision mode. 
*/ static void save_fpu_state( struct aos_compilation *cp ) { -#if 0 - x87_fnstcw( cp->func, x86_make_disp(regEDX, get_offset(m, &m->fpucntl_restore))); - x87_fldcw( cp->func, ); +#if FPU_MANIP + x87_fnstcw( cp->func, x86_make_disp(cp->machine_EDX, + Offset(struct aos_machine, fpu_restore))); #endif } static void restore_fpu_state( struct aos_compilation *cp ) { -#if 0 +#if FPU_MANIP x87_fnclex(cp->func); - x87_fldcw(cp->func, x86_make_disp(regEDX, get_offset(m, &m->fpucntl_restore))); + x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX, + Offset(struct aos_machine, fpu_restore))); #endif } static void set_fpu_round_neg_inf( struct aos_compilation *cp ) { -#if 0 - if (cp->fpucntl != RND_NEG_FPU) { - struct x86_reg regEDX = x86_make_reg(file_REG32, reg_DX); - struct arb_vp_machine *m = NULL; - - cp->fpucntl = RND_NEG_FPU; +#if FPU_MANIP + if (cp->fpucntl != FPU_RND_NEG) { + cp->fpucntl = FPU_RND_NEG; x87_fnclex(cp->func); - x87_fldcw(cp->func, x86_make_disp(regEDX, get_offset(m, &m->fpucntl_rnd_neg))); + x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX, + Offset(struct aos_machine, fpu_rnd_neg_inf))); } #endif } static void set_fpu_round_nearest( struct aos_compilation *cp ) { -#if 0 +#if FPU_MANIP + if (cp->fpucntl != FPU_RND_NEAREST) { + cp->fpucntl = FPU_RND_NEAREST; + x87_fnclex(cp->func); + x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX, + Offset(struct aos_machine, fpu_rnd_nearest))); + } #endif } @@ -1590,6 +1596,7 @@ static boolean build_vertex_program( struct draw_vs_varient_aos_sse *varient, Offset( struct draw_vs_varient_aos_sse, machine ))); save_fpu_state( &cp ); + set_fpu_round_nearest( &cp ); /* Note address for loop jump */ -- cgit v1.2.3 From 0a7a0d79f64de9794878c42bc5b79a04772d7ed8 Mon Sep 17 00:00:00 2001 From: Keith Whitwell Date: Wed, 21 May 2008 20:34:52 +0100 Subject: draw: fix x87_ex2 and partially fix lit insn --- src/gallium/auxiliary/draw/draw_vs_aos.c | 112 +++++++++++++++---------------- 1 file changed, 56 insertions(+), 56 
deletions(-) (limited to 'src/gallium/auxiliary/draw/draw_vs_aos.c') diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c index e736990acc..a365d456d1 100644 --- a/src/gallium/auxiliary/draw/draw_vs_aos.c +++ b/src/gallium/auxiliary/draw/draw_vs_aos.c @@ -735,23 +735,26 @@ static void set_fpu_round_nearest( struct aos_compilation *cp ) } -static void emit_x87_ex2( struct aos_compilation *cp ) +static void x87_emit_ex2( struct aos_compilation *cp ) { struct x86_reg st0 = x86_make_reg(file_x87, 0); struct x86_reg st1 = x86_make_reg(file_x87, 1); - struct x86_reg st3 = x86_make_reg(file_x87, 3); + int stack = cp->func->x87_stack; set_fpu_round_neg_inf( cp ); - x87_fld(cp->func, st0); /* a a */ - x87_fprndint( cp->func ); /* int(a) a */ - x87_fld(cp->func, st0); /* int(a) int(a) a */ - x87_fstp(cp->func, st3); /* int(a) a int(a)*/ - x87_fsubp(cp->func, st1); /* frac(a) int(a) */ - x87_f2xm1(cp->func); /* (2^frac(a))-1 int(a)*/ - x87_fld1(cp->func); /* 1 (2^frac(a))-1 int(a)*/ - x87_faddp(cp->func, st1); /* 2^frac(a) int(a) */ - x87_fscale(cp->func); /* 2^a */ + x87_fld(cp->func, st0); /* a a */ + x87_fld(cp->func, st0); /* a a a */ + x87_fprndint( cp->func ); /* flr(a) a a*/ + x87_fsubp(cp->func, st1); /* frac(a) a */ + x87_f2xm1(cp->func); /* (2^frac(a))-1 a */ + x87_fld1(cp->func); /* 1 (2^frac(a))-1 a */ + x87_faddp(cp->func, st1); /* 2^frac(a) a */ + x87_fscale(cp->func); /* 2^a a */ + x87_fstp(cp->func, st1); + + assert( stack == cp->func->x87_stack); + } @@ -907,9 +910,7 @@ static boolean emit_LG2( struct aos_compilation *cp, const struct tgsi_full_inst static boolean emit_EX2( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) { x87_fld_src(cp, &op->FullSrcRegisters[0], 0); - - emit_x87_ex2(cp); - + x87_emit_ex2(cp); x87_fstp_dest4(cp, &op->FullDstRegisters[0]); return TRUE; } @@ -1084,63 +1085,62 @@ static boolean emit_FRC( struct aos_compilation *cp, const struct tgsi_full_inst static boolean 
emit_LIT( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) { struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]); - struct x86_reg st1 = x86_make_reg(file_x87, 1); - unsigned fixup1, fixup2; unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask; - /* Load the interesting parts of arg0: - */ - x87_fld_src(cp, &op->FullSrcRegisters[0], 3); - x87_fld_src(cp, &op->FullSrcRegisters[0], 1); - x87_fld_src(cp, &op->FullSrcRegisters[0], 0); - - if (writemask & TGSI_WRITEMASK_XW) { - x87_fld1(cp->func); - x87_fst_or_nop(cp->func, writemask, 0, dst); - x87_fstp_or_pop(cp->func, writemask, 3, dst); - } if (writemask & TGSI_WRITEMASK_YZ) { + struct x86_reg st1 = x86_make_reg(file_x87, 1); + struct x86_reg st2 = x86_make_reg(file_x87, 2); + + - /* Pre-zero destinations, may be overwritten later... fixme. - */ - x87_fldz(cp->func); - x87_fst_or_nop(cp->func, writemask, 1, dst); - x87_fstp_or_pop(cp->func, writemask, 2, dst); + /* a1' = a1 <= 0 ? 1 : a1; + */ + x87_fldz(cp->func); /* 0 */ + x87_fld1(cp->func); /* 1 0 */ + x87_fld_src(cp, &op->FullSrcRegisters[0], 1); /* a1 1 0 */ + x87_fcomi(cp->func, st2); /* a1 1 0 */ + x87_fcmovb(cp->func, st1); /* a1' 1 0 */ + x87_fstp(cp->func, st1); /* a1' 0 */ + x87_fstp(cp->func, st1); /* a1' */ + + x87_fld_src(cp, &op->FullSrcRegisters[0], 3); /* a3 a1' */ + x87_fxch(cp->func, st1); /* a1' a3 */ + - /* Check arg0[0]: + /* Compute pow(a1, a3) */ - x87_fldz(cp->func); /* 0 a0 a1 a3 */ - x87_fucomp(cp->func, st1); /* a0 a1 a3 */ - x87_fnstsw(cp->func, cp->tmp_EAX); - x86_sahf(cp->func); - fixup1 = x86_jcc_forward(cp->func, cc_AE); - - x87_fstp_or_pop(cp->func, writemask, 1, dst); /* a1 a3 */ + x87_fyl2x(cp->func); /* a3*log2(a1) */ + x87_emit_ex2( cp ); /* 2^(a3*log2(a1)) */ - /* Check arg0[1]: - */ - x87_fldz(cp->func); /* 0 a1 a3 */ - x87_fucomp(cp->func, st1); /* a1 a3 */ - x87_fnstsw(cp->func, cp->tmp_EAX); - x86_sahf(cp->func); - fixup2 = x86_jcc_forward(cp->func, cc_AE); - /* Compute pow(a1, 
a3) + /* a0' = max2(a0, 0): */ - x87_fyl2x(cp->func); /* a3*log2(a1) */ + x87_fldz(cp->func); /* 0 r2 */ + x87_fld_src(cp, &op->FullSrcRegisters[0], 0); /* a0 0 r2 */ + x87_fcomi(cp->func, st1); + x87_fcmovb(cp->func, st1); /* a0' 0 r2 */ + x87_fstp(cp->func, st1); /* a0' r2 */ - emit_x87_ex2( cp ); /* 2^(a3*log2(a1)) */ + x87_fxch(cp->func, st1); /* a0' r2 */ + x87_fst_or_nop(cp->func, writemask, 1, dst); /* result[1] = a0' */ + + x87_fldz(cp->func); /* 0 a0' r2 */ + x87_fcomi(cp->func, st1); /* 0 a0' r2 */ + x87_fcmovnbe(cp->func, st2); /* r2' a0' r2 */ x87_fstp_or_pop(cp->func, writemask, 2, dst); - - /* Land jumps: - */ - x86_fixup_fwd_jump(cp->func, fixup1); - x86_fixup_fwd_jump(cp->func, fixup2); + x87_fpop(cp->func); + x87_fpop(cp->func); + } + + if (writemask & TGSI_WRITEMASK_XW) { + x87_fld1(cp->func); + x87_fst_or_nop(cp->func, writemask, 0, dst); + x87_fstp_or_pop(cp->func, writemask, 3, dst); } return TRUE; @@ -1222,7 +1222,7 @@ static boolean emit_POW( struct aos_compilation *cp, const struct tgsi_full_inst x87_fld_src(cp, &op->FullSrcRegisters[0], 0); /* a0.x a1.x */ x87_fyl2x(cp->func); /* a1*log2(a0) */ - emit_x87_ex2( cp ); /* 2^(a1*log2(a0)) */ + x87_emit_ex2( cp ); /* 2^(a1*log2(a0)) */ x87_fstp_dest4(cp, &op->FullDstRegisters[0]); return TRUE; -- cgit v1.2.3 From 083f3f5c32a28d2993a8a5a8b4f5ef81224a5ec3 Mon Sep 17 00:00:00 2001 From: Keith Whitwell Date: Wed, 21 May 2008 20:38:23 +0100 Subject: draw: avoid a pointless mov in many sse opcodes --- src/gallium/auxiliary/draw/draw_vs_aos.c | 94 +++++++++++++++++++++----------- 1 file changed, 63 insertions(+), 31 deletions(-) (limited to 'src/gallium/auxiliary/draw/draw_vs_aos.c') diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c index a365d456d1..97de43c232 100644 --- a/src/gallium/auxiliary/draw/draw_vs_aos.c +++ b/src/gallium/auxiliary/draw/draw_vs_aos.c @@ -174,14 +174,44 @@ static void spill( struct aos_compilation *cp, unsigned idx ) } } +static 
boolean is_xmm_tmp( struct aos_compilation *cp, + struct x86_reg reg ) +{ + return (reg.file == file_XMM && + cp->xmm[reg.idx].file == TGSI_FILE_NULL); +} + +static struct x86_reg get_xmm_tmp( struct aos_compilation *cp, + struct x86_reg reg ) +{ + if (!is_xmm_tmp(cp, reg)) { + struct x86_reg tmp = aos_get_xmm_reg(cp); + sse_movups(cp->func, tmp, reg); + reg = tmp; + } + + return reg; +} + + struct x86_reg aos_get_xmm_reg( struct aos_compilation *cp ) { unsigned i; unsigned oldest = 0; + boolean found = FALSE; for (i = 0; i < 8; i++) - if (cp->xmm[i].last_used < cp->xmm[oldest].last_used) + if (cp->xmm[i].last_used != cp->insn_counter && + cp->xmm[i].file == TGSI_FILE_NULL) { oldest = i; + found = TRUE; + } + + if (!found) { + for (i = 0; i < 8; i++) + if (cp->xmm[i].last_used < cp->xmm[oldest].last_used) + oldest = i; + } /* Need to write out the old value? */ @@ -237,15 +267,24 @@ void aos_adopt_xmm_reg( struct aos_compilation *cp, unsigned idx, unsigned dirty ) { + unsigned i; + if (reg.file != file_XMM) { assert(0); return; } - invalidate_xmm(cp, file, idx); + for (i = 0; i < 8; i++) { + if (cp->xmm[i].file == file && + cp->xmm[i].idx == idx) { + aos_release_xmm_reg(cp, i); + } + } + cp->xmm[reg.idx].file = file; cp->xmm[reg.idx].idx = idx; cp->xmm[reg.idx].dirty = dirty; + cp->xmm[reg.idx].last_used = cp->insn_counter; } @@ -659,6 +698,7 @@ static void x87_fst_or_nop( struct x86_function *func, unsigned channel, struct x86_reg ptr ) { + assert(ptr.file == file_REG32); if (writemask & (1<FullSrcRegisters[0]); - struct x86_reg dst = aos_get_xmm_reg(cp); struct x86_reg neg = aos_get_internal(cp, IMM_NEGS); + struct x86_reg dst = get_xmm_tmp(cp, arg0); - sse_movups(cp->func, dst, arg0); sse_mulps(cp->func, dst, neg); sse_maxps(cp->func, dst, arg0); @@ -782,9 +822,8 @@ static boolean emit_ADD( struct aos_compilation *cp, const struct tgsi_full_inst { struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); struct x86_reg arg1 = fetch_src(cp, 
&op->FullSrcRegisters[1]); - struct x86_reg dst = aos_get_xmm_reg(cp); + struct x86_reg dst = get_xmm_tmp(cp, arg0); - sse_movups(cp->func, dst, arg0); sse_addps(cp->func, dst, arg1); store_dest(cp, &op->FullDstRegisters[0], dst); @@ -806,10 +845,9 @@ static boolean emit_DP3( struct aos_compilation *cp, const struct tgsi_full_inst { struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); - struct x86_reg dst = aos_get_xmm_reg(cp); struct x86_reg tmp = aos_get_xmm_reg(cp); + struct x86_reg dst = get_xmm_tmp(cp, arg0); - sse_movups(cp->func, dst, arg0); sse_mulps(cp->func, dst, arg1); /* Now the hard bit: sum the first 3 values: @@ -831,10 +869,9 @@ static boolean emit_DP4( struct aos_compilation *cp, const struct tgsi_full_inst { struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); - struct x86_reg dst = aos_get_xmm_reg(cp); struct x86_reg tmp = aos_get_xmm_reg(cp); + struct x86_reg dst = get_xmm_tmp(cp, arg0); - sse_movups(cp->func, dst, arg0); sse_mulps(cp->func, dst, arg1); /* Now the hard bit: sum the values: @@ -854,10 +891,9 @@ static boolean emit_DPH( struct aos_compilation *cp, const struct tgsi_full_inst { struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); - struct x86_reg dst = aos_get_xmm_reg(cp); struct x86_reg tmp = aos_get_xmm_reg(cp); + struct x86_reg dst = get_xmm_tmp(cp, arg0); - sse_movups(cp->func, dst, arg0); sse_mulps(cp->func, dst, arg1); /* Now the hard bit: sum the values (from DP3): @@ -1152,9 +1188,8 @@ static boolean emit_MAX( struct aos_compilation *cp, const struct tgsi_full_inst { struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); - struct x86_reg dst = aos_get_xmm_reg(cp); + struct x86_reg dst = get_xmm_tmp(cp, arg0); - sse_movups(cp->func, dst, 
arg0); sse_maxps(cp->func, dst, arg1); store_dest(cp, &op->FullDstRegisters[0], dst); @@ -1166,9 +1201,8 @@ static boolean emit_MIN( struct aos_compilation *cp, const struct tgsi_full_inst { struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); - struct x86_reg dst = aos_get_xmm_reg(cp); + struct x86_reg dst = get_xmm_tmp(cp, arg0); - sse_movups(cp->func, dst, arg0); sse_minps(cp->func, dst, arg1); store_dest(cp, &op->FullDstRegisters[0], dst); @@ -1178,9 +1212,9 @@ static boolean emit_MIN( struct aos_compilation *cp, const struct tgsi_full_inst static boolean emit_MOV( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) { struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); - struct x86_reg dst = aos_get_xmm_reg(cp); + struct x86_reg dst = get_xmm_tmp(cp, arg0); - sse_movups(cp->func, dst, arg0); + /* potentially nothing to do */ store_dest(cp, &op->FullDstRegisters[0], dst); return TRUE; @@ -1190,9 +1224,8 @@ static boolean emit_MUL( struct aos_compilation *cp, const struct tgsi_full_inst { struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); - struct x86_reg dst = aos_get_xmm_reg(cp); + struct x86_reg dst = get_xmm_tmp(cp, arg0); - sse_movups(cp->func, dst, arg0); sse_mulps(cp->func, dst, arg1); store_dest(cp, &op->FullDstRegisters[0], dst); @@ -1205,13 +1238,15 @@ static boolean emit_MAD( struct aos_compilation *cp, const struct tgsi_full_inst struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); struct x86_reg arg2 = fetch_src(cp, &op->FullSrcRegisters[2]); - struct x86_reg dst = aos_get_xmm_reg(cp); - sse_movups(cp->func, dst, arg0); - sse_mulps(cp->func, dst, arg1); - sse_addps(cp->func, dst, arg2); + /* If we can't clobber old contents of arg0, get a temporary & copy + * it there, then clobber it... 
+ */ + arg0 = get_xmm_tmp(cp, arg0); - store_dest(cp, &op->FullDstRegisters[0], dst); + sse_mulps(cp->func, arg0, arg1); + sse_addps(cp->func, arg0, arg2); + store_dest(cp, &op->FullDstRegisters[0], arg0); return TRUE; } @@ -1272,10 +1307,9 @@ static boolean emit_SGE( struct aos_compilation *cp, const struct tgsi_full_inst { struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); - struct x86_reg dst = aos_get_xmm_reg(cp); struct x86_reg ones = aos_get_internal(cp, IMM_ONES); + struct x86_reg dst = get_xmm_tmp(cp, arg0); - sse_movups(cp->func, dst, arg0); sse_cmpps(cp->func, dst, arg1, cc_NotLessThan); sse_andps(cp->func, dst, ones); @@ -1297,10 +1331,9 @@ static boolean emit_SLT( struct aos_compilation *cp, const struct tgsi_full_inst { struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); - struct x86_reg dst = aos_get_xmm_reg(cp); struct x86_reg ones = aos_get_internal(cp, IMM_ONES); + struct x86_reg dst = get_xmm_tmp(cp, arg0); - sse_movups(cp->func, dst, arg0); sse_cmpps(cp->func, dst, arg1, cc_LessThan); sse_andps(cp->func, dst, ones); @@ -1312,9 +1345,8 @@ static boolean emit_SUB( struct aos_compilation *cp, const struct tgsi_full_inst { struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); - struct x86_reg dst = aos_get_xmm_reg(cp); + struct x86_reg dst = get_xmm_tmp(cp, arg0); - sse_movups(cp->func, dst, arg0); sse_subps(cp->func, dst, arg1); store_dest(cp, &op->FullDstRegisters[0], dst); -- cgit v1.2.3 From 5b1bd30f22ffa3955150ec008631d0f4754d340f Mon Sep 17 00:00:00 2001 From: Keith Whitwell Date: Wed, 21 May 2008 20:41:01 +0100 Subject: draw: when preloading args to x87 stack, need to use reverse order --- src/gallium/auxiliary/draw/draw_vs_aos.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 
'src/gallium/auxiliary/draw/draw_vs_aos.c') diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c index 97de43c232..fde92c7226 100644 --- a/src/gallium/auxiliary/draw/draw_vs_aos.c +++ b/src/gallium/auxiliary/draw/draw_vs_aos.c @@ -980,7 +980,7 @@ static boolean emit_EXP( struct aos_compilation *cp, const struct tgsi_full_inst x87_fstp_or_pop(cp->func, writemask, 0, dst); /* flr(a) a 2^flr(a) */ - x87_fsubrp(cp->func, st1); /* frac(a) 2^flr(a) */ + x87_fsubp(cp->func, st1); /* frac(a) 2^flr(a) */ x87_fst_or_nop(cp->func, writemask, 1, dst); /* frac(a) 2^flr(a) */ @@ -1041,9 +1041,9 @@ static boolean emit_FLR( struct aos_compilation *cp, const struct tgsi_full_inst /* Load all sources first to avoid aliasing */ - for (i = 0; i < 4; i++) { + for (i = 3; i >= 0; i--) { if (writemask & (1<FullSrcRegisters[0], i); + x87_fld_src(cp, &op->FullSrcRegisters[0], i); } } @@ -1068,9 +1068,9 @@ static boolean emit_RND( struct aos_compilation *cp, const struct tgsi_full_inst /* Load all sources first to avoid aliasing */ - for (i = 0; i < 4; i++) { + for (i = 3; i >= 0; i--) { if (writemask & (1<FullSrcRegisters[0], i); + x87_fld_src(cp, &op->FullSrcRegisters[0], i); } } @@ -1098,7 +1098,7 @@ static boolean emit_FRC( struct aos_compilation *cp, const struct tgsi_full_inst /* suck all the source values onto the stack before writing out any * dst, which may alias... 
*/ - for (i = 0; i < 4; i++) { + for (i = 3; i >= 0; i--) { if (writemask & (1<FullSrcRegisters[0], i); } @@ -1108,7 +1108,7 @@ static boolean emit_FRC( struct aos_compilation *cp, const struct tgsi_full_inst if (writemask & (1<func, st0); /* a a */ x87_fprndint( cp->func ); /* flr(a) a */ - x87_fsubrp(cp->func, st1); /* frc(a) */ + x87_fsubp(cp->func, st1); /* frc(a) */ x87_fstp(cp->func, x86_make_disp(dst, i*4)); } } @@ -1392,6 +1392,8 @@ static boolean emit_instruction( struct aos_compilation *cp, struct tgsi_full_instruction *inst ) { + x87_assert_stack_empty(cp->func); + switch( inst->Instruction.Opcode ) { case TGSI_OPCODE_MOV: return emit_MOV( cp, inst ); @@ -1657,6 +1659,7 @@ static boolean build_vertex_program( struct draw_vs_varient_aos_sse *varient, break; } + x87_assert_stack_empty(cp.func); cp.insn_counter++; debug_printf("\n"); } @@ -1712,6 +1715,7 @@ static boolean build_vertex_program( struct draw_vs_varient_aos_sse *varient, x86_pop(cp.func, cp.count_ESI); x86_pop(cp.func, cp.idx_EBX); + x87_assert_stack_empty(cp.func); x86_ret(cp.func); tgsi_parse_free( &parse ); -- cgit v1.2.3 From c684ffa02d8d43ee04b99ee63ccd1adb66e81c1a Mon Sep 17 00:00:00 2001 From: Keith Whitwell Date: Thu, 22 May 2008 13:41:49 +0100 Subject: draw: clean up internal immediates in aos sse --- src/gallium/auxiliary/draw/draw_vs_aos.c | 64 +++++++++++++++++++++-------- src/gallium/auxiliary/draw/draw_vs_aos.h | 5 ++- src/gallium/auxiliary/draw/draw_vs_aos_io.c | 10 ++--- 3 files changed, 55 insertions(+), 24 deletions(-) (limited to 'src/gallium/auxiliary/draw/draw_vs_aos.c') diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c index fde92c7226..0b8600696a 100644 --- a/src/gallium/auxiliary/draw/draw_vs_aos.c +++ b/src/gallium/auxiliary/draw/draw_vs_aos.c @@ -44,12 +44,6 @@ #ifdef PIPE_ARCH_X86 -#define DISASSEM 0 - - - - - static INLINE boolean eq( struct x86_reg a, struct x86_reg b ) { @@ -92,13 +86,6 @@ static struct x86_reg 
get_reg_ptr(struct aos_compilation *cp, } -struct x86_reg aos_get_internal( struct aos_compilation *cp, - unsigned imm ) -{ - return get_reg_ptr( cp, - AOS_FILE_INTERNAL, - imm ); -} #define X87_CW_EXCEPTION_INV_OP (1<<0) #define X87_CW_EXCEPTION_DENORM_OP (1<<1) @@ -123,6 +110,9 @@ static void init_internals( struct aos_machine *machine ) float inv = 1.0f/255.0f; float f255 = 255.0f; + ASSIGN_4V(machine->internal[IMM_SWZ], 1.0f, -1.0f, 0.0f, 1.0f); + *(unsigned *)&machine->internal[IMM_SWZ][3] = 0xffffffff; + ASSIGN_4V(machine->internal[IMM_ONES], 1.0f, 1.0f, 1.0f, 1.0f); ASSIGN_4V(machine->internal[IMM_NEGS], -1.0f, -1.0f, -1.0f, -1.0f); ASSIGN_4V(machine->internal[IMM_IDENTITY], 0.0f, 0.0f, 0.0f, 1.0f); @@ -337,6 +327,39 @@ struct x86_reg aos_get_shader_reg( struct aos_compilation *cp, +static struct x86_reg aos_get_shader_reg_xmm( struct aos_compilation *cp, + unsigned file, + unsigned idx ) +{ + struct x86_reg reg = aos_get_shader_reg( cp, file, idx ); + + if (reg.file != file_XMM) { + struct x86_reg tmp = aos_get_xmm_reg(cp); + sse_movups(cp->func, tmp, reg); + aos_adopt_xmm_reg( cp, tmp, file, idx, FALSE ); + reg = tmp; + } + + return reg; +} + + + +struct x86_reg aos_get_internal_xmm( struct aos_compilation *cp, + unsigned imm ) +{ + return aos_get_shader_reg_xmm( cp, AOS_FILE_INTERNAL, imm ); +} + + +struct x86_reg aos_get_internal( struct aos_compilation *cp, + unsigned imm ) +{ + return aos_get_shader_reg( cp, AOS_FILE_INTERNAL, imm ); +} + + + /* Emulate pshufd insn in regular SSE, if necessary: @@ -461,15 +484,15 @@ static struct x86_reg fetch_src( struct aos_compilation *cp, arg0 = dst; } - if (negs) { - struct x86_reg imm_negs = aos_get_internal(cp, IMM_NEGS); + if (negs && negs != 0xf) { + struct x86_reg imm_swz = aos_get_internal_xmm(cp, IMM_SWZ); struct x86_reg tmp = aos_get_xmm_reg(cp); /* Load 1,-1,0,0 * Use neg as arg to pshufd * Multiply */ - emit_pshufd(cp, tmp, imm_negs, + emit_pshufd(cp, tmp, imm_swz, SHUF((negs & 1) ? 1 : 0, (negs & 2) ? 
1 : 0, (negs & 4) ? 1 : 0, @@ -479,12 +502,17 @@ static struct x86_reg fetch_src( struct aos_compilation *cp, aos_release_xmm_reg(cp, tmp.idx); arg0 = dst; } + else if (negs) { + struct x86_reg imm_negs = aos_get_internal_xmm(cp, IMM_NEGS); + sse_mulps(cp->func, dst, imm_negs); + arg0 = dst; + } + if (abs && abs != 0xf) { ERROR(cp, "unsupported partial abs"); } - - if (abs) { + else if (abs) { struct x86_reg neg = aos_get_internal(cp, IMM_NEGS); struct x86_reg tmp = aos_get_xmm_reg(cp); diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.h b/src/gallium/auxiliary/draw/draw_vs_aos.h index c2afd4e9a0..efdc9a38f4 100644 --- a/src/gallium/auxiliary/draw/draw_vs_aos.h +++ b/src/gallium/auxiliary/draw/draw_vs_aos.h @@ -142,13 +142,16 @@ boolean aos_emit_outputs( struct aos_compilation *cp ); #define IMM_ONES 0 /* 1, 1,1,1 */ -#define IMM_NEGS 1 /* 1,-1,0,0 */ +#define IMM_SWZ 1 /* 1,-1,0, 0xffffffff */ #define IMM_IDENTITY 2 /* 0, 0,0,1 */ #define IMM_INV_255 3 /* 1/255, 1/255, 1/255, 1/255 */ #define IMM_255 4 /* 255, 255, 255, 255 */ +#define IMM_NEGS 5 /* -1,-1,-1,-1 */ struct x86_reg aos_get_internal( struct aos_compilation *cp, unsigned imm ); +struct x86_reg aos_get_internal_xmm( struct aos_compilation *cp, + unsigned imm ); #define ERROR(cp, msg) \ diff --git a/src/gallium/auxiliary/draw/draw_vs_aos_io.c b/src/gallium/auxiliary/draw/draw_vs_aos_io.c index 72b2b3d11d..0dda9df97d 100644 --- a/src/gallium/auxiliary/draw/draw_vs_aos_io.c +++ b/src/gallium/auxiliary/draw/draw_vs_aos_io.c @@ -54,7 +54,7 @@ static void emit_load_R32G32B32( struct aos_compilation *cp, struct x86_reg src_ptr ) { sse_movss(cp->func, data, x86_make_disp(src_ptr, 8)); - sse_shufps(cp->func, data, aos_get_internal( cp, IMM_IDENTITY ), SHUF(X,Y,Z,W) ); + sse_shufps(cp->func, data, aos_get_internal_xmm( cp, IMM_IDENTITY ), SHUF(X,Y,Z,W) ); sse_shufps(cp->func, data, data, SHUF(Y,Z,X,W) ); sse_movlps(cp->func, data, src_ptr); } @@ -63,7 +63,7 @@ static void emit_load_R32G32( struct 
aos_compilation *cp, struct x86_reg data, struct x86_reg src_ptr ) { - sse_movups(cp->func, data, aos_get_internal( cp, IMM_IDENTITY ) ); + sse_movups(cp->func, data, aos_get_internal_xmm( cp, IMM_IDENTITY ) ); sse_movlps(cp->func, data, src_ptr); } @@ -73,7 +73,7 @@ static void emit_load_R32( struct aos_compilation *cp, struct x86_reg src_ptr ) { sse_movss(cp->func, data, src_ptr); - sse_orps(cp->func, data, aos_get_internal( cp, IMM_IDENTITY ) ); + sse_orps(cp->func, data, aos_get_internal_xmm( cp, IMM_IDENTITY ) ); } @@ -82,8 +82,8 @@ static void emit_load_R8G8B8A8_UNORM( struct aos_compilation *cp, struct x86_reg src_ptr ) { sse_movss(cp->func, data, src_ptr); - sse2_punpcklbw(cp->func, data, aos_get_internal( cp, IMM_IDENTITY )); - sse2_punpcklbw(cp->func, data, aos_get_internal( cp, IMM_IDENTITY )); + sse2_punpcklbw(cp->func, data, aos_get_internal_xmm( cp, IMM_IDENTITY )); + sse2_punpcklbw(cp->func, data, aos_get_internal_xmm( cp, IMM_IDENTITY )); sse2_cvtdq2ps(cp->func, data, data); sse_mulps(cp->func, data, aos_get_internal(cp, IMM_INV_255)); } -- cgit v1.2.3 From 05029c919d46299ca259ee8af880d0a65f95ce7c Mon Sep 17 00:00:00 2001 From: Keith Whitwell Date: Thu, 22 May 2008 13:46:06 +0100 Subject: draw: clean up masked writes in aos sse, make some xmm function names clearer --- src/gallium/auxiliary/draw/draw_vs_aos.c | 236 +++++++++++-------------------- 1 file changed, 82 insertions(+), 154 deletions(-) (limited to 'src/gallium/auxiliary/draw/draw_vs_aos.c') diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c index 0b8600696a..708ecadbac 100644 --- a/src/gallium/auxiliary/draw/draw_vs_aos.c +++ b/src/gallium/auxiliary/draw/draw_vs_aos.c @@ -171,7 +171,7 @@ static boolean is_xmm_tmp( struct aos_compilation *cp, cp->xmm[reg.idx].file == TGSI_FILE_NULL); } -static struct x86_reg get_xmm_tmp( struct aos_compilation *cp, +static struct x86_reg get_xmm_clone( struct aos_compilation *cp, struct x86_reg reg ) { if 
(!is_xmm_tmp(cp, reg)) { @@ -380,31 +380,37 @@ static void emit_pshufd( struct aos_compilation *cp, } } - - - -/* Helper for writemask: +/* load masks (pack into negs??) + * pshufd - shuffle according to writemask + * and - result, mask + * nand - dest, mask + * or - dest, result */ -static boolean emit_shuf_copy1( struct aos_compilation *cp, - struct x86_reg dst, - struct x86_reg arg0, - struct x86_reg arg1, - ubyte shuf ) +static boolean mask_write( struct aos_compilation *cp, + struct x86_reg dst, + struct x86_reg result, + unsigned mask ) { + struct x86_reg imm_swz = aos_get_internal_xmm(cp, IMM_SWZ); struct x86_reg tmp = aos_get_xmm_reg(cp); - sse_movups(cp->func, dst, arg1); - emit_pshufd(cp, dst, dst, shuf); - emit_pshufd(cp, tmp, arg0, shuf); - - sse_movss(cp->func, dst, tmp); + + emit_pshufd(cp, tmp, imm_swz, + SHUF((mask & 1) ? 2 : 3, + (mask & 2) ? 2 : 3, + (mask & 4) ? 2 : 3, + (mask & 8) ? 2 : 3)); - emit_pshufd(cp, dst, dst, shuf); + sse_andps(cp->func, dst, tmp); + sse_andnps(cp->func, tmp, result); + sse_orps(cp->func, dst, tmp); aos_release_xmm_reg(cp, tmp.idx); return TRUE; } + + /* Helper for writemask: */ static boolean emit_shuf_copy2( struct aos_compilation *cp, @@ -414,17 +420,18 @@ static boolean emit_shuf_copy2( struct aos_compilation *cp, ubyte shuf ) { struct x86_reg tmp = aos_get_xmm_reg(cp); + emit_pshufd(cp, dst, arg1, shuf); emit_pshufd(cp, tmp, arg0, shuf); - sse_shufps(cp->func, dst, tmp, SHUF(X, Y, Z, W)); - emit_pshufd(cp, dst, dst, shuf); aos_release_xmm_reg(cp, tmp.idx); return TRUE; } + + #define SSE_SWIZZLE_NOOP ((0<<0) | (1<<2) | (2<<4) | (3<<6)) @@ -593,131 +600,58 @@ static void store_dest( struct aos_compilation *cp, const struct tgsi_full_dst_register *reg, struct x86_reg result ) { - if (reg->DstRegister.WriteMask == 0) - { - return; - } - else if (reg->DstRegister.WriteMask == TGSI_WRITEMASK_XYZW) - { - if (result.file == file_XMM) { - aos_adopt_xmm_reg(cp, - result, - reg->DstRegister.File, - reg->DstRegister.Index, - 
TRUE); - } - else { - struct x86_reg dst = aos_get_xmm_reg(cp); - aos_adopt_xmm_reg(cp, - dst, - reg->DstRegister.File, - reg->DstRegister.Index, - TRUE); - sse_movups(cp->func, dst, result); - } - } - else - { - /* Previous value of the dest register: - */ - struct x86_reg old_dst = aos_get_shader_reg(cp, - reg->DstRegister.File, - reg->DstRegister.Index); - - - /* Alloc an xmm reg to hold the new value of the dest register: - */ - struct x86_reg dst = aos_get_xmm_reg(cp); + struct x86_reg dst; + switch (reg->DstRegister.WriteMask) { + case 0: + return; + + case TGSI_WRITEMASK_XYZW: aos_adopt_xmm_reg(cp, - dst, + get_xmm_clone(cp, result), reg->DstRegister.File, reg->DstRegister.Index, - TRUE ); - - switch (reg->DstRegister.WriteMask) { - case TGSI_WRITEMASK_X: - if (result.file == file_XMM) { - sse_movups(cp->func, dst, old_dst); - sse_movss(cp->func, dst, result); - } - else { - struct x86_reg tmp = aos_get_xmm_reg(cp); - sse_movups(cp->func, dst, old_dst); - sse_movss(cp->func, tmp, result); - sse_movss(cp->func, dst, tmp); - aos_release_xmm_reg(cp, tmp.idx); - } - break; - - case TGSI_WRITEMASK_XY: - sse_movups(cp->func, dst, old_dst); - sse_shufps(cp->func, dst, result, SHUF(X, Y, Z, W)); - break; - - case TGSI_WRITEMASK_ZW: - sse_movups(cp->func, dst, result); - sse_shufps(cp->func, dst, old_dst, SHUF(X, Y, Z, W)); - break; - - case TGSI_WRITEMASK_YZW: - if (old_dst.file == file_XMM) { - sse_movups(cp->func, dst, result); - sse_movss(cp->func, dst, old_dst); - } - else { - struct x86_reg tmp = aos_get_xmm_reg(cp); - sse_movups(cp->func, dst, result); - sse_movss(cp->func, tmp, old_dst); - sse_movss(cp->func, dst, tmp); - aos_release_xmm_reg(cp, tmp.idx); - } - break; - - case TGSI_WRITEMASK_Y: - emit_shuf_copy1(cp, dst, result, old_dst, SHUF(Y,X,Z,W)); - break; - - case TGSI_WRITEMASK_Z: - emit_shuf_copy1(cp, dst, result, old_dst, SHUF(Z,Y,X,W)); - break; - - case TGSI_WRITEMASK_W: - emit_shuf_copy1(cp, dst, result, old_dst, SHUF(W,Y,Z,X)); - break; - - case 
TGSI_WRITEMASK_XZ: - emit_shuf_copy2(cp, dst, result, old_dst, SHUF(X,Z,Y,W)); - break; + TRUE); + return; + default: + break; + } - case TGSI_WRITEMASK_XW: - emit_shuf_copy2(cp, dst, result, old_dst, SHUF(X,W,Z,Y)); + dst = aos_get_shader_reg_xmm(cp, + reg->DstRegister.File, + reg->DstRegister.Index); - case TGSI_WRITEMASK_YZ: - emit_shuf_copy2(cp, dst, result, old_dst, SHUF(Z,Y,X,W)); - break; + switch (reg->DstRegister.WriteMask) { + case TGSI_WRITEMASK_X: + sse_movss(cp->func, dst, get_xmm_clone(cp, result)); + break; + + case TGSI_WRITEMASK_XY: + sse_shufps(cp->func, dst, get_xmm_clone(cp, result), SHUF(X, Y, Z, W)); + break; - case TGSI_WRITEMASK_YW: - emit_shuf_copy2(cp, dst, result, old_dst, SHUF(W,Y,Z,X)); - break; + case TGSI_WRITEMASK_ZW: + result = get_xmm_clone(cp, result); + sse_shufps(cp->func, result, dst, SHUF(X, Y, Z, W)); + dst = result; + break; - case TGSI_WRITEMASK_XZW: - emit_shuf_copy1(cp, dst, old_dst, result, SHUF(Y,X,Z,W)); - break; + case TGSI_WRITEMASK_YZW: + sse_movss(cp->func, result, dst); + dst = result; + break; - case TGSI_WRITEMASK_XYW: - emit_shuf_copy1(cp, dst, old_dst, result, SHUF(Z,Y,X,W)); - break; + default: + mask_write(cp, dst, result, reg->DstRegister.WriteMask); + break; + } - case TGSI_WRITEMASK_XYZ: - emit_shuf_copy1(cp, dst, old_dst, result, SHUF(W,Y,Z,X)); - break; + aos_adopt_xmm_reg(cp, + dst, + reg->DstRegister.File, + reg->DstRegister.Index, + TRUE); - default: - assert(0); /* not possible */ - break; - } - } } @@ -837,7 +771,7 @@ static boolean emit_ABS( struct aos_compilation *cp, const struct tgsi_full_inst { struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); struct x86_reg neg = aos_get_internal(cp, IMM_NEGS); - struct x86_reg dst = get_xmm_tmp(cp, arg0); + struct x86_reg dst = get_xmm_clone(cp, arg0); sse_mulps(cp->func, dst, neg); sse_maxps(cp->func, dst, arg0); @@ -850,7 +784,7 @@ static boolean emit_ADD( struct aos_compilation *cp, const struct tgsi_full_inst { struct x86_reg arg0 = 
fetch_src(cp, &op->FullSrcRegisters[0]); struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); - struct x86_reg dst = get_xmm_tmp(cp, arg0); + struct x86_reg dst = get_xmm_clone(cp, arg0); sse_addps(cp->func, dst, arg1); @@ -874,7 +808,7 @@ static boolean emit_DP3( struct aos_compilation *cp, const struct tgsi_full_inst struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); struct x86_reg tmp = aos_get_xmm_reg(cp); - struct x86_reg dst = get_xmm_tmp(cp, arg0); + struct x86_reg dst = get_xmm_clone(cp, arg0); sse_mulps(cp->func, dst, arg1); @@ -898,7 +832,7 @@ static boolean emit_DP4( struct aos_compilation *cp, const struct tgsi_full_inst struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); struct x86_reg tmp = aos_get_xmm_reg(cp); - struct x86_reg dst = get_xmm_tmp(cp, arg0); + struct x86_reg dst = get_xmm_clone(cp, arg0); sse_mulps(cp->func, dst, arg1); @@ -920,7 +854,7 @@ static boolean emit_DPH( struct aos_compilation *cp, const struct tgsi_full_inst struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); struct x86_reg tmp = aos_get_xmm_reg(cp); - struct x86_reg dst = get_xmm_tmp(cp, arg0); + struct x86_reg dst = get_xmm_clone(cp, arg0); sse_mulps(cp->func, dst, arg1); @@ -1216,7 +1150,7 @@ static boolean emit_MAX( struct aos_compilation *cp, const struct tgsi_full_inst { struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); - struct x86_reg dst = get_xmm_tmp(cp, arg0); + struct x86_reg dst = get_xmm_clone(cp, arg0); sse_maxps(cp->func, dst, arg1); @@ -1229,7 +1163,7 @@ static boolean emit_MIN( struct aos_compilation *cp, const struct tgsi_full_inst { struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); struct x86_reg arg1 = fetch_src(cp, 
&op->FullSrcRegisters[1]); - struct x86_reg dst = get_xmm_tmp(cp, arg0); + struct x86_reg dst = get_xmm_clone(cp, arg0); sse_minps(cp->func, dst, arg1); @@ -1240,7 +1174,7 @@ static boolean emit_MIN( struct aos_compilation *cp, const struct tgsi_full_inst static boolean emit_MOV( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) { struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); - struct x86_reg dst = get_xmm_tmp(cp, arg0); + struct x86_reg dst = get_xmm_clone(cp, arg0); /* potentially nothing to do */ @@ -1252,7 +1186,7 @@ static boolean emit_MUL( struct aos_compilation *cp, const struct tgsi_full_inst { struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); - struct x86_reg dst = get_xmm_tmp(cp, arg0); + struct x86_reg dst = get_xmm_clone(cp, arg0); sse_mulps(cp->func, dst, arg1); @@ -1270,7 +1204,7 @@ static boolean emit_MAD( struct aos_compilation *cp, const struct tgsi_full_inst /* If we can't clobber old contents of arg0, get a temporary & copy * it there, then clobber it... 
*/ - arg0 = get_xmm_tmp(cp, arg0); + arg0 = get_xmm_clone(cp, arg0); sse_mulps(cp->func, arg0, arg1); sse_addps(cp->func, arg0, arg2); @@ -1336,7 +1270,7 @@ static boolean emit_SGE( struct aos_compilation *cp, const struct tgsi_full_inst struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); struct x86_reg ones = aos_get_internal(cp, IMM_ONES); - struct x86_reg dst = get_xmm_tmp(cp, arg0); + struct x86_reg dst = get_xmm_clone(cp, arg0); sse_cmpps(cp->func, dst, arg1, cc_NotLessThan); sse_andps(cp->func, dst, ones); @@ -1360,7 +1294,7 @@ static boolean emit_SLT( struct aos_compilation *cp, const struct tgsi_full_inst struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); struct x86_reg ones = aos_get_internal(cp, IMM_ONES); - struct x86_reg dst = get_xmm_tmp(cp, arg0); + struct x86_reg dst = get_xmm_clone(cp, arg0); sse_cmpps(cp->func, dst, arg1, cc_LessThan); sse_andps(cp->func, dst, ones); @@ -1373,7 +1307,7 @@ static boolean emit_SUB( struct aos_compilation *cp, const struct tgsi_full_inst { struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); - struct x86_reg dst = get_xmm_tmp(cp, arg0); + struct x86_reg dst = get_xmm_clone(cp, arg0); sse_subps(cp->func, dst, arg1); @@ -1526,9 +1460,9 @@ emit_instruction( struct aos_compilation *cp, static boolean emit_viewport( struct aos_compilation *cp ) { - struct x86_reg pos = aos_get_shader_reg(cp, - TGSI_FILE_OUTPUT, - 0); + struct x86_reg pos = aos_get_shader_reg_xmm(cp, + TGSI_FILE_OUTPUT, + 0); struct x86_reg scale = x86_make_disp(cp->machine_EDX, Offset(struct aos_machine, scale)); @@ -1536,12 +1470,6 @@ static boolean emit_viewport( struct aos_compilation *cp ) struct x86_reg translate = x86_make_disp(cp->machine_EDX, Offset(struct aos_machine, translate)); - if (pos.file != file_XMM) { - struct x86_reg 
dst = aos_get_xmm_reg(cp); - sse_movups(cp->func, dst, pos); - pos = dst; - } - sse_mulps(cp->func, pos, scale); sse_addps(cp->func, pos, translate); -- cgit v1.2.3 From 7b25c1a4032960752d8a8e950bdf75740b2de2e8 Mon Sep 17 00:00:00 2001 From: Keith Whitwell Date: Thu, 22 May 2008 13:47:08 +0100 Subject: draw: remove FPU_MANIP ifdef --- src/gallium/auxiliary/draw/draw_vs_aos.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'src/gallium/auxiliary/draw/draw_vs_aos.c') diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c index 708ecadbac..d60940bb7a 100644 --- a/src/gallium/auxiliary/draw/draw_vs_aos.c +++ b/src/gallium/auxiliary/draw/draw_vs_aos.c @@ -693,48 +693,39 @@ static void x87_fstp_dest4( struct aos_compilation *cp, x87_fstp_or_pop(cp->func, writemask, 3, ptr); } -#define FPU_MANIP 1 /* Save current x87 state and put it into single precision mode. */ static void save_fpu_state( struct aos_compilation *cp ) { -#if FPU_MANIP x87_fnstcw( cp->func, x86_make_disp(cp->machine_EDX, Offset(struct aos_machine, fpu_restore))); -#endif } static void restore_fpu_state( struct aos_compilation *cp ) { -#if FPU_MANIP x87_fnclex(cp->func); x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX, Offset(struct aos_machine, fpu_restore))); -#endif } static void set_fpu_round_neg_inf( struct aos_compilation *cp ) { -#if FPU_MANIP if (cp->fpucntl != FPU_RND_NEG) { cp->fpucntl = FPU_RND_NEG; x87_fnclex(cp->func); x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX, Offset(struct aos_machine, fpu_rnd_neg_inf))); } -#endif } static void set_fpu_round_nearest( struct aos_compilation *cp ) { -#if FPU_MANIP if (cp->fpucntl != FPU_RND_NEAREST) { cp->fpucntl = FPU_RND_NEAREST; x87_fnclex(cp->func); x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX, Offset(struct aos_machine, fpu_rnd_nearest))); } -#endif } @@ -754,7 +745,7 @@ static void x87_emit_ex2( struct aos_compilation *cp ) x87_fld1(cp->func); /* 1 (2^frac(a))-1 a 
*/ x87_faddp(cp->func, st1); /* 2^frac(a) a */ x87_fscale(cp->func); /* 2^a a */ - x87_fstp(cp->func, st1); + x87_fstp(cp->func, st1); /* 2^a */ assert( stack == cp->func->x87_stack); -- cgit v1.2.3 From 6780a6dede31e7f2eb465e1d7b507b3e64fe6ec9 Mon Sep 17 00:00:00 2001 From: Keith Whitwell Date: Thu, 22 May 2008 13:48:07 +0100 Subject: draw: shortcircuit shuffle in aos_sse when possible --- src/gallium/auxiliary/draw/draw_vs_aos.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'src/gallium/auxiliary/draw/draw_vs_aos.c') diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c index d60940bb7a..b8fad231ca 100644 --- a/src/gallium/auxiliary/draw/draw_vs_aos.c +++ b/src/gallium/auxiliary/draw/draw_vs_aos.c @@ -809,7 +809,9 @@ static boolean emit_DP3( struct aos_compilation *cp, const struct tgsi_full_inst sse_addss(cp->func, dst, tmp); /* a*x+c*z, b*y, ?, ? */ emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z)); sse_addss(cp->func, dst, tmp); - sse_shufps(cp->func, dst, dst, SHUF(X, X, X, X)); + + if (op->FullDstRegisters[0].DstRegister.WriteMask != 0x1) + sse_shufps(cp->func, dst, dst, SHUF(X, X, X, X)); aos_release_xmm_reg(cp, tmp.idx); store_dest(cp, &op->FullDstRegisters[0], dst); @@ -833,7 +835,9 @@ static boolean emit_DP4( struct aos_compilation *cp, const struct tgsi_full_inst sse_addps(cp->func, dst, tmp); /* a*x+c*z, b*y+d*w, a*x+c*z, b*y+d*w */ emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z)); sse_addss(cp->func, dst, tmp); - sse_shufps(cp->func, dst, dst, SHUF(X, X, X, X)); + + if (op->FullDstRegisters[0].DstRegister.WriteMask != 0x1) + sse_shufps(cp->func, dst, dst, SHUF(X, X, X, X)); aos_release_xmm_reg(cp, tmp.idx); store_dest(cp, &op->FullDstRegisters[0], dst); @@ -857,7 +861,9 @@ static boolean emit_DPH( struct aos_compilation *cp, const struct tgsi_full_inst sse_addss(cp->func, dst, tmp); emit_pshufd(cp, tmp, arg1, SHUF(W,W,W,W)); sse_addss(cp->func, dst, tmp); - sse_shufps(cp->func, dst, dst, 
SHUF(X, X, X, X)); + + if (op->FullDstRegisters[0].DstRegister.WriteMask != 0x1) + sse_shufps(cp->func, dst, dst, SHUF(X, X, X, X)); aos_release_xmm_reg(cp, tmp.idx); store_dest(cp, &op->FullDstRegisters[0], dst); @@ -1233,7 +1239,8 @@ static boolean emit_RCP( struct aos_compilation *cp, const struct tgsi_full_inst sse_divss(cp->func, dst, arg0); } - sse_shufps(cp->func, dst, dst, SHUF(X, X, X, X)); + if (op->FullDstRegisters[0].DstRegister.WriteMask != 0x1) + sse_shufps(cp->func, dst, dst, SHUF(X, X, X, X)); store_dest(cp, &op->FullDstRegisters[0], dst); return TRUE; @@ -1249,7 +1256,8 @@ static boolean emit_RSQ( struct aos_compilation *cp, const struct tgsi_full_inst /* Extend precision here... */ - sse_shufps(cp->func, dst, dst, SHUF(X, X, X, X)); + if (op->FullDstRegisters[0].DstRegister.WriteMask != 0x1) + sse_shufps(cp->func, dst, dst, SHUF(X, X, X, X)); store_dest(cp, &op->FullDstRegisters[0], dst); return TRUE; -- cgit v1.2.3 From 65cb09249e750b45ec3fc9a57670fc77250efc5e Mon Sep 17 00:00:00 2001 From: Keith Whitwell Date: Thu, 22 May 2008 13:49:38 +0100 Subject: draw: for debug, do rhw divide in aos_sse viewport calcs --- src/gallium/auxiliary/draw/draw_vs_aos.c | 45 +++++++++++++++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) (limited to 'src/gallium/auxiliary/draw/draw_vs_aos.c') diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c index b8fad231ca..40de13a98c 100644 --- a/src/gallium/auxiliary/draw/draw_vs_aos.c +++ b/src/gallium/auxiliary/draw/draw_vs_aos.c @@ -1481,6 +1481,46 @@ static boolean emit_viewport( struct aos_compilation *cp ) } +/* This is useful to be able to see the results on softpipe. Doesn't + * do proper clipping, just assumes the backend can do it during + * rasterization -- for debug only... 
+ */ +static boolean emit_rhw_viewport( struct aos_compilation *cp ) +{ + struct x86_reg tmp = aos_get_xmm_reg(cp); + struct x86_reg pos = aos_get_shader_reg_xmm(cp, + TGSI_FILE_OUTPUT, + 0); + + struct x86_reg scale = x86_make_disp(cp->machine_EDX, + Offset(struct aos_machine, scale)); + + struct x86_reg translate = x86_make_disp(cp->machine_EDX, + Offset(struct aos_machine, translate)); + + + + emit_pshufd(cp, tmp, pos, SHUF(W, W, W, W)); + sse2_rcpss(cp->func, tmp, tmp); + sse_shufps(cp->func, tmp, tmp, SHUF(X, X, X, X)); + + sse_mulps(cp->func, pos, scale); + sse_mulps(cp->func, pos, tmp); + sse_addps(cp->func, pos, translate); + + /* Set pos[3] = w + */ + mask_write(cp, pos, tmp, TGSI_WRITEMASK_W); + + aos_adopt_xmm_reg( cp, + pos, + TGSI_FILE_OUTPUT, + 0, + TRUE ); + return TRUE; +} + + static boolean note_immediate( struct aos_compilation *cp, struct tgsi_full_immediate *imm ) { @@ -1623,7 +1663,10 @@ static boolean build_vertex_program( struct draw_vs_varient_aos_sse *varient, goto fail; if (cp.vaos->base.key.viewport) { - emit_viewport(&cp); + if (0) + emit_viewport(&cp); + else + emit_rhw_viewport(&cp); } /* Emit output... 
TODO: do this eagerly after the last write to a -- cgit v1.2.3 From 260001430bbd28ea17201f1980ab1ebed93b246f Mon Sep 17 00:00:00 2001 From: Keith Whitwell Date: Thu, 22 May 2008 15:24:02 +0100 Subject: draw: use aligned movs within draw_vs_aos.c --- src/gallium/auxiliary/draw/draw_vs_aos.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'src/gallium/auxiliary/draw/draw_vs_aos.c') diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c index 40de13a98c..039e233fe8 100644 --- a/src/gallium/auxiliary/draw/draw_vs_aos.c +++ b/src/gallium/auxiliary/draw/draw_vs_aos.c @@ -159,7 +159,7 @@ static void spill( struct aos_compilation *cp, unsigned idx ) cp->xmm[idx].idx); assert(cp->xmm[idx].dirty); - sse_movups(cp->func, oldval, x86_make_reg(file_XMM, idx)); + sse_movaps(cp->func, oldval, x86_make_reg(file_XMM, idx)); cp->xmm[idx].dirty = 0; } } @@ -176,7 +176,7 @@ static struct x86_reg get_xmm_clone( struct aos_compilation *cp, { if (!is_xmm_tmp(cp, reg)) { struct x86_reg tmp = aos_get_xmm_reg(cp); - sse_movups(cp->func, tmp, reg); + sse_movaps(cp->func, tmp, reg); reg = tmp; } @@ -335,7 +335,7 @@ static struct x86_reg aos_get_shader_reg_xmm( struct aos_compilation *cp, if (reg.file != file_XMM) { struct x86_reg tmp = aos_get_xmm_reg(cp); - sse_movups(cp->func, tmp, reg); + sse_movaps(cp->func, tmp, reg); aos_adopt_xmm_reg( cp, tmp, file, idx, FALSE ); reg = tmp; } @@ -374,7 +374,7 @@ static void emit_pshufd( struct aos_compilation *cp, } else { if (!eq(dst, arg0)) - sse_movups(cp->func, dst, arg0); + sse_movaps(cp->func, dst, arg0); sse_shufps(cp->func, dst, dst, shuf); } @@ -523,7 +523,7 @@ static struct x86_reg fetch_src( struct aos_compilation *cp, struct x86_reg neg = aos_get_internal(cp, IMM_NEGS); struct x86_reg tmp = aos_get_xmm_reg(cp); - sse_movups(cp->func, tmp, arg0); + sse_movaps(cp->func, tmp, arg0); sse_mulps(cp->func, tmp, neg); sse_maxps(cp->func, dst, arg0); -- cgit v1.2.3 From 
43df4642f1d2f3d2673a1d5e4f5126f5175fb899 Mon Sep 17 00:00:00 2001 From: Keith Whitwell Date: Thu, 22 May 2008 20:21:49 +0100 Subject: draw: tweak x87_emit_ex2 to avoid changing x87 fpu settings --- src/gallium/auxiliary/draw/draw_vs_aos.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'src/gallium/auxiliary/draw/draw_vs_aos.c') diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c index 039e233fe8..93bb4f9bc0 100644 --- a/src/gallium/auxiliary/draw/draw_vs_aos.c +++ b/src/gallium/auxiliary/draw/draw_vs_aos.c @@ -735,16 +735,17 @@ static void x87_emit_ex2( struct aos_compilation *cp ) struct x86_reg st1 = x86_make_reg(file_x87, 1); int stack = cp->func->x87_stack; - set_fpu_round_neg_inf( cp ); +// set_fpu_round_neg_inf( cp ); x87_fld(cp->func, st0); /* a a */ - x87_fld(cp->func, st0); /* a a a */ - x87_fprndint( cp->func ); /* flr(a) a a*/ - x87_fsubp(cp->func, st1); /* frac(a) a */ - x87_f2xm1(cp->func); /* (2^frac(a))-1 a */ - x87_fld1(cp->func); /* 1 (2^frac(a))-1 a */ - x87_faddp(cp->func, st1); /* 2^frac(a) a */ - x87_fscale(cp->func); /* 2^a a */ + x87_fprndint( cp->func ); /* int(a) a*/ + x87_fsubr(cp->func, st1, st0); /* int(a) frc(a) */ + x87_fxch(cp->func, st1); /* frc(a) int(a) */ + x87_f2xm1(cp->func); /* (2^frc(a))-1 int(a) */ + x87_fld1(cp->func); /* 1 (2^frc(a))-1 int(a) */ + x87_faddp(cp->func, st1); /* 2^frac(a) int(a) */ + x87_fscale(cp->func); /* (2^frac(a)*2^int(int(a))) int(a) */ + /* 2^a int(a) */ x87_fstp(cp->func, st1); /* 2^a */ assert( stack == cp->func->x87_stack); -- cgit v1.2.3 From 7106da136069f865747e03c30ca245bc030b241b Mon Sep 17 00:00:00 2001 From: Keith Whitwell Date: Thu, 22 May 2008 20:22:15 +0100 Subject: draw: correct but slow LIT() in aos varient --- src/gallium/auxiliary/draw/draw_vs_aos.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'src/gallium/auxiliary/draw/draw_vs_aos.c') diff --git 
a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c index 93bb4f9bc0..930914f609 100644 --- a/src/gallium/auxiliary/draw/draw_vs_aos.c +++ b/src/gallium/auxiliary/draw/draw_vs_aos.c @@ -1093,10 +1093,12 @@ static boolean emit_LIT( struct aos_compilation *cp, const struct tgsi_full_inst - /* a1' = a1 <= 0 ? 1 : a1; + /* a1' = a1 <= 0 ? 1 : a1; + * + * Note: use 1.0 to avoid passing zero to */ - x87_fldz(cp->func); /* 0 */ - x87_fld1(cp->func); /* 1 0 */ + x87_fldz(cp->func); /* 1 0 */ + x87_fldz(cp->func); /* 1 0 */ x87_fld_src(cp, &op->FullSrcRegisters[0], 1); /* a1 1 0 */ x87_fcomi(cp->func, st2); /* a1 1 0 */ x87_fcmovb(cp->func, st1); /* a1' 1 0 */ @@ -1119,17 +1121,14 @@ static boolean emit_LIT( struct aos_compilation *cp, const struct tgsi_full_inst x87_fld_src(cp, &op->FullSrcRegisters[0], 0); /* a0 0 r2 */ x87_fcomi(cp->func, st1); x87_fcmovb(cp->func, st1); /* a0' 0 r2 */ - x87_fstp(cp->func, st1); /* a0' r2 */ - x87_fxch(cp->func, st1); /* a0' r2 */ x87_fst_or_nop(cp->func, writemask, 1, dst); /* result[1] = a0' */ - x87_fldz(cp->func); /* 0 a0' r2 */ - x87_fcomi(cp->func, st1); /* 0 a0' r2 */ - x87_fcmovnbe(cp->func, st2); /* r2' a0' r2 */ + x87_fcomi(cp->func, st1); /* a0' 0 r2 */ + x87_fcmovnbe(cp->func, st2); /* r2' 0' r2 */ - x87_fstp_or_pop(cp->func, writemask, 2, dst); - x87_fpop(cp->func); + x87_fstp_or_pop(cp->func, writemask, 2, dst); /* 0 r2 */ + x87_fpop(cp->func); /* r2 */ x87_fpop(cp->func); } -- cgit v1.2.3 From 3b41d619a1b7cc8c356c32af777486461ddd7926 Mon Sep 17 00:00:00 2001 From: Keith Whitwell Date: Fri, 23 May 2008 09:14:17 +0100 Subject: draw: faster LIT(), incorrect though --- src/gallium/auxiliary/draw/draw_vs_aos.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'src/gallium/auxiliary/draw/draw_vs_aos.c') diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c index 930914f609..b0c3ac49d2 100644 --- a/src/gallium/auxiliary/draw/draw_vs_aos.c +++ 
b/src/gallium/auxiliary/draw/draw_vs_aos.c @@ -1098,7 +1098,13 @@ static boolean emit_LIT( struct aos_compilation *cp, const struct tgsi_full_inst * Note: use 1.0 to avoid passing zero to */ x87_fldz(cp->func); /* 1 0 */ +#if 1 + x87_fld1(cp->func); /* 1 0 */ +#else + /* Correct but slow due to fp exceptions generated in fyl2x - fix me. + */ x87_fldz(cp->func); /* 1 0 */ +#endif x87_fld_src(cp, &op->FullSrcRegisters[0], 1); /* a1 1 0 */ x87_fcomi(cp->func, st2); /* a1 1 0 */ x87_fcmovb(cp->func, st1); /* a1' 1 0 */ -- cgit v1.2.3 From 6172f1295cf812108d8ceba15a83ba87880360d3 Mon Sep 17 00:00:00 2001 From: Keith Whitwell Date: Sat, 24 May 2008 13:22:29 +0100 Subject: draw: add a debug-print which can be called from inside generated shaders --- src/gallium/auxiliary/draw/draw_vs_aos.c | 67 ++++++++++++++++++++++++++++++++ src/gallium/auxiliary/draw/draw_vs_aos.h | 1 + 2 files changed, 68 insertions(+) (limited to 'src/gallium/auxiliary/draw/draw_vs_aos.c') diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c index b0c3ac49d2..aa119f242e 100644 --- a/src/gallium/auxiliary/draw/draw_vs_aos.c +++ b/src/gallium/auxiliary/draw/draw_vs_aos.c @@ -752,7 +752,63 @@ static void x87_emit_ex2( struct aos_compilation *cp ) } +static void PIPE_CDECL print_reg( const char *msg, + const float *reg ) +{ + debug_printf("%s: %f %f %f %f\n", msg, reg[0], reg[1], reg[2], reg[3]); +} + +static void emit_print( struct aos_compilation *cp, + const char *message, /* must point to a static string! */ + unsigned file, + unsigned idx ) +{ + struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX ); + struct x86_reg arg = get_reg_ptr( cp, file, idx ); + unsigned i; + + /* There shouldn't be anything on the x87 stack. Can add this + * capacity later if need be. + */ + assert(cp->func->x87_stack == 0); + + /* For absolute correctness, need to spill/invalidate all XMM regs + * too. 
We're obviously not concerned about performance on this + * debug path, so here goes: + */ + for (i = 0; i < 8; i++) { + if (cp->xmm[i].dirty) + spill(cp, i); + aos_release_xmm_reg(cp, i); + } + + /* Push caller-save (ie scratch) regs. + */ + x86_cdecl_caller_push_regs( cp->func ); + + + /* Push the arguments: + */ + x86_lea( cp->func, ecx, arg ); + x86_push( cp->func, ecx ); + x86_push_imm32( cp->func, (int)message ); + + /* Call the helper. Could call debug_printf directly, but + * print_reg is a nice place to put a breakpoint if need be. + */ + x86_mov_reg_imm( cp->func, ecx, (int)print_reg ); + x86_call( cp->func, ecx ); + x86_pop( cp->func, ecx ); + x86_pop( cp->func, ecx ); + + /* Pop caller-save regs + */ + x86_cdecl_caller_pop_regs( cp->func ); + + /* Done... + */ +} /** * The traditional instructions. All operate on internal registers @@ -1798,6 +1854,17 @@ static void vaos_set_constants( struct draw_vs_varient *varient, memcpy(vaos->machine->constant, constants, (vaos->base.vs->info.file_max[TGSI_FILE_CONSTANT] + 1) * 4 * sizeof(float)); + +#if 0 + unsigned i; + for (i =0; i < vaos->base.vs->info.file_max[TGSI_FILE_CONSTANT] + 1; i++) + debug_printf("state %d: %f %f %f %f\n", + i, + constants[i][0], + constants[i][1], + constants[i][2], + constants[i][3]); +#endif } diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.h b/src/gallium/auxiliary/draw/draw_vs_aos.h index efdc9a38f4..a0680ec63d 100644 --- a/src/gallium/auxiliary/draw/draw_vs_aos.h +++ b/src/gallium/auxiliary/draw/draw_vs_aos.h @@ -96,6 +96,7 @@ struct aos_compilation { unsigned insn_counter; unsigned num_immediates; + unsigned count; struct { unsigned idx:16; -- cgit v1.2.3 From 86e529ad90411d21bca3d70984b2db202e7a0cd6 Mon Sep 17 00:00:00 2001 From: Keith Whitwell Date: Sat, 24 May 2008 16:31:11 +0100 Subject: draw: use lookup tables to avoid calling pow() in LIT opcode --- src/gallium/auxiliary/draw/draw_vs_aos.c | 251 ++++++++++++++++++++++++++++++- src/gallium/auxiliary/draw/draw_vs_aos.h | 
27 ++++ 2 files changed, 272 insertions(+), 6 deletions(-) (limited to 'src/gallium/auxiliary/draw/draw_vs_aos.c') diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c index aa119f242e..1fbb7088ca 100644 --- a/src/gallium/auxiliary/draw/draw_vs_aos.c +++ b/src/gallium/auxiliary/draw/draw_vs_aos.c @@ -105,8 +105,31 @@ static struct x86_reg get_reg_ptr(struct aos_compilation *cp, #define X87_CW_ROUND_MASK (3<<10) #define X87_CW_INFINITY (1<<12) +static void do_populate_lut( struct shine_tab *tab, + float unclamped_exponent ) +{ + const float epsilon = 1.0F / 256.0F; + float exponent = CLAMP(unclamped_exponent, -(128.0F - epsilon), (128.0F - epsilon)); + unsigned i; + + tab->exponent = unclamped_exponent; /* for later comparison */ + + tab->values[0] = 0; + if (exponent == 0) { + for (i = 1; i < 258; i++) { + tab->values[i] = 1.0; + } + } + else { + for (i = 1; i < 258; i++) { + tab->values[i] = powf((float)i * epsilon, exponent); + } + } +} + static void init_internals( struct aos_machine *machine ) { + unsigned i; float inv = 1.0f/255.0f; float f255 = 255.0f; @@ -141,6 +164,9 @@ static void init_internals( struct aos_machine *machine ) (1<<6) | X87_CW_ROUND_DOWN | X87_CW_PRECISION_DOUBLE_EXT); + + for (i = 0; i < MAX_SHINE_TAB; i++) + do_populate_lut( &machine->shine_tab[i], 1.0f ); } @@ -1132,26 +1158,231 @@ static boolean emit_FRC( struct aos_compilation *cp, const struct tgsi_full_inst return TRUE; } +static PIPE_CDECL void do_lit( struct aos_machine *machine, + float *result, + const float *in, + unsigned count ) +{ + if (in[0] > 0) + { + if (in[1] <= 0.0) + { + result[0] = 1.0F; + result[1] = in[0]; + result[2] = 1.0; + result[3] = 1.0F; + } + else + { + const float epsilon = 1.0F / 256.0F; + float exponent = CLAMP(in[3], -(128.0F - epsilon), (128.0F - epsilon)); + result[0] = 1.0F; + result[1] = in[0]; + result[2] = powf(in[1], exponent); + result[3] = 1.0; + } + } + else + { + result[0] = 1.0F; + result[1] = 0.0; + 
result[2] = 0.0; + result[3] = 1.0F; + } +} + + +static PIPE_CDECL void do_lit_lut( struct aos_machine *machine, + float *result, + const float *in, + unsigned count ) +{ + if (in[0] > 0) + { + if (in[1] <= 0.0) + { + result[0] = 1.0F; + result[1] = in[0]; + result[2] = 1.0; + result[3] = 1.0F; + return; + } + + if (machine->lit_info[count].shine_tab->exponent != in[3]) { + machine->lit_info[count].func = do_lit; + goto no_luck; + } + + if (in[1] <= 1.0) + { + const float *tab = machine->lit_info[count].shine_tab->values; + float f = in[1] * 256; + int k = (int)f; + float frac = f - (float)k; + + result[0] = 1.0F; + result[1] = in[0]; + result[2] = tab[k] + frac*(tab[k+1]-tab[k]); + result[3] = 1.0; + return; + } + + no_luck: + { + const float epsilon = 1.0F / 256.0F; + float exponent = CLAMP(in[3], -(128.0F - epsilon), (128.0F - epsilon)); + result[0] = 1.0F; + result[1] = in[0]; + result[2] = powf(in[1], exponent); + result[3] = 1.0; + } + } + else + { + result[0] = 1.0F; + result[1] = 0.0; + result[2] = 0.0; + result[3] = 1.0F; + } +} + + + +static void PIPE_CDECL populate_lut( struct aos_machine *machine, + float *result, + const float *in, + unsigned count ) +{ + unsigned i, tab; + + /* Search for an existing table for this value. Note that without + * static analysis we don't really know if in[3] will be constant, + * but it usually is... + */ + for (tab = 0; tab < 4; tab++) { + if (machine->shine_tab[tab].exponent == in[3]) { + goto found; + } + } + + for (tab = 0, i = 1; i < 4; i++) { + if (machine->shine_tab[i].last_used < machine->shine_tab[tab].last_used) + tab = i; + } + + if (machine->shine_tab[tab].last_used == machine->now) { + /* No unused tables (this is not a ffvertex program...). 
Just + * call pow each time: + */ + machine->lit_info[count].func = do_lit; + machine->lit_info[count].func( machine, result, in, count ); + return; + } + else { + do_populate_lut( &machine->shine_tab[tab], in[3] ); + } + + found: + machine->shine_tab[tab].last_used = machine->now; + machine->lit_info[count].shine_tab = &machine->shine_tab[tab]; + machine->lit_info[count].func = do_lit_lut; + machine->lit_info[count].func( machine, result, in, count ); +} + + + static boolean emit_LIT( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) { - struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]); + struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX ); unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask; + unsigned lit_count = cp->lit_count++; + struct x86_reg result, arg0; + unsigned i; + +#if 1 + /* For absolute correctness, need to spill/invalidate all XMM regs + * too. + */ + for (i = 0; i < 8; i++) { + if (cp->xmm[i].dirty) + spill(cp, i); + aos_release_xmm_reg(cp, i); + } +#endif + + if (writemask != TGSI_WRITEMASK_XYZW) + result = x86_make_disp(cp->machine_EDX, Offset(struct aos_machine, tmp[0])); + else + result = get_dst_ptr(cp, &op->FullDstRegisters[0]); + + + arg0 = fetch_src( cp, &op->FullSrcRegisters[0] ); + if (arg0.file == file_XMM) { + struct x86_reg tmp = x86_make_disp(cp->machine_EDX, + Offset(struct aos_machine, tmp[1])); + sse_movaps( cp->func, tmp, arg0 ); + arg0 = tmp; + } + + + + /* Push caller-save (ie scratch) regs. 
+ */ + x86_cdecl_caller_push_regs( cp->func ); + + /* Push the arguments: + */ + x86_push_imm32( cp->func, lit_count ); + + x86_lea( cp->func, ecx, arg0 ); + x86_push( cp->func, ecx ); + x86_lea( cp->func, ecx, result ); + x86_push( cp->func, ecx ); + x86_push( cp->func, cp->machine_EDX ); + if (lit_count < MAX_LIT_INFO) { + x86_mov( cp->func, ecx, x86_make_disp( cp->machine_EDX, + Offset(struct aos_machine, lit_info) + + lit_count * sizeof(struct lit_info) + + Offset(struct lit_info, func))); + } + else { + x86_mov_reg_imm( cp->func, ecx, (int)do_lit ); + } + + x86_call( cp->func, ecx ); + + x86_pop( cp->func, ecx ); /* fixme... */ + x86_pop( cp->func, ecx ); + x86_pop( cp->func, ecx ); + x86_pop( cp->func, ecx ); + + x86_cdecl_caller_pop_regs( cp->func ); + + if (writemask != TGSI_WRITEMASK_XYZW) { + store_dest( cp, + &op->FullDstRegisters[0], + get_xmm_clone( cp, result ) ); + } + + return TRUE; +} + + +static boolean emit_inline_LIT( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]); + unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask; if (writemask & TGSI_WRITEMASK_YZ) { struct x86_reg st1 = x86_make_reg(file_x87, 1); struct x86_reg st2 = x86_make_reg(file_x87, 2); - - - /* a1' = a1 <= 0 ? 
1 : a1; - * - * Note: use 1.0 to avoid passing zero to */ x87_fldz(cp->func); /* 1 0 */ #if 1 @@ -1865,6 +2096,14 @@ static void vaos_set_constants( struct draw_vs_varient *varient, constants[i][2], constants[i][3]); #endif + + { + unsigned i; + for (i = 0; i < MAX_LIT_INFO; i++) { + vaos->machine->lit_info[i].func = populate_lut; + vaos->machine->now++; + } + } } diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.h b/src/gallium/auxiliary/draw/draw_vs_aos.h index a0680ec63d..c08c73d4bc 100644 --- a/src/gallium/auxiliary/draw/draw_vs_aos.h +++ b/src/gallium/auxiliary/draw/draw_vs_aos.h @@ -59,6 +59,25 @@ struct x86_function; #define FPU_RND_NEG 1 #define FPU_RND_NEAREST 2 +struct aos_machine; +typedef void PIPE_CDECL (*lit_func)( struct aos_machine *, + float *result, + const float *in, + unsigned count ); +struct shine_tab { + float exponent; + float values[258]; + unsigned last_used; +}; + +struct lit_info { + lit_func func; + struct shine_tab *shine_tab; +}; + +#define MAX_SHINE_TAB 4 +#define MAX_LIT_INFO 16 + /* This is the temporary storage used by all the aos_sse vs varients. * Create one per context and reuse by passing a pointer in at * vs_varient creation?? 
@@ -74,6 +93,13 @@ struct aos_machine { float scale[4]; /* viewport */ float translate[4]; /* viewport */ + float tmp[2][4]; /* scratch space for LIT */ + + struct shine_tab shine_tab[MAX_SHINE_TAB]; + struct lit_info lit_info[MAX_LIT_INFO]; + unsigned now; + + ushort fpu_rnd_nearest; ushort fpu_rnd_neg_inf; ushort fpu_restore; @@ -97,6 +123,7 @@ struct aos_compilation { unsigned insn_counter; unsigned num_immediates; unsigned count; + unsigned lit_count; struct { unsigned idx:16; -- cgit v1.2.3 From caadc8d944c558e1fa9f23c3616d726337a19862 Mon Sep 17 00:00:00 2001 From: Keith Whitwell Date: Sun, 25 May 2008 15:37:47 +0100 Subject: draw: clean up some of the xmm register manipulation function names --- src/gallium/auxiliary/draw/draw_vs_aos.c | 141 +++++++++++++++++-------------- 1 file changed, 78 insertions(+), 63 deletions(-) (limited to 'src/gallium/auxiliary/draw/draw_vs_aos.c') diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c index 1fbb7088ca..17b9442d6b 100644 --- a/src/gallium/auxiliary/draw/draw_vs_aos.c +++ b/src/gallium/auxiliary/draw/draw_vs_aos.c @@ -190,17 +190,26 @@ static void spill( struct aos_compilation *cp, unsigned idx ) } } -static boolean is_xmm_tmp( struct aos_compilation *cp, - struct x86_reg reg ) + +static struct x86_reg get_xmm_writable( struct aos_compilation *cp, + struct x86_reg reg ) { - return (reg.file == file_XMM && - cp->xmm[reg.idx].file == TGSI_FILE_NULL); + if (reg.file != file_XMM || + cp->xmm[reg.idx].file != TGSI_FILE_NULL) + { + struct x86_reg tmp = aos_get_xmm_reg(cp); + sse_movaps(cp->func, tmp, reg); + reg = tmp; + } + + return reg; } -static struct x86_reg get_xmm_clone( struct aos_compilation *cp, - struct x86_reg reg ) +static struct x86_reg get_xmm( struct aos_compilation *cp, + struct x86_reg reg ) { - if (!is_xmm_tmp(cp, reg)) { + if (reg.file != file_XMM) + { struct x86_reg tmp = aos_get_xmm_reg(cp); sse_movaps(cp->func, tmp, reg); reg = tmp; @@ -210,6 +219,9 @@ 
static struct x86_reg get_xmm_clone( struct aos_compilation *cp, } +/* Allocate an empty xmm register, either as a temporary or later to + * "adopt" as a shader reg. + */ struct x86_reg aos_get_xmm_reg( struct aos_compilation *cp ) { unsigned i; @@ -251,32 +263,11 @@ void aos_release_xmm_reg( struct aos_compilation *cp, cp->xmm[idx].last_used = 0; } -static void invalidate_xmm( struct aos_compilation *cp, - unsigned file, unsigned idx ) -{ - unsigned i; - - /* Invalidate any old copy of this register in XMM0-7. - */ - for (i = 0; i < 8; i++) { - if (cp->xmm[i].file == file && cp->xmm[i].idx == idx) { - - if (cp->xmm[i].dirty) - spill(cp, i); - - aos_release_xmm_reg(cp, i); - break; - } - } - for (; i < 8; i++) { - if (cp->xmm[i].file == file && cp->xmm[i].idx == idx) { - assert(0); - } - } -} - + +/* Mark an xmm reg as holding the current copy of a shader reg. + */ void aos_adopt_xmm_reg( struct aos_compilation *cp, struct x86_reg reg, unsigned file, @@ -290,6 +281,9 @@ void aos_adopt_xmm_reg( struct aos_compilation *cp, return; } + /* If any xmm reg thinks it holds this shader reg, break the + * illusion. + */ for (i = 0; i < 8; i++) { if (cp->xmm[i].file == file && cp->xmm[i].idx == idx) { @@ -304,12 +298,24 @@ void aos_adopt_xmm_reg( struct aos_compilation *cp, } - +/* Return a pointer to the in-memory copy of the reg, making sure it is uptodate. 
+ */ static struct x86_reg aos_get_shader_reg_ptr( struct aos_compilation *cp, unsigned file, unsigned idx ) { - invalidate_xmm( cp, file, idx ); + unsigned i; + + /* Ensure the in-memory copy of this reg is up-to-date + */ + for (i = 0; i < 8; i++) { + if (cp->xmm[i].file == file && + cp->xmm[i].idx == idx && + cp->xmm[i].dirty) { + spill(cp, i); + } + } + return get_reg_ptr( cp, file, idx ); } @@ -320,7 +326,26 @@ static struct x86_reg aos_get_shader_reg_ptr( struct aos_compilation *cp, static struct x86_reg get_dst_ptr( struct aos_compilation *cp, const struct tgsi_full_dst_register *dst ) { - return aos_get_shader_reg_ptr( cp, dst->DstRegister.File, dst->DstRegister.Index ); + unsigned file = dst->DstRegister.File; + unsigned idx = dst->DstRegister.Index; + unsigned i; + + + /* Ensure in-memory copy of this reg is up-to-date and invalidate + * any xmm copies. + */ + for (i = 0; i < 8; i++) { + if (cp->xmm[i].file == file && + cp->xmm[i].idx == idx) + { + if (cp->xmm[i].dirty) + spill(cp, i); + + aos_release_xmm_reg(cp, i); + } + } + + return get_reg_ptr( cp, file, idx ); } @@ -358,15 +383,7 @@ static struct x86_reg aos_get_shader_reg_xmm( struct aos_compilation *cp, unsigned idx ) { struct x86_reg reg = aos_get_shader_reg( cp, file, idx ); - - if (reg.file != file_XMM) { - struct x86_reg tmp = aos_get_xmm_reg(cp); - sse_movaps(cp->func, tmp, reg); - aos_adopt_xmm_reg( cp, tmp, file, idx, FALSE ); - reg = tmp; - } - - return reg; + return get_xmm( cp, reg ); } @@ -634,7 +651,7 @@ static void store_dest( struct aos_compilation *cp, case TGSI_WRITEMASK_XYZW: aos_adopt_xmm_reg(cp, - get_xmm_clone(cp, result), + get_xmm_writable(cp, result), reg->DstRegister.File, reg->DstRegister.Index, TRUE); @@ -649,15 +666,15 @@ static void store_dest( struct aos_compilation *cp, switch (reg->DstRegister.WriteMask) { case TGSI_WRITEMASK_X: - sse_movss(cp->func, dst, get_xmm_clone(cp, result)); + sse_movss(cp->func, dst, get_xmm(cp, result)); break; case TGSI_WRITEMASK_XY: - 
sse_shufps(cp->func, dst, get_xmm_clone(cp, result), SHUF(X, Y, Z, W)); + sse_shufps(cp->func, dst, get_xmm(cp, result), SHUF(X, Y, Z, W)); break; case TGSI_WRITEMASK_ZW: - result = get_xmm_clone(cp, result); + result = get_xmm_writable(cp, result); sse_shufps(cp->func, result, dst, SHUF(X, Y, Z, W)); dst = result; break; @@ -845,7 +862,7 @@ static boolean emit_ABS( struct aos_compilation *cp, const struct tgsi_full_inst { struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); struct x86_reg neg = aos_get_internal(cp, IMM_NEGS); - struct x86_reg dst = get_xmm_clone(cp, arg0); + struct x86_reg dst = get_xmm_writable(cp, arg0); sse_mulps(cp->func, dst, neg); sse_maxps(cp->func, dst, arg0); @@ -858,7 +875,7 @@ static boolean emit_ADD( struct aos_compilation *cp, const struct tgsi_full_inst { struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); - struct x86_reg dst = get_xmm_clone(cp, arg0); + struct x86_reg dst = get_xmm_writable(cp, arg0); sse_addps(cp->func, dst, arg1); @@ -882,10 +899,9 @@ static boolean emit_DP3( struct aos_compilation *cp, const struct tgsi_full_inst struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); struct x86_reg tmp = aos_get_xmm_reg(cp); - struct x86_reg dst = get_xmm_clone(cp, arg0); + struct x86_reg dst = get_xmm_writable(cp, arg0); sse_mulps(cp->func, dst, arg1); - /* Now the hard bit: sum the first 3 values: */ sse_movhlps(cp->func, tmp, dst); @@ -908,7 +924,7 @@ static boolean emit_DP4( struct aos_compilation *cp, const struct tgsi_full_inst struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); struct x86_reg tmp = aos_get_xmm_reg(cp); - struct x86_reg dst = get_xmm_clone(cp, arg0); + struct x86_reg dst = get_xmm_writable(cp, arg0); sse_mulps(cp->func, dst, arg1); @@ -932,7 +948,7 @@ static boolean emit_DPH( 
struct aos_compilation *cp, const struct tgsi_full_inst struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); struct x86_reg tmp = aos_get_xmm_reg(cp); - struct x86_reg dst = get_xmm_clone(cp, arg0); + struct x86_reg dst = get_xmm_writable(cp, arg0); sse_mulps(cp->func, dst, arg1); @@ -1366,7 +1382,7 @@ static boolean emit_LIT( struct aos_compilation *cp, const struct tgsi_full_inst if (writemask != TGSI_WRITEMASK_XYZW) { store_dest( cp, &op->FullDstRegisters[0], - get_xmm_clone( cp, result ) ); + get_xmm_writable( cp, result ) ); } return TRUE; @@ -1440,7 +1456,7 @@ static boolean emit_MAX( struct aos_compilation *cp, const struct tgsi_full_inst { struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); - struct x86_reg dst = get_xmm_clone(cp, arg0); + struct x86_reg dst = get_xmm_writable(cp, arg0); sse_maxps(cp->func, dst, arg1); @@ -1453,7 +1469,7 @@ static boolean emit_MIN( struct aos_compilation *cp, const struct tgsi_full_inst { struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); - struct x86_reg dst = get_xmm_clone(cp, arg0); + struct x86_reg dst = get_xmm_writable(cp, arg0); sse_minps(cp->func, dst, arg1); @@ -1464,7 +1480,7 @@ static boolean emit_MIN( struct aos_compilation *cp, const struct tgsi_full_inst static boolean emit_MOV( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) { struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); - struct x86_reg dst = get_xmm_clone(cp, arg0); + struct x86_reg dst = get_xmm_writable(cp, arg0); /* potentially nothing to do */ @@ -1476,7 +1492,7 @@ static boolean emit_MUL( struct aos_compilation *cp, const struct tgsi_full_inst { struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); - struct x86_reg dst = 
get_xmm_clone(cp, arg0); + struct x86_reg dst = get_xmm_writable(cp, arg0); sse_mulps(cp->func, dst, arg1); @@ -1494,7 +1510,7 @@ static boolean emit_MAD( struct aos_compilation *cp, const struct tgsi_full_inst /* If we can't clobber old contents of arg0, get a temporary & copy * it there, then clobber it... */ - arg0 = get_xmm_clone(cp, arg0); + arg0 = get_xmm_writable(cp, arg0); sse_mulps(cp->func, arg0, arg1); sse_addps(cp->func, arg0, arg2); @@ -1562,7 +1578,7 @@ static boolean emit_SGE( struct aos_compilation *cp, const struct tgsi_full_inst struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); struct x86_reg ones = aos_get_internal(cp, IMM_ONES); - struct x86_reg dst = get_xmm_clone(cp, arg0); + struct x86_reg dst = get_xmm_writable(cp, arg0); sse_cmpps(cp->func, dst, arg1, cc_NotLessThan); sse_andps(cp->func, dst, ones); @@ -1586,7 +1602,7 @@ static boolean emit_SLT( struct aos_compilation *cp, const struct tgsi_full_inst struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); struct x86_reg ones = aos_get_internal(cp, IMM_ONES); - struct x86_reg dst = get_xmm_clone(cp, arg0); + struct x86_reg dst = get_xmm_writable(cp, arg0); sse_cmpps(cp->func, dst, arg1, cc_LessThan); sse_andps(cp->func, dst, ones); @@ -1599,7 +1615,7 @@ static boolean emit_SUB( struct aos_compilation *cp, const struct tgsi_full_inst { struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); - struct x86_reg dst = get_xmm_clone(cp, arg0); + struct x86_reg dst = get_xmm_writable(cp, arg0); sse_subps(cp->func, dst, arg1); @@ -1989,7 +2005,6 @@ static boolean build_vertex_program( struct draw_vs_varient_aos_sse *varient, /* decr count, loop if not zero */ x86_dec(cp.func, cp.count_ESI); -/* x86_test(cp.func, cp.count_ESI, cp.count_ESI); */ x86_jcc(cp.func, cc_NZ, label); 
restore_fpu_state(&cp); -- cgit v1.2.3 From ce331e3a5e2a0505e01637861bdd7f5e6cfbd041 Mon Sep 17 00:00:00 2001 From: Keith Whitwell Date: Sun, 25 May 2008 15:44:17 +0100 Subject: draw: special case for writing out scalar results --- src/gallium/auxiliary/draw/draw_vs_aos.c | 127 +++++++++++++++++++++++++------ 1 file changed, 102 insertions(+), 25 deletions(-) (limited to 'src/gallium/auxiliary/draw/draw_vs_aos.c') diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c index 17b9442d6b..aebc230858 100644 --- a/src/gallium/auxiliary/draw/draw_vs_aos.c +++ b/src/gallium/auxiliary/draw/draw_vs_aos.c @@ -697,6 +697,72 @@ static void store_dest( struct aos_compilation *cp, } +static void inject_scalar( struct aos_compilation *cp, + struct x86_reg dst, + struct x86_reg result, + unsigned swizzle ) +{ + sse_shufps(cp->func, dst, dst, swizzle); + sse_movss(cp->func, dst, result); + sse_shufps(cp->func, dst, dst, swizzle); +} + + +static void store_scalar_dest( struct aos_compilation *cp, + const struct tgsi_full_dst_register *reg, + struct x86_reg result ) +{ + unsigned writemask = reg->DstRegister.WriteMask; + struct x86_reg dst; + + if (writemask != TGSI_WRITEMASK_X && + writemask != TGSI_WRITEMASK_Y && + writemask != TGSI_WRITEMASK_Z && + writemask != TGSI_WRITEMASK_W && + writemask != 0) + { + result = get_xmm_writable(cp, result); /* already true, right? 
*/ + sse_shufps(cp->func, result, result, SHUF(X,X,X,X)); + store_dest(cp, reg, result); + return; + } + + result = get_xmm(cp, result); + dst = aos_get_shader_reg_xmm(cp, + reg->DstRegister.File, + reg->DstRegister.Index); + + + + switch (reg->DstRegister.WriteMask) { + case TGSI_WRITEMASK_X: + sse_movss(cp->func, dst, result); + break; + + case TGSI_WRITEMASK_Y: + inject_scalar(cp, dst, result, SHUF(Y, X, Z, W)); + break; + + case TGSI_WRITEMASK_Z: + inject_scalar(cp, dst, result, SHUF(Z, Y, X, W)); + break; + + case TGSI_WRITEMASK_W: + inject_scalar(cp, dst, result, SHUF(W, Y, Z, X)); + break; + + default: + break; + } + + aos_adopt_xmm_reg(cp, + dst, + reg->DstRegister.File, + reg->DstRegister.Index, + TRUE); +} + + static void x87_fst_or_nop( struct x86_function *func, unsigned writemask, @@ -909,11 +975,8 @@ static boolean emit_DP3( struct aos_compilation *cp, const struct tgsi_full_inst emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z)); sse_addss(cp->func, dst, tmp); - if (op->FullDstRegisters[0].DstRegister.WriteMask != 0x1) - sse_shufps(cp->func, dst, dst, SHUF(X, X, X, X)); - aos_release_xmm_reg(cp, tmp.idx); - store_dest(cp, &op->FullDstRegisters[0], dst); + store_scalar_dest(cp, &op->FullDstRegisters[0], dst); return TRUE; } @@ -935,11 +998,8 @@ static boolean emit_DP4( struct aos_compilation *cp, const struct tgsi_full_inst emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z)); sse_addss(cp->func, dst, tmp); - if (op->FullDstRegisters[0].DstRegister.WriteMask != 0x1) - sse_shufps(cp->func, dst, dst, SHUF(X, X, X, X)); - aos_release_xmm_reg(cp, tmp.idx); - store_dest(cp, &op->FullDstRegisters[0], dst); + store_scalar_dest(cp, &op->FullDstRegisters[0], dst); return TRUE; } @@ -961,11 +1021,8 @@ static boolean emit_DPH( struct aos_compilation *cp, const struct tgsi_full_inst emit_pshufd(cp, tmp, arg1, SHUF(W,W,W,W)); sse_addss(cp->func, dst, tmp); - if (op->FullDstRegisters[0].DstRegister.WriteMask != 0x1) - sse_shufps(cp->func, dst, dst, SHUF(X, X, X, X)); - 
aos_release_xmm_reg(cp, tmp.idx); - store_dest(cp, &op->FullDstRegisters[0], dst); + store_scalar_dest(cp, &op->FullDstRegisters[0], dst); return TRUE; } @@ -1518,7 +1575,9 @@ static boolean emit_MAD( struct aos_compilation *cp, const struct tgsi_full_inst return TRUE; } - +/* Really not sufficient -- need to check for conditions that could + * generate inf/nan values, which will slow things down hugely. + */ static boolean emit_POW( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) { x87_fld_src(cp, &op->FullSrcRegisters[1], 0); /* a1.x */ @@ -1548,27 +1607,45 @@ static boolean emit_RCP( struct aos_compilation *cp, const struct tgsi_full_inst sse_divss(cp->func, dst, arg0); } - if (op->FullDstRegisters[0].DstRegister.WriteMask != 0x1) - sse_shufps(cp->func, dst, dst, SHUF(X, X, X, X)); - - store_dest(cp, &op->FullDstRegisters[0], dst); + store_scalar_dest(cp, &op->FullDstRegisters[0], dst); return TRUE; } + +/* Although rsqrtps() and rcpps() are low precision on some/all SSE + * implementations, it is possible to improve its precision at + * fairly low cost, using a newton/raphson step, as below: + * + * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a) + * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)] + * + * See: http://softwarecommunity.intel.com/articles/eng/1818.htm + */ static boolean emit_RSQ( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) { struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); struct x86_reg dst = aos_get_xmm_reg(cp); - sse_rsqrtss(cp->func, dst, arg0); - - /* Extend precision here... - */ - - if (op->FullDstRegisters[0].DstRegister.WriteMask != 0x1) - sse_shufps(cp->func, dst, dst, SHUF(X, X, X, X)); + if (1) { + sse_rsqrtss(cp->func, dst, arg0); + } + else { +#if 0 + /* Extend precision here... 
+ */ + sse_movaps( func, dst, get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) ); + sse_movaps( func, tmp0, get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) ); + + sse_rsqrtss( func, tmp1, src ); /* rsqrtss(a) */ + sse_mulss( func, src, tmp1 ); /* a * rsqrtss(a) */ + sse_mulss( func, dst, tmp1 ); /* .5 * rsqrtss(a) */ + sse_mulss( func, src, tmp1 ); /* a * rsqrtss(a) * rsqrtss(a) */ + sse_subss( func, tmp0, src ); /* 3.0 - (a * rsqrtss(a) * rsqrtss(a)) */ + sse_mulss( func, dst, tmp0 ); /* .5 * r * (3.0 - (a * r * r)) */ +#endif + } - store_dest(cp, &op->FullDstRegisters[0], dst); + store_scalar_dest(cp, &op->FullDstRegisters[0], dst); return TRUE; } -- cgit v1.2.3 From 3afb7198e01516dba38bb3248d4c0161e54650fe Mon Sep 17 00:00:00 2001 From: Keith Whitwell Date: Sun, 25 May 2008 15:45:27 +0100 Subject: draw: remove EXP & LOG from vs_aos.c These don't get hit & look like bug magnets to me... --- src/gallium/auxiliary/draw/draw_vs_aos.c | 85 ++------------------------------ 1 file changed, 4 insertions(+), 81 deletions(-) (limited to 'src/gallium/auxiliary/draw/draw_vs_aos.c') diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c index aebc230858..34dc09ead7 100644 --- a/src/gallium/auxiliary/draw/draw_vs_aos.c +++ b/src/gallium/auxiliary/draw/draw_vs_aos.c @@ -1066,85 +1066,6 @@ static boolean emit_EX2( struct aos_compilation *cp, const struct tgsi_full_inst return TRUE; } -static boolean emit_EXP( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) -{ - struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]); - struct x86_reg st0 = x86_make_reg(file_x87, 0); - struct x86_reg st1 = x86_make_reg(file_x87, 1); - struct x86_reg st3 = x86_make_reg(file_x87, 3); - unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask; - - /* CAUTION: dst may alias arg0! 
- */ - x87_fld_src(cp, &op->FullSrcRegisters[0], 0); /* arg0.x */ - x87_fld(cp->func, st0); /* arg arg */ - - /* by default, fpu is setup to round-to-nearest. We want to - * change this now, and track the state through to the end of the - * generated function so that it isn't repeated unnecessarily. - * Alternately, could subtract .5 to get round to -inf behaviour. - */ - set_fpu_round_neg_inf( cp ); - x87_fprndint( cp->func ); /* flr(a) a */ - x87_fld(cp->func, st0); /* flr(a) flr(a) a */ - x87_fld1(cp->func); /* 1 floor(a) floor(a) a */ - x87_fst_or_nop(cp->func, writemask, 3, dst); /* stack unchanged */ - - x87_fscale(cp->func); /* 2^floor(a) floor(a) a */ - x87_fst(cp->func, st3); /* 2^floor(a) floor(a) a 2^floor(a)*/ - - x87_fstp_or_pop(cp->func, writemask, 0, dst); /* flr(a) a 2^flr(a) */ - - x87_fsubp(cp->func, st1); /* frac(a) 2^flr(a) */ - - x87_fst_or_nop(cp->func, writemask, 1, dst); /* frac(a) 2^flr(a) */ - - x87_f2xm1(cp->func); /* (2^frac(a))-1 2^flr(a)*/ - x87_fld1(cp->func); /* 1 (2^frac(a))-1 2^flr(a)*/ - x87_faddp(cp->func, st1); /* 2^frac(a) 2^flr(a) */ - x87_fmulp(cp->func, st1); /* 2^a */ - - x87_fstp_or_pop(cp->func, writemask, 2, dst); - -/* dst[0] = 2^floor(tmp); */ -/* dst[1] = frac(tmp); */ -/* dst[2] = 2^floor(tmp) * 2^frac(tmp); */ -/* dst[3] = 1.0F; */ - return TRUE; -} - -static boolean emit_LOG( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) -{ - struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]); - struct x86_reg st0 = x86_make_reg(file_x87, 0); - struct x86_reg st1 = x86_make_reg(file_x87, 1); - struct x86_reg st2 = x86_make_reg(file_x87, 2); - unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask; - - /* CAUTION: dst may alias arg0! 
- */ - x87_fld_src(cp, &op->FullSrcRegisters[0], 0); /* arg0.x */ - x87_fabs(cp->func); /* |arg0.x| */ - x87_fxtract(cp->func); /* mantissa(arg0.x), exponent(arg0.x) */ - x87_fst(cp->func, st2); /* mantissa, exponent, mantissa */ - x87_fld1(cp->func); /* 1, mantissa, exponent, mantissa */ - x87_fyl2x(cp->func); /* log2(mantissa), exponent, mantissa */ - x87_fadd(cp->func, st0, st1); /* e+l2(m), e, m */ - - x87_fstp_or_pop(cp->func, writemask, 2, dst); /* e, m */ - - x87_fld1(cp->func); /* 1, e, m */ - x87_fsub(cp->func, st1, st0); /* 1, e-1, m */ - - x87_fstp_or_pop(cp->func, writemask, 3, dst); /* e-1,m */ - x87_fstp_or_pop(cp->func, writemask, 0, dst); /* m */ - - x87_fadd(cp->func, st0, st0); /* 2m */ - - x87_fstp_or_pop( cp->func, writemask, 1, dst ); - - return TRUE; -} static boolean emit_FLR( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) { @@ -1755,10 +1676,12 @@ emit_instruction( struct aos_compilation *cp, return emit_RSQ(cp, inst); case TGSI_OPCODE_EXP: - return emit_EXP(cp, inst); + /*return emit_EXP(cp, inst);*/ + return FALSE; case TGSI_OPCODE_LOG: - return emit_LOG(cp, inst); + /*return emit_LOG(cp, inst);*/ + return FALSE; case TGSI_OPCODE_MUL: return emit_MUL(cp, inst); -- cgit v1.2.3 From 9c7568965c00dcc2e9403a2f94f1cd09dcd783ae Mon Sep 17 00:00:00 2001 From: Keith Whitwell Date: Sun, 25 May 2008 15:47:04 +0100 Subject: draw: slight tweak for XPD opcode --- src/gallium/auxiliary/draw/draw_vs_aos.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) (limited to 'src/gallium/auxiliary/draw/draw_vs_aos.c') diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c index 34dc09ead7..37d04e45a6 100644 --- a/src/gallium/auxiliary/draw/draw_vs_aos.c +++ b/src/gallium/auxiliary/draw/draw_vs_aos.c @@ -1626,31 +1626,24 @@ static boolean emit_XPD( struct aos_compilation *cp, const struct tgsi_full_inst { struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); struct 
x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); - struct x86_reg dst = aos_get_xmm_reg(cp); struct x86_reg tmp0 = aos_get_xmm_reg(cp); struct x86_reg tmp1 = aos_get_xmm_reg(cp); - /* Could avoid tmp0, tmp1 if we overwrote arg0, arg1. Need a way - * to invalidate registers. This will come with better analysis - * (liveness analysis) of the incoming program. - */ - emit_pshufd(cp, dst, arg0, SHUF(Y, Z, X, W)); - emit_pshufd(cp, tmp1, arg1, SHUF(Z, X, Y, W)); - sse_mulps(cp->func, dst, tmp1); - emit_pshufd(cp, tmp0, arg0, SHUF(Z, X, Y, W)); emit_pshufd(cp, tmp1, arg1, SHUF(Y, Z, X, W)); - sse_mulps(cp->func, tmp0, tmp1); - sse_subps(cp->func, dst, tmp0); + sse_mulps(cp->func, tmp1, arg0); + emit_pshufd(cp, tmp0, arg0, SHUF(Y, Z, X, W)); + sse_mulps(cp->func, tmp0, arg1); + sse_subps(cp->func, tmp1, tmp0); + sse_shufps(cp->func, tmp1, tmp1, SHUF(Y, Z, X, W)); +/* dst[2] = arg0[0] * arg1[1] - arg0[1] * arg1[0]; */ /* dst[0] = arg0[1] * arg1[2] - arg0[2] * arg1[1]; */ /* dst[1] = arg0[2] * arg1[0] - arg0[0] * arg1[2]; */ -/* dst[2] = arg0[0] * arg1[1] - arg0[1] * arg1[0]; */ /* dst[3] is undef */ aos_release_xmm_reg(cp, tmp0.idx); - aos_release_xmm_reg(cp, tmp1.idx); - store_dest(cp, &op->FullDstRegisters[0], dst); + store_dest(cp, &op->FullDstRegisters[0], tmp1); return TRUE; } -- cgit v1.2.3 From 721fb5597e687fc1446119002ab03cc428104b29 Mon Sep 17 00:00:00 2001 From: Keith Whitwell Date: Mon, 26 May 2008 00:09:02 +0100 Subject: draw: more aos tweaks --- src/gallium/auxiliary/draw/draw_vs_aos.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) (limited to 'src/gallium/auxiliary/draw/draw_vs_aos.c') diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c index 37d04e45a6..916203c66b 100644 --- a/src/gallium/auxiliary/draw/draw_vs_aos.c +++ b/src/gallium/auxiliary/draw/draw_vs_aos.c @@ -202,6 +202,7 @@ static struct x86_reg get_xmm_writable( struct aos_compilation *cp, reg = tmp; } + 
cp->xmm[reg.idx].last_used = cp->insn_counter; return reg; } @@ -215,6 +216,7 @@ static struct x86_reg get_xmm( struct aos_compilation *cp, reg = tmp; } + cp->xmm[reg.idx].last_used = cp->insn_counter; return reg; } @@ -281,6 +283,18 @@ void aos_adopt_xmm_reg( struct aos_compilation *cp, return; } + /* If this xmm reg is already holding this shader reg, just update + * last_used, and don't clobber the dirty flag... + */ + if (cp->xmm[reg.idx].file == file && + cp->xmm[reg.idx].idx == idx) + { + cp->xmm[reg.idx].dirty |= dirty; + cp->xmm[reg.idx].last_used = cp->insn_counter; + return; + } + + /* If any xmm reg thinks it holds this shader reg, break the * illusion. */ @@ -382,8 +396,16 @@ static struct x86_reg aos_get_shader_reg_xmm( struct aos_compilation *cp, unsigned file, unsigned idx ) { - struct x86_reg reg = aos_get_shader_reg( cp, file, idx ); - return get_xmm( cp, reg ); + struct x86_reg reg = get_xmm( cp, + aos_get_shader_reg( cp, file, idx ) ); + + aos_adopt_xmm_reg( cp, + reg, + file, + idx, + FALSE ); + + return reg; } -- cgit v1.2.3 From 351eca365c0ba488000c3826d5093de6170381e4 Mon Sep 17 00:00:00 2001 From: Keith Whitwell Date: Mon, 26 May 2008 11:03:00 +0100 Subject: draw: extend precision in RSQ opcode --- src/gallium/auxiliary/draw/draw_vs_aos.c | 48 ++++++++++++++++++-------------- src/gallium/auxiliary/draw/draw_vs_aos.h | 1 + 2 files changed, 28 insertions(+), 21 deletions(-) (limited to 'src/gallium/auxiliary/draw/draw_vs_aos.c') diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c index 916203c66b..1622358ae1 100644 --- a/src/gallium/auxiliary/draw/draw_vs_aos.c +++ b/src/gallium/auxiliary/draw/draw_vs_aos.c @@ -140,7 +140,8 @@ static void init_internals( struct aos_machine *machine ) ASSIGN_4V(machine->internal[IMM_NEGS], -1.0f, -1.0f, -1.0f, -1.0f); ASSIGN_4V(machine->internal[IMM_IDENTITY], 0.0f, 0.0f, 0.0f, 1.0f); ASSIGN_4V(machine->internal[IMM_INV_255], inv, inv, inv, inv); - 
ASSIGN_4V(machine->internal[IMM_255], f255, f255, f255, f255); + ASSIGN_4V(machine->internal[IMM_255], f255, f255, f255, f255); + ASSIGN_4V(machine->internal[IMM_RSQ], -.5f, 1.5f, 0.0f, 0.0f); machine->fpu_rnd_nearest = (X87_CW_EXCEPTION_INV_OP | @@ -1561,35 +1562,40 @@ static boolean emit_RCP( struct aos_compilation *cp, const struct tgsi_full_inst * * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a) * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)] + * or: + * x1 = rsqrtps(a) * [1.5 - .5 * a * rsqrtps(a) * rsqrtps(a)] + * * * See: http://softwarecommunity.intel.com/articles/eng/1818.htm */ static boolean emit_RSQ( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) { - struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); - struct x86_reg dst = aos_get_xmm_reg(cp); - if (1) { - sse_rsqrtss(cp->func, dst, arg0); + if (0) { + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg r = aos_get_xmm_reg(cp); + sse_rsqrtss(cp->func, r, arg0); + store_scalar_dest(cp, &op->FullDstRegisters[0], r); + return TRUE; } else { -#if 0 - /* Extend precision here... 
- */ - sse_movaps( func, dst, get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) ); - sse_movaps( func, tmp0, get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) ); - - sse_rsqrtss( func, tmp1, src ); /* rsqrtss(a) */ - sse_mulss( func, src, tmp1 ); /* a * rsqrtss(a) */ - sse_mulss( func, dst, tmp1 ); /* .5 * rsqrtss(a) */ - sse_mulss( func, src, tmp1 ); /* a * rsqrtss(a) * rsqrtss(a) */ - sse_subss( func, tmp0, src ); /* 3.0 - (a * rsqrtss(a) * rsqrtss(a)) */ - sse_mulss( func, dst, tmp0 ); /* .5 * r * (3.0 - (a * r * r)) */ -#endif - } + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg r = aos_get_xmm_reg(cp); - store_scalar_dest(cp, &op->FullDstRegisters[0], dst); - return TRUE; + struct x86_reg neg_half = get_reg_ptr( cp, AOS_FILE_INTERNAL, IMM_RSQ ); + struct x86_reg one_point_five = x86_make_disp( neg_half, 4 ); + struct x86_reg src = get_xmm_writable( cp, arg0 ); + + sse_rsqrtss( cp->func, r, src ); /* rsqrtss(a) */ + sse_mulss( cp->func, src, neg_half ); /* -.5 * a */ + sse_mulss( cp->func, src, r ); /* -.5 * a * r */ + sse_mulss( cp->func, src, r ); /* -.5 * a * r * r */ + sse_addss( cp->func, src, one_point_five ); /* 1.5 - .5 * a * r * r */ + sse_mulss( cp->func, r, src ); /* r * (1.5 - .5 * a * r * r) */ + + store_scalar_dest(cp, &op->FullDstRegisters[0], r); + return TRUE; + } } diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.h b/src/gallium/auxiliary/draw/draw_vs_aos.h index c08c73d4bc..fffe2e4658 100644 --- a/src/gallium/auxiliary/draw/draw_vs_aos.h +++ b/src/gallium/auxiliary/draw/draw_vs_aos.h @@ -175,6 +175,7 @@ boolean aos_emit_outputs( struct aos_compilation *cp ); #define IMM_INV_255 3 /* 1/255, 1/255, 1/255, 1/255 */ #define IMM_255 4 /* 255, 255, 255, 255 */ #define IMM_NEGS 5 /* -1,-1,-1,-1 */ +#define IMM_RSQ 6 /* -.5,1.5,_,_ */ struct x86_reg aos_get_internal( struct aos_compilation *cp, unsigned imm ); -- cgit v1.2.3 From 4e2567f0ab6afd701bea4c35e388663e90f5cb6c Mon Sep 17 00:00:00 2001 
From: Keith Whitwell Date: Tue, 27 May 2008 10:42:58 +0100 Subject: draw: some possible fixes for spilling --- src/gallium/auxiliary/draw/draw_vs_aos.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) (limited to 'src/gallium/auxiliary/draw/draw_vs_aos.c') diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c index 1622358ae1..99630e4f75 100644 --- a/src/gallium/auxiliary/draw/draw_vs_aos.c +++ b/src/gallium/auxiliary/draw/draw_vs_aos.c @@ -253,6 +253,7 @@ struct x86_reg aos_get_xmm_reg( struct aos_compilation *cp ) cp->xmm[oldest].file = TGSI_FILE_NULL; cp->xmm[oldest].idx = 0; + cp->xmm[oldest].dirty = 0; cp->xmm[oldest].last_used = cp->insn_counter; return x86_make_reg(file_XMM, oldest); } @@ -284,24 +285,18 @@ void aos_adopt_xmm_reg( struct aos_compilation *cp, return; } - /* If this xmm reg is already holding this shader reg, just update - * last_used, and don't clobber the dirty flag... - */ - if (cp->xmm[reg.idx].file == file && - cp->xmm[reg.idx].idx == idx) - { - cp->xmm[reg.idx].dirty |= dirty; - cp->xmm[reg.idx].last_used = cp->insn_counter; - return; - } - /* If any xmm reg thinks it holds this shader reg, break the * illusion. */ for (i = 0; i < 8; i++) { if (cp->xmm[i].file == file && - cp->xmm[i].idx == idx) { + cp->xmm[i].idx == idx) + { + /* If an xmm reg is already holding this shader reg, take into account its + * dirty flag... 
+ */ + dirty |= cp->xmm[i].dirty; aos_release_xmm_reg(cp, i); } } @@ -1989,6 +1984,17 @@ static boolean build_vertex_program( struct draw_vs_varient_aos_sse *varient, debug_printf("\n"); } + + { + unsigned i; + for (i = 0; i < 8; i++) { + if (cp.xmm[i].file != TGSI_FILE_OUTPUT) { + cp.xmm[i].file = TGSI_FILE_NULL; + cp.xmm[i].dirty = 0; + } + } + } + if (cp.error) goto fail; -- cgit v1.2.3 From 5dc44184fa9f07465b7ff2be94394c55392ce5e9 Mon Sep 17 00:00:00 2001 From: Keith Whitwell Date: Tue, 27 May 2008 11:10:50 +0100 Subject: draw: fix writemask/shufps confusion --- src/gallium/auxiliary/draw/draw_vs_aos.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'src/gallium/auxiliary/draw/draw_vs_aos.c') diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c index 99630e4f75..434bd2a9ab 100644 --- a/src/gallium/auxiliary/draw/draw_vs_aos.c +++ b/src/gallium/auxiliary/draw/draw_vs_aos.c @@ -687,17 +687,18 @@ static void store_dest( struct aos_compilation *cp, sse_movss(cp->func, dst, get_xmm(cp, result)); break; - case TGSI_WRITEMASK_XY: + case TGSI_WRITEMASK_ZW: sse_shufps(cp->func, dst, get_xmm(cp, result), SHUF(X, Y, Z, W)); break; - case TGSI_WRITEMASK_ZW: + case TGSI_WRITEMASK_XY: result = get_xmm_writable(cp, result); sse_shufps(cp->func, result, dst, SHUF(X, Y, Z, W)); dst = result; break; case TGSI_WRITEMASK_YZW: + result = get_xmm_writable(cp, result); sse_movss(cp->func, result, dst); dst = result; break; @@ -891,7 +892,7 @@ static void emit_print( struct aos_compilation *cp, unsigned idx ) { struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX ); - struct x86_reg arg = get_reg_ptr( cp, file, idx ); + struct x86_reg arg = aos_get_shader_reg_ptr( cp, file, idx ); unsigned i; /* There shouldn't be anything on the x87 stack. 
Can add this -- cgit v1.2.3 From f7946bc7c0435ab2926cd729dfd8312222a3aa2a Mon Sep 17 00:00:00 2001 From: Keith Whitwell Date: Tue, 27 May 2008 11:15:31 +0100 Subject: draw: dump individual instructions as they are processed --- src/gallium/auxiliary/draw/draw_vs_aos.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) (limited to 'src/gallium/auxiliary/draw/draw_vs_aos.c') diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c index 434bd2a9ab..d3989fe107 100644 --- a/src/gallium/auxiliary/draw/draw_vs_aos.c +++ b/src/gallium/auxiliary/draw/draw_vs_aos.c @@ -42,7 +42,20 @@ #include "rtasm/rtasm_x86sse.h" #ifdef PIPE_ARCH_X86 +#define DISASSEM 0 +static const char *files[] = +{ + "NULL", + "CONST", + "IN", + "OUT", + "TEMP", + "SAMP", + "ADDR", + "IMM", + "INTERNAL", +}; static INLINE boolean eq( struct x86_reg a, struct x86_reg b ) @@ -184,7 +197,11 @@ static void spill( struct aos_compilation *cp, unsigned idx ) struct x86_reg oldval = get_reg_ptr(cp, cp->xmm[idx].file, cp->xmm[idx].idx); - + + if (0) debug_printf("\nspill %s[%d]", + files[cp->xmm[idx].file], + cp->xmm[idx].idx); + assert(cp->xmm[idx].dirty); sse_movaps(cp->func, oldval, x86_make_reg(file_XMM, idx)); cp->xmm[idx].dirty = 0; @@ -1975,6 +1992,9 @@ static boolean build_vertex_program( struct draw_vs_varient_aos_sse *varient, break; case TGSI_TOKEN_TYPE_INSTRUCTION: + if (DISASSEM) + tgsi_dump_instruction( &parse.FullToken.FullInstruction, cp.insn_counter ); + if (!emit_instruction( &cp, &parse.FullToken.FullInstruction )) goto fail; break; @@ -1982,7 +2002,9 @@ static boolean build_vertex_program( struct draw_vs_varient_aos_sse *varient, x87_assert_stack_empty(cp.func); cp.insn_counter++; - debug_printf("\n"); + + if (DISASSEM) + debug_printf("\n"); } -- cgit v1.2.3 From a08c574bfcf72c7f7ffbeb35c10347b491ef87fb Mon Sep 17 00:00:00 2001 From: Keith Whitwell Date: Tue, 27 May 2008 12:26:23 +0100 Subject: draw: hook up viewport / 
rhw emit to varient key state --- .../auxiliary/draw/draw_pt_fetch_shade_emit.c | 4 +- src/gallium/auxiliary/draw/draw_vs_aos.c | 16 +++---- src/gallium/auxiliary/draw/draw_vs_varient.c | 52 ++++++++++++++++++---- 3 files changed, 53 insertions(+), 19 deletions(-) (limited to 'src/gallium/auxiliary/draw/draw_vs_aos.c') diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c index 7fefd391a6..2f2e7195b3 100644 --- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c +++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c @@ -94,8 +94,8 @@ static void fse_prepare( struct draw_pt_middle_end *middle, fse->key.nr_elements = MAX2(num_vs_outputs, /* outputs - translate to hw format */ num_vs_inputs); /* inputs - fetch from api format */ - fse->key.viewport = 1; - fse->key.clip = 0; + fse->key.viewport = !draw->identity_viewport; + fse->key.clip = !draw->bypass_clipping; fse->key.pad = 0; memset(fse->key.element, 0, diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c index d3989fe107..c63553216c 100644 --- a/src/gallium/auxiliary/draw/draw_vs_aos.c +++ b/src/gallium/auxiliary/draw/draw_vs_aos.c @@ -2021,11 +2021,14 @@ static boolean build_vertex_program( struct draw_vs_varient_aos_sse *varient, if (cp.error) goto fail; - if (cp.vaos->base.key.viewport) { - if (0) - emit_viewport(&cp); - else - emit_rhw_viewport(&cp); + if (cp.vaos->base.key.clip) { + /* not really handling clipping, just do the rhw so we can + * see the results... + */ + emit_rhw_viewport(&cp); + } + else if (cp.vaos->base.key.viewport) { + emit_viewport(&cp); } /* Emit output... 
TODO: do this eagerly after the last write to a @@ -2188,9 +2191,6 @@ static struct draw_vs_varient *varient_aos_sse( struct draw_vertex_shader *vs, { struct draw_vs_varient_aos_sse *vaos = CALLOC_STRUCT(draw_vs_varient_aos_sse); - if (key->clip) - return NULL; - if (!vaos) goto fail; diff --git a/src/gallium/auxiliary/draw/draw_vs_varient.c b/src/gallium/auxiliary/draw/draw_vs_varient.c index c15c648527..d4deabfff3 100644 --- a/src/gallium/auxiliary/draw/draw_vs_varient.c +++ b/src/gallium/auxiliary/draw/draw_vs_varient.c @@ -89,9 +89,9 @@ static void vsvg_set_input( struct draw_vs_varient *varient, /* Mainly for debug at this stage: */ -static void do_viewport( struct draw_vs_varient_generic *vsvg, - unsigned count, - void *output_buffer ) +static void do_rhw_viewport( struct draw_vs_varient_generic *vsvg, + unsigned count, + void *output_buffer ) { char *ptr = (char *)output_buffer; const float *scale = vsvg->viewport.scale; @@ -109,6 +109,25 @@ static void do_viewport( struct draw_vs_varient_generic *vsvg, data[3] = w; } } + +static void do_viewport( struct draw_vs_varient_generic *vsvg, + unsigned count, + void *output_buffer ) +{ + char *ptr = (char *)output_buffer; + const float *scale = vsvg->viewport.scale; + const float *trans = vsvg->viewport.translate; + unsigned stride = vsvg->base.key.output_stride; + unsigned j; + + for (j = 0; j < count; j++, ptr += stride) { + float *data = (float *)ptr; + + data[0] = data[0] * scale[0] + trans[0]; + data[1] = data[1] * scale[1] + trans[1]; + data[2] = data[2] * scale[2] + trans[2]; + } +} static void vsvg_run_elts( struct draw_vs_varient *varient, @@ -136,10 +155,20 @@ static void vsvg_run_elts( struct draw_vs_varient *varient, vsvg->base.key.output_stride, vsvg->base.key.output_stride); - if (vsvg->base.key.viewport) + + if (vsvg->base.key.clip) { + /* not really handling clipping, just do the rhw so we can + * see the results... 
+ */ + do_rhw_viewport( vsvg, + count, + output_buffer ); + } + else if (vsvg->base.key.viewport) { do_viewport( vsvg, count, output_buffer ); + } //if (!vsvg->already_in_emit_format) @@ -182,11 +211,19 @@ static void vsvg_run_linear( struct draw_vs_varient *varient, vsvg->base.key.output_stride, vsvg->base.key.output_stride); - if (vsvg->base.key.viewport) + if (vsvg->base.key.clip) { + /* not really handling clipping, just do the rhw so we can + * see the results... + */ + do_rhw_viewport( vsvg, + count, + output_buffer ); + } + else if (vsvg->base.key.viewport) { do_viewport( vsvg, count, output_buffer ); - + } //if (!vsvg->already_in_emit_format) vsvg->emit->set_buffer( vsvg->emit, @@ -224,9 +261,6 @@ struct draw_vs_varient *draw_vs_varient_generic( struct draw_vertex_shader *vs, unsigned i; struct translate_key fetch, emit; - if (key->clip) - return NULL; - struct draw_vs_varient_generic *vsvg = CALLOC_STRUCT( draw_vs_varient_generic ); if (vsvg == NULL) return NULL; -- cgit v1.2.3 From f8762ba5234fd1b44e11e76bb5f58d2305c90572 Mon Sep 17 00:00:00 2001 From: Keith Whitwell Date: Tue, 27 May 2008 14:42:15 +0100 Subject: draw: explicitly list nr_inputs, outputs in varient key --- src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c | 14 +++++++++----- src/gallium/auxiliary/draw/draw_vs.h | 6 ++++-- src/gallium/auxiliary/draw/draw_vs_aos.c | 2 +- src/gallium/auxiliary/draw/draw_vs_aos_io.c | 4 ++-- src/gallium/auxiliary/draw/draw_vs_varient.c | 10 +++++----- 5 files changed, 21 insertions(+), 15 deletions(-) (limited to 'src/gallium/auxiliary/draw/draw_vs_aos.c') diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c index c6249b4b41..581026dcb0 100644 --- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c +++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c @@ -72,7 +72,6 @@ static void fse_prepare( struct draw_pt_middle_end *middle, struct fetch_shade_emit *fse = (struct 
fetch_shade_emit *)middle; struct draw_context *draw = fse->draw; unsigned num_vs_inputs = draw->vs.vertex_shader->info.num_inputs; - unsigned num_vs_outputs = draw->vs.vertex_shader->info.num_outputs; const struct vertex_info *vinfo; unsigned i; boolean need_psize = 0; @@ -91,8 +90,11 @@ static void fse_prepare( struct draw_pt_middle_end *middle, fse->key.output_stride = vinfo->size * 4; - fse->key.nr_elements = MAX2(num_vs_outputs, /* outputs - translate to hw format */ - num_vs_inputs); /* inputs - fetch from api format */ + fse->key.nr_outputs = vinfo->num_attribs; + fse->key.nr_inputs = num_vs_inputs; + + fse->key.nr_elements = MAX2(fse->key.nr_outputs, /* outputs - translate to hw format */ + fse->key.nr_inputs); /* inputs - fetch from api format */ fse->key.viewport = !draw->identity_viewport; fse->key.clip = !draw->bypass_clipping; @@ -142,7 +144,7 @@ static void fse_prepare( struct draw_pt_middle_end *middle, need_psize = 1; output_format = PIPE_FORMAT_R32_FLOAT; emit_sz = 1 * sizeof(float); - vs_output = num_vs_outputs + 1; + vs_output = vinfo->num_attribs + 1; break; case EMIT_4UB: output_format = PIPE_FORMAT_B8G8R8A8_UNORM; @@ -177,7 +179,9 @@ static void fse_prepare( struct draw_pt_middle_end *middle, fse->key.element[i].input_buffer = 0; //nr_buffers + 1; fse->key.element[i].input_offset = 0; - fse->key.nr_elements += 1; + fse->key.nr_inputs += 1; + fse->key.nr_elements = MAX2(fse->key.nr_inputs, + fse->key.nr_outputs); } #endif diff --git a/src/gallium/auxiliary/draw/draw_vs.h b/src/gallium/auxiliary/draw/draw_vs.h index ff3e19b2a8..17902ab86a 100644 --- a/src/gallium/auxiliary/draw/draw_vs.h +++ b/src/gallium/auxiliary/draw/draw_vs.h @@ -58,10 +58,12 @@ struct draw_vs_element { struct draw_vs_varient_key { unsigned output_stride; - unsigned nr_elements:16; + unsigned nr_elements:8; /* max2(nr_inputs, nr_outputs) */ + unsigned nr_inputs:8; + unsigned nr_outputs:8; unsigned viewport:1; unsigned clip:1; - unsigned pad:14; + unsigned pad:5; struct 
draw_vs_element element[PIPE_MAX_ATTRIBS]; }; diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c index c63553216c..e2e96470f7 100644 --- a/src/gallium/auxiliary/draw/draw_vs_aos.c +++ b/src/gallium/auxiliary/draw/draw_vs_aos.c @@ -2095,7 +2095,7 @@ static void vaos_set_buffer( struct draw_vs_varient *varient, struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient; unsigned i; - for (i = 0; i < vaos->base.vs->info.num_inputs; i++) { + for (i = 0; i < vaos->base.key.nr_inputs; i++) { if (vaos->base.key.element[i].in.buffer == buf) { vaos->machine->attrib[i].input_ptr = ((char *)ptr + vaos->base.key.element[i].in.offset); diff --git a/src/gallium/auxiliary/draw/draw_vs_aos_io.c b/src/gallium/auxiliary/draw/draw_vs_aos_io.c index f39ebb7a17..ef265d61cf 100644 --- a/src/gallium/auxiliary/draw/draw_vs_aos_io.c +++ b/src/gallium/auxiliary/draw/draw_vs_aos_io.c @@ -180,7 +180,7 @@ boolean aos_fetch_inputs( struct aos_compilation *cp, boolean linear ) { unsigned i; - for (i = 0; i < cp->vaos->base.vs->info.num_inputs; i++) { + for (i = 0; i < cp->vaos->base.key.nr_inputs; i++) { if (!load_input( cp, i, linear )) return FALSE; cp->insn_counter++; @@ -282,7 +282,7 @@ boolean aos_emit_outputs( struct aos_compilation *cp ) { unsigned i; - for (i = 0; i < cp->vaos->base.vs->info.num_outputs; i++) { + for (i = 0; i < cp->vaos->base.key.nr_outputs; i++) { unsigned format = cp->vaos->base.key.element[i].out.format; unsigned offset = cp->vaos->base.key.element[i].out.offset; diff --git a/src/gallium/auxiliary/draw/draw_vs_varient.c b/src/gallium/auxiliary/draw/draw_vs_varient.c index d4deabfff3..dab46e8eed 100644 --- a/src/gallium/auxiliary/draw/draw_vs_varient.c +++ b/src/gallium/auxiliary/draw/draw_vs_varient.c @@ -276,11 +276,11 @@ struct draw_vs_varient *draw_vs_varient_generic( struct draw_vertex_shader *vs, - /* OK, have to build a new one: + /* Build free-standing fetch and emit functions: */ - 
fetch.nr_elements = vs->info.num_inputs; + fetch.nr_elements = key->nr_inputs; fetch.output_stride = 0; - for (i = 0; i < vs->info.num_inputs; i++) { + for (i = 0; i < key->nr_inputs; i++) { fetch.element[i].input_format = key->element[i].in.format; fetch.element[i].input_buffer = key->element[i].in.buffer; fetch.element[i].input_offset = key->element[i].in.offset; @@ -290,9 +290,9 @@ struct draw_vs_varient *draw_vs_varient_generic( struct draw_vertex_shader *vs, } - emit.nr_elements = vs->info.num_outputs; + emit.nr_elements = key->nr_outputs; emit.output_stride = key->output_stride; - for (i = 0; i < vs->info.num_outputs; i++) { + for (i = 0; i < key->nr_outputs; i++) { emit.element[i].input_format = PIPE_FORMAT_R32G32B32A32_FLOAT; emit.element[i].input_buffer = 0; emit.element[i].input_offset = i * 4 * sizeof(float); -- cgit v1.2.3 From 2ec419d40dba43305c28fca9658ea00541f67821 Mon Sep 17 00:00:00 2001 From: Keith Whitwell Date: Tue, 27 May 2008 17:45:54 +0100 Subject: draw: fix ABS aliasing bug --- src/gallium/auxiliary/draw/draw_vs_aos.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'src/gallium/auxiliary/draw/draw_vs_aos.c') diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c index e2e96470f7..1c63677e6e 100644 --- a/src/gallium/auxiliary/draw/draw_vs_aos.c +++ b/src/gallium/auxiliary/draw/draw_vs_aos.c @@ -964,12 +964,13 @@ static boolean emit_ABS( struct aos_compilation *cp, const struct tgsi_full_inst { struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); struct x86_reg neg = aos_get_internal(cp, IMM_NEGS); - struct x86_reg dst = get_xmm_writable(cp, arg0); + struct x86_reg tmp = aos_get_xmm_reg(cp); - sse_mulps(cp->func, dst, neg); - sse_maxps(cp->func, dst, arg0); + sse_movaps(cp->func, tmp, arg0); + sse_mulps(cp->func, tmp, neg); + sse_maxps(cp->func, tmp, arg0); - store_dest(cp, &op->FullDstRegisters[0], dst); + store_dest(cp, &op->FullDstRegisters[0], tmp); return TRUE; 
} -- cgit v1.2.3 From 7b85ea19de09d4e7e077ca147528e90e52683690 Mon Sep 17 00:00:00 2001 From: Keith Whitwell Date: Tue, 27 May 2008 19:01:57 +0100 Subject: draw: support psize in vs_varient paths Preserve the vinfo "EMIT_*" format descriptors in the varient key, and deal with PSIZE directly in each implementation. --- .../auxiliary/draw/draw_pt_fetch_shade_emit.c | 35 ++------------------ src/gallium/auxiliary/draw/draw_vertex.h | 21 ++++++++++++ src/gallium/auxiliary/draw/draw_vs_aos.c | 3 ++ src/gallium/auxiliary/draw/draw_vs_aos.h | 1 + src/gallium/auxiliary/draw/draw_vs_aos_io.c | 37 ++++++++++++++-------- src/gallium/auxiliary/draw/draw_vs_varient.c | 24 +++++++++++--- 6 files changed, 71 insertions(+), 50 deletions(-) (limited to 'src/gallium/auxiliary/draw/draw_vs_aos.c') diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c index 85d0bdfcab..729c7db999 100644 --- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c +++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c @@ -77,7 +77,6 @@ static void fse_prepare( struct draw_pt_middle_end *middle, unsigned num_vs_inputs = draw->vs.vertex_shader->info.num_inputs; const struct vertex_info *vinfo; unsigned i; - boolean need_psize = 0; if (!draw->render->set_primitive( draw->render, @@ -123,34 +122,24 @@ static void fse_prepare( struct draw_pt_middle_end *middle, for (i = 0; i < vinfo->num_attribs; i++) { unsigned emit_sz = 0; - unsigned output_format = PIPE_FORMAT_NONE; - unsigned vs_output = vinfo->src_index[i]; switch (vinfo->emit[i]) { case EMIT_4F: - output_format = PIPE_FORMAT_R32G32B32A32_FLOAT; emit_sz = 4 * sizeof(float); break; case EMIT_3F: - output_format = PIPE_FORMAT_R32G32B32_FLOAT; emit_sz = 3 * sizeof(float); break; case EMIT_2F: - output_format = PIPE_FORMAT_R32G32_FLOAT; emit_sz = 2 * sizeof(float); break; case EMIT_1F: - output_format = PIPE_FORMAT_R32_FLOAT; emit_sz = 1 * sizeof(float); break; case EMIT_1F_PSIZE: - 
need_psize = 1; - output_format = PIPE_FORMAT_R32_FLOAT; emit_sz = 1 * sizeof(float); - vs_output = vinfo->num_attribs + 1; break; case EMIT_4UB: - output_format = PIPE_FORMAT_B8G8R8A8_UNORM; emit_sz = 4 * sizeof(ubyte); break; default: @@ -162,33 +151,15 @@ static void fse_prepare( struct draw_pt_middle_end *middle, * numbers, not to positions in the hw vertex description -- * that's handled by the output_offset field. */ - fse->key.element[vs_output].out.format = output_format; - fse->key.element[vs_output].out.vs_output = vs_output; - fse->key.element[vs_output].out.offset = dst_offset; + fse->key.element[i].out.format = vinfo->emit[i]; + fse->key.element[i].out.vs_output = vinfo->src_index[i]; + fse->key.element[i].out.offset = dst_offset; dst_offset += emit_sz; assert(fse->key.output_stride >= dst_offset); } } - /* To make psize work, really need to tell the vertex shader to - * copy that value from input->output. For 'translate' this was - * implicit for all elements. - */ -#if 0 - if (need_psize) { - unsigned input = num_vs_inputs + 1; - const struct pipe_vertex_element *src = &draw->pt.vertex_element[i]; - fse->key.element[i].input_format = PIPE_FORMAT_R32_FLOAT; - fse->key.element[i].input_buffer = 0; //nr_buffers + 1; - fse->key.element[i].input_offset = 0; - - fse->key.nr_inputs += 1; - fse->key.nr_elements = MAX2(fse->key.nr_inputs, - fse->key.nr_outputs); - - } -#endif /* Would normally look up a vertex shader and peruse its list of * varients somehow. 
We omitted that step and put all the diff --git a/src/gallium/auxiliary/draw/draw_vertex.h b/src/gallium/auxiliary/draw/draw_vertex.h index 6d8bac5138..16c65c4317 100644 --- a/src/gallium/auxiliary/draw/draw_vertex.h +++ b/src/gallium/auxiliary/draw/draw_vertex.h @@ -109,4 +109,25 @@ extern void draw_compute_vertex_size(struct vertex_info *vinfo); void draw_dump_emitted_vertex(const struct vertex_info *vinfo, const uint8_t *data); + +static INLINE unsigned draw_translate_vinfo_format(unsigned format ) +{ + switch (format) { + case EMIT_1F: + case EMIT_1F_PSIZE: + return PIPE_FORMAT_R32_FLOAT; + case EMIT_2F: + return PIPE_FORMAT_R32G32_FLOAT; + case EMIT_3F: + return PIPE_FORMAT_R32G32B32_FLOAT; + case EMIT_4F: + return PIPE_FORMAT_R32G32B32A32_FLOAT; + case EMIT_4UB: + return PIPE_FORMAT_R8G8B8A8_UNORM; + default: + return PIPE_FORMAT_NONE; + } +} + + #endif /* DRAW_VERTEX_H */ diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c index 1c63677e6e..d3770b2c53 100644 --- a/src/gallium/auxiliary/draw/draw_vs_aos.c +++ b/src/gallium/auxiliary/draw/draw_vs_aos.c @@ -2126,6 +2126,7 @@ static void vaos_run_elts( struct draw_vs_varient *varient, { struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient; + vaos->machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size; vaos->gen_run_elts( varient, elts, count, @@ -2139,6 +2140,7 @@ static void vaos_run_linear( struct draw_vs_varient *varient, { struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient; + vaos->machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size; vaos->gen_run_linear( varient, start, count, @@ -2204,6 +2206,7 @@ static struct draw_vs_varient *varient_aos_sse( struct draw_vertex_shader *vs, vaos->base.run_linear = vaos_run_linear; vaos->base.run_elts = vaos_run_elts; + vaos->draw = vs->draw; vaos->machine = align_malloc( sizeof(struct aos_machine), 16 ); if (!vaos->machine) goto fail; diff 
--git a/src/gallium/auxiliary/draw/draw_vs_aos.h b/src/gallium/auxiliary/draw/draw_vs_aos.h index fffe2e4658..b47413ff43 100644 --- a/src/gallium/auxiliary/draw/draw_vs_aos.h +++ b/src/gallium/auxiliary/draw/draw_vs_aos.h @@ -176,6 +176,7 @@ boolean aos_emit_outputs( struct aos_compilation *cp ); #define IMM_255 4 /* 255, 255, 255, 255 */ #define IMM_NEGS 5 /* -1,-1,-1,-1 */ #define IMM_RSQ 6 /* -.5,1.5,_,_ */ +#define IMM_PSIZE 7 /* not really an immediate - updated each run */ struct x86_reg aos_get_internal( struct aos_compilation *cp, unsigned imm ); diff --git a/src/gallium/auxiliary/draw/draw_vs_aos_io.c b/src/gallium/auxiliary/draw/draw_vs_aos_io.c index cebfaf6474..836110f382 100644 --- a/src/gallium/auxiliary/draw/draw_vs_aos_io.c +++ b/src/gallium/auxiliary/draw/draw_vs_aos_io.c @@ -33,6 +33,7 @@ #include "tgsi/exec/tgsi_exec.h" #include "draw_vs.h" #include "draw_vs_aos.h" +#include "draw_vertex.h" #include "rtasm/rtasm_x86sse.h" @@ -249,24 +250,27 @@ static boolean emit_output( struct aos_compilation *cp, unsigned format ) { switch (format) { - case PIPE_FORMAT_R32_FLOAT: + case EMIT_1F: + case EMIT_1F_PSIZE: emit_store_R32(cp, ptr, dataXMM); break; - case PIPE_FORMAT_R32G32_FLOAT: + case EMIT_2F: emit_store_R32G32(cp, ptr, dataXMM); break; - case PIPE_FORMAT_R32G32B32_FLOAT: + case EMIT_3F: emit_store_R32G32B32(cp, ptr, dataXMM); break; - case PIPE_FORMAT_R32G32B32A32_FLOAT: + case EMIT_4F: emit_store_R32G32B32A32(cp, ptr, dataXMM); break; - case PIPE_FORMAT_B8G8R8A8_UNORM: - emit_swizzle(cp, dataXMM, dataXMM, SHUF(Z,Y,X,W)); - emit_store_R8G8B8A8_UNORM(cp, ptr, dataXMM); - break; - case PIPE_FORMAT_R8G8B8A8_UNORM: - emit_store_R8G8B8A8_UNORM(cp, ptr, dataXMM); + case EMIT_4UB: + if (1) { + emit_swizzle(cp, dataXMM, dataXMM, SHUF(Z,Y,X,W)); + emit_store_R8G8B8A8_UNORM(cp, ptr, dataXMM); + } + else { + emit_store_R8G8B8A8_UNORM(cp, ptr, dataXMM); + } break; default: ERROR(cp, "unhandled output format"); @@ -287,9 +291,16 @@ boolean aos_emit_outputs( 
struct aos_compilation *cp ) unsigned offset = cp->vaos->base.key.element[i].out.offset; unsigned vs_output = cp->vaos->base.key.element[i].out.vs_output; - struct x86_reg data = aos_get_shader_reg( cp, - TGSI_FILE_OUTPUT, - vs_output ); + struct x86_reg data; + + if (format == EMIT_1F_PSIZE) { + data = aos_get_internal_xmm( cp, IMM_PSIZE ); + } + else { + data = aos_get_shader_reg( cp, + TGSI_FILE_OUTPUT, + vs_output ); + } if (data.file != file_XMM) { struct x86_reg tmp = aos_get_xmm_reg( cp ); diff --git a/src/gallium/auxiliary/draw/draw_vs_varient.c b/src/gallium/auxiliary/draw/draw_vs_varient.c index dab46e8eed..119a3a04b5 100644 --- a/src/gallium/auxiliary/draw/draw_vs_varient.c +++ b/src/gallium/auxiliary/draw/draw_vs_varient.c @@ -231,6 +231,10 @@ static void vsvg_run_linear( struct draw_vs_varient *varient, output_buffer, vsvg->base.key.output_stride ); + vsvg->emit->set_buffer( vsvg->emit, + 1, + &vsvg->draw->rasterizer->point_size, + 0); vsvg->emit->run( vsvg->emit, 0, count, @@ -293,11 +297,21 @@ struct draw_vs_varient *draw_vs_varient_generic( struct draw_vertex_shader *vs, emit.nr_elements = key->nr_outputs; emit.output_stride = key->output_stride; for (i = 0; i < key->nr_outputs; i++) { - emit.element[i].input_format = PIPE_FORMAT_R32G32B32A32_FLOAT; - emit.element[i].input_buffer = 0; - emit.element[i].input_offset = i * 4 * sizeof(float); - emit.element[i].output_format = key->element[i].out.format; - emit.element[i].output_offset = key->element[i].out.offset; + if (key->element[i].out.format != EMIT_1F_PSIZE) + { + emit.element[i].input_format = PIPE_FORMAT_R32G32B32A32_FLOAT; + emit.element[i].input_buffer = 0; + emit.element[i].input_offset = key->element[i].out.vs_output * 4 * sizeof(float); + emit.element[i].output_format = draw_translate_vinfo_format(key->element[i].out.format); + emit.element[i].output_offset = key->element[i].out.offset; + } + else { + emit.element[i].input_format = PIPE_FORMAT_R32_FLOAT; + emit.element[i].input_buffer = 1; 
+ emit.element[i].input_offset = 0; + emit.element[i].output_format = PIPE_FORMAT_R32_FLOAT; + emit.element[i].output_offset = key->element[i].out.offset; + } } vsvg->fetch = draw_vs_get_fetch( vs->draw, &fetch ); -- cgit v1.2.3