/* * Mesa 3-D graphics library * Version: 6.3 * * Copyright (C) 1999-2004 Brian Paul All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included * in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ /** * Translate tgsi vertex programs to x86/x87/SSE/SSE2 machine code * using the rtasm runtime assembler. Based on the old * t_vb_arb_program_sse.c */ #include "util/u_memory.h" #include "util/u_math.h" #include "pipe/p_shader_tokens.h" #include "util/u_debug.h" #include "tgsi/tgsi_parse.h" #include "tgsi/tgsi_util.h" #include "tgsi/tgsi_exec.h" #include "tgsi/tgsi_dump.h" #include "draw_vs.h" #include "draw_vs_aos.h" #include "rtasm/rtasm_x86sse.h" #ifdef PIPE_ARCH_X86 #define DISASSEM 0 #define FAST_MATH 1 static const char *files[] = { "NULL", "CONST", "IN", "OUT", "TEMP", "SAMP", "ADDR", "IMM", "INTERNAL", }; static INLINE boolean eq( struct x86_reg a, struct x86_reg b ) { return (a.file == b.file && a.idx == b.idx && a.mod == b.mod && a.disp == b.disp); } struct x86_reg aos_get_x86( struct aos_compilation *cp, unsigned which_reg, /* quick hack */ unsigned value ) { struct x86_reg reg; if (which_reg == 0) reg = cp->temp_EBP; else reg = cp->tmp_EAX; if (cp->x86_reg[which_reg] != value) { unsigned offset; switch (value) { case X86_IMMEDIATES: assert(which_reg == 0); offset = Offset(struct aos_machine, immediates); break; case X86_CONSTANTS: assert(which_reg == 1); offset = Offset(struct aos_machine, constants); break; case X86_BUFFERS: assert(which_reg == 0); offset = Offset(struct aos_machine, buffer); break; default: assert(0); offset = 0; } x86_mov(cp->func, reg, x86_make_disp(cp->machine_EDX, offset)); cp->x86_reg[which_reg] = value; } return reg; } static struct x86_reg get_reg_ptr(struct aos_compilation *cp, unsigned file, unsigned idx ) { struct x86_reg ptr = cp->machine_EDX; switch (file) { case TGSI_FILE_INPUT: assert(idx < MAX_INPUTS); return x86_make_disp(ptr, Offset(struct aos_machine, input[idx])); case TGSI_FILE_OUTPUT: return x86_make_disp(ptr, Offset(struct aos_machine, output[idx])); case TGSI_FILE_TEMPORARY: assert(idx < MAX_TEMPS); return x86_make_disp(ptr, Offset(struct aos_machine, temp[idx])); case AOS_FILE_INTERNAL: assert(idx < MAX_INTERNALS); return x86_make_disp(ptr, Offset(struct aos_machine, internal[idx])); case TGSI_FILE_IMMEDIATE: assert(idx < MAX_IMMEDIATES); /* just a sanity check */ return x86_make_disp(aos_get_x86(cp, 0, X86_IMMEDIATES), idx * 4 * sizeof(float)); case TGSI_FILE_CONSTANT: assert(idx < MAX_CONSTANTS); /* just a sanity check */ return x86_make_disp(aos_get_x86(cp, 1, X86_CONSTANTS), idx * 4 * sizeof(float)); default: 
AOS_ERROR(cp, "unknown reg file"); return x86_make_reg(0,0); } } #define X87_CW_EXCEPTION_INV_OP (1<<0) #define X87_CW_EXCEPTION_DENORM_OP (1<<1) #define X87_CW_EXCEPTION_ZERO_DIVIDE (1<<2) #define X87_CW_EXCEPTION_OVERFLOW (1<<3) #define X87_CW_EXCEPTION_UNDERFLOW (1<<4) #define X87_CW_EXCEPTION_PRECISION (1<<5) #define X87_CW_PRECISION_SINGLE (0<<8) #define X87_CW_PRECISION_RESERVED (1<<8) #define X87_CW_PRECISION_DOUBLE (2<<8) #define X87_CW_PRECISION_DOUBLE_EXT (3<<8) #define X87_CW_PRECISION_MASK (3<<8) #define X87_CW_ROUND_NEAREST (0<<10) #define X87_CW_ROUND_DOWN (1<<10) #define X87_CW_ROUND_UP (2<<10) #define X87_CW_ROUND_ZERO (3<<10) #define X87_CW_ROUND_MASK (3<<10) #define X87_CW_INFINITY (1<<12) static void spill( struct aos_compilation *cp, unsigned idx ) { if (!cp->xmm[idx].dirty || (cp->xmm[idx].file != TGSI_FILE_INPUT && /* inputs are fetched into xmm & set dirty */ cp->xmm[idx].file != TGSI_FILE_OUTPUT && cp->xmm[idx].file != TGSI_FILE_TEMPORARY)) { AOS_ERROR(cp, "invalid spill"); return; } else { struct x86_reg oldval = get_reg_ptr(cp, cp->xmm[idx].file, cp->xmm[idx].idx); if (0) debug_printf("\nspill %s[%d]", files[cp->xmm[idx].file], cp->xmm[idx].idx); assert(cp->xmm[idx].dirty); sse_movaps(cp->func, oldval, x86_make_reg(file_XMM, idx)); cp->xmm[idx].dirty = 0; } } void aos_spill_all( struct aos_compilation *cp ) { unsigned i; for (i = 0; i < 8; i++) { if (cp->xmm[i].dirty) spill(cp, i); aos_release_xmm_reg(cp, i); } } static struct x86_reg get_xmm_writable( struct aos_compilation *cp, struct x86_reg reg ) { if (reg.file != file_XMM || cp->xmm[reg.idx].file != TGSI_FILE_NULL) { struct x86_reg tmp = aos_get_xmm_reg(cp); sse_movaps(cp->func, tmp, reg); reg = tmp; } cp->xmm[reg.idx].last_used = cp->insn_counter; return reg; } static struct x86_reg get_xmm( struct aos_compilation *cp, struct x86_reg reg ) { if (reg.file != file_XMM) { struct x86_reg tmp = aos_get_xmm_reg(cp); sse_movaps(cp->func, tmp, reg); reg = tmp; } cp->xmm[reg.idx].last_used = cp->insn_counter; return reg; } /* Allocate an empty xmm register, either as a temporary or later to * "adopt" as a shader reg. */ struct x86_reg aos_get_xmm_reg( struct aos_compilation *cp ) { unsigned i; unsigned oldest = 0; boolean found = FALSE; for (i = 0; i < 8; i++) if (cp->xmm[i].last_used != cp->insn_counter && cp->xmm[i].file == TGSI_FILE_NULL) { oldest = i; found = TRUE; } if (!found) { for (i = 0; i < 8; i++) if (cp->xmm[i].last_used < cp->xmm[oldest].last_used) oldest = i; } /* Need to write out the old value? */ if (cp->xmm[oldest].dirty) spill(cp, oldest); assert(cp->xmm[oldest].last_used != cp->insn_counter); cp->xmm[oldest].file = TGSI_FILE_NULL; cp->xmm[oldest].idx = 0; cp->xmm[oldest].dirty = 0; cp->xmm[oldest].last_used = cp->insn_counter; return x86_make_reg(file_XMM, oldest); } void aos_release_xmm_reg( struct aos_compilation *cp, unsigned idx ) { cp->xmm[idx].file = TGSI_FILE_NULL; cp->xmm[idx].idx = 0; cp->xmm[idx].dirty = 0; cp->xmm[idx].last_used = 0; } static void aos_soft_release_xmm( struct aos_compilation *cp, struct x86_reg reg ) { if (reg.file == file_XMM) { assert(cp->xmm[reg.idx].last_used == cp->insn_counter); cp->xmm[reg.idx].last_used = cp->insn_counter - 1; } } /* Mark an xmm reg as holding the current copy of a shader reg. */ void aos_adopt_xmm_reg( struct aos_compilation *cp, struct x86_reg reg, unsigned file, unsigned idx, unsigned dirty ) { unsigned i; if (reg.file != file_XMM) { assert(0); return; } /* If any xmm reg thinks it holds this shader reg, break the * illusion. 
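 * At most one xmm register may claim a given (file, idx) pair at a
 * time.  Any other register currently claiming it is released in the
 * loop below, and its dirty flag is folded into this one so a pending
 * write back to the aos_machine copy is not lost.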
*/ for (i = 0; i < 8; i++) { if (cp->xmm[i].file == file && cp->xmm[i].idx == idx) { /* If an xmm reg is already holding this shader reg, take into account its * dirty flag... */ dirty |= cp->xmm[i].dirty; aos_release_xmm_reg(cp, i); } } cp->xmm[reg.idx].file = file; cp->xmm[reg.idx].idx = idx; cp->xmm[reg.idx].dirty = dirty; cp->xmm[reg.idx].last_used = cp->insn_counter; } /* Return a pointer to the in-memory copy of the reg, making sure it is uptodate. */ static struct x86_reg aos_get_shader_reg_ptr( struct aos_compilation *cp, unsigned file, unsigned idx ) { unsigned i; /* Ensure the in-memory copy of this reg is up-to-date */ for (i = 0; i < 8; i++) { if (cp->xmm[i].file == file && cp->xmm[i].idx == idx && cp->xmm[i].dirty) { spill(cp, i); } } return get_reg_ptr( cp, file, idx ); } /* As above, but return a pointer. Note - this pointer may alias * those returned by get_arg_ptr(). */ static struct x86_reg get_dst_ptr( struct aos_compilation *cp, const struct tgsi_full_dst_register *dst ) { unsigned file = dst->Register.File; unsigned idx = dst->Register.Index; unsigned i; /* Ensure in-memory copy of this reg is up-to-date and invalidate * any xmm copies. */ for (i = 0; i < 8; i++) { if (cp->xmm[i].file == file && cp->xmm[i].idx == idx) { if (cp->xmm[i].dirty) spill(cp, i); aos_release_xmm_reg(cp, i); } } return get_reg_ptr( cp, file, idx ); } /* Return an XMM reg if the argument is resident, otherwise return a * base+offset pointer to the saved value. */ struct x86_reg aos_get_shader_reg( struct aos_compilation *cp, unsigned file, unsigned idx ) { unsigned i; for (i = 0; i < 8; i++) { if (cp->xmm[i].file == file && cp->xmm[i].idx == idx) { cp->xmm[i].last_used = cp->insn_counter; return x86_make_reg(file_XMM, i); } } /* If not found in the XMM register file, return an indirect * reference to the in-memory copy: */ return get_reg_ptr( cp, file, idx ); } static struct x86_reg aos_get_shader_reg_xmm( struct aos_compilation *cp, unsigned file, unsigned idx ) { struct x86_reg reg = get_xmm( cp, aos_get_shader_reg( cp, file, idx ) ); aos_adopt_xmm_reg( cp, reg, file, idx, FALSE ); return reg; } struct x86_reg aos_get_internal_xmm( struct aos_compilation *cp, unsigned imm ) { return aos_get_shader_reg_xmm( cp, AOS_FILE_INTERNAL, imm ); } struct x86_reg aos_get_internal( struct aos_compilation *cp, unsigned imm ) { return aos_get_shader_reg( cp, AOS_FILE_INTERNAL, imm ); } /* Emulate pshufd insn in regular SSE, if necessary: */ static void emit_pshufd( struct aos_compilation *cp, struct x86_reg dst, struct x86_reg arg0, ubyte shuf ) { if (cp->have_sse2) { sse2_pshufd(cp->func, dst, arg0, shuf); } else { if (!eq(dst, arg0)) sse_movaps(cp->func, dst, arg0); sse_shufps(cp->func, dst, dst, shuf); } } /* load masks (pack into negs??) * pshufd - shuffle according to writemask * and - result, mask * nand - dest, mask * or - dest, result */ static boolean mask_write( struct aos_compilation *cp, struct x86_reg dst, struct x86_reg result, unsigned mask ) { struct x86_reg imm_swz = aos_get_internal_xmm(cp, IMM_SWZ); struct x86_reg tmp = aos_get_xmm_reg(cp); emit_pshufd(cp, tmp, imm_swz, SHUF((mask & 1) ? 2 : 3, (mask & 2) ? 2 : 3, (mask & 4) ? 2 : 3, (mask & 8) ? 
2 : 3));

   sse_andps(cp->func, dst, tmp);
   sse_andnps(cp->func, tmp, result);
   sse_orps(cp->func, dst, tmp);

   aos_release_xmm_reg(cp, tmp.idx);
   return TRUE;
}

/* Helper for writemask:
 */
static boolean emit_shuf_copy2( struct aos_compilation *cp,
                                struct x86_reg dst,
                                struct x86_reg arg0,
                                struct x86_reg arg1,
                                ubyte shuf )
{
   struct x86_reg tmp = aos_get_xmm_reg(cp);

   emit_pshufd(cp, dst, arg1, shuf);
   emit_pshufd(cp, tmp, arg0, shuf);
   sse_shufps(cp->func, dst, tmp, SHUF(X, Y, Z, W));
   emit_pshufd(cp, dst, dst, shuf);

   aos_release_xmm_reg(cp, tmp.idx);
   return TRUE;
}

#define SSE_SWIZZLE_NOOP ((0<<0) | (1<<2) | (2<<4) | (3<<6))

/* Locate a source register and perform any required (simple) swizzle.
 *
 * Just fail on complex swizzles at this point.
 */
static struct x86_reg fetch_src( struct aos_compilation *cp,
                                 const struct tgsi_full_src_register *src )
{
   struct x86_reg arg0 = aos_get_shader_reg(cp,
                                            src->Register.File,
                                            src->Register.Index);
   unsigned i;
   ubyte swz = 0;
   unsigned negs = 0;
   unsigned abs = 0;

   for (i = 0; i < 4; i++) {
      unsigned swizzle = tgsi_util_get_full_src_register_swizzle( src, i );
      unsigned neg = tgsi_util_get_full_src_register_sign_mode( src, i );

      swz |= (swizzle & 0x3) << (i * 2);

      switch (neg) {
      case TGSI_UTIL_SIGN_TOGGLE:
         negs |= (1<<i);
         break;

      case TGSI_UTIL_SIGN_KEEP:
         break;

      case TGSI_UTIL_SIGN_CLEAR:
         abs |= (1<<i);
         break;

      case TGSI_UTIL_SIGN_SET:
         abs |= (1<<i);
         negs |= (1<<i);
         break;
      }
   }

   if (swz != SSE_SWIZZLE_NOOP || negs || abs) {
      struct x86_reg dst = aos_get_xmm_reg(cp);

      if (swz != SSE_SWIZZLE_NOOP)
         emit_pshufd(cp, dst, arg0, swz);
      else
         sse_movaps(cp->func, dst, arg0);

      if (negs && negs != 0xf) {
         struct x86_reg imm_swz = aos_get_internal_xmm(cp, IMM_SWZ);
         struct x86_reg tmp = aos_get_xmm_reg(cp);

         /* Load 1,-1,0,0
          * Use neg as arg to pshufd
          * Multiply
          */
         emit_pshufd(cp, tmp, imm_swz,
                     SHUF((negs & 1) ? 1 : 0,
                          (negs & 2) ? 1 : 0,
                          (negs & 4) ? 1 : 0,
                          (negs & 8) ? 1 : 0));
         sse_mulps(cp->func, dst, tmp);

         aos_release_xmm_reg(cp, tmp.idx);
         aos_soft_release_xmm(cp, imm_swz);
      }
      else if (negs) {
         struct x86_reg imm_negs = aos_get_internal_xmm(cp, IMM_NEGS);
         sse_mulps(cp->func, dst, imm_negs);
         aos_soft_release_xmm(cp, imm_negs);
      }

      if (abs && abs != 0xf) {
         AOS_ERROR(cp, "unsupported partial abs");
      }
      else if (abs) {
         struct x86_reg neg = aos_get_internal(cp, IMM_NEGS);
         struct x86_reg tmp = aos_get_xmm_reg(cp);

         sse_movaps(cp->func, tmp, dst);
         sse_mulps(cp->func, tmp, neg);
         sse_maxps(cp->func, dst, tmp);

         aos_release_xmm_reg(cp, tmp.idx);
         aos_soft_release_xmm(cp, neg);
      }

      aos_soft_release_xmm(cp, arg0);
      return dst;
   }

   return arg0;
}

static void x87_fld_src( struct aos_compilation *cp,
                         const struct tgsi_full_src_register *src,
                         unsigned channel )
{
   struct x86_reg arg0 = aos_get_shader_reg_ptr(cp,
                                                src->Register.File,
                                                src->Register.Index);

   unsigned swizzle = tgsi_util_get_full_src_register_swizzle( src, channel );
   unsigned neg = tgsi_util_get_full_src_register_sign_mode( src, channel );

   x87_fld( cp->func, x86_make_disp(arg0, (swizzle & 3) * sizeof(float)) );

   switch (neg) {
   case TGSI_UTIL_SIGN_TOGGLE:
      /* Flip the sign:
       */
      x87_fchs( cp->func );
      break;

   case TGSI_UTIL_SIGN_KEEP:
      break;

   case TGSI_UTIL_SIGN_CLEAR:
      x87_fabs( cp->func );
      break;

   case TGSI_UTIL_SIGN_SET:
      x87_fabs( cp->func );
      x87_fchs( cp->func );
      break;

   default:
      AOS_ERROR(cp, "unsupported sign-mode");
      break;
   }
}

/* Used to implement write masking.  This and most of the other
 * instructions here would be easier to implement if there had been a
 * translation to a 2 argument format (dst/arg0, arg1) at the shader
 * level before attempting to translate to x86/sse code.
 */
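/* A rough scalar picture of what the masked stores below implement
 * (illustration only -- this is not code that gets emitted, and the
 * channel loop is a sketch of the intent rather than of generated
 * instructions):
 *
 *    for (chan = 0; chan < 4; chan++)
 *       if (writemask & (1 << chan))
 *          dst[chan] = result[chan];
 *
 * store_dest() special-cases the writemasks that map onto a single
 * movss/shufps, and falls back to mask_write() above, which builds a
 * per-channel bit mask from the IMM_SWZ constant and merges the two
 * vectors with andps/andnps/orps instead of branching per channel.
 */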
static void store_dest( struct aos_compilation *cp,
                        const struct tgsi_full_dst_register *reg,
                        struct x86_reg result )
{
   struct x86_reg dst;

   switch (reg->Register.WriteMask) {
   case 0:
      return;

   case TGSI_WRITEMASK_XYZW:
      aos_adopt_xmm_reg(cp,
                        get_xmm_writable(cp, result),
                        reg->Register.File,
                        reg->Register.Index,
                        TRUE);
      return;

   default:
      break;
   }

   dst = aos_get_shader_reg_xmm(cp,
                                reg->Register.File,
                                reg->Register.Index);

   switch (reg->Register.WriteMask) {
   case TGSI_WRITEMASK_X:
      sse_movss(cp->func, dst, get_xmm(cp, result));
      break;

   case TGSI_WRITEMASK_ZW:
      sse_shufps(cp->func, dst, get_xmm(cp, result), SHUF(X, Y, Z, W));
      break;

   case TGSI_WRITEMASK_XY:
      result = get_xmm_writable(cp, result);
      sse_shufps(cp->func, result, dst, SHUF(X, Y, Z, W));
      dst = result;
      break;

   case TGSI_WRITEMASK_YZW:
      result = get_xmm_writable(cp, result);
      sse_movss(cp->func, result, dst);
      dst = result;
      break;

   default:
      mask_write(cp, dst, result, reg->Register.WriteMask);
      break;
   }

   aos_adopt_xmm_reg(cp,
                     dst,
                     reg->Register.File,
                     reg->Register.Index,
                     TRUE);
}

static void inject_scalar( struct aos_compilation *cp,
                           struct x86_reg dst,
                           struct x86_reg result,
                           ubyte swizzle )
{
   sse_shufps(cp->func, dst, dst, swizzle);
   sse_movss(cp->func, dst, result);
   sse_shufps(cp->func, dst, dst, swizzle);
}

static void store_scalar_dest( struct aos_compilation *cp,
                               const struct tgsi_full_dst_register *reg,
                               struct x86_reg result )
{
   unsigned writemask = reg->Register.WriteMask;
   struct x86_reg dst;

   if (writemask != TGSI_WRITEMASK_X &&
       writemask != TGSI_WRITEMASK_Y &&
       writemask != TGSI_WRITEMASK_Z &&
       writemask != TGSI_WRITEMASK_W &&
       writemask != 0)
   {
      result = get_xmm_writable(cp, result); /* already true, right? */
      sse_shufps(cp->func, result, result, SHUF(X,X,X,X));
      store_dest(cp, reg, result);
      return;
   }

   result = get_xmm(cp, result);

   dst = aos_get_shader_reg_xmm(cp,
                                reg->Register.File,
                                reg->Register.Index);

   switch (reg->Register.WriteMask) {
   case TGSI_WRITEMASK_X:
      sse_movss(cp->func, dst, result);
      break;

   case TGSI_WRITEMASK_Y:
      inject_scalar(cp, dst, result, SHUF(Y, X, Z, W));
      break;

   case TGSI_WRITEMASK_Z:
      inject_scalar(cp, dst, result, SHUF(Z, Y, X, W));
      break;

   case TGSI_WRITEMASK_W:
      inject_scalar(cp, dst, result, SHUF(W, Y, Z, X));
      break;

   default:
      break;
   }

   aos_adopt_xmm_reg(cp,
                     dst,
                     reg->Register.File,
                     reg->Register.Index,
                     TRUE);
}

static void x87_fst_or_nop( struct x86_function *func,
                            unsigned writemask,
                            unsigned channel,
                            struct x86_reg ptr )
{
   assert(ptr.file == file_REG32);
   if (writemask & (1<<channel))
      x87_fst( func, x86_make_disp(ptr, channel * sizeof(float)) );
}

static void x87_fstp_or_pop( struct x86_function *func,
                             unsigned writemask,
                             unsigned channel,
                             struct x86_reg ptr )
{
   assert(ptr.file == file_REG32);
   if (writemask & (1<<channel))
      x87_fstp( func, x86_make_disp(ptr, channel * sizeof(float)) );
   else
      x87_fpop( func );
}

/* Store the value on the top of the x87 stack to every channel
 * selected by the writemask, popping it with the final store.
 */
static void x87_fstp_dest4( struct aos_compilation *cp,
                            const struct tgsi_full_dst_register *dst )
{
   struct x86_reg ptr = get_dst_ptr(cp, dst);
   unsigned writemask = dst->Register.WriteMask;

   x87_fst_or_nop(cp->func, writemask, 0, ptr);
   x87_fst_or_nop(cp->func, writemask, 1, ptr);
   x87_fst_or_nop(cp->func, writemask, 2, ptr);
   x87_fstp_or_pop(cp->func, writemask, 3, ptr);
}

/* Save current x87 state and put it into single precision mode.
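 *
 * save_fpu_state() stashes the application's control word in
 * aos_machine.fpu_restore with fnstcw; the set_fpu_round_*() helpers
 * below then load preset control words (fpu_rnd_nearest /
 * fpu_rnd_neg_inf, presumably assembled elsewhere from the X87_CW_*
 * bits defined above) so that fprndint performs round-to-nearest or
 * floor as required, and restore_fpu_state() puts the original word
 * back before the generated function returns.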
*/ static void save_fpu_state( struct aos_compilation *cp ) { x87_fnstcw( cp->func, x86_make_disp(cp->machine_EDX, Offset(struct aos_machine, fpu_restore))); } static void restore_fpu_state( struct aos_compilation *cp ) { x87_fnclex(cp->func); x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX, Offset(struct aos_machine, fpu_restore))); } static void set_fpu_round_neg_inf( struct aos_compilation *cp ) { if (cp->fpucntl != FPU_RND_NEG) { cp->fpucntl = FPU_RND_NEG; x87_fnclex(cp->func); x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX, Offset(struct aos_machine, fpu_rnd_neg_inf))); } } static void set_fpu_round_nearest( struct aos_compilation *cp ) { if (cp->fpucntl != FPU_RND_NEAREST) { cp->fpucntl = FPU_RND_NEAREST; x87_fnclex(cp->func); x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX, Offset(struct aos_machine, fpu_rnd_nearest))); } } #if 0 static void x87_emit_ex2( struct aos_compilation *cp ) { struct x86_reg st0 = x86_make_reg(file_x87, 0); struct x86_reg st1 = x86_make_reg(file_x87, 1); int stack = cp->func->x87_stack; /* set_fpu_round_neg_inf( cp ); */ x87_fld(cp->func, st0); /* a a */ x87_fprndint( cp->func ); /* int(a) a*/ x87_fsubr(cp->func, st1, st0); /* int(a) frc(a) */ x87_fxch(cp->func, st1); /* frc(a) int(a) */ x87_f2xm1(cp->func); /* (2^frc(a))-1 int(a) */ x87_fld1(cp->func); /* 1 (2^frc(a))-1 int(a) */ x87_faddp(cp->func, st1); /* 2^frac(a) int(a) */ x87_fscale(cp->func); /* (2^frac(a)*2^int(int(a))) int(a) */ /* 2^a int(a) */ x87_fstp(cp->func, st1); /* 2^a */ assert( stack == cp->func->x87_stack); } #endif #if 0 static void PIPE_CDECL print_reg( const char *msg, const float *reg ) { debug_printf("%s: %f %f %f %f\n", msg, reg[0], reg[1], reg[2], reg[3]); } #endif #if 0 static void emit_print( struct aos_compilation *cp, const char *message, /* must point to a static string! */ unsigned file, unsigned idx ) { struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX ); struct x86_reg arg = aos_get_shader_reg_ptr( cp, file, idx ); unsigned i; /* There shouldn't be anything on the x87 stack. Can add this * capacity later if need be. */ assert(cp->func->x87_stack == 0); /* For absolute correctness, need to spill/invalidate all XMM regs * too. We're obviously not concerned about performance on this * debug path, so here goes: */ for (i = 0; i < 8; i++) { if (cp->xmm[i].dirty) spill(cp, i); aos_release_xmm_reg(cp, i); } /* Push caller-save (ie scratch) regs. */ x86_cdecl_caller_push_regs( cp->func ); /* Push the arguments: */ x86_lea( cp->func, ecx, arg ); x86_push( cp->func, ecx ); x86_push_imm32( cp->func, (int)message ); /* Call the helper. Could call debug_printf directly, but * print_reg is a nice place to put a breakpoint if need be. */ x86_mov_reg_imm( cp->func, ecx, (int)print_reg ); x86_call( cp->func, ecx ); x86_pop( cp->func, ecx ); x86_pop( cp->func, ecx ); /* Pop caller-save regs */ x86_cdecl_caller_pop_regs( cp->func ); /* Done... */ } #endif /** * The traditional instructions. All operate on internal registers * and ignore write masks and swizzling issues. 
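 *
 * Most of the SSE cases follow the same pattern: fetch_src() the
 * operands (applying swizzle/negate/abs), grab a writable xmm via
 * get_xmm_writable(), emit the arithmetic, then store_dest() or
 * store_scalar_dest() to honour the writemask.  Transcendental
 * opcodes (COS, SIN, LG2, ...) go through the x87 stack via
 * x87_fld_src()/x87_fstp_dest4() instead.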
*/ static boolean emit_ABS( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) { struct x86_reg arg0 = fetch_src(cp, &op->Src[0]); struct x86_reg neg = aos_get_internal(cp, IMM_NEGS); struct x86_reg tmp = aos_get_xmm_reg(cp); sse_movaps(cp->func, tmp, arg0); sse_mulps(cp->func, tmp, neg); sse_maxps(cp->func, tmp, arg0); store_dest(cp, &op->Dst[0], tmp); return TRUE; } static boolean emit_ADD( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) { struct x86_reg arg0 = fetch_src(cp, &op->Src[0]); struct x86_reg arg1 = fetch_src(cp, &op->Src[1]); struct x86_reg dst = get_xmm_writable(cp, arg0); sse_addps(cp->func, dst, arg1); store_dest(cp, &op->Dst[0], dst); return TRUE; } static boolean emit_COS( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) { x87_fld_src(cp, &op->Src[0], 0); x87_fcos(cp->func); x87_fstp_dest4(cp, &op->Dst[0]); return TRUE; } /* The dotproduct instructions don't really do that well in sse: * XXX: produces wrong results -- disabled. */ static boolean emit_DP3( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) { struct x86_reg arg0 = fetch_src(cp, &op->Src[0]); struct x86_reg arg1 = fetch_src(cp, &op->Src[1]); struct x86_reg tmp = aos_get_xmm_reg(cp); struct x86_reg dst = get_xmm_writable(cp, arg0); sse_mulps(cp->func, dst, arg1); /* Now the hard bit: sum the first 3 values: */ sse_movhlps(cp->func, tmp, dst); sse_addss(cp->func, dst, tmp); /* a*x+c*z, b*y, ?, ? */ emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z)); sse_addss(cp->func, dst, tmp); aos_release_xmm_reg(cp, tmp.idx); store_scalar_dest(cp, &op->Dst[0], dst); return TRUE; } static boolean emit_DP4( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) { struct x86_reg arg0 = fetch_src(cp, &op->Src[0]); struct x86_reg arg1 = fetch_src(cp, &op->Src[1]); struct x86_reg tmp = aos_get_xmm_reg(cp); struct x86_reg dst = get_xmm_writable(cp, arg0); sse_mulps(cp->func, dst, arg1); /* Now the hard bit: sum the values: */ sse_movhlps(cp->func, tmp, dst); sse_addps(cp->func, dst, tmp); /* a*x+c*z, b*y+d*w, a*x+c*z, b*y+d*w */ emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z)); sse_addss(cp->func, dst, tmp); aos_release_xmm_reg(cp, tmp.idx); store_scalar_dest(cp, &op->Dst[0], dst); return TRUE; } static boolean emit_DPH( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) { struct x86_reg arg0 = fetch_src(cp, &op->Src[0]); struct x86_reg arg1 = fetch_src(cp, &op->Src[1]); struct x86_reg tmp = aos_get_xmm_reg(cp); struct x86_reg dst = get_xmm_writable(cp, arg0); sse_mulps(cp->func, dst, arg1); /* Now the hard bit: sum the values (from DP3): */ sse_movhlps(cp->func, tmp, dst); sse_addss(cp->func, dst, tmp); /* a*x+c*z, b*y, ?, ? 
    */
   emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
   sse_addss(cp->func, dst, tmp);

   emit_pshufd(cp, tmp, arg1, SHUF(W,W,W,W));
   sse_addss(cp->func, dst, tmp);

   aos_release_xmm_reg(cp, tmp.idx);
   store_scalar_dest(cp, &op->Dst[0], dst);
   return TRUE;
}

static boolean emit_DST( struct aos_compilation *cp,
                         const struct tgsi_full_instruction *op )
{
   struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
   struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
   struct x86_reg dst = aos_get_xmm_reg(cp);
   struct x86_reg tmp = aos_get_xmm_reg(cp);
   struct x86_reg ones = aos_get_internal(cp, IMM_ONES);

/*    dst[0] = 1.0     * 1.0F;    */
/*    dst[1] = arg0[1] * arg1[1]; */
/*    dst[2] = arg0[2] * 1.0;     */
/*    dst[3] = 1.0     * arg1[3]; */

   emit_shuf_copy2(cp, dst, arg0, ones, SHUF(X,W,Z,Y));
   emit_shuf_copy2(cp, tmp, arg1, ones, SHUF(X,Z,Y,W));
   sse_mulps(cp->func, dst, tmp);

   aos_release_xmm_reg(cp, tmp.idx);
   store_dest(cp, &op->Dst[0], dst);
   return TRUE;
}

static boolean emit_LG2( struct aos_compilation *cp,
                         const struct tgsi_full_instruction *op )
{
   x87_fld1(cp->func);                 /* 1 */
   x87_fld_src(cp, &op->Src[0], 0);    /* a0 1 */
   x87_fyl2x(cp->func);                /* log2(a0) */
   x87_fstp_dest4(cp, &op->Dst[0]);
   return TRUE;
}

#if 0
static boolean emit_EX2( struct aos_compilation *cp,
                         const struct tgsi_full_instruction *op )
{
   x87_fld_src(cp, &op->Src[0], 0);
   x87_emit_ex2(cp);
   x87_fstp_dest4(cp, &op->Dst[0]);
   return TRUE;
}
#endif

static boolean emit_FLR( struct aos_compilation *cp,
                         const struct tgsi_full_instruction *op )
{
   struct x86_reg dst = get_dst_ptr(cp, &op->Dst[0]);
   unsigned writemask = op->Dst[0].Register.WriteMask;
   int i;

   set_fpu_round_neg_inf( cp );

   /* Load all sources first to avoid aliasing
    */
   for (i = 3; i >= 0; i--) {
      if (writemask & (1<<i)) {
         x87_fld_src(cp, &op->Src[0], i);
      }
   }

   for (i = 0; i < 4; i++) {
      if (writemask & (1<<i)) {
         x87_fprndint( cp->func );
         x87_fstp(cp->func, x86_make_disp(dst, i*4));
      }
   }

   return TRUE;
}

static boolean emit_RND( struct aos_compilation *cp,
                         const struct tgsi_full_instruction *op )
{
   struct x86_reg dst = get_dst_ptr(cp, &op->Dst[0]);
   unsigned writemask = op->Dst[0].Register.WriteMask;
   int i;

   set_fpu_round_nearest( cp );

   /* Load all sources first to avoid aliasing
    */
   for (i = 3; i >= 0; i--) {
      if (writemask & (1<<i)) {
         x87_fld_src(cp, &op->Src[0], i);
      }
   }

   for (i = 0; i < 4; i++) {
      if (writemask & (1<<i)) {
         x87_fprndint( cp->func );
         x87_fstp(cp->func, x86_make_disp(dst, i*4));
      }
   }

   return TRUE;
}

static boolean emit_FRC( struct aos_compilation *cp,
                         const struct tgsi_full_instruction *op )
{
   struct x86_reg dst = get_dst_ptr(cp, &op->Dst[0]);
   struct x86_reg st0 = x86_make_reg(file_x87, 0);
   struct x86_reg st1 = x86_make_reg(file_x87, 1);
   unsigned writemask = op->Dst[0].Register.WriteMask;
   int i;

   set_fpu_round_neg_inf( cp );

   /* suck all the source values onto the stack before writing out any
    * dst, which may alias...
    */
   for (i = 3; i >= 0; i--) {
      if (writemask & (1<<i)) {
         x87_fld_src(cp, &op->Src[0], i);
      }
   }

   for (i = 0; i < 4; i++) {
      if (writemask & (1<<i)) {
         x87_fld(cp->func, st0);     /* a a */
         x87_fprndint( cp->func );   /* flr(a) a */
         x87_fsubp(cp->func, st1);   /* frc(a) */
         x87_fstp(cp->func, x86_make_disp(dst, i*4));
      }
   }

   return TRUE;
}

static boolean emit_LIT( struct aos_compilation *cp,
                         const struct tgsi_full_instruction *op )
{
   struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
   unsigned writemask = op->Dst[0].Register.WriteMask;
   unsigned lit_count = cp->lit_count++;
   struct x86_reg result, arg0;
   unsigned i;

#if 1
   /* For absolute correctness, need to spill/invalidate all XMM regs
    * too.
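    *
    * The code emitted below makes a cdecl call out to a C helper (the
    * per-LIT function cached in aos_machine.lit_info, or aos_do_lit as
    * the fallback), and that helper reads and writes the in-memory
    * aos_machine image, so every cached/dirty xmm value has to be
    * written back and forgotten first.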
*/ for (i = 0; i < 8; i++) { if (cp->xmm[i].dirty) spill(cp, i); aos_release_xmm_reg(cp, i); } #endif if (writemask != TGSI_WRITEMASK_XYZW) result = x86_make_disp(cp->machine_EDX, Offset(struct aos_machine, tmp[0])); else result = get_dst_ptr(cp, &op->Dst[0]); arg0 = fetch_src( cp, &op->Src[0] ); if (arg0.file == file_XMM) { struct x86_reg tmp = x86_make_disp(cp->machine_EDX, Offset(struct aos_machine, tmp[1])); sse_movaps( cp->func, tmp, arg0 ); arg0 = tmp; } /* Push caller-save (ie scratch) regs. */ x86_cdecl_caller_push_regs( cp->func ); /* Push the arguments: */ x86_push_imm32( cp->func, lit_count ); x86_lea( cp->func, ecx, arg0 ); x86_push( cp->func, ecx ); x86_lea( cp->func, ecx, result ); x86_push( cp->func, ecx ); x86_push( cp->func, cp->machine_EDX ); if (lit_count < MAX_LIT_INFO) { x86_mov( cp->func, ecx, x86_make_disp( cp->machine_EDX, Offset(struct aos_machine, lit_info) + lit_count * sizeof(struct lit_info) + Offset(struct lit_info, func))); } else { x86_mov_reg_imm( cp->func, ecx, (int)aos_do_lit ); } x86_call( cp->func, ecx ); x86_pop( cp->func, ecx ); /* fixme... */ x86_pop( cp->func, ecx ); x86_pop( cp->func, ecx ); x86_pop( cp->func, ecx ); x86_cdecl_caller_pop_regs( cp->func ); if (writemask != TGSI_WRITEMASK_XYZW) { store_dest( cp, &op->Dst[0], get_xmm_writable( cp, result ) ); } return TRUE; } #if 0 static boolean emit_inline_LIT( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) { struct x86_reg dst = get_dst_ptr(cp, &op->Dst[0]); unsigned writemask = op->Dst[0].Register.WriteMask; if (writemask & TGSI_WRITEMASK_YZ) { struct x86_reg st1 = x86_make_reg(file_x87, 1); struct x86_reg st2 = x86_make_reg(file_x87, 2); /* a1' = a1 <= 0 ? 1 : a1; */ x87_fldz(cp->func); /* 1 0 */ #if 1 x87_fld1(cp->func); /* 1 0 */ #else /* Correct but slow due to fp exceptions generated in fyl2x - fix me. 
*/ x87_fldz(cp->func); /* 1 0 */ #endif x87_fld_src(cp, &op->Src[0], 1); /* a1 1 0 */ x87_fcomi(cp->func, st2); /* a1 1 0 */ x87_fcmovb(cp->func, st1); /* a1' 1 0 */ x87_fstp(cp->func, st1); /* a1' 0 */ x87_fstp(cp->func, st1); /* a1' */ x87_fld_src(cp, &op->Src[0], 3); /* a3 a1' */ x87_fxch(cp->func, st1); /* a1' a3 */ /* Compute pow(a1, a3) */ x87_fyl2x(cp->func); /* a3*log2(a1) */ x87_emit_ex2( cp ); /* 2^(a3*log2(a1)) */ /* a0' = max2(a0, 0): */ x87_fldz(cp->func); /* 0 r2 */ x87_fld_src(cp, &op->Src[0], 0); /* a0 0 r2 */ x87_fcomi(cp->func, st1); x87_fcmovb(cp->func, st1); /* a0' 0 r2 */ x87_fst_or_nop(cp->func, writemask, 1, dst); /* result[1] = a0' */ x87_fcomi(cp->func, st1); /* a0' 0 r2 */ x87_fcmovnbe(cp->func, st2); /* r2' 0' r2 */ x87_fstp_or_pop(cp->func, writemask, 2, dst); /* 0 r2 */ x87_fpop(cp->func); /* r2 */ x87_fpop(cp->func); } if (writemask & TGSI_WRITEMASK_XW) { x87_fld1(cp->func); x87_fst_or_nop(cp->func, writemask, 0, dst); x87_fstp_or_pop(cp->func, writemask, 3, dst); } return TRUE; } #endif static boolean emit_MAX( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) { struct x86_reg arg0 = fetch_src(cp, &op->Src[0]); struct x86_reg arg1 = fetch_src(cp, &op->Src[1]); struct x86_reg dst = get_xmm_writable(cp, arg0); sse_maxps(cp->func, dst, arg1); store_dest(cp, &op->Dst[0], dst); return TRUE; } static boolean emit_MIN( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) { struct x86_reg arg0 = fetch_src(cp, &op->Src[0]); struct x86_reg arg1 = fetch_src(cp, &op->Src[1]); struct x86_reg dst = get_xmm_writable(cp, arg0); sse_minps(cp->func, dst, arg1); store_dest(cp, &op->Dst[0], dst); return TRUE; } static boolean emit_MOV( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) { struct x86_reg arg0 = fetch_src(cp, &op->Src[0]); struct x86_reg dst = get_xmm_writable(cp, arg0); /* potentially nothing to do */ store_dest(cp, &op->Dst[0], dst); return TRUE; } static boolean emit_MUL( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) { struct x86_reg arg0 = fetch_src(cp, &op->Src[0]); struct x86_reg arg1 = fetch_src(cp, &op->Src[1]); struct x86_reg dst = get_xmm_writable(cp, arg0); sse_mulps(cp->func, dst, arg1); store_dest(cp, &op->Dst[0], dst); return TRUE; } static boolean emit_MAD( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) { struct x86_reg arg0 = fetch_src(cp, &op->Src[0]); struct x86_reg arg1 = fetch_src(cp, &op->Src[1]); struct x86_reg arg2 = fetch_src(cp, &op->Src[2]); /* If we can't clobber old contents of arg0, get a temporary & copy * it there, then clobber it... */ arg0 = get_xmm_writable(cp, arg0); sse_mulps(cp->func, arg0, arg1); sse_addps(cp->func, arg0, arg2); store_dest(cp, &op->Dst[0], arg0); return TRUE; } /* A wrapper for powf(). * Makes sure it is cdecl and operates on floats. */ static float PIPE_CDECL _powerf( float x, float y ) { #if FAST_MATH return util_fast_pow(x, y); #else return powf( x, y ); #endif } #if FAST_MATH static float PIPE_CDECL _exp2(float x) { return util_fast_exp2(x); } #endif /* Really not sufficient -- need to check for conditions that could * generate inf/nan values, which will slow things down hugely. 
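 *
 * The disabled x87 branch below would compute pow(a0, a1) as
 * 2^(a1 * log2(a0)) using fyl2x followed by the ex2 helper; the active
 * branch instead spills both scalars to the stack and makes a cdecl
 * call to the _powerf wrapper above, leaving the result on the x87
 * stack for x87_fstp_dest4().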
*/ static boolean emit_POW( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) { #if 0 x87_fld_src(cp, &op->Src[1], 0); /* a1.x */ x87_fld_src(cp, &op->Src[0], 0); /* a0.x a1.x */ x87_fyl2x(cp->func); /* a1*log2(a0) */ x87_emit_ex2( cp ); /* 2^(a1*log2(a0)) */ x87_fstp_dest4(cp, &op->Dst[0]); #else uint i; /* For absolute correctness, need to spill/invalidate all XMM regs * too. */ for (i = 0; i < 8; i++) { if (cp->xmm[i].dirty) spill(cp, i); aos_release_xmm_reg(cp, i); } /* Push caller-save (ie scratch) regs. */ x86_cdecl_caller_push_regs( cp->func ); x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, -8) ); x87_fld_src( cp, &op->Src[1], 0 ); x87_fstp( cp->func, x86_make_disp( cp->stack_ESP, 4 ) ); x87_fld_src( cp, &op->Src[0], 0 ); x87_fstp( cp->func, x86_make_disp( cp->stack_ESP, 0 ) ); /* tmp_EAX has been pushed & will be restored below */ x86_mov_reg_imm( cp->func, cp->tmp_EAX, (unsigned long) _powerf ); x86_call( cp->func, cp->tmp_EAX ); x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, 8) ); x86_cdecl_caller_pop_regs( cp->func ); /* Note retval on x87 stack: */ cp->func->x87_stack++; x87_fstp_dest4( cp, &op->Dst[0] ); #endif return TRUE; } #if FAST_MATH static boolean emit_EXPBASE2( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) { uint i; /* For absolute correctness, need to spill/invalidate all XMM regs * too. */ for (i = 0; i < 8; i++) { if (cp->xmm[i].dirty) spill(cp, i); aos_release_xmm_reg(cp, i); } /* Push caller-save (ie scratch) regs. */ x86_cdecl_caller_push_regs( cp->func ); x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, -4) ); x87_fld_src( cp, &op->Src[0], 0 ); x87_fstp( cp->func, x86_make_disp( cp->stack_ESP, 0 ) ); /* tmp_EAX has been pushed & will be restored below */ x86_mov_reg_imm( cp->func, cp->tmp_EAX, (unsigned long) _exp2 ); x86_call( cp->func, cp->tmp_EAX ); x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, 4) ); x86_cdecl_caller_pop_regs( cp->func ); /* Note retval on x87 stack: */ cp->func->x87_stack++; x87_fstp_dest4( cp, &op->Dst[0] ); return TRUE; } #endif static boolean emit_RCP( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) { struct x86_reg arg0 = fetch_src(cp, &op->Src[0]); struct x86_reg dst = aos_get_xmm_reg(cp); if (cp->have_sse2) { sse2_rcpss(cp->func, dst, arg0); /* extend precision here... 
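 * A single Newton-Raphson step would do it, along the lines of the
 * refinement described in the comment block below:
 *    x1 = rcpss(a);
 *    x1 = x1 * (2 - a * x1);      i.e. 2*x1 - a*x1*x1
 * This is only a sketch of a possible improvement, not something the
 * current code emits.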
*/ } else { struct x86_reg ones = aos_get_internal(cp, IMM_ONES); sse_movss(cp->func, dst, ones); sse_divss(cp->func, dst, arg0); } store_scalar_dest(cp, &op->Dst[0], dst); return TRUE; } /* Although rsqrtps() and rcpps() are low precision on some/all SSE * implementations, it is possible to improve its precision at * fairly low cost, using a newton/raphson step, as below: * * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a) * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)] * or: * x1 = rsqrtps(a) * [1.5 - .5 * a * rsqrtps(a) * rsqrtps(a)] * * * See: http://softwarecommunity.intel.com/articles/eng/1818.htm */ static boolean emit_RSQ( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) { if (0) { struct x86_reg arg0 = fetch_src(cp, &op->Src[0]); struct x86_reg r = aos_get_xmm_reg(cp); sse_rsqrtss(cp->func, r, arg0); store_scalar_dest(cp, &op->Dst[0], r); return TRUE; } else { struct x86_reg arg0 = fetch_src(cp, &op->Src[0]); struct x86_reg r = aos_get_xmm_reg(cp); struct x86_reg neg_half = get_reg_ptr( cp, AOS_FILE_INTERNAL, IMM_RSQ ); struct x86_reg one_point_five = x86_make_disp( neg_half, 4 ); struct x86_reg src = get_xmm_writable( cp, arg0 ); struct x86_reg neg = aos_get_internal(cp, IMM_NEGS); struct x86_reg tmp = aos_get_xmm_reg(cp); sse_movaps(cp->func, tmp, src); sse_mulps(cp->func, tmp, neg); sse_maxps(cp->func, tmp, src); sse_rsqrtss( cp->func, r, tmp ); /* rsqrtss(a) */ sse_mulss( cp->func, tmp, neg_half ); /* -.5 * a */ sse_mulss( cp->func, tmp, r ); /* -.5 * a * r */ sse_mulss( cp->func, tmp, r ); /* -.5 * a * r * r */ sse_addss( cp->func, tmp, one_point_five ); /* 1.5 - .5 * a * r * r */ sse_mulss( cp->func, r, tmp ); /* r * (1.5 - .5 * a * r * r) */ store_scalar_dest(cp, &op->Dst[0], r); aos_release_xmm_reg(cp, tmp.idx); return TRUE; } } static boolean emit_SGE( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) { struct x86_reg arg0 = fetch_src(cp, &op->Src[0]); struct x86_reg arg1 = fetch_src(cp, &op->Src[1]); struct x86_reg ones = aos_get_internal(cp, IMM_ONES); struct x86_reg dst = get_xmm_writable(cp, arg0); sse_cmpps(cp->func, dst, arg1, cc_NotLessThan); sse_andps(cp->func, dst, ones); store_dest(cp, &op->Dst[0], dst); return TRUE; } static boolean emit_SIN( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) { x87_fld_src(cp, &op->Src[0], 0); x87_fsin(cp->func); x87_fstp_dest4(cp, &op->Dst[0]); return TRUE; } static boolean emit_SLT( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) { struct x86_reg arg0 = fetch_src(cp, &op->Src[0]); struct x86_reg arg1 = fetch_src(cp, &op->Src[1]); struct x86_reg ones = aos_get_internal(cp, IMM_ONES); struct x86_reg dst = get_xmm_writable(cp, arg0); sse_cmpps(cp->func, dst, arg1, cc_LessThan); sse_andps(cp->func, dst, ones); store_dest(cp, &op->Dst[0], dst); return TRUE; } static boolean emit_SUB( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) { struct x86_reg arg0 = fetch_src(cp, &op->Src[0]); struct x86_reg arg1 = fetch_src(cp, &op->Src[1]); struct x86_reg dst = get_xmm_writable(cp, arg0); sse_subps(cp->func, dst, arg1); store_dest(cp, &op->Dst[0], dst); return TRUE; } static boolean emit_TRUNC( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) { struct x86_reg arg0 = fetch_src(cp, &op->Src[0]); struct x86_reg tmp0 = aos_get_xmm_reg(cp); sse2_cvttps2dq(cp->func, tmp0, arg0); sse2_cvtdq2ps(cp->func, tmp0, tmp0); store_dest(cp, &op->Dst[0], tmp0); return TRUE; } static boolean emit_XPD( struct aos_compilation *cp, 
const struct tgsi_full_instruction *op ) { struct x86_reg arg0 = fetch_src(cp, &op->Src[0]); struct x86_reg arg1 = fetch_src(cp, &op->Src[1]); struct x86_reg tmp0 = aos_get_xmm_reg(cp); struct x86_reg tmp1 = aos_get_xmm_reg(cp); emit_pshufd(cp, tmp1, arg1, SHUF(Y, Z, X, W)); sse_mulps(cp->func, tmp1, arg0); emit_pshufd(cp, tmp0, arg0, SHUF(Y, Z, X, W)); sse_mulps(cp->func, tmp0, arg1); sse_subps(cp->func, tmp1, tmp0); sse_shufps(cp->func, tmp1, tmp1, SHUF(Y, Z, X, W)); /* dst[2] = arg0[0] * arg1[1] - arg0[1] * arg1[0]; */ /* dst[0] = arg0[1] * arg1[2] - arg0[2] * arg1[1]; */ /* dst[1] = arg0[2] * arg1[0] - arg0[0] * arg1[2]; */ /* dst[3] is undef */ aos_release_xmm_reg(cp, tmp0.idx); store_dest(cp, &op->Dst[0], tmp1); return TRUE; } static boolean emit_instruction( struct aos_compilation *cp, struct tgsi_full_instruction *inst ) { x87_assert_stack_empty(cp->func); switch( inst->Instruction.Opcode ) { case TGSI_OPCODE_MOV: return emit_MOV( cp, inst ); case TGSI_OPCODE_LIT: return emit_LIT(cp, inst); case TGSI_OPCODE_RCP: return emit_RCP(cp, inst); case TGSI_OPCODE_RSQ: return emit_RSQ(cp, inst); case TGSI_OPCODE_EXP: /*return emit_EXP(cp, inst);*/ return FALSE; case TGSI_OPCODE_LOG: /*return emit_LOG(cp, inst);*/ return FALSE; case TGSI_OPCODE_MUL: return emit_MUL(cp, inst); case TGSI_OPCODE_ADD: return emit_ADD(cp, inst); case TGSI_OPCODE_DP3: return emit_DP3(cp, inst); case TGSI_OPCODE_DP4: return emit_DP4(cp, inst); case TGSI_OPCODE_DST: return emit_DST(cp, inst); case TGSI_OPCODE_MIN: return emit_MIN(cp, inst); case TGSI_OPCODE_MAX: return emit_MAX(cp, inst); case TGSI_OPCODE_SLT: return emit_SLT(cp, inst); case TGSI_OPCODE_SGE: return emit_SGE(cp, inst); case TGSI_OPCODE_MAD: return emit_MAD(cp, inst); case TGSI_OPCODE_SUB: return emit_SUB(cp, inst); case TGSI_OPCODE_LRP: /*return emit_LERP(cp, inst);*/ return FALSE; case TGSI_OPCODE_FRC: return emit_FRC(cp, inst); case TGSI_OPCODE_CLAMP: /*return emit_CLAMP(cp, inst);*/ return FALSE; case TGSI_OPCODE_FLR: return emit_FLR(cp, inst); case TGSI_OPCODE_ROUND: return emit_RND(cp, inst); case TGSI_OPCODE_EX2: #if FAST_MATH return emit_EXPBASE2(cp, inst); #elif 0 /* this seems to fail for "larger" exponents. * See glean tvertProg1's EX2 test. */ return emit_EX2(cp, inst); #else return FALSE; #endif case TGSI_OPCODE_LG2: return emit_LG2(cp, inst); case TGSI_OPCODE_POW: return emit_POW(cp, inst); case TGSI_OPCODE_XPD: return emit_XPD(cp, inst); case TGSI_OPCODE_ABS: return emit_ABS(cp, inst); case TGSI_OPCODE_DPH: return emit_DPH(cp, inst); case TGSI_OPCODE_COS: return emit_COS(cp, inst); case TGSI_OPCODE_SIN: return emit_SIN(cp, inst); case TGSI_OPCODE_TRUNC: return emit_TRUNC(cp, inst); case TGSI_OPCODE_END: return TRUE; default: return FALSE; } } static boolean emit_viewport( struct aos_compilation *cp ) { struct x86_reg pos = aos_get_shader_reg_xmm(cp, TGSI_FILE_OUTPUT, cp->vaos->draw->vs.position_output ); struct x86_reg scale = x86_make_disp(cp->machine_EDX, Offset(struct aos_machine, scale)); struct x86_reg translate = x86_make_disp(cp->machine_EDX, Offset(struct aos_machine, translate)); sse_mulps(cp->func, pos, scale); sse_addps(cp->func, pos, translate); aos_adopt_xmm_reg( cp, pos, TGSI_FILE_OUTPUT, cp->vaos->draw->vs.position_output, TRUE ); return TRUE; } /* This is useful to be able to see the results on softpipe. Doesn't * do proper clipping, just assumes the backend can do it during * rasterization -- for debug only... 
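 *
 * The generated code splats 1/w (via rcpss on pos.w), multiplies the
 * position by the viewport scale and by that reciprocal, adds the
 * translate, and then writes the reciprocal back into the fourth
 * component so the rasterizer sees an rhw value there.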
*/ static boolean emit_rhw_viewport( struct aos_compilation *cp ) { struct x86_reg tmp = aos_get_xmm_reg(cp); struct x86_reg pos = aos_get_shader_reg_xmm(cp, TGSI_FILE_OUTPUT, cp->vaos->draw->vs.position_output); struct x86_reg scale = x86_make_disp(cp->machine_EDX, Offset(struct aos_machine, scale)); struct x86_reg translate = x86_make_disp(cp->machine_EDX, Offset(struct aos_machine, translate)); emit_pshufd(cp, tmp, pos, SHUF(W, W, W, W)); sse2_rcpss(cp->func, tmp, tmp); sse_shufps(cp->func, tmp, tmp, SHUF(X, X, X, X)); sse_mulps(cp->func, pos, scale); sse_mulps(cp->func, pos, tmp); sse_addps(cp->func, pos, translate); /* Set pos[3] = w */ mask_write(cp, pos, tmp, TGSI_WRITEMASK_W); aos_adopt_xmm_reg( cp, pos, TGSI_FILE_OUTPUT, cp->vaos->draw->vs.position_output, TRUE ); return TRUE; } #if 0 static boolean note_immediate( struct aos_compilation *cp, struct tgsi_full_immediate *imm ) { unsigned pos = cp->num_immediates++; unsigned j; assert( imm->Immediate.NrTokens <= 4 + 1 ); for (j = 0; j < imm->Immediate.NrTokens - 1; j++) { cp->vaos->machine->immediate[pos][j] = imm->u[j].Float; } return TRUE; } #endif static void find_last_write_outputs( struct aos_compilation *cp ) { struct tgsi_parse_context parse; unsigned this_instruction = 0; unsigned i; tgsi_parse_init( &parse, cp->vaos->base.vs->state.tokens ); while (!tgsi_parse_end_of_tokens( &parse )) { tgsi_parse_token( &parse ); if (parse.FullToken.Token.Type != TGSI_TOKEN_TYPE_INSTRUCTION) continue; for (i = 0; i < TGSI_FULL_MAX_DST_REGISTERS; i++) { if (parse.FullToken.FullInstruction.Dst[i].Register.File == TGSI_FILE_OUTPUT) { unsigned idx = parse.FullToken.FullInstruction.Dst[i].Register.Index; cp->output_last_write[idx] = this_instruction; } } this_instruction++; } tgsi_parse_free( &parse ); } #define ARG_MACHINE 1 #define ARG_START_ELTS 2 #define ARG_COUNT 3 #define ARG_OUTBUF 4 static boolean build_vertex_program( struct draw_vs_variant_aos_sse *variant, boolean linear ) { struct tgsi_parse_context parse; struct aos_compilation cp; unsigned fixup, label; util_init_math(); tgsi_parse_init( &parse, variant->base.vs->state.tokens ); memset(&cp, 0, sizeof(cp)); cp.insn_counter = 1; cp.vaos = variant; cp.have_sse2 = 1; cp.func = &variant->func[ linear ? 0 : 1 ]; cp.tmp_EAX = x86_make_reg(file_REG32, reg_AX); cp.idx_EBX = x86_make_reg(file_REG32, reg_BX); cp.outbuf_ECX = x86_make_reg(file_REG32, reg_CX); cp.machine_EDX = x86_make_reg(file_REG32, reg_DX); cp.count_ESI = x86_make_reg(file_REG32, reg_SI); cp.temp_EBP = x86_make_reg(file_REG32, reg_BP); cp.stack_ESP = x86_make_reg( file_REG32, reg_SP ); x86_init_func(cp.func); find_last_write_outputs(&cp); x86_push(cp.func, cp.idx_EBX); x86_push(cp.func, cp.count_ESI); x86_push(cp.func, cp.temp_EBP); /* Load arguments into regs: */ x86_mov(cp.func, cp.machine_EDX, x86_fn_arg(cp.func, ARG_MACHINE)); x86_mov(cp.func, cp.idx_EBX, x86_fn_arg(cp.func, ARG_START_ELTS)); x86_mov(cp.func, cp.count_ESI, x86_fn_arg(cp.func, ARG_COUNT)); x86_mov(cp.func, cp.outbuf_ECX, x86_fn_arg(cp.func, ARG_OUTBUF)); /* Compare count to zero and possibly bail. */ x86_xor(cp.func, cp.tmp_EAX, cp.tmp_EAX); x86_cmp(cp.func, cp.count_ESI, cp.tmp_EAX); fixup = x86_jcc_forward(cp.func, cc_E); save_fpu_state( &cp ); set_fpu_round_nearest( &cp ); aos_init_inputs( &cp, linear ); cp.x86_reg[0] = 0; cp.x86_reg[1] = 0; /* Note address for loop jump */ label = x86_get_label(cp.func); { /* Fetch inputs... TODO: fetch lazily... 
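       *
       * This block is the body of the generated per-vertex loop: fetch
       * the vertex attributes, run the translated TGSI instructions,
       * apply the viewport (or debug rhw) transform, emit the outputs,
       * then advance the output pointer and the input pointers before
       * the count is decremented and the loop branches back to the
       * label above.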
*/ if (!aos_fetch_inputs( &cp, linear )) goto fail; /* Emit the shader: */ while( !tgsi_parse_end_of_tokens( &parse ) && !cp.error ) { tgsi_parse_token( &parse ); switch (parse.FullToken.Token.Type) { case TGSI_TOKEN_TYPE_IMMEDIATE: #if 0 if (!note_immediate( &cp, &parse.FullToken.FullImmediate )) goto fail; #endif break; case TGSI_TOKEN_TYPE_INSTRUCTION: if (DISASSEM) tgsi_dump_instruction( &parse.FullToken.FullInstruction, cp.insn_counter ); if (!emit_instruction( &cp, &parse.FullToken.FullInstruction )) goto fail; break; } x87_assert_stack_empty(cp.func); cp.insn_counter++; if (DISASSEM) debug_printf("\n"); } { unsigned i; for (i = 0; i < 8; i++) { if (cp.xmm[i].file != TGSI_FILE_OUTPUT) { cp.xmm[i].file = TGSI_FILE_NULL; cp.xmm[i].dirty = 0; } } } if (cp.error) goto fail; if (cp.vaos->base.key.clip) { /* not really handling clipping, just do the rhw so we can * see the results... */ emit_rhw_viewport(&cp); } else if (cp.vaos->base.key.viewport) { emit_viewport(&cp); } /* Emit output... TODO: do this eagerly after the last write to a * given output. */ if (!aos_emit_outputs( &cp )) goto fail; /* Next vertex: */ x86_lea(cp.func, cp.outbuf_ECX, x86_make_disp(cp.outbuf_ECX, cp.vaos->base.key.output_stride)); /* Incr index */ aos_incr_inputs( &cp, linear ); } /* decr count, loop if not zero */ x86_dec(cp.func, cp.count_ESI); x86_jcc(cp.func, cc_NZ, label); restore_fpu_state(&cp); /* Land forward jump here: */ x86_fixup_fwd_jump(cp.func, fixup); /* Exit mmx state? */ if (cp.func->need_emms) mmx_emms(cp.func); x86_pop(cp.func, cp.temp_EBP); x86_pop(cp.func, cp.count_ESI); x86_pop(cp.func, cp.idx_EBX); x87_assert_stack_empty(cp.func); x86_ret(cp.func); tgsi_parse_free( &parse ); return !cp.error; fail: tgsi_parse_free( &parse ); return FALSE; } /** cast wrapper */ static INLINE struct draw_vs_variant_aos_sse * draw_vs_variant_aos_sse(struct draw_vs_variant *variant) { return (struct draw_vs_variant_aos_sse *) variant; } static void vaos_set_buffer( struct draw_vs_variant *variant, unsigned buf, const void *ptr, unsigned stride, unsigned max_stride) { struct draw_vs_variant_aos_sse *vaos = draw_vs_variant_aos_sse(variant); if (buf < vaos->nr_vb) { vaos->buffer[buf].base_ptr = (char *)ptr; vaos->buffer[buf].stride = stride; } if (0) debug_printf("%s %d/%d: %p %d\n", __FUNCTION__, buf, vaos->nr_vb, ptr, stride); } static void PIPE_CDECL vaos_run_elts( struct draw_vs_variant *variant, const unsigned *elts, unsigned count, void *output_buffer ) { struct draw_vs_variant_aos_sse *vaos = draw_vs_variant_aos_sse(variant); struct aos_machine *machine = vaos->draw->vs.aos_machine; unsigned i; if (0) debug_printf("%s %d\n", __FUNCTION__, count); machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size; for (i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) { machine->constants[i] = vaos->draw->vs.aligned_constants[i]; } machine->immediates = vaos->base.vs->immediates; machine->buffer = vaos->buffer; vaos->gen_run_elts( machine, elts, count, output_buffer ); } static void PIPE_CDECL vaos_run_linear( struct draw_vs_variant *variant, unsigned start, unsigned count, void *output_buffer ) { struct draw_vs_variant_aos_sse *vaos = draw_vs_variant_aos_sse(variant); struct aos_machine *machine = vaos->draw->vs.aos_machine; unsigned i; if (0) debug_printf("%s %d %d const: %x\n", __FUNCTION__, start, count, vaos->base.key.const_vbuffers); machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size; for (i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) { machine->constants[i] = 
vaos->draw->vs.aligned_constants[i]; } machine->immediates = vaos->base.vs->immediates; machine->buffer = vaos->buffer; vaos->gen_run_linear( machine, start, count, output_buffer ); /* Sanity spot checks to make sure we didn't trash our constants */ assert(machine->internal[IMM_ONES][0] == 1.0f); assert(machine->internal[IMM_IDENTITY][0] == 0.0f); assert(machine->internal[IMM_NEGS][0] == -1.0f); } static void vaos_destroy( struct draw_vs_variant *variant ) { struct draw_vs_variant_aos_sse *vaos = draw_vs_variant_aos_sse(variant); FREE( vaos->buffer ); x86_release_func( &vaos->func[0] ); x86_release_func( &vaos->func[1] ); FREE(vaos); } static struct draw_vs_variant *variant_aos_sse( struct draw_vertex_shader *vs, const struct draw_vs_variant_key *key ) { unsigned i; struct draw_vs_variant_aos_sse *vaos = CALLOC_STRUCT(draw_vs_variant_aos_sse); if (!vaos) goto fail; vaos->base.key = *key; vaos->base.vs = vs; vaos->base.set_buffer = vaos_set_buffer; vaos->base.destroy = vaos_destroy; vaos->base.run_linear = vaos_run_linear; vaos->base.run_elts = vaos_run_elts; vaos->draw = vs->draw; for (i = 0; i < key->nr_inputs; i++) vaos->nr_vb = MAX2( vaos->nr_vb, key->element[i].in.buffer + 1 ); vaos->buffer = MALLOC( vaos->nr_vb * sizeof(vaos->buffer[0]) ); if (!vaos->buffer) goto fail; if (0) debug_printf("nr_vb: %d const: %x\n", vaos->nr_vb, vaos->base.key.const_vbuffers); #if 0 tgsi_dump(vs->state.tokens, 0); #endif if (!build_vertex_program( vaos, TRUE )) goto fail; if (!build_vertex_program( vaos, FALSE )) goto fail; vaos->gen_run_linear = (vaos_run_linear_func)x86_get_func(&vaos->func[0]); if (!vaos->gen_run_linear) goto fail; vaos->gen_run_elts = (vaos_run_elts_func)x86_get_func(&vaos->func[1]); if (!vaos->gen_run_elts) goto fail; return &vaos->base; fail: if (vaos && vaos->buffer) FREE(vaos->buffer); if (vaos) x86_release_func( &vaos->func[0] ); if (vaos) x86_release_func( &vaos->func[1] ); FREE(vaos); return NULL; } struct draw_vs_variant * draw_vs_create_variant_aos_sse( struct draw_vertex_shader *vs, const struct draw_vs_variant_key *key ) { struct draw_vs_variant *variant = variant_aos_sse( vs, key ); if (variant == NULL) { variant = draw_vs_create_variant_generic( vs, key ); } return variant; } #endif /* PIPE_ARCH_X86 */