Diffstat (limited to 'src/gallium/auxiliary/draw')
-rw-r--r--   src/gallium/auxiliary/draw/Makefile         |    2
-rw-r--r--   src/gallium/auxiliary/draw/draw_vs.h        |   10
-rw-r--r--   src/gallium/auxiliary/draw/draw_vs_aos.c    | 1739
-rw-r--r--   src/gallium/auxiliary/draw/draw_vs_aos.h    |  181
-rw-r--r--   src/gallium/auxiliary/draw/draw_vs_aos_io.c |  314
-rw-r--r--   src/gallium/auxiliary/draw/draw_vs_sse.c    |    1
6 files changed, 2247 insertions, 0 deletions
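
Throughout the new AOS code below, shuffle selectors for sse_shufps()/sse2_pshufd() are built with the SHUF(X,Y,Z,W) macro; the X/Y/Z/W channel indices are defined in draw_vs_aos.h, and SSE_SWIZZLE_NOOP in draw_vs_aos.c is the identity selector. As a minimal standalone sketch of that two-bits-per-channel encoding (the SHUF definition here is assumed to mirror the rtasm helper, not copied from this commit):

#include <stdio.h>

/* Channel indices as defined in draw_vs_aos.h */
#define X 0
#define Y 1
#define Z 2
#define W 3

/* Assumed encoding: two bits per destination channel, each pair
 * selecting which source channel that destination reads from. */
#define SHUF(a, b, c, d) (((a) << 0) | ((b) << 2) | ((c) << 4) | ((d) << 6))

int main(void)
{
   /* Identity swizzle -- the value SSE_SWIZZLE_NOOP expands to below */
   printf("SHUF(X,Y,Z,W) = 0x%02x\n", SHUF(X, Y, Z, W));  /* 0xe4 */

   /* Broadcast .x to all channels, as used after the DP3/DP4 sums */
   printf("SHUF(X,X,X,X) = 0x%02x\n", SHUF(X, X, X, X));  /* 0x00 */
   return 0;
}
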
diff --git a/src/gallium/auxiliary/draw/Makefile b/src/gallium/auxiliary/draw/Makefile index 84877994fb..9a88ecc070 100644 --- a/src/gallium/auxiliary/draw/Makefile +++ b/src/gallium/auxiliary/draw/Makefile @@ -35,6 +35,8 @@ C_SOURCES = \  	draw_vertex.c \  	draw_vs.c \  	draw_vs_varient.c \ +	draw_vs_aos.c \ +	draw_vs_aos_io.c \  	draw_vs_exec.c \  	draw_vs_llvm.c \  	draw_vs_sse.c  diff --git a/src/gallium/auxiliary/draw/draw_vs.h b/src/gallium/auxiliary/draw/draw_vs.h index 6bfc2c8d75..5a8d0da06d 100644 --- a/src/gallium/auxiliary/draw/draw_vs.h +++ b/src/gallium/auxiliary/draw/draw_vs.h @@ -162,6 +162,16 @@ struct draw_vertex_shader *  draw_create_vs_llvm(struct draw_context *draw,  		    const struct pipe_shader_state *templ); + + +struct draw_vs_varient_key; +struct draw_vertex_shader; + +struct draw_vs_varient *draw_vs_varient_aos_sse( struct draw_vertex_shader *vs, +                                                 const struct draw_vs_varient_key *key ); + + +  /********************************************************************************   * Helpers for vs implementations that don't do their own fetch/emit varients.   * Means these can be shared between shaders. diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c new file mode 100644 index 0000000000..620f5e3592 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_vs_aos.c @@ -0,0 +1,1739 @@ +/* + * Mesa 3-D graphics library + * Version:  6.3 + * + * Copyright (C) 1999-2004  Brian Paul   All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL + * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * Translate tgsi vertex programs to x86/x87/SSE/SSE2 machine code + * using the rtasm runtime assembler.  
Based on the old + * t_vb_arb_program_sse.c + */ + + +#include "pipe/p_util.h" +#include "pipe/p_shader_tokens.h" +#include "tgsi/util/tgsi_parse.h" +#include "tgsi/util/tgsi_util.h" +#include "tgsi/exec/tgsi_exec.h" +#include "tgsi/util/tgsi_dump.h" + +#include "draw_vs.h" +#include "draw_vs_aos.h" + +#include "rtasm/rtasm_x86sse.h" + +#ifdef PIPE_ARCH_X86 + + +#define DISASSEM 0 + + + + + +static INLINE boolean eq( struct x86_reg a, +			    struct x86_reg b ) +{ +   return (a.file == b.file && +	   a.idx == b.idx && +	   a.mod == b.mod && +	   a.disp == b.disp); +} +       + +static struct x86_reg get_reg_ptr(struct aos_compilation *cp, +                                  unsigned file, +				  unsigned idx ) +{ +   struct x86_reg ptr = cp->machine_EDX; + +   switch (file) { +   case TGSI_FILE_INPUT: +      return x86_make_disp(ptr, Offset(struct aos_machine, input[idx])); + +   case TGSI_FILE_OUTPUT: +      return x86_make_disp(ptr, Offset(struct aos_machine, output[idx])); + +   case TGSI_FILE_TEMPORARY: +      return x86_make_disp(ptr, Offset(struct aos_machine, temp[idx])); + +   case TGSI_FILE_IMMEDIATE: +      return x86_make_disp(ptr, Offset(struct aos_machine, immediate[idx])); + +   case TGSI_FILE_CONSTANT:        +      return x86_make_disp(ptr, Offset(struct aos_machine, constant[idx])); + +   case AOS_FILE_INTERNAL: +      return x86_make_disp(ptr, Offset(struct aos_machine, immediate[idx])); + +   default: +      ERROR(cp, "unknown reg file"); +      return x86_make_reg(0,0); +   } +} +		 + +struct x86_reg aos_get_internal( struct aos_compilation *cp, +                                 unsigned imm ) +{ +   return get_reg_ptr( cp, +                       AOS_FILE_INTERNAL,  +                       imm + 1 ); +} + +static void spill( struct aos_compilation *cp, unsigned idx ) +{ +   if (!cp->xmm[idx].dirty || +       (cp->xmm[idx].file != TGSI_FILE_INPUT && /* inputs are fetched into xmm & set dirty */ +        cp->xmm[idx].file != TGSI_FILE_OUTPUT && +        cp->xmm[idx].file != TGSI_FILE_TEMPORARY)) { +      ERROR(cp, "invalid spill"); +      return; +   } +   else { +      struct x86_reg oldval = get_reg_ptr(cp, +                                          cp->xmm[idx].file, +                                          cp->xmm[idx].idx); +       +      assert(cp->xmm[idx].dirty); +      sse_movups(cp->func, oldval, x86_make_reg(file_XMM, idx)); +      cp->xmm[idx].dirty = 0; +   } +} + +struct x86_reg aos_get_xmm_reg( struct aos_compilation *cp ) +{ +   unsigned i; +   unsigned oldest = 0; + +   for (i = 0; i < 8; i++)  +      if (cp->xmm[i].last_used < cp->xmm[oldest].last_used) +	 oldest = i; + +   /* Need to write out the old value? +    */ +   if (cp->xmm[oldest].dirty)  +      spill(cp, oldest); + +   assert(cp->xmm[oldest].last_used != cp->insn_counter); + +   cp->xmm[oldest].file = TGSI_FILE_NULL; +   cp->xmm[oldest].idx = 0; +   cp->xmm[oldest].last_used = cp->insn_counter; +   return x86_make_reg(file_XMM, oldest); +} + +void aos_release_xmm_reg( struct aos_compilation *cp, +                          unsigned idx ) +{ +   cp->xmm[idx].file = TGSI_FILE_NULL; +   cp->xmm[idx].idx = 0; +   cp->xmm[idx].dirty = 0; +   cp->xmm[idx].last_used = 0; +} + +static void invalidate_xmm( struct aos_compilation *cp,  +			    unsigned file, unsigned idx ) +{ +   unsigned i; + +   /* Invalidate any old copy of this register in XMM0-7.   
+    */ +   for (i = 0; i < 8; i++) { +      if (cp->xmm[i].file == file && cp->xmm[i].idx == idx) { + +         if (cp->xmm[i].dirty)  +            spill(cp, i); +          +         aos_release_xmm_reg(cp, i); +         break; +      } +   } + +   for (; i < 8; i++) { +      if (cp->xmm[i].file == file && cp->xmm[i].idx == idx) { +         assert(0); +      } +   } +} +       + +void aos_adopt_xmm_reg( struct aos_compilation *cp, +                        struct x86_reg reg, +                        unsigned file, +                        unsigned idx, +                        unsigned dirty ) +{ +   if (reg.file != file_XMM) { +      assert(0); +      return; +   } + +   invalidate_xmm(cp, file, idx); +   cp->xmm[reg.idx].file = file; +   cp->xmm[reg.idx].idx = idx; +   cp->xmm[reg.idx].dirty = dirty; +} + + + +static struct x86_reg aos_get_shader_reg_ptr( struct aos_compilation *cp,  +                                              unsigned file, +                                              unsigned idx ) +{ +   invalidate_xmm( cp, file, idx ); +   return get_reg_ptr( cp, file, idx ); +} + + +/* As above, but return a pointer.  Note - this pointer may alias + * those returned by get_arg_ptr(). + */ +static struct x86_reg get_dst_ptr( struct aos_compilation *cp,  +                                   const struct tgsi_full_dst_register *dst ) +{ +   return aos_get_shader_reg_ptr( cp, dst->DstRegister.File, dst->DstRegister.Index ); +} + + + + + +/* Return an XMM reg if the argument is resident, otherwise return a + * base+offset pointer to the saved value. + */ +struct x86_reg aos_get_shader_reg( struct aos_compilation *cp,  +                                   unsigned file, +                                   unsigned idx ) +{ +   unsigned i; + +   for (i = 0; i < 8; i++) { +      if (cp->xmm[i].file == file && +	  cp->xmm[i].idx  == idx)  +      { +	 cp->xmm[i].last_used = cp->insn_counter; +	 return x86_make_reg(file_XMM, i); +      } +   } + +   /* If not found in the XMM register file, return an indirect +    * reference to the in-memory copy: +    */ +   return get_reg_ptr( cp, file, idx ); +} + + + + + +/* Emulate pshufd insn in regular SSE, if necessary: + */ +static void emit_pshufd( struct aos_compilation *cp, +			 struct x86_reg dst, +			 struct x86_reg arg0, +			 ubyte shuf ) +{ +   if (cp->have_sse2) { +      sse2_pshufd(cp->func, dst, arg0, shuf); +   } +   else { +      if (!eq(dst, arg0))  +	 sse_movups(cp->func, dst, arg0); + +      sse_shufps(cp->func, dst, dst, shuf); +   } +} + + + + +/* Helper for writemask: + */ +static boolean emit_shuf_copy1( struct aos_compilation *cp, +				  struct x86_reg dst, +				  struct x86_reg arg0, +				  struct x86_reg arg1, +				  ubyte shuf ) +{ +   struct x86_reg tmp = aos_get_xmm_reg(cp); +   sse_movups(cp->func, dst, arg1); +   emit_pshufd(cp, dst, dst, shuf); +   emit_pshufd(cp, tmp, arg0, shuf); + +   sse_movss(cp->func, dst, tmp); + +   emit_pshufd(cp, dst, dst, shuf); + +   aos_release_xmm_reg(cp, tmp.idx); +   return TRUE; +} + + +/* Helper for writemask: + */ +static boolean emit_shuf_copy2( struct aos_compilation *cp, +				  struct x86_reg dst, +				  struct x86_reg arg0, +				  struct x86_reg arg1, +				  ubyte shuf ) +{ +   struct x86_reg tmp = aos_get_xmm_reg(cp); +   emit_pshufd(cp, dst, arg1, shuf); +   emit_pshufd(cp, tmp, arg0, shuf); + +   sse_shufps(cp->func, dst, tmp, SHUF(X, Y, Z, W)); + +   emit_pshufd(cp, dst, dst, shuf); + +   aos_release_xmm_reg(cp, tmp.idx); +   return TRUE; +} + +#define SSE_SWIZZLE_NOOP 
((0<<0) | (1<<2) | (2<<4) | (3<<6)) + + +/* Locate a source register and perform any required (simple) swizzle.   + *  + * Just fail on complex swizzles at this point. + */ +static struct x86_reg fetch_src( struct aos_compilation *cp,  +                                 const struct tgsi_full_src_register *src )  +{ +   struct x86_reg arg0 = aos_get_shader_reg(cp,  +                                            src->SrcRegister.File,  +                                            src->SrcRegister.Index); +   unsigned i; +   unsigned swz = 0; +   unsigned negs = 0; +   unsigned abs = 0; + +   for (i = 0; i < 4; i++) { +      unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( src, i ); +      unsigned neg = tgsi_util_get_full_src_register_sign_mode( src, i ); + +      switch (swizzle) { +      case TGSI_EXTSWIZZLE_ZERO: +      case TGSI_EXTSWIZZLE_ONE: +         ERROR(cp, "not supporting full swizzles yet in tgsi_aos_sse2"); +         break; + +      default: +         swz |= (swizzle & 0x3) << (i * 2); +         break; +      } + +      switch (neg) { +      case TGSI_UTIL_SIGN_TOGGLE: +         negs |= (1<<i); +         break; +          +      case TGSI_UTIL_SIGN_KEEP: +         break; + +      case TGSI_UTIL_SIGN_CLEAR: +         abs |= (1<<i); +         break; + +      default: +         ERROR(cp, "unsupported sign-mode"); +         break; +      } +   } + +   if (swz != SSE_SWIZZLE_NOOP || negs != 0 || abs != 0) { +      struct x86_reg dst = aos_get_xmm_reg(cp); + +      if (swz != SSE_SWIZZLE_NOOP) { +         emit_pshufd(cp, dst, arg0, swz); +         arg0 = dst; +      } + +      if (negs) { +         struct x86_reg imm_negs = aos_get_internal(cp, IMM_NEGS); +         struct x86_reg tmp = aos_get_xmm_reg(cp); + +         /* Load 1,-1,0,0 +          * Use neg as arg to pshufd +          * Multiply +          */ +         emit_pshufd(cp, tmp, imm_negs,  +                     SHUF((negs & 1) ? 1 : 0, +                          (negs & 2) ? 1 : 0, +                          (negs & 4) ? 1 : 0, +                          (negs & 8) ? 
1 : 0)); +         sse_mulps(cp->func, dst, arg0); + +         aos_release_xmm_reg(cp, tmp.idx); +         arg0 = dst; +      } + +      if (abs && abs != 0xf) { +         ERROR(cp, "unsupported partial abs"); +      } + +      if (abs) { +         struct x86_reg neg = aos_get_internal(cp, IMM_NEGS); +         struct x86_reg tmp = aos_get_xmm_reg(cp); + +         sse_movups(cp->func, tmp, arg0); +         sse_mulps(cp->func, tmp, neg); +         sse_maxps(cp->func, dst, arg0); + +         aos_release_xmm_reg(cp, tmp.idx); +         arg0 = dst; +      } +   } +       +   return arg0; +} + +static void x87_fld_src( struct aos_compilation *cp,  +                         const struct tgsi_full_src_register *src, +                         unsigned channel )  +{ +   struct x86_reg arg0 = aos_get_shader_reg_ptr(cp,  +                                                src->SrcRegister.File,  +                                                src->SrcRegister.Index); + +   unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( src, channel ); +   unsigned neg = tgsi_util_get_full_src_register_sign_mode( src, channel ); + +   switch (swizzle) { +   case TGSI_EXTSWIZZLE_ZERO: +      x87_fldz( cp->func ); +      break; + +   case TGSI_EXTSWIZZLE_ONE: +      x87_fld1( cp->func ); +      break; + +   default: +      x87_fld( cp->func, x86_make_disp(arg0, (swizzle & 3) * sizeof(float)) ); +      break; +   } +    + +   switch (neg) { +   case TGSI_UTIL_SIGN_TOGGLE: +      /* Flip the sign: +       */ +      x87_fchs( cp->func ); +      break; +          +   case TGSI_UTIL_SIGN_KEEP: +      break; + +   case TGSI_UTIL_SIGN_CLEAR: +      x87_fabs( cp->func ); +      break; + +   case TGSI_UTIL_SIGN_SET: +      x87_fabs( cp->func ); +      x87_fchs( cp->func ); +      break; + +   default: +      ERROR(cp, "unsupported sign-mode"); +      break; +   } +} + + + + + + +/* Used to implement write masking.  This and most of the other instructions + * here would be easier to implement if there had been a translation + * to a 2 argument format (dst/arg0, arg1) at the shader level before + * attempting to translate to x86/sse code. 
+ */ +static void store_dest( struct aos_compilation *cp,  +                        const struct tgsi_full_dst_register *reg, +                        struct x86_reg result ) +{ +   if (reg->DstRegister.WriteMask == 0)  +   { +      return; +   } +   else if (reg->DstRegister.WriteMask == TGSI_WRITEMASK_XYZW) +   { +      if (result.file == file_XMM) { +         aos_adopt_xmm_reg(cp,  +                           result,  +                           reg->DstRegister.File, +                           reg->DstRegister.Index, +                           TRUE); +      } +      else { +         struct x86_reg dst = aos_get_xmm_reg(cp); +         aos_adopt_xmm_reg(cp,  +                           dst,  +                           reg->DstRegister.File, +                           reg->DstRegister.Index, +                           TRUE); +         sse_movups(cp->func, dst, result); +      } +   } +   else +   { +      /* Previous value of the dest register: +       */ +      struct x86_reg old_dst = aos_get_shader_reg(cp,  +                                                  reg->DstRegister.File, +                                                  reg->DstRegister.Index); + + +      /* Alloc an xmm reg to hold the new value of the dest register: +       */ +      struct x86_reg dst = aos_get_xmm_reg(cp); + +      aos_adopt_xmm_reg(cp,  +                        dst,  +                        reg->DstRegister.File, +                        reg->DstRegister.Index, +                        TRUE ); + +      switch (reg->DstRegister.WriteMask) { +      case TGSI_WRITEMASK_X: +         if (result.file == file_XMM) { +            sse_movups(cp->func, dst, old_dst); +            sse_movss(cp->func, dst, result); +         } +         else { +            struct x86_reg tmp = aos_get_xmm_reg(cp); +            sse_movups(cp->func, dst, old_dst); +            sse_movss(cp->func, tmp, result); +            sse_movss(cp->func, dst, tmp); +            aos_release_xmm_reg(cp, tmp.idx); +         } +         break; + +      case TGSI_WRITEMASK_XY: +         sse_movups(cp->func, dst, old_dst); +         sse_shufps(cp->func, dst, result, SHUF(X, Y, Z, W)); +         break; + +      case TGSI_WRITEMASK_ZW:  +         sse_movups(cp->func, dst, result); +         sse_shufps(cp->func, dst, old_dst, SHUF(X, Y, Z, W)); +         break; + +      case TGSI_WRITEMASK_YZW:  +         if (old_dst.file == file_XMM) { +            sse_movups(cp->func, dst, result); +            sse_movss(cp->func, dst, old_dst); +         } +         else { +            struct x86_reg tmp = aos_get_xmm_reg(cp);       +            sse_movups(cp->func, dst, result); +            sse_movss(cp->func, tmp, old_dst); +            sse_movss(cp->func, dst, tmp); +            aos_release_xmm_reg(cp, tmp.idx); +         } +         break; + +      case TGSI_WRITEMASK_Y: +         emit_shuf_copy1(cp, dst, result, old_dst, SHUF(Y,X,Z,W)); +         break; + +      case TGSI_WRITEMASK_Z:  +         emit_shuf_copy1(cp, dst, result, old_dst, SHUF(Z,Y,X,W)); +         break; + +      case TGSI_WRITEMASK_W:  +         emit_shuf_copy1(cp, dst, result, old_dst, SHUF(W,Y,Z,X)); +         break; + +      case TGSI_WRITEMASK_XZ: +         emit_shuf_copy2(cp, dst, result, old_dst, SHUF(X,Z,Y,W)); +         break; + +      case TGSI_WRITEMASK_XW:  +         emit_shuf_copy2(cp, dst, result, old_dst, SHUF(X,W,Z,Y)); + +      case TGSI_WRITEMASK_YZ:       +         emit_shuf_copy2(cp, dst, result, old_dst, SHUF(Z,Y,X,W)); +         break; + +      case TGSI_WRITEMASK_YW: +  
       emit_shuf_copy2(cp, dst, result, old_dst, SHUF(W,Y,Z,X)); +         break; + +      case TGSI_WRITEMASK_XZW: +         emit_shuf_copy1(cp, dst, old_dst, result, SHUF(Y,X,Z,W)); +         break; + +      case TGSI_WRITEMASK_XYW:  +         emit_shuf_copy1(cp, dst, old_dst, result, SHUF(Z,Y,X,W)); +         break; + +      case TGSI_WRITEMASK_XYZ:  +         emit_shuf_copy1(cp, dst, old_dst, result, SHUF(W,Y,Z,X)); +         break; + +      default: +         assert(0);             /* not possible */ +         break; +      } +   } +} + + +static void x87_fst_or_nop( struct x86_function *func, +                            unsigned writemask, +                            unsigned channel, +                            struct x86_reg ptr ) +{ +   if (writemask & (1<<channel))  +      x87_fst( func, x86_make_disp(ptr, channel * sizeof(float)) ); +} + +static void x87_fstp_or_pop( struct x86_function *func, +                             unsigned writemask, +                             unsigned channel, +                             struct x86_reg ptr ) +{ +   if (writemask & (1<<channel))  +      x87_fstp( func, x86_make_disp(ptr, channel * sizeof(float)) ); +   else +      x87_fstp( func, x86_make_reg( file_x87, 0 )); +} + + + +/*  + */ +static void x87_fstp_dest4( struct aos_compilation *cp, +                            const struct tgsi_full_dst_register *dst ) +{ +   struct x86_reg ptr = get_dst_ptr(cp, dst);  +   unsigned writemask = dst->DstRegister.WriteMask; + +   x87_fst_or_nop(cp->func, writemask, 0, ptr); +   x87_fst_or_nop(cp->func, writemask, 1, ptr); +   x87_fst_or_nop(cp->func, writemask, 2, ptr); +   x87_fstp_or_pop(cp->func, writemask, 3, ptr); +} + +/* Save current x87 state and put it into single precision mode. + */ +static void save_fpu_state( struct aos_compilation *cp ) +{ +#if 0 +   x87_fnstcw( cp->func, x86_make_disp(regEDX, get_offset(m, &m->fpucntl_restore))); +   x87_fldcw( cp->func, ); +#endif +} + +static void restore_fpu_state( struct aos_compilation *cp ) +{ +#if 0 +   x87_fnclex(cp->func); +   x87_fldcw(cp->func, x86_make_disp(regEDX, get_offset(m, &m->fpucntl_restore))); +#endif +} + +static void set_fpu_round_neg_inf( struct aos_compilation *cp ) +{ +#if 0 +   if (cp->fpucntl != RND_NEG_FPU) { +      struct x86_reg regEDX = x86_make_reg(file_REG32, reg_DX); +      struct arb_vp_machine *m = NULL; + +      cp->fpucntl = RND_NEG_FPU; +      x87_fnclex(cp->func); +      x87_fldcw(cp->func, x86_make_disp(regEDX, get_offset(m, &m->fpucntl_rnd_neg))); +   } +#endif +} + +static void set_fpu_round_nearest( struct aos_compilation *cp ) +{ +#if 0 +#endif +} + + +static void emit_x87_ex2( struct aos_compilation *cp ) +{ +   struct x86_reg st0 = x86_make_reg(file_x87, 0); +   struct x86_reg st1 = x86_make_reg(file_x87, 1); +   struct x86_reg st3 = x86_make_reg(file_x87, 3); + +   set_fpu_round_neg_inf( cp ); + +   x87_fld(cp->func, st0); /* a a */ +   x87_fprndint( cp->func );	/* int(a) a */ +   x87_fld(cp->func, st0); /* int(a) int(a) a */ +   x87_fstp(cp->func, st3); /* int(a) a int(a)*/ +   x87_fsubp(cp->func, st1); /* frac(a) int(a) */ +   x87_f2xm1(cp->func);    /* (2^frac(a))-1 int(a)*/ +   x87_fld1(cp->func);    /* 1 (2^frac(a))-1 int(a)*/ +   x87_faddp(cp->func, st1);	/* 2^frac(a) int(a) */ +   x87_fscale(cp->func);	/* 2^a */ +} + + + +/** + * The traditional instructions.  All operate on internal registers + * and ignore write masks and swizzling issues. 
+ */ + +static boolean emit_ABS( struct aos_compilation *cp, const struct tgsi_full_instruction *op )  +{ +   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); +   struct x86_reg dst = aos_get_xmm_reg(cp); +   struct x86_reg neg = aos_get_internal(cp, IMM_NEGS); + +   sse_movups(cp->func, dst, arg0); +   sse_mulps(cp->func, dst, neg); +   sse_maxps(cp->func, dst, arg0); +    +   store_dest(cp, &op->FullDstRegisters[0], dst); +   return TRUE; +} + +static boolean emit_ADD( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ +   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); +   struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); +   struct x86_reg dst = aos_get_xmm_reg(cp); + +   sse_movups(cp->func, dst, arg0); +   sse_addps(cp->func, dst, arg1); + +   store_dest(cp, &op->FullDstRegisters[0], dst); +   return TRUE; +} + +static boolean emit_COS( struct aos_compilation *cp, const struct tgsi_full_instruction *op )  +{ +   x87_fld_src(cp, &op->FullSrcRegisters[0], 0); +   x87_fcos(cp->func); +   x87_fstp_dest4(cp, &op->FullDstRegisters[0]); +   return TRUE; +} + + +/* The dotproduct instructions don't really do that well in sse: + */ +static boolean emit_DP3( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ +   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); +   struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); +   struct x86_reg dst = aos_get_xmm_reg(cp); +   struct x86_reg tmp = aos_get_xmm_reg(cp);  + +   sse_movups(cp->func, dst, arg0); +   sse_mulps(cp->func, dst, arg1); +    +   /* Now the hard bit: sum the first 3 values: +    */  +   sse_movhlps(cp->func, tmp, dst); +   sse_addss(cp->func, dst, tmp); /* a*x+c*z, b*y, ?, ? */ +   emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z)); +   sse_addss(cp->func, dst, tmp); +   sse_shufps(cp->func, dst, dst, SHUF(X, X, X, X)); + +   aos_release_xmm_reg(cp, tmp.idx); +   store_dest(cp, &op->FullDstRegisters[0], dst); +   return TRUE; +} + + + +static boolean emit_DP4( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ +   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); +   struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); +   struct x86_reg dst = aos_get_xmm_reg(cp); +   struct x86_reg tmp = aos_get_xmm_reg(cp);       + +   sse_movups(cp->func, dst, arg0); +   sse_mulps(cp->func, dst, arg1); +    +   /* Now the hard bit: sum the values: +    */  +   sse_movhlps(cp->func, tmp, dst); +   sse_addps(cp->func, dst, tmp); /* a*x+c*z, b*y+d*w, a*x+c*z, b*y+d*w */ +   emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z)); +   sse_addss(cp->func, dst, tmp); +   sse_shufps(cp->func, dst, dst, SHUF(X, X, X, X)); + +   aos_release_xmm_reg(cp, tmp.idx); +   store_dest(cp, &op->FullDstRegisters[0], dst); +   return TRUE; +} + +static boolean emit_DPH( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ +   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); +   struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); +   struct x86_reg dst = aos_get_xmm_reg(cp); +   struct x86_reg tmp = aos_get_xmm_reg(cp); + +   sse_movups(cp->func, dst, arg0); +   sse_mulps(cp->func, dst, arg1); + +   /* Now the hard bit: sum the values (from DP3): +    */  +   sse_movhlps(cp->func, tmp, dst); +   sse_addss(cp->func, dst, tmp); /* a*x+c*z, b*y, ?, ? 
*/ +   emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z)); +   sse_addss(cp->func, dst, tmp); +   emit_pshufd(cp, tmp, arg1, SHUF(W,W,W,W)); +   sse_addss(cp->func, dst, tmp); +   sse_shufps(cp->func, dst, dst, SHUF(X, X, X, X)); + +   aos_release_xmm_reg(cp, tmp.idx); +   store_dest(cp, &op->FullDstRegisters[0], dst); +   return TRUE; +} + +static boolean emit_DST( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ +    struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); +    struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); +    struct x86_reg dst = aos_get_xmm_reg(cp); +    struct x86_reg tmp = aos_get_xmm_reg(cp); +    struct x86_reg ones = aos_get_internal(cp, IMM_ONES); + +/*    dst[0] = 1.0     * 1.0F; */ +/*    dst[1] = arg0[1] * arg1[1]; */ +/*    dst[2] = arg0[2] * 1.0; */ +/*    dst[3] = 1.0     * arg1[3]; */ + +    emit_shuf_copy2(cp, dst, arg0, ones, SHUF(X,W,Z,Y)); +    emit_shuf_copy2(cp, tmp, arg1, ones, SHUF(X,Z,Y,W)); +    sse_mulps(cp->func, dst, tmp); + +    aos_release_xmm_reg(cp, tmp.idx); +    store_dest(cp, &op->FullDstRegisters[0], dst); +    return TRUE; +} + +static boolean emit_LG2( struct aos_compilation *cp, const struct tgsi_full_instruction *op )  +{ +   x87_fld1(cp->func);		/* 1 */ +   x87_fld_src(cp, &op->FullSrcRegisters[0], 0);	/* a0 1 */ +   x87_fyl2x(cp->func);	/* log2(a0) */ +   x87_fstp_dest4(cp, &op->FullDstRegisters[0]); +   return TRUE; +} + + +static boolean emit_EX2( struct aos_compilation *cp, const struct tgsi_full_instruction *op )  +{ +   x87_fld_src(cp, &op->FullSrcRegisters[0], 0); + +   emit_x87_ex2(cp); + +   x87_fstp_dest4(cp, &op->FullDstRegisters[0]); +   return TRUE; +} + +static boolean emit_EXP( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ +    struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]);  +    struct x86_reg st0 = x86_make_reg(file_x87, 0); +    struct x86_reg st1 = x86_make_reg(file_x87, 1); +    struct x86_reg st3 = x86_make_reg(file_x87, 3); +    unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask; + +    /* CAUTION: dst may alias arg0! +     */ +    x87_fld_src(cp, &op->FullSrcRegisters[0], 0);	/* arg0.x */ +    x87_fld(cp->func, st0); /* arg arg */ + +    /* by default, fpu is setup to round-to-nearest.  We want to +     * change this now, and track the state through to the end of the +     * generated function so that it isn't repeated unnecessarily. +     * Alternately, could subtract .5 to get round to -inf behaviour. 
+     */ +    set_fpu_round_neg_inf( cp ); +    x87_fprndint( cp->func );	/* flr(a) a */ +    x87_fld(cp->func, st0); /* flr(a) flr(a) a */ +    x87_fld1(cp->func);    /* 1 floor(a) floor(a) a */ +    x87_fst_or_nop(cp->func, writemask, 3, dst);  /* stack unchanged */ + +    x87_fscale(cp->func);  /* 2^floor(a) floor(a) a */ +    x87_fst(cp->func, st3); /* 2^floor(a) floor(a) a 2^floor(a)*/ + +    x87_fstp_or_pop(cp->func, writemask, 0, dst); /* flr(a) a 2^flr(a) */ + +    x87_fsubrp(cp->func, st1); /* frac(a) 2^flr(a) */ + +    x87_fst_or_nop(cp->func, writemask, 1, dst);    /* frac(a) 2^flr(a) */ + +    x87_f2xm1(cp->func);    /* (2^frac(a))-1 2^flr(a)*/ +    x87_fld1(cp->func);    /* 1 (2^frac(a))-1 2^flr(a)*/ +    x87_faddp(cp->func, st1);	/* 2^frac(a) 2^flr(a) */ +    x87_fmulp(cp->func, st1);	/* 2^a */ +     +    x87_fstp_or_pop(cp->func, writemask, 2, dst);     + +/*    dst[0] = 2^floor(tmp); */ +/*    dst[1] = frac(tmp); */ +/*    dst[2] = 2^floor(tmp) * 2^frac(tmp); */ +/*    dst[3] = 1.0F; */ +    return TRUE; +} + +static boolean emit_LOG( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ +    struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]);  +    struct x86_reg st0 = x86_make_reg(file_x87, 0); +    struct x86_reg st1 = x86_make_reg(file_x87, 1); +    struct x86_reg st2 = x86_make_reg(file_x87, 2); +    unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask; +  +    /* CAUTION: dst may alias arg0! +     */ +    x87_fld_src(cp, &op->FullSrcRegisters[0], 0);	/* arg0.x */ +    x87_fabs(cp->func);	/* |arg0.x| */ +    x87_fxtract(cp->func);	/* mantissa(arg0.x), exponent(arg0.x) */ +    x87_fst(cp->func, st2);	/* mantissa, exponent, mantissa */ +    x87_fld1(cp->func);	/* 1, mantissa, exponent, mantissa */ +    x87_fyl2x(cp->func); 	/* log2(mantissa), exponent, mantissa */ +    x87_fadd(cp->func, st0, st1);	/* e+l2(m), e, m  */ +     +    x87_fstp_or_pop(cp->func, writemask, 2, dst); /* e, m */ + +    x87_fld1(cp->func);	/* 1, e, m */ +    x87_fsub(cp->func, st1, st0);	/* 1, e-1, m */ + +    x87_fstp_or_pop(cp->func, writemask, 3, dst); /* e-1,m */ +    x87_fstp_or_pop(cp->func, writemask, 0, dst);	/* m */ + +    x87_fadd(cp->func, st0, st0);	/* 2m */ + +    x87_fstp_or_pop( cp->func, writemask, 1, dst ); + +    return TRUE; +} + +static boolean emit_FLR( struct aos_compilation *cp, const struct tgsi_full_instruction *op )  +{ +   struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]);  +   unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask; +   int i; + +   set_fpu_round_neg_inf( cp ); + +   /* Load all sources first to avoid aliasing +    */ +   for (i = 0; i < 4; i++) { +      if (writemask & (1<<i)) { +         x87_fld_src(cp, &op->FullSrcRegisters[0], i); +      } +   } + +   for (i = 0; i < 4; i++) { +      if (writemask & (1<<i)) { +         x87_fprndint( cp->func );    +         x87_fstp(cp->func, x86_make_disp(dst, i*4)); +      } +   } + +   return TRUE; +} + + +static boolean emit_RND( struct aos_compilation *cp, const struct tgsi_full_instruction *op )  +{ +   struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]);  +   unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask; +   int i; + +   set_fpu_round_nearest( cp ); + +   /* Load all sources first to avoid aliasing +    */ +   for (i = 0; i < 4; i++) { +      if (writemask & (1<<i)) { +         x87_fld_src(cp, &op->FullSrcRegisters[0], i); +      } +   } + +   for (i = 0; i < 4; i++) { +      if (writemask & (1<<i)) { +         
x87_fprndint( cp->func );    +         x87_fstp(cp->func, x86_make_disp(dst, i*4)); +      } +   } + +   return TRUE; +} + + +static boolean emit_FRC( struct aos_compilation *cp, const struct tgsi_full_instruction *op )  +{ +   struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]);  +   struct x86_reg st0 = x86_make_reg(file_x87, 0); +   struct x86_reg st1 = x86_make_reg(file_x87, 1); +   unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask; +   int i; + +   set_fpu_round_neg_inf( cp ); + +   /* suck all the source values onto the stack before writing out any +    * dst, which may alias... +    */ +   for (i = 0; i < 4; i++) { +      if (writemask & (1<<i)) { +         x87_fld_src(cp, &op->FullSrcRegisters[0], i);    +      } +   } + +   for (i = 0; i < 4; i++) { +      if (writemask & (1<<i)) { +         x87_fld(cp->func, st0);     /* a a */ +         x87_fprndint( cp->func );   /* flr(a) a */ +         x87_fsubrp(cp->func, st1);  /* frc(a) */ +         x87_fstp(cp->func, x86_make_disp(dst, i*4)); +      } +   } + +   return TRUE; +} + + + +static boolean emit_LIT( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ +   struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]);  +   struct x86_reg st1 = x86_make_reg(file_x87, 1); +   unsigned fixup1, fixup2; +   unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask; + + +   /* Load the interesting parts of arg0: +    */ +   x87_fld_src(cp, &op->FullSrcRegisters[0], 3); +   x87_fld_src(cp, &op->FullSrcRegisters[0], 1); +   x87_fld_src(cp, &op->FullSrcRegisters[0], 0); +    + +   if (writemask & TGSI_WRITEMASK_XW) { +      x87_fld1(cp->func); +      x87_fst_or_nop(cp->func, writemask, 0, dst); +      x87_fstp_or_pop(cp->func, writemask, 3, dst); +   } + +   if (writemask & TGSI_WRITEMASK_YZ) { +       +      /* Pre-zero destinations, may be overwritten later...  fixme.  
+       */ +      x87_fldz(cp->func); +      x87_fst_or_nop(cp->func, writemask, 1, dst); +      x87_fstp_or_pop(cp->func, writemask, 2, dst); + + +      /* Check arg0[0]: +       */ +      x87_fldz(cp->func);		/* 0 a0 a1 a3 */ +      x87_fucomp(cp->func, st1);	/* a0 a1 a3 */ +      x87_fnstsw(cp->func, cp->tmp_EAX); +      x86_sahf(cp->func); +      fixup1 = x86_jcc_forward(cp->func, cc_AE);  +    +      x87_fstp_or_pop(cp->func, writemask, 1, dst);	/* a1 a3 */ + +      /* Check arg0[1]: +       */  +      x87_fldz(cp->func);		/* 0 a1 a3 */ +      x87_fucomp(cp->func, st1);	/* a1 a3 */ +      x87_fnstsw(cp->func, cp->tmp_EAX); +      x86_sahf(cp->func); +      fixup2 = x86_jcc_forward(cp->func, cc_AE);  + +      /* Compute pow(a1, a3) +       */ +      x87_fyl2x(cp->func);	/* a3*log2(a1) */ + +      emit_x87_ex2( cp );		/* 2^(a3*log2(a1)) */ + +      x87_fstp_or_pop(cp->func, writemask, 2, dst); +    +      /* Land jumps: +       */ +      x86_fixup_fwd_jump(cp->func, fixup1); +      x86_fixup_fwd_jump(cp->func, fixup2); +   } + +   return TRUE; +} + + + +static boolean emit_MAX( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ +   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); +   struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); +   struct x86_reg dst = aos_get_xmm_reg(cp); + +   sse_movups(cp->func, dst, arg0); +   sse_maxps(cp->func, dst, arg1); + +   store_dest(cp, &op->FullDstRegisters[0], dst); +   return TRUE; +} + + +static boolean emit_MIN( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ +   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); +   struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); +   struct x86_reg dst = aos_get_xmm_reg(cp); + +   sse_movups(cp->func, dst, arg0); +   sse_minps(cp->func, dst, arg1); + +   store_dest(cp, &op->FullDstRegisters[0], dst); +   return TRUE; +} + +static boolean emit_MOV( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ +   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); +   struct x86_reg dst = aos_get_xmm_reg(cp); + +   sse_movups(cp->func, dst, arg0); + +   store_dest(cp, &op->FullDstRegisters[0], dst); +   return TRUE; +} + +static boolean emit_MUL( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ +   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); +   struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); +   struct x86_reg dst = aos_get_xmm_reg(cp); + +   sse_movups(cp->func, dst, arg0); +   sse_mulps(cp->func, dst, arg1); + +   store_dest(cp, &op->FullDstRegisters[0], dst); +   return TRUE; +} + + +static boolean emit_MAD( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ +   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); +   struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); +   struct x86_reg arg2 = fetch_src(cp, &op->FullSrcRegisters[2]); +   struct x86_reg dst = aos_get_xmm_reg(cp); + +   sse_movups(cp->func, dst, arg0); +   sse_mulps(cp->func, dst, arg1); +   sse_addps(cp->func, dst, arg2); + +   store_dest(cp, &op->FullDstRegisters[0], dst); +   return TRUE; +} + + +static boolean emit_POW( struct aos_compilation *cp, const struct tgsi_full_instruction *op )  +{ +   x87_fld_src(cp, &op->FullSrcRegisters[1], 0);  /* a1.x */ +   x87_fld_src(cp, &op->FullSrcRegisters[0], 0);	/* a0.x a1.x */ +   x87_fyl2x(cp->func);	                                /* a1*log2(a0) */ + +   emit_x87_ex2( cp );		/* 
2^(a1*log2(a0)) */ + +   x87_fstp_dest4(cp, &op->FullDstRegisters[0]); +   return TRUE; +} + + +static boolean emit_RCP( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ +   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); +   struct x86_reg dst = aos_get_xmm_reg(cp); + +   if (cp->have_sse2) { +      sse2_rcpss(cp->func, dst, arg0); +      /* extend precision here... +       */ +   } +   else { +      struct x86_reg ones = aos_get_internal(cp, IMM_ONES); +      sse_movss(cp->func, dst, ones); +      sse_divss(cp->func, dst, arg0); +   } + +   sse_shufps(cp->func, dst, dst, SHUF(X, X, X, X)); + +   store_dest(cp, &op->FullDstRegisters[0], dst); +   return TRUE; +} + +static boolean emit_RSQ( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ +   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); +   struct x86_reg dst = aos_get_xmm_reg(cp); + +   sse_rsqrtss(cp->func, dst, arg0); + +   /* Extend precision here... +    */ + +   sse_shufps(cp->func, dst, dst, SHUF(X, X, X, X)); + +   store_dest(cp, &op->FullDstRegisters[0], dst); +   return TRUE; +} + + +static boolean emit_SGE( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ +   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); +   struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); +   struct x86_reg dst = aos_get_xmm_reg(cp); +   struct x86_reg ones = aos_get_internal(cp, IMM_ONES); + +   sse_movups(cp->func, dst, arg0); +   sse_cmpps(cp->func, dst, arg1, cc_NotLessThan); +   sse_andps(cp->func, dst, ones); + +   store_dest(cp, &op->FullDstRegisters[0], dst); +   return TRUE; +} + +static boolean emit_SIN( struct aos_compilation *cp, const struct tgsi_full_instruction *op )  +{ +   x87_fld_src(cp, &op->FullSrcRegisters[0], 0); +   x87_fsin(cp->func); +   x87_fstp_dest4(cp, &op->FullDstRegisters[0]); +   return TRUE; +} + + + +static boolean emit_SLT( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ +   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); +   struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); +   struct x86_reg dst = aos_get_xmm_reg(cp); +   struct x86_reg ones = aos_get_internal(cp, IMM_ONES); +    +   sse_movups(cp->func, dst, arg0); +   sse_cmpps(cp->func, dst, arg1, cc_LessThan); +   sse_andps(cp->func, dst, ones); + +   store_dest(cp, &op->FullDstRegisters[0], dst); +   return TRUE; +} + +static boolean emit_SUB( struct aos_compilation *cp, const struct tgsi_full_instruction *op )  +{ +   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); +   struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); +   struct x86_reg dst = aos_get_xmm_reg(cp); + +   sse_movups(cp->func, dst, arg0); +   sse_subps(cp->func, dst, arg1); + +   store_dest(cp, &op->FullDstRegisters[0], dst); +   return TRUE; +} + + +static boolean emit_XPD( struct aos_compilation *cp, const struct tgsi_full_instruction *op )  +{ +   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); +   struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); +   struct x86_reg dst = aos_get_xmm_reg(cp); +   struct x86_reg tmp0 = aos_get_xmm_reg(cp); +   struct x86_reg tmp1 = aos_get_xmm_reg(cp); + +   /* Could avoid tmp0, tmp1 if we overwrote arg0, arg1.  Need a way +    * to invalidate registers.  This will come with better analysis +    * (liveness analysis) of the incoming program. 
+    */ +   emit_pshufd(cp, dst, arg0, SHUF(Y, Z, X, W)); +   emit_pshufd(cp, tmp1, arg1, SHUF(Z, X, Y, W)); +   sse_mulps(cp->func, dst, tmp1); +   emit_pshufd(cp, tmp0, arg0, SHUF(Z, X, Y, W)); +   emit_pshufd(cp, tmp1, arg1, SHUF(Y, Z, X, W)); +   sse_mulps(cp->func, tmp0, tmp1); +   sse_subps(cp->func, dst, tmp0); + +/*    dst[0] = arg0[1] * arg1[2] - arg0[2] * arg1[1]; */ +/*    dst[1] = arg0[2] * arg1[0] - arg0[0] * arg1[2]; */ +/*    dst[2] = arg0[0] * arg1[1] - arg0[1] * arg1[0]; */ +/*    dst[3] is undef */ + + +   aos_release_xmm_reg(cp, tmp0.idx); +   aos_release_xmm_reg(cp, tmp1.idx); +   store_dest(cp, &op->FullDstRegisters[0], dst); +   return TRUE; +} + + + +static boolean +emit_instruction( struct aos_compilation *cp, +                  struct tgsi_full_instruction *inst ) +{ +   switch( inst->Instruction.Opcode ) { +   case TGSI_OPCODE_MOV: +      return emit_MOV( cp, inst ); + +   case TGSI_OPCODE_LIT: +      return emit_LIT(cp, inst); + +   case TGSI_OPCODE_RCP: +      return emit_RCP(cp, inst); + +   case TGSI_OPCODE_RSQ: +      return emit_RSQ(cp, inst); + +   case TGSI_OPCODE_EXP: +      return emit_EXP(cp, inst); + +   case TGSI_OPCODE_LOG: +      return emit_LOG(cp, inst); + +   case TGSI_OPCODE_MUL: +      return emit_MUL(cp, inst); + +   case TGSI_OPCODE_ADD: +      return emit_ADD(cp, inst); + +   case TGSI_OPCODE_DP3: +      return emit_DP3(cp, inst); + +   case TGSI_OPCODE_DP4: +      return emit_DP4(cp, inst); + +   case TGSI_OPCODE_DST: +      return emit_DST(cp, inst); + +   case TGSI_OPCODE_MIN: +      return emit_MIN(cp, inst); + +   case TGSI_OPCODE_MAX: +      return emit_MAX(cp, inst); + +   case TGSI_OPCODE_SLT: +      return emit_SLT(cp, inst); + +   case TGSI_OPCODE_SGE: +      return emit_SGE(cp, inst); + +   case TGSI_OPCODE_MAD: +      return emit_MAD(cp, inst); + +   case TGSI_OPCODE_SUB: +      return emit_SUB(cp, inst); +  +   case TGSI_OPCODE_LERP: +//      return emit_LERP(cp, inst); +      return FALSE; + +   case TGSI_OPCODE_FRAC: +      return emit_FRC(cp, inst); + +   case TGSI_OPCODE_CLAMP: +//      return emit_CLAMP(cp, inst); +      return FALSE; + +   case TGSI_OPCODE_FLOOR: +      return emit_FLR(cp, inst); + +   case TGSI_OPCODE_ROUND: +      return emit_RND(cp, inst); + +   case TGSI_OPCODE_EXPBASE2: +      return emit_EX2(cp, inst); + +   case TGSI_OPCODE_LOGBASE2: +      return emit_LG2(cp, inst); + +   case TGSI_OPCODE_POWER: +      return emit_POW(cp, inst); + +   case TGSI_OPCODE_CROSSPRODUCT: +      return emit_XPD(cp, inst); + +   case TGSI_OPCODE_ABS: +      return emit_ABS(cp, inst); + +   case TGSI_OPCODE_DPH: +      return emit_DPH(cp, inst); + +   case TGSI_OPCODE_COS: +      return emit_COS(cp, inst); + +   case TGSI_OPCODE_SIN: +      return emit_SIN(cp, inst); + +   case TGSI_OPCODE_END: +      return TRUE; + +   default: +      return FALSE; +   } +} + +static boolean note_immediate( struct aos_compilation *cp, +                               struct tgsi_full_immediate *imm ) +{ +   unsigned pos = cp->num_immediates++; +   unsigned j; + +   for (j = 0; j < imm->Immediate.Size; j++) { +      cp->vaos->machine->immediate[pos][j] = imm->u.ImmediateFloat32[j].Float; +   } + +   return TRUE; +} + + + + +static void find_last_write_outputs( struct aos_compilation *cp ) +{ +   struct tgsi_parse_context parse; +   unsigned this_instruction = 0; +   unsigned i; + +   tgsi_parse_init( &parse, cp->vaos->base.vs->state.tokens ); + +   while (!tgsi_parse_end_of_tokens( &parse )) { +       +      tgsi_parse_token( &parse ); + +   
   if (parse.FullToken.Token.Type != TGSI_TOKEN_TYPE_INSTRUCTION)  +         continue; + +      for (i = 0; i < TGSI_FULL_MAX_DST_REGISTERS; i++) { +         if (parse.FullToken.FullInstruction.FullDstRegisters[i].DstRegister.File == +             TGSI_FILE_OUTPUT)  +         { +            unsigned idx = parse.FullToken.FullInstruction.FullDstRegisters[i].DstRegister.Index; +            cp->output_last_write[idx] = this_instruction; +         } +      } + +      this_instruction++; +   } + +   tgsi_parse_free( &parse ); +} + + +#define ARG_VARIENT    1 +#define ARG_START_ELTS 2 +#define ARG_COUNT      3 +#define ARG_OUTBUF     4 + + +static boolean build_vertex_program( struct draw_vs_varient_aos_sse *varient, +                                     boolean linear ) +{  +   struct tgsi_parse_context parse; +   struct aos_compilation cp; +   unsigned fixup, label; + +   tgsi_parse_init( &parse, varient->base.vs->state.tokens ); + +   memset(&cp, 0, sizeof(cp)); + +   cp.insn_counter = 1; +   cp.vaos = varient; +   cp.have_sse2 = 1; +   cp.func = &varient->func[ linear ? 0 : 1 ]; + +   cp.tmp_EAX       = x86_make_reg(file_REG32, reg_AX); +   cp.idx_EBX      = x86_make_reg(file_REG32, reg_BX); +   cp.outbuf_ECX    = x86_make_reg(file_REG32, reg_CX); +   cp.machine_EDX   = x86_make_reg(file_REG32, reg_DX); +   cp.count_ESI     = x86_make_reg(file_REG32, reg_SI); + +   x86_init_func(cp.func); + +   find_last_write_outputs(&cp); + +   x86_push(cp.func, cp.idx_EBX); +   x86_push(cp.func, cp.count_ESI); + + +   /* Load arguments into regs: +    */ +   x86_mov(cp.func, cp.machine_EDX, x86_fn_arg(cp.func, ARG_VARIENT)); +   x86_mov(cp.func, cp.idx_EBX, x86_fn_arg(cp.func, ARG_START_ELTS)); +   x86_mov(cp.func, cp.count_ESI, x86_fn_arg(cp.func, ARG_COUNT)); +   x86_mov(cp.func, cp.outbuf_ECX, x86_fn_arg(cp.func, ARG_OUTBUF)); + + +   /* Compare count to zero and possibly bail. +    */ +   x86_xor(cp.func, cp.tmp_EAX, cp.tmp_EAX); +   x86_cmp(cp.func, cp.count_ESI, cp.tmp_EAX); +   fixup = x86_jcc_forward(cp.func, cc_E); + +   /* Dig out the machine pointer from inside the varient arg  +    */ +   x86_mov(cp.func, cp.machine_EDX,  +           x86_make_disp(cp.machine_EDX, +                         Offset( struct draw_vs_varient_aos_sse, machine ))); + +   save_fpu_state( &cp ); + +   /* Note address for loop jump  +    */ +   label = x86_get_label(cp.func); +   { +      /* Fetch inputs...  TODO:  fetch lazily... +       */ +      if (!aos_fetch_inputs( &cp, linear )) +         goto fail; + +      /* Emit the shader: +       */ +      while( !tgsi_parse_end_of_tokens( &parse ) && !cp.error )  +      { +         tgsi_parse_token( &parse ); + +         switch (parse.FullToken.Token.Type) { +         case TGSI_TOKEN_TYPE_IMMEDIATE: +            if (!note_immediate( &cp, &parse.FullToken.FullImmediate )) +               goto fail; +            break; + +         case TGSI_TOKEN_TYPE_INSTRUCTION: +            if (!emit_instruction( &cp, &parse.FullToken.FullInstruction )) +               goto fail; +            break; +         } + +         cp.insn_counter++; +         debug_printf("\n"); +      } + +      if (cp.error) +         goto fail; + +      /* Emit output...  TODO: do this eagerly after the last write to a +       * given output. 
+       */ +      if (!aos_emit_outputs( &cp )) +         goto fail; + + +      /* Next vertex: +       */ +      x86_lea(cp.func,  +              cp.outbuf_ECX,  +              x86_make_disp(cp.outbuf_ECX,  +                            cp.vaos->base.key.output_stride)); + +      /* Incr index +       */    +      if (linear) { +         x86_inc(cp.func, cp.idx_EBX); +      }  +      else { +         x86_lea(cp.func, cp.idx_EBX, x86_make_disp(cp.idx_EBX, 4)); +      } + +   } +   /* decr count, loop if not zero +    */ +   x86_dec(cp.func, cp.count_ESI); +/*   x86_test(cp.func, cp.count_ESI, cp.count_ESI);  */ +   x86_jcc(cp.func, cc_NZ, label); + +   restore_fpu_state(&cp); + +   /* Land forward jump here: +    */ +   x86_fixup_fwd_jump(cp.func, fixup); + +   /* Exit mmx state? +    */ +   if (cp.func->need_emms) +      mmx_emms(cp.func); + +   x86_pop(cp.func, cp.count_ESI); +   x86_pop(cp.func, cp.idx_EBX); + +   x86_ret(cp.func); + +   tgsi_parse_free( &parse ); +   return !cp.error; + + fail: +   tgsi_parse_free( &parse ); +   return FALSE; +} + + + +static void vaos_set_buffer( struct draw_vs_varient *varient, +                             unsigned buf, +                             const void *ptr, +                             unsigned stride ) +{ +   struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient; +   unsigned i; + +   for (i = 0; i < vaos->base.vs->info.num_inputs; i++) { +      if (vaos->base.key.element[i].in.buffer == buf) { +         vaos->machine->attrib[i].input_ptr = ((char *)ptr + +                                               vaos->base.key.element[i].in.offset); +         vaos->machine->attrib[i].input_stride = stride; +      } +   } +} + + +static void vaos_destroy( struct draw_vs_varient *varient ) +{ +   struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient; + +   if (vaos->machine) +      align_free( vaos->machine ); + +   x86_release_func( &vaos->func[0] ); +   x86_release_func( &vaos->func[1] ); + +   FREE(vaos); +} + +static void vaos_run_elts( struct draw_vs_varient *varient, +                           const unsigned *elts, +                           unsigned count, +                           void *output_buffer ) +{ +   struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient; + +   vaos->gen_run_elts( varient, +                       elts, +                       count, +                       output_buffer ); +} + +static void vaos_run_linear( struct draw_vs_varient *varient, +                             unsigned start, +                             unsigned count, +                             void *output_buffer ) +{ +   struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient; + +   vaos->gen_run_linear( varient, +                         start, +                         count, +                         output_buffer ); +} + + +static void vaos_set_constants( struct draw_vs_varient *varient, +                                const float (*constants)[4] ) +{ +   struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient; + +   memcpy(vaos->machine->constant, +          constants, +          (vaos->base.vs->info.file_max[TGSI_FILE_CONSTANT] + 1) * 4 * sizeof(float)); +} + + +static struct draw_vs_varient *varient_aos_sse( struct draw_vertex_shader *vs, +                                                 const struct draw_vs_varient_key *key ) +{ +   struct draw_vs_varient_aos_sse *vaos = CALLOC_STRUCT(draw_vs_varient_aos_sse); 
+ +   if (!vaos) +      goto fail; +    +   vaos->base.key = *key; +   vaos->base.vs = vs; +   vaos->base.set_input = vaos_set_buffer; +   vaos->base.set_constants = vaos_set_constants; +   vaos->base.destroy = vaos_destroy; +   vaos->base.run_linear = vaos_run_linear; +   vaos->base.run_elts = vaos_run_elts; + +   vaos->machine = align_malloc( sizeof(struct aos_machine), 16 ); +   if (!vaos->machine) +      goto fail; +    +   memset(vaos->machine, 0, sizeof(struct aos_machine)); + +   tgsi_dump(vs->state.tokens, 0); + +   if (!build_vertex_program( vaos, TRUE )) +      goto fail; + +   if (!build_vertex_program( vaos, FALSE )) +      goto fail; + +   vaos->gen_run_linear = (vsv_run_linear_func)x86_get_func(&vaos->func[0]); +   if (!vaos->gen_run_linear) +      goto fail; + +   vaos->gen_run_elts = (vsv_run_elts_func)x86_get_func(&vaos->func[1]); +   if (!vaos->gen_run_elts) +      goto fail; + +   return &vaos->base; + + fail: +   if (vaos->machine) +      align_free( vaos->machine ); + +   if (vaos) +      x86_release_func( &vaos->func[0] ); + +   if (vaos) +      x86_release_func( &vaos->func[1] ); + +   FREE(vaos); +    +   return NULL; +} + + +struct draw_vs_varient *draw_vs_varient_aos_sse( struct draw_vertex_shader *vs, +                                                 const struct draw_vs_varient_key *key ) +{ +   struct draw_vs_varient *varient = varient_aos_sse( vs, key ); + +   if (varient == NULL) { +      assert(0); +      varient = draw_vs_varient_generic( vs, key ); +   } + +   return varient; +} + + + +#endif diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.h b/src/gallium/auxiliary/draw/draw_vs_aos.h new file mode 100644 index 0000000000..1d8a055a90 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_vs_aos.h @@ -0,0 +1,181 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *  + **************************************************************************/ + +/* Authors:  Keith Whitwell <keith@tungstengraphics.com> + */ + +#ifndef DRAW_VS_AOS_H +#define DRAW_VS_AOS_H + + +struct tgsi_token; +struct x86_function; + +#include "pipe/p_state.h" +#include "rtasm/rtasm_x86sse.h" + + + + + +#define X    0 +#define Y    1 +#define Z    2 +#define W    3 + +#define MAX_INPUTS     PIPE_MAX_ATTRIBS +#define MAX_OUTPUTS    PIPE_MAX_ATTRIBS +#define MAX_TEMPS      PIPE_MAX_ATTRIBS /* say */ +#define MAX_CONSTANTS  PIPE_MAX_ATTRIBS /* say */ +#define MAX_IMMEDIATES PIPE_MAX_ATTRIBS /* say */ +#define MAX_INTERNALS  4 + +#define AOS_FILE_INTERNAL TGSI_FILE_COUNT + +/* This is the temporary storage used by all the aos_sse vs varients. + * Create one per context and reuse by passing a pointer in at + * vs_varient creation?? + */ +struct aos_machine { +   float input    [MAX_INPUTS    ][4]; +   float output   [MAX_OUTPUTS   ][4]; +   float temp     [MAX_TEMPS     ][4]; +   float constant [MAX_CONSTANTS ][4]; /* fixme -- should just be a pointer */ +   float immediate[MAX_IMMEDIATES][4]; /* fixme -- should just be a pointer */ +   float internal [MAX_INTERNALS ][4]; + +   unsigned fpu_round_nearest; +   unsigned fpu_round_neg_inf; + +   struct { +      const void *input_ptr; +      unsigned input_stride; + +      unsigned output_offset; +   } attrib[PIPE_MAX_ATTRIBS]; +}; + + + + +struct aos_compilation { +   struct x86_function *func; +   struct draw_vs_varient_aos_sse *vaos; + +   unsigned insn_counter; +   unsigned num_immediates; + +   struct { +      unsigned idx:16; +      unsigned file:8; +      unsigned dirty:8; +      unsigned last_used; +   } xmm[8]; + + +   boolean input_fetched[PIPE_MAX_ATTRIBS]; +   unsigned output_last_write[PIPE_MAX_ATTRIBS]; + +   boolean have_sse2; +   boolean error; +   short fpucntl; + +   /* these are actually known values, but putting them in a struct +    * like this is helpful to keep them in sync across the file. 
+    */ +   struct x86_reg tmp_EAX; +   struct x86_reg idx_EBX;     /* either start+i or &elt[i] */ +   struct x86_reg outbuf_ECX; +   struct x86_reg machine_EDX; +   struct x86_reg count_ESI;    /* decrements to zero */ +}; + +struct x86_reg aos_get_xmm_reg( struct aos_compilation *cp ); +void aos_release_xmm_reg( struct aos_compilation *cp, unsigned idx ); + +void aos_adopt_xmm_reg( struct aos_compilation *cp, +                        struct x86_reg reg, +                        unsigned file, +                        unsigned idx, +                        unsigned dirty ); + +struct x86_reg aos_get_shader_reg( struct aos_compilation *cp,  +                                   unsigned file, +                                   unsigned idx ); + +boolean aos_fetch_inputs( struct aos_compilation *cp, +                          boolean linear ); + +boolean aos_emit_outputs( struct aos_compilation *cp ); + + +#define IMM_ONES     0              /* 1, 1,1,1 */ +#define IMM_NEGS     1              /* 1,-1,0,0 */ +#define IMM_IDENTITY 2              /* 0, 0,0,1 */ +#define IMM_INV_255  3              /* 1/255, 1/255, 1/255, 1/255 */ +#define IMM_255      4              /* 255, 255, 255, 255 */ + +struct x86_reg aos_get_internal( struct aos_compilation *cp, +                                 unsigned imm ); + + +#define ERROR(cp, msg)                                                  \ +do {                                                                    \ +   debug_printf("%s: x86 translation failed: %s\n", __FUNCTION__, msg); \ +   cp->error = 1;                                                       \ +   assert(0);                                                           \ +} while (0) + + + + + + +struct draw_vs_varient_aos_sse { +   struct draw_vs_varient base; +   struct draw_context *draw; + +#if 0 +   struct { +      const void *ptr; +      unsigned stride; +   } attrib[PIPE_MAX_ATTRIBS]; +#endif + +   struct aos_machine *machine; /* XXX: temporarily unshared */ + +   vsv_run_linear_func gen_run_linear; +   vsv_run_elts_func gen_run_elts; + + +   struct x86_function func[2]; +}; + + + +#endif  + diff --git a/src/gallium/auxiliary/draw/draw_vs_aos_io.c b/src/gallium/auxiliary/draw/draw_vs_aos_io.c new file mode 100644 index 0000000000..72b2b3d11d --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_vs_aos_io.c @@ -0,0 +1,314 @@ +/************************************************************************** + *  + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + *  + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + *  + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + *  + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *  + **************************************************************************/ + + +#include "pipe/p_util.h" +#include "pipe/p_shader_tokens.h" +#include "tgsi/util/tgsi_parse.h" +#include "tgsi/util/tgsi_util.h" +#include "tgsi/exec/tgsi_exec.h" +#include "draw_vs.h" +#include "draw_vs_aos.h" + +#include "rtasm/rtasm_x86sse.h" + +#ifdef PIPE_ARCH_X86 + +/* Note - don't yet have to worry about interacting with the code in + * draw_vs_aos.c as there is no intermingling of generated code... + * That may have to change, we'll see. + */ +static void emit_load_R32G32B32A32( struct aos_compilation *cp, 			    +				    struct x86_reg data, +				    struct x86_reg src_ptr ) +{ +   sse_movups(cp->func, data, src_ptr); +} + +static void emit_load_R32G32B32( struct aos_compilation *cp, 			    +				 struct x86_reg data, +				 struct x86_reg src_ptr ) +{ +   sse_movss(cp->func, data, x86_make_disp(src_ptr, 8)); +   sse_shufps(cp->func, data, aos_get_internal( cp, IMM_IDENTITY ), SHUF(X,Y,Z,W) ); +   sse_shufps(cp->func, data, data, SHUF(Y,Z,X,W) ); +   sse_movlps(cp->func, data, src_ptr); +} + +static void emit_load_R32G32( struct aos_compilation *cp,  +			   struct x86_reg data, +			   struct x86_reg src_ptr ) +{ +   sse_movups(cp->func, data, aos_get_internal( cp, IMM_IDENTITY ) ); +   sse_movlps(cp->func, data, src_ptr); +} + + +static void emit_load_R32( struct aos_compilation *cp,  +			   struct x86_reg data, +			   struct x86_reg src_ptr ) +{ +   sse_movss(cp->func, data, src_ptr); +   sse_orps(cp->func, data, aos_get_internal( cp, IMM_IDENTITY ) ); +} + + +static void emit_load_R8G8B8A8_UNORM( struct aos_compilation *cp, +				       struct x86_reg data, +				       struct x86_reg src_ptr ) +{ +   sse_movss(cp->func, data, src_ptr); +   sse2_punpcklbw(cp->func, data, aos_get_internal( cp, IMM_IDENTITY )); +   sse2_punpcklbw(cp->func, data, aos_get_internal( cp, IMM_IDENTITY )); +   sse2_cvtdq2ps(cp->func, data, data); +   sse_mulps(cp->func, data, aos_get_internal(cp, IMM_INV_255)); +} + + + +static void get_src_ptr( struct x86_function *func, +                         struct x86_reg src, +                         struct x86_reg machine, +                         struct x86_reg elt, +                         unsigned a ) +{ +   struct x86_reg input_ptr =  +      x86_make_disp(machine,  +		    Offset(struct aos_machine, attrib[a].input_ptr)); + +   struct x86_reg input_stride =  +      x86_make_disp(machine,  +		    Offset(struct aos_machine, attrib[a].input_stride)); + +   /* Calculate pointer to current attrib: +    */ +   x86_mov(func, src, input_stride); +   x86_imul(func, src, elt); +   x86_add(func, src, input_ptr); +} + + +/* Extended swizzles?  Maybe later. 
+ */   +static void emit_swizzle( struct aos_compilation *cp, +			  struct x86_reg dest, +			  struct x86_reg src, +			  unsigned shuffle ) +{ +   sse_shufps(cp->func, dest, src, shuffle); +} + + +static boolean load_input( struct aos_compilation *cp, +                           unsigned idx, +                           boolean linear ) +{ +   unsigned format = cp->vaos->base.key.element[idx].in.format; +   struct x86_reg src = cp->tmp_EAX; +   struct x86_reg dataXMM = aos_get_xmm_reg(cp); + +   /* Figure out source pointer address: +    */ +   get_src_ptr(cp->func,  +               src,  +               cp->machine_EDX,  +               linear ? cp->idx_EBX : x86_deref(cp->idx_EBX), +               idx); + +   src = x86_deref(src); + +   aos_adopt_xmm_reg( cp, +                      dataXMM, +                      TGSI_FILE_INPUT, +                      idx, +                      TRUE ); + +   switch (format) { +   case PIPE_FORMAT_R32_FLOAT: +      emit_load_R32(cp, dataXMM, src); +      break; +   case PIPE_FORMAT_R32G32_FLOAT: +      emit_load_R32G32(cp, dataXMM, src); +      break; +   case PIPE_FORMAT_R32G32B32_FLOAT: +      emit_load_R32G32B32(cp, dataXMM, src); +      break; +   case PIPE_FORMAT_R32G32B32A32_FLOAT: +      emit_load_R32G32B32A32(cp, dataXMM, src); +      break; +   case PIPE_FORMAT_B8G8R8A8_UNORM: +      emit_load_R8G8B8A8_UNORM(cp, dataXMM, src); +      emit_swizzle(cp, dataXMM, dataXMM, SHUF(Z,Y,X,W)); +      break; +   case PIPE_FORMAT_R8G8B8A8_UNORM: +      emit_load_R8G8B8A8_UNORM(cp, dataXMM, src); +      break; +   default: +      ERROR(cp, "unhandled input format"); +      return FALSE; +   } + +   return TRUE; +} + + +boolean aos_fetch_inputs( struct aos_compilation *cp, boolean linear ) +{ +   unsigned i; +    +   for (i = 0; i < cp->vaos->base.vs->info.num_inputs; i++) { +      if (!load_input( cp, i, linear )) +         return FALSE; +      cp->insn_counter++; +      debug_printf("\n"); +   } + +   return TRUE; +} + + + + + + + +static void emit_store_R32G32B32A32( struct aos_compilation *cp, 			    +				     struct x86_reg dst_ptr, +				     struct x86_reg dataXMM ) +{ +   sse_movups(cp->func, dst_ptr, dataXMM); +} + +static void emit_store_R32G32B32( struct aos_compilation *cp,  +				  struct x86_reg dst_ptr, +				  struct x86_reg dataXMM ) +{ +   sse_movlps(cp->func, dst_ptr, dataXMM); +   sse_shufps(cp->func, dataXMM, dataXMM, SHUF(Z,Z,Z,Z) ); /* NOTE! 
destructive */ +   sse_movss(cp->func, x86_make_disp(dst_ptr,8), dataXMM); +} + +static void emit_store_R32G32( struct aos_compilation *cp,  +			       struct x86_reg dst_ptr, +			       struct x86_reg dataXMM ) +{ +   sse_movlps(cp->func, dst_ptr, dataXMM); +} + +static void emit_store_R32( struct aos_compilation *cp,  +			    struct x86_reg dst_ptr, +			    struct x86_reg dataXMM ) +{ +   sse_movss(cp->func, dst_ptr, dataXMM); +} + + + +static void emit_store_R8G8B8A8_UNORM( struct aos_compilation *cp, +				       struct x86_reg dst_ptr, +				       struct x86_reg dataXMM ) +{ +   sse_mulps(cp->func, dataXMM, aos_get_internal(cp, IMM_255)); +   sse2_cvtps2dq(cp->func, dataXMM, dataXMM); +   sse2_packssdw(cp->func, dataXMM, dataXMM); +   sse2_packuswb(cp->func, dataXMM, dataXMM); +   sse_movss(cp->func, dst_ptr, dataXMM); +} + + + + + +static boolean emit_output( struct aos_compilation *cp, +                            struct x86_reg ptr, +                            struct x86_reg dataXMM,  +                            unsigned format ) +{ +   switch (format) { +   case PIPE_FORMAT_R32_FLOAT: +      emit_store_R32(cp, ptr, dataXMM); +      break; +   case PIPE_FORMAT_R32G32_FLOAT: +      emit_store_R32G32(cp, ptr, dataXMM); +      break; +   case PIPE_FORMAT_R32G32B32_FLOAT: +      emit_store_R32G32B32(cp, ptr, dataXMM); +      break; +   case PIPE_FORMAT_R32G32B32A32_FLOAT: +      emit_store_R32G32B32A32(cp, ptr, dataXMM); +      break; +   case PIPE_FORMAT_B8G8R8A8_UNORM: +      emit_swizzle(cp, dataXMM, dataXMM, SHUF(Z,Y,X,W)); +      emit_store_R8G8B8A8_UNORM(cp, ptr, dataXMM); +      break; +   case PIPE_FORMAT_R8G8B8A8_UNORM: +      emit_store_R8G8B8A8_UNORM(cp, ptr, dataXMM); +      break; +   default: +      ERROR(cp, "unhandled output format"); +      return FALSE; +   } + +   return TRUE; +} + + + +boolean aos_emit_outputs( struct aos_compilation *cp ) +{ +   unsigned i; +    +   for (i = 0; i < cp->vaos->base.vs->info.num_inputs; i++) { +      unsigned format = cp->vaos->base.key.element[i].out.format; +      unsigned offset = cp->vaos->base.key.element[i].out.offset; + +      struct x86_reg data = aos_get_shader_reg( cp,  +                                                TGSI_FILE_OUTPUT, +                                                i ); + +      if (data.file != file_XMM) { +         struct x86_reg tmp = aos_get_xmm_reg( cp ); +         sse_movups(cp->func, tmp, data); +         data = tmp; +      } +       +      if (!emit_output( cp,  +                        x86_make_disp( cp->outbuf_ECX, offset ), +                        data,  +                        format )) +         return FALSE; + +      aos_release_xmm_reg( cp, data.idx ); + +      cp->insn_counter++; +      debug_printf("\n"); +   } + +   return TRUE; +} + +#endif diff --git a/src/gallium/auxiliary/draw/draw_vs_sse.c b/src/gallium/auxiliary/draw/draw_vs_sse.c index df94a7e0c7..0581c3042f 100644 --- a/src/gallium/auxiliary/draw/draw_vs_sse.c +++ b/src/gallium/auxiliary/draw/draw_vs_sse.c @@ -157,6 +157,7 @@ draw_create_vs_sse(struct draw_context *draw,     vs->base.draw = draw;     vs->base.create_varient = draw_vs_varient_generic; +//   vs->base.create_varient = draw_vs_varient_aos_sse;     vs->base.prepare = vs_sse_prepare;     vs->base.run_linear = vs_sse_run_linear;     vs->base.delete = vs_sse_delete;  | 
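
Note: the fetch helpers above (get_src_ptr plus the emit_load_* functions) generate SSE code rather than executing anything directly. As a rough C sketch of the per-vertex behaviour that generated code implements for one attribute -- the helper name fetch_attrib_R32G32B32_FLOAT is illustrative only and does not exist in the patch:

/* Compute the source pointer as input_ptr + elt * input_stride
 * (what get_src_ptr emits), load the three components present in
 * the vertex buffer, and default the missing W to 1.0 (what
 * emit_load_R32G32B32 arranges via IMM_IDENTITY).
 */
static void fetch_attrib_R32G32B32_FLOAT( struct aos_machine *machine,
                                          unsigned a, unsigned elt )
{
   const float *src = (const float *)
      ((const char *)machine->attrib[a].input_ptr +
       elt * machine->attrib[a].input_stride);

   machine->input[a][0] = src[0];
   machine->input[a][1] = src[1];
   machine->input[a][2] = src[2];
   machine->input[a][3] = 1.0f;
}

In the generated code the result lands in an XMM register tracked by aos_adopt_xmm_reg() instead of the machine->input array, but the addressing and the W default are the same.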
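
Similarly, a rough C equivalent of emit_store_R8G8B8A8_UNORM above: scale to 0..255, convert to integer and store with unsigned saturation, mirroring the MULPS / CVTPS2DQ / PACKSSDW / PACKUSWB sequence. clamp_ub() and store_attrib_R8G8B8A8_UNORM() are illustrative helpers, not part of the patch, and use truncation where the SSE2 conversion rounds to nearest.

static unsigned char clamp_ub( int x )
{
   return (unsigned char)(x < 0 ? 0 : (x > 255 ? 255 : x));
}

static void store_attrib_R8G8B8A8_UNORM( unsigned char *dst,
                                         const float value[4] )
{
   dst[0] = clamp_ub( (int)(value[0] * 255.0f) );
   dst[1] = clamp_ub( (int)(value[1] * 255.0f) );
   dst[2] = clamp_ub( (int)(value[2] * 255.0f) );
   dst[3] = clamp_ub( (int)(value[3] * 255.0f) );
}

The B8G8R8A8 case in emit_output() is the same store preceded by the SHUF(Z,Y,X,W) swizzle to reorder the components before packing.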
