diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/mesa/tnl/t_vertex_sse.c | 185 | 
1 files changed, 127 insertions, 58 deletions
diff --git a/src/mesa/tnl/t_vertex_sse.c b/src/mesa/tnl/t_vertex_sse.c index b4e2c5b474..1771baab15 100644 --- a/src/mesa/tnl/t_vertex_sse.c +++ b/src/mesa/tnl/t_vertex_sse.c @@ -42,6 +42,8 @@  #define Z    2  #define W    3 +#define DISASSEM 1 +  struct x86_reg {     GLuint file:3;     GLuint idx:3; @@ -144,6 +146,17 @@ static struct x86_reg make_disp( struct x86_reg reg,     return reg;  } +static struct x86_reg deref( struct x86_reg reg ) +{ +   return make_disp(reg, 0); +} + +static struct x86_reg get_base_reg( struct x86_reg reg ) +{ +   return make_reg( reg.file, reg.idx ); +} + +  /* Retreive a reference to one of the function arguments, taking into   * account any push/pop activity:   */ @@ -179,29 +192,47 @@ static void emit_1b( struct x86_program *p, GLbyte b0 )     *(GLbyte *)(p->csr++) = b0;  } -static void emit_1ub( struct x86_program *p, GLubyte b0 ) +static void emit_1i( struct x86_program *p, GLint i0 ) +{ +   *(GLint *)(p->csr) = i0; +   p->csr += 4; +} + +static void disassem( struct x86_program *p, const char *fn ) +{ +#if DISASSEM +   static const char *last_fn; +   if (fn && fn != last_fn) { +      _mesa_printf("0x%x: %s\n", p->csr, fn); +      last_fn = fn; +   } +#endif +} + +static void emit_1ub_fn( struct x86_program *p, GLubyte b0, const char *fn )  { +   disassem(p, fn);     *(p->csr++) = b0;  } -static void emit_2ub( struct x86_program *p, GLubyte b0, GLubyte b1 ) +static void emit_2ub_fn( struct x86_program *p, GLubyte b0, GLubyte b1, const char *fn )  { +   disassem(p, fn);     *(p->csr++) = b0;     *(p->csr++) = b1;  } -static void emit_3ub( struct x86_program *p, GLubyte b0, GLubyte b1, GLubyte b2 ) +static void emit_3ub_fn( struct x86_program *p, GLubyte b0, GLubyte b1, GLubyte b2, const char *fn )  { +   disassem(p, fn);     *(p->csr++) = b0;     *(p->csr++) = b1;     *(p->csr++) = b2;  } -static void emit_1i( struct x86_program *p, GLint i0 ) -{ -   *(GLint *)(p->csr) = i0; -   p->csr += 4; -} +#define emit_1ub(p, b0)         emit_1ub_fn(p, b0, __FUNCTION__) +#define emit_2ub(p, b0, b1)     emit_2ub_fn(p, b0, b1, __FUNCTION__) +#define emit_3ub(p, b0, b1, b2) emit_3ub_fn(p, b0, b1, b2, __FUNCTION__)  /* Labels, jumps and fixup: @@ -216,7 +247,7 @@ static void emit_jcc( struct x86_program *p,  		      GLubyte *label )  {     GLint offset = label - (get_label(p) + 2); - +        if (offset <= 127 && offset >= -128) {        emit_1ub(p, 0x70 + cc);        emit_1b(p, (GLbyte) offset); @@ -273,7 +304,7 @@ static void emit_dec( struct x86_program *p,  		       struct x86_reg reg )  {     assert(reg.mod == mod_REG); -   emit_1ub(p, 0x40 + reg.idx); +   emit_1ub(p, 0x48 + reg.idx);  }  static void emit_ret( struct x86_program *p ) @@ -299,7 +330,13 @@ static void emit_modrm( struct x86_program *p,     val |= reg.idx << 3;		/* reg field */     val |= regmem.idx;		/* r/m field */ -   emit_1ub(p, val); +   emit_1ub_fn(p, val, 0); + +   /* Oh-oh we've stumbled into the SIB thing. +    */ +   if (regmem.idx == reg_SP) { +      emit_1ub_fn(p, 0x24, 0);		/* simplistic! */ +   }     switch (regmem.mod) {     case mod_REG: @@ -307,8 +344,10 @@ static void emit_modrm( struct x86_program *p,        break;     case mod_DISP8:        emit_1b(p, regmem.disp); +      break;     case mod_DISP32:        emit_1i(p, regmem.disp); +      break;     }  } @@ -325,14 +364,14 @@ static void emit_op_modrm( struct x86_program *p,  {     switch (dst.mod) {     case mod_REG: -      emit_1ub(p, op_dst_is_reg); +      emit_1ub_fn(p, op_dst_is_reg, 0);        emit_modrm(p, dst, src);        break;     case mod_INDIRECT:     case mod_DISP32:     case mod_DISP8:        assert(src.mod == mod_REG); -      emit_1ub(p, op_dst_is_mem); +      emit_1ub_fn(p, op_dst_is_mem, 0);        emit_modrm(p, src, dst);        break;     } @@ -352,6 +391,13 @@ static void emit_xor( struct x86_program *p,     emit_op_modrm( p, 0x33, 0x31, dst, src );  } +static void emit_cmp( struct x86_program *p, +		      struct x86_reg dst, +		      struct x86_reg src ) +{ +   emit_op_modrm( p, 0x3b, 0x39, dst, src ); +} +  static void emit_movlps( struct x86_program *p,  			 struct x86_reg dst,  			 struct x86_reg src ) @@ -443,6 +489,14 @@ static void emit_packsswb( struct x86_program *p,     emit_modrm( p, dst, src );  } +static void emit_packuswb( struct x86_program *p, +			struct x86_reg dst, +			struct x86_reg src ) +{ +   emit_3ub(p, 0x66, X86_TWOB, 0x67); +   emit_modrm( p, dst, src ); +} +  /* Load effective address:   */  static void emit_lea( struct x86_program *p, @@ -461,6 +515,14 @@ static void emit_add_imm( struct x86_program *p,     emit_lea(p, dst, make_disp(src, value));  } +static void emit_test( struct x86_program *p, +		       struct x86_reg dst, +		       struct x86_reg src ) +{ +   emit_1ub(p, 0x85); +   emit_modrm( p, dst, src ); +} + @@ -487,7 +549,7 @@ static void emit_pk4ub( struct x86_program *p,  {     emit_cvtps2dq(p, dest, arg0);     emit_packssdw(p, dest, dest); -   emit_packsswb(p, dest, dest); +   emit_packuswb(p, dest, dest);  }  static void emit_load4f_4( struct x86_program *p, 			    @@ -620,12 +682,12 @@ static void (*load[4][4])( struct x86_program *p,  };  static void emit_load( struct x86_program *p, -		       struct x86_reg temp, +		       struct x86_reg dest,  		       GLuint sz,  		       struct x86_reg src,  		       GLuint src_sz)  { -   load[sz-1][src_sz-1](p, temp, src); +   load[sz-1][src_sz-1](p, dest, src);  } @@ -721,6 +783,7 @@ static GLboolean build_vertex_emit( struct x86_program *p )     struct x86_reg tmp = make_reg(file_XMM, 0);     struct x86_reg vp0 = make_reg(file_XMM, 1);     struct x86_reg vp1 = make_reg(file_XMM, 2); +   struct x86_reg chan0 = make_reg(file_XMM, 3);     GLubyte *fixup, *label;     p->csr = p->store; @@ -731,6 +794,15 @@ static GLboolean build_vertex_emit( struct x86_program *p )     emit_push(p, countEBP);     emit_push(p, vtxESI); + +   /* Get vertex count, compare to zero +    */ +   emit_xor(p, srcEDI, srcEDI); +   emit_mov(p, countEBP, make_fn_arg(p, 2)); +   emit_cmp(p, countEBP, srcEDI); +   fixup = emit_jcc_forward(p, cc_E); + +     /* Initialize destination register.       */     emit_mov(p, vertexEAX, make_fn_arg(p, 3)); @@ -741,10 +813,6 @@ static GLboolean build_vertex_emit( struct x86_program *p )     emit_mov(p, vtxESI, make_disp(vtxESI, get_offset(ctx, &ctx->swtnl_context)));     vtxESI = make_disp(vtxESI, get_offset(tnl, &tnl->clipspace)); -   /* Get vertex count, compare to zero -    */ -   emit_mov(p, countEBP, make_fn_arg(p, 2)); -   fixup = emit_jcc_forward(p, cc_NZ);     /* Possibly load vp0, vp1 for viewport calcs:      */ @@ -753,6 +821,10 @@ static GLboolean build_vertex_emit( struct x86_program *p )        emit_movups(p, vp1, make_disp(vtxESI, get_offset(vtx, &vtx->vp_xlate[0])));     } +   /* always load, needed or not: +    */ +   emit_movups(p, chan0, make_disp(vtxESI, get_offset(vtx, &vtx->chan_scale[0]))); +     /* Note address for loop jump */     label = get_label(p); @@ -775,40 +847,40 @@ static GLboolean build_vertex_emit( struct x86_program *p )         */        switch (a[j].format) {        case EMIT_1F: -	 emit_load(p, tmp, 1, srcEDI, vtx->attr[j].inputsize); +	 emit_load(p, tmp, 1, deref(srcEDI), vtx->attr[j].inputsize);  	 emit_store(p, dest, 1, tmp);        case EMIT_2F: -	 emit_load(p, tmp, 2, srcEDI, vtx->attr[j].inputsize); +	 emit_load(p, tmp, 2, deref(srcEDI), vtx->attr[j].inputsize);  	 emit_store(p, dest, 2, tmp);        case EMIT_3F:  	 /* Potentially the worst case - hardcode 2+1 copying:  	  */ -	 emit_load(p, tmp, 3, srcEDI, vtx->attr[j].inputsize); +	 emit_load(p, tmp, 3, deref(srcEDI), vtx->attr[j].inputsize);  	 emit_store(p, dest, 3, tmp);        case EMIT_4F: -	 emit_load(p, tmp, 4, srcEDI, vtx->attr[j].inputsize); +	 emit_load(p, tmp, 4, deref(srcEDI), vtx->attr[j].inputsize);  	 emit_store(p, dest, 4, tmp);  	 break;        case EMIT_2F_VIEWPORT:  -	 emit_load(p, tmp, 2, srcEDI, vtx->attr[j].inputsize); -	 emit_mulps(p, dest, vp0); -	 emit_addps(p, dest, vp1); +	 emit_load(p, tmp, 2, deref(srcEDI), vtx->attr[j].inputsize); +	 emit_mulps(p, tmp, vp0); +	 emit_addps(p, tmp, vp1);  	 emit_store(p, dest, 2, tmp);  	 break;        case EMIT_3F_VIEWPORT:  -	 emit_load(p, tmp, 3, srcEDI, vtx->attr[j].inputsize); -	 emit_mulps(p, dest, vp0); -	 emit_addps(p, dest, vp1); +	 emit_load(p, tmp, 3, deref(srcEDI), vtx->attr[j].inputsize); +	 emit_mulps(p, tmp, vp0); +	 emit_addps(p, tmp, vp1);  	 emit_store(p, dest, 3, tmp);  	 break;        case EMIT_4F_VIEWPORT:  -	 emit_load(p, tmp, 4, srcEDI, vtx->attr[j].inputsize); -	 emit_mulps(p, dest, vp0); -	 emit_addps(p, dest, vp1); +	 emit_load(p, tmp, 4, deref(srcEDI), vtx->attr[j].inputsize); +	 emit_mulps(p, tmp, vp0); +	 emit_addps(p, tmp, vp1);  	 emit_store(p, dest, 4, tmp);  	 break;        case EMIT_3F_XYW: -	 emit_load(p, tmp, 4, srcEDI, vtx->attr[j].inputsize); +	 emit_load(p, tmp, 4, deref(srcEDI), vtx->attr[j].inputsize);  	 emit_pshufd(p, tmp, tmp, X, Y, W, Z);  	 emit_store(p, dest, 3, tmp);  	 break; @@ -818,48 +890,56 @@ static GLboolean build_vertex_emit( struct x86_program *p )        case EMIT_1UB_1F:	         case EMIT_3UB_3F_RGB:        case EMIT_3UB_3F_BGR: +	 _mesa_printf("non-implemneted format %d\n", a[j].format);  	 return GL_FALSE;	/* add this later */        case EMIT_4UB_4F_RGBA: -	 emit_load(p, tmp, 4, srcEDI, vtx->attr[j].inputsize); +	 emit_load(p, tmp, 4, deref(srcEDI), vtx->attr[j].inputsize); +	 emit_mulps(p, tmp, chan0);  	 emit_pk4ub(p, tmp, tmp);  	 emit_store(p, dest, 1, tmp);  	 break;        case EMIT_4UB_4F_BGRA: -	 emit_load(p, tmp, 4, srcEDI, vtx->attr[j].inputsize); +	 emit_load(p, tmp, 4, deref(srcEDI), vtx->attr[j].inputsize);  	 emit_pshufd(p, tmp, tmp, Z, Y, X, W); +	 emit_mulps(p, tmp, chan0);  	 emit_pk4ub(p, tmp, tmp);  	 emit_store(p, dest, 1, tmp);  	 break;        case EMIT_4UB_4F_ARGB: -	 emit_load(p, tmp, 4, srcEDI, vtx->attr[j].inputsize); +	 emit_load(p, tmp, 4, deref(srcEDI), vtx->attr[j].inputsize);  	 emit_pshufd(p, tmp, tmp, W, X, Y, Z); +	 emit_mulps(p, tmp, chan0);  	 emit_pk4ub(p, tmp, tmp);  	 emit_store(p, dest, 1, tmp);  	 break;        case EMIT_4UB_4F_ABGR: -	 emit_load(p, tmp, 4, srcEDI, vtx->attr[j].inputsize); +	 emit_load(p, tmp, 4, deref(srcEDI), vtx->attr[j].inputsize);  	 emit_pshufd(p, tmp, tmp, W, Z, Y, X); +	 emit_mulps(p, tmp, chan0);  	 emit_pk4ub(p, tmp, tmp);  	 emit_store(p, dest, 1, tmp);  	 break;        case EMIT_4CHAN_4F_RGBA:  	 switch (CHAN_TYPE) {  	 case GL_UNSIGNED_BYTE: -	    emit_load(p, tmp, 4, srcEDI, vtx->attr[j].inputsize); +	    emit_load(p, tmp, 4, deref(srcEDI), vtx->attr[j].inputsize); +	    emit_mulps(p, tmp, chan0);  	    emit_pk4ub(p, tmp, tmp);  	    emit_store(p, dest, 1, tmp);  	    break; -	 case GL_UNSIGNED_SHORT: -	    return GL_FALSE;  	 case GL_FLOAT: -	    emit_load(p, tmp, 4, srcEDI, vtx->attr[j].inputsize); +	    emit_load(p, tmp, 4, deref(srcEDI), vtx->attr[j].inputsize);  	    emit_store(p, dest, 4, tmp);  	    break; +	 case GL_UNSIGNED_SHORT:  	 default: -	    break; +	    _mesa_printf("unknown CHAN_TYPE %s\n", _mesa_lookup_enum_by_nr(CHAN_TYPE)); +	    return GL_FALSE;  	 } +	 break;        default: +	 _mesa_printf("unknown a[%d].format %d\n", j, a[j].format);  	 return GL_FALSE;	/* catch any new opcodes */        } @@ -881,6 +961,7 @@ static GLboolean build_vertex_emit( struct x86_program *p )     /* decr count, loop if not zero      */     emit_dec(p, countEBP); +   emit_test(p, countEBP, countEBP);      emit_jcc(p, cc_NZ, label);     /* Land forward jump here: @@ -889,7 +970,7 @@ static GLboolean build_vertex_emit( struct x86_program *p )     /* Pop regs and return      */ -   emit_pop(p, vtxESI); +   emit_pop(p, get_base_reg(vtxESI));     emit_pop(p, countEBP);     emit_pop(p, srcEDI);     emit_ret(p); @@ -912,20 +993,8 @@ void _tnl_generate_sse_emit( GLcontext *ctx )     if (build_vertex_emit(&p)) {        _tnl_register_fastpath( vtx, GL_TRUE ); - -      { -	 static int i = 0; -	 char filename[100]; -	 int fd; - -	 sprintf(filename, "fastpath%d.o", i);  -	 fd = creat(filename, 0600); -	 if (fd != -1) { -	    write(fd, p.store, p.csr - p.store); -	    close(fd); -	    _mesa_printf("wrote %s\n", filename); -	 } -      } +      if (DISASSEM) +	 _mesa_printf("disassemble 0x%x 0x%x\n", p.store, p.csr);     }     else {        FREE(p.store);  | 
