Diffstat (limited to 'src')
-rw-r--r-- | src/mesa/x86/mmx_blend.S    | 531
-rw-r--r-- | src/mesa/x86/mmx_blendtmp.h | 113
2 files changed, 339 insertions, 305 deletions
diff --git a/src/mesa/x86/mmx_blend.S b/src/mesa/x86/mmx_blend.S index e679aa7bc7..f80cbf6c45 100644 --- a/src/mesa/x86/mmx_blend.S +++ b/src/mesa/x86/mmx_blend.S @@ -4,8 +4,10 @@ #include "matypes.h" -/* - * make the following approximation to the division (Sree) + +/* integer multiplication - alpha plus one + * + * makes the following approximation to the division (Sree) * * rgb*a/255 ~= (rgb*(a+1)) >> 256 * @@ -13,12 +15,24 @@ * * 0*0 = 0 and 255*255 = 255 * - * note this one should be used alone + * note that MX1 is a register with 0xffffffffffffffff constant which can be easily obtained making + * + * PCMPEQW ( MX1, MX1 ) */ -#define GMBT_ALPHA_PLUS_ONE 0 - -/* - * take the geometric series approximation to the division +#define GMB_MULT_AP1( MP1, MA1, MP2, MA2, MX1 ) \ + PSUBW ( MX1, MA1 ) /* a1 + 1 | a1 + 1 | a1 + 1 | a1 + 1 */ ;\ +TWO(PSUBW ( MX1, MA2 )) /* a2 + 1 | a2 + 1 | a2 + 1 | a2 + 1 */ ;\ + ;\ + PMULLW ( MP1, MA1 ) /* t1 = p1*a1 */ ;\ +TWO(PMULLW ( MP2, MA2 )) /* t2 = p2*a2 */ ;\ + ;\ + PSRLW ( CONST(8), MA1 ) /* t1 >> 8 ~= t1/255 */ ;\ +TWO(PSRLW ( CONST(8), MA2 )) /* t2 >> 8 ~= t2/255 */ + + +/* integer multiplication - geometric series + * + * takes the geometric series approximation to the division * * t/255 = (t >> 8) + (t >> 16) + (t >> 24) .. * @@ -29,333 +43,240 @@ * note that just by itself it doesn't satisfies the OpenGL criteria, as 255*255 = 254, * so the special case a = 255 must be accounted or roundoff must be used */ -#define GMBT_GEOMETRIC_SERIES 1 - -/* +#define GMB_MULT_GS( MP1, MA1, MP2, MA2 ) \ + PMULLW ( MP1, MA1 ) /* t1 = p1*a1 */ ;\ +TWO(PMULLW ( MP2, MA2 )) /* t2 = p2*a2 */ ;\ + ;\ + MOVQ ( MA1, MP1 ) ;\ +TWO(MOVQ ( MA2, MP2 )) ;\ + ;\ + PSRLW ( CONST(8), MP1 ) /* t1 >> 8 */ ;\ +TWO(PSRLW ( CONST(8), MP2 )) /* t2 >> 8 */ ;\ + ;\ + PADDW ( MP1, MA1 ) /* t1 + (t1 >> 8) ~= (t1/255) << 8 */ ;\ +TWO(PADDW ( MP2, MA2 )) /* t2 + (t2 >> 8) ~= (t2/255) << 8 */ ;\ + ;\ + PSRLW ( CONST(8), MA1 ) /* sa1 | sb1 | sg1 | sr1 */ ;\ +TWO(PSRLW ( CONST(8), MA2 )) /* sa2 | sb2 | sg2 | sr2 */ + + +/* integer multiplication - geometric series plus rounding + * * when using a geometric series division instead of truncating the result * use roundoff in the approximation (Jim Blinn) * * t = rgb*a + 0x80 * * achieving the exact results + * + * note that M80 is register with the 0x0080008000800080 constant */ -#define GMBT_ROUNDOFF 0 - -/* instead of the roundoff this adds a small correction to satisfy the OpenGL criteria +#define GMB_MULT_GSR( MP1, MA1, MP2, MA2, M80 ) \ + PMULLW ( MP1, MA1 ) /* t1 = p1*a1 */ ;\ +TWO(PMULLW ( MP2, MA2 )) /* t2 = p2*a2 */ ;\ + ;\ + PADDW ( M80, MA1 ) /* t1 += 0x80 */ ;\ +TWO(PADDW ( M80, MA2 )) /* t2 += 0x80 */ ;\ + ;\ + MOVQ ( MA1, MP1 ) ;\ +TWO(MOVQ ( MA2, MP2 )) ;\ + ;\ + PSRLW ( CONST(8), MP1 ) /* t1 >> 8 */ ;\ +TWO(PSRLW ( CONST(8), MP2 )) /* t2 >> 8 */ ;\ + ;\ + PADDW ( MP1, MA1 ) /* t1 + (t1 >> 8) ~= (t1/255) << 8 */ ;\ +TWO(PADDW ( MP2, MA2 )) /* t2 + (t2 >> 8) ~= (t2/255) << 8 */ ;\ + ;\ + PSRLW ( CONST(8), MA1 ) /* sa1 | sb1 | sg1 | sr1 */ ;\ +TWO(PSRLW ( CONST(8), MA2 )) /* sa2 | sb2 | sg2 | sr2 */ + + +/* linear interpolation - geometric series + */ +#define GMB_LERP_GS( MP1, MQ1, MA1, MP2, MQ2, MA2) \ + PSUBW ( MQ1, MP1 ) /* pa1 - qa1 | pb1 - qb1 | pg1 - qg1 | pr1 - qr1 */ ;\ +TWO(PSUBW ( MQ2, MP2 )) /* pa2 - qa2 | pb2 - qb2 | pg2 - qg2 | pr2 - qr2 */ ;\ + ;\ + PSLLW ( CONST(8), MQ1 ) /* q1 << 8 */ ;\ +TWO(PSLLW ( CONST(8), MQ2 )) /* q2 << 8 */ ;\ + ;\ + PMULLW ( MP1, MA1 ) /* t1 = (q1 - p1)*pa1 */ ;\ +TWO(PMULLW ( MP2, MA2 )) /* 
t2 = (q2 - p2)*pa2 */ ;\ + ;\ + MOVQ ( MA1, MP1 ) ;\ +TWO(MOVQ ( MA2, MP2 )) ;\ + ;\ + PSRLW ( CONST(8), MP1 ) /* t1 >> 8 */ ;\ +TWO(PSRLW ( CONST(8), MP2 )) /* t2 >> 8 */ ;\ + ;\ + PADDW ( MP1, MA1 ) /* t1 + (t1 >> 8) ~= (t1/255) << 8 */ ;\ +TWO(PADDW ( MP2, MA2 )) /* t2 + (t2 >> 8) ~= (t2/255) << 8 */ ;\ + ;\ + PADDW ( MQ1, MA1 ) /* (t1/255 + q1) << 8 */ ;\ +TWO(PADDW ( MQ2, MA2 )) /* (t2/255 + q2) << 8 */ ;\ + ;\ + PSRLW ( CONST(8), MA1 ) /* sa1 | sb1 | sg1 | sr1 */ ;\ +TWO(PSRLW ( CONST(8), MA2 )) /* sa2 | sb2 | sg2 | sr2 */ + + +/* linear interpolation - geometric series with roundoff + * + * this is a generalization of Blinn's formula to signed arithmetic + * + * note that M80 is a register with the 0x0080008000800080 constant + */ +#define GMB_LERP_GSR( MP1, MQ1, MA1, MP2, MQ2, MA2, M80) \ + PSUBW ( MQ1, MP1 ) /* pa1 - qa1 | pb1 - qb1 | pg1 - qg1 | pr1 - qr1 */ ;\ +TWO(PSUBW ( MQ2, MP2 )) /* pa2 - qa2 | pb2 - qb2 | pg2 - qg2 | pr2 - qr2 */ ;\ + ;\ + PSLLW ( CONST(8), MQ1 ) /* q1 << 8 */ ;\ +TWO(PSLLW ( CONST(8), MQ2 )) /* q2 << 8 */ ;\ + ;\ + PMULLW ( MP1, MA1 ) /* t1 = (q1 - p1)*pa1 */ ;\ +TWO(PMULLW ( MP2, MA2 )) /* t2 = (q2 - p2)*pa2 */ ;\ + ;\ + PSRLW ( CONST(15), MP1 ) /* q1 > p1 ? 1 : 0 */ ;\ +TWO(PSRLW ( CONST(15), MP2 )) /* q2 > q2 ? 1 : 0 */ ;\ + ;\ + PSLLW ( CONST(8), MP1 ) /* q1 > p1 ? 0x100 : 0 */ ;\ +TWO(PSLLW ( CONST(8), MP2 )) /* q2 > q2 ? 0x100 : 0 */ ;\ + ;\ + PSUBW ( MP1, MA1 ) /* t1 -=? 0x100 */ ;\ +TWO(PSUBW ( MP2, MA2 )) /* t2 -=? 0x100 */ ;\ + ;\ + PADDW ( M80, MA1 ) /* t1 += 0x80 */ ;\ +TWO(PADDW ( M80, MA2 )) /* t2 += 0x80 */ ;\ + ;\ + MOVQ ( MA1, MP1 ) ;\ +TWO(MOVQ ( MA2, MP2 )) ;\ + ;\ + PSRLW ( CONST(8), MP1 ) /* t1 >> 8 */ ;\ +TWO(PSRLW ( CONST(8), MP2 )) /* t2 >> 8 */ ;\ + ;\ + PADDW ( MP1, MA1 ) /* t1 + (t1 >> 8) ~= (t1/255) << 8 */ ;\ +TWO(PADDW ( MP2, MA2 )) /* t2 + (t2 >> 8) ~= (t2/255) << 8 */ ;\ + ;\ + PADDW ( MQ1, MA1 ) /* (t1/255 + q1) << 8 */ ;\ +TWO(PADDW ( MQ2, MA2 )) /* (t2/255 + q2) << 8 */ ;\ + ;\ + PSRLW ( CONST(8), MA1 ) /* sa1 | sb1 | sg1 | sr1 */ ;\ +TWO(PSRLW ( CONST(8), MA2 )) /* sa2 | sb2 | sg2 | sr2 */ + + +/* linear interpolation - geometric series with correction + * + * instead of the roundoff this adds a small correction to satisfy the OpenGL criteria * * t/255 ~= (t + (t >> 8) + (t >> 15)) >> 8 * * note that although is faster than rounding off it doesn't give always the exact results */ -#define GMBT_GEOMETRIC_CORRECTION 1 - -#if GMBT_ROUNDOFF +#define GMB_LERP_GSC( MP1, MQ1, MA1, MP2, MQ2, MA2) \ + PSUBW ( MQ1, MP1 ) /* pa1 - qa1 | pb1 - qb1 | pg1 - qg1 | pr1 - qr1 */ ;\ +TWO(PSUBW ( MQ2, MP2 )) /* pa2 - qa2 | pb2 - qb2 | pg2 - qg2 | pr2 - qr2 */ ;\ + ;\ + PSLLW ( CONST(8), MQ1 ) /* q1 << 8 */ ;\ +TWO(PSLLW ( CONST(8), MQ2 )) /* q2 << 8 */ ;\ + ;\ + PMULLW ( MP1, MA1 ) /* t1 = (q1 - p1)*pa1 */ ;\ +TWO(PMULLW ( MP2, MA2 )) /* t2 = (q2 - p2)*pa2 */ ;\ + ;\ + MOVQ ( MA1, MP1 ) ;\ +TWO(MOVQ ( MA2, MP2 )) ;\ + ;\ + PSRLW ( CONST(8), MP1 ) /* t1 >> 8 */ ;\ +TWO(PSRLW ( CONST(8), MP2 )) /* t2 >> 8 */ ;\ + ;\ + PADDW ( MP1, MA1 ) /* t1 + (t1 >> 8) ~= (t1/255) << 8 */ ;\ +TWO(PADDW ( MP2, MA2 )) /* t2 + (t2 >> 8) ~= (t2/255) << 8 */ ;\ + ;\ + PSRLW ( CONST(7), MP1 ) /* t1 >> 15 */ ;\ +TWO(PSRLW ( CONST(7), MP2 )) /* t2 >> 15 */ ;\ + ;\ + PADDW ( MP1, MA1 ) /* t1 + (t1 >> 8) + (t1 >>15) ~= (t1/255) << 8 */ ;\ +TWO(PADDW ( MP2, MA2 )) /* t2 + (t2 >> 8) + (t2 >>15) ~= (t2/255) << 8 */ ;\ + ;\ + PADDW ( MQ1, MA1 ) /* (t1/255 + q1) << 8 */ ;\ +TWO(PADDW ( MQ2, MA2 )) /* (t2/255 + q2) << 8 */ ;\ + ;\ + PSRLW ( CONST(8), MA1 ) /* sa1 | sb1 | 
sg1 | sr1 */ ;\ +TWO(PSRLW ( CONST(8), MA2 )) /* sa2 | sb2 | sg2 | sr2 */ + + +/* common blending initialization code + */ +#if 0 /* rounding not used */ SEG_DATA ALIGNDATA8 const_80: D_LONG 0x00800080, 0x00800080 -#endif - - SEG_TEXT - -ALIGNTEXT16 -GLOBL GLNAME(_mesa_mmx_blend_transparency) - -/* - * void blend_transparency( GLcontext *ctx, - * GLuint n, - * const GLubyte mask[], - * GLchan rgba[][4], - * CONST GLchan dest[][4] ) - * - * Common transparency blending mode. - */ -GLNAME( _mesa_mmx_blend_transparency ): - - PUSH_L ( EBP ) - MOV_L ( ESP, EBP ) - PUSH_L ( ESI ) - PUSH_L ( EDI ) - PUSH_L ( EBX ) - - MOV_L ( REGOFF(12, EBP), ECX ) /* n */ - CMP_L ( CONST(0), ECX) - JE ( LLBL (GMBT_return) ) - - MOV_L ( REGOFF(16, EBP), EBX ) /* mask */ - MOV_L ( REGOFF(20, EBP), EDI ) /* rgba */ - MOV_L ( REGOFF(24, EBP), ESI ) /* dest */ - - TEST_L ( CONST(4), EDI ) /* align rgba on an 8-byte boundary */ - JZ ( LLBL (GMBT_align_end) ) - - CMP_B ( CONST(0), REGIND(EBX) ) /* *mask == 0 */ - JE ( LLBL (GMBT_align_continue) ) - - PXOR ( MM0, MM0 ) /* 0x0000 | 0x0000 | 0x0000 | 0x0000 */ - - MOVD ( REGIND(ESI), MM1 ) /* | | | | qa1 | qb1 | qg1 | qr1 */ - MOVD ( REGIND(EDI), MM2 ) /* | | | | pa1 | pb1 | pg1 | pr1 */ - - PUNPCKLBW ( MM0, MM1 ) /* qa1 | qb1 | qg1 | qr1 */ - PUNPCKLBW ( MM0, MM2 ) /* pa1 | pb1 | pg1 | pr1 */ - - MOVQ ( MM2, MM3 ) - - PUNPCKHWD ( MM3, MM3 ) /* pa1 | pa1 | | */ - PUNPCKHDQ ( MM3, MM3 ) /* pa1 | pa1 | pa1 | pa1 */ - -#if GMBT_ALPHA_PLUS_ONE - PCMPEQW ( MM4, MM4 ) /* 0xffff | 0xffff | 0xffff | 0xffff */ - - PSUBW ( MM4, MM3 ) /* pa1 + 1 | pa1 + 1 | pa1 + 1 | pa1 + 1 */ -#endif - - PSUBW ( MM1, MM2 ) /* pa1 - qa1 | pb1 - qb1 | pg1 - qg1 | pr1 - qr1 */ - - PSLLW ( CONST(8), MM1 ) /* q1 << 8 */ - -#if GMBT_ROUNDOFF - MOVQ ( MM2, MM4 ) -#endif - - PMULLW ( MM3, MM2 ) /* t1 = (q1 - p1)*pa1 */ - -#if GMBT_ROUNDOFF - PSRLW ( CONST(15), MM4 ) /* q1 > p1 ? 1 : 0 */ - - PSLLW ( CONST(8), MM4 ) /* q1 > p1 ? 0x100 : 0 */ - - PSUBW ( MM4, MM2 ) /* t1 -=? 
0x100 */ -#endif - -#if GMBT_ROUNDOFF - MOVQ ( CONTENT(const_80), MM4 ) - - PADDW ( MM4, MM2 ) /* t1 += 0x80 */ -#endif - -#if GMBT_GEOMETRIC_SERIES - MOVQ ( MM2, MM3 ) - - PSRLW ( CONST(8), MM3 ) /* t1 >> 8 */ - - PADDW ( MM3, MM2 ) /* t1 + (t1 >> 8) ~= (t1/255) << 8 */ - -#if GMBT_GEOMETRIC_CORRECTION - PSRLW ( CONST(7), MM3 ) /* t1 >> 15 */ - - PADDW ( MM3, MM2 ) /* t1 + (t1 >> 8) + (t1 >>15) ~= (t1/255) << 8 */ -#endif -#endif - - PADDW ( MM1, MM2 ) /* (t1/255 + q1) << 8 */ - - PSRLW ( CONST(8), MM2 ) /* sa1 | sb1 | sg1 | sr1 */ - - PACKUSWB ( MM0, MM2 ) /* | | | | sa1 | sb1 | sg1 | sr1 */ - MOVD ( MM2, REGIND(EDI) ) - -LLBL (GMBT_align_continue): - - DEC_L ( ECX ) /* n -= 1 */ - INC_L ( EBX ) /* mask += 1 */ - ADD_L ( CONST(4), EDI ) /* rgba += 1 */ - ADD_L ( CONST(4), ESI ) /* dest += 1 */ - -LLBL (GMBT_align_end): - - CMP_L ( CONST(2), ECX) - JB ( LLBL (GMBT_loop_end) ) -ALIGNTEXT16 -LLBL (GMBT_loop_begin): +#define GMB_INIT( M00, M80 ) \ + PXOR ( M00, M00 ) /* 0x0000 | 0x0000 | 0x0000 | 0x0000 */ + MOVQ ( CONTENT(const_80), M80 ) /* 0xffff | 0xffff | 0xffff | 0xffff */ - CMP_W ( CONST(0), REGIND(EBX) ) /* *mask == 0 && *(mask + 1) == 0 */ - JE ( LLBL (GMBT_loop_continue) ) +#else - /* NOTE: the instruction pairing when multiple pipelines are available must be checked */ +#define GMB_INIT( M00 ) \ + PXOR ( M00, M00 ) /* 0x0000 | 0x0000 | 0x0000 | 0x0000 */ - PXOR ( MM0, MM0 ) /* 0x0000 | 0x0000 | 0x0000 | 0x0000 */ - - MOVQ ( REGIND(ESI), MM7 ) /* qa2 | qb2 | qg2 | qr2 | qa1 | qb1 | qg1 | qr1 */ - MOVQ ( REGIND(EDI), MM6 ) /* pa2 | pb2 | pg2 | pr2 | pa1 | pb1 | pg1 | pr1 */ - - MOVQ ( MM7, MM1 ) - MOVQ ( MM6, MM2 ) - - PUNPCKLBW ( MM0, MM1 ) /* qa1 | qb1 | qg1 | qr1 */ - PUNPCKHBW ( MM0, MM7 ) /* qa2 | qb2 | qg2 | qr2 */ - PUNPCKLBW ( MM0, MM2 ) /* pa1 | pb1 | pg1 | pr1 */ - PUNPCKHBW ( MM0, MM6 ) /* pa2 | pb2 | pg2 | pr2 */ - - MOVQ ( MM2, MM3 ) - MOVQ ( MM6, MM5 ) - - PUNPCKHWD ( MM3, MM3 ) /* pa1 | pa1 | | */ - PUNPCKHWD ( MM5, MM5 ) /* pa2 | pa2 | | */ - PUNPCKHDQ ( MM3, MM3 ) /* pa1 | pa1 | pa1 | pa1 */ - PUNPCKHDQ ( MM5, MM5 ) /* pa2 | pa2 | pa2 | pa2 */ - -#if GMBT_ALPHA_PLUS_ONE - PCMPEQW ( MM4, MM4 ) /* 0xffff | 0xffff | 0xffff | 0xffff */ - - PSUBW ( MM4, MM3 ) /* pa1 + 1 | pa1 + 1 | pa1 + 1 | pa1 + 1 */ - PSUBW ( MM4, MM5 ) /* pa2 + 1 | pa2 + 1 | pa2 + 1 | pa2 + 1 */ -#endif - - PSUBW ( MM1, MM2 ) /* pa1 - qa1 | pb1 - qb1 | pg1 - qg1 | pr1 - qr1 */ - PSUBW ( MM7, MM6 ) /* pa2 - qa2 | pb2 - qb2 | pg2 - qg2 | pr2 - qr2 */ - - PSLLW ( CONST(8), MM1 ) /* q1 << 8 */ - PSLLW ( CONST(8), MM7 ) /* q2 << 8 */ - -#if GMBT_ROUNDOFF - MOVQ ( MM2, MM0 ) - MOVQ ( MM6, MM4 ) -#endif - - PMULLW ( MM3, MM2 ) /* t1 = (q1 - p1)*pa1 */ - PMULLW ( MM5, MM6 ) /* t2 = (q2 - p2)*pa2 */ - -#if GMBT_ROUNDOFF - PSRLW ( CONST(15), MM0 ) /* q1 > p1 ? 1 : 0 */ - PSRLW ( CONST(15), MM4 ) /* q2 > q2 ? 1 : 0 */ - - PSLLW ( CONST(8), MM0 ) /* q1 > p1 ? 0x100 : 0 */ - PSLLW ( CONST(8), MM4 ) /* q2 > q2 ? 0x100 : 0 */ - - PSUBW ( MM0, MM2 ) /* t1 -=? 0x100 */ - PSUBW ( MM4, MM7 ) /* t2 -=? 
0x100 */ -#endif - -#if GMBT_ROUNDOFF - MOVQ ( CONTENT(const_80), MM4 ) - - PADDW ( MM4, MM2 ) /* t1 += 0x80 */ - PADDW ( MM4, MM6 ) /* t2 += 0x80 */ -#endif - -#if GMBT_GEOMETRIC_SERIES - MOVQ ( MM2, MM3 ) - MOVQ ( MM6, MM5 ) - - PSRLW ( CONST(8), MM3 ) /* t1 >> 8 */ - PSRLW ( CONST(8), MM5 ) /* t2 >> 8 */ - - PADDW ( MM3, MM2 ) /* t1 + (t1 >> 8) ~= (t1/255) << 8 */ - PADDW ( MM5, MM6 ) /* t2 + (t2 >> 8) ~= (t2/255) << 8 */ - -#if GMBT_GEOMETRIC_CORRECTION - PSRLW ( CONST(7), MM3 ) /* t1 >> 15 */ - PSRLW ( CONST(7), MM5 ) /* t2 >> 15 */ - - PADDW ( MM3, MM2 ) /* t1 + (t1 >> 8) + (t1 >>15) ~= (t1/255) << 8 */ - PADDW ( MM5, MM6 ) /* t2 + (t2 >> 8) + (t2 >>15) ~= (t2/255) << 8 */ -#endif -#endif - - PADDW ( MM1, MM2 ) /* (t1/255 + q1) << 8 */ - PADDW ( MM7, MM6 ) /* (t2/255 + q2) << 8 */ - - PSRLW ( CONST(8), MM2 ) /* sa1 | sb1 | sg1 | sr1 */ - PSRLW ( CONST(8), MM6 ) /* sa2 | sb2 | sg2 | sr2 */ - - PACKUSWB ( MM6, MM2 ) /* sa2 | sb2 | sg2 | sr2 | sa1 | sb1 | sg1 | sr1 */ - MOVQ ( MM2, REGIND(EDI) ) - -LLBL (GMBT_loop_continue): - - DEC_L ( ECX ) - DEC_L ( ECX ) /* n -= 2 */ - ADD_L ( CONST(2), EBX ) /* mask += 2 */ - ADD_L ( CONST(8), EDI ) /* rgba += 2 */ - ADD_L ( CONST(8), ESI ) /* dest += 2 */ - CMP_L ( CONST(2), ECX ) - JAE ( LLBL (GMBT_loop_begin) ) - -LLBL (GMBT_loop_end): - - CMP_L ( CONST(1), ECX ) - JB ( LLBL (GMBT_done) ) - - CMP_B ( CONST(0), REGIND(EBX) ) /* *mask == 0 */ - JE ( LLBL (GMBT_done) ) - - PXOR ( MM0, MM0 ) /* 0x0000 | 0x0000 | 0x0000 | 0x0000 */ - - MOVD ( REGIND(ESI), MM1 ) /* | | | | qa1 | qb1 | qg1 | qr1 */ - MOVD ( REGIND(EDI), MM2 ) /* | | | | pa1 | pb1 | pg1 | pr1 */ - - PUNPCKLBW ( MM0, MM1 ) /* qa1 | qb1 | qg1 | qr1 */ - PUNPCKLBW ( MM0, MM2 ) /* pa1 | pb1 | pg1 | pr1 */ - - MOVQ ( MM2, MM3 ) - - PUNPCKHWD ( MM3, MM3 ) /* pa1 | pa1 | | */ - PUNPCKHDQ ( MM3, MM3 ) /* pa1 | pa1 | pa1 | pa1 */ - -#if GMBT_ALPHA_PLUS_ONE - PCMPEQW ( MM4, MM4 ) /* 0xffff | 0xffff | 0xffff | 0xffff */ - - PSUBW ( MM4, MM3 ) /* pa1 + 1 | pa1 + 1 | pa1 + 1 | pa1 + 1 */ -#endif - - PSUBW ( MM1, MM2 ) /* pa1 - qa1 | pb1 - qb1 | pg1 - qg1 | pr1 - qr1 */ - - PSLLW ( CONST(8), MM1 ) /* q1 << 8 */ - -#if GMBT_ROUNDOFF - MOVQ ( MM2, MM4 ) #endif - PMULLW ( MM3, MM2 ) /* t1 = (q1 - p1)*pa1 */ - -#if GMBT_ROUNDOFF - PSRLW ( CONST(15), MM4 ) /* q1 > p1 ? 1 : 0 */ - - PSLLW ( CONST(8), MM4 ) /* q1 > p1 ? 0x100 : 0 */ - - PSUBW ( MM4, MM2 ) /* t1 -=? 
0x100 */ -#endif - -#if GMBT_ROUNDOFF - MOVQ ( CONTENT(const_80), MM4 ) - - PADDW ( MM4, MM2 ) /* t1 += 0x80 */ -#endif - -#if GMBT_GEOMETRIC_SERIES - MOVQ ( MM2, MM3 ) - - PSRLW ( CONST(8), MM3 ) /* t1 >> 8 */ +/* common blending loading code + * + * note that M00 is a register with 0x0000000000000000 constant which can be easily obtained making + * + * PXOR ( M00, M00 ) + */ +#define GMB_LOAD(rgba, dest, MP1, MQ1, MA1, MP2, MQ2, MA2, M00) \ +ONE(MOVD ( REGIND(rgba), MP1 )) /* | | | | qa1 | qb1 | qg1 | qr1 */ ;\ +ONE(MOVD ( REGIND(dest), MQ1 )) /* | | | | pa1 | pb1 | pg1 | pr1 */ ;\ + ;\ +TWO(MOVQ ( REGIND(rgba), MP1 )) /* qa2 | qb2 | qg2 | qr2 | qa1 | qb1 | qg1 | qr1 */ ;\ +TWO(MOVQ ( REGIND(dest), MQ1 )) /* pa2 | pb2 | pg2 | pr2 | pa1 | pb1 | pg1 | pr1 */ ;\ + ;\ +TWO(MOVQ ( MP1, MP2 )) ;\ +TWO(MOVQ ( MQ1, MQ2 )) ;\ + ;\ + PUNPCKLBW ( M00, MQ1 ) /* qa1 | qb1 | qg1 | qr1 */ ;\ +TWO(PUNPCKHBW ( M00, MQ2 )) /* qa2 | qb2 | qg2 | qr2 */ ;\ + PUNPCKLBW ( M00, MP1 ) /* pa1 | pb1 | pg1 | pr1 */ ;\ +TWO(PUNPCKHBW ( M00, MP2 )) /* pa2 | pb2 | pg2 | pr2 */ ;\ + ;\ + MOVQ ( MP1, MA1 ) ;\ +TWO(MOVQ ( MP2, MA2 )) ;\ + ;\ + PUNPCKHWD ( MA1, MA1 ) /* pa1 | pa1 | | */ ;\ +TWO(PUNPCKHWD ( MA2, MA2 )) /* pa2 | pa2 | | */ ;\ + PUNPCKHDQ ( MA1, MA1 ) /* pa1 | pa1 | pa1 | pa1 */ ;\ +TWO(PUNPCKHDQ ( MA2, MA2 )) /* pa2 | pa2 | pa2 | pa2 */ + + +/* common blending storing code + */ +#define GMB_STORE(rgba, MA1, MA2) \ + PACKUSWB ( MA2, MA1 ) /* sa2 | sb2 | sg2 | sr2 | sa1 | sb1 | sg1 | sr1 */ ;\ + ;\ +ONE(MOVD ( MA1, REGIND(rgba) )) ;\ +TWO(MOVQ ( MA1, REGIND(rgba) )) - PADDW ( MM3, MM2 ) /* t1 + (t1 >> 8) ~= (t1/255) << 8 */ -#if GMBT_GEOMETRIC_CORRECTION - PSRLW ( CONST(7), MM3 ) /* t1 >> 15 */ + SEG_TEXT - PADDW ( MM3, MM2 ) /* t1 + (t1 >> 8) + (t1 >>15) ~= (t1/255) << 8 */ -#endif -#endif - PADDW ( MM1, MM2 ) /* (t1/255 + q1) << 8 */ +/* common transparency blending mode + */ - PSRLW ( CONST(8), MM2 ) /* sa1 | sb1 | sg1 | sr1 */ - - PACKUSWB ( MM0, MM2 ) /* | | | | sa1 | sb1 | sg1 | sr1 */ - MOVD ( MM2, REGIND(EDI) ) +#define TAG(x) x##_transparency -LLBL (GMBT_done): +#define INIT \ + GMB_INIT( MM0 ) - EMMS +#define MAIN \ + GMB_LOAD( EDI, ESI, MM1, MM2, MM3, MM4, MM5, MM6, MM0) ;\ + GMB_LERP_GSC( MM1, MM2, MM3, MM4, MM5, MM6 ) ;\ + GMB_STORE( EDI, MM3, MM6 ) -LLBL (GMBT_return): +#include "mmx_blendtmp.h" - POP_L ( EBX ) - POP_L ( EDI ) - POP_L ( ESI ) - MOV_L ( EBP, ESP ) - POP_L ( EBP ) - RET diff --git a/src/mesa/x86/mmx_blendtmp.h b/src/mesa/x86/mmx_blendtmp.h new file mode 100644 index 0000000000..395436ba01 --- /dev/null +++ b/src/mesa/x86/mmx_blendtmp.h @@ -0,0 +1,113 @@ +/* + * Written by José Fonseca <j_r_fonseca@yahoo.co.uk> + */ + + +/* + * void _mesa_mmx_blend( GLcontext *ctx, + * GLuint n, + * const GLubyte mask[], + * GLchan rgba[][4], + * CONST GLchan dest[][4] ) + * + */ +ALIGNTEXT16 +GLOBL GLNAME( TAG(_mesa_mmx_blend) ) + +GLNAME( TAG(_mesa_mmx_blend) ): + + PUSH_L ( EBP ) + MOV_L ( ESP, EBP ) + PUSH_L ( ESI ) + PUSH_L ( EDI ) + PUSH_L ( EBX ) + + MOV_L ( REGOFF(12, EBP), ECX ) /* n */ + CMP_L ( CONST(0), ECX) + JE ( LLBL ( TAG(GMB_return) ) ) + + MOV_L ( REGOFF(16, EBP), EBX ) /* mask */ + MOV_L ( REGOFF(20, EBP), EDI ) /* rgba */ + MOV_L ( REGOFF(24, EBP), ESI ) /* dest */ + + INIT + + TEST_L ( CONST(4), EDI ) /* align rgba on an 8-byte boundary */ + JZ ( LLBL ( TAG(GMB_align_end) ) ) + + CMP_B ( CONST(0), REGIND(EBX) ) /* *mask == 0 */ + JE ( LLBL ( TAG(GMB_align_continue) ) ) + + /* runin */ +#define ONE(x) x +#define TWO(x) + MAIN +#undef ONE +#undef TWO + +LLBL ( 
TAG(GMB_align_continue) ): + + DEC_L ( ECX ) /* n -= 1 */ + INC_L ( EBX ) /* mask += 1 */ + ADD_L ( CONST(4), EDI ) /* rgba += 1 */ + ADD_L ( CONST(4), ESI ) /* dest += 1 */ + +LLBL ( TAG(GMB_align_end) ): + + CMP_L ( CONST(2), ECX) + JB ( LLBL ( TAG(GMB_loop_end) ) ) + +ALIGNTEXT16 +LLBL ( TAG(GMB_loop_begin) ): + + CMP_W ( CONST(0), REGIND(EBX) ) /* *mask == 0 && *(mask + 1) == 0 */ + JE ( LLBL ( TAG(GMB_loop_continue) ) ) + + /* main loop */ +#define ONE(x) +#define TWO(x) x + MAIN +#undef ONE +#undef TWO + +LLBL ( TAG(GMB_loop_continue) ): + + DEC_L ( ECX ) + DEC_L ( ECX ) /* n -= 2 */ + ADD_L ( CONST(2), EBX ) /* mask += 2 */ + ADD_L ( CONST(8), EDI ) /* rgba += 2 */ + ADD_L ( CONST(8), ESI ) /* dest += 2 */ + CMP_L ( CONST(2), ECX ) + JAE ( LLBL ( TAG(GMB_loop_begin) ) ) + +LLBL ( TAG(GMB_loop_end) ): + + CMP_L ( CONST(1), ECX ) + JB ( LLBL ( TAG(GMB_done) ) ) + + CMP_B ( CONST(0), REGIND(EBX) ) /* *mask == 0 */ + JE ( LLBL ( TAG(GMB_done) ) ) + + /* runout */ +#define ONE(x) x +#define TWO(x) + MAIN +#undef ONE +#undef TWO + +LLBL ( TAG(GMB_done) ): + + EMMS + +LLBL ( TAG(GMB_return) ): + + POP_L ( EBX ) + POP_L ( EDI ) + POP_L ( ESI ) + MOV_L ( EBP, ESP ) + POP_L ( EBP ) + RET + +#undef TAG +#undef INIT +#undef MAIN |
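
The division-by-255 approximations described in the macro comments above are easier to follow in scalar form. The sketch below (plain C, illustrative only; the function names are invented here and this is not the MMX code itself) implements each variant and counts how often it differs from exact rounding over all 8-bit products:

#include <stdio.h>

/* exact reference: round(t/255) for t in [0, 255*255] */
static unsigned div255_round(unsigned t) { return (t + 127) / 255; }

/* alpha plus one: rgb*a/255 ~= (rgb*(a + 1)) >> 8; exact at 0*0 and 255*255 */
static unsigned mult_ap1(unsigned rgb, unsigned a) { return (rgb * (a + 1)) >> 8; }

/* geometric series: t/255 ~= (t + (t >> 8)) >> 8; note 255*255 gives 254 */
static unsigned div255_gs(unsigned t) { return (t + (t >> 8)) >> 8; }

/* geometric series + Blinn's roundoff (t += 0x80 first): exact rounding */
static unsigned div255_gsr(unsigned t) { t += 0x80; return (t + (t >> 8)) >> 8; }

/* geometric series + correction term: (t + (t >> 8) + (t >> 15)) >> 8 */
static unsigned div255_gsc(unsigned t) { return (t + (t >> 8) + (t >> 15)) >> 8; }

int main(void)
{
    unsigned a, b, gs = 0, gsc = 0, gsr = 0;

    for (a = 0; a <= 255; a++)
        for (b = 0; b <= 255; b++) {
            unsigned t = a * b, e = div255_round(t);
            gs  += (div255_gs(t)  != e);
            gsc += (div255_gsc(t) != e);
            gsr += (div255_gsr(t) != e);
        }

    printf("mismatches vs. round(t/255): gs=%u gsc=%u gsr=%u\n", gs, gsc, gsr);
    printf("255*255: ap1=%u gs=%u gsc=%u gsr=%u\n",
           mult_ap1(255, 255), div255_gs(255 * 255),
           div255_gsc(255 * 255), div255_gsr(255 * 255));
    return 0;
}

gsr should report zero mismatches; gs truncates (255*255 comes out as 254), and gsc restores the 255*255 = 255 endpoint without being exact everywhere, matching the notes in the macro comments.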
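
The GMB_LERP_* macros apply the same trick to the interpolation used by the transparency blend, s = q + (p - q)*a/255 with p = rgba (source), q = dest and a = the source alpha. A scalar equivalent, again only a sketch under that reading of the macros and not bit-exact to the 16-bit MMX word arithmetic, could look like:

/* one channel of the common transparency blend
 * (GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA):
 *
 *   s = q + (p - q)*a/255
 *
 * with Blinn's roundoff for the division.  The sum q*255 + (p - q)*a
 * is always in [0, 255*255], so the geometric-series step is safe. */
static unsigned char blend_transparency_chan(unsigned char p,
                                             unsigned char q,
                                             unsigned char a)
{
    int t = (int)q * 255 + ((int)p - (int)q) * (int)a + 0x80;
    return (unsigned char)((t + (t >> 8)) >> 8);
}

The MMX version computes the same thing four channels at a time, two pixels per loop iteration through the TWO() paths, with the ONE() paths covering the unaligned run-in and run-out pixels in mmx_blendtmp.h.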