summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJose Fonseca <j_r_fonseca@yahoo.co.uk>2002-04-18 11:57:28 +0000
committerJose Fonseca <j_r_fonseca@yahoo.co.uk>2002-04-18 11:57:28 +0000
commit55d9ee83b4c29e8f7c373ee6326bbb4f77402bee (patch)
tree6298e1e0caa0bebe42f3fcf140ddd901e9c5d115
parent9ff3e9d992bcd7b195feb39a2aacc7d0ea43bd5a (diff)
Definition of several utility macros for self-contained MMX operations such as scaling and lerping.
Restructured the MMX blending function to use a template: only the main loop has to be specified, and it is also used to generate the runin and runout sections. Optimized the MMX function after remembering that multiplication is commutative (how can somebody forget this..), resulting in lower register usage. There is now no need to generate or read from memory any constant inside the loop. Assemblers other than the GNU assembler may choke on the output of the C preprocessor, since it was necessary to add ';' line separators to the defined macros.
-rw-r--r--src/mesa/x86/mmx_blend.S521
-rw-r--r--src/mesa/x86/mmx_blendtmp.h113
2 files changed, 334 insertions, 300 deletions
diff --git a/src/mesa/x86/mmx_blend.S b/src/mesa/x86/mmx_blend.S
index e679aa7bc7..f80cbf6c45 100644
--- a/src/mesa/x86/mmx_blend.S
+++ b/src/mesa/x86/mmx_blend.S
@@ -4,8 +4,10 @@
#include "matypes.h"
-/*
- * make the following approximation to the division (Sree)
+
+/* integer multiplication - alpha plus one
+ *
+ * makes the following approximation to the division (Sree)
*
* rgb*a/255 ~= (rgb*(a+1)) >> 8
*
@@ -13,12 +15,24 @@
*
* 0*0 = 0 and 255*255 = 255
*
- * note this one should be used alone
+ * note that MX1 is a register holding the constant 0xffffffffffffffff, which can easily be obtained with
+ *
+ * PCMPEQW ( MX1, MX1 )
*/
-#define GMBT_ALPHA_PLUS_ONE 0
+#define GMB_MULT_AP1( MP1, MA1, MP2, MA2, MX1 ) /* MAn = (MPn*(MAn + 1)) >> 8 per 16-bit lane; MX1 must hold all ones */ \
+ PSUBW ( MX1, MA1 ) /* a1 - (-1) = a1 + 1 | a1 + 1 | a1 + 1 | a1 + 1 */ ;\
+TWO(PSUBW ( MX1, MA2 )) /* a2 - (-1) = a2 + 1 | a2 + 1 | a2 + 1 | a2 + 1 */ ;\
+ /* TWO() lines vanish when the includer defines TWO(x) as empty */ ;\
+ PMULLW ( MP1, MA1 ) /* t1 = p1*(a1 + 1) */ ;\
+TWO(PMULLW ( MP2, MA2 )) /* t2 = p2*(a2 + 1) */ ;\
+ ;\
+ PSRLW ( CONST(8), MA1 ) /* t1 >> 8 ~= p1*a1/255 */ ;\
+TWO(PSRLW ( CONST(8), MA2 )) /* t2 >> 8 ~= p2*a2/255 */
-/*
- * take the geometric series approximation to the division
+
+/* integer multiplication - geometric series
+ *
+ * takes the geometric series approximation to the division
*
* t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
*
@@ -29,333 +43,240 @@
* note that just by itself it doesn't satisfy the OpenGL criteria, as 255*255 = 254,
* so the special case a = 255 must be accounted for or roundoff must be used
*/
-#define GMBT_GEOMETRIC_SERIES 1
+#define GMB_MULT_GS( MP1, MA1, MP2, MA2 ) /* MAn = (t + (t >> 8)) >> 8 with t = MPn*MAn; MPn is overwritten as scratch */ \
+ PMULLW ( MP1, MA1 ) /* t1 = p1*a1 */ ;\
+TWO(PMULLW ( MP2, MA2 )) /* t2 = p2*a2 */ ;\
+ ;\
+ MOVQ ( MA1, MP1 ) /* copy t1 over p1, which is no longer needed */ ;\
+TWO(MOVQ ( MA2, MP2 )) /* copy t2 over p2, which is no longer needed */ ;\
+ ;\
+ PSRLW ( CONST(8), MP1 ) /* t1 >> 8 */ ;\
+TWO(PSRLW ( CONST(8), MP2 )) /* t2 >> 8 */ ;\
+ ;\
+ PADDW ( MP1, MA1 ) /* t1 + (t1 >> 8) ~= (t1/255) << 8 */ ;\
+TWO(PADDW ( MP2, MA2 )) /* t2 + (t2 >> 8) ~= (t2/255) << 8 */ ;\
+ ;\
+ PSRLW ( CONST(8), MA1 ) /* result in MA1: sa1 | sb1 | sg1 | sr1 */ ;\
+TWO(PSRLW ( CONST(8), MA2 )) /* result in MA2: sa2 | sb2 | sg2 | sr2 */
-/*
+
+/* integer multiplication - geometric series plus rounding
+ *
* when using a geometric series division instead of truncating the result
* use roundoff in the approximation (Jim Blinn)
*
* t = rgb*a + 0x80
*
* achieving the exact results
+ *
+ * note that M80 is a register with the 0x0080008000800080 constant
+ */
+#define GMB_MULT_GSR( MP1, MA1, MP2, MA2, M80 ) /* MAn = (t + (t >> 8)) >> 8 with t = MPn*MAn + 0x80; MPn is overwritten as scratch */ \
+ PMULLW ( MP1, MA1 ) /* t1 = p1*a1 */ ;\
+TWO(PMULLW ( MP2, MA2 )) /* t2 = p2*a2 */ ;\
+ ;\
+ PADDW ( M80, MA1 ) /* t1 += 0x80 (rounding term) */ ;\
+TWO(PADDW ( M80, MA2 )) /* t2 += 0x80 (rounding term) */ ;\
+ ;\
+ MOVQ ( MA1, MP1 ) /* copy t1 over p1, which is no longer needed */ ;\
+TWO(MOVQ ( MA2, MP2 )) /* copy t2 over p2, which is no longer needed */ ;\
+ ;\
+ PSRLW ( CONST(8), MP1 ) /* t1 >> 8 */ ;\
+TWO(PSRLW ( CONST(8), MP2 )) /* t2 >> 8 */ ;\
+ ;\
+ PADDW ( MP1, MA1 ) /* t1 + (t1 >> 8) ~= (t1/255) << 8 */ ;\
+TWO(PADDW ( MP2, MA2 )) /* t2 + (t2 >> 8) ~= (t2/255) << 8 */ ;\
+ ;\
+ PSRLW ( CONST(8), MA1 ) /* result in MA1: sa1 | sb1 | sg1 | sr1 */ ;\
+TWO(PSRLW ( CONST(8), MA2 )) /* result in MA2: sa2 | sb2 | sg2 | sr2 */
+
+
+/* linear interpolation - geometric series
+ */
+#define GMB_LERP_GS( MP1, MQ1, MA1, MP2, MQ2, MA2) /* MAn ~= q + (p - q)*a/255 per 16-bit lane; MPn and MQn are overwritten */ \
+ PSUBW ( MQ1, MP1 ) /* pa1 - qa1 | pb1 - qb1 | pg1 - qg1 | pr1 - qr1 */ ;\
+TWO(PSUBW ( MQ2, MP2 )) /* pa2 - qa2 | pb2 - qb2 | pg2 - qg2 | pr2 - qr2 */ ;\
+ ;\
+ PSLLW ( CONST(8), MQ1 ) /* q1 << 8 */ ;\
+TWO(PSLLW ( CONST(8), MQ2 )) /* q2 << 8 */ ;\
+ ;\
+ PMULLW ( MP1, MA1 ) /* t1 = (p1 - q1)*pa1 */ ;\
+TWO(PMULLW ( MP2, MA2 )) /* t2 = (p2 - q2)*pa2 */ ;\
+ ;\
+ MOVQ ( MA1, MP1 ) /* copy t1; the difference in MP1 is no longer needed */ ;\
+TWO(MOVQ ( MA2, MP2 )) /* copy t2; the difference in MP2 is no longer needed */ ;\
+ ;\
+ PSRLW ( CONST(8), MP1 ) /* t1 >> 8 */ ;\
+TWO(PSRLW ( CONST(8), MP2 )) /* t2 >> 8 */ ;\
+ ;\
+ PADDW ( MP1, MA1 ) /* t1 + (t1 >> 8) ~= (t1/255) << 8 */ ;\
+TWO(PADDW ( MP2, MA2 )) /* t2 + (t2 >> 8) ~= (t2/255) << 8 */ ;\
+ ;\
+ PADDW ( MQ1, MA1 ) /* (t1/255 + q1) << 8 */ ;\
+TWO(PADDW ( MQ2, MA2 )) /* (t2/255 + q2) << 8 */ ;\
+ ;\
+ PSRLW ( CONST(8), MA1 ) /* result in MA1: sa1 | sb1 | sg1 | sr1 */ ;\
+TWO(PSRLW ( CONST(8), MA2 )) /* result in MA2: sa2 | sb2 | sg2 | sr2 */
+
+
+/* linear interpolation - geometric series with roundoff
+ *
+ * this is a generalization of Blinn's formula to signed arithmetic
+ *
+ * note that M80 is a register with the 0x0080008000800080 constant
*/
-#define GMBT_ROUNDOFF 0
+#define GMB_LERP_GSR( MP1, MQ1, MA1, MP2, MQ2, MA2, M80) /* rounded lerp: MAn ~= q + (p - q)*a/255; MPn and MQn are overwritten */ \
+ PSUBW ( MQ1, MP1 ) /* pa1 - qa1 | pb1 - qb1 | pg1 - qg1 | pr1 - qr1 */ ;\
+TWO(PSUBW ( MQ2, MP2 )) /* pa2 - qa2 | pb2 - qb2 | pg2 - qg2 | pr2 - qr2 */ ;\
+ ;\
+ PSLLW ( CONST(8), MQ1 ) /* q1 << 8 */ ;\
+TWO(PSLLW ( CONST(8), MQ2 )) /* q2 << 8 */ ;\
+ ;\
+ PMULLW ( MP1, MA1 ) /* t1 = (p1 - q1)*pa1; MP1 still holds p1 - q1 */ ;\
+TWO(PMULLW ( MP2, MA2 )) /* t2 = (p2 - q2)*pa2; MP2 still holds p2 - q2 */ ;\
+ ;\
+ PSRLW ( CONST(15), MP1 ) /* sign bit of p1 - q1: q1 > p1 ? 1 : 0 */ ;\
+TWO(PSRLW ( CONST(15), MP2 )) /* sign bit of p2 - q2: q2 > p2 ? 1 : 0 */ ;\
+ ;\
+ PSLLW ( CONST(8), MP1 ) /* q1 > p1 ? 0x100 : 0 */ ;\
+TWO(PSLLW ( CONST(8), MP2 )) /* q2 > p2 ? 0x100 : 0 */ ;\
+ ;\
+ PSUBW ( MP1, MA1 ) /* t1 -= 0x100 when the difference was negative */ ;\
+TWO(PSUBW ( MP2, MA2 )) /* t2 -= 0x100 when the difference was negative */ ;\
+ ;\
+ PADDW ( M80, MA1 ) /* t1 += 0x80 */ ;\
+TWO(PADDW ( M80, MA2 )) /* t2 += 0x80 */ ;\
+ ;\
+ MOVQ ( MA1, MP1 ) /* copy t1 for the shifted term */ ;\
+TWO(MOVQ ( MA2, MP2 )) /* copy t2 for the shifted term */ ;\
+ ;\
+ PSRLW ( CONST(8), MP1 ) /* t1 >> 8 */ ;\
+TWO(PSRLW ( CONST(8), MP2 )) /* t2 >> 8 */ ;\
+ ;\
+ PADDW ( MP1, MA1 ) /* t1 + (t1 >> 8) ~= (t1/255) << 8 */ ;\
+TWO(PADDW ( MP2, MA2 )) /* t2 + (t2 >> 8) ~= (t2/255) << 8 */ ;\
+ ;\
+ PADDW ( MQ1, MA1 ) /* (t1/255 + q1) << 8 */ ;\
+TWO(PADDW ( MQ2, MA2 )) /* (t2/255 + q2) << 8 */ ;\
+ ;\
+ PSRLW ( CONST(8), MA1 ) /* result in MA1: sa1 | sb1 | sg1 | sr1 */ ;\
+TWO(PSRLW ( CONST(8), MA2 )) /* result in MA2: sa2 | sb2 | sg2 | sr2 */
+
-/* instead of the roundoff this adds a small correction to satisfy the OpenGL criteria
+/* linear interpolation - geometric series with correction
+ *
+ * instead of the roundoff this adds a small correction to satisfy the OpenGL criteria
*
* t/255 ~= (t + (t >> 8) + (t >> 15)) >> 8
*
* note that although it is faster than rounding off, it doesn't always give the exact results
*/
-#define GMBT_GEOMETRIC_CORRECTION 1
+#define GMB_LERP_GSC( MP1, MQ1, MA1, MP2, MQ2, MA2) /* corrected lerp: MAn ~= q + (p - q)*a/255; MPn and MQn are overwritten */ \
+ PSUBW ( MQ1, MP1 ) /* pa1 - qa1 | pb1 - qb1 | pg1 - qg1 | pr1 - qr1 */ ;\
+TWO(PSUBW ( MQ2, MP2 )) /* pa2 - qa2 | pb2 - qb2 | pg2 - qg2 | pr2 - qr2 */ ;\
+ ;\
+ PSLLW ( CONST(8), MQ1 ) /* q1 << 8 */ ;\
+TWO(PSLLW ( CONST(8), MQ2 )) /* q2 << 8 */ ;\
+ ;\
+ PMULLW ( MP1, MA1 ) /* t1 = (p1 - q1)*pa1 */ ;\
+TWO(PMULLW ( MP2, MA2 )) /* t2 = (p2 - q2)*pa2 */ ;\
+ ;\
+ MOVQ ( MA1, MP1 ) /* copy t1 for the shifted terms */ ;\
+TWO(MOVQ ( MA2, MP2 )) /* copy t2 for the shifted terms */ ;\
+ ;\
+ PSRLW ( CONST(8), MP1 ) /* t1 >> 8 */ ;\
+TWO(PSRLW ( CONST(8), MP2 )) /* t2 >> 8 */ ;\
+ ;\
+ PADDW ( MP1, MA1 ) /* t1 + (t1 >> 8) ~= (t1/255) << 8 */ ;\
+TWO(PADDW ( MP2, MA2 )) /* t2 + (t2 >> 8) ~= (t2/255) << 8 */ ;\
+ ;\
+ PSRLW ( CONST(7), MP1 ) /* (t1 >> 8) >> 7 = t1 >> 15 */ ;\
+TWO(PSRLW ( CONST(7), MP2 )) /* (t2 >> 8) >> 7 = t2 >> 15 */ ;\
+ ;\
+ PADDW ( MP1, MA1 ) /* t1 + (t1 >> 8) + (t1 >>15) ~= (t1/255) << 8 */ ;\
+TWO(PADDW ( MP2, MA2 )) /* t2 + (t2 >> 8) + (t2 >>15) ~= (t2/255) << 8 */ ;\
+ ;\
+ PADDW ( MQ1, MA1 ) /* (t1/255 + q1) << 8 */ ;\
+TWO(PADDW ( MQ2, MA2 )) /* (t2/255 + q2) << 8 */ ;\
+ ;\
+ PSRLW ( CONST(8), MA1 ) /* result in MA1: sa1 | sb1 | sg1 | sr1 */ ;\
+TWO(PSRLW ( CONST(8), MA2 )) /* result in MA2: sa2 | sb2 | sg2 | sr2 */
+
-#if GMBT_ROUNDOFF
+/* common blending initialization code
+ */
+#if 0 /* rounding not used */
SEG_DATA
ALIGNDATA8
const_80:
D_LONG 0x00800080, 0x00800080 /* 0x0080 in each of the four 16-bit words; read by the two-argument GMB_INIT */
-#endif
-
- SEG_TEXT
-
-ALIGNTEXT16
-GLOBL GLNAME(_mesa_mmx_blend_transparency)
-
-/*
- * void blend_transparency( GLcontext *ctx,
- * GLuint n,
- * const GLubyte mask[],
- * GLchan rgba[][4],
- * CONST GLchan dest[][4] )
- *
- * Common transparency blending mode.
- */
-GLNAME( _mesa_mmx_blend_transparency ):
-
- PUSH_L ( EBP )
- MOV_L ( ESP, EBP )
- PUSH_L ( ESI )
- PUSH_L ( EDI )
- PUSH_L ( EBX )
-
- MOV_L ( REGOFF(12, EBP), ECX ) /* n */
- CMP_L ( CONST(0), ECX)
- JE ( LLBL (GMBT_return) )
-
- MOV_L ( REGOFF(16, EBP), EBX ) /* mask */
- MOV_L ( REGOFF(20, EBP), EDI ) /* rgba */
- MOV_L ( REGOFF(24, EBP), ESI ) /* dest */
-
- TEST_L ( CONST(4), EDI ) /* align rgba on an 8-byte boundary */
- JZ ( LLBL (GMBT_align_end) )
- CMP_B ( CONST(0), REGIND(EBX) ) /* *mask == 0 */
- JE ( LLBL (GMBT_align_continue) )
+#define GMB_INIT( M00, M80 ) /* rounding variant: zero M00 and load the 0x0080 per-word rounding constant into M80 */ \
+ PXOR ( M00, M00 ) /* 0x0000 | 0x0000 | 0x0000 | 0x0000 */ ;\
+ MOVQ ( CONTENT(const_80), M80 ) /* 0x0080 | 0x0080 | 0x0080 | 0x0080 */
- PXOR ( MM0, MM0 ) /* 0x0000 | 0x0000 | 0x0000 | 0x0000 */
+#else
- MOVD ( REGIND(ESI), MM1 ) /* | | | | qa1 | qb1 | qg1 | qr1 */
- MOVD ( REGIND(EDI), MM2 ) /* | | | | pa1 | pb1 | pg1 | pr1 */
+#define GMB_INIT( M00 ) /* non-rounding variant: only the zero register is set up */ \
+ PXOR ( M00, M00 ) /* 0x0000 | 0x0000 | 0x0000 | 0x0000 */
- PUNPCKLBW ( MM0, MM1 ) /* qa1 | qb1 | qg1 | qr1 */
- PUNPCKLBW ( MM0, MM2 ) /* pa1 | pb1 | pg1 | pr1 */
-
- MOVQ ( MM2, MM3 )
-
- PUNPCKHWD ( MM3, MM3 ) /* pa1 | pa1 | | */
- PUNPCKHDQ ( MM3, MM3 ) /* pa1 | pa1 | pa1 | pa1 */
-
-#if GMBT_ALPHA_PLUS_ONE
- PCMPEQW ( MM4, MM4 ) /* 0xffff | 0xffff | 0xffff | 0xffff */
-
- PSUBW ( MM4, MM3 ) /* pa1 + 1 | pa1 + 1 | pa1 + 1 | pa1 + 1 */
-#endif
-
- PSUBW ( MM1, MM2 ) /* pa1 - qa1 | pb1 - qb1 | pg1 - qg1 | pr1 - qr1 */
-
- PSLLW ( CONST(8), MM1 ) /* q1 << 8 */
-
-#if GMBT_ROUNDOFF
- MOVQ ( MM2, MM4 )
-#endif
-
- PMULLW ( MM3, MM2 ) /* t1 = (q1 - p1)*pa1 */
-
-#if GMBT_ROUNDOFF
- PSRLW ( CONST(15), MM4 ) /* q1 > p1 ? 1 : 0 */
-
- PSLLW ( CONST(8), MM4 ) /* q1 > p1 ? 0x100 : 0 */
-
- PSUBW ( MM4, MM2 ) /* t1 -=? 0x100 */
-#endif
-
-#if GMBT_ROUNDOFF
- MOVQ ( CONTENT(const_80), MM4 )
-
- PADDW ( MM4, MM2 ) /* t1 += 0x80 */
#endif
-#if GMBT_GEOMETRIC_SERIES
- MOVQ ( MM2, MM3 )
-
- PSRLW ( CONST(8), MM3 ) /* t1 >> 8 */
-
- PADDW ( MM3, MM2 ) /* t1 + (t1 >> 8) ~= (t1/255) << 8 */
-
-#if GMBT_GEOMETRIC_CORRECTION
- PSRLW ( CONST(7), MM3 ) /* t1 >> 15 */
-
- PADDW ( MM3, MM2 ) /* t1 + (t1 >> 8) + (t1 >>15) ~= (t1/255) << 8 */
-#endif
-#endif
-
- PADDW ( MM1, MM2 ) /* (t1/255 + q1) << 8 */
-
- PSRLW ( CONST(8), MM2 ) /* sa1 | sb1 | sg1 | sr1 */
-
- PACKUSWB ( MM0, MM2 ) /* | | | | sa1 | sb1 | sg1 | sr1 */
- MOVD ( MM2, REGIND(EDI) )
-
-LLBL (GMBT_align_continue):
-
- DEC_L ( ECX ) /* n -= 1 */
- INC_L ( EBX ) /* mask += 1 */
- ADD_L ( CONST(4), EDI ) /* rgba += 1 */
- ADD_L ( CONST(4), ESI ) /* dest += 1 */
-
-LLBL (GMBT_align_end):
-
- CMP_L ( CONST(2), ECX)
- JB ( LLBL (GMBT_loop_end) )
-
-ALIGNTEXT16
-LLBL (GMBT_loop_begin):
-
- CMP_W ( CONST(0), REGIND(EBX) ) /* *mask == 0 && *(mask + 1) == 0 */
- JE ( LLBL (GMBT_loop_continue) )
-
- /* NOTE: the instruction pairing when multiple pipelines are available must be checked */
-
- PXOR ( MM0, MM0 ) /* 0x0000 | 0x0000 | 0x0000 | 0x0000 */
-
- MOVQ ( REGIND(ESI), MM7 ) /* qa2 | qb2 | qg2 | qr2 | qa1 | qb1 | qg1 | qr1 */
- MOVQ ( REGIND(EDI), MM6 ) /* pa2 | pb2 | pg2 | pr2 | pa1 | pb1 | pg1 | pr1 */
-
- MOVQ ( MM7, MM1 )
- MOVQ ( MM6, MM2 )
-
- PUNPCKLBW ( MM0, MM1 ) /* qa1 | qb1 | qg1 | qr1 */
- PUNPCKHBW ( MM0, MM7 ) /* qa2 | qb2 | qg2 | qr2 */
- PUNPCKLBW ( MM0, MM2 ) /* pa1 | pb1 | pg1 | pr1 */
- PUNPCKHBW ( MM0, MM6 ) /* pa2 | pb2 | pg2 | pr2 */
-
- MOVQ ( MM2, MM3 )
- MOVQ ( MM6, MM5 )
-
- PUNPCKHWD ( MM3, MM3 ) /* pa1 | pa1 | | */
- PUNPCKHWD ( MM5, MM5 ) /* pa2 | pa2 | | */
- PUNPCKHDQ ( MM3, MM3 ) /* pa1 | pa1 | pa1 | pa1 */
- PUNPCKHDQ ( MM5, MM5 ) /* pa2 | pa2 | pa2 | pa2 */
-
-#if GMBT_ALPHA_PLUS_ONE
- PCMPEQW ( MM4, MM4 ) /* 0xffff | 0xffff | 0xffff | 0xffff */
-
- PSUBW ( MM4, MM3 ) /* pa1 + 1 | pa1 + 1 | pa1 + 1 | pa1 + 1 */
- PSUBW ( MM4, MM5 ) /* pa2 + 1 | pa2 + 1 | pa2 + 1 | pa2 + 1 */
-#endif
-
- PSUBW ( MM1, MM2 ) /* pa1 - qa1 | pb1 - qb1 | pg1 - qg1 | pr1 - qr1 */
- PSUBW ( MM7, MM6 ) /* pa2 - qa2 | pb2 - qb2 | pg2 - qg2 | pr2 - qr2 */
-
- PSLLW ( CONST(8), MM1 ) /* q1 << 8 */
- PSLLW ( CONST(8), MM7 ) /* q2 << 8 */
-
-#if GMBT_ROUNDOFF
- MOVQ ( MM2, MM0 )
- MOVQ ( MM6, MM4 )
-#endif
-
- PMULLW ( MM3, MM2 ) /* t1 = (q1 - p1)*pa1 */
- PMULLW ( MM5, MM6 ) /* t2 = (q2 - p2)*pa2 */
-
-#if GMBT_ROUNDOFF
- PSRLW ( CONST(15), MM0 ) /* q1 > p1 ? 1 : 0 */
- PSRLW ( CONST(15), MM4 ) /* q2 > q2 ? 1 : 0 */
-
- PSLLW ( CONST(8), MM0 ) /* q1 > p1 ? 0x100 : 0 */
- PSLLW ( CONST(8), MM4 ) /* q2 > q2 ? 0x100 : 0 */
-
- PSUBW ( MM0, MM2 ) /* t1 -=? 0x100 */
- PSUBW ( MM4, MM7 ) /* t2 -=? 0x100 */
-#endif
-
-#if GMBT_ROUNDOFF
- MOVQ ( CONTENT(const_80), MM4 )
-
- PADDW ( MM4, MM2 ) /* t1 += 0x80 */
- PADDW ( MM4, MM6 ) /* t2 += 0x80 */
-#endif
-
-#if GMBT_GEOMETRIC_SERIES
- MOVQ ( MM2, MM3 )
- MOVQ ( MM6, MM5 )
-
- PSRLW ( CONST(8), MM3 ) /* t1 >> 8 */
- PSRLW ( CONST(8), MM5 ) /* t2 >> 8 */
-
- PADDW ( MM3, MM2 ) /* t1 + (t1 >> 8) ~= (t1/255) << 8 */
- PADDW ( MM5, MM6 ) /* t2 + (t2 >> 8) ~= (t2/255) << 8 */
-
-#if GMBT_GEOMETRIC_CORRECTION
- PSRLW ( CONST(7), MM3 ) /* t1 >> 15 */
- PSRLW ( CONST(7), MM5 ) /* t2 >> 15 */
-
- PADDW ( MM3, MM2 ) /* t1 + (t1 >> 8) + (t1 >>15) ~= (t1/255) << 8 */
- PADDW ( MM5, MM6 ) /* t2 + (t2 >> 8) + (t2 >>15) ~= (t2/255) << 8 */
-#endif
-#endif
-
- PADDW ( MM1, MM2 ) /* (t1/255 + q1) << 8 */
- PADDW ( MM7, MM6 ) /* (t2/255 + q2) << 8 */
-
- PSRLW ( CONST(8), MM2 ) /* sa1 | sb1 | sg1 | sr1 */
- PSRLW ( CONST(8), MM6 ) /* sa2 | sb2 | sg2 | sr2 */
-
- PACKUSWB ( MM6, MM2 ) /* sa2 | sb2 | sg2 | sr2 | sa1 | sb1 | sg1 | sr1 */
- MOVQ ( MM2, REGIND(EDI) )
-
-LLBL (GMBT_loop_continue):
-
- DEC_L ( ECX )
- DEC_L ( ECX ) /* n -= 2 */
- ADD_L ( CONST(2), EBX ) /* mask += 2 */
- ADD_L ( CONST(8), EDI ) /* rgba += 2 */
- ADD_L ( CONST(8), ESI ) /* dest += 2 */
- CMP_L ( CONST(2), ECX )
- JAE ( LLBL (GMBT_loop_begin) )
-
-LLBL (GMBT_loop_end):
-
- CMP_L ( CONST(1), ECX )
- JB ( LLBL (GMBT_done) )
-
- CMP_B ( CONST(0), REGIND(EBX) ) /* *mask == 0 */
- JE ( LLBL (GMBT_done) )
-
- PXOR ( MM0, MM0 ) /* 0x0000 | 0x0000 | 0x0000 | 0x0000 */
-
- MOVD ( REGIND(ESI), MM1 ) /* | | | | qa1 | qb1 | qg1 | qr1 */
- MOVD ( REGIND(EDI), MM2 ) /* | | | | pa1 | pb1 | pg1 | pr1 */
-
- PUNPCKLBW ( MM0, MM1 ) /* qa1 | qb1 | qg1 | qr1 */
- PUNPCKLBW ( MM0, MM2 ) /* pa1 | pb1 | pg1 | pr1 */
-
- MOVQ ( MM2, MM3 )
-
- PUNPCKHWD ( MM3, MM3 ) /* pa1 | pa1 | | */
- PUNPCKHDQ ( MM3, MM3 ) /* pa1 | pa1 | pa1 | pa1 */
-
-#if GMBT_ALPHA_PLUS_ONE
- PCMPEQW ( MM4, MM4 ) /* 0xffff | 0xffff | 0xffff | 0xffff */
-
- PSUBW ( MM4, MM3 ) /* pa1 + 1 | pa1 + 1 | pa1 + 1 | pa1 + 1 */
-#endif
-
- PSUBW ( MM1, MM2 ) /* pa1 - qa1 | pb1 - qb1 | pg1 - qg1 | pr1 - qr1 */
-
- PSLLW ( CONST(8), MM1 ) /* q1 << 8 */
-
-#if GMBT_ROUNDOFF
- MOVQ ( MM2, MM4 )
-#endif
-
- PMULLW ( MM3, MM2 ) /* t1 = (q1 - p1)*pa1 */
-
-#if GMBT_ROUNDOFF
- PSRLW ( CONST(15), MM4 ) /* q1 > p1 ? 1 : 0 */
-
- PSLLW ( CONST(8), MM4 ) /* q1 > p1 ? 0x100 : 0 */
-
- PSUBW ( MM4, MM2 ) /* t1 -=? 0x100 */
-#endif
-
-#if GMBT_ROUNDOFF
- MOVQ ( CONTENT(const_80), MM4 )
-
- PADDW ( MM4, MM2 ) /* t1 += 0x80 */
-#endif
-
-#if GMBT_GEOMETRIC_SERIES
- MOVQ ( MM2, MM3 )
+/* common blending loading code
+ *
+ * note that M00 is a register holding the constant 0x0000000000000000, which can easily be obtained with
+ *
+ * PXOR ( M00, M00 )
+ */
+#define GMB_LOAD(rgba, dest, MP1, MQ1, MA1, MP2, MQ2, MA2, M00) /* p = pixels at rgba, q = pixels at dest; unpack to 16-bit lanes and broadcast p's alpha into MAn */ \
+ONE(MOVD ( REGIND(rgba), MP1 )) /* | | | | pa1 | pb1 | pg1 | pr1 */ ;\
+ONE(MOVD ( REGIND(dest), MQ1 )) /* | | | | qa1 | qb1 | qg1 | qr1 */ ;\
+ ;\
+TWO(MOVQ ( REGIND(rgba), MP1 )) /* pa2 | pb2 | pg2 | pr2 | pa1 | pb1 | pg1 | pr1 */ ;\
+TWO(MOVQ ( REGIND(dest), MQ1 )) /* qa2 | qb2 | qg2 | qr2 | qa1 | qb1 | qg1 | qr1 */ ;\
+ ;\
+TWO(MOVQ ( MP1, MP2 )) /* keep a copy so the high pixel can be unpacked separately */ ;\
+TWO(MOVQ ( MQ1, MQ2 )) /* keep a copy so the high pixel can be unpacked separately */ ;\
+ ;\
+ PUNPCKLBW ( M00, MQ1 ) /* qa1 | qb1 | qg1 | qr1 */ ;\
+TWO(PUNPCKHBW ( M00, MQ2 )) /* qa2 | qb2 | qg2 | qr2 */ ;\
+ PUNPCKLBW ( M00, MP1 ) /* pa1 | pb1 | pg1 | pr1 */ ;\
+TWO(PUNPCKHBW ( M00, MP2 )) /* pa2 | pb2 | pg2 | pr2 */ ;\
+ ;\
+ MOVQ ( MP1, MA1 ) /* start the alpha broadcast from a copy of p1 */ ;\
+TWO(MOVQ ( MP2, MA2 )) /* start the alpha broadcast from a copy of p2 */ ;\
+ ;\
+ PUNPCKHWD ( MA1, MA1 ) /* pa1 | pa1 | | */ ;\
+TWO(PUNPCKHWD ( MA2, MA2 )) /* pa2 | pa2 | | */ ;\
+ PUNPCKHDQ ( MA1, MA1 ) /* pa1 | pa1 | pa1 | pa1 */ ;\
+TWO(PUNPCKHDQ ( MA2, MA2 )) /* pa2 | pa2 | pa2 | pa2 */
- PSRLW ( CONST(8), MM3 ) /* t1 >> 8 */
+
+/* common blending storing code
+ */
+#define GMB_STORE(rgba, MA1, MA2) /* pack the 16-bit results back to bytes and write them to rgba */ \
+ PACKUSWB ( MA2, MA1 ) /* sa2 | sb2 | sg2 | sr2 | sa1 | sb1 | sg1 | sr1 */ ;\
+ /* with ONE() active the TWO() lines that set MA2 are dropped, and only the low four bytes are stored */ ;\
+ONE(MOVD ( MA1, REGIND(rgba) )) /* single pixel: low 32 bits only */ ;\
+TWO(MOVQ ( MA1, REGIND(rgba) )) /* two pixels: all 64 bits */
- PADDW ( MM3, MM2 ) /* t1 + (t1 >> 8) ~= (t1/255) << 8 */
-#if GMBT_GEOMETRIC_CORRECTION
- PSRLW ( CONST(7), MM3 ) /* t1 >> 15 */
+ SEG_TEXT
- PADDW ( MM3, MM2 ) /* t1 + (t1 >> 8) + (t1 >>15) ~= (t1/255) << 8 */
-#endif
-#endif
- PADDW ( MM1, MM2 ) /* (t1/255 + q1) << 8 */
+/* common transparency blending mode
+ */
- PSRLW ( CONST(8), MM2 ) /* sa1 | sb1 | sg1 | sr1 */
-
- PACKUSWB ( MM0, MM2 ) /* | | | | sa1 | sb1 | sg1 | sr1 */
- MOVD ( MM2, REGIND(EDI) )
+#define TAG(x) x##_transparency /* suffix pasted onto the function name and local labels in mmx_blendtmp.h */
-LLBL (GMBT_done):
+#define INIT /* one-time register setup, expanded by mmx_blendtmp.h before any MAIN expansion */ \
+ GMB_INIT( MM0 ) /* MM0 = 0, the zero source passed to GMB_LOAD as M00 */
- EMMS
+#define MAIN /* blend body; mmx_blendtmp.h expands it for the runin, the main loop and the runout */ \
+ GMB_LOAD( EDI, ESI, MM1, MM2, MM3, MM4, MM5, MM6, MM0) /* EDI = rgba, ESI = dest; MM1/MM4 = p, MM2/MM5 = q, MM3/MM6 = alpha */ ;\
+ GMB_LERP_GSC( MM1, MM2, MM3, MM4, MM5, MM6 ) /* blended result left in MM3 (and MM6 in two-pixel mode) */ ;\
+ GMB_STORE( EDI, MM3, MM6 ) /* write the result back over rgba */
-LLBL (GMBT_return):
+#include "mmx_blendtmp.h"
- POP_L ( EBX )
- POP_L ( EDI )
- POP_L ( ESI )
- MOV_L ( EBP, ESP )
- POP_L ( EBP )
- RET
diff --git a/src/mesa/x86/mmx_blendtmp.h b/src/mesa/x86/mmx_blendtmp.h
new file mode 100644
index 0000000000..395436ba01
--- /dev/null
+++ b/src/mesa/x86/mmx_blendtmp.h
@@ -0,0 +1,113 @@
+/*
+ * Written by José Fonseca <j_r_fonseca@yahoo.co.uk>
+ */
+
+
+/*
+ * void _mesa_mmx_blend( GLcontext *ctx,
+ * GLuint n,
+ * const GLubyte mask[],
+ * GLchan rgba[][4],
+ * CONST GLchan dest[][4] )
+ *
+ */
+ALIGNTEXT16
+GLOBL GLNAME( TAG(_mesa_mmx_blend) ) /* exported name gets the suffix chosen by the includer's TAG() */
+
+GLNAME( TAG(_mesa_mmx_blend) ):
+
+ PUSH_L ( EBP ) /* set up a frame so the arguments are reachable through EBP */
+ MOV_L ( ESP, EBP )
+ PUSH_L ( ESI ) /* ESI, EDI and EBX are saved here and restored before RET */
+ PUSH_L ( EDI )
+ PUSH_L ( EBX )
+
+ MOV_L ( REGOFF(12, EBP), ECX ) /* n */
+ CMP_L ( CONST(0), ECX)
+ JE ( LLBL ( TAG(GMB_return) ) ) /* n == 0: leave at once; EMMS is skipped since no MMX register was touched */
+
+ MOV_L ( REGOFF(16, EBP), EBX ) /* mask */
+ MOV_L ( REGOFF(20, EBP), EDI ) /* rgba */
+ MOV_L ( REGOFF(24, EBP), ESI ) /* dest */
+
+ INIT /* includer-supplied one-time register setup */
+
+ TEST_L ( CONST(4), EDI ) /* align rgba on an 8-byte boundary */
+ JZ ( LLBL ( TAG(GMB_align_end) ) ) /* bit 2 clear: no leading single pixel needed */
+
+ CMP_B ( CONST(0), REGIND(EBX) ) /* *mask == 0 */
+ JE ( LLBL ( TAG(GMB_align_continue) ) ) /* masked out: just step over the pixel */
+
+ /* runin: one pixel, so that the main loop starts with rgba 8-byte aligned */
+#define ONE(x) x /* keep single-pixel instructions */
+#define TWO(x) /* drop two-pixel instructions */
+ MAIN
+#undef ONE
+#undef TWO
+
+LLBL ( TAG(GMB_align_continue) ):
+
+ DEC_L ( ECX ) /* n -= 1 */
+ INC_L ( EBX ) /* mask += 1 */
+ ADD_L ( CONST(4), EDI ) /* rgba += 1 */
+ ADD_L ( CONST(4), ESI ) /* dest += 1 */
+
+LLBL ( TAG(GMB_align_end) ):
+
+ CMP_L ( CONST(2), ECX)
+ JB ( LLBL ( TAG(GMB_loop_end) ) ) /* fewer than two pixels left: skip the main loop */
+
+ALIGNTEXT16
+LLBL ( TAG(GMB_loop_begin) ):
+
+ CMP_W ( CONST(0), REGIND(EBX) ) /* *mask == 0 && *(mask + 1) == 0 */
+ JE ( LLBL ( TAG(GMB_loop_continue) ) ) /* both pixels masked out: skip the blend */
+
+ /* main loop: two pixels per iteration; NOTE(review): it runs when either mask byte is non-zero, so both pixels are blended even if only one is enabled */
+#define ONE(x) /* drop single-pixel instructions */
+#define TWO(x) x /* keep two-pixel instructions */
+ MAIN
+#undef ONE
+#undef TWO
+
+LLBL ( TAG(GMB_loop_continue) ):
+
+ DEC_L ( ECX )
+ DEC_L ( ECX ) /* n -= 2 */
+ ADD_L ( CONST(2), EBX ) /* mask += 2 */
+ ADD_L ( CONST(8), EDI ) /* rgba += 2 */
+ ADD_L ( CONST(8), ESI ) /* dest += 2 */
+ CMP_L ( CONST(2), ECX )
+ JAE ( LLBL ( TAG(GMB_loop_begin) ) ) /* at least two pixels left: iterate again */
+
+LLBL ( TAG(GMB_loop_end) ):
+
+ CMP_L ( CONST(1), ECX )
+ JB ( LLBL ( TAG(GMB_done) ) ) /* n == 0: no trailing pixel */
+
+ CMP_B ( CONST(0), REGIND(EBX) ) /* *mask == 0 */
+ JE ( LLBL ( TAG(GMB_done) ) ) /* trailing pixel masked out */
+
+ /* runout: the last remaining pixel */
+#define ONE(x) x /* keep single-pixel instructions */
+#define TWO(x) /* drop two-pixel instructions */
+ MAIN
+#undef ONE
+#undef TWO
+
+LLBL ( TAG(GMB_done) ):
+
+ EMMS /* leave MMX state before returning to the caller */
+
+LLBL ( TAG(GMB_return) ):
+
+ POP_L ( EBX ) /* restore in reverse order of the pushes above */
+ POP_L ( EDI )
+ POP_L ( ESI )
+ MOV_L ( EBP, ESP )
+ POP_L ( EBP )
+ RET
+
+#undef TAG /* the includer's three macros are consumed here, so the template can be included again with new ones */
+#undef INIT
+#undef MAIN