From bdd53efe8302e85fd1be4ceda0aa576e0119b14e Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Tue, 2 Nov 2004 18:25:45 +0000
Subject: Added MMX optimized version of the RGB565 ReadRGBASpan routine.

---
 src/mesa/x86/read_rgba_span_x86.S | 223 ++++++++++++++++++++++++++++++++++++++
 src/mesa/x86/read_rgba_span_x86.h |   3 +
 2 files changed, 226 insertions(+)

(limited to 'src/mesa/x86')

diff --git a/src/mesa/x86/read_rgba_span_x86.S b/src/mesa/x86/read_rgba_span_x86.S
index e637f22da3..06bdc6d264 100644
--- a/src/mesa/x86/read_rgba_span_x86.S
+++ b/src/mesa/x86/read_rgba_span_x86.S
@@ -451,3 +451,226 @@ _generic_read_RGBA_span_BGRA8888_REV_SSE2:
 	popl	%esi
 	ret
 	.size	_generic_read_RGBA_span_BGRA8888_REV_SSE2, .-_generic_read_RGBA_span_BGRA8888_REV_SSE2
+
+
+
+	.section	.rodata
+
+/* Constant tables for _generic_read_RGBA_span_RGB565_MMX.  Each table is
+ * one quadword holding four 16-bit lanes, lowest lane first.
+ */
+
+	.align	16
+
+/* Field masks of an RGB565 pixel: lane 0 keeps bits 15..11, lane 1 keeps
+ * bits 10..5, lane 2 keeps bits 4..0 and lane 3 keeps nothing.
+ */
+mask_565:
+	.word	0xf800, 0x07e0, 0x001f, 0x0000
+
+/* Setting SCALE_ADJUST to 5 gives a perfect match with the classic C
+ * implementation in Mesa.  Setting it to 0 is slightly faster, at a small
+ * cost in accuracy.
+ */
+#define SCALE_ADJUST	5
+
+#if SCALE_ADJUST != 5 && SCALE_ADJUST != 0
+#error SCALE_ADJUST must either be 5 or 0.
+#endif
+
+#if SCALE_ADJUST == 5
+/* Multiplying by these and shifting right by 5 leaves field maxima of
+ * 0x07c0, 0x03f0 and 0x01f0.
+ */
+prescale:
+	.word	0x0001, 0x0010, 0x0200, 0x0000
+
+/* (0x00ff0000 / maximum) + 1 for lanes 0 and 2; lane 1 is one higher. */
+scale:
+	.word	0x20e8, 0x40c5, 0x839d, 0x0000
+#else
+/* Multiplying by these leaves field maxima of 0xf800, 0xfc00 and 0xf800. */
+prescale:
+	.word	0x0001, 0x0020, 0x0800, 0x0000
+
+/* (0x00ff0000 / maximum) + 1 for each of the three color lanes. */
+scale:
+	.word	0x0108, 0x0104, 0x0108, 0x0000
+#endif
+
+/* Only lane 3 is non-zero: it supplies the constant 0xff alpha value. */
+alpha:	.word	0x0000, 0x0000, 0x0000, 0x00ff
+
+/**
+ * MMX optimized version of the RGB565 to RGBA copy routine.
+ *
+ * Stack arguments: 4(%esp) source pointer, 8(%esp) destination pointer,
+ * 12(%esp) number of pixels to copy.  Clobbers %eax, %ecx, %edx, %mm0,
+ * %mm2 through %mm7 and the flags.
+ *
+ * NOTE: pshufw and pmulhuw are not in the original MMX instruction set;
+ * they arrived with SSE (and the AMD extensions to MMX).
+ */
+
+	.text
+	.globl	_generic_read_RGBA_span_RGB565_MMX
+	.type	_generic_read_RGBA_span_RGB565_MMX, @function
+
+_generic_read_RGBA_span_RGB565_MMX:
+
+#ifdef USE_INNER_EMMS
+	emms
+#endif
+
+	movl	4(%esp), %eax	/* source pointer */
+	movl	8(%esp), %edx	/* destination pointer */
+	movl	12(%esp), %ecx	/* number of pixels to copy */
+
+	/* Keep all four constant tables in registers for the whole routine. */
+	movq	mask_565, %mm5
+	movq	prescale, %mm6
+	movq	scale, %mm7
+	movq	alpha, %mm3
+
+	/* %ecx = number of complete groups of 4 pixels.  shrl sets ZF from
+	 * its result and jmp does not touch the flags, so the jne at .L02
+	 * skips the loop when there is no complete group.
+	 */
+	shrl	$2, %ecx
+	jmp	.L02
+
+.L03:
+	/* Fetch 4 RGB565 pixels into %mm4.  Distribute the first and
+	 * second pixels into the four words of %mm0 and %mm2.
+	 */
+	movq	(%eax), %mm4
+	addl	$8, %eax
+
+	pshufw	$0x00, %mm4, %mm0
+	pshufw	$0x55, %mm4, %mm2
+
+	/* Mask the pixels so that each word of each register contains only
+	 * one color component.
+	 */
+	pand	%mm5, %mm0
+	pand	%mm5, %mm2
+
+	/* Adjust the component values so that they are as small as possible,
+	 * but large enough so that we can multiply them by an unsigned 16-bit
+	 * number and get a value as large as 0x00ff0000.
+	 */
+	pmullw	%mm6, %mm0
+	pmullw	%mm6, %mm2
+#if SCALE_ADJUST > 0
+	psrlw	$SCALE_ADJUST, %mm0
+	psrlw	$SCALE_ADJUST, %mm2
+#endif
+
+	/* Scale the input component values to be on the range
+	 * [0, 0x00ff0000].  This is the real magic of the whole routine:
+	 * pmulhuw keeps only the high 16 bits of each product, which is the
+	 * final 8-bit component value.
+	 */
+	pmulhuw	%mm7, %mm0
+	pmulhuw	%mm7, %mm2
+
+	/* Always set the alpha value to 0xff. */
+	por	%mm3, %mm0
+	por	%mm3, %mm2
+
+	/* Pack the 16-bit values to 8-bit values and store the converted
+	 * pixel data.
+	 */
+	packuswb	%mm2, %mm0
+	movq	%mm0, (%edx)
+	addl	$8, %edx
+
+	/* Same conversion for the third and fourth pixels of the group. */
+	pshufw	$0xaa, %mm4, %mm0
+	pshufw	$0xff, %mm4, %mm2
+
+	pand	%mm5, %mm0
+	pand	%mm5, %mm2
+	pmullw	%mm6, %mm0
+	pmullw	%mm6, %mm2
+#if SCALE_ADJUST > 0
+	psrlw	$SCALE_ADJUST, %mm0
+	psrlw	$SCALE_ADJUST, %mm2
+#endif
+	pmulhuw	%mm7, %mm0
+	pmulhuw	%mm7, %mm2
+
+	por	%mm3, %mm0
+	por	%mm3, %mm2
+
+	packuswb	%mm2, %mm0
+	movq	%mm0, (%edx)
+	addl	$8, %edx
+
+	subl	$1, %ecx
+.L02:
+	jne	.L03
+
+	/* At this point there can be at most 3 pixels left to process.  If
+	 * there are either 2 or 3 left, process 2.  The pixel count is
+	 * reloaded because %ecx was used as the loop counter.
+	 */
+	movl	12(%esp), %ecx
+	testl	$0x02, %ecx
+	je	.L04
+
+	movd	(%eax), %mm4
+	addl	$4, %eax
+
+	pshufw	$0x00, %mm4, %mm0
+	pshufw	$0x55, %mm4, %mm2
+
+	pand	%mm5, %mm0
+	pand	%mm5, %mm2
+	pmullw	%mm6, %mm0
+	pmullw	%mm6, %mm2
+#if SCALE_ADJUST > 0
+	psrlw	$SCALE_ADJUST, %mm0
+	psrlw	$SCALE_ADJUST, %mm2
+#endif
+	pmulhuw	%mm7, %mm0
+	pmulhuw	%mm7, %mm2
+
+	por	%mm3, %mm0
+	por	%mm3, %mm2
+
+	packuswb	%mm2, %mm0
+	movq	%mm0, (%edx)
+	addl	$8, %edx
+
+.L04:
+	/* At this point there can be at most 1 pixel left to process.
+	 * Process it if needed.
+	 */
+	testl	$0x01, %ecx
+	je	.L01
+
+	movzwl	(%eax), %ecx	/* zero-extend the last 16-bit pixel */
+	movd	%ecx, %mm4
+
+	pshufw	$0x00, %mm4, %mm0
+
+	pand	%mm5, %mm0
+	pmullw	%mm6, %mm0
+#if SCALE_ADJUST > 0
+	psrlw	$SCALE_ADJUST, %mm0
+#endif
+	pmulhuw	%mm7, %mm0
+
+	por	%mm3, %mm0
+
+	packuswb	%mm0, %mm0
+
+	movd	%mm0, (%edx)
+
+.L01:
+#ifdef USE_INNER_EMMS
+	emms
+#endif
+	ret
+	.size	_generic_read_RGBA_span_RGB565_MMX, .-_generic_read_RGBA_span_RGB565_MMX
diff --git a/src/mesa/x86/read_rgba_span_x86.h b/src/mesa/x86/read_rgba_span_x86.h
index 99dd0e365d..564b1bb0f9 100644
--- a/src/mesa/x86/read_rgba_span_x86.h
+++ b/src/mesa/x86/read_rgba_span_x86.h
@@ -48,6 +48,9 @@ extern void _generic_read_RGBA_span_BGRA8888_REV_SSE( const unsigned char *,
 #if defined(USE_MMX_ASM)
 extern void _generic_read_RGBA_span_BGRA8888_REV_MMX( const unsigned char *,
     unsigned char *, unsigned );
+
+extern void _generic_read_RGBA_span_RGB565_MMX( const unsigned char *,
+    unsigned char *, unsigned ); /* (source, destination, number of pixels) */
 #endif
 
 #endif /* READ_RGBA_SPAN_X86_H */
-- 
cgit v1.2.3