Diffstat (limited to 'src/mesa/x86/read_rgba_span_x86.S')
-rw-r--r--  src/mesa/x86/read_rgba_span_x86.S  453
1 files changed, 453 insertions, 0 deletions
diff --git a/src/mesa/x86/read_rgba_span_x86.S b/src/mesa/x86/read_rgba_span_x86.S
new file mode 100644
index 0000000000..e637f22da3
--- /dev/null
+++ b/src/mesa/x86/read_rgba_span_x86.S
@@ -0,0 +1,453 @@
+/*
+ * (C) Copyright IBM Corporation 2004
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \file read_rgba_span_x86.S
+ * Optimized routines to transfer pixel data from the framebuffer to a
+ * buffer in main memory.
+ *
+ * \author Ian Romanick <idr@us.ibm.com>
+ */
+
+ .file "read_rgba_span_x86.S"
+ .section .rodata
+ .align 16
+ .type mask, @object
+ .size mask, 32
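+/* The first four dwords (0xff00ff00) keep the G and A bytes of each BGRA
+ * pixel, which already sit where RGBA expects them.  The last four dwords
+ * (0x00ff0000) isolate the byte-2 lane used below to exchange the R and B
+ * channels.
+ */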
+mask:
+ .long 0xff00ff00
+ .long 0xff00ff00
+ .long 0xff00ff00
+ .long 0xff00ff00
+ .long 0x00ff0000
+ .long 0x00ff0000
+ .long 0x00ff0000
+ .long 0x00ff0000
+
+
+/* I implemented these as macros because they appear in quite a few places,
+ * and I've tweaked them a number of times. I got tired of changing every
+ * place they appear. :)
+ */
+
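+/* Each pixel is converted with a byte rotate: the framebuffer dword
+ * 0xAARRGGBB (bytes B, G, R, A in memory) becomes 0xBBGGRRAA after BSWAP
+ * and 0xAABBGGRR after the 8-bit rotate, which is R, G, B, A in memory.
+ */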
+#define DO_ONE_PIXEL() \
+ movl (%ebx), %eax ; \
+ addl $4, %ebx ; \
+ bswap %eax /* ARGB -> BGRA */ ; \
+ rorl $8, %eax /* BGRA -> ABGR */ ; \
+ movl %eax, (%ecx) /* ABGR -> R, G, B, A */ ; \
+ addl $4, %ecx
+
+#define DO_ONE_LAST_PIXEL() \
+ movl (%ebx), %eax ; \
+ bswap %eax /* ARGB -> BGRA */ ; \
+ rorl $8, %eax /* BGRA -> ABGR */ ; \
+	movl	%eax, (%ecx) /* ABGR -> R, G, B, A */
+
+
+/**
+ * MMX optimized version of the BGRA8888_REV to RGBA copy routine.
+ *
+ * \warning
+ * This function assumes that the caller will issue the EMMS instruction
+ * at the correct places.
+ */
+
+.globl _generic_read_RGBA_span_BGRA8888_REV_MMX
+ .type _generic_read_RGBA_span_BGRA8888_REV_MMX, @function
+_generic_read_RGBA_span_BGRA8888_REV_MMX:
+ pushl %ebx
+
+#ifdef USE_INNER_EMMS
+ emms
+#endif
+ movq mask, %mm1
+ movq mask+16, %mm2
+
+ movl 8(%esp), %ebx /* source pointer */
+ movl 16(%esp), %edx /* number of pixels to copy */
+ movl 12(%esp), %ecx /* destination pointer */
+
+ testl %edx, %edx
+ je .L20 /* Bail if there's nothing to do. */
+
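+	/* %eax = 1 if the source is 4-byte but not 8-byte aligned.  In that
+	 * case one pixel is converted with the scalar code so that the movq
+	 * loads in the main loop stay 8-byte aligned.
+	 */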
+ movl %ebx, %eax
+
+ negl %eax
+ sarl $2, %eax
+ andl $1, %eax
+ je .L17
+
+ subl %eax, %edx
+ DO_ONE_PIXEL()
+.L17:
+
+ /* Would it be faster to unroll this loop once and process 4 pixels
+ * per pass, instead of just two?
+ */
+
+ movl %edx, %eax
+ shrl %eax
+ jmp .L18
+.L19:
+ movq (%ebx), %mm0
+ addl $8, %ebx
+
+ /* These 9 instructions do what PSHUFB (if there were such an
+ * instruction) could do in 1. :(
+ */
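+	/* Per dword: mm0 & mm1 keeps G and A in place, mm3 = (mm0 & mm2) >> 16
+	 * moves R from byte 2 down to byte 0, and mm4 = (mm0 << 16) & mm2
+	 * moves B from byte 0 up to byte 2.  OR-ing the three pieces yields
+	 * two RGBA pixels.
+	 */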
+
+ movq %mm0, %mm3
+ movq %mm0, %mm4
+
+ pand %mm2, %mm3
+ psllq $16, %mm4
+ psrlq $16, %mm3
+ pand %mm2, %mm4
+
+ pand %mm1, %mm0
+ por %mm4, %mm3
+ por %mm3, %mm0
+
+ movq %mm0, (%ecx)
+ addl $8, %ecx
+ subl $1, %eax
+.L18:
+ jne .L19
+
+#ifdef USE_INNER_EMMS
+ emms
+#endif
+
+ /* At this point there are either 1 or 0 pixels remaining to be
+ * converted. Convert the last pixel, if needed.
+ */
+
+ testl $1, %edx
+ je .L20
+
+ DO_ONE_LAST_PIXEL()
+
+.L20:
+ popl %ebx
+ ret
+ .size _generic_read_RGBA_span_BGRA8888_REV_MMX, .-_generic_read_RGBA_span_BGRA8888_REV_MMX
+
+
+/**
+ * SSE optimized version of the BGRA8888_REV to RGBA copy routine. SSE
+ * instructions are only actually used to read data from the framebuffer.
+ * In practice, the speed-up is pretty small.
+ *
+ * \todo
+ * Do some more testing and determine if there's any reason to have this
+ * function in addition to the MMX version.
+ *
+ * \warning
+ * This function assumes that the caller will issue the EMMS instruction
+ * at the correct places.
+ */
+
+.globl _generic_read_RGBA_span_BGRA8888_REV_SSE
+ .type _generic_read_RGBA_span_BGRA8888_REV_SSE, @function
+_generic_read_RGBA_span_BGRA8888_REV_SSE:
+ pushl %esi
+ pushl %ebx
+ pushl %ebp
+
+#ifdef USE_INNER_EMMS
+ emms
+#endif
+ movq mask, %mm1
+ movq mask+16, %mm2
+
+ movl 16(%esp), %ebx /* source pointer */
+ movl 24(%esp), %edx /* number of pixels to copy */
+ movl 20(%esp), %ecx /* destination pointer */
+
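+	/* Carve out a 16-byte aligned scratch slot on the stack.  MOVAPS
+	 * requires an aligned address, and the slot is used below to spill
+	 * %xmm0 so that its halves can be reloaded into MMX registers.
+	 */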
+ movl %esp, %ebp
+ subl $16, %esp
+ andl $0xfffffff0, %esp
+
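+	/* Count the leading pixels (0 to 3) that must be converted before the
+	 * source pointer reaches a 16-byte boundary, clamp that to the total
+	 * pixel count, and handle them below as an optional single pixel plus
+	 * an optional pair.
+	 */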
+ movl %ebx, %eax
+ movl %edx, %esi
+
+ negl %eax
+ andl $15, %eax
+ sarl $2, %eax
+ cmpl %edx, %eax
+ cmovle %eax, %esi
+
+ subl %esi, %edx
+
+ testl $1, %esi
+ je .L32
+
+ DO_ONE_PIXEL()
+.L32:
+
+ testl $2, %esi
+ je .L31
+
+ movq (%ebx), %mm0
+ addl $8, %ebx
+
+ movq %mm0, %mm3
+ movq %mm0, %mm4
+
+ pand %mm2, %mm3
+ psllq $16, %mm4
+ psrlq $16, %mm3
+ pand %mm2, %mm4
+
+ pand %mm1, %mm0
+ por %mm4, %mm3
+ por %mm3, %mm0
+
+ movq %mm0, (%ecx)
+ addl $8, %ecx
+.L31:
+
+ movl %edx, %eax
+ shrl $2, %eax
+ jmp .L33
+.L34:
+ movaps (%ebx), %xmm0
+ addl $16, %ebx
+
+ /* This would be so much better if we could just move directly from
+ * an SSE register to an MMX register. Unfortunately, that
+ * functionality wasn't introduced until SSE2 with the MOVDQ2Q
+ * instruction.
+ */
+
+ movaps %xmm0, (%esp)
+ movq (%esp), %mm0
+ movq 8(%esp), %mm5
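+	/* %mm0 now holds pixels 0-1 and %mm5 pixels 2-3; each pair is
+	 * shuffled exactly as in the MMX routine above.
+	 */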
+
+ movq %mm0, %mm3
+ movq %mm0, %mm4
+ movq %mm5, %mm6
+ movq %mm5, %mm7
+
+ pand %mm2, %mm3
+ pand %mm2, %mm6
+
+ psllq $16, %mm4
+ psllq $16, %mm7
+
+ psrlq $16, %mm3
+ psrlq $16, %mm6
+
+ pand %mm2, %mm4
+ pand %mm2, %mm7
+
+ pand %mm1, %mm0
+ pand %mm1, %mm5
+
+ por %mm4, %mm3
+ por %mm7, %mm6
+
+ por %mm3, %mm0
+ por %mm6, %mm5
+
+ movq %mm0, (%ecx)
+ movq %mm5, 8(%ecx)
+ addl $16, %ecx
+
+ subl $1, %eax
+.L33:
+ jne .L34
+
+#ifdef USE_INNER_EMMS
+ emms
+#endif
+ movl %ebp, %esp
+
+	/* At this point there are between 0 and 3 pixels remaining to be
+	 * converted.
+	 */
+
+ testl $2, %edx
+ je .L36
+
+ movq (%ebx), %mm0
+ addl $8, %ebx
+
+ movq %mm0, %mm3
+ movq %mm0, %mm4
+
+ pand %mm2, %mm3
+ psllq $16, %mm4
+ psrlq $16, %mm3
+ pand %mm2, %mm4
+
+ pand %mm1, %mm0
+ por %mm4, %mm3
+ por %mm3, %mm0
+
+ movq %mm0, (%ecx)
+ addl $8, %ecx
+.L36:
+
+ testl $1, %edx
+ je .L35
+
+ DO_ONE_LAST_PIXEL()
+.L35:
+ popl %ebp
+ popl %ebx
+ popl %esi
+ ret
+ .size _generic_read_RGBA_span_BGRA8888_REV_SSE, .-_generic_read_RGBA_span_BGRA8888_REV_SSE
+
+
+/**
+ * SSE2 optimized version of the BGRA8888_REV to RGBA copy routine.
+ */
+
+ .text
+.globl _generic_read_RGBA_span_BGRA8888_REV_SSE2
+ .type _generic_read_RGBA_span_BGRA8888_REV_SSE2, @function
+_generic_read_RGBA_span_BGRA8888_REV_SSE2:
+ pushl %esi
+ pushl %ebx
+
+ movdqa mask, %xmm1
+ movdqa mask+16, %xmm2
+
+ movl 12(%esp), %ebx /* source pointer */
+ movl 20(%esp), %edx /* number of pixels to copy */
+ movl 16(%esp), %ecx /* destination pointer */
+
+ movl %ebx, %eax
+ movl %edx, %esi
+
+	/* If the source pointer isn't a multiple of 16, we have to process
+	 * a few pixels the "slow" way to get the address aligned for
+	 * the SSE fetch instructions.
+ */
+
+ negl %eax
+ andl $15, %eax
+ sarl $2, %eax
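+	/* %eax = number of pixels (0 to 3) to convert before the source
+	 * pointer is 16-byte aligned.
+	 */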
+
+ cmpl %edx, %eax
+ cmovbe %eax, %esi
+ subl %esi, %edx
+
+ testl $1, %esi
+ je .L41
+
+ DO_ONE_PIXEL()
+.L41:
+ testl $2, %esi
+ je .L40
+
+ movq (%ebx), %xmm0
+ addl $8, %ebx
+
+ movdqa %xmm0, %xmm3
+ movdqa %xmm0, %xmm4
+ andps %xmm1, %xmm0
+
+ andps %xmm2, %xmm3
+ pslldq $2, %xmm4
+ psrldq $2, %xmm3
+ andps %xmm2, %xmm4
+
+ orps %xmm4, %xmm3
+ orps %xmm3, %xmm0
+
+ movq %xmm0, (%ecx)
+ addl $8, %ecx
+.L40:
+
+ /* Would it be worth having a specialized version of this loop for
+ * the case where the destination is 16-byte aligned? That version
+	 * would be identical except that it could use movdqa instead of
+ * movdqu.
+ */
+
+ movl %edx, %eax
+ shrl $2, %eax
+ jmp .L42
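+	/* Main loop: four pixels per iteration.  The shuffle is the same as
+	 * in the MMX routine, but done 128 bits at a time with the byte-wise
+	 * PSLLDQ/PSRLDQ shifts standing in for PSLLQ/PSRLQ.
+	 */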
+.L43:
+ movdqa (%ebx), %xmm0
+ addl $16, %ebx
+
+ movdqa %xmm0, %xmm3
+ movdqa %xmm0, %xmm4
+ andps %xmm1, %xmm0
+
+ andps %xmm2, %xmm3
+ pslldq $2, %xmm4
+ psrldq $2, %xmm3
+ andps %xmm2, %xmm4
+
+ orps %xmm4, %xmm3
+ orps %xmm3, %xmm0
+
+ movdqu %xmm0, (%ecx)
+ addl $16, %ecx
+ subl $1, %eax
+.L42:
+ jne .L43
+
+
+	/* There may be up to 3 pixels remaining to be copied.  Take care
+ * of them now. We do the 2 pixel case first because the data
+ * will be aligned.
+ */
+
+ testl $2, %edx
+ je .L47
+
+ movq (%ebx), %xmm0
+
+ movdqa %xmm0, %xmm3
+ movdqa %xmm0, %xmm4
+ andps %xmm1, %xmm0
+
+ andps %xmm2, %xmm3
+ pslldq $2, %xmm4
+ psrldq $2, %xmm3
+ andps %xmm2, %xmm4
+
+ orps %xmm4, %xmm3
+ orps %xmm3, %xmm0
+
+ movq %xmm0, (%ecx)
+.L47:
+
+ testl $1, %edx
+ je .L46
+
+ DO_ONE_LAST_PIXEL()
+.L46:
+
+ popl %ebx
+ popl %esi
+ ret
+ .size _generic_read_RGBA_span_BGRA8888_REV_SSE2, .-_generic_read_RGBA_span_BGRA8888_REV_SSE2