diff options
-rw-r--r-- | src/mesa/drivers/dri/common/spantmp2.h | 564 | ||||
-rw-r--r-- | src/mesa/drivers/dri/r128/r128_span.c | 105 | ||||
-rw-r--r-- | src/mesa/drivers/dri/r200/r200_span.c | 78 | ||||
-rw-r--r-- | src/mesa/drivers/dri/unichrome/via_span.c | 53 | ||||
-rw-r--r-- | src/mesa/sources | 1 | ||||
-rw-r--r-- | src/mesa/x86/read_rgba_span_x86.S | 453 | ||||
-rw-r--r-- | src/mesa/x86/read_rgba_span_x86.h | 53 |
7 files changed, 1114 insertions, 193 deletions
diff --git a/src/mesa/drivers/dri/common/spantmp2.h b/src/mesa/drivers/dri/common/spantmp2.h new file mode 100644 index 0000000000..ce0a66de70 --- /dev/null +++ b/src/mesa/drivers/dri/common/spantmp2.h @@ -0,0 +1,564 @@ +/* + * Copyright 2000-2001 VA Linux Systems, Inc. + * (C) Copyright IBM Corporation 2004 + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * VA LINUX SYSTEM, IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * \file spantmp2.h + * + * Template file of span read / write functions. + * + * \author Keith Whitwell <keithw@tungstengraphics.com> + * \author Gareth Hughes <gareth@nvidia.com> + * \author Ian Romanick <idr@us.ibm.com> + */ + +#include "colormac.h" + +#ifndef DBG +#define DBG 0 +#endif + +#ifndef HW_WRITE_LOCK +#define HW_WRITE_LOCK() HW_LOCK() +#endif + +#ifndef HW_WRITE_UNLOCK +#define HW_WRITE_UNLOCK() HW_UNLOCK() +#endif + +#ifndef HW_READ_LOCK +#define HW_READ_LOCK() HW_LOCK() +#endif + +#ifndef HW_READ_UNLOCK +#define HW_READ_UNLOCK() HW_UNLOCK() +#endif + +#ifndef HW_READ_CLIPLOOP +#define HW_READ_CLIPLOOP() HW_CLIPLOOP() +#endif + +#ifndef HW_WRITE_CLIPLOOP +#define HW_WRITE_CLIPLOOP() HW_CLIPLOOP() +#endif + +#if (SPANTMP_PIXEL_FMT == GL_RGB) && (SPANTMP_PIXEL_TYPE == GL_UNSIGNED_SHORT_5_6_5) + +#define INIT_MONO_PIXEL(p, color) \ + p = PACK_COLOR_565( color[0], color[1], color[2] ) + +#define WRITE_RGBA( _x, _y, r, g, b, a ) \ + do { \ + GLshort * _p = (GLshort *) GET_DST_PTR(_x, _y); \ + _p[0] = ((((int)r & 0xf8) << 8) | (((int)g & 0xfc) << 3) | \ + (((int)b & 0xf8) >> 3)); \ + } while(0) + +#define WRITE_PIXEL( _x, _y, p ) \ + do { \ + GLushort * _p = (GLushort *) GET_DST_PTR(_x, _y); \ + _p[0] = p; \ + } while(0) + +#define READ_RGBA( rgba, _x, _y ) \ + do { \ + GLushort p = *(volatile GLshort *) GET_SRC_PTR(_x, _y); \ + rgba[0] = ((p >> 8) & 0xf8) * 255 / 0xf8; \ + rgba[1] = ((p >> 3) & 0xfc) * 255 / 0xfc; \ + rgba[2] = ((p << 3) & 0xf8) * 255 / 0xf8; \ + rgba[3] = 0xff; \ + } while (0) + +#elif (SPANTMP_PIXEL_FMT == GL_BGRA) && (SPANTMP_PIXEL_TYPE == GL_UNSIGNED_INT_8_8_8_8_REV) + +# define INIT_MONO_PIXEL(p, color) \ + p = PACK_COLOR_8888(color[3], color[0], color[1], color[2]) + +# define WRITE_RGBA(_x, _y, r, g, b, a) \ + do { \ + GLuint * _p = (GLuint *) GET_DST_PTR(_x, _y); \ + _p[0] = ((r << 16) | (g << 8) | (b << 0) | (a << 24)); \ + } while(0) + +#define WRITE_PIXEL(_x, _y, p) \ + do { \ + GLuint * _p = (GLuint *) GET_DST_PTR(_x, _y); \ + _p[0] = p; \ + } while(0) + +# if defined( USE_X86_ASM ) +# define READ_RGBA(rgba, _x, _y) \ + do { \ + GLuint p = *(volatile GLuint *) GET_SRC_PTR(_x, _y); \ + __asm__ __volatile__( "bswap %0; rorl $8, %0" \ + : "=r" (p) : "r" (p) ); \ + ((GLuint *)rgba)[0] = p; \ + } while (0) +# else +# define READ_RGBA( rgba, _x, _y ) \ + do { \ + GLuint p = *(volatile GLuint *) GET_SRC_PTR(_x, _y); \ + rgba[0] = (p >> 16) & 0xff; \ + rgba[1] = (p >> 8) & 0xff; \ + rgba[2] = (p >> 0) & 0xff; \ + rgba[3] = (p >> 24) & 0xff; \ + } while (0) +# endif + +#else +#error SPANTMP_PIXEL_FMT must be set to a valid value! +#endif + +#if defined( USE_MMX_ASM ) || defined( USE_SSE_ASM ) +#include "x86/read_rgba_span_x86.h" +#include "x86/common_x86_asm.h" +#endif + +static void TAG(WriteRGBASpan)( const GLcontext *ctx, + GLuint n, GLint x, GLint y, + const GLubyte rgba[][4], + const GLubyte mask[] ) +{ + HW_WRITE_LOCK() + { + GLint x1; + GLint n1; + LOCAL_VARS; + + y = Y_FLIP(y); + + HW_WRITE_CLIPLOOP() + { + GLint i = 0; + CLIPSPAN(x,y,n,x1,n1,i); + + if (DBG) fprintf(stderr, "WriteRGBASpan %d..%d (x1 %d)\n", + (int)i, (int)n1, (int)x1); + + if (mask) + { + for (;n1>0;i++,x1++,n1--) + if (mask[i]) + WRITE_RGBA( x1, y, + rgba[i][0], rgba[i][1], + rgba[i][2], rgba[i][3] ); + } + else + { + for (;n1>0;i++,x1++,n1--) + WRITE_RGBA( x1, y, + rgba[i][0], rgba[i][1], + rgba[i][2], rgba[i][3] ); + } + } + HW_ENDCLIPLOOP(); + } + HW_WRITE_UNLOCK(); +} + +static void TAG(WriteRGBSpan)( const GLcontext *ctx, + GLuint n, GLint x, GLint y, + const GLubyte rgb[][3], + const GLubyte mask[] ) +{ + HW_WRITE_LOCK() + { + GLint x1; + GLint n1; + LOCAL_VARS; + + y = Y_FLIP(y); + + HW_WRITE_CLIPLOOP() + { + GLint i = 0; + CLIPSPAN(x,y,n,x1,n1,i); + + if (DBG) fprintf(stderr, "WriteRGBSpan %d..%d (x1 %d)\n", + (int)i, (int)n1, (int)x1); + + if (mask) + { + for (;n1>0;i++,x1++,n1--) + if (mask[i]) + WRITE_RGBA( x1, y, rgb[i][0], rgb[i][1], rgb[i][2], 255 ); + } + else + { + for (;n1>0;i++,x1++,n1--) + WRITE_RGBA( x1, y, rgb[i][0], rgb[i][1], rgb[i][2], 255 ); + } + } + HW_ENDCLIPLOOP(); + } + HW_WRITE_UNLOCK(); +} + +static void TAG(WriteRGBAPixels)( const GLcontext *ctx, + GLuint n, + const GLint x[], + const GLint y[], + const GLubyte rgba[][4], + const GLubyte mask[] ) +{ + HW_WRITE_LOCK() + { + GLint i; + LOCAL_VARS; + + if (DBG) fprintf(stderr, "WriteRGBAPixels\n"); + + HW_WRITE_CLIPLOOP() + { + if (mask) + { + for (i=0;i<n;i++) + { + if (mask[i]) { + const int fy = Y_FLIP(y[i]); + if (CLIPPIXEL(x[i],fy)) + WRITE_RGBA( x[i], fy, + rgba[i][0], rgba[i][1], + rgba[i][2], rgba[i][3] ); + } + } + } + else + { + for (i=0;i<n;i++) + { + const int fy = Y_FLIP(y[i]); + if (CLIPPIXEL(x[i],fy)) + WRITE_RGBA( x[i], fy, + rgba[i][0], rgba[i][1], + rgba[i][2], rgba[i][3] ); + } + } + } + HW_ENDCLIPLOOP(); + } + HW_WRITE_UNLOCK(); +} + + +static void TAG(WriteMonoRGBASpan)( const GLcontext *ctx, + GLuint n, GLint x, GLint y, + const GLchan color[4], + const GLubyte mask[] ) +{ + HW_WRITE_LOCK() + { + GLint x1; + GLint n1; + LOCAL_VARS; + INIT_MONO_PIXEL(p, color); + + y = Y_FLIP( y ); + + if (DBG) fprintf(stderr, "WriteMonoRGBASpan\n"); + + HW_WRITE_CLIPLOOP() + { + GLint i = 0; + CLIPSPAN(x,y,n,x1,n1,i); + if (mask) + { + for (;n1>0;i++,x1++,n1--) + if (mask[i]) + WRITE_PIXEL( x1, y, p ); + } + else + { + for (;n1>0;i++,x1++,n1--) + WRITE_PIXEL( x1, y, p ); + } + } + HW_ENDCLIPLOOP(); + } + HW_WRITE_UNLOCK(); +} + + +static void TAG(WriteMonoRGBAPixels)( const GLcontext *ctx, + GLuint n, + const GLint x[], const GLint y[], + const GLchan color[], + const GLubyte mask[] ) +{ + HW_WRITE_LOCK() + { + GLint i; + LOCAL_VARS; + INIT_MONO_PIXEL(p, color); + + if (DBG) fprintf(stderr, "WriteMonoRGBAPixels\n"); + + HW_WRITE_CLIPLOOP() + { + if (mask) + { + for (i=0;i<n;i++) + if (mask[i]) { + int fy = Y_FLIP(y[i]); + if (CLIPPIXEL( x[i], fy )) + WRITE_PIXEL( x[i], fy, p ); + } + } + else + { + for (i=0;i<n;i++) { + int fy = Y_FLIP(y[i]); + if (CLIPPIXEL( x[i], fy )) + WRITE_PIXEL( x[i], fy, p ); + } + } + } + HW_ENDCLIPLOOP(); + } + HW_WRITE_UNLOCK(); +} + + +static void TAG(ReadRGBASpan)( const GLcontext *ctx, + GLuint n, GLint x, GLint y, + GLubyte rgba[][4]) +{ + HW_READ_LOCK() + { + GLint x1,n1; + LOCAL_VARS; + + y = Y_FLIP(y); + + if (DBG) fprintf(stderr, "ReadRGBASpan\n"); + + HW_READ_CLIPLOOP() + { + GLint i = 0; + CLIPSPAN(x,y,n,x1,n1,i); + for (;n1>0;i++,x1++,n1--) + READ_RGBA( rgba[i], x1, y ); + } + HW_ENDCLIPLOOP(); + } + HW_READ_UNLOCK(); +} + + +#if defined(USE_MMX_ASM) +static void TAG2(ReadRGBASpan,_MMX)( const GLcontext *ctx, + GLuint n, GLint x, GLint y, + GLubyte rgba[][4]) +{ +#ifndef USE_INNER_EMMS + /* The EMMS instruction is directly in-lined here because using GCC's + * built-in _mm_empty function was found to utterly destroy performance. + */ + __asm__ __volatile__( "emms" ); +#endif + + HW_LOCK() + { + GLint x1,n1; + LOCAL_VARS; + + y = Y_FLIP(y); + + if (DBG) fprintf(stderr, "ReadRGBASpan\n"); + + HW_READ_CLIPLOOP() + { + GLint i = 0; + CLIPSPAN(x,y,n,x1,n1,i); + + { + const char * src = (read_buf + x1*4 + y*pitch); + _generic_read_RGBA_span_BGRA8888_REV_MMX( src, rgba[i], n1 ); + } + } + HW_ENDCLIPLOOP(); + } + HW_UNLOCK(); +#ifndef USE_INNER_EMMS + __asm__ __volatile__( "emms" ); +#endif +} +#endif + + +#if defined(USE_SSE_ASM) +static void TAG2(ReadRGBASpan,_SSE2)( const GLcontext *ctx, + GLuint n, GLint x, GLint y, + GLubyte rgba[][4]) +{ + HW_LOCK() + { + GLint x1,n1; + LOCAL_VARS; + + y = Y_FLIP(y); + + if (DBG) fprintf(stderr, "ReadRGBASpan\n"); + + HW_READ_CLIPLOOP() + { + GLint i = 0; + CLIPSPAN(x,y,n,x1,n1,i); + + { + const char * src = (read_buf + x1*4 + y*pitch); + _generic_read_RGBA_span_BGRA8888_REV_SSE2( src, rgba[i], n1 ); + } + } + HW_ENDCLIPLOOP(); + } + HW_UNLOCK(); +} +#endif + +#if defined(USE_SSE_ASM) +static void TAG2(ReadRGBASpan,_SSE)( const GLcontext *ctx, + GLuint n, GLint x, GLint y, + GLubyte rgba[][4]) +{ +#ifndef USE_INNER_EMMS + /* The EMMS instruction is directly in-lined here because using GCC's + * built-in _mm_empty function was found to utterly destroy performance. + */ + __asm__ __volatile__( "emms" ); +#endif + + HW_LOCK() + { + GLint x1,n1; + LOCAL_VARS; + + y = Y_FLIP(y); + + if (DBG) fprintf(stderr, "ReadRGBASpan\n"); + + HW_READ_CLIPLOOP() + { + GLint i = 0; + CLIPSPAN(x,y,n,x1,n1,i); + + { + const char * src = (read_buf + x1*4 + y*pitch); + _generic_read_RGBA_span_BGRA8888_REV_SSE( src, rgba[i], n1 ); + } + } + HW_ENDCLIPLOOP(); + } + HW_UNLOCK(); +#ifndef USE_INNER_EMMS + __asm__ __volatile__( "emms" ); +#endif +} +#endif + + +static void TAG(ReadRGBAPixels)( const GLcontext *ctx, + GLuint n, const GLint x[], const GLint y[], + GLubyte rgba[][4], const GLubyte mask[] ) +{ + HW_READ_LOCK() + { + GLint i; + LOCAL_VARS; + + if (DBG) fprintf(stderr, "ReadRGBAPixels\n"); + + HW_READ_CLIPLOOP() + { + if (mask) + { + for (i=0;i<n;i++) + if (mask[i]) { + int fy = Y_FLIP( y[i] ); + if (CLIPPIXEL( x[i], fy )) + READ_RGBA( rgba[i], x[i], fy ); + } + } + else + { + for (i=0;i<n;i++) { + int fy = Y_FLIP( y[i] ); + if (CLIPPIXEL( x[i], fy )) + READ_RGBA( rgba[i], x[i], fy ); + } + } + } + HW_ENDCLIPLOOP(); + } + HW_READ_UNLOCK(); +} + +static void TAG(InitPointers)(struct swrast_device_driver *swdd) +{ + swdd->WriteRGBASpan = TAG(WriteRGBASpan); + swdd->WriteRGBSpan = TAG(WriteRGBSpan); + swdd->WriteMonoRGBASpan = TAG(WriteMonoRGBASpan); + swdd->WriteRGBAPixels = TAG(WriteRGBAPixels); + swdd->WriteMonoRGBAPixels = TAG(WriteMonoRGBAPixels); + swdd->ReadRGBAPixels = TAG(ReadRGBAPixels); + +#if defined(USE_SSE_ASM) + if ( cpu_has_xmm2 ) { + if (DBG) fprintf( stderr, "Using %s version of ReadRGBASpan\n", "SSE2" ); + swdd->ReadRGBASpan = TAG2(ReadRGBASpan, _SSE2); + } + else +#endif +#if defined(USE_SSE_ASM) + if ( cpu_has_xmm ) { + if (DBG) fprintf( stderr, "Using %s version of ReadRGBASpan\n", "SSE" ); + swdd->ReadRGBASpan = TAG2(ReadRGBASpan, _SSE); + } + else +#endif +#if defined(USE_MMX_ASM) + if ( cpu_has_mmx ) { + if (DBG) fprintf( stderr, "Using %s version of ReadRGBASpan\n", "MMX" ); + swdd->ReadRGBASpan = TAG2(ReadRGBASpan, _MMX); + } + else +#endif + { + if (DBG) fprintf( stderr, "Using %s version of ReadRGBASpan\n", "C" ); + swdd->ReadRGBASpan = TAG(ReadRGBASpan); + } + +} + + +#undef INIT_MONO_PIXEL +#undef WRITE_PIXEL +#undef WRITE_RGBA +#undef READ_RGBA +#undef TAG +#undef TAG2 +#undef GET_SRC_PTR +#undef GET_DST_PTR +#undef SPANTMP_PIXEL_FMT +#undef SPANTMP_PIXEL_TYPE diff --git a/src/mesa/drivers/dri/r128/r128_span.c b/src/mesa/drivers/dri/r128/r128_span.c index db2ec44fbf..b169dc7509 100644 --- a/src/mesa/drivers/dri/r128/r128_span.c +++ b/src/mesa/drivers/dri/r128/r128_span.c @@ -125,85 +125,27 @@ USE OR OTHER DEALINGS IN THE SOFTWARE. /* 16 bit, RGB565 color spanline and pixel functions */ -#undef INIT_MONO_PIXEL -#define INIT_MONO_PIXEL(p, color) \ - p = R128PACKCOLOR565( color[0], color[1], color[2] ) +#define GET_SRC_PTR(_x, _y) (read_buf + _x * 2 + _y * pitch) +#define GET_DST_PTR(_x, _y) ( buf + _x * 2 + _y * pitch) +#define SPANTMP_PIXEL_FMT GL_RGB +#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_SHORT_5_6_5 -#define WRITE_RGBA( _x, _y, r, g, b, a ) \ - *(GLushort *)(buf + _x*2 + _y*pitch) = ((((int)r & 0xf8) << 8) | \ - (((int)g & 0xfc) << 3) | \ - (((int)b & 0xf8) >> 3)) +#define TAG(x) r128##x##_RGB565 +#define TAG2(x,y) r128##x##_RGB565##y +#include "spantmp2.h" -#define WRITE_PIXEL( _x, _y, p ) \ - *(GLushort *)(buf + _x*2 + _y*pitch) = p - -#define READ_RGBA( rgba, _x, _y ) \ - do { \ - GLushort p = *(GLushort *)(read_buf + _x*2 + _y*pitch); \ - rgba[0] = (p >> 8) & 0xf8; \ - rgba[1] = (p >> 3) & 0xfc; \ - rgba[2] = (p << 3) & 0xf8; \ - rgba[3] = 0xff; \ - if ( rgba[0] & 0x08 ) rgba[0] |= 0x07; \ - if ( rgba[1] & 0x04 ) rgba[1] |= 0x03; \ - if ( rgba[2] & 0x08 ) rgba[2] |= 0x07; \ - } while (0) - -#define TAG(x) r128##x##_RGB565 -#include "spantmp.h" - -#define READ_DEPTH(d, _x, _y) \ - d = *(GLushort *)(buf + _x*2 + _y*pitch) /* 32 bit, ARGB8888 color spanline and pixel functions */ -#undef INIT_MONO_PIXEL -#define INIT_MONO_PIXEL(p, color) \ - p = R128PACKCOLOR8888( color[0], color[1], color[2], color[3] ) - -#define WRITE_RGBA( _x, _y, r, g, b, a ) \ - *(GLuint *)(buf + _x*4 + _y*pitch) = ((b << 0) | \ - (g << 8) | \ - (r << 16) | \ - (a << 24) ) +#define GET_SRC_PTR(_x, _y) (read_buf + _x * 4 + _y * pitch) +#define GET_DST_PTR(_x, _y) ( buf + _x * 4 + _y * pitch) +#define SPANTMP_PIXEL_FMT GL_BGRA +#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_INT_8_8_8_8_REV -#define WRITE_PIXEL( _x, _y, p ) \ - *(GLuint *)(buf + _x*4 + _y*pitch) = p +#define TAG(x) r128##x##_ARGB8888 +#define TAG2(x,y) r128##x##_ARGB8888##y +#include "spantmp2.h" -#define READ_RGBA( rgba, _x, _y ) \ -do { \ - GLuint p = *(GLuint *)(read_buf + _x*4 + _y*pitch); \ - rgba[0] = (p >> 16) & 0xff; \ - rgba[1] = (p >> 8) & 0xff; \ - rgba[2] = (p >> 0) & 0xff; \ - rgba[3] = 0xff;/*(p >> 24) & 0xff;*/ \ -} while (0) - -#define TAG(x) r128##x##_ARGB8888 -#include "spantmp.h" - - -/* 24 bit, RGB888 color spanline and pixel functions */ -#undef INIT_MONO_PIXEL -#define INIT_MONO_PIXEL(p, color) \ - p = R128PACKCOLOR888( color[0], color[1], color[2] ) - -#define WRITE_RGBA(_x, _y, r, g, b, a) \ - *(GLuint *)(buf + _x*3 + _y*pitch) = ((r << 16) | \ - (g << 8) | \ - (b << 0)) - -#define WRITE_PIXEL(_x, _y, p) \ - *(GLuint *)(buf + _x*3 + _y*pitch) = p - -#define READ_RGBA(rgba, _x, _y) \ - do { \ - GLuint p = *(GLuint *)(read_buf + _x*3 + _y*pitch); \ - rgba[0] = (p >> 16) & 0xff; \ - rgba[1] = (p >> 8) & 0xff; \ - rgba[2] = (p >> 0) & 0xff; \ - rgba[3] = 0xff; \ - } while (0) /* ================================================================ * Depth buffer @@ -211,6 +153,9 @@ do { \ /* 16-bit depth buffer functions */ +#define READ_DEPTH(d, _x, _y) \ + d = *(GLushort *)(buf + _x*2 + _y*pitch) + #define WRITE_DEPTH_SPAN() \ r128WriteDepthSpanLocked( rmesa, n, \ x + dPriv->x, \ @@ -423,23 +368,11 @@ void r128DDInitSpanFuncs( GLcontext *ctx ) switch ( rmesa->r128Screen->cpp ) { case 2: - swdd->WriteRGBASpan = r128WriteRGBASpan_RGB565; - swdd->WriteRGBSpan = r128WriteRGBSpan_RGB565; - swdd->WriteMonoRGBASpan = r128WriteMonoRGBASpan_RGB565; - swdd->WriteRGBAPixels = r128WriteRGBAPixels_RGB565; - swdd->WriteMonoRGBAPixels = r128WriteMonoRGBAPixels_RGB565; - swdd->ReadRGBASpan = r128ReadRGBASpan_RGB565; - swdd->ReadRGBAPixels = r128ReadRGBAPixels_RGB565; + r128InitPointers_RGB565( swdd ); break; case 4: - swdd->WriteRGBASpan = r128WriteRGBASpan_ARGB8888; - swdd->WriteRGBSpan = r128WriteRGBSpan_ARGB8888; - swdd->WriteMonoRGBASpan = r128WriteMonoRGBASpan_ARGB8888; - swdd->WriteRGBAPixels = r128WriteRGBAPixels_ARGB8888; - swdd->WriteMonoRGBAPixels = r128WriteMonoRGBAPixels_ARGB8888; - swdd->ReadRGBASpan = r128ReadRGBASpan_ARGB8888; - swdd->ReadRGBAPixels = r128ReadRGBAPixels_ARGB8888; + r128InitPointers_ARGB8888( swdd ); break; default: diff --git a/src/mesa/drivers/dri/r200/r200_span.c b/src/mesa/drivers/dri/r200/r200_span.c index e1ad976ce3..efb5e6ea7c 100644 --- a/src/mesa/drivers/dri/r200/r200_span.c +++ b/src/mesa/drivers/dri/r200/r200_span.c @@ -122,61 +122,27 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. /* 16 bit, RGB565 color spanline and pixel functions */ -#define INIT_MONO_PIXEL(p, color) \ - p = PACK_COLOR_565( color[0], color[1], color[2] ) -#define WRITE_RGBA( _x, _y, r, g, b, a ) \ - *(GLushort *)(buf + _x*2 + _y*pitch) = ((((int)r & 0xf8) << 8) | \ - (((int)g & 0xfc) << 3) | \ - (((int)b & 0xf8) >> 3)) +#define GET_SRC_PTR(_x, _y) (read_buf + _x * 2 + _y * pitch) +#define GET_DST_PTR(_x, _y) ( buf + _x * 2 + _y * pitch) +#define SPANTMP_PIXEL_FMT GL_RGB +#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_SHORT_5_6_5 -#define WRITE_PIXEL( _x, _y, p ) \ - *(GLushort *)(buf + _x*2 + _y*pitch) = p - -#define READ_RGBA( rgba, _x, _y ) \ - do { \ - GLushort p = *(GLushort *)(read_buf + _x*2 + _y*pitch); \ - rgba[0] = ((p >> 8) & 0xf8) * 255 / 0xf8; \ - rgba[1] = ((p >> 3) & 0xfc) * 255 / 0xfc; \ - rgba[2] = ((p << 3) & 0xf8) * 255 / 0xf8; \ - rgba[3] = 0xff; \ - } while (0) - -#define TAG(x) r200##x##_RGB565 -#include "spantmp.h" +#define TAG(x) r200##x##_RGB565 +#define TAG2(x,y) r200##x##_RGB565##y +#include "spantmp2.h" /* 32 bit, ARGB8888 color spanline and pixel functions */ -#undef INIT_MONO_PIXEL -#define INIT_MONO_PIXEL(p, color) \ - p = PACK_COLOR_8888( color[3], color[0], color[1], color[2] ) - -#define WRITE_RGBA( _x, _y, r, g, b, a ) \ -do { \ - *(GLuint *)(buf + _x*4 + _y*pitch) = ((b << 0) | \ - (g << 8) | \ - (r << 16) | \ - (a << 24) ); \ -} while (0) - -#define WRITE_PIXEL( _x, _y, p ) \ -do { \ - *(GLuint *)(buf + _x*4 + _y*pitch) = p; \ -} while (0) - -#define READ_RGBA( rgba, _x, _y ) \ -do { \ - volatile GLuint *ptr = (volatile GLuint *)(read_buf + _x*4 + _y*pitch); \ - GLuint p = *ptr; \ - rgba[0] = (p >> 16) & 0xff; \ - rgba[1] = (p >> 8) & 0xff; \ - rgba[2] = (p >> 0) & 0xff; \ - rgba[3] = (p >> 24) & 0xff; \ -} while (0) -#define TAG(x) r200##x##_ARGB8888 -#include "spantmp.h" +#define GET_SRC_PTR(_x, _y) (read_buf + _x * 4 + _y * pitch) +#define GET_DST_PTR(_x, _y) ( buf + _x * 4 + _y * pitch) +#define SPANTMP_PIXEL_FMT GL_BGRA +#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_INT_8_8_8_8_REV +#define TAG(x) r200##x##_ARGB8888 +#define TAG2(x,y) r200##x##_ARGB8888##y +#include "spantmp2.h" /* ================================================================ @@ -380,23 +346,11 @@ void r200InitSpanFuncs( GLcontext *ctx ) switch ( rmesa->r200Screen->cpp ) { case 2: - swdd->WriteRGBASpan = r200WriteRGBASpan_RGB565; - swdd->WriteRGBSpan = r200WriteRGBSpan_RGB565; - swdd->WriteMonoRGBASpan = r200WriteMonoRGBASpan_RGB565; - swdd->WriteRGBAPixels = r200WriteRGBAPixels_RGB565; - swdd->WriteMonoRGBAPixels = r200WriteMonoRGBAPixels_RGB565; - swdd->ReadRGBASpan = r200ReadRGBASpan_RGB565; - swdd->ReadRGBAPixels = r200ReadRGBAPixels_RGB565; + r200InitPointers_RGB565( swdd ); break; case 4: - swdd->WriteRGBASpan = r200WriteRGBASpan_ARGB8888; - swdd->WriteRGBSpan = r200WriteRGBSpan_ARGB8888; - swdd->WriteMonoRGBASpan = r200WriteMonoRGBASpan_ARGB8888; - swdd->WriteRGBAPixels = r200WriteRGBAPixels_ARGB8888; - swdd->WriteMonoRGBAPixels = r200WriteMonoRGBAPixels_ARGB8888; - swdd->ReadRGBASpan = r200ReadRGBASpan_ARGB8888; - swdd->ReadRGBAPixels = r200ReadRGBAPixels_ARGB8888; + r200InitPointers_ARGB8888( swdd ); break; default: diff --git a/src/mesa/drivers/dri/unichrome/via_span.c b/src/mesa/drivers/dri/unichrome/via_span.c index 693b6de142..3a747a3d99 100644 --- a/src/mesa/drivers/dri/unichrome/via_span.c +++ b/src/mesa/drivers/dri/unichrome/via_span.c @@ -204,19 +204,6 @@ #undef LOCAL_VARS #undef LOCAL_DEPTH_VARS -/*=* [DBG] csmash : fix options worng position *=*/ -/*#define LOCAL_VARS \ - __DRIdrawablePrivate *dPriv = vmesa->driDrawable; \ - GLuint pitch = vmesa->drawPitch; \ - GLuint height = dPriv->h; \ - GLuint p; \ - char *buf = (char *)(vmesa->drawMap + \ - dPriv->x * 4 + \ - dPriv->y * pitch); \ - char *read_buf = (char *)(vmesa->readMap + \ - dPriv->x * 4 + \ - dPriv->y * pitch); \ - (void)read_buf; (void)buf; (void)p*/ #define LOCAL_VARS \ __DRIdrawablePrivate *dPriv = vmesa->driDrawable; \ GLuint pitch = vmesa->drawPitch; \ @@ -237,33 +224,15 @@ dPriv->y * pitch); \ } +#define GET_SRC_PTR(_x, _y) (read_buf + _x * 4 + _y * pitch) +#define GET_DST_PTR(_x, _y) ( buf + _x * 4 + _y * pitch) +#define SPANTMP_PIXEL_FMT GL_BGRA +#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_INT_8_8_8_8_REV -#undef INIT_MONO_PIXEL -#define INIT_MONO_PIXEL(p, color) \ - p = PACK_COLOR_8888(color[3], color[0], color[1], color[2]) - -#define WRITE_RGBA(_x, _y, r, g, b, a) \ - *(GLuint *)(buf + _x * 4 + _y * pitch) = ((r << 16) | \ - (g << 8) | \ - (b << 0) | \ - (a << 24)); - - -#define WRITE_PIXEL(_x, _y, p) \ - *(GLuint *)(buf + _x * 4 + _y * pitch) = p - -#define READ_RGBA(rgba, _x, _y) \ - do { \ - GLuint p = *(GLuint *)(read_buf + _x * 4 + _y * pitch); \ - rgba[0] = (p >> 16) & 0xff; \ - rgba[1] = (p >> 8) & 0xff; \ - rgba[2] = (p >> 0) & 0xff; \ - rgba[3] = 255; \ - } while (0) +#define TAG(x) via##x##_8888 +#define TAG2(x,y) via##x##_8888##y +#include "spantmp2.h" -#define TAG(x) via##x##_8888 -#include "spantmp.h" -/*#include "via_spantmp.h"*/ /* 16 bit depthbuffer functions. */ @@ -367,13 +336,7 @@ void viaInitSpanFuncs(GLcontext *ctx) swdd->ReadRGBAPixels = viaReadRGBAPixels_565; } else if (vmesa->viaScreen->bitsPerPixel == 0x20) { - swdd->WriteRGBASpan = viaWriteRGBASpan_8888; - swdd->WriteRGBSpan = viaWriteRGBSpan_8888; - swdd->WriteMonoRGBASpan = viaWriteMonoRGBASpan_8888; - swdd->WriteRGBAPixels = viaWriteRGBAPixels_8888; - swdd->WriteMonoRGBAPixels = viaWriteMonoRGBAPixels_8888; - swdd->ReadRGBASpan = viaReadRGBASpan_8888; - swdd->ReadRGBAPixels = viaReadRGBAPixels_8888; + viaInitPointers_8888( swdd ); } else ASSERT(0); diff --git a/src/mesa/sources b/src/mesa/sources index 7013182f6f..f82d344bf5 100644 --- a/src/mesa/sources +++ b/src/mesa/sources @@ -170,6 +170,7 @@ X86_SOURCES = \ x86/sse_xform3.S \ x86/sse_xform4.S \ x86/sse_normal.S \ + x86/read_rgba_span_x86.S \ tnl/t_vtx_x86_gcc.S X86_API = \ diff --git a/src/mesa/x86/read_rgba_span_x86.S b/src/mesa/x86/read_rgba_span_x86.S new file mode 100644 index 0000000000..e637f22da3 --- /dev/null +++ b/src/mesa/x86/read_rgba_span_x86.S @@ -0,0 +1,453 @@ +/* + * (C) Copyright IBM Corporation 2004 + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * \file read_rgba_span_x86.S + * Optimized routines to transfer pixel data from the framebuffer to a + * buffer in main memory. + * + * \author Ian Romanick <idr@us.ibm.com> + */ + + .file "read_rgba_span_x86.S" + .section .rodata + .align 16 + .type mask, @object + .size mask, 32 +mask: + .long 0xff00ff00 + .long 0xff00ff00 + .long 0xff00ff00 + .long 0xff00ff00 + .long 0x00ff0000 + .long 0x00ff0000 + .long 0x00ff0000 + .long 0x00ff0000 + + +/* I implemented these as macros because the appear in quite a few places, + * and I've tweaked them a number of times. I got tired of changing every + * place they appear. :) + */ + +#define DO_ONE_PIXEL() \ + movl (%ebx), %eax ; \ + addl $4, %ebx ; \ + bswap %eax /* ARGB -> BGRA */ ; \ + rorl $8, %eax /* BGRA -> ABGR */ ; \ + movl %eax, (%ecx) /* ABGR -> R, G, B, A */ ; \ + addl $4, %ecx + +#define DO_ONE_LAST_PIXEL() \ + movl (%ebx), %eax ; \ + bswap %eax /* ARGB -> BGRA */ ; \ + rorl $8, %eax /* BGRA -> ABGR */ ; \ + movl %eax, (%ecx) /* ABGR -> R, G, B, A */ ; \ + + +/** + * MMX optimized version of the BGRA8888_REV to RGBA copy routine. + * + * \warning + * This function assumes that the caller will issue the EMMS instruction + * at the correct places. + */ + +.globl _generic_read_RGBA_span_BGRA8888_REV_MMX + .type _generic_read_RGBA_span_BGRA8888_REV_MMX, @function +_generic_read_RGBA_span_BGRA8888_REV_MMX: + pushl %ebx + +#ifdef USE_INNER_EMMS + emms +#endif + movq mask, %mm1 + movq mask+16, %mm2 + + movl 8(%esp), %ebx /* source pointer */ + movl 16(%esp), %edx /* number of pixels to copy */ + movl 12(%esp), %ecx /* destination pointer */ + + testl %edx, %edx + je .L20 /* Bail if there's nothing to do. */ + + movl %ebx, %eax + + negl %eax + sarl $2, %eax + andl $1, %eax + je .L17 + + subl %eax, %edx + DO_ONE_PIXEL() +.L17: + + /* Would it be faster to unroll this loop once and process 4 pixels + * per pass, instead of just two? + */ + + movl %edx, %eax + shrl %eax + jmp .L18 +.L19: + movq (%ebx), %mm0 + addl $8, %ebx + + /* These 9 instructions do what PSHUFB (if there were such an + * instruction) could do in 1. :( + */ + + movq %mm0, %mm3 + movq %mm0, %mm4 + + pand %mm2, %mm3 + psllq $16, %mm4 + psrlq $16, %mm3 + pand %mm2, %mm4 + + pand %mm1, %mm0 + por %mm4, %mm3 + por %mm3, %mm0 + + movq %mm0, (%ecx) + addl $8, %ecx + subl $1, %eax +.L18: + jne .L19 + +#ifdef USE_INNER_EMMS + emms +#endif + + /* At this point there are either 1 or 0 pixels remaining to be + * converted. Convert the last pixel, if needed. + */ + + testl $1, %edx + je .L20 + + DO_ONE_LAST_PIXEL() + +.L20: + popl %ebx + ret + .size _generic_read_RGBA_span_BGRA8888_REV_MMX, .-_generic_read_RGBA_span_BGRA8888_REV_MMX + + +/** + * SSE optimized version of the BGRA8888_REV to RGBA copy routine. SSE + * instructions are only actually used to read data from the framebuffer. + * In practice, the speed-up is pretty small. + * + * \todo + * Do some more testing and determine if there's any reason to have this + * function in addition to the MMX version. + * + * \warning + * This function assumes that the caller will issue the EMMS instruction + * at the correct places. + */ + +.globl _generic_read_RGBA_span_BGRA8888_REV_SSE + .type _generic_read_RGBA_span_BGRA8888_REV_SSE, @function +_generic_read_RGBA_span_BGRA8888_REV_SSE: + pushl %esi + pushl %ebx + pushl %ebp + +#ifdef USE_INNER_EMMS + emms +#endif + movq mask, %mm1 + movq mask+16, %mm2 + + movl 16(%esp), %ebx /* source pointer */ + movl 24(%esp), %edx /* number of pixels to copy */ + movl 20(%esp), %ecx /* destination pointer */ + + movl %esp, %ebp + subl $16, %esp + andl $0xfffffff0, %esp + + movl %ebx, %eax + movl %edx, %esi + + negl %eax + andl $15, %eax + sarl $2, %eax + cmpl %edx, %eax + cmovle %eax, %esi + + subl %esi, %edx + + testl $1, %esi + je .L32 + + DO_ONE_PIXEL() +.L32: + + testl $2, %esi + je .L31 + + movq (%ebx), %mm0 + addl $8, %ebx + + movq %mm0, %mm3 + movq %mm0, %mm4 + + pand %mm2, %mm3 + psllq $16, %mm4 + psrlq $16, %mm3 + pand %mm2, %mm4 + + pand %mm1, %mm0 + por %mm4, %mm3 + por %mm3, %mm0 + + movq %mm0, (%ecx) + addl $8, %ecx +.L31: + + movl %edx, %eax + shrl $2, %eax + jmp .L33 +.L34: + movaps (%ebx), %xmm0 + addl $16, %ebx + + /* This would be so much better if we could just move directly from + * an SSE register to an MMX register. Unfortunately, that + * functionality wasn't introduced until SSE2 with the MOVDQ2Q + * instruction. + */ + + movaps %xmm0, (%esp) + movq (%esp), %mm0 + movq 8(%esp), %mm5 + + movq %mm0, %mm3 + movq %mm0, %mm4 + movq %mm5, %mm6 + movq %mm5, %mm7 + + pand %mm2, %mm3 + pand %mm2, %mm6 + + psllq $16, %mm4 + psllq $16, %mm7 + + psrlq $16, %mm3 + psrlq $16, %mm6 + + pand %mm2, %mm4 + pand %mm2, %mm7 + + pand %mm1, %mm0 + pand %mm1, %mm5 + + por %mm4, %mm3 + por %mm7, %mm6 + + por %mm3, %mm0 + por %mm6, %mm5 + + movq %mm0, (%ecx) + movq %mm5, 8(%ecx) + addl $16, %ecx + + subl $1, %eax +.L33: + jne .L34 + +#ifdef USE_INNER_EMMS + emms +#endif + movl %ebp, %esp + + /* At this point there are either [0, 3] pixels remaining to be + * converted. + */ + + testl $2, %edx + je .L36 + + movq (%ebx), %mm0 + addl $8, %ebx + + movq %mm0, %mm3 + movq %mm0, %mm4 + + pand %mm2, %mm3 + psllq $16, %mm4 + psrlq $16, %mm3 + pand %mm2, %mm4 + + pand %mm1, %mm0 + por %mm4, %mm3 + por %mm3, %mm0 + + movq %mm0, (%ecx) + addl $8, %ecx +.L36: + + testl $1, %edx + je .L35 + + DO_ONE_LAST_PIXEL() +.L35: + popl %ebp + popl %ebx + popl %esi + ret + .size _generic_read_RGBA_span_BGRA8888_REV_SSE, .-_generic_read_RGBA_span_BGRA8888_REV_SSE + + +/** + * SSE2 optimized version of the BGRA8888_REV to RGBA copy routine. + */ + + .text +.globl _generic_read_RGBA_span_BGRA8888_REV_SSE2 + .type _generic_read_RGBA_span_BGRA8888_REV_SSE2, @function +_generic_read_RGBA_span_BGRA8888_REV_SSE2: + pushl %esi + pushl %ebx + + movdqa mask, %xmm1 + movdqa mask+16, %xmm2 + + movl 12(%esp), %ebx /* source pointer */ + movl 20(%esp), %edx /* number of pixels to copy */ + movl 16(%esp), %ecx /* destination pointer */ + + movl %ebx, %eax + movl %edx, %esi + + /* If the source pointer isn't a multiple of 16 we have to process + * a few pixels the "slow" way to get the address aligned for + * the SSE fetch intsructions. + */ + + negl %eax + andl $15, %eax + sarl $2, %eax + + cmpl %edx, %eax + cmovbe %eax, %esi + subl %esi, %edx + + testl $1, %esi + je .L41 + + DO_ONE_PIXEL() +.L41: + testl $2, %esi + je .L40 + + movq (%ebx), %xmm0 + addl $8, %ebx + + movdqa %xmm0, %xmm3 + movdqa %xmm0, %xmm4 + andps %xmm1, %xmm0 + + andps %xmm2, %xmm3 + pslldq $2, %xmm4 + psrldq $2, %xmm3 + andps %xmm2, %xmm4 + + orps %xmm4, %xmm3 + orps %xmm3, %xmm0 + + movq %xmm0, (%ecx) + addl $8, %ecx +.L40: + + /* Would it be worth having a specialized version of this loop for + * the case where the destination is 16-byte aligned? That version + * would be identical except that it could use movedqa instead of + * movdqu. + */ + + movl %edx, %eax + shrl $2, %eax + jmp .L42 +.L43: + movdqa (%ebx), %xmm0 + addl $16, %ebx + + movdqa %xmm0, %xmm3 + movdqa %xmm0, %xmm4 + andps %xmm1, %xmm0 + + andps %xmm2, %xmm3 + pslldq $2, %xmm4 + psrldq $2, %xmm3 + andps %xmm2, %xmm4 + + orps %xmm4, %xmm3 + orps %xmm3, %xmm0 + + movdqu %xmm0, (%ecx) + addl $16, %ecx + subl $1, %eax +.L42: + jne .L43 + + + /* There may be upto 3 pixels remaining to be copied. Take care + * of them now. We do the 2 pixel case first because the data + * will be aligned. + */ + + testl $2, %edx + je .L47 + + movq (%ebx), %xmm0 + + movdqa %xmm0, %xmm3 + movdqa %xmm0, %xmm4 + andps %xmm1, %xmm0 + + andps %xmm2, %xmm3 + pslldq $2, %xmm4 + psrldq $2, %xmm3 + andps %xmm2, %xmm4 + + orps %xmm4, %xmm3 + orps %xmm3, %xmm0 + + movq %xmm0, (%ecx) +.L47: + + testl $1, %edx + je .L46 + + DO_ONE_LAST_PIXEL() +.L46: + + popl %ebx + popl %esi + ret + .size _generic_read_RGBA_span_BGRA8888_REV_SSE2, .-_generic_read_RGBA_span_BGRA8888_REV_SSE2 diff --git a/src/mesa/x86/read_rgba_span_x86.h b/src/mesa/x86/read_rgba_span_x86.h new file mode 100644 index 0000000000..99dd0e365d --- /dev/null +++ b/src/mesa/x86/read_rgba_span_x86.h @@ -0,0 +1,53 @@ +/* + * (C) Copyright IBM Corporation 2004 + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * \file read_rgba_span_x86.h + * + * \author Ian Romanick <idr@us.ibm.com> + */ + +#ifndef READ_RGBA_SPAN_X86_H +#define READ_RGBA_SPAN_X86_H + +#if defined(USE_SSE_ASM) || defined(USE_MMX_ASM) +#include "x86/common_x86_asm.h" +#endif + +#if defined(USE_SSE_ASM) +extern void _generic_read_RGBA_span_BGRA8888_REV_SSE2( const unsigned char *, + unsigned char *, unsigned ); +#endif + +#if defined(USE_SSE_ASM) +extern void _generic_read_RGBA_span_BGRA8888_REV_SSE( const unsigned char *, + unsigned char *, unsigned ); +#endif + +#if defined(USE_MMX_ASM) +extern void _generic_read_RGBA_span_BGRA8888_REV_MMX( const unsigned char *, + unsigned char *, unsigned ); +#endif + +#endif /* READ_RGBA_SPAN_X86_H */ |