diff options
-rw-r--r-- | docs/README.3DFX | 10 | ||||
-rw-r--r-- | src/mesa/Makefile.mgw | 7 | ||||
-rw-r--r-- | src/mesa/tnl/t_vtx_api.c | 16 | ||||
-rw-r--r-- | src/mesa/tnl/t_vtx_api.h | 2 | ||||
-rw-r--r-- | src/mesa/tnl/t_vtx_x86.c | 156 | ||||
-rw-r--r-- | src/mesa/tnl/t_vtx_x86_gcc.S | 573 | ||||
-rw-r--r-- | src/mesa/x86/assyntax.h | 8 |
7 files changed, 478 insertions, 294 deletions
diff --git a/docs/README.3DFX b/docs/README.3DFX index e70aea7de9..15888c30d9 100644 --- a/docs/README.3DFX +++ b/docs/README.3DFX @@ -3,7 +3,7 @@ -Mesa-6.0 release notes: +Mesa-6.1 release notes: ----------------------- 1) Glide2 support has been ceased; in order to keep Voodoo Rush @@ -26,7 +26,7 @@ DOS (DJGPP), Windows9x/2k (MinGW/MSVC), Linux How to compile: --------------- -DJGPP/MinGW/MSVC: +DJGPP/MinGW: Place the Glide3 SDK in the top Mesa directory: $(MESA)/glide3/include/*.h $(MESA)/glide3/lib/ @@ -35,11 +35,9 @@ DJGPP/MinGW/MSVC: Required libraries: OS specific Type: - make -f Makefile.DJ HAVE_MMX=1 HAVE_3DNOW=1 FX=1 + make -f Makefile.DJ X86=1 FX=1 or - make -f Makefile.mgw HAVE_MMX=1 HAVE_3DNOW=1 FX=1 - or - nmake -f Makefile.wfx + make -f Makefile.mgw X86=1 FX=1 Look into the corresponding makefiles for further information. Linux: diff --git a/src/mesa/Makefile.mgw b/src/mesa/Makefile.mgw index 9084bf478b..a209e582d9 100644 --- a/src/mesa/Makefile.mgw +++ b/src/mesa/Makefile.mgw @@ -134,7 +134,7 @@ x86/matypes.h: x86/gen_matypes.exe x86/gen_matypes.exe: x86/gen_matypes.c $(CC) -o $@ $(CFLAGS) -s $< -# [dBorca] Hack alert: +# [dBorca] # glapi_x86.S needs some adjustments # in order to generate correct entrypoints x86/glapi_x86.o: x86/glapi_x86.S @@ -142,6 +142,11 @@ x86/glapi_x86.o: x86/glapi_x86.S #main/dispatch.o: main/dispatch.c # $(CC) -o $@ $(CFLAGS) -UUSE_X86_ASM -c $< +# [dBorca] +# if we want codegen, we have to stdcall +tnl/t_vtx_x86_gcc.o: tnl/t_vtx_x86_gcc.S + $(CC) -o $@ $(CFLAGS) -DSTDCALL_API -c $< + clean: -$(call UNLINK,array_cache/*.o) -$(call UNLINK,glapi/*.o) diff --git a/src/mesa/tnl/t_vtx_api.c b/src/mesa/tnl/t_vtx_api.c index 0e28c73910..24ced9da18 100644 --- a/src/mesa/tnl/t_vtx_api.c +++ b/src/mesa/tnl/t_vtx_api.c @@ -102,8 +102,10 @@ static void _tnl_wrap_buffers( GLcontext *ctx ) /* Deal with buffer wrapping where provoked by the vertex buffer * filling up, as opposed to upgrade_vertex(). 
+ * + * Make it GLAPIENTRY, so we can tail from the codegen'ed Vertex*fv */ -void _tnl_wrap_filled_vertex( GLcontext *ctx ) +void GLAPIENTRY _tnl_wrap_filled_vertex( GLcontext *ctx ) { TNLcontext *tnl = TNL_CONTEXT(ctx); GLfloat *data = tnl->vtx.copied.buffer; @@ -403,7 +405,7 @@ static attrfv_func do_choose( GLuint attr, GLuint sz ) /* Try to use codegen: - */ + */ #ifdef USE_X86_ASM if (tnl->AllowCodegen) tnl->vtx.tabfv[attr][sz-1] = do_codegen( ctx, attr, sz ); @@ -473,11 +475,15 @@ static void reset_attrfv( TNLcontext *tnl ) for (i = 0 ; i < _TNL_ATTRIB_MAX ; i++) if (tnl->vtx.attrsz[i]) { - GLuint j = tnl->vtx.attrsz[i] - 1; + GLint j = tnl->vtx.attrsz[i] - 1; tnl->vtx.attrsz[i] = 0; - if (i < _TNL_MAX_ATTR_CODEGEN) - tnl->vtx.tabfv[i][j] = choose[i][j]; + if (i < _TNL_MAX_ATTR_CODEGEN) { + while (j >= 0) { + tnl->vtx.tabfv[i][j] = choose[i][j]; + j--; + } + } } tnl->vtx.vertex_size = 0; diff --git a/src/mesa/tnl/t_vtx_api.h b/src/mesa/tnl/t_vtx_api.h index f58461332e..97d5cf5563 100644 --- a/src/mesa/tnl/t_vtx_api.h +++ b/src/mesa/tnl/t_vtx_api.h @@ -49,7 +49,7 @@ extern void _tnl_vtx_destroy( GLcontext *ctx ); extern void _tnl_FlushVertices( GLcontext *ctx, GLuint flags ); extern void _tnl_flush_vtx( GLcontext *ctx ); -extern void _tnl_wrap_filled_vertex( GLcontext *ctx ); +extern void GLAPIENTRY _tnl_wrap_filled_vertex( GLcontext *ctx ); /* t_vtx_exec.c: */ diff --git a/src/mesa/tnl/t_vtx_x86.c b/src/mesa/tnl/t_vtx_x86.c index 5aafedebbe..5d7f95e98b 100644 --- a/src/mesa/tnl/t_vtx_x86.c +++ b/src/mesa/tnl/t_vtx_x86.c @@ -60,22 +60,25 @@ EXTERN( _tnl_x86_Vertex2fv ); EXTERN( _tnl_x86_Vertex3fv ); EXTERN( _tnl_x86_Vertex4fv ); -EXTERN( _tnl_x86_dispatch_attrf ); +EXTERN( _tnl_x86_dispatch_attrf1 ); +EXTERN( _tnl_x86_dispatch_attrf2 ); +EXTERN( _tnl_x86_dispatch_attrf3 ); +EXTERN( _tnl_x86_dispatch_attrf4 ); EXTERN( _tnl_x86_dispatch_attrfv ); -EXTERN( _tnl_x86_dispatch_multitexcoordf ); +EXTERN( _tnl_x86_dispatch_multitexcoordf1 ); +EXTERN( 
_tnl_x86_dispatch_multitexcoordf2 ); +EXTERN( _tnl_x86_dispatch_multitexcoordf3 ); +EXTERN( _tnl_x86_dispatch_multitexcoordf4 ); EXTERN( _tnl_x86_dispatch_multitexcoordfv ); -EXTERN( _tnl_x86_dispatch_vertexattribf ); +EXTERN( _tnl_x86_dispatch_vertexattribf1 ); +EXTERN( _tnl_x86_dispatch_vertexattribf2 ); +EXTERN( _tnl_x86_dispatch_vertexattribf3 ); +EXTERN( _tnl_x86_dispatch_vertexattribf4 ); EXTERN( _tnl_x86_dispatch_vertexattribfv ); EXTERN( _tnl_x86_choose_fv ); -static void notify( void ) -{ - GET_CURRENT_CONTEXT( ctx ); - _tnl_wrap_filled_vertex( ctx ); -} - #define DONT_KNOW_OFFSETS 1 @@ -93,7 +96,7 @@ static void notify( void ) #define FIXUP( CODE, KNOWN_OFFSET, CHECKVAL, NEWVAL ) \ do { \ - GLuint subst = 0x10101010 + CHECKVAL; \ + GLint subst = 0x10101010 + CHECKVAL; \ \ if (DONT_KNOW_OFFSETS) { \ while (*(int *)(CODE+offset) != subst) offset++; \ @@ -112,7 +115,7 @@ do { \ #define FIXUPREL( CODE, KNOWN_OFFSET, CHECKVAL, NEWVAL )\ do { \ - GLuint subst = 0x10101010 + CHECKVAL; \ + GLint subst = 0x10101010 + CHECKVAL; \ \ if (DONT_KNOW_OFFSETS) { \ while (*(int *)(CODE+offset) != subst) offset++; \ @@ -262,53 +265,16 @@ void _tnl_InitX86Codegen( struct _tnl_dynfn_generators *gen ) } -static attrfv_func -_do_choose( GLuint attr, GLuint sz ) -{ - return NULL; -} - - -/* I purposely avoided one single macro, since they might need to be - * handled in different ways. Ohwell, once things get much clearer, - * they could collapse... 
- */ -#define MAKE_DISPATCH_ATTR(FUNC, SIZE, TYPE, ATTR) \ +#define MKDISP(FUNC, SIZE, ATTR, WARP) \ do { \ char *code; \ - char *start = (char *)&_tnl_x86_dispatch_attr##TYPE; \ - char *end = (char *)&_tnl_x86_dispatch_attr##TYPE##_end; \ + char *start = (char *)&WARP; \ + char *end = (char *)&WARP##_end; \ int offset = 0; \ code = ALIGN_MALLOC( end - start, 16 ); \ memcpy (code, start, end - start); \ FIXUP(code, 0, 0, (int)&(TNL_CONTEXT(ctx)->vtx.tabfv[ATTR][SIZE-1]));\ - vfmt->FUNC##SIZE##TYPE = code; \ -} while (0) - - -#define MAKE_DISPATCH_MULTITEXCOORD(FUNC, SIZE, TYPE, ATTR) \ -do { \ - char *code; \ - char *start = (char *)&_tnl_x86_dispatch_multitexcoord##TYPE; \ - char *end = (char *)&_tnl_x86_dispatch_multitexcoord##TYPE##_end; \ - int offset = 0; \ - code = ALIGN_MALLOC( end - start, 16 ); \ - memcpy (code, start, end - start); \ - FIXUP(code, 0, 0, (int)&(TNL_CONTEXT(ctx)->vtx.tabfv[_TNL_ATTRIB_TEX0][SIZE-1]));\ - vfmt->FUNC##SIZE##TYPE##ARB = code; \ -} while (0) - - -#define MAKE_DISPATCH_VERTEXATTRIB(FUNC, SIZE, TYPE, ATTR) \ -do { \ - char *code; \ - char *start = (char *)&_tnl_x86_dispatch_vertexattrib##TYPE; \ - char *end = (char *)&_tnl_x86_dispatch_vertexattrib##TYPE##_end; \ - int offset = 0; \ - code = ALIGN_MALLOC( end - start, 16 ); \ - memcpy (code, start, end - start); \ - FIXUP(code, 0, 0, (int)&(TNL_CONTEXT(ctx)->vtx.tabfv[0][SIZE-1])); \ - vfmt->FUNC##SIZE##TYPE##NV = code; \ + *(void **)&vfmt->FUNC = code; \ } while (0) @@ -319,48 +285,48 @@ void _tnl_x86_exec_vtxfmt_init( GLcontext *ctx ) { GLvertexformat *vfmt = &(TNL_CONTEXT(ctx)->exec_vtxfmt); - MAKE_DISPATCH_ATTR(Color,3,f, _TNL_ATTRIB_COLOR0); - MAKE_DISPATCH_ATTR(Color,3,fv, _TNL_ATTRIB_COLOR0); - MAKE_DISPATCH_ATTR(Color,4,f, _TNL_ATTRIB_COLOR0); - MAKE_DISPATCH_ATTR(Color,4,fv, _TNL_ATTRIB_COLOR0); -/* vfmt->FogCoordfEXT = _tnl_FogCoordfEXT; - vfmt->FogCoordfvEXT = _tnl_FogCoordfvEXT;*/ - MAKE_DISPATCH_ATTR(Normal,3,f, _TNL_ATTRIB_NORMAL); - MAKE_DISPATCH_ATTR(Normal,3,fv, 
_TNL_ATTRIB_NORMAL); -/* vfmt->SecondaryColor3fEXT = _tnl_SecondaryColor3fEXT; - vfmt->SecondaryColor3fvEXT = _tnl_SecondaryColor3fvEXT; */ - MAKE_DISPATCH_ATTR(TexCoord,1,f, _TNL_ATTRIB_TEX0); - MAKE_DISPATCH_ATTR(TexCoord,1,fv, _TNL_ATTRIB_TEX0); - MAKE_DISPATCH_ATTR(TexCoord,2,f, _TNL_ATTRIB_TEX0); - MAKE_DISPATCH_ATTR(TexCoord,2,fv, _TNL_ATTRIB_TEX0); - MAKE_DISPATCH_ATTR(TexCoord,3,f, _TNL_ATTRIB_TEX0); - MAKE_DISPATCH_ATTR(TexCoord,3,fv, _TNL_ATTRIB_TEX0); - MAKE_DISPATCH_ATTR(TexCoord,4,f, _TNL_ATTRIB_TEX0); - MAKE_DISPATCH_ATTR(TexCoord,4,fv, _TNL_ATTRIB_TEX0); - MAKE_DISPATCH_ATTR(Vertex,2,f, _TNL_ATTRIB_POS); - MAKE_DISPATCH_ATTR(Vertex,2,fv, _TNL_ATTRIB_POS); - MAKE_DISPATCH_ATTR(Vertex,3,f, _TNL_ATTRIB_POS); - MAKE_DISPATCH_ATTR(Vertex,3,fv, _TNL_ATTRIB_POS); - MAKE_DISPATCH_ATTR(Vertex,4,f, _TNL_ATTRIB_POS); - MAKE_DISPATCH_ATTR(Vertex,4,fv, _TNL_ATTRIB_POS); - - MAKE_DISPATCH_MULTITEXCOORD(MultiTexCoord,1,f, 0); - MAKE_DISPATCH_MULTITEXCOORD(MultiTexCoord,1,fv, 0); - MAKE_DISPATCH_MULTITEXCOORD(MultiTexCoord,2,f, 0); - MAKE_DISPATCH_MULTITEXCOORD(MultiTexCoord,2,fv, 0); - MAKE_DISPATCH_MULTITEXCOORD(MultiTexCoord,3,f, 0); - MAKE_DISPATCH_MULTITEXCOORD(MultiTexCoord,3,fv, 0); - MAKE_DISPATCH_MULTITEXCOORD(MultiTexCoord,4,f, 0); - MAKE_DISPATCH_MULTITEXCOORD(MultiTexCoord,4,fv, 0); - - MAKE_DISPATCH_VERTEXATTRIB(VertexAttrib,1,f, 0); - MAKE_DISPATCH_VERTEXATTRIB(VertexAttrib,1,fv, 0); - MAKE_DISPATCH_VERTEXATTRIB(VertexAttrib,2,f, 0); - MAKE_DISPATCH_VERTEXATTRIB(VertexAttrib,2,fv, 0); - MAKE_DISPATCH_VERTEXATTRIB(VertexAttrib,3,f, 0); - MAKE_DISPATCH_VERTEXATTRIB(VertexAttrib,3,fv, 0); - MAKE_DISPATCH_VERTEXATTRIB(VertexAttrib,4,f, 0); - MAKE_DISPATCH_VERTEXATTRIB(VertexAttrib,4,fv, 0); + MKDISP(Color3f, 3, _TNL_ATTRIB_COLOR0, _tnl_x86_dispatch_attrf3); + MKDISP(Color3fv, 3, _TNL_ATTRIB_COLOR0, _tnl_x86_dispatch_attrfv); + MKDISP(Color4f, 4, _TNL_ATTRIB_COLOR0, _tnl_x86_dispatch_attrf4); + MKDISP(Color4fv, 4, _TNL_ATTRIB_COLOR0, 
_tnl_x86_dispatch_attrfv); + MKDISP(FogCoordfEXT, 1, _TNL_ATTRIB_FOG, _tnl_x86_dispatch_attrf1); + MKDISP(FogCoordfvEXT, 1, _TNL_ATTRIB_FOG, _tnl_x86_dispatch_attrfv); + MKDISP(Normal3f, 3, _TNL_ATTRIB_NORMAL, _tnl_x86_dispatch_attrf3); + MKDISP(Normal3fv, 3, _TNL_ATTRIB_NORMAL, _tnl_x86_dispatch_attrfv); + MKDISP(SecondaryColor3fEXT, 3, _TNL_ATTRIB_COLOR1, _tnl_x86_dispatch_attrf3); + MKDISP(SecondaryColor3fvEXT,3, _TNL_ATTRIB_COLOR1, _tnl_x86_dispatch_attrfv); + MKDISP(TexCoord1f, 1, _TNL_ATTRIB_TEX0, _tnl_x86_dispatch_attrf1); + MKDISP(TexCoord1fv, 1, _TNL_ATTRIB_TEX0, _tnl_x86_dispatch_attrfv); + MKDISP(TexCoord2f, 2, _TNL_ATTRIB_TEX0, _tnl_x86_dispatch_attrf2); + MKDISP(TexCoord2fv, 2, _TNL_ATTRIB_TEX0, _tnl_x86_dispatch_attrfv); + MKDISP(TexCoord3f, 3, _TNL_ATTRIB_TEX0, _tnl_x86_dispatch_attrf3); + MKDISP(TexCoord3fv, 3, _TNL_ATTRIB_TEX0, _tnl_x86_dispatch_attrfv); + MKDISP(TexCoord4f, 4, _TNL_ATTRIB_TEX0, _tnl_x86_dispatch_attrf4); + MKDISP(TexCoord4fv, 4, _TNL_ATTRIB_TEX0, _tnl_x86_dispatch_attrfv); + MKDISP(Vertex2f, 2, _TNL_ATTRIB_POS, _tnl_x86_dispatch_attrf2); + MKDISP(Vertex2fv, 2, _TNL_ATTRIB_POS, _tnl_x86_dispatch_attrfv); + MKDISP(Vertex3f, 3, _TNL_ATTRIB_POS, _tnl_x86_dispatch_attrf3); + MKDISP(Vertex3fv, 3, _TNL_ATTRIB_POS, _tnl_x86_dispatch_attrfv); + MKDISP(Vertex4f, 4, _TNL_ATTRIB_POS, _tnl_x86_dispatch_attrf4); + MKDISP(Vertex4fv, 4, _TNL_ATTRIB_POS, _tnl_x86_dispatch_attrfv); + + MKDISP(MultiTexCoord1fARB, 1, _TNL_ATTRIB_TEX0, _tnl_x86_dispatch_multitexcoordf1); + MKDISP(MultiTexCoord1fvARB, 1, _TNL_ATTRIB_TEX0, _tnl_x86_dispatch_multitexcoordfv); + MKDISP(MultiTexCoord2fARB, 2, _TNL_ATTRIB_TEX0, _tnl_x86_dispatch_multitexcoordf2); + MKDISP(MultiTexCoord2fvARB, 2, _TNL_ATTRIB_TEX0, _tnl_x86_dispatch_multitexcoordfv); + MKDISP(MultiTexCoord3fARB, 3, _TNL_ATTRIB_TEX0, _tnl_x86_dispatch_multitexcoordf3); + MKDISP(MultiTexCoord3fvARB, 3, _TNL_ATTRIB_TEX0, _tnl_x86_dispatch_multitexcoordfv); + MKDISP(MultiTexCoord4fARB, 4, _TNL_ATTRIB_TEX0, 
_tnl_x86_dispatch_multitexcoordf4); + MKDISP(MultiTexCoord4fvARB, 4, _TNL_ATTRIB_TEX0, _tnl_x86_dispatch_multitexcoordfv); + + MKDISP(VertexAttrib1fNV, 1, 0, _tnl_x86_dispatch_vertexattribf1); + MKDISP(VertexAttrib1fvNV, 1, 0, _tnl_x86_dispatch_vertexattribfv); + MKDISP(VertexAttrib2fNV, 2, 0, _tnl_x86_dispatch_vertexattribf2); + MKDISP(VertexAttrib2fvNV, 2, 0, _tnl_x86_dispatch_vertexattribfv); + MKDISP(VertexAttrib3fNV, 3, 0, _tnl_x86_dispatch_vertexattribf3); + MKDISP(VertexAttrib3fvNV, 3, 0, _tnl_x86_dispatch_vertexattribfv); + MKDISP(VertexAttrib4fNV, 4, 0, _tnl_x86_dispatch_vertexattribf4); + MKDISP(VertexAttrib4fvNV, 4, 0, _tnl_x86_dispatch_vertexattribfv); } @@ -384,7 +350,7 @@ void _tnl_x86choosers( attrfv_func (*choose)[4], FIXUP(code, 0, 0, attr); FIXUP(code, 0, 1, size + 1); FIXUPREL(code, 0, 2, do_choose); - choose[attr][size] = code; + choose[attr][size] = (attrfv_func)code; } } } diff --git a/src/mesa/tnl/t_vtx_x86_gcc.S b/src/mesa/tnl/t_vtx_x86_gcc.S index bad87d3ee9..fcc69f1d0d 100644 --- a/src/mesa/tnl/t_vtx_x86_gcc.S +++ b/src/mesa/tnl/t_vtx_x86_gcc.S @@ -28,97 +28,114 @@ USE OR OTHER DEALINGS IN THE SOFTWARE. 
/* * Authors: * Keith Whitwell <keith@tungstengraphics.com> + * Daniel Borca <dborca@yahoo.com> */ -#if !defined (__DJGPP__) && !defined (__MINGW32__) - +#if defined (__DJGPP__) || defined (__MINGW32__) +#define GLOBL( x ) \ +.globl _##x; \ +_##x: +#else /* !defined (__DJGPP__) && !defined (__MINGW32__) */ #define GLOBL( x ) \ .globl x; \ x: +#endif /* !defined (__DJGPP__) && !defined (__MINGW32__) */ -#else /* defined(__DJGPP__) || defined (__MINGW32__) */ -#define GLOBL( x ) \ -.globl _##x; \ -_##x: +#if !defined (STDCALL_API) +#define RETCLEAN( x ) ret +#else +#define RETCLEAN( x ) ret $x +#endif -#endif /* defined(__DJGPP__) || defined (__MINGW32__) */ -.data -.align 4 +#define _JMP(x) \ +.byte 0xe9; \ +.long x + +#define _CALL(x) \ +.byte 0xe8; \ +.long x -// Someone who knew a lot about this sort of thing would use this -// macro to note current offsets, etc in a special region of the -// object file & just make everything work out neat. I do not know -// enough to do that... + +/* Someone who knew a lot about this sort of thing would use this + * macro to note current offsets, etc in a special region of the + * object file & just make everything work out neat. I don't know + * enough to do that... + */ #define SUBST( x ) (0x10101010 + x) +.data -// [dBorca] TODO -// Unfold functions for each vertex size? -// Build super-specialized SSE versions? -// STDCALL woes (HAVE_NONSTANDARD_GLAPIENTRY): -// need separate routine for the non "fv" case, -// to clean up the stack! +/* [dBorca] TODO + * Unfold functions for each vertex size? + * Build super-specialized SSE versions? + * + * There is a trick in Vertex*fv: under certain conditions, + * we tail to _tnl_wrap_filled_vertex(ctx). This means that + * if Vertex*fv is STDCALL, then _tnl_wrap_filled_vertex must + * be STDCALL as well, because (GLcontext *) and (GLfloat *) + * have the same size. 
+ */ +.align 4 GLOBL ( _tnl_x86_Vertex1fv ) movl 4(%esp), %ecx push %edi push %esi - movl SUBST(0), %edi # 0x0 --> tnl->vtx.vbptr - movl (%ecx), %edx # load v[0] - movl %edx, (%edi) # tnl->vtx.vbptr[0] = v[0] - addl $4, %edi # tnl->vtx.vbptr += 1 - movl $SUBST(1), %ecx # 0x1 --> (tnl->vtx.vertex_size - 1) - movl $SUBST(2), %esi # 0x2 --> (tnl->vtx.vertex + 1) + movl SUBST(0), %edi /* 0x0 --> tnl->vtx.vbptr */ + movl (%ecx), %edx /* load v[0] */ + movl %edx, (%edi) /* tnl->vtx.vbptr[0] = v[0] */ + addl $4, %edi /* tnl->vtx.vbptr += 1 */ + movl $SUBST(1), %ecx /* 0x1 --> (tnl->vtx.vertex_size - 1) */ + movl $SUBST(2), %esi /* 0x2 --> (tnl->vtx.vertex + 1) */ repz movsl %ds:(%esi), %es:(%edi) - movl %edi, SUBST(0) # 0x0 --> tnl->vtx.vbptr - movl SUBST(3), %edx # 0x3 --> counter + movl %edi, SUBST(0) /* 0x0 --> tnl->vtx.vbptr */ + movl SUBST(3), %edx /* 0x3 --> counter */ pop %esi pop %edi - dec %edx # counter-- - movl %edx, SUBST(3) # 0x3 --> counter - jne .0 # if (counter != 0) return - pushl $SUBST(4) # 0x4 --> ctx - .byte 0xe8 # call ... - .long SUBST(5) # ... 
_tnl_wrap_filled_vertex(ctx) - pop %eax + dec %edx /* counter-- */ + movl %edx, SUBST(3) /* 0x3 --> counter */ + je .0 /* if (counter == 0) goto .0 */ + RETCLEAN(4) /* return */ + .balign 16 .0: - ret # return + movl $SUBST(4), %eax /* load ctx */ + movl %eax, 4(%esp) /* push ctx */ + _JMP (SUBST(5)) /* jmp _tnl_wrap_filled_vertex */ GLOBL ( _tnl_x86_Vertex1fv_end ) - .align 4 GLOBL ( _tnl_x86_Vertex2fv ) movl 4(%esp), %ecx push %edi push %esi - movl SUBST(0), %edi # load tnl->vtx.vbptr - movl (%ecx), %edx # load v[0] - movl 4(%ecx), %eax # load v[1] - movl %edx, (%edi) # tnl->vtx.vbptr[0] = v[0] - movl %eax, 4(%edi) # tnl->vtx.vbptr[1] = v[1] - addl $8, %edi # tnl->vtx.vbptr += 2 - movl $SUBST(1), %ecx # vertex_size - 2 - movl $SUBST(2), %esi # tnl->vtx.vertex + 2 + movl SUBST(0), %edi /* load tnl->vtx.vbptr */ + movl (%ecx), %edx /* load v[0] */ + movl 4(%ecx), %eax /* load v[1] */ + movl %edx, (%edi) /* tnl->vtx.vbptr[0] = v[0] */ + movl %eax, 4(%edi) /* tnl->vtx.vbptr[1] = v[1] */ + addl $8, %edi /* tnl->vtx.vbptr += 2 */ + movl $SUBST(1), %ecx /* vertex_size - 2 */ + movl $SUBST(2), %esi /* tnl->vtx.vertex + 2 */ repz movsl %ds:(%esi), %es:(%edi) - movl %edi, SUBST(0) # save tnl->vtx.vbptr - movl SUBST(3), %edx # load counter + movl %edi, SUBST(0) /* save tnl->vtx.vbptr */ + movl SUBST(3), %edx /* load counter */ pop %esi pop %edi - dec %edx # counter-- - movl %edx, SUBST(3) # save counter - jne .1 # if (counter != 0) return - pushl $SUBST(4) # load ctx - .byte 0xe8 # call ... - .long SUBST(5) # ... 
_tnl_wrap_filled_vertex(ctx) - pop %eax + dec %edx /* counter-- */ + movl %edx, SUBST(3) /* save counter */ + je .1 /* if (counter == 0) goto .1 */ + RETCLEAN(4) /* return */ + .balign 16 .1: - ret # return + movl $SUBST(4), %eax /* load ctx */ + movl %eax, 4(%esp) /* push ctx */ + _JMP (SUBST(5)) /* jmp _tnl_wrap_filled_vertex */ GLOBL ( _tnl_x86_Vertex2fv_end ) .align 4 @@ -126,92 +143,88 @@ GLOBL ( _tnl_x86_Vertex3fv ) movl 4(%esp), %ecx push %edi push %esi - movl SUBST(0), %edi # load tnl->vtx.vbptr - movl (%ecx), %edx # load v[0] - movl 4(%ecx), %eax # load v[1] - movl 8(%ecx), %esi # load v[2] - movl %edx, (%edi) # tnl->vtx.vbptr[0] = v[0] - movl %eax, 4(%edi) # tnl->vtx.vbptr[1] = v[1] - movl %esi, 8(%edi) # tnl->vtx.vbptr[2] = v[2] - addl $12, %edi # tnl->vtx.vbptr += 3 - movl $SUBST(1), %ecx # vertex_size - 3 - movl $SUBST(2), %esi # tnl->vtx.vertex + 3 + movl SUBST(0), %edi /* load tnl->vtx.vbptr */ + movl (%ecx), %edx /* load v[0] */ + movl 4(%ecx), %eax /* load v[1] */ + movl 8(%ecx), %esi /* load v[2] */ + movl %edx, (%edi) /* tnl->vtx.vbptr[0] = v[0] */ + movl %eax, 4(%edi) /* tnl->vtx.vbptr[1] = v[1] */ + movl %esi, 8(%edi) /* tnl->vtx.vbptr[2] = v[2] */ + addl $12, %edi /* tnl->vtx.vbptr += 3 */ + movl $SUBST(1), %ecx /* vertex_size - 3 */ + movl $SUBST(2), %esi /* tnl->vtx.vertex + 3 */ repz movsl %ds:(%esi), %es:(%edi) - movl %edi, SUBST(0) # save tnl->vtx.vbptr - movl SUBST(3), %edx # load counter + movl %edi, SUBST(0) /* save tnl->vtx.vbptr */ + movl SUBST(3), %edx /* load counter */ pop %esi pop %edi - dec %edx # counter-- - movl %edx, SUBST(3) # save counter - jne .2 # if (counter != 0) return - pushl $SUBST(4) # load ctx - .byte 0xe8 # call ... - .long SUBST(5) # ... 
_tnl_wrap_filled_vertex(ctx) - pop %eax + dec %edx /* counter-- */ + movl %edx, SUBST(3) /* save counter */ + je .2 /* if (counter == 0) goto .2 */ + RETCLEAN(4) /* return */ + .balign 16 .2: - ret # return + movl $SUBST(4), %eax /* load ctx */ + movl %eax, 4(%esp) /* push ctx */ + _JMP (SUBST(5)) /* jmp _tnl_wrap_filled_vertex */ GLOBL ( _tnl_x86_Vertex3fv_end ) - .align 4 GLOBL ( _tnl_x86_Vertex4fv ) movl 4(%esp), %ecx push %edi push %esi - movl SUBST(0), %edi # load tnl->vtx.vbptr - movl (%ecx), %edx # load v[0] - movl 4(%ecx), %eax # load v[1] - movl 8(%ecx), %esi # load v[2] - movl 12(%ecx), %ecx # load v[3] - movl %edx, (%edi) # tnl->vtx.vbptr[0] = v[0] - movl %eax, 4(%edi) # tnl->vtx.vbptr[1] = v[1] - movl %esi, 8(%edi) # tnl->vtx.vbptr[2] = v[2] - movl %ecx, 12(%edi) # tnl->vtx.vbptr[3] = v[3] - addl $16, %edi # tnl->vtx.vbptr += 4 - movl $SUBST(1), %ecx # vertex_size - 4 - movl $SUBST(2), %esi # tnl->vtx.vertex + 3 + movl SUBST(0), %edi /* load tnl->vtx.vbptr */ + movl (%ecx), %edx /* load v[0] */ + movl 4(%ecx), %eax /* load v[1] */ + movl 8(%ecx), %esi /* load v[2] */ + movl 12(%ecx), %ecx /* load v[3] */ + movl %edx, (%edi) /* tnl->vtx.vbptr[0] = v[0] */ + movl %eax, 4(%edi) /* tnl->vtx.vbptr[1] = v[1] */ + movl %esi, 8(%edi) /* tnl->vtx.vbptr[2] = v[2] */ + movl %ecx, 12(%edi) /* tnl->vtx.vbptr[3] = v[3] */ + addl $16, %edi /* tnl->vtx.vbptr += 4 */ + movl $SUBST(1), %ecx /* vertex_size - 4 */ + movl $SUBST(2), %esi /* tnl->vtx.vertex + 4 */ repz movsl %ds:(%esi), %es:(%edi) - movl %edi, SUBST(0) # save tnl->vtx.vbptr - movl SUBST(3), %edx # load counter + movl %edi, SUBST(0) /* save tnl->vtx.vbptr */ + movl SUBST(3), %edx /* load counter */ pop %esi pop %edi - dec %edx # counter-- - movl %edx, SUBST(3) # save counter - jne .3 # if (counter != 0) return - pushl $SUBST(4) # load ctx - .byte 0xe8 # call ... - .long SUBST(5) # ... 
_tnl_wrap_filled_vertex(ctx) - pop %eax + dec %edx /* counter-- */ + movl %edx, SUBST(3) /* save counter */ + je .3 /* if (counter == 0) goto .3 */ + RETCLEAN(4) /* return */ + .balign 16 .3: - ret # return + movl $SUBST(4), %eax /* load ctx */ + movl %eax, 4(%esp) /* push ctx */ + _JMP (SUBST(5)) /* jmp _tnl_wrap_filled_vertex */ GLOBL ( _tnl_x86_Vertex4fv_end ) - /** * Generic handlers for vector format data. */ - -GLOBL( _tnl_x86_Attribute1fv) +GLOBL( _tnl_x86_Attribute1fv ) movl 4(%esp), %ecx movl (%ecx), %eax /* load v[0] */ movl %eax, SUBST(0) /* store v[0] to current vertex */ - ret + RETCLEAN(4) GLOBL ( _tnl_x86_Attribute1fv_end ) -GLOBL( _tnl_x86_Attribute2fv) +GLOBL( _tnl_x86_Attribute2fv ) movl 4(%esp), %ecx movl (%ecx), %eax /* load v[0] */ movl 4(%ecx), %edx /* load v[1] */ movl %eax, SUBST(0) /* store v[0] to current vertex */ movl %edx, SUBST(1) /* store v[1] to current vertex */ - ret + RETCLEAN(4) GLOBL ( _tnl_x86_Attribute2fv_end ) - -GLOBL( _tnl_x86_Attribute3fv) +GLOBL( _tnl_x86_Attribute3fv ) movl 4(%esp), %ecx movl (%ecx), %eax /* load v[0] */ movl 4(%ecx), %edx /* load v[1] */ @@ -219,10 +232,10 @@ GLOBL( _tnl_x86_Attribute3fv) movl %eax, SUBST(0) /* store v[0] to current vertex */ movl %edx, SUBST(1) /* store v[1] to current vertex */ movl %ecx, SUBST(2) /* store v[2] to current vertex */ - ret + RETCLEAN(4) GLOBL ( _tnl_x86_Attribute3fv_end ) -GLOBL( _tnl_x86_Attribute4fv) +GLOBL( _tnl_x86_Attribute4fv ) movl 4(%esp), %ecx movl (%ecx), %eax /* load v[0] */ movl 4(%ecx), %edx /* load v[1] */ @@ -232,84 +245,131 @@ GLOBL( _tnl_x86_Attribute4fv) movl 12(%ecx), %edx /* load v[3] */ movl %eax, SUBST(2) /* store v[2] to current vertex */ movl %edx, SUBST(3) /* store v[3] to current vertex */ - ret + RETCLEAN(4) GLOBL ( _tnl_x86_Attribute4fv_end ) -// Choosers: - -// Must generate all of these ahead of first usage. Generate at -// compile-time? - - -GLOBL( _tnl_x86_choose_fv) - subl $12, %esp # gcc does 16 byte alignment of stack frames? 
- movl $SUBST(0), (%esp) # arg 0 - attrib - movl $SUBST(1), 4(%esp) # arg 1 - N - .byte 0xe8 # call ... - .long SUBST(2) # ... do_choose - add $12, %esp # tear down stack frame - jmp *%eax # jump to new func +/* Choosers: + * + * Must generate all of these ahead of first usage. Generate at + * compile-time? + */ +GLOBL( _tnl_x86_choose_fv ) + subl $12, %esp /* gcc does 16 byte alignment of stack frames? */ + movl $SUBST(0), (%esp) /* arg 0 - attrib */ + movl $SUBST(1), 4(%esp) /* arg 1 - N */ + _CALL (SUBST(2)) /* call do_choose */ + add $12, %esp /* tear down stack frame */ + jmp *%eax /* jump to new func */ GLOBL ( _tnl_x86_choose_fv_end ) +/* FIRST LEVEL FUNCTIONS -- these are plugged directly into GL dispatch. + * + * In the 1st level dispatch functions, switch to a different + * calling convention -- (const GLfloat *v) in %ecx. + * + * As with regular (x86) dispatch, don't create a new stack frame - + * just let the 'ret' in the dispatched function return straight + * back to the original caller. + * + * Vertex/Normal/Color, etc: the address of the function pointer + * is known at codegen time. + */ -// FIRST LEVEL FUNCTIONS -- these are plugged directly into GL dispatch. - - - -// In the 1st level dispatch functions, switch to a different -// calling convention -- (const GLfloat *v) in %ecx. -// -// As with regular (x86) dispatch, do not create a new stack frame - -// just let the 'ret' in the dispatched function return straight -// back to the original caller. - - - -// Vertex/Normal/Color, etc: the address of the function pointer -// is known at codegen time. - - -// Unfortunately, have to play with the stack in the non-fv case: -// -GLOBL( _tnl_x86_dispatch_attrf ) - subl $12, %esp # gcc does 16 byte alignment of stack frames? 
- leal 16(%esp), %edx # address of first float on stack - movl %edx, (%esp) # save as 'v' - call *SUBST(0) # 0x0 --> tabfv[attr][n] - addl $12, %esp # tear down frame - ret # return -GLOBL( _tnl_x86_dispatch_attrf_end ) - -// The fv case is simpler: -// +/* Unfortunately, have to play with the stack in the non-fv case: + */ +#if !defined (STDCALL_API) +GLOBL( _tnl_x86_dispatch_attrf1 ) +GLOBL( _tnl_x86_dispatch_attrf2 ) +GLOBL( _tnl_x86_dispatch_attrf3 ) +GLOBL( _tnl_x86_dispatch_attrf4 ) + subl $12, %esp /* gcc does 16 byte alignment of stack frames? */ + leal 16(%esp), %edx /* address of first float on stack */ + movl %edx, (%esp) /* save as 'v' */ + call *SUBST(0) /* 0x0 --> tabfv[attr][n] */ + addl $12, %esp /* tear down frame */ + ret /* return */ +GLOBL( _tnl_x86_dispatch_attrf4_end ) +GLOBL( _tnl_x86_dispatch_attrf3_end ) +GLOBL( _tnl_x86_dispatch_attrf2_end ) +GLOBL( _tnl_x86_dispatch_attrf1_end ) + +#else /* defined(STDCALL_API) */ + +GLOBL( _tnl_x86_dispatch_attrf1 ) + subl $12, %esp /* gcc does 16 byte alignment of stack frames? */ + leal 16(%esp), %edx /* address of first float on stack */ + movl %edx, (%esp) /* save as 'v' */ + call *SUBST(0) /* 0x0 --> tabfv[attr][n] */ + addl $8, %esp /* tear down frame (4 shaved off by the callee) */ + ret $4 /* return */ +GLOBL( _tnl_x86_dispatch_attrf1_end ) + +GLOBL( _tnl_x86_dispatch_attrf2 ) + subl $12, %esp /* gcc does 16 byte alignment of stack frames? */ + leal 16(%esp), %edx /* address of first float on stack */ + movl %edx, (%esp) /* save as 'v' */ + call *SUBST(0) /* 0x0 --> tabfv[attr][n] */ + addl $8, %esp /* tear down frame (4 shaved off by the callee) */ + ret $8 /* return */ +GLOBL( _tnl_x86_dispatch_attrf2_end ) + +GLOBL( _tnl_x86_dispatch_attrf3 ) + subl $12, %esp /* gcc does 16 byte alignment of stack frames? 
*/ + leal 16(%esp), %edx /* address of first float on stack */ + movl %edx, (%esp) /* save as 'v' */ + call *SUBST(0) /* 0x0 --> tabfv[attr][n] */ + addl $8, %esp /* tear down frame (4 shaved off by the callee) */ + ret $12 /* return */ +GLOBL( _tnl_x86_dispatch_attrf3_end ) + +GLOBL( _tnl_x86_dispatch_attrf4 ) + subl $12, %esp /* gcc does 16 byte alignment of stack frames? */ + leal 16(%esp), %edx /* address of first float on stack */ + movl %edx, (%esp) /* save as 'v' */ + call *SUBST(0) /* 0x0 --> tabfv[attr][n] */ + addl $8, %esp /* tear down frame (4 shaved off by the callee) */ + ret $16 /* return */ +GLOBL( _tnl_x86_dispatch_attrf4_end ) +#endif /* defined(STDCALL_API) */ + +/* The fv case is simpler: + */ GLOBL( _tnl_x86_dispatch_attrfv ) - jmp *SUBST(0) # 0x0 --> tabfv[attr][n] + jmp *SUBST(0) /* 0x0 --> tabfv[attr][n] */ GLOBL( _tnl_x86_dispatch_attrfv_end ) -// MultiTexcoord: the address of the function pointer must be -// calculated, but can use the index argument slot to hold 'v', and -// avoid setting up a new stack frame. -// -// [dBorca] -// right, this would be the preferred approach, but gcc does not -// clean up the stack after each function call when optimizing (-fdefer-pop); -// can it make assumptions about what is already on the stack? I dunno, -// but in this case, we can't mess with the caller's stack frame, and -// we must use a model like '_x86_dispatch_attrfv' above. Caveat emptor! - -// Also, will only need a maximum of four of each of these per context: -// -GLOBL( _tnl_x86_dispatch_multitexcoordf ) +/* MultiTexcoord: the address of the function pointer must be + * calculated, but can use the index argument slot to hold 'v', and + * avoid setting up a new stack frame. + * + * [dBorca] + * right, this would be the preferred approach, but gcc does not + * clean up the stack after each function call when optimizing (-fdefer-pop); + * can it make assumptions about what's already on the stack? 
I dunno, + * but in this case, we can't mess with the caller's stack frame, and + * we must use a model like `_x86_dispatch_attrfv' above. Caveat emptor! + */ + +/* Also, will only need a maximum of four of each of these per context: + */ +#if !defined (STDCALL_API) +GLOBL( _tnl_x86_dispatch_multitexcoordf1 ) +GLOBL( _tnl_x86_dispatch_multitexcoordf2 ) +GLOBL( _tnl_x86_dispatch_multitexcoordf3 ) +GLOBL( _tnl_x86_dispatch_multitexcoordf4 ) movl 4(%esp), %ecx leal 8(%esp), %edx andl $7, %ecx movl %edx, 4(%esp) sall $4, %ecx - jmp *SUBST(0)(%ecx) # 0x0 - tabfv[tex0][n] -GLOBL( _tnl_x86_dispatch_multitexcoordf_end ) + jmp *SUBST(0)(%ecx) /* 0x0 - tabfv[tex0][n] */ +GLOBL( _tnl_x86_dispatch_multitexcoordf4_end ) +GLOBL( _tnl_x86_dispatch_multitexcoordf3_end ) +GLOBL( _tnl_x86_dispatch_multitexcoordf2_end ) +GLOBL( _tnl_x86_dispatch_multitexcoordf1_end ) GLOBL( _tnl_x86_dispatch_multitexcoordfv ) movl 4(%esp), %ecx @@ -317,32 +377,181 @@ GLOBL( _tnl_x86_dispatch_multitexcoordfv ) andl $7, %ecx movl %edx, 4(%esp) sall $4, %ecx - jmp *SUBST(0)(%ecx) # 0x0 - tabfv[tex0][n] + jmp *SUBST(0)(%ecx) /* 0x0 - tabfv[tex0][n] */ GLOBL( _tnl_x86_dispatch_multitexcoordfv_end ) -// VertexAttrib: the address of the function pointer must be -// calculated. +#else /* defined (STDCALL_API) */ + +GLOBL( _tnl_x86_dispatch_multitexcoordf1 ) + subl $12, %esp /* gcc does 16 byte alignment of stack frames? */ + movl 16(%esp), %ecx + leal 20(%esp), %edx + andl $7, %ecx + movl %edx, (%esp) + sall $4, %ecx + call *SUBST(0)(%ecx) /* 0x0 - tabfv[tex0][n] */ + addl $8, %esp /* tear down frame (4 shaved off by the callee) */ + ret $8 /* return */ +GLOBL( _tnl_x86_dispatch_multitexcoordf1_end ) + +GLOBL( _tnl_x86_dispatch_multitexcoordf2 ) + subl $12, %esp /* gcc does 16 byte alignment of stack frames? 
*/ + movl 16(%esp), %ecx + leal 20(%esp), %edx + andl $7, %ecx + movl %edx, (%esp) + sall $4, %ecx + call *SUBST(0)(%ecx) /* 0x0 - tabfv[tex0][n] */ + addl $8, %esp /* tear down frame (4 shaved off by the callee) */ + ret $12 /* return */ +GLOBL( _tnl_x86_dispatch_multitexcoordf2_end ) + +GLOBL( _tnl_x86_dispatch_multitexcoordf3 ) + subl $12, %esp /* gcc does 16 byte alignment of stack frames? */ + movl 16(%esp), %ecx + leal 20(%esp), %edx + andl $7, %ecx + movl %edx, (%esp) + sall $4, %ecx + call *SUBST(0)(%ecx) /* 0x0 - tabfv[tex0][n] */ + addl $8, %esp /* tear down frame (4 shaved off by the callee) */ + ret $16 /* return */ +GLOBL( _tnl_x86_dispatch_multitexcoordf3_end ) + +GLOBL( _tnl_x86_dispatch_multitexcoordf4 ) + subl $12, %esp /* gcc does 16 byte alignment of stack frames? */ + movl 16(%esp), %ecx + leal 20(%esp), %edx + andl $7, %ecx + movl %edx, (%esp) + sall $4, %ecx + call *SUBST(0)(%ecx) /* 0x0 - tabfv[tex0][n] */ + addl $8, %esp /* tear down frame (4 shaved off by the callee) */ + ret $20 /* return */ +GLOBL( _tnl_x86_dispatch_multitexcoordf4_end ) -GLOBL( _tnl_x86_dispatch_vertexattribf ) +GLOBL( _tnl_x86_dispatch_multitexcoordfv ) + subl $12, %esp /* gcc does 16 byte alignment of stack frames? */ + movl 16(%esp), %ecx + movl 20(%esp), %edx + andl $7, %ecx + movl %edx, (%esp) + sall $4, %ecx + call *SUBST(0)(%ecx) /* 0x0 - tabfv[tex0][n] */ + addl $8, %esp /* tear down frame (4 shaved off by the callee) */ + ret $8 /* return */ +GLOBL( _tnl_x86_dispatch_multitexcoordfv_end ) +#endif /* defined (STDCALL_API) */ + + +/* VertexAttrib: the address of the function pointer must be + * calculated. 
+ */ +#if !defined (STDCALL_API) +GLOBL( _tnl_x86_dispatch_vertexattribf1 ) +GLOBL( _tnl_x86_dispatch_vertexattribf2 ) +GLOBL( _tnl_x86_dispatch_vertexattribf3 ) +GLOBL( _tnl_x86_dispatch_vertexattribf4 ) movl 4(%esp), %eax cmpl $16, %eax - jb .8 # "cmovge" is not supported on all CPUs + jb .8 /* "cmovge" is not supported on all CPUs */ movl $16, %eax .8: - leal 8(%esp), %ecx # calculate 'v' - movl %ecx, 4(%esp) # save in 1st arg slot + leal 8(%esp), %ecx /* calculate 'v' */ + movl %ecx, 4(%esp) /* save in 1st arg slot */ sall $4, %eax - jmp *SUBST(0)(%eax) # 0x0 - tabfv[0][n] -GLOBL( _tnl_x86_dispatch_vertexattribf_end ) + jmp *SUBST(0)(%eax) /* 0x0 - tabfv[0][n] */ +GLOBL( _tnl_x86_dispatch_vertexattribf4_end ) +GLOBL( _tnl_x86_dispatch_vertexattribf3_end ) +GLOBL( _tnl_x86_dispatch_vertexattribf2_end ) +GLOBL( _tnl_x86_dispatch_vertexattribf1_end ) GLOBL( _tnl_x86_dispatch_vertexattribfv ) movl 4(%esp), %eax cmpl $16, %eax - jb .9 # "cmovge" is not supported on all CPUs + jb .9 /* "cmovge" is not supported on all CPUs */ + movl $16, %eax +.9: + movl 8(%esp), %ecx /* load 'v' */ + movl %ecx, 4(%esp) /* save in 1st arg slot */ + sall $4, %eax + jmp *SUBST(0)(%eax) /* 0x0 - tabfv[0][n] */ +GLOBL( _tnl_x86_dispatch_vertexattribfv_end ) + +#else /* defined (STDCALL_API) */ + +GLOBL( _tnl_x86_dispatch_vertexattribf1 ) + subl $12, %esp /* gcc does 16 byte alignment of stack frames? */ + movl 16(%esp), %eax + cmpl $16, %eax + jb .81 /* "cmovge" is not supported on all CPUs */ + movl $16, %eax +.81: + leal 20(%esp), %ecx /* load 'v' */ + movl %ecx, (%esp) /* save in 1st arg slot */ + sall $4, %eax + call *SUBST(0)(%eax) /* 0x0 - tabfv[0][n] */ + addl $8, %esp /* tear down frame (4 shaved off by the callee) */ + ret $8 /* return */ +GLOBL( _tnl_x86_dispatch_vertexattribf1_end ) + +GLOBL( _tnl_x86_dispatch_vertexattribf2 ) + subl $12, %esp /* gcc does 16 byte alignment of stack frames? 
*/ + movl 16(%esp), %eax + cmpl $16, %eax + jb .82 /* "cmovge" is not supported on all CPUs */ + movl $16, %eax +.82: + leal 20(%esp), %ecx /* calculate 'v' */ + movl %ecx, (%esp) /* save in 1st arg slot */ + sall $4, %eax + call *SUBST(0)(%eax) /* 0x0 - tabfv[0][n] */ + addl $8, %esp /* tear down frame (4 shaved off by the callee) */ + ret $12 /* return */ +GLOBL( _tnl_x86_dispatch_vertexattribf2_end ) + +GLOBL( _tnl_x86_dispatch_vertexattribf3 ) + subl $12, %esp /* gcc does 16 byte alignment of stack frames? */ + movl 16(%esp), %eax + cmpl $16, %eax + jb .83 /* "cmovge" is not supported on all CPUs */ + movl $16, %eax +.83: + leal 20(%esp), %ecx /* calculate 'v' */ + movl %ecx, (%esp) /* save in 1st arg slot */ + sall $4, %eax + call *SUBST(0)(%eax) /* 0x0 - tabfv[0][n] */ + addl $8, %esp /* tear down frame (4 shaved off by the callee) */ + ret $16 /* return */ +GLOBL( _tnl_x86_dispatch_vertexattribf3_end ) + +GLOBL( _tnl_x86_dispatch_vertexattribf4 ) + subl $12, %esp /* gcc does 16 byte alignment of stack frames? */ + movl 16(%esp), %eax + cmpl $16, %eax + jb .84 /* "cmovge" is not supported on all CPUs */ + movl $16, %eax +.84: + leal 20(%esp), %ecx /* calculate 'v' */ + movl %ecx, (%esp) /* save in 1st arg slot */ + sall $4, %eax + call *SUBST(0)(%eax) /* 0x0 - tabfv[0][n] */ + addl $8, %esp /* tear down frame (4 shaved off by the callee) */ + ret $20 /* return */ +GLOBL( _tnl_x86_dispatch_vertexattribf4_end ) + +GLOBL( _tnl_x86_dispatch_vertexattribfv ) + subl $12, %esp /* gcc does 16 byte alignment of stack frames? 
*/ + movl 16(%esp), %eax + cmpl $16, %eax /* unsigned compare: 16 and above all take slot 16 */ + jb .9 /* "cmovge" is not supported on all CPUs */ movl $16, %eax .9: - movl 8(%esp), %ecx # load 'v' - movl %ecx, 4(%esp) # save in 1st arg slot + movl 20(%esp), %ecx /* load 'v' */ + movl %ecx, (%esp) /* save in 1st arg slot */ sall $4, %eax - jmp *SUBST(0)(%eax) # 0x0 - tabfv[0][n] + call *SUBST(0)(%eax) /* 0x0 - tabfv[0][n] */ + addl $8, %esp /* tear down frame (4 shaved off by the callee) */ + ret $8 /* return, popping 8 bytes of caller args */ GLOBL( _tnl_x86_dispatch_vertexattribfv_end ) +#endif /* defined (STDCALL_API) */ diff --git a/src/mesa/x86/assyntax.h b/src/mesa/x86/assyntax.h index c048790c43..62d079f253 100644 --- a/src/mesa/x86/assyntax.h +++ b/src/mesa/x86/assyntax.h @@ -1300,11 +1300,11 @@ SECTION _DATA public align=16 class=DATA use32 flat #define REPZ REPE #define RET ret #define SAHF sahf -#define SAL_L(a, b) sal L_(b), L_(a) -#define SAL_W(a, b) sal W_(b), W_(a) +#define SAL_L(a, b) sal L_(b), B_(a) +#define SAL_W(a, b) sal W_(b), B_(a) #define SAL_B(a, b) sal B_(b), B_(a) -#define SAR_L(a, b) sar L_(b), L_(a) -#define SAR_W(a, b) sar W_(b), W_(a) +#define SAR_L(a, b) sar L_(b), B_(a) +#define SAR_W(a, b) sar W_(b), B_(a) #define SAR_B(a, b) sar B_(b), B_(a) #define SBB_L(a, b) sbb L_(b), L_(a) #define SBB_W(a, b) sbb W_(b), W_(a) |