16 files changed, 537 insertions, 295 deletions
diff --git a/src/mesa/tnl/descrip.mms b/src/mesa/tnl/descrip.mms
new file mode 100644
index 0000000000..25dd1aecb1
--- /dev/null
+++ b/src/mesa/tnl/descrip.mms
@@ -0,0 +1,68 @@
+# Makefile for core library for VMS
+# contributed by Jouk Jansen  joukj@hrem.nano.tudelft.nl
+# Last revision : 39 September 2008
+
+.first
+	define gl [---.include.gl]
+	define math [-.math]
+	define vbo [-.vbo]
+	define shader [-.shader]
+	define swrast [-.swrast]
+	define array_cache [-.array_cache]
+	define main [-.main]
+	define glapi [-.glapi]
+	define tnl [-.tnl]
+
+.include [---]mms-config.
+
+##### MACROS #####
+
+VPATH = RCS
+
+INCDIR = [---.include],[-.main],[-.glapi],[-.shader],[-.shader.slang]
+LIBDIR = [---.lib]
+CFLAGS = /include=($(INCDIR),[])/define=(PTHREADS=1)/name=(as_is,short)/float=ieee/ieee=denorm
+
+SOURCES = t_context.c t_draw.c \
+	t_pipeline.c t_vb_fog.c \
+	t_vb_light.c t_vb_normals.c t_vb_points.c t_vb_program.c \
+	t_vb_render.c t_vb_texgen.c t_vb_texmat.c t_vb_vertex.c \
+	t_vertex.c t_rasterpos.c\
+	t_vertex_generic.c t_vp_build.c
+
+OBJECTS = t_context.obj,t_draw.obj,\
+	t_pipeline.obj,t_vb_fog.obj,t_vb_light.obj,t_vb_normals.obj,\
+	t_vb_points.obj,t_vb_program.obj,t_vb_render.obj,t_vb_texgen.obj,\
+	t_vb_texmat.obj,t_vb_vertex.obj,t_rasterpos.obj,\
+	t_vertex.obj,t_vertex_generic.obj,\
+	t_vp_build.obj
+
+##### RULES #####
+
+VERSION=Mesa V3.4
+
+##### TARGETS #####
+# Make the library
+$(LIBDIR)$(GL_LIB) : $(OBJECTS)
+  @ library $(LIBDIR)$(GL_LIB) $(OBJECTS)
+
+clean :
+	purge
+	delete *.obj;*
+
+t_context.obj : t_context.c
+t_draw.obj : t_draw.c
+t_pipeline.obj : t_pipeline.c
+t_vb_fog.obj : t_vb_fog.c
+t_vb_light.obj : t_vb_light.c
+t_vb_normals.obj : t_vb_normals.c
+t_vb_points.obj : t_vb_points.c
+t_vb_program.obj : t_vb_program.c
+t_vb_render.obj : t_vb_render.c
+t_vb_texgen.obj : t_vb_texgen.c
+t_vb_texmat.obj : t_vb_texmat.c
+t_vb_vertex.obj : t_vb_vertex.c
+t_vertex.obj : t_vertex.c
+t_vertex_generic.obj : t_vertex_generic.c
+t_vp_build.obj : t_vp_build.c
+t_rasterpos.obj : t_rasterpos.c
diff --git a/src/mesa/tnl/sources b/src/mesa/tnl/sources
deleted file mode 100644
index a0888be11d..0000000000
--- a/src/mesa/tnl/sources
+++ /dev/null
@@ -1,34 +0,0 @@
-# List of source files in this directory used for X.org xserver build
-MESA_TNL_SOURCES = \
-t_context.c \
-t_pipeline.c \
-t_vb_arbprogram.c \
-t_vb_arbprogram_sse.c \
-t_vb_arbshader.c \
-t_vb_cull.c \
-t_vb_fog.c \
-t_vb_light.c \
-t_vb_normals.c \
-t_vb_points.c \
-t_vb_program.c \
-t_vb_render.c \
-t_vb_texgen.c \
-t_vb_texmat.c \
-t_vb_vertex.c \
-t_vertex.c \
-t_vertex_generic.c \
-t_vertex_sse.c \
-t_vp_build.c 
-
-MESA_TNL_HEADERS = \
-t_array_api.h \
-t_array_import.h \
-t_context.h \
-t_pipeline.h \
-t_vb_arbprogram.h \
-t_vb_cliptmp.h \
-t_vb_lighttmp.h \
-t_vb_rendertmp.h \
-t_vertex.h \
-t_vp_build.h \
-tnl.h
diff --git a/src/mesa/tnl/t_context.c b/src/mesa/tnl/t_context.c
index 0ace5c2d6f..f0d31fdac3 100644
--- a/src/mesa/tnl/t_context.c
+++ b/src/mesa/tnl/t_context.c
@@ -1,8 +1,8 @@
 /*
  * Mesa 3-D graphics library
- * Version:  6.5.2
+ * Version:  7.2
  *
- * Copyright (C) 1999-2006  Brian Paul   All Rights Reserved.
+ * Copyright (C) 1999-2008  Brian Paul   All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -32,6 +32,8 @@
 #include "main/macros.h"
 #include "main/mtypes.h"
 #include "main/light.h"
+#include "math/m_translate.h"
+#include "math/m_xform.h"
 
 #include "tnl.h"
 #include "t_context.h"
@@ -81,6 +83,9 @@ _tnl_CreateContext( GLcontext *ctx )
    /* plug in the VBO drawing function */
    vbo_set_draw_func(ctx, _tnl_draw_prims);
 
+   _math_init_transformation();
+   _math_init_translate();
+
    return GL_TRUE;
 }
 
@@ -104,43 +109,55 @@ _tnl_InvalidateState( GLcontext *ctx, GLuint new_state )
    const struct gl_vertex_program *vp = ctx->VertexProgram._Current;
    const struct gl_fragment_program *fp = ctx->FragmentProgram._Current;
 
-   if (new_state & (_NEW_HINT)) {
+   if (new_state & (_NEW_HINT | _NEW_PROGRAM)) {
       ASSERT(tnl->AllowVertexFog || tnl->AllowPixelFog);
-      tnl->_DoVertexFog = (tnl->AllowVertexFog && (ctx->Hint.Fog != GL_NICEST))
-         || !tnl->AllowPixelFog;
+      tnl->_DoVertexFog = ((tnl->AllowVertexFog && (ctx->Hint.Fog != GL_NICEST))
+         || !tnl->AllowPixelFog) && !fp;
    }
 
    tnl->pipeline.new_state |= new_state;
 
-   /* Calculate tnl->render_inputs:
+   /* Calculate tnl->render_inputs.  This bitmask indicates which vertex
+    * attributes need to be emitted to the rasterizer.
     */
    if (ctx->Visual.rgbMode) {
       GLuint i;
 
       RENDERINPUTS_ZERO( tnl->render_inputs_bitset );
       RENDERINPUTS_SET( tnl->render_inputs_bitset, _TNL_ATTRIB_POS );
+
       if (!fp || (fp->Base.InputsRead & FRAG_BIT_COL0)) {
          RENDERINPUTS_SET( tnl->render_inputs_bitset, _TNL_ATTRIB_COLOR0 );
       }
+
+      if (NEED_SECONDARY_COLOR(ctx))
+         RENDERINPUTS_SET( tnl->render_inputs_bitset, _TNL_ATTRIB_COLOR1 );
+
       for (i = 0; i < ctx->Const.MaxTextureCoordUnits; i++) {
-         if (ctx->Texture._EnabledCoordUnits & (1 << i)) {
+         if (ctx->Texture._EnabledCoordUnits & (1 << i) ||
+             (fp && fp->Base.InputsRead & FRAG_BIT_TEX(i))) {
             RENDERINPUTS_SET( tnl->render_inputs_bitset, _TNL_ATTRIB_TEX(i) );
          }
       }
-
-      if (NEED_SECONDARY_COLOR(ctx))
-         RENDERINPUTS_SET( tnl->render_inputs_bitset, _TNL_ATTRIB_COLOR1 );
    }
    else {
       RENDERINPUTS_SET( tnl->render_inputs_bitset, _TNL_ATTRIB_POS );
       RENDERINPUTS_SET( tnl->render_inputs_bitset, _TNL_ATTRIB_COLOR_INDEX );
    }
 
-   if (ctx->Fog.Enabled ||
-       ((ctx->FragmentProgram._Active || ctx->FragmentProgram._Current) &&
-        (ctx->FragmentProgram._Current->FogOption != GL_NONE ||
-         (ctx->FragmentProgram._Current->Base.InputsRead & FRAG_BIT_FOGC))))
+   if (ctx->Fog.Enabled) {
+      /* fixed-function fog */
       RENDERINPUTS_SET( tnl->render_inputs_bitset, _TNL_ATTRIB_FOG );
+   }
+   else if (ctx->FragmentProgram._Current) {
+      struct gl_fragment_program *fp = ctx->FragmentProgram._Current;
+      if (fp) {
+         if (fp->FogOption != GL_NONE || (fp->Base.InputsRead & FRAG_BIT_FOGC)) {
+            /* fragment program needs fog coord */
+            RENDERINPUTS_SET( tnl->render_inputs_bitset, _TNL_ATTRIB_FOG );
+         }
+      }
+   }
 
    if (ctx->Polygon.FrontMode != GL_FILL || 
        ctx->Polygon.BackMode != GL_FILL)
@@ -201,8 +218,8 @@ _tnl_allow_vertex_fog( GLcontext *ctx, GLboolean value )
 {
    TNLcontext *tnl = TNL_CONTEXT(ctx);
    tnl->AllowVertexFog = value;
-   tnl->_DoVertexFog = (tnl->AllowVertexFog && (ctx->Hint.Fog != GL_NICEST))
-      || !tnl->AllowPixelFog;
+   tnl->_DoVertexFog = ((tnl->AllowVertexFog && (ctx->Hint.Fog != GL_NICEST))
+      || !tnl->AllowPixelFog) && !ctx->FragmentProgram._Current;
 
 }
 
@@ -211,7 +228,7 @@ _tnl_allow_pixel_fog( GLcontext *ctx, GLboolean value )
 {
    TNLcontext *tnl = TNL_CONTEXT(ctx);
    tnl->AllowPixelFog = value;
-   tnl->_DoVertexFog = (tnl->AllowVertexFog && (ctx->Hint.Fog != GL_NICEST))
-      || !tnl->AllowPixelFog;
+   tnl->_DoVertexFog = ((tnl->AllowVertexFog && (ctx->Hint.Fog != GL_NICEST))
+      || !tnl->AllowPixelFog) && !ctx->FragmentProgram._Current;
 }
 
diff --git a/src/mesa/tnl/t_draw.c b/src/mesa/tnl/t_draw.c
index a1a46f6b82..2ec65d5323 100644
--- a/src/mesa/tnl/t_draw.c
+++ b/src/mesa/tnl/t_draw.c
@@ -1,9 +1,8 @@
-
 /*
  * Mesa 3-D graphics library
- * Version:  6.5
+ * Version:  7.1
  *
- * Copyright (C) 1999-2006  Brian Paul   All Rights Reserved.
+ * Copyright (C) 1999-2007  Brian Paul   All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -29,7 +28,7 @@
 #include "main/glheader.h"
 #include "main/context.h"
 #include "main/imports.h"
-#include "state.h"
+#include "main/state.h"
 #include "main/mtypes.h"
 #include "main/macros.h"
 #include "main/enums.h"
@@ -89,6 +88,29 @@ static void free_space(GLcontext *ctx)
 } while (0)
 
 
+/**
+ * Convert array of BGRA/GLubyte[4] values to RGBA/float[4]
+ * \param ptr  input/ubyte array
+ * \param fptr  output/float array
+ */
+static void
+convert_bgra_to_float(const struct gl_client_array *input,
+                      const GLubyte *ptr, GLfloat *fptr,
+                      GLuint count )
+{
+   GLuint i;
+   assert(input->Normalized);
+   assert(input->Size == 4);
+   for (i = 0; i < count; i++) {
+      const GLubyte *in = (GLubyte *) ptr;  /* in is in BGRA order */
+      *fptr++ = UBYTE_TO_FLOAT(in[2]);  /* red */
+      *fptr++ = UBYTE_TO_FLOAT(in[1]);  /* green */
+      *fptr++ = UBYTE_TO_FLOAT(in[0]);  /* blue */
+      *fptr++ = UBYTE_TO_FLOAT(in[3]);  /* alpha */
+      ptr += input->StrideB;
+   }
+}
+
 
 /* Adjust pointer to point at first requested element, convert to
  * floating point, populate VB->AttribPtr[].
@@ -113,7 +135,13 @@ static void _tnl_import_array( GLcontext *ctx,
 	 CONVERT(GLbyte, BYTE_TO_FLOAT); 
 	 break;
       case GL_UNSIGNED_BYTE: 
-	 CONVERT(GLubyte, UBYTE_TO_FLOAT); 
+         if (input->Format == GL_BGRA) {
+            /* See GL_EXT_vertex_array_bgra */
+            convert_bgra_to_float(input, ptr, fptr, count);
+         }
+         else {
+            CONVERT(GLubyte, UBYTE_TO_FLOAT); 
+         }
 	 break;
       case GL_SHORT: 
 	 CONVERT(GLshort, SHORT_TO_FLOAT); 
@@ -368,7 +396,7 @@ void _tnl_draw_prims( GLcontext *ctx,
 			_tnl_draw_prims );
       return;
    }
-   else if (max_index >= max) {
+   else if (max_index > max) {
       /* The software TNL pipeline has a fixed amount of storage for
        * vertices and it is necessary to split incoming drawing commands
        * if they exceed that limit.
diff --git a/src/mesa/tnl/t_pipeline.c b/src/mesa/tnl/t_pipeline.c
index 2a0ed8852a..357ef1e24b 100644
--- a/src/mesa/tnl/t_pipeline.c
+++ b/src/mesa/tnl/t_pipeline.c
@@ -199,11 +199,11 @@ const struct tnl_pipeline_stage *_tnl_default_pipeline[] = {
    &_tnl_vertex_transform_stage,
    &_tnl_normal_transform_stage,
    &_tnl_lighting_stage,
-   &_tnl_fog_coordinate_stage,
    &_tnl_texgen_stage,
    &_tnl_texture_transform_stage,
    &_tnl_point_attenuation_stage,
    &_tnl_vertex_program_stage, 
+   &_tnl_fog_coordinate_stage,
    &_tnl_render_stage,
    NULL 
 };
diff --git a/src/mesa/tnl/t_rasterpos.c b/src/mesa/tnl/t_rasterpos.c
index dfbe0f1806..04fb1d8f8c 100644
--- a/src/mesa/tnl/t_rasterpos.c
+++ b/src/mesa/tnl/t_rasterpos.c
@@ -26,11 +26,11 @@
 #include "main/glheader.h"
 #include "main/colormac.h"
 #include "main/context.h"
-#include "feedback.h"
-#include "light.h"
+#include "main/feedback.h"
+#include "main/light.h"
 #include "main/macros.h"
-#include "rastpos.h"
-#include "simple_list.h"
+#include "main/rastpos.h"
+#include "main/simple_list.h"
 #include "main/mtypes.h"
 
 #include "math/m_matrix.h"
@@ -301,12 +301,12 @@ compute_texgen(GLcontext *ctx, const GLfloat vObj[4], const GLfloat vEye[4],
       mInv = 0.0F;
 
    if (texUnit->TexGenEnabled & S_BIT) {
-      switch (texUnit->GenModeS) {
+      switch (texUnit->GenS.Mode) {
          case GL_OBJECT_LINEAR:
-            texcoord[0] = DOT4(vObj, texUnit->ObjectPlaneS);
+            texcoord[0] = DOT4(vObj, texUnit->GenS.ObjectPlane);
             break;
          case GL_EYE_LINEAR:
-            texcoord[0] = DOT4(vEye, texUnit->EyePlaneS);
+            texcoord[0] = DOT4(vEye, texUnit->GenS.EyePlane);
             break;
          case GL_SPHERE_MAP:
             texcoord[0] = rx * mInv + 0.5F;
@@ -324,12 +324,12 @@ compute_texgen(GLcontext *ctx, const GLfloat vObj[4], const GLfloat vEye[4],
    }
 
    if (texUnit->TexGenEnabled & T_BIT) {
-      switch (texUnit->GenModeT) {
+      switch (texUnit->GenT.Mode) {
          case GL_OBJECT_LINEAR:
-            texcoord[1] = DOT4(vObj, texUnit->ObjectPlaneT);
+            texcoord[1] = DOT4(vObj, texUnit->GenT.ObjectPlane);
             break;
          case GL_EYE_LINEAR:
-            texcoord[1] = DOT4(vEye, texUnit->EyePlaneT);
+            texcoord[1] = DOT4(vEye, texUnit->GenT.EyePlane);
             break;
          case GL_SPHERE_MAP:
             texcoord[1] = ry * mInv + 0.5F;
@@ -347,12 +347,12 @@ compute_texgen(GLcontext *ctx, const GLfloat vObj[4], const GLfloat vEye[4],
    }
 
    if (texUnit->TexGenEnabled & R_BIT) {
-      switch (texUnit->GenModeR) {
+      switch (texUnit->GenR.Mode) {
          case GL_OBJECT_LINEAR:
-            texcoord[2] = DOT4(vObj, texUnit->ObjectPlaneR);
+            texcoord[2] = DOT4(vObj, texUnit->GenR.ObjectPlane);
             break;
          case GL_EYE_LINEAR:
-            texcoord[2] = DOT4(vEye, texUnit->EyePlaneR);
+            texcoord[2] = DOT4(vEye, texUnit->GenR.EyePlane);
             break;
          case GL_REFLECTION_MAP:
             texcoord[2] = rz;
@@ -367,12 +367,12 @@ compute_texgen(GLcontext *ctx, const GLfloat vObj[4], const GLfloat vEye[4],
    }
 
    if (texUnit->TexGenEnabled & Q_BIT) {
-      switch (texUnit->GenModeQ) {
+      switch (texUnit->GenQ.Mode) {
          case GL_OBJECT_LINEAR:
-            texcoord[3] = DOT4(vObj, texUnit->ObjectPlaneQ);
+            texcoord[3] = DOT4(vObj, texUnit->GenQ.ObjectPlane);
             break;
          case GL_EYE_LINEAR:
-            texcoord[3] = DOT4(vEye, texUnit->EyePlaneQ);
+            texcoord[3] = DOT4(vEye, texUnit->GenQ.EyePlane);
             break;
          default:
             _mesa_problem(ctx, "Bad Q texgen in compute_texgen()");
diff --git a/src/mesa/tnl/t_vb_fog.c b/src/mesa/tnl/t_vb_fog.c
index 00c0979f3f..f3a7bd49f4 100644
--- a/src/mesa/tnl/t_vb_fog.c
+++ b/src/mesa/tnl/t_vb_fog.c
@@ -41,7 +41,6 @@
 
 struct fog_stage_data {
    GLvector4f fogcoord;		/* has actual storage allocated */
-   GLvector4f input;		/* points into VB->EyePtr Z values */
 };
 
 #define FOG_STAGE_DATA(stage) ((struct fog_stage_data *)stage->privatePtr)
@@ -91,7 +90,8 @@ init_static_data( void )
  * evaluating the GL_LINEAR, GL_EXP or GL_EXP2 fog function.
  * Fog coordinates are distances from the eye (typically between the
  * near and far clip plane distances).
- * Note the fog (eye Z) coords may be negative so we use ABS(z) below.
+ * Note that fogcoords may be negative, if eye z is source absolute
+ * value must be taken earlier.
  * Fog blend factors are in the range [0,1].
  */
 static void
@@ -148,11 +148,11 @@ run_fog_stage(GLcontext *ctx, struct tnl_pipeline_stage *stage)
    struct fog_stage_data *store = FOG_STAGE_DATA(stage);
    GLvector4f *input;
 
-   if (!ctx->Fog.Enabled || ctx->VertexProgram._Current)
-      return GL_TRUE;
 
+   if (!ctx->Fog.Enabled)
+      return GL_TRUE;
 
-   if (ctx->Fog.FogCoordinateSource == GL_FRAGMENT_DEPTH_EXT) {
+   if (ctx->Fog.FogCoordinateSource == GL_FRAGMENT_DEPTH_EXT && !ctx->VertexProgram._Current) {
       GLuint i;
       GLfloat *coord;
       /* Fog is computed from vertex or fragment Z values */
@@ -169,13 +169,10 @@ run_fog_stage(GLcontext *ctx, struct tnl_pipeline_stage *stage)
 	  */
 	 input = &store->fogcoord;
 
-         /* NOTE: negate plane here so we get positive fog coords! */
-	 /* NOTE2: this doesn't always work (tests/fog - all frag depth fog
-	    coords will be negative). */
-	 plane[0] = -m[2];
-	 plane[1] = -m[6];
-	 plane[2] = -m[10];
-	 plane[3] = -m[14];
+	 plane[0] = m[2];
+	 plane[1] = m[6];
+	 plane[2] = m[10];
+	 plane[3] = m[14];
 	 /* Full eye coords weren't required, just calculate the
 	  * eye Z values.
 	  */
@@ -189,12 +186,12 @@ run_fog_stage(GLcontext *ctx, struct tnl_pipeline_stage *stage)
 	    NOTE should avoid going through array twice */
 	 coord = input->start;
 	 for (i = 0; i < input->count; i++) {
-	    input->data[i][0] = FABSF(*coord);
+	    *coord = FABSF(*coord);
 	    STRIDE_F(coord, input->stride);
 	 }
       }
       else {
-         /* fog coordinates = eye Z coordinates (use ABS later) */
+         /* fog coordinates = eye Z coordinates - need to copy for ABS */
 	 input = &store->fogcoord;
 
 	 if (VB->EyePtr->size < 2)
@@ -249,7 +246,6 @@ alloc_fog_data(GLcontext *ctx, struct tnl_pipeline_stage *stage)
       return GL_FALSE;
 
    _mesa_vector4f_alloc( &store->fogcoord, 0, tnl->vb.Size, 32 );
-   _mesa_vector4f_init( &store->input, 0, NULL );
 
    if (!inited)
       init_static_data();
diff --git a/src/mesa/tnl/t_vb_light.c b/src/mesa/tnl/t_vb_light.c
index ebd3412d20..f47f99397c 100644
--- a/src/mesa/tnl/t_vb_light.c
+++ b/src/mesa/tnl/t_vb_light.c
@@ -26,10 +26,10 @@
 
 #include "main/glheader.h"
 #include "main/colormac.h"
-#include "light.h"
+#include "main/light.h"
 #include "main/macros.h"
 #include "main/imports.h"
-#include "simple_list.h"
+#include "main/simple_list.h"
 #include "main/mtypes.h"
 
 #include "math/m_translate.h"
diff --git a/src/mesa/tnl/t_vb_program.c b/src/mesa/tnl/t_vb_program.c
index c778f5d248..f99401ca6d 100644
--- a/src/mesa/tnl/t_vb_program.c
+++ b/src/mesa/tnl/t_vb_program.c
@@ -1,6 +1,6 @@
 /*
  * Mesa 3-D graphics library
- * Version:  6.5.3
+ * Version:  7.1
  *
  * Copyright (C) 1999-2007  Brian Paul   All Rights Reserved.
  *
@@ -41,9 +41,129 @@
 #include "swrast/s_context.h"
 #include "swrast/s_texfilter.h"
 
-#include "tnl.h"
-#include "t_context.h"
-#include "t_pipeline.h"
+#include "tnl/tnl.h"
+#include "tnl/t_context.h"
+#include "tnl/t_pipeline.h"
+
+
+
+/*!
+ * Private storage for the vertex program pipeline stage.
+ */
+struct vp_stage_data {
+   /** The results of running the vertex program go into these arrays. */
+   GLvector4f results[VERT_RESULT_MAX];
+
+   GLvector4f ndcCoords;              /**< normalized device coords */
+   GLubyte *clipmask;                 /**< clip flags */
+   GLubyte ormask, andmask;           /**< for clipping */
+};
+
+
+#define VP_STAGE_DATA(stage) ((struct vp_stage_data *)(stage->privatePtr))
+
+
+static void
+userclip( GLcontext *ctx,
+          GLvector4f *clip,
+          GLubyte *clipmask,
+          GLubyte *clipormask,
+          GLubyte *clipandmask )
+{
+   GLuint p;
+
+   for (p = 0; p < ctx->Const.MaxClipPlanes; p++) {
+      if (ctx->Transform.ClipPlanesEnabled & (1 << p)) {
+	 GLuint nr, i;
+	 const GLfloat a = ctx->Transform._ClipUserPlane[p][0];
+	 const GLfloat b = ctx->Transform._ClipUserPlane[p][1];
+	 const GLfloat c = ctx->Transform._ClipUserPlane[p][2];
+	 const GLfloat d = ctx->Transform._ClipUserPlane[p][3];
+         GLfloat *coord = (GLfloat *)clip->data;
+         GLuint stride = clip->stride;
+         GLuint count = clip->count;
+
+	 for (nr = 0, i = 0 ; i < count ; i++) {
+	    GLfloat dp = (coord[0] * a + 
+			  coord[1] * b +
+			  coord[2] * c +
+			  coord[3] * d);
+
+	    if (dp < 0) {
+	       nr++;
+	       clipmask[i] |= CLIP_USER_BIT;
+	    }
+
+	    STRIDE_F(coord, stride);
+	 }
+
+	 if (nr > 0) {
+	    *clipormask |= CLIP_USER_BIT;
+	    if (nr == count) {
+	       *clipandmask |= CLIP_USER_BIT;
+	       return;
+	    }
+	 }
+      }
+   }
+}
+
+
+static GLboolean
+do_ndc_cliptest(GLcontext *ctx, struct vp_stage_data *store)
+{
+   TNLcontext *tnl = TNL_CONTEXT(ctx);
+   struct vertex_buffer *VB = &tnl->vb;
+   /* Cliptest and perspective divide.  Clip functions must clear
+    * the clipmask.
+    */
+   store->ormask = 0;
+   store->andmask = CLIP_FRUSTUM_BITS;
+
+   if (tnl->NeedNdcCoords) {
+      VB->NdcPtr =
+         _mesa_clip_tab[VB->ClipPtr->size]( VB->ClipPtr,
+                                            &store->ndcCoords,
+                                            store->clipmask,
+                                            &store->ormask,
+                                            &store->andmask );
+   }
+   else {
+      VB->NdcPtr = NULL;
+      _mesa_clip_np_tab[VB->ClipPtr->size]( VB->ClipPtr,
+                                            NULL,
+                                            store->clipmask,
+                                            &store->ormask,
+                                            &store->andmask );
+   }
+
+   if (store->andmask) {
+      /* All vertices are outside the frustum */
+      return GL_FALSE;
+   }
+
+   /* Test userclip planes.  This contributes to VB->ClipMask.
+    */
+   /** XXX NEW_SLANG _Enabled ??? */
+   if (ctx->Transform.ClipPlanesEnabled && (!ctx->VertexProgram._Enabled ||
+      ctx->VertexProgram.Current->IsPositionInvariant)) {
+      userclip( ctx,
+		VB->ClipPtr,
+		store->clipmask,
+		&store->ormask,
+		&store->andmask );
+
+      if (store->andmask) {
+	 return GL_FALSE;
+      }
+   }
+
+   VB->ClipAndMask = store->andmask;
+   VB->ClipOrMask = store->ormask;
+   VB->ClipMask = store->clipmask;
+
+   return GL_TRUE;
+}
 
 
 /**
@@ -52,7 +172,6 @@
  * real dependencies on the rest of swrast.  It should probably be
  * moved into main/ someday.
  */
-
 static void
 vp_fetch_texel(GLcontext *ctx, const GLfloat texcoord[4], GLfloat lambda,
                GLuint unit, GLfloat color[4])
@@ -85,22 +204,6 @@ _tnl_program_string(GLcontext *ctx, GLenum target, struct gl_program *program)
 }
 
 
-/*!
- * Private storage for the vertex program pipeline stage.
- */
-struct vp_stage_data {
-   /** The results of running the vertex program go into these arrays. */
-   GLvector4f results[VERT_RESULT_MAX];
-
-   GLvector4f ndcCoords;              /**< normalized device coords */
-   GLubyte *clipmask;                 /**< clip flags */
-   GLubyte ormask, andmask;           /**< for clipping */
-};
-
-
-#define VP_STAGE_DATA(stage) ((struct vp_stage_data *)(stage->privatePtr))
-
-
 /**
  * Initialize virtual machine state prior to executing vertex program.
  */
@@ -139,96 +242,50 @@ init_machine(GLcontext *ctx, struct gl_program_machine *machine)
 
    machine->FetchTexelLod = vp_fetch_texel;
    machine->FetchTexelDeriv = NULL; /* not used by vertex programs */
+
+   machine->Samplers = ctx->VertexProgram._Current->Base.SamplerUnits;
 }
 
 
 /**
- * Copy the 16 elements of a matrix into four consecutive program
- * registers starting at 'pos'.
+ * Map the texture images which the vertex program will access (if any).
  */
 static void
-load_matrix(GLfloat registers[][4], GLuint pos, const GLfloat mat[16])
+map_textures(GLcontext *ctx, const struct gl_vertex_program *vp)
 {
-   GLuint i;
-   for (i = 0; i < 4; i++) {
-      registers[pos + i][0] = mat[0 + i];
-      registers[pos + i][1] = mat[4 + i];
-      registers[pos + i][2] = mat[8 + i];
-      registers[pos + i][3] = mat[12 + i];
-   }
-}
+   GLuint u;
 
+   if (!ctx->Driver.MapTexture)
+      return;
 
-/**
- * As above, but transpose the matrix.
- */
-static void
-load_transpose_matrix(GLfloat registers[][4], GLuint pos,
-                      const GLfloat mat[16])
-{
-   MEMCPY(registers[pos], mat, 16 * sizeof(GLfloat));
+   for (u = 0; u < ctx->Const.MaxVertexTextureImageUnits; u++) {
+      if (vp->Base.TexturesUsed[u]) {
+         /* Note: _Current *should* correspond to the target indicated
+          * in TexturesUsed[u].
+          */
+         ctx->Driver.MapTexture(ctx, ctx->Texture.Unit[u]._Current);
+      }
+   }
 }
 
 
 /**
- * Load current vertex program's parameter registers with tracked
- * matrices (if NV program).  This only needs to be done per
- * glBegin/glEnd, not per-vertex.
+ * Unmap the texture images which were used by the vertex program (if any).
  */
-void
-_mesa_load_tracked_matrices(GLcontext *ctx)
+static void
+unmap_textures(GLcontext *ctx, const struct gl_vertex_program *vp)
 {
-   GLuint i;
+   GLuint u;
 
-   for (i = 0; i < MAX_NV_VERTEX_PROGRAM_PARAMS / 4; i++) {
-      /* point 'mat' at source matrix */
-      GLmatrix *mat;
-      if (ctx->VertexProgram.TrackMatrix[i] == GL_MODELVIEW) {
-         mat = ctx->ModelviewMatrixStack.Top;
-      }
-      else if (ctx->VertexProgram.TrackMatrix[i] == GL_PROJECTION) {
-         mat = ctx->ProjectionMatrixStack.Top;
-      }
-      else if (ctx->VertexProgram.TrackMatrix[i] == GL_TEXTURE) {
-         mat = ctx->TextureMatrixStack[ctx->Texture.CurrentUnit].Top;
-      }
-      else if (ctx->VertexProgram.TrackMatrix[i] == GL_COLOR) {
-         mat = ctx->ColorMatrixStack.Top;
-      }
-      else if (ctx->VertexProgram.TrackMatrix[i]==GL_MODELVIEW_PROJECTION_NV) {
-         /* XXX verify the combined matrix is up to date */
-         mat = &ctx->_ModelProjectMatrix;
-      }
-      else if (ctx->VertexProgram.TrackMatrix[i] >= GL_MATRIX0_NV &&
-               ctx->VertexProgram.TrackMatrix[i] <= GL_MATRIX7_NV) {
-         GLuint n = ctx->VertexProgram.TrackMatrix[i] - GL_MATRIX0_NV;
-         ASSERT(n < MAX_PROGRAM_MATRICES);
-         mat = ctx->ProgramMatrixStack[n].Top;
-      }
-      else {
-         /* no matrix is tracked, but we leave the register values as-is */
-         assert(ctx->VertexProgram.TrackMatrix[i] == GL_NONE);
-         continue;
-      }
+   if (!ctx->Driver.MapTexture)
+      return;
 
-      /* load the matrix values into sequential registers */
-      if (ctx->VertexProgram.TrackMatrixTransform[i] == GL_IDENTITY_NV) {
-         load_matrix(ctx->VertexProgram.Parameters, i*4, mat->m);
-      }
-      else if (ctx->VertexProgram.TrackMatrixTransform[i] == GL_INVERSE_NV) {
-         _math_matrix_analyse(mat); /* update the inverse */
-         ASSERT(!_math_matrix_is_dirty(mat));
-         load_matrix(ctx->VertexProgram.Parameters, i*4, mat->inv);
-      }
-      else if (ctx->VertexProgram.TrackMatrixTransform[i] == GL_TRANSPOSE_NV) {
-         load_transpose_matrix(ctx->VertexProgram.Parameters, i*4, mat->m);
-      }
-      else {
-         assert(ctx->VertexProgram.TrackMatrixTransform[i]
-                == GL_INVERSE_TRANSPOSE_NV);
-         _math_matrix_analyse(mat); /* update the inverse */
-         ASSERT(!_math_matrix_is_dirty(mat));
-         load_transpose_matrix(ctx->VertexProgram.Parameters, i*4, mat->inv);
+   for (u = 0; u < ctx->Const.MaxVertexTextureImageUnits; u++) {
+      if (vp->Base.TexturesUsed[u]) {
+         /* Note: _Current *should* correspond to the target indicated
+          * in TexturesUsed[u].
+          */
+         ctx->Driver.UnmapTexture(ctx, ctx->Texture.Unit[u]._Current);
       }
    }
 }
@@ -259,6 +316,7 @@ run_vp( GLcontext *ctx, struct tnl_pipeline_stage *stage )
       _mesa_load_state_parameters(ctx, program->Base.Parameters);
    }
 
+   /* make list of outputs to save some time below */
    numOutputs = 0;
    for (i = 0; i < VERT_RESULT_MAX; i++) {
       if (program->Base.OutputsWritten & (1 << i)) {
@@ -266,6 +324,8 @@ run_vp( GLcontext *ctx, struct tnl_pipeline_stage *stage )
       }
    }
 
+   map_textures(ctx, program);
+
    for (i = 0; i < VB->Count; i++) {
       GLuint attr;
 
@@ -317,6 +377,8 @@ run_vp( GLcontext *ctx, struct tnl_pipeline_stage *stage )
 #endif
    }
 
+   unmap_textures(ctx, program);
+
    /* Fixup fog and point size results if needed */
    if (program->IsNVProgram) {
       if (ctx->Fog.Enabled &&
@@ -334,12 +396,39 @@ run_vp( GLcontext *ctx, struct tnl_pipeline_stage *stage )
       }
    }
 
-   /* Setup the VB pointers so that the next pipeline stages get
-    * their data from the right place (the program output arrays).
-    */
-   VB->ClipPtr = &store->results[VERT_RESULT_HPOS];
-   VB->ClipPtr->size = 4;
-   VB->ClipPtr->count = VB->Count;
+   if (program->IsPositionInvariant) {
+      /* We need the exact same transform as in the fixed function path here
+       * to guarantee invariance, depending on compiler optimization flags
+       * results could be different otherwise.
+       */
+      VB->ClipPtr = TransformRaw( &store->results[0],
+				  &ctx->_ModelProjectMatrix,
+				  VB->AttribPtr[0] );
+
+      /* Drivers expect this to be clean to element 4...
+       */
+      switch (VB->ClipPtr->size) {
+      case 1:
+	 /* impossible */
+      case 2:
+	 _mesa_vector4f_clean_elem( VB->ClipPtr, VB->Count, 2 );
+	 /* fall-through */
+      case 3:
+	 _mesa_vector4f_clean_elem( VB->ClipPtr, VB->Count, 3 );
+	 /* fall-through */
+      case 4:
+	 break;
+      }
+   }
+   else {
+      /* Setup the VB pointers so that the next pipeline stages get
+       * their data from the right place (the program output arrays).
+       */
+      VB->ClipPtr = &store->results[VERT_RESULT_HPOS];
+      VB->ClipPtr->size = 4;
+      VB->ClipPtr->count = VB->Count;
+   }
+
    VB->ColorPtr[0] = &store->results[VERT_RESULT_COL0];
    VB->ColorPtr[1] = &store->results[VERT_RESULT_BFC0];
    VB->SecondaryColorPtr[0] = &store->results[VERT_RESULT_COL1];
@@ -365,41 +454,10 @@ run_vp( GLcontext *ctx, struct tnl_pipeline_stage *stage )
       }
    }
 
-   /* Cliptest and perspective divide.  Clip functions must clear
-    * the clipmask.
-    */
-   store->ormask = 0;
-   store->andmask = CLIP_FRUSTUM_BITS;
-
-   if (tnl->NeedNdcCoords) {
-      VB->NdcPtr =
-         _mesa_clip_tab[VB->ClipPtr->size]( VB->ClipPtr,
-                                            &store->ndcCoords,
-                                            store->clipmask,
-                                            &store->ormask,
-                                            &store->andmask );
-   }
-   else {
-      VB->NdcPtr = NULL;
-      _mesa_clip_np_tab[VB->ClipPtr->size]( VB->ClipPtr,
-                                            NULL,
-                                            store->clipmask,
-                                            &store->ormask,
-                                            &store->andmask );
-   }
-
-   if (store->andmask)  /* All vertices are outside the frustum */
-      return GL_FALSE;
 
-
-   /* This is where we'd do clip testing against the user-defined
-    * clipping planes, but they're not supported by vertex programs.
+   /* Perform NDC and cliptest operations:
     */
-
-   VB->ClipOrMask = store->ormask;
-   VB->ClipMask = store->clipmask;
-
-   return GL_TRUE;
+   return do_ndc_cliptest(ctx, store);
 }
 
 
diff --git a/src/mesa/tnl/t_vb_texgen.c b/src/mesa/tnl/t_vb_texgen.c
index 14d3876e54..7c1819b223 100644
--- a/src/mesa/tnl/t_vb_texgen.c
+++ b/src/mesa/tnl/t_vb_texgen.c
@@ -367,16 +367,16 @@ static void texgen( GLcontext *ctx,
 
    if (texUnit->TexGenEnabled & S_BIT) {
       GLuint i;
-      switch (texUnit->GenModeS) {
+      switch (texUnit->GenS.Mode) {
       case GL_OBJECT_LINEAR:
 	 _mesa_dotprod_tab[obj->size]( (GLfloat *)out->data,
 				       sizeof(out->data[0]), obj,
-				       texUnit->ObjectPlaneS );
+				       texUnit->GenS.ObjectPlane );
 	 break;
       case GL_EYE_LINEAR:
 	 _mesa_dotprod_tab[eye->size]( (GLfloat *)out->data,
 				       sizeof(out->data[0]), eye,
-				       texUnit->EyePlaneS );
+				       texUnit->GenS.EyePlane );
 	 break;
       case GL_SPHERE_MAP:
          for (i = 0; i < count; i++)
@@ -400,16 +400,16 @@ static void texgen( GLcontext *ctx,
 
    if (texUnit->TexGenEnabled & T_BIT) {
       GLuint i;
-      switch (texUnit->GenModeT) {
+      switch (texUnit->GenT.Mode) {
       case GL_OBJECT_LINEAR:
 	 _mesa_dotprod_tab[obj->size]( &(out->data[0][1]),
 				       sizeof(out->data[0]), obj,
-				       texUnit->ObjectPlaneT );
+				       texUnit->GenT.ObjectPlane );
 	 break;
       case GL_EYE_LINEAR:
 	 _mesa_dotprod_tab[eye->size]( &(out->data[0][1]),
 				       sizeof(out->data[0]), eye,
-				       texUnit->EyePlaneT );
+				       texUnit->GenT.EyePlane );
 	 break;
       case GL_SPHERE_MAP:
          for (i = 0; i < count; i++)
@@ -433,16 +433,16 @@ static void texgen( GLcontext *ctx,
 
    if (texUnit->TexGenEnabled & R_BIT) {
       GLuint i;
-      switch (texUnit->GenModeR) {
+      switch (texUnit->GenR.Mode) {
       case GL_OBJECT_LINEAR:
 	 _mesa_dotprod_tab[obj->size]( &(out->data[0][2]),
 				       sizeof(out->data[0]), obj,
-				       texUnit->ObjectPlaneR );
+				       texUnit->GenR.ObjectPlane );
 	 break;
       case GL_EYE_LINEAR:
 	 _mesa_dotprod_tab[eye->size]( &(out->data[0][2]),
 				       sizeof(out->data[0]), eye,
-				       texUnit->EyePlaneR );
+				       texUnit->GenR.EyePlane );
 	 break;
       case GL_REFLECTION_MAP_NV:
 	 for (i=0;i<count;i++)
@@ -461,16 +461,16 @@ static void texgen( GLcontext *ctx,
    }
 
    if (texUnit->TexGenEnabled & Q_BIT) {
-      switch (texUnit->GenModeQ) {
+      switch (texUnit->GenQ.Mode) {
       case GL_OBJECT_LINEAR:
 	 _mesa_dotprod_tab[obj->size]( &(out->data[0][3]),
 				       sizeof(out->data[0]), obj,
-				       texUnit->ObjectPlaneQ );
+				       texUnit->GenQ.ObjectPlane );
 	 break;
       case GL_EYE_LINEAR:
 	 _mesa_dotprod_tab[eye->size]( &(out->data[0][3]),
 				       sizeof(out->data[0]), eye,
-				       texUnit->EyePlaneQ );
+				       texUnit->GenQ.EyePlane );
 	 break;
       default:
 	 _mesa_problem(ctx, "Bad Q texgen");
diff --git a/src/mesa/tnl/t_vertex.c b/src/mesa/tnl/t_vertex.c
index b661524c87..fe4209ae57 100644
--- a/src/mesa/tnl/t_vertex.c
+++ b/src/mesa/tnl/t_vertex.c
@@ -376,6 +376,22 @@ void _tnl_notify_pipeline_output_change( GLcontext *ctx )
    invalidate_funcs(vtx);
 }
 
+
+static void adjust_input_ptrs( GLcontext *ctx, GLint diff)
+{
+   struct vertex_buffer *VB = &TNL_CONTEXT(ctx)->vb;
+   struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
+   struct tnl_clipspace_attr *a = vtx->attr;
+   const GLuint count = vtx->attr_count;
+   int j;
+
+   diff -= 1;
+   for (j=0; j<count; ++j) {
+           register GLvector4f *vptr = VB->AttribPtr[a->attrib];
+	   (a++)->inputptr += diff*vptr->stride;
+   }
+}
+
 static void update_input_ptrs( GLcontext *ctx, GLuint start )
 {
    struct vertex_buffer *VB = &TNL_CONTEXT(ctx)->vb;
@@ -431,13 +447,40 @@ void *_tnl_emit_vertices_to_buffer( GLcontext *ctx,
    struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
 
    update_input_ptrs(ctx, start);
-
    /* Note: dest should not be adjusted for non-zero 'start' values:
     */
    vtx->emit( ctx, end - start, (GLubyte*) dest );	
    return (void *)((GLubyte *)dest + vtx->vertex_size * (end - start));
 }
 
+/* Emit indexed VB vertices start..end to dest.  Note that VB vertex at
+ * postion start will be emitted to dest at position zero.
+ */
+
+void *_tnl_emit_indexed_vertices_to_buffer( GLcontext *ctx,
+					    const GLuint *elts,
+					    GLuint start,
+					    GLuint end,
+					    void *dest )
+{
+   struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
+   GLuint oldIndex;
+   GLubyte *cdest = dest;
+
+   update_input_ptrs(ctx, oldIndex = elts[start++]);
+   vtx->emit( ctx, 1, cdest );
+   cdest += vtx->vertex_size;
+
+   for (; start < end; ++start) {
+      adjust_input_ptrs(ctx, elts[start] - oldIndex);
+      oldIndex = elts[start];
+      vtx->emit( ctx, 1, cdest);
+      cdest += vtx->vertex_size;
+   }
+
+   return (void *) cdest;
+}
+
 
 void _tnl_init_vertices( GLcontext *ctx, 
 			GLuint vb_size,
@@ -492,27 +535,30 @@ void _tnl_init_vertices( GLcontext *ctx,
 
 void _tnl_free_vertices( GLcontext *ctx )
 {
-   struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
-   struct tnl_clipspace_fastpath *fp, *tmp;
+   TNLcontext *tnl = TNL_CONTEXT(ctx);
+   if (tnl) {
+      struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
+      struct tnl_clipspace_fastpath *fp, *tmp;
 
-   if (vtx->vertex_buf) {
-      ALIGN_FREE(vtx->vertex_buf);
-      vtx->vertex_buf = NULL;
-   }
-   
-   for (fp = vtx->fastpath ; fp ; fp = tmp) {
-      tmp = fp->next;
-      FREE(fp->attr);
-
-      /* KW: At the moment, fp->func is constrained to be allocated by
-       * _mesa_exec_alloc(), as the hardwired fastpaths in
-       * t_vertex_generic.c are handled specially.  It would be nice
-       * to unify them, but this probably won't change until this
-       * module gets another overhaul.
-       */
-      _mesa_exec_free((void *) fp->func);
-      FREE(fp);
+      if (vtx->vertex_buf) {
+         ALIGN_FREE(vtx->vertex_buf);
+         vtx->vertex_buf = NULL;
+      }
+
+      for (fp = vtx->fastpath ; fp ; fp = tmp) {
+         tmp = fp->next;
+         FREE(fp->attr);
+
+         /* KW: At the moment, fp->func is constrained to be allocated by
+          * _mesa_exec_alloc(), as the hardwired fastpaths in
+          * t_vertex_generic.c are handled specially.  It would be nice
+          * to unify them, but this probably won't change until this
+          * module gets another overhaul.
+          */
+         _mesa_exec_free((void *) fp->func);
+         FREE(fp);
+      }
+
+      vtx->fastpath = NULL;
    }
-   
-   vtx->fastpath = NULL;
 }
diff --git a/src/mesa/tnl/t_vertex.h b/src/mesa/tnl/t_vertex.h
index 712311a146..2dfd7b57f0 100644
--- a/src/mesa/tnl/t_vertex.h
+++ b/src/mesa/tnl/t_vertex.h
@@ -119,6 +119,18 @@ extern void *_tnl_emit_vertices_to_buffer( GLcontext *ctx,
 					   GLuint end,
 					   void *dest );
 
+/* This function isn't optimal. Check out 
+ * gallium/auxilary/translate for a more comprehensive implementation of
+ * the same functionality.
+ */
+  
+extern void *_tnl_emit_indexed_vertices_to_buffer( GLcontext *ctx,
+						   const GLuint *elts,
+						   GLuint start,
+						   GLuint end,
+						   void *dest );
+
+
 extern void _tnl_build_vertices( GLcontext *ctx,
 				 GLuint start,
 				 GLuint end,
diff --git a/src/mesa/tnl/t_vertex_generic.c b/src/mesa/tnl/t_vertex_generic.c
index c399423b9e..9812f8c808 100644
--- a/src/mesa/tnl/t_vertex_generic.c
+++ b/src/mesa/tnl/t_vertex_generic.c
@@ -29,11 +29,17 @@
 #include "main/glheader.h"
 #include "main/context.h"
 #include "main/colormac.h"
+#include "main/simple_list.h"
 #include "t_context.h"
 #include "t_vertex.h"
-#include "simple_list.h"
 
 
+#if 0
+#define DEBUG_INSERT printf("%s\n", __FUNCTION__)
+#else
+#define DEBUG_INSERT
+#endif
+
 
 /*
  * These functions take the NDC coordinates pointed to by 'in', apply the
@@ -45,7 +51,7 @@ static INLINE void insert_4f_viewport_4( const struct tnl_clipspace_attr *a, GLu
 {
    GLfloat *out = (GLfloat *)v;
    const GLfloat * const vp = a->vp;
-   
+   DEBUG_INSERT;
    out[0] = vp[0] * in[0] + vp[12];
    out[1] = vp[5] * in[1] + vp[13];
    out[2] = vp[10] * in[2] + vp[14];
@@ -57,7 +63,7 @@ static INLINE void insert_4f_viewport_3( const struct tnl_clipspace_attr *a, GLu
 {
    GLfloat *out = (GLfloat *)v;
    const GLfloat * const vp = a->vp;
-   
+   DEBUG_INSERT;
    out[0] = vp[0] * in[0] + vp[12];
    out[1] = vp[5] * in[1] + vp[13];
    out[2] = vp[10] * in[2] + vp[14];
@@ -69,7 +75,7 @@ static INLINE void insert_4f_viewport_2( const struct tnl_clipspace_attr *a, GLu
 {
    GLfloat *out = (GLfloat *)v;
    const GLfloat * const vp = a->vp;
-   
+   DEBUG_INSERT;
    out[0] = vp[0] * in[0] + vp[12];
    out[1] = vp[5] * in[1] + vp[13];
    out[2] = vp[14];
@@ -81,7 +87,7 @@ static INLINE void insert_4f_viewport_1( const struct tnl_clipspace_attr *a, GLu
 {
    GLfloat *out = (GLfloat *)v;
    const GLfloat * const vp = a->vp;
-   
+   DEBUG_INSERT;
    out[0] = vp[0] * in[0] + vp[12];
    out[1] = vp[13];
    out[2] = vp[14];
@@ -93,7 +99,7 @@ static INLINE void insert_3f_viewport_3( const struct tnl_clipspace_attr *a, GLu
 {
    GLfloat *out = (GLfloat *)v;
    const GLfloat * const vp = a->vp;
-   
+   DEBUG_INSERT;
    out[0] = vp[0] * in[0] + vp[12];
    out[1] = vp[5] * in[1] + vp[13];
    out[2] = vp[10] * in[2] + vp[14];
@@ -104,10 +110,10 @@ static INLINE void insert_3f_viewport_2( const struct tnl_clipspace_attr *a, GLu
 {
    GLfloat *out = (GLfloat *)v;
    const GLfloat * const vp = a->vp;
-   
+   DEBUG_INSERT;
    out[0] = vp[0] * in[0] + vp[12];
    out[1] = vp[5] * in[1] + vp[13];
-   out[2] = vp[10] * in[2] + vp[14];
+   out[2] = vp[14];
 }
 
 static INLINE void insert_3f_viewport_1( const struct tnl_clipspace_attr *a, GLubyte *v,
@@ -115,7 +121,7 @@ static INLINE void insert_3f_viewport_1( const struct tnl_clipspace_attr *a, GLu
 {
    GLfloat *out = (GLfloat *)v;
    const GLfloat * const vp = a->vp;
-   
+   DEBUG_INSERT;
    out[0] = vp[0] * in[0] + vp[12];
    out[1] = vp[13];
    out[2] = vp[14];
@@ -126,7 +132,7 @@ static INLINE void insert_2f_viewport_2( const struct tnl_clipspace_attr *a, GLu
 {
    GLfloat *out = (GLfloat *)v;
    const GLfloat * const vp = a->vp;
-   
+   DEBUG_INSERT;
    out[0] = vp[0] * in[0] + vp[12];
    out[1] = vp[5] * in[1] + vp[13];
 }
@@ -136,7 +142,7 @@ static INLINE void insert_2f_viewport_1( const struct tnl_clipspace_attr *a, GLu
 {
    GLfloat *out = (GLfloat *)v;
    const GLfloat * const vp = a->vp;
-   
+   DEBUG_INSERT;
    out[0] = vp[0] * in[0] + vp[12];
    out[1] = vp[13];
 }
@@ -150,7 +156,7 @@ static INLINE void insert_4f_4( const struct tnl_clipspace_attr *a, GLubyte *v,
 {
    GLfloat *out = (GLfloat *)(v);
    (void) a;
-   
+   DEBUG_INSERT;
    out[0] = in[0];
    out[1] = in[1];
    out[2] = in[2];
@@ -161,7 +167,7 @@ static INLINE void insert_4f_3( const struct tnl_clipspace_attr *a, GLubyte *v,
 {
    GLfloat *out = (GLfloat *)(v);
    (void) a;
-   
+   DEBUG_INSERT;
    out[0] = in[0];
    out[1] = in[1];
    out[2] = in[2];
@@ -172,7 +178,7 @@ static INLINE void insert_4f_2( const struct tnl_clipspace_attr *a, GLubyte *v,
 {
    GLfloat *out = (GLfloat *)(v);
    (void) a;
-   
+   DEBUG_INSERT;
    out[0] = in[0];
    out[1] = in[1];
    out[2] = 0;
@@ -183,7 +189,7 @@ static INLINE void insert_4f_1( const struct tnl_clipspace_attr *a, GLubyte *v,
 {
    GLfloat *out = (GLfloat *)(v);
    (void) a;
-   
+   DEBUG_INSERT;
    out[0] = in[0];
    out[1] = 0;
    out[2] = 0;
@@ -194,7 +200,7 @@ static INLINE void insert_3f_xyw_4( const struct tnl_clipspace_attr *a, GLubyte
 {
    GLfloat *out = (GLfloat *)(v);
    (void) a;
-   
+   DEBUG_INSERT;
    out[0] = in[0];
    out[1] = in[1];
    out[2] = in[3];
@@ -203,6 +209,7 @@ static INLINE void insert_3f_xyw_4( const struct tnl_clipspace_attr *a, GLubyte
 static INLINE void insert_3f_xyw_err( const struct tnl_clipspace_attr *a, GLubyte *v, const GLfloat *in )
 {
    (void) a; (void) v; (void) in;
+   DEBUG_INSERT;
    _mesa_exit(1);
 }
 
@@ -210,7 +217,7 @@ static INLINE void insert_3f_3( const struct tnl_clipspace_attr *a, GLubyte *v,
 {
    GLfloat *out = (GLfloat *)(v);
    (void) a;
-   
+   DEBUG_INSERT;
    out[0] = in[0];
    out[1] = in[1];
    out[2] = in[2];
@@ -220,7 +227,7 @@ static INLINE void insert_3f_2( const struct tnl_clipspace_attr *a, GLubyte *v,
 {
    GLfloat *out = (GLfloat *)(v);
    (void) a;
-   
+   DEBUG_INSERT;
    out[0] = in[0];
    out[1] = in[1];
    out[2] = 0;
@@ -230,7 +237,7 @@ static INLINE void insert_3f_1( const struct tnl_clipspace_attr *a, GLubyte *v,
 {
    GLfloat *out = (GLfloat *)(v);
    (void) a;
-   
+   DEBUG_INSERT;
    out[0] = in[0];
    out[1] = 0;
    out[2] = 0;
@@ -241,7 +248,7 @@ static INLINE void insert_2f_2( const struct tnl_clipspace_attr *a, GLubyte *v,
 {
    GLfloat *out = (GLfloat *)(v);
    (void) a;
-   
+   DEBUG_INSERT;
    out[0] = in[0];
    out[1] = in[1];
 }
@@ -250,7 +257,7 @@ static INLINE void insert_2f_1( const struct tnl_clipspace_attr *a, GLubyte *v,
 {
    GLfloat *out = (GLfloat *)(v);
    (void) a;
-   
+   DEBUG_INSERT;
    out[0] = in[0];
    out[1] = 0;
 }
@@ -259,12 +266,13 @@ static INLINE void insert_1f_1( const struct tnl_clipspace_attr *a, GLubyte *v,
 {
    GLfloat *out = (GLfloat *)(v);
    (void) a;
-
+   DEBUG_INSERT;
    out[0] = in[0];
 }
 
 static INLINE void insert_null( const struct tnl_clipspace_attr *a, GLubyte *v, const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a; (void) v; (void) in;
 }
 
@@ -272,6 +280,7 @@ static INLINE void insert_4chan_4f_rgba_4( const struct tnl_clipspace_attr *a, G
 				  const GLfloat *in )
 {
    GLchan *c = (GLchan *)v;
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_CHAN(c[0], in[0]); 
    UNCLAMPED_FLOAT_TO_CHAN(c[1], in[1]); 
@@ -283,6 +292,7 @@ static INLINE void insert_4chan_4f_rgba_3( const struct tnl_clipspace_attr *a, G
 				  const GLfloat *in )
 {
    GLchan *c = (GLchan *)v;
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_CHAN(c[0], in[0]); 
    UNCLAMPED_FLOAT_TO_CHAN(c[1], in[1]); 
@@ -294,6 +304,7 @@ static INLINE void insert_4chan_4f_rgba_2( const struct tnl_clipspace_attr *a, G
 				  const GLfloat *in )
 {
    GLchan *c = (GLchan *)v;
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_CHAN(c[0], in[0]); 
    UNCLAMPED_FLOAT_TO_CHAN(c[1], in[1]); 
@@ -305,6 +316,7 @@ static INLINE void insert_4chan_4f_rgba_1( const struct tnl_clipspace_attr *a, G
 				  const GLfloat *in )
 {
    GLchan *c = (GLchan *)v;
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_CHAN(c[0], in[0]); 
    c[1] = 0;
@@ -315,6 +327,7 @@ static INLINE void insert_4chan_4f_rgba_1( const struct tnl_clipspace_attr *a, G
 static INLINE void insert_4ub_4f_rgba_4( const struct tnl_clipspace_attr *a, GLubyte *v, 
 				const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
    UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
@@ -325,6 +338,7 @@ static INLINE void insert_4ub_4f_rgba_4( const struct tnl_clipspace_attr *a, GLu
 static INLINE void insert_4ub_4f_rgba_3( const struct tnl_clipspace_attr *a, GLubyte *v, 
 				const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
    UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
@@ -335,6 +349,7 @@ static INLINE void insert_4ub_4f_rgba_3( const struct tnl_clipspace_attr *a, GLu
 static INLINE void insert_4ub_4f_rgba_2( const struct tnl_clipspace_attr *a, GLubyte *v, 
 				const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
    UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
@@ -345,6 +360,7 @@ static INLINE void insert_4ub_4f_rgba_2( const struct tnl_clipspace_attr *a, GLu
 static INLINE void insert_4ub_4f_rgba_1( const struct tnl_clipspace_attr *a, GLubyte *v, 
 				const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
    v[1] = 0;
@@ -355,6 +371,7 @@ static INLINE void insert_4ub_4f_rgba_1( const struct tnl_clipspace_attr *a, GLu
 static INLINE void insert_4ub_4f_bgra_4( const struct tnl_clipspace_attr *a, GLubyte *v, 
 				const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
    UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
@@ -365,6 +382,7 @@ static INLINE void insert_4ub_4f_bgra_4( const struct tnl_clipspace_attr *a, GLu
 static INLINE void insert_4ub_4f_bgra_3( const struct tnl_clipspace_attr *a, GLubyte *v, 
 				const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
    UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
@@ -375,6 +393,7 @@ static INLINE void insert_4ub_4f_bgra_3( const struct tnl_clipspace_attr *a, GLu
 static INLINE void insert_4ub_4f_bgra_2( const struct tnl_clipspace_attr *a, GLubyte *v, 
 				const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
    UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
@@ -385,6 +404,7 @@ static INLINE void insert_4ub_4f_bgra_2( const struct tnl_clipspace_attr *a, GLu
 static INLINE void insert_4ub_4f_bgra_1( const struct tnl_clipspace_attr *a, GLubyte *v, 
 				const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
    v[1] = 0;
@@ -395,6 +415,7 @@ static INLINE void insert_4ub_4f_bgra_1( const struct tnl_clipspace_attr *a, GLu
 static INLINE void insert_4ub_4f_argb_4( const struct tnl_clipspace_attr *a, GLubyte *v, 
 				const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[0]);
    UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
@@ -405,6 +426,7 @@ static INLINE void insert_4ub_4f_argb_4( const struct tnl_clipspace_attr *a, GLu
 static INLINE void insert_4ub_4f_argb_3( const struct tnl_clipspace_attr *a, GLubyte *v, 
 				const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[0]);
    UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
@@ -415,6 +437,7 @@ static INLINE void insert_4ub_4f_argb_3( const struct tnl_clipspace_attr *a, GLu
 static INLINE void insert_4ub_4f_argb_2( const struct tnl_clipspace_attr *a, GLubyte *v, 
 				const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[0]);
    UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
@@ -425,6 +448,7 @@ static INLINE void insert_4ub_4f_argb_2( const struct tnl_clipspace_attr *a, GLu
 static INLINE void insert_4ub_4f_argb_1( const struct tnl_clipspace_attr *a, GLubyte *v, 
 				const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[0]);
    v[2] = 0x00;
@@ -435,6 +459,7 @@ static INLINE void insert_4ub_4f_argb_1( const struct tnl_clipspace_attr *a, GLu
 static INLINE void insert_4ub_4f_abgr_4( const struct tnl_clipspace_attr *a, GLubyte *v, 
 				const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[0]);
    UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
@@ -445,6 +470,7 @@ static INLINE void insert_4ub_4f_abgr_4( const struct tnl_clipspace_attr *a, GLu
 static INLINE void insert_4ub_4f_abgr_3( const struct tnl_clipspace_attr *a, GLubyte *v, 
 				const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[0]);
    UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
@@ -455,6 +481,7 @@ static INLINE void insert_4ub_4f_abgr_3( const struct tnl_clipspace_attr *a, GLu
 static INLINE void insert_4ub_4f_abgr_2( const struct tnl_clipspace_attr *a, GLubyte *v, 
 				const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[0]);
    UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
@@ -465,6 +492,7 @@ static INLINE void insert_4ub_4f_abgr_2( const struct tnl_clipspace_attr *a, GLu
 static INLINE void insert_4ub_4f_abgr_1( const struct tnl_clipspace_attr *a, GLubyte *v, 
 				const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[0]);
    v[2] = 0x00;
@@ -475,6 +503,7 @@ static INLINE void insert_4ub_4f_abgr_1( const struct tnl_clipspace_attr *a, GLu
 static INLINE void insert_3ub_3f_rgb_3( const struct tnl_clipspace_attr *a, GLubyte *v, 
 			       const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
    UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
@@ -484,6 +513,7 @@ static INLINE void insert_3ub_3f_rgb_3( const struct tnl_clipspace_attr *a, GLub
 static INLINE void insert_3ub_3f_rgb_2( const struct tnl_clipspace_attr *a, GLubyte *v, 
 			       const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
    UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
@@ -493,6 +523,7 @@ static INLINE void insert_3ub_3f_rgb_2( const struct tnl_clipspace_attr *a, GLub
 static INLINE void insert_3ub_3f_rgb_1( const struct tnl_clipspace_attr *a, GLubyte *v, 
 			       const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
    v[1] = 0;
@@ -502,6 +533,7 @@ static INLINE void insert_3ub_3f_rgb_1( const struct tnl_clipspace_attr *a, GLub
 static INLINE void insert_3ub_3f_bgr_3( const struct tnl_clipspace_attr *a, GLubyte *v, 
 				 const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
    UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
@@ -511,6 +543,7 @@ static INLINE void insert_3ub_3f_bgr_3( const struct tnl_clipspace_attr *a, GLub
 static INLINE void insert_3ub_3f_bgr_2( const struct tnl_clipspace_attr *a, GLubyte *v, 
 				 const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
    UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
@@ -520,6 +553,7 @@ static INLINE void insert_3ub_3f_bgr_2( const struct tnl_clipspace_attr *a, GLub
 static INLINE void insert_3ub_3f_bgr_1( const struct tnl_clipspace_attr *a, GLubyte *v, 
 				 const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
    v[1] = 0;
@@ -530,6 +564,7 @@ static INLINE void insert_3ub_3f_bgr_1( const struct tnl_clipspace_attr *a, GLub
 static INLINE void insert_1ub_1f_1( const struct tnl_clipspace_attr *a, GLubyte *v, 
 			   const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
 }
@@ -551,6 +586,7 @@ static void extract_4f_viewport( const struct tnl_clipspace_attr *a, GLfloat *ou
    /* Although included for completeness, the position coordinate is
     * usually handled differently during clipping.
     */
+   DEBUG_INSERT;
    out[0] = (in[0] - vp[12]) / vp[0];
    out[1] = (in[1] - vp[13]) / vp[5];
    out[2] = (in[2] - vp[14]) / vp[10];
@@ -562,7 +598,7 @@ static void extract_3f_viewport( const struct tnl_clipspace_attr *a, GLfloat *ou
 {
    const GLfloat *in = (const GLfloat *)v;
    const GLfloat * const vp = a->vp;
-   
+   DEBUG_INSERT;
    out[0] = (in[0] - vp[12]) / vp[0];
    out[1] = (in[1] - vp[13]) / vp[5];
    out[2] = (in[2] - vp[14]) / vp[10];
@@ -575,7 +611,7 @@ static void extract_2f_viewport( const struct tnl_clipspace_attr *a, GLfloat *ou
 {
    const GLfloat *in = (const GLfloat *)v;
    const GLfloat * const vp = a->vp;
-   
+   DEBUG_INSERT;
    out[0] = (in[0] - vp[12]) / vp[0];
    out[1] = (in[1] - vp[13]) / vp[5];
    out[2] = 0;
diff --git a/src/mesa/tnl/t_vertex_sse.c b/src/mesa/tnl/t_vertex_sse.c
index 96def25206..7a255d680a 100644
--- a/src/mesa/tnl/t_vertex_sse.c
+++ b/src/mesa/tnl/t_vertex_sse.c
@@ -28,10 +28,10 @@
 #include "main/glheader.h"
 #include "main/context.h"
 #include "main/colormac.h"
+#include "main/simple_list.h"
+#include "main/enums.h"
 #include "t_context.h"
 #include "t_vertex.h"
-#include "simple_list.h"
-#include "main/enums.h"
 
 #if defined(USE_SSE_ASM)
 
@@ -39,6 +39,12 @@
 #include "x86/common_x86_asm.h"
 
 
+/**
+ * Number of bytes to allocate for generated SSE functions
+ */
+#define MAX_SSE_CODE_SIZE 1024
+
+
 #define X    0
 #define Y    1
 #define Z    2
@@ -140,7 +146,8 @@ static void emit_load3f_1( struct x86_program *p,
 			   struct x86_reg dest,
 			   struct x86_reg arg0 )
 {
-   emit_load4f_1(p, dest, arg0);
+   /* Loading from memory erases the upper bits. */
+   sse_movss(&p->func, dest, arg0);
 }
 
 static void emit_load2f_2( struct x86_program *p, 
@@ -154,7 +161,8 @@ static void emit_load2f_1( struct x86_program *p,
 			   struct x86_reg dest,
 			   struct x86_reg arg0 )
 {
-   emit_load4f_1(p, dest, arg0);
+   /* Loading from memory erases the upper bits. */
+   sse_movss(&p->func, dest, arg0);
 }
 
 static void emit_load1f_1( struct x86_program *p, 
@@ -346,6 +354,7 @@ static GLboolean build_vertex_emit( struct x86_program *p )
    struct x86_reg temp = x86_make_reg(file_XMM, 0);
    struct x86_reg vp0 = x86_make_reg(file_XMM, 1);
    struct x86_reg vp1 = x86_make_reg(file_XMM, 2);
+   struct x86_reg temp2 = x86_make_reg(file_XMM, 3);
    GLubyte *fixup, *label;
 
    /* Push a few regs?
@@ -518,7 +527,8 @@ static GLboolean build_vertex_emit( struct x86_program *p )
 	    sse_shufps(&p->func, temp, temp, SHUF(W,X,Y,Z));
 
 	    get_src_ptr(p, srcECX, vtxESI, &a[1]);
-	    emit_load(p, temp, 1, x86_deref(srcECX), a[1].inputsize);
+	    emit_load(p, temp2, 1, x86_deref(srcECX), a[1].inputsize);
+	    sse_movss(&p->func, temp, temp2);
 	    update_src_ptr(p, srcECX, vtxESI, &a[1]);
 
 	    /* Rearrange and possibly do BGR conversion:
@@ -533,8 +543,8 @@ static GLboolean build_vertex_emit( struct x86_program *p )
 	 }
 	 else {
 	    _mesa_printf("Can't emit 3ub\n");
+	    return GL_FALSE;	/* add this later */
 	 }
-	 return GL_FALSE;	/* add this later */
 	 break;
 
       case EMIT_4UB_4F_RGBA:
@@ -619,7 +629,10 @@ static GLboolean build_vertex_emit( struct x86_program *p )
    x86_pop(&p->func, countEBP);
    x86_ret(&p->func);
 
+   assert(!vtx->emit);
    vtx->emit = (tnl_emit_func)x86_get_func(&p->func);
+
+   assert( (char *) p->func.csr - (char *) p->func.store <= MAX_SSE_CODE_SIZE );
    return GL_TRUE;
 }
 
@@ -644,7 +657,10 @@ void _tnl_generate_sse_emit( GLcontext *ctx )
    p.identity = x86_make_reg(file_XMM, 6);
    p.chan0 = x86_make_reg(file_XMM, 7);
 
-   x86_init_func(&p.func);
+   if (!x86_init_func_size(&p.func, MAX_SSE_CODE_SIZE)) {
+      vtx->emit = NULL;
+      return;
+   }
 
    if (build_vertex_emit(&p)) {
       _tnl_register_fastpath( vtx, GL_TRUE );
diff --git a/src/mesa/tnl/t_vp_build.c b/src/mesa/tnl/t_vp_build.c
index b3b63cd3e4..7be4d95af6 100644
--- a/src/mesa/tnl/t_vp_build.c
+++ b/src/mesa/tnl/t_vp_build.c
@@ -1,8 +1,8 @@
 /*
  * Mesa 3-D graphics library
- * Version:  6.5
+ * Version:  7.1
  *
- * Copyright (C) 2006  Tungsten Graphics   All Rights Reserved.
+ * Copyright (C) 2007  Tungsten Graphics   All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
diff --git a/src/mesa/tnl/tnl.h b/src/mesa/tnl/tnl.h
index 4bdbed92df..4d628aa9a6 100644
--- a/src/mesa/tnl/tnl.h
+++ b/src/mesa/tnl/tnl.h
@@ -1,9 +1,8 @@
-
 /*
  * Mesa 3-D graphics library
- * Version:  3.5
+ * Version:  7.1
  *
- * Copyright (C) 1999-2001  Brian Paul   All Rights Reserved.
+ * Copyright (C) 1999-2007  Brian Paul   All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),