341 files changed, 9042 insertions, 7878 deletions
diff --git a/src/mesa/drivers/dri/Makefile.template b/src/mesa/drivers/dri/Makefile.template
index 1ce9315530..39d25ce3f4 100644
--- a/src/mesa/drivers/dri/Makefile.template
+++ b/src/mesa/drivers/dri/Makefile.template
@@ -60,9 +60,13 @@ SHARED_INCLUDES = \
 
 ##### TARGETS #####
 
-default: symlinks subdirs depend $(LIBNAME) $(TOP)/$(LIB_DIR)/$(LIBNAME)
+default: subdirs lib
 
 
+.PHONY: lib
+lib: symlinks subdirs depend
+	@$(MAKE) $(LIBNAME) $(TOP)/$(LIB_DIR)/$(LIBNAME)
+
 $(LIBNAME): $(OBJECTS) $(MESA_MODULES) $(EXTRA_MODULES) $(WINOBJ) Makefile \
 		$(TOP)/src/mesa/drivers/dri/Makefile.template
 	$(MKLIB) -o $@ -noprefix -linker '$(CC)' -ldflags '$(LDFLAGS)' \
diff --git a/src/mesa/drivers/dri/common/dri_util.c b/src/mesa/drivers/dri/common/dri_util.c
index e48e10d7c0..3649c29666 100644
--- a/src/mesa/drivers/dri/common/dri_util.c
+++ b/src/mesa/drivers/dri/common/dri_util.c
@@ -97,7 +97,7 @@ driIntersectArea( drm_clip_rect_t rect1, drm_clip_rect_t rect2 )
  * 
  * \internal
  * This function calls __DriverAPIRec::UnbindContext, and then decrements
- * __DRIdrawablePrivateRec::refcount which must be non-zero for a successful
+ * __DRIdrawableRec::refcount which must be non-zero for a successful
  * return.
  * 
  * While casting the opaque private pointers associated with the parameters
@@ -167,11 +167,12 @@ static int driBindContext(__DRIcontext *pcp,
 			  __DRIdrawable *pdp,
 			  __DRIdrawable *prp)
 {
-    __DRIscreenPrivate *psp = pcp->driScreenPriv;
+    __DRIscreen *psp = NULL;
 
     /* Bind the drawable to the context */
 
     if (pcp) {
+	psp = pcp->driScreenPriv;
 	pcp->driDrawablePriv = pdp;
 	pcp->driReadablePriv = prp;
 	if (pdp) {
@@ -219,7 +220,7 @@ static int driBindContext(__DRIcontext *pcp,
  *
  * \param pdp pointer to the private drawable information to update.
  * 
- * This function basically updates the __DRIdrawablePrivate struct's
+ * This function basically updates the __DRIdrawable struct's
  * cliprect information by calling \c __DRIinterfaceMethods::getDrawableInfo.
  * This is usually called by the DRI_VALIDATE_DRAWABLE_INFO macro which
  * compares the __DRIdrwablePrivate pStamp and lastStamp values.  If
@@ -227,10 +228,10 @@ static int driBindContext(__DRIcontext *pcp,
  * info.
  */
 void
-__driUtilUpdateDrawableInfo(__DRIdrawablePrivate *pdp)
+__driUtilUpdateDrawableInfo(__DRIdrawable *pdp)
 {
-    __DRIscreenPrivate *psp = pdp->driScreenPriv;
-    __DRIcontextPrivate *pcp = pdp->driContextPriv;
+    __DRIscreen *psp = pdp->driScreenPriv;
+    __DRIcontext *pcp = pdp->driContextPriv;
     
     if (!pcp 
 	|| ((pdp != pcp->driDrawablePriv) && (pdp != pcp->driReadablePriv))) {
@@ -308,7 +309,7 @@ static void driReportDamage(__DRIdrawable *pdp,
  * \param drawablePrivate opaque pointer to the per-drawable private info.
  * 
  * \internal
- * This function calls __DRIdrawablePrivate::swapBuffers.
+ * This function calls __DRIdrawable::swapBuffers.
  * 
  * Is called directly from glXSwapBuffers().
  */
@@ -453,6 +454,7 @@ driCreateNewDrawable(__DRIscreen *psp, const __DRIconfig *config,
 
     pdp->driScreenPriv = psp;
     pdp->driContextPriv = &psp->dummyContextPriv;
+    pdp->validBuffers = GL_FALSE;
 
     if (!(*psp->DriverAPI.CreateBuffer)(psp, pdp, &config->modes,
 					renderType == GLX_PIXMAP_BIT)) {
@@ -496,13 +498,13 @@ static void dri_get_drawable(__DRIdrawable *pdp)
 	
 static void dri_put_drawable(__DRIdrawable *pdp)
 {
-    __DRIscreenPrivate *psp;
-
-    pdp->refcount--;
-    if (pdp->refcount)
-	return;
+    __DRIscreen *psp;
 
     if (pdp) {
+	pdp->refcount--;
+	if (pdp->refcount)
+	    return;
+
 	psp = pdp->driScreenPriv;
         (*psp->DriverAPI.DestroyBuffer)(pdp);
 	if (pdp->pClipRects) {
@@ -559,7 +561,7 @@ driDestroyContext(__DRIcontext *pcp)
  *          success, or \c NULL on failure.
  * 
  * \internal
- * This function allocates and fills a __DRIcontextPrivateRec structure.  It
+ * This function allocates and fills a __DRIcontextRec structure.  It
  * performs some device independent initialization and passes all the
  * relevent information to __DriverAPIRec::CreateContext to create the
  * context.
@@ -840,7 +842,7 @@ const __DRIlegacyExtension driLegacyExtension = {
     driCreateNewContext,
 };
 
-/** Legacy DRI interface */
+/** DRI2 interface */
 const __DRIdri2Extension driDRI2Extension = {
     { __DRI_DRI2, __DRI_DRI2_VERSION },
     dri2CreateNewScreen,
@@ -848,14 +850,6 @@ const __DRIdri2Extension driDRI2Extension = {
     dri2CreateNewContext,
 };
 
-/* This is the table of extensions that the loader will dlsym() for. */
-PUBLIC const __DRIextension *__driDriverExtensions[] = {
-    &driCoreExtension.base,
-    &driLegacyExtension.base,
-    &driDRI2Extension.base,
-    NULL
-};
-
 static int
 driFrameTracking(__DRIdrawable *drawable, GLboolean enable)
 {
@@ -870,7 +864,7 @@ driQueryFrameTracking(__DRIdrawable *dpriv,
    __DRIswapInfo   sInfo;
    int             status;
    int64_t         ust;
-   __DRIscreenPrivate *psp = dpriv->driScreenPriv;
+   __DRIscreen *psp = dpriv->driScreenPriv;
 
    status = dpriv->driScreenPriv->DriverAPI.GetSwapInfo( dpriv, & sInfo );
    if ( status == 0 ) {
@@ -920,14 +914,14 @@ const __DRIframeTrackingExtension driFrameTrackingExtension = {
  *       be possible to cache the sync rate?
  */
 float
-driCalculateSwapUsage( __DRIdrawablePrivate *dPriv, int64_t last_swap_ust,
+driCalculateSwapUsage( __DRIdrawable *dPriv, int64_t last_swap_ust,
 		       int64_t current_ust )
 {
    int32_t   n;
    int32_t   d;
    int       interval;
    float     usage = 1.0;
-   __DRIscreenPrivate *psp = dPriv->driScreenPriv;
+   __DRIscreen *psp = dPriv->driScreenPriv;
 
    if ( (*psp->systemTime->getMSCRate)(dPriv, &n, &d, dPriv->loaderPrivate) ) {
       interval = (dPriv->swap_interval != 0) ? dPriv->swap_interval : 1;
diff --git a/src/mesa/drivers/dri/common/dri_util.h b/src/mesa/drivers/dri/common/dri_util.h
index c95a5c8299..95df702f1a 100644
--- a/src/mesa/drivers/dri/common/dri_util.h
+++ b/src/mesa/drivers/dri/common/dri_util.h
@@ -59,16 +59,12 @@
 
 typedef struct __DRIswapInfoRec        __DRIswapInfo;
 
-/* Typedefs to avoid rewriting the world. */
-typedef struct __DRIscreenRec	__DRIscreenPrivate;
-typedef struct __DRIdrawableRec	__DRIdrawablePrivate;
-typedef struct __DRIcontextRec	__DRIcontextPrivate;
-
 /**
  * Extensions.
  */
 extern const __DRIlegacyExtension driLegacyExtension;
 extern const __DRIcoreExtension driCoreExtension;
+extern const __DRIdri2Extension driDRI2Extension;
 extern const __DRIextension driReadDrawableExtension;
 extern const __DRIcopySubBufferExtension driCopySubBufferExtension;
 extern const __DRIswapControlExtension driSwapControlExtension;
@@ -380,6 +376,8 @@ struct __DRIdrawableRec {
      * GLX_MESA_swap_control.
      */
     unsigned int swap_interval;
+
+    GLboolean validBuffers;
 };
 
 /**
diff --git a/src/mesa/drivers/dri/common/drirenderbuffer.c b/src/mesa/drivers/dri/common/drirenderbuffer.c
index 4e7e92c82b..3126ea8476 100644
--- a/src/mesa/drivers/dri/common/drirenderbuffer.c
+++ b/src/mesa/drivers/dri/common/drirenderbuffer.c
@@ -56,7 +56,7 @@ driDeleteRenderbuffer(struct gl_renderbuffer *rb)
 driRenderbuffer *
 driNewRenderbuffer(gl_format format, GLvoid *addr,
                    GLint cpp, GLint offset, GLint pitch,
-                   __DRIdrawablePrivate *dPriv)
+                   __DRIdrawable *dPriv)
 {
    driRenderbuffer *drb;
 
@@ -196,7 +196,7 @@ driFlipRenderbuffers(struct gl_framebuffer *fb, GLboolean flipped)
  * gl_framebuffer object.
  */
 void
-driUpdateFramebufferSize(GLcontext *ctx, const __DRIdrawablePrivate *dPriv)
+driUpdateFramebufferSize(GLcontext *ctx, const __DRIdrawable *dPriv)
 {
    struct gl_framebuffer *fb = (struct gl_framebuffer *) dPriv->driverPrivate;
    if (fb && (dPriv->w != fb->Width || dPriv->h != fb->Height)) {
diff --git a/src/mesa/drivers/dri/common/drirenderbuffer.h b/src/mesa/drivers/dri/common/drirenderbuffer.h
index 3a5cbcdaac..677511334d 100644
--- a/src/mesa/drivers/dri/common/drirenderbuffer.h
+++ b/src/mesa/drivers/dri/common/drirenderbuffer.h
@@ -43,10 +43,10 @@ typedef struct {
    GLint flippedPitch;
    GLvoid *flippedData;  /* mmap'd address of buffer memory, if used */
 
-   /* Pointer to corresponding __DRIdrawablePrivate.  This is used to compute
+   /* Pointer to corresponding __DRIdrawable.  This is used to compute
     * the window's position within the framebuffer.
     */
-   __DRIdrawablePrivate *dPriv;
+   __DRIdrawable *dPriv;
 
    /* XXX this is for radeon/r200 only.  We should really create a new
     * r200Renderbuffer class, derived from this class...  not a huge deal.
@@ -66,14 +66,14 @@ typedef struct {
 extern driRenderbuffer *
 driNewRenderbuffer(gl_format format, GLvoid *addr,
                    GLint cpp, GLint offset, GLint pitch,
-                   __DRIdrawablePrivate *dPriv);
+                   __DRIdrawable *dPriv);
 
 extern void
 driFlipRenderbuffers(struct gl_framebuffer *fb, GLboolean flipped);
 
 
 extern void
-driUpdateFramebufferSize(GLcontext *ctx, const __DRIdrawablePrivate *dPriv);
+driUpdateFramebufferSize(GLcontext *ctx, const __DRIdrawable *dPriv);
 
 
 #endif /* DRIRENDERBUFFER_H */
diff --git a/src/mesa/drivers/dri/common/spantmp2.h b/src/mesa/drivers/dri/common/spantmp2.h
index 89c815722f..447f3d15b9 100644
--- a/src/mesa/drivers/dri/common/spantmp2.h
+++ b/src/mesa/drivers/dri/common/spantmp2.h
@@ -82,6 +82,72 @@
       rgba[3] = 0xff;							\
    } while (0)
 
+#elif (SPANTMP_PIXEL_FMT == GL_RGB)  && (SPANTMP_PIXEL_TYPE == GL_UNSIGNED_SHORT_5_6_5_REV)
+
+/**
+ ** GL_RGB, GL_UNSIGNED_SHORT_5_6_5_REV
+ **/
+
+#ifndef GET_VALUE
+#ifndef GET_PTR
+#define GET_PTR(_x, _y) (buf + (_x) * 2 + (_y) * pitch)
+#endif
+
+#define GET_VALUE(_x, _y) *(volatile GLushort *)(GET_PTR(_x, _y))
+#define PUT_VALUE(_x, _y, _v) *(volatile GLushort *)(GET_PTR(_x, _y)) = (_v)
+#endif /* GET_VALUE */
+
+#define INIT_MONO_PIXEL(p, color) \
+  p = PACK_COLOR_565_REV( color[0], color[1], color[2] )
+
+#define WRITE_RGBA( _x, _y, r, g, b, a )				\
+   PUT_VALUE(_x, _y, PACK_COLOR_565_REV( r, g, b ))
+
+#define WRITE_PIXEL( _x, _y, p ) PUT_VALUE(_x, _y, p)
+
+#define READ_RGBA( rgba, _x, _y )					\
+   do {									\
+      GLushort p = GET_VALUE(_x, _y);					\
+      p = p << 8 | p >> 8;						\
+      rgba[0] = ((p >> 8) & 0xf8) * 255 / 0xf8;				\
+      rgba[1] = ((p >> 3) & 0xfc) * 255 / 0xfc;				\
+      rgba[2] = ((p << 3) & 0xf8) * 255 / 0xf8;				\
+      rgba[3] = 0xff;							\
+   } while (0)
+
+#elif (SPANTMP_PIXEL_FMT == GL_BGRA)  && (SPANTMP_PIXEL_TYPE == GL_UNSIGNED_SHORT_4_4_4_4)
+
+/**
+ ** GL_BGRA, GL_UNSIGNED_SHORT_4_4_4_4
+ **/
+
+#ifndef GET_VALUE
+#ifndef GET_PTR
+#define GET_PTR(_x, _y) (buf + (_x) * 2 + (_y) * pitch)
+#endif
+
+#define GET_VALUE(_x, _y) *(volatile GLushort *)(GET_PTR(_x, _y))
+#define PUT_VALUE(_x, _y, _v) *(volatile GLushort *)(GET_PTR(_x, _y)) = (_v)
+#endif /* GET_VALUE */
+
+#define INIT_MONO_PIXEL(p, color) \
+   p = PACK_COLOR_4444_REV(color[3], color[0], color[1], color[2])
+
+#define WRITE_RGBA( _x, _y, r, g, b, a )				\
+   PUT_VALUE(_x, _y, PACK_COLOR_4444_REV(a, r, g, b))
+
+#define WRITE_PIXEL( _x, _y, p ) PUT_VALUE(_x, _y, p)
+
+#define READ_RGBA( rgba, _x, _y )					\
+   do {									\
+      GLushort p = GET_VALUE(_x, _y);					\
+      rgba[0] = ((p >> 0) & 0xf) * 0x11;				\
+      rgba[1] = ((p >> 12) & 0xf) * 0x11;				\
+      rgba[2] = ((p >> 4) & 0xf) * 0x11;				\
+      rgba[3] = ((p >> 8) & 0xf) * 0x11;				\
+   } while (0)
+
+
 #elif (SPANTMP_PIXEL_FMT == GL_BGRA)  && (SPANTMP_PIXEL_TYPE == GL_UNSIGNED_SHORT_4_4_4_4_REV)
 
 /**
@@ -147,6 +213,39 @@
       rgba[3] = ((p >> 15) & 0x1) * 0xff;				\
    } while (0)
 
+#elif (SPANTMP_PIXEL_FMT == GL_BGRA)  && (SPANTMP_PIXEL_TYPE == GL_UNSIGNED_SHORT_1_5_5_5)
+
+/**
+ ** GL_BGRA, GL_UNSIGNED_SHORT_1_5_5_5
+ **/
+
+#ifndef GET_VALUE
+#ifndef GET_PTR
+#define GET_PTR(_x, _y) (buf + (_x) * 2 + (_y) * pitch)
+#endif
+
+#define GET_VALUE(_x, _y) *(volatile GLushort *)(GET_PTR(_x, _y))
+#define PUT_VALUE(_x, _y, _v) *(volatile GLushort *)(GET_PTR(_x, _y)) = (_v)
+#endif /* GET_VALUE */
+
+#define INIT_MONO_PIXEL(p, color) \
+   p = PACK_COLOR_1555_REV(color[3], color[0], color[1], color[2])
+
+#define WRITE_RGBA( _x, _y, r, g, b, a )				\
+   PUT_VALUE(_x, _y, PACK_COLOR_1555_REV(a, r, g, b))
+
+#define WRITE_PIXEL( _x, _y, p ) PUT_VALUE(_x, _y, p)
+
+#define READ_RGBA( rgba, _x, _y )					\
+   do {									\
+      GLushort p = GET_VALUE(_x, _y);					\
+      p = p << 8 | p >> 8;						\
+      rgba[0] = ((p >> 7) & 0xf8) * 255 / 0xf8;				\
+      rgba[1] = ((p >> 2) & 0xf8) * 255 / 0xf8;				\
+      rgba[2] = ((p << 3) & 0xf8) * 255 / 0xf8;				\
+      rgba[3] = ((p >> 15) & 0x1) * 0xff;				\
+   } while (0)
+
 #elif (SPANTMP_PIXEL_FMT == GL_BGRA) && (SPANTMP_PIXEL_TYPE == GL_UNSIGNED_INT_8_8_8_8_REV)
 
 /**
@@ -202,6 +301,118 @@
      } while (0)
 # endif
 
+#elif (SPANTMP_PIXEL_FMT == GL_BGRA) && (SPANTMP_PIXEL_TYPE == GL_UNSIGNED_INT_8_8_8_8)
+
+/**
+ ** GL_BGRA, GL_UNSIGNED_INT_8_8_8_8
+ **/
+
+#ifndef GET_VALUE
+#ifndef GET_PTR
+#define GET_PTR(_x, _y) (     buf + (_x) * 4 + (_y) * pitch)
+#endif
+
+#define GET_VALUE(_x, _y) *(volatile GLuint *)(GET_PTR(_x, _y))
+#define PUT_VALUE(_x, _y, _v) *(volatile GLuint *)(GET_PTR(_x, _y)) = (_v)
+#endif /* GET_VALUE */
+
+# define INIT_MONO_PIXEL(p, color)                       \
+     p = PACK_COLOR_8888(color[2], color[1], color[0], color[3]) 
+
+# define WRITE_RGBA(_x, _y, r, g, b, a)                                 \
+   PUT_VALUE(_x, _y, ((r << 8) |					\
+		      (g << 16) |					\
+		      (b << 24) |					\
+		      (a << 0)))
+
+#define WRITE_PIXEL(_x, _y, p) PUT_VALUE(_x, _y, p)
+
+# if defined( USE_X86_ASM )
+#  define READ_RGBA(rgba, _x, _y)                                       \
+    do {                                                                \
+       GLuint p = GET_VALUE(_x, _y);					\
+       __asm__ __volatile__( "rorl $8, %0"				\
+				: "=r" (p) : "0" (p) );                 \
+       ((GLuint *)rgba)[0] = p;                                         \
+    } while (0)
+# elif defined( MESA_BIG_ENDIAN )
+    /* On PowerPC with GCC 3.4.2 the shift madness below becomes a single
+     * rotlwi instruction.  It also produces good code on SPARC.
+     */
+#  define READ_RGBA( rgba, _x, _y )				        \
+     do {								\
+        GLuint p = CPU_TO_LE32(GET_VALUE(_x, _y));                      \
+        GLuint t = p;                                                   \
+        *((uint32_t *) rgba) = (t >> 24) | (p << 8);                    \
+     } while (0)
+# else
+#  define READ_RGBA( rgba, _x, _y )				        \
+     do {								\
+        GLuint p = GET_VALUE(_x, _y);					\
+	rgba[0] = (p >>  8) & 0xff;					\
+	rgba[1] = (p >> 16) & 0xff;					\
+	rgba[2] = (p >> 24) & 0xff;					\
+	rgba[3] = (p >>  0) & 0xff;					\
+     } while (0)
+# endif
+
+#elif (SPANTMP_PIXEL_FMT == GL_BGR) && (SPANTMP_PIXEL_TYPE == GL_UNSIGNED_INT_8_8_8_8_REV)
+
+/**
+ ** GL_BGR, GL_UNSIGNED_INT_8_8_8_8_REV
+ **
+ ** This is really for MESA_FORMAT_XRGB8888.  The spantmp code needs to be
+ ** kicked to the curb, and we need to just code-gen this.
+ **/
+
+#ifndef GET_VALUE
+#ifndef GET_PTR
+#define GET_PTR(_x, _y) (     buf + (_x) * 4 + (_y) * pitch)
+#endif
+
+#define GET_VALUE(_x, _y) *(volatile GLuint *)(GET_PTR(_x, _y))
+#define PUT_VALUE(_x, _y, _v) *(volatile GLuint *)(GET_PTR(_x, _y)) = (_v)
+#endif /* GET_VALUE */
+
+# define INIT_MONO_PIXEL(p, color)                       \
+     p = PACK_COLOR_8888(0xff, color[0], color[1], color[2])
+
+# define WRITE_RGBA(_x, _y, r, g, b, a)					\
+   PUT_VALUE(_x, _y, ((r << 16) |					\
+		      (g << 8) |					\
+		      (b << 0) |					\
+		      (0xff << 24)))
+
+#define WRITE_PIXEL(_x, _y, p) PUT_VALUE(_x, _y, p)
+
+# if defined( USE_X86_ASM )
+#  define READ_RGBA(rgba, _x, _y)                                       \
+    do {                                                                \
+       GLuint p = GET_VALUE(_x, _y);					\
+       __asm__ __volatile__( "bswap	%0; rorl $8, %0"                \
+				: "=r" (p) : "0" (p) );                 \
+       ((GLuint *)rgba)[0] = p | 0xff000000;				\
+    } while (0)
+# elif defined( MESA_BIG_ENDIAN )
+    /* p is X:R:G:B (cf. WRITE_RGBA): shift out X, OR in opaque alpha.
+     * NOTE(review): unlike the 8_8_8_8 variant, no CPU_TO_LE32 here - confirm.
+     */
+#  define READ_RGBA( rgba, _x, _y )				        \
+     do {								\
+        GLuint p = GET_VALUE(_x, _y);					\
+        *((uint32_t *) rgba) = (p << 8) | 0xff;				\
+     } while (0)
+# else
+#  define READ_RGBA( rgba, _x, _y )				        \
+     do {								\
+        GLuint p = GET_VALUE(_x, _y);					\
+	rgba[0] = (p >> 16) & 0xff;					\
+	rgba[1] = (p >>  8) & 0xff;					\
+	rgba[2] = (p >>  0) & 0xff;					\
+	rgba[3] = 0xff;							\
+     } while (0)
+# endif
+
 #else
 #error SPANTMP_PIXEL_FMT must be set to a valid value!
 #endif
diff --git a/src/mesa/drivers/dri/common/vblank.c b/src/mesa/drivers/dri/common/vblank.c
index 12aeaa108f..49b22a2dc7 100644
--- a/src/mesa/drivers/dri/common/vblank.c
+++ b/src/mesa/drivers/dri/common/vblank.c
@@ -34,12 +34,12 @@
 #include "vblank.h"
 #include "xmlpool.h"
 
-static unsigned int msc_to_vblank(__DRIdrawablePrivate * dPriv, int64_t msc)
+static unsigned int msc_to_vblank(__DRIdrawable * dPriv, int64_t msc)
 {
    return (unsigned int)(msc - dPriv->msc_base + dPriv->vblank_base);
 }
 
-static int64_t vblank_to_msc(__DRIdrawablePrivate * dPriv, unsigned int vblank)
+static int64_t vblank_to_msc(__DRIdrawable * dPriv, unsigned int vblank)
 {
    return (int64_t)(vblank - dPriv->vblank_base + dPriv->msc_base);
 }
@@ -64,8 +64,8 @@ static int64_t vblank_to_msc(__DRIdrawablePrivate * dPriv, unsigned int vblank)
  * \return       Zero is returned on success.  A negative errno value
  *               is returned on failure.
  */
-int driDrawableGetMSC32( __DRIscreenPrivate * priv,
-			 __DRIdrawablePrivate * dPriv,
+int driDrawableGetMSC32( __DRIscreen * priv,
+			 __DRIdrawable * dPriv,
 			 int64_t * count)
 {
    drmVBlank vbl;
@@ -122,7 +122,7 @@ int driDrawableGetMSC32( __DRIscreenPrivate * priv,
  * \return            Zero on success or \c GLX_BAD_CONTEXT on failure.
  */
 
-int driWaitForMSC32( __DRIdrawablePrivate *priv,
+int driWaitForMSC32( __DRIdrawable *priv,
 		     int64_t target_msc, int64_t divisor, int64_t remainder,
 		     int64_t * msc )
 {
@@ -278,7 +278,7 @@ static int do_wait( drmVBlank * vbl, GLuint * vbl_seq, int fd )
  */
 
 static unsigned
-driGetDefaultVBlankInterval( const  __DRIdrawablePrivate *priv )
+driGetDefaultVBlankInterval( const  __DRIdrawable *priv )
 {
    if ( (priv->vblFlags & (VBLANK_FLAG_THROTTLE | VBLANK_FLAG_SYNC)) != 0 ) {
       return 1;
@@ -295,7 +295,7 @@ driGetDefaultVBlankInterval( const  __DRIdrawablePrivate *priv )
  * direct rendering context.
  */
 
-void driDrawableInitVBlank( __DRIdrawablePrivate *priv )
+void driDrawableInitVBlank( __DRIdrawable *priv )
 {
    if ( priv->swap_interval == (unsigned)-1 &&
 	!( priv->vblFlags & VBLANK_FLAG_NO_IRQ ) ) {
@@ -320,7 +320,7 @@ void driDrawableInitVBlank( __DRIdrawablePrivate *priv )
  */
 
 unsigned
-driGetVBlankInterval( const  __DRIdrawablePrivate *priv )
+driGetVBlankInterval( const  __DRIdrawable *priv )
 {
    if ( (priv->vblFlags & VBLANK_FLAG_INTERVAL) != 0 ) {
       /* this must have been initialized when the drawable was first bound
@@ -340,7 +340,7 @@ driGetVBlankInterval( const  __DRIdrawablePrivate *priv )
  */
 
 void
-driGetCurrentVBlank( __DRIdrawablePrivate *priv )
+driGetCurrentVBlank( __DRIdrawable *priv )
 {
    drmVBlank vbl;
 
@@ -366,7 +366,7 @@ driGetCurrentVBlank( __DRIdrawablePrivate *priv )
  */
 
 int
-driWaitForVBlank( __DRIdrawablePrivate *priv, GLboolean * missed_deadline )
+driWaitForVBlank( __DRIdrawable *priv, GLboolean * missed_deadline )
 {
    drmVBlank vbl;
    unsigned   original_seq;
diff --git a/src/mesa/drivers/dri/common/vblank.h b/src/mesa/drivers/dri/common/vblank.h
index 8b2c761a11..29d1ad8003 100644
--- a/src/mesa/drivers/dri/common/vblank.h
+++ b/src/mesa/drivers/dri/common/vblank.h
@@ -44,17 +44,17 @@
 #define VBLANK_FLAG_SECONDARY (1U << 8)  /* Wait for secondary vblank.
 					  */
 
-extern int driGetMSC32( __DRIscreenPrivate * priv, int64_t * count );
-extern int driDrawableGetMSC32( __DRIscreenPrivate * priv,
-				__DRIdrawablePrivate * drawablePrivate,
+extern int driGetMSC32( __DRIscreen * priv, int64_t * count );
+extern int driDrawableGetMSC32( __DRIscreen * priv,
+				__DRIdrawable * drawablePrivate,
 				int64_t * count);
-extern int driWaitForMSC32( __DRIdrawablePrivate *priv,
+extern int driWaitForMSC32( __DRIdrawable *priv,
     int64_t target_msc, int64_t divisor, int64_t remainder, int64_t * msc );
 extern GLuint driGetDefaultVBlankFlags( const driOptionCache *optionCache );
-extern void driDrawableInitVBlank ( __DRIdrawablePrivate *priv );
-extern unsigned driGetVBlankInterval( const  __DRIdrawablePrivate *priv );
-extern void driGetCurrentVBlank( __DRIdrawablePrivate *priv );
-extern int driWaitForVBlank( __DRIdrawablePrivate *priv,
+extern void driDrawableInitVBlank ( __DRIdrawable *priv );
+extern unsigned driGetVBlankInterval( const  __DRIdrawable *priv );
+extern void driGetCurrentVBlank( __DRIdrawable *priv );
+extern int driWaitForVBlank( __DRIdrawable *priv,
 			     GLboolean * missed_deadline );
 
 #undef usleep
diff --git a/src/mesa/drivers/dri/fb/fb_dri.c b/src/mesa/drivers/dri/fb/fb_dri.c
index fd869b2fe7..f37241dd69 100644
--- a/src/mesa/drivers/dri/fb/fb_dri.c
+++ b/src/mesa/drivers/dri/fb/fb_dri.c
@@ -64,9 +64,9 @@ typedef struct {
    GLcontext *glCtx;		/* Mesa context */
 
    struct {
-      __DRIcontextPrivate *context;	
-      __DRIscreenPrivate *screen;	
-      __DRIdrawablePrivate *drawable; /* drawable bound to this ctx */
+      __DRIcontext *context;	
+      __DRIscreen *screen;	
+      __DRIdrawable *drawable; /* drawable bound to this ctx */
    } dri;
    
 } fbContext, *fbContextPtr;
@@ -313,14 +313,14 @@ fbSetSpanFunctions(driRenderbuffer *drb, const GLvisual *vis)
 /* Initialize the driver specific screen private data.
  */
 static GLboolean
-fbInitDriver( __DRIscreenPrivate *sPriv )
+fbInitDriver( __DRIscreen *sPriv )
 {
    sPriv->private = NULL;
    return GL_TRUE;
 }
 
 static void
-fbDestroyScreen( __DRIscreenPrivate *sPriv )
+fbDestroyScreen( __DRIscreen *sPriv )
 {
 }
 
@@ -329,7 +329,7 @@ fbDestroyScreen( __DRIscreenPrivate *sPriv )
  */
 static GLboolean
 fbCreateContext( const __GLcontextModes *glVisual,
-		 __DRIcontextPrivate *driContextPriv,
+		 __DRIcontext *driContextPriv,
 		 void *sharedContextPrivate)
 {
    fbContextPtr fbmesa;
@@ -384,7 +384,7 @@ fbCreateContext( const __GLcontextModes *glVisual,
 
 
 static void
-fbDestroyContext( __DRIcontextPrivate *driContextPriv )
+fbDestroyContext( __DRIcontext *driContextPriv )
 {
    GET_CURRENT_CONTEXT(ctx);
    fbContextPtr fbmesa = (fbContextPtr) driContextPriv->driverPrivate;
@@ -415,8 +415,8 @@ fbDestroyContext( __DRIcontextPrivate *driContextPriv )
  * data.
  */
 static GLboolean
-fbCreateBuffer( __DRIscreenPrivate *driScrnPriv,
-		__DRIdrawablePrivate *driDrawPriv,
+fbCreateBuffer( __DRIscreen *driScrnPriv,
+		__DRIdrawable *driDrawPriv,
 		const __GLcontextModes *mesaVis,
 		GLboolean isPixmap )
 {
@@ -478,7 +478,7 @@ fbCreateBuffer( __DRIscreenPrivate *driScrnPriv,
 
 
 static void
-fbDestroyBuffer(__DRIdrawablePrivate *driDrawPriv)
+fbDestroyBuffer(__DRIdrawable *driDrawPriv)
 {
    _mesa_reference_framebuffer((GLframebuffer **)(&(driDrawPriv->driverPrivate)), NULL);
 }
@@ -488,7 +488,7 @@ fbDestroyBuffer(__DRIdrawablePrivate *driDrawPriv)
 /* If the backbuffer is on a videocard, this is extraordinarily slow!
  */
 static void
-fbSwapBuffers( __DRIdrawablePrivate *dPriv )
+fbSwapBuffers( __DRIdrawable *dPriv )
 {
    struct gl_framebuffer *mesa_framebuffer = (struct gl_framebuffer *)dPriv->driverPrivate;
    struct gl_renderbuffer * front_renderbuffer = mesa_framebuffer->Attachment[BUFFER_FRONT_LEFT].Renderbuffer;
@@ -532,9 +532,9 @@ fbSwapBuffers( __DRIdrawablePrivate *dPriv )
  * buffer `b'.
  */
 static GLboolean
-fbMakeCurrent( __DRIcontextPrivate *driContextPriv,
-	       __DRIdrawablePrivate *driDrawPriv,
-	       __DRIdrawablePrivate *driReadPriv )
+fbMakeCurrent( __DRIcontext *driContextPriv,
+	       __DRIdrawable *driDrawPriv,
+	       __DRIdrawable *driReadPriv )
 {
    if ( driContextPriv ) {
       fbContextPtr newFbCtx = 
@@ -556,7 +556,7 @@ fbMakeCurrent( __DRIcontextPrivate *driContextPriv,
 /* Force the context `c' to be unbound from its buffer.
  */
 static GLboolean
-fbUnbindContext( __DRIcontextPrivate *driContextPriv )
+fbUnbindContext( __DRIcontext *driContextPriv )
 {
    return GL_TRUE;
 }
@@ -657,7 +657,7 @@ struct DRIDriverRec __driDriver = {
 };
 
 static __GLcontextModes *
-fbFillInModes( __DRIscreenPrivate *psp,
+fbFillInModes( __DRIscreen *psp,
 	       unsigned pixel_bits, unsigned depth_bits,
 	       unsigned stencil_bits, GLboolean have_back_buffer )
 {
@@ -745,7 +745,7 @@ fbFillInModes( __DRIscreenPrivate *psp,
  * with the \c __GLcontextModes that the driver can support for windows or
  * pbuffers.
  * 
- * \return A pointer to a \c __DRIscreenPrivate on success, or \c NULL on 
+ * \return A pointer to a \c __DRIscreen on success, or \c NULL on 
  *         failure.
  */
 PUBLIC
@@ -759,7 +759,7 @@ void * __driCreateNewScreen( __DRInativeDisplay *dpy, int scrn, __DRIscreen *psc
                                    int internal_api_version,
                                    __GLcontextModes ** driver_modes )
 {
-   __DRIscreenPrivate *psp;
+   __DRIscreen *psp;
    static const __DRIversion ddx_expected = { 4, 0, 0 };
    static const __DRIversion dri_expected = { 4, 0, 0 };
    static const __DRIversion drm_expected = { 1, 5, 0 };
@@ -785,3 +785,10 @@ void * __driCreateNewScreen( __DRInativeDisplay *dpy, int scrn, __DRIscreen *psc
 
           return (void *) psp;
 }
+
+/* This is the table of extensions that the loader will dlsym() for. */
+PUBLIC const __DRIextension *__driDriverExtensions[] = {
+    &driCoreExtension.base,
+    &driLegacyExtension.base,
+    NULL
+};
diff --git a/src/mesa/drivers/dri/fb/fb_egl.c b/src/mesa/drivers/dri/fb/fb_egl.c
index eb7adf8224..02e44bb8ee 100644
--- a/src/mesa/drivers/dri/fb/fb_egl.c
+++ b/src/mesa/drivers/dri/fb/fb_egl.c
@@ -84,9 +84,9 @@ typedef struct fb_context
    _EGLContext Base;  /* base class/object */
    GLcontext *glCtx;
    struct {
-      __DRIcontextPrivate *context;	
-      __DRIscreenPrivate *screen;	
-      __DRIdrawablePrivate *drawable; /* drawable bound to this ctx */
+      __DRIcontext *context;	
+      __DRIscreen *screen;	
+      __DRIdrawable *drawable; /* drawable bound to this ctx */
    } dri;
 } fbContext, *fbContextPtr;
 
diff --git a/src/mesa/drivers/dri/ffb/ffb_bitmap.c b/src/mesa/drivers/dri/ffb/ffb_bitmap.c
index f89c0412df..611afddfaf 100644
--- a/src/mesa/drivers/dri/ffb/ffb_bitmap.c
+++ b/src/mesa/drivers/dri/ffb/ffb_bitmap.c
@@ -46,7 +46,7 @@ ffb_bitmap(GLcontext *ctx, GLint px, GLint py,
 {
 	ffbContextPtr fmesa = FFB_CONTEXT(ctx);
 	ffb_fbcPtr ffb = fmesa->regs;
-	__DRIdrawablePrivate *dPriv = fmesa->driDrawable;
+	__DRIdrawable *dPriv = fmesa->driDrawable;
 	unsigned int ppc, pixel;
 	GLint row, col, row_stride;
 	const GLubyte *src;
diff --git a/src/mesa/drivers/dri/ffb/ffb_clear.c b/src/mesa/drivers/dri/ffb/ffb_clear.c
index 776fb487f8..dfe60f36f2 100644
--- a/src/mesa/drivers/dri/ffb/ffb_clear.c
+++ b/src/mesa/drivers/dri/ffb/ffb_clear.c
@@ -123,7 +123,7 @@ CreatorComputePageFillFixups(struct ff_fixups *fixups,
 }
 
 static void
-ffb_do_clear(GLcontext *ctx, __DRIdrawablePrivate *dPriv)
+ffb_do_clear(GLcontext *ctx, __DRIdrawable *dPriv)
 {
 	ffbContextPtr fmesa = FFB_CONTEXT(ctx);
 	FFBDRIPtr gDRIPriv = (FFBDRIPtr) fmesa->driScreen->pDevPriv;
@@ -252,7 +252,7 @@ ffb_do_clear(GLcontext *ctx, __DRIdrawablePrivate *dPriv)
 void ffbDDClear(GLcontext *ctx, GLbitfield mask)
 {
 	ffbContextPtr fmesa = FFB_CONTEXT(ctx);
-	__DRIdrawablePrivate *dPriv = fmesa->driDrawable;
+	__DRIdrawable *dPriv = fmesa->driDrawable;
 	unsigned int stcmask = BUFFER_BIT_STENCIL;
 
 #ifdef CLEAR_TRACE
diff --git a/src/mesa/drivers/dri/ffb/ffb_context.h b/src/mesa/drivers/dri/ffb/ffb_context.h
index 77f87d41c3..4d1d53ff59 100644
--- a/src/mesa/drivers/dri/ffb/ffb_context.h
+++ b/src/mesa/drivers/dri/ffb/ffb_context.h
@@ -273,8 +273,8 @@ do {	if ((STATE_MASK) & ~((FMESA)->state_dirty)) {	\
 	unsigned int		setupnewinputs;
 	unsigned int		new_gl_state;
 
-	__DRIdrawablePrivate	*driDrawable;
-	__DRIscreenPrivate	*driScreen;
+	__DRIdrawable	*driDrawable;
+	__DRIscreen	*driScreen;
 	ffbScreenPrivate	*ffbScreen;
 	ffb_dri_state_t		*ffb_sarea;
 } ffbContextRec, *ffbContextPtr;
diff --git a/src/mesa/drivers/dri/ffb/ffb_depth.c b/src/mesa/drivers/dri/ffb/ffb_depth.c
index 71f204d21e..5d509ff696 100644
--- a/src/mesa/drivers/dri/ffb/ffb_depth.c
+++ b/src/mesa/drivers/dri/ffb/ffb_depth.c
@@ -49,7 +49,7 @@ static void FFBWriteDepthSpan( GLcontext *ctx,
 #endif
 	if (ctx->Depth.Mask) {
 		ffbContextPtr fmesa = FFB_CONTEXT(ctx);
-		__DRIdrawablePrivate *dPriv = fmesa->driDrawable;
+		__DRIdrawable *dPriv = fmesa->driDrawable;
 		GLuint *zptr;
 		GLuint i;
 
@@ -110,7 +110,7 @@ static void FFBWriteDepthPixels( GLcontext *ctx,
 #endif
 	if (ctx->Depth.Mask) {
 		ffbContextPtr fmesa = FFB_CONTEXT(ctx);
-		__DRIdrawablePrivate *dPriv = fmesa->driDrawable;
+		__DRIdrawable *dPriv = fmesa->driDrawable;
 		char *zbase;
 		GLuint i;
 
@@ -153,7 +153,7 @@ static void FFBReadDepthSpan( GLcontext *ctx,
 {
         GLuint *depth = (GLuint *) values;
 	ffbContextPtr fmesa = FFB_CONTEXT(ctx);
-	__DRIdrawablePrivate *dPriv = fmesa->driDrawable;
+	__DRIdrawable *dPriv = fmesa->driDrawable;
 	GLuint *zptr;
 	GLuint i;
 
@@ -194,7 +194,7 @@ static void FFBReadDepthPixels( GLcontext *ctx,
 {
         GLuint *depth = (GLuint *) values;
 	ffbContextPtr fmesa = FFB_CONTEXT(ctx);
-	__DRIdrawablePrivate *dPriv = fmesa->driDrawable;
+	__DRIdrawable *dPriv = fmesa->driDrawable;
 	char *zbase;
 	GLuint i;
 
diff --git a/src/mesa/drivers/dri/ffb/ffb_span.c b/src/mesa/drivers/dri/ffb/ffb_span.c
index 0d3d604095..8ec33a11bc 100644
--- a/src/mesa/drivers/dri/ffb/ffb_span.c
+++ b/src/mesa/drivers/dri/ffb/ffb_span.c
@@ -45,7 +45,7 @@
 		UNLOCK_HARDWARE(fmesa); \
 
 #define LOCAL_VARS						\
-	__DRIdrawablePrivate *dPriv = fmesa->driDrawable;	\
+	__DRIdrawable *dPriv = fmesa->driDrawable;	\
 	GLuint height = dPriv->h;				\
         GLuint p;						\
 	char *buf; 						\
diff --git a/src/mesa/drivers/dri/ffb/ffb_state.c b/src/mesa/drivers/dri/ffb/ffb_state.c
index 5eb8f417ff..6f8a46d1fc 100644
--- a/src/mesa/drivers/dri/ffb/ffb_state.c
+++ b/src/mesa/drivers/dri/ffb/ffb_state.c
@@ -384,7 +384,7 @@ ffbDDStencilOpSeparate(GLcontext *ctx, GLenum face, GLenum fail,
 static void ffbCalcViewportRegs(GLcontext *ctx)
 {
 	ffbContextPtr fmesa = FFB_CONTEXT(ctx);
-	__DRIdrawablePrivate *dPriv = fmesa->driDrawable;
+	__DRIdrawable *dPriv = fmesa->driDrawable;
 	GLuint xmin, xmax, ymin, ymax, zmin, zmax;
 	unsigned int vcmin, vcmax;
 
@@ -430,7 +430,7 @@ void ffbCalcViewport(GLcontext *ctx)
 	ffbContextPtr fmesa = FFB_CONTEXT(ctx);
 	const GLfloat *v = ctx->Viewport._WindowMap.m;
 	GLfloat *m = fmesa->hw_viewport;
-	__DRIdrawablePrivate *dPriv = fmesa->driDrawable;
+	__DRIdrawable *dPriv = fmesa->driDrawable;
 
 	m[MAT_SX] =   v[MAT_SX];
 	m[MAT_TX] =   v[MAT_TX] + dPriv->x + SUBPIXEL_X;
@@ -762,7 +762,7 @@ static void ffbDDLineStipple(GLcontext *ctx, GLint factor, GLushort pattern)
 
 void ffbXformAreaPattern(ffbContextPtr fmesa, const GLubyte *mask)
 {
-	__DRIdrawablePrivate *dPriv = fmesa->driDrawable;
+	__DRIdrawable *dPriv = fmesa->driDrawable;
 	int i, lines, xoff;
 
 	lines = 0;
diff --git a/src/mesa/drivers/dri/ffb/ffb_stencil.c b/src/mesa/drivers/dri/ffb/ffb_stencil.c
index 921a83d274..ce8ef43c91 100644
--- a/src/mesa/drivers/dri/ffb/ffb_stencil.c
+++ b/src/mesa/drivers/dri/ffb/ffb_stencil.c
@@ -48,7 +48,7 @@ static void FFBWriteStencilSpan( GLcontext *ctx,
 #endif
 	if (ctx->Depth.Mask) {
 		ffbContextPtr fmesa = FFB_CONTEXT(ctx);
-		__DRIdrawablePrivate *dPriv = fmesa->driDrawable;
+		__DRIdrawable *dPriv = fmesa->driDrawable;
 		GLuint *zptr;
 		GLuint i;
 
@@ -93,7 +93,7 @@ static void FFBWriteStencilPixels( GLcontext *ctx,
 #endif
 	if (ctx->Depth.Mask) {
 		ffbContextPtr fmesa = FFB_CONTEXT(ctx);
-		__DRIdrawablePrivate *dPriv = fmesa->driDrawable;
+		__DRIdrawable *dPriv = fmesa->driDrawable;
 		char *zbase;
 		GLuint i;
 
@@ -136,7 +136,7 @@ static void FFBReadStencilSpan( GLcontext *ctx,
 {
         GLubyte *stencil = (GLubyte *) values;
 	ffbContextPtr fmesa = FFB_CONTEXT(ctx);
-	__DRIdrawablePrivate *dPriv = fmesa->driDrawable;
+	__DRIdrawable *dPriv = fmesa->driDrawable;
 	GLuint *zptr;
 	GLuint i;
 
@@ -176,7 +176,7 @@ static void FFBReadStencilPixels( GLcontext *ctx,
 {
         GLubyte *stencil = (GLubyte *) values;
 	ffbContextPtr fmesa = FFB_CONTEXT(ctx);
-	__DRIdrawablePrivate *dPriv = fmesa->driDrawable;
+	__DRIdrawable *dPriv = fmesa->driDrawable;
 	char *zbase;
 	GLuint i;
 
diff --git a/src/mesa/drivers/dri/ffb/ffb_tris.c b/src/mesa/drivers/dri/ffb/ffb_tris.c
index d785c15718..8bf5ae498f 100644
--- a/src/mesa/drivers/dri/ffb/ffb_tris.c
+++ b/src/mesa/drivers/dri/ffb/ffb_tris.c
@@ -351,8 +351,8 @@ static struct {
 
 #define LOCAL_VARS(n)				\
    ffbContextPtr fmesa = FFB_CONTEXT(ctx);	\
-   __DRIdrawablePrivate *dPriv = fmesa->driDrawable; \
-   ffb_color color[n];				\
+   __DRIdrawable *dPriv = fmesa->driDrawable; \
+   ffb_color color[n] = { { 0 } };		\
    (void) color; (void) dPriv;
 
 /***********************************************************************
diff --git a/src/mesa/drivers/dri/ffb/ffb_vbtmp.h b/src/mesa/drivers/dri/ffb/ffb_vbtmp.h
index 0495d0e276..c548ef3ad5 100644
--- a/src/mesa/drivers/dri/ffb/ffb_vbtmp.h
+++ b/src/mesa/drivers/dri/ffb/ffb_vbtmp.h
@@ -38,11 +38,11 @@ static void TAG(emit)(GLcontext *ctx, GLuint start, GLuint end)
 #endif
 
 #if (IND & (FFB_VB_RGBA_BIT))
-	col0 = VB->ColorPtr[0]->data;
-	col0_stride = VB->ColorPtr[0]->stride;
+	col0 = VB->AttribPtr[_TNL_ATTRIB_COLOR0]->data;
+	col0_stride = VB->AttribPtr[_TNL_ATTRIB_COLOR0]->stride;
 #if (IND & (FFB_VB_TWOSIDE_BIT))
-	col1 = VB->ColorPtr[1]->data;
-	col1_stride = VB->ColorPtr[1]->stride;
+	col1 = VB->BackfaceColorPtr->data;
+	col1_stride = VB->BackfaceColorPtr->stride;
 #endif
 #endif
 
diff --git a/src/mesa/drivers/dri/ffb/ffb_xmesa.c b/src/mesa/drivers/dri/ffb/ffb_xmesa.c
index 09cc26d09e..88285f454e 100644
--- a/src/mesa/drivers/dri/ffb/ffb_xmesa.c
+++ b/src/mesa/drivers/dri/ffb/ffb_xmesa.c
@@ -62,7 +62,7 @@
 #include "drirenderbuffer.h"
 
 static GLboolean
-ffbInitDriver(__DRIscreenPrivate *sPriv)
+ffbInitDriver(__DRIscreen *sPriv)
 {
 	ffbScreenPrivate *ffbScreen;
 	FFBDRIPtr gDRIPriv = (FFBDRIPtr) sPriv->pDevPriv;
@@ -154,7 +154,7 @@ ffbInitDriver(__DRIscreenPrivate *sPriv)
 
 
 static void
-ffbDestroyScreen(__DRIscreenPrivate *sPriv)
+ffbDestroyScreen(__DRIscreen *sPriv)
 {
 	ffbScreenPrivate *ffbScreen = sPriv->private;
 	FFBDRIPtr gDRIPriv = (FFBDRIPtr) sPriv->pDevPriv;
@@ -183,12 +183,12 @@ static const struct tnl_pipeline_stage *ffb_pipeline[] = {
 /* Create and initialize the Mesa and driver specific context data */
 static GLboolean
 ffbCreateContext(const __GLcontextModes *mesaVis,
-                 __DRIcontextPrivate *driContextPriv,
+                 __DRIcontext *driContextPriv,
                  void *sharedContextPrivate)
 {
 	ffbContextPtr fmesa;
 	GLcontext *ctx, *shareCtx;
-	__DRIscreenPrivate *sPriv;
+	__DRIscreen *sPriv;
 	ffbScreenPrivate *ffbScreen;
 	char *debug;
 	struct dd_function_table functions;
@@ -306,7 +306,7 @@ ffbCreateContext(const __GLcontextModes *mesaVis,
 }
 
 static void
-ffbDestroyContext(__DRIcontextPrivate *driContextPriv)
+ffbDestroyContext(__DRIcontext *driContextPriv)
 {
 	ffbContextPtr fmesa = (ffbContextPtr) driContextPriv->driverPrivate;
 
@@ -328,8 +328,8 @@ ffbDestroyContext(__DRIcontextPrivate *driContextPriv)
 
 /* Create and initialize the Mesa and driver specific pixmap buffer data */
 static GLboolean
-ffbCreateBuffer(__DRIscreenPrivate *driScrnPriv,
-                __DRIdrawablePrivate *driDrawPriv,
+ffbCreateBuffer(__DRIscreen *driScrnPriv,
+                __DRIdrawable *driDrawPriv,
                 const __GLcontextModes *mesaVis,
                 GLboolean isPixmap )
 {
@@ -392,7 +392,7 @@ ffbCreateBuffer(__DRIscreenPrivate *driScrnPriv,
 
 
 static void
-ffbDestroyBuffer(__DRIdrawablePrivate *driDrawPriv)
+ffbDestroyBuffer(__DRIdrawable *driDrawPriv)
 {
    _mesa_reference_framebuffer((GLframebuffer **)(&(driDrawPriv->driverPrivate)), NULL);
 }
@@ -401,7 +401,7 @@ ffbDestroyBuffer(__DRIdrawablePrivate *driDrawPriv)
 #define USE_FAST_SWAP
 
 static void
-ffbSwapBuffers( __DRIdrawablePrivate *dPriv )
+ffbSwapBuffers( __DRIdrawable *dPriv )
 {
 	ffbContextPtr fmesa = (ffbContextPtr) dPriv->driContextPriv->driverPrivate;
 	unsigned int fbc, wid, wid_reg_val, dac_db_bit;
@@ -532,9 +532,9 @@ static void ffb_init_wid(ffbContextPtr fmesa, unsigned int wid)
 /* Force the context `c' to be the current context and associate with it
    buffer `b' */
 static GLboolean
-ffbMakeCurrent(__DRIcontextPrivate *driContextPriv,
-               __DRIdrawablePrivate *driDrawPriv,
-               __DRIdrawablePrivate *driReadPriv)
+ffbMakeCurrent(__DRIcontext *driContextPriv,
+               __DRIdrawable *driDrawPriv,
+               __DRIdrawable *driReadPriv)
 {
 	if (driContextPriv) {
 		ffbContextPtr fmesa = (ffbContextPtr) driContextPriv->driverPrivate;
@@ -581,15 +581,15 @@ ffbMakeCurrent(__DRIcontextPrivate *driContextPriv,
 
 /* Force the context `c' to be unbound from its buffer */
 static GLboolean
-ffbUnbindContext(__DRIcontextPrivate *driContextPriv)
+ffbUnbindContext(__DRIcontext *driContextPriv)
 {
 	return GL_TRUE;
 }
 
 void ffbXMesaUpdateState(ffbContextPtr fmesa)
 {
-	__DRIdrawablePrivate *dPriv = fmesa->driDrawable;
-	__DRIscreenPrivate *sPriv = fmesa->driScreen;
+	__DRIdrawable *dPriv = fmesa->driDrawable;
+	__DRIscreen *sPriv = fmesa->driScreen;
 	int stamp = dPriv->lastStamp;
 
 	DRI_VALIDATE_DRAWABLE_INFO(sPriv, dPriv);
@@ -607,7 +607,7 @@ void ffbXMesaUpdateState(ffbContextPtr fmesa)
 }
 
 static const __DRIconfig **
-ffbFillInModes( __DRIscreenPrivate *psp,
+ffbFillInModes( __DRIscreen *psp,
 		unsigned pixel_bits, unsigned depth_bits,
 		unsigned stencil_bits, GLboolean have_back_buffer )
 {
@@ -722,3 +722,10 @@ const struct __DriverAPIRec driDriverAPI = {
    .WaitForSBC      = NULL,
    .SwapBuffersMSC  = NULL
 };
+
+/* This is the table of extensions that the loader will dlsym() for. */
+PUBLIC const __DRIextension *__driDriverExtensions[] = {
+    &driCoreExtension.base,
+    &driLegacyExtension.base,
+    NULL
+};
diff --git a/src/mesa/drivers/dri/ffb/ffb_xmesa.h b/src/mesa/drivers/dri/ffb/ffb_xmesa.h
index 255da4c5f8..2b1740d221 100644
--- a/src/mesa/drivers/dri/ffb/ffb_xmesa.h
+++ b/src/mesa/drivers/dri/ffb/ffb_xmesa.h
@@ -11,7 +11,7 @@
 #include "ffb_fifo.h"
 
 typedef struct {
-	__DRIscreenPrivate		*sPriv;
+	__DRIscreen		*sPriv;
 	ffb_fbcPtr			regs;
 	ffb_dacPtr			dac;
 	volatile char			*sfb8r;
diff --git a/src/mesa/drivers/dri/gamma/gamma_context.c b/src/mesa/drivers/dri/gamma/gamma_context.c
index b0ac299daa..bab5b69a8e 100644
--- a/src/mesa/drivers/dri/gamma/gamma_context.c
+++ b/src/mesa/drivers/dri/gamma/gamma_context.c
@@ -68,11 +68,11 @@ static const struct tnl_pipeline_stage *gamma_pipeline[] = {
 };
 
 GLboolean gammaCreateContext( const __GLcontextModes *glVisual,
-			     __DRIcontextPrivate *driContextPriv,
+			     __DRIcontext *driContextPriv,
                      	     void *sharedContextPrivate)
 {
    GLcontext *ctx, *shareCtx;
-   __DRIscreenPrivate *sPriv = driContextPriv->driScreenPriv;
+   __DRIscreen *sPriv = driContextPriv->driScreenPriv;
    gammaContextPtr gmesa;
    gammaScreenPtr gammascrn;
    GLINTSAREADRIPtr saPriv=(GLINTSAREADRIPtr)(((char*)sPriv->pSAREA)+
diff --git a/src/mesa/drivers/dri/gamma/gamma_context.h b/src/mesa/drivers/dri/gamma/gamma_context.h
index a32ccb6007..c386aa3007 100644
--- a/src/mesa/drivers/dri/gamma/gamma_context.h
+++ b/src/mesa/drivers/dri/gamma/gamma_context.h
@@ -58,10 +58,10 @@ typedef union {
 #define MAX_TEXTURE_STACK       2
 
 extern void	  gammaDDUpdateHWState(GLcontext *ctx);
-extern gammaScreenPtr	  gammaCreateScreen(__DRIscreenPrivate *sPriv);
-extern void	  gammaDestroyScreen(__DRIscreenPrivate *sPriv);
+extern gammaScreenPtr	  gammaCreateScreen(__DRIscreen *sPriv);
+extern void	  gammaDestroyScreen(__DRIscreen *sPriv);
 extern GLboolean gammaCreateContext( const __GLcontextModes *glVisual,
-                                     __DRIcontextPrivate *driContextPriv,
+                                     __DRIcontext *driContextPriv,
                                      void *sharedContextPrivate);
 
 #define GAMMA_UPLOAD_ALL	0xffffffff
@@ -230,9 +230,9 @@ typedef void (*gamma_point_func)( gammaContextPtr,
 struct gamma_context {
 	GLcontext 		*glCtx;		/* Mesa context */
 
-	__DRIcontextPrivate	*driContext;
-	__DRIscreenPrivate	*driScreen;
-	__DRIdrawablePrivate	*driDrawable;
+	__DRIcontext	*driContext;
+	__DRIscreen	*driScreen;
+	__DRIdrawable	*driDrawable;
 
 	GLuint 			new_gl_state;
 	GLuint 			new_state;
diff --git a/src/mesa/drivers/dri/gamma/gamma_lock.c b/src/mesa/drivers/dri/gamma/gamma_lock.c
index 8f2d01688c..cd4acef24d 100644
--- a/src/mesa/drivers/dri/gamma/gamma_lock.c
+++ b/src/mesa/drivers/dri/gamma/gamma_lock.c
@@ -19,8 +19,8 @@ int prevLockLine = 0;
  */
 void gammaGetLock( gammaContextPtr gmesa, GLuint flags )
 {
-   __DRIdrawablePrivate *dPriv = gmesa->driDrawable;
-   __DRIscreenPrivate *sPriv = gmesa->driScreen;
+   __DRIdrawable *dPriv = gmesa->driDrawable;
+   __DRIscreen *sPriv = gmesa->driScreen;
 
    drmGetLock( gmesa->driFd, gmesa->hHWContext, flags );
 
diff --git a/src/mesa/drivers/dri/gamma/gamma_macros.h b/src/mesa/drivers/dri/gamma/gamma_macros.h
index c15483b770..d962dcdb56 100644
--- a/src/mesa/drivers/dri/gamma/gamma_macros.h
+++ b/src/mesa/drivers/dri/gamma/gamma_macros.h
@@ -245,8 +245,8 @@ do {                                                                       \
 #ifdef DO_VALIDATE
 #define VALIDATE_DRAWABLE_INFO_NO_LOCK(gcp)                                \
 do {                                                                       \
-    /*__DRIscreenPrivate *psp = gcp->driScreen;*/                          \
-    __DRIdrawablePrivate *pdp = gcp->driDrawable;                          \
+    /*__DRIscreen *psp = gcp->driScreen;*/                          \
+    __DRIdrawable *pdp = gcp->driDrawable;                          \
                                                                            \
     if (*(pdp->pStamp) != pdp->lastStamp) {                                \
 	int old_index = pdp->index;                                        \
@@ -301,7 +301,7 @@ do {                                                                       \
 
 #define VALIDATE_DRAWABLE_INFO(gcp)                                    \
 do {                                                                       \
-    __DRIscreenPrivate *psp = gcp->driScreen;                          \
+    __DRIscreen *psp = gcp->driScreen;                          \
 if (gcp->driDrawable) { \
     DRM_SPINLOCK(&psp->pSAREA->drawable_lock, psp->drawLockID);            \
     VALIDATE_DRAWABLE_INFO_NO_LOCK(gcp);                               \
diff --git a/src/mesa/drivers/dri/gamma/gamma_render.c b/src/mesa/drivers/dri/gamma/gamma_render.c
index 1b9fd169f4..a03a93d132 100644
--- a/src/mesa/drivers/dri/gamma/gamma_render.c
+++ b/src/mesa/drivers/dri/gamma/gamma_render.c
@@ -53,13 +53,13 @@ static void gamma_emit( GLcontext *ctx, GLuint start, GLuint end)
    GLfloat (*tc0)[4] = 0;
    GLuint tc0_size = 0;
 
-   col = VB->ColorPtr[0]->data;
-   col_stride = VB->ColorPtr[0]->stride;
+   col = VB->AttribPtr[_TNL_ATTRIB_COLOR0]->data;
+   col_stride = VB->AttribPtr[_TNL_ATTRIB_COLOR0]->stride;
 
    if (ctx->Texture.Unit[0]._ReallyEnabled) {
-      tc0_stride = VB->TexCoordPtr[0]->stride;
-      tc0 = VB->TexCoordPtr[0]->data;
-      tc0_size = VB->TexCoordPtr[0]->size;
+      tc0_stride = VB->AttribPtr[_TNL_ATTRIB_TEX0]->stride;
+      tc0 = VB->AttribPtr[_TNL_ATTRIB_TEX0]->data;
+      tc0_size = VB->AttribPtr[_TNL_ATTRIB_TEX0]->size;
       coord = VB->ClipPtr->data;
       coord_stride = VB->ClipPtr->stride;
    } else {
diff --git a/src/mesa/drivers/dri/gamma/gamma_screen.c b/src/mesa/drivers/dri/gamma/gamma_screen.c
index f899ebec96..f72a4a5696 100644
--- a/src/mesa/drivers/dri/gamma/gamma_screen.c
+++ b/src/mesa/drivers/dri/gamma/gamma_screen.c
@@ -29,7 +29,7 @@
 
 #include "main/imports.h"
 
-gammaScreenPtr gammaCreateScreen( __DRIscreenPrivate *sPriv )
+gammaScreenPtr gammaCreateScreen( __DRIscreen *sPriv )
 {
    gammaScreenPtr gammaScreen;
    GLINTDRIPtr gDRIPriv = (GLINTDRIPtr)sPriv->pDevPriv;
@@ -129,7 +129,7 @@ gammaScreenPtr gammaCreateScreen( __DRIscreenPrivate *sPriv )
 
 /* Destroy the device specific screen private data struct.
  */
-void gammaDestroyScreen( __DRIscreenPrivate *sPriv )
+void gammaDestroyScreen( __DRIscreen *sPriv )
 {
     gammaScreenPtr gammaScreen = (gammaScreenPtr)sPriv->private;
 
diff --git a/src/mesa/drivers/dri/gamma/gamma_screen.h b/src/mesa/drivers/dri/gamma/gamma_screen.h
index 7f0ed6f80e..c716ea89c2 100644
--- a/src/mesa/drivers/dri/gamma/gamma_screen.h
+++ b/src/mesa/drivers/dri/gamma/gamma_screen.h
@@ -11,7 +11,7 @@ typedef struct {
 
     drmBufMapPtr  bufs;              /* Map of DMA buffers */
 
-    __DRIscreenPrivate *driScreen; /* Back pointer to DRI screen */
+    __DRIscreen *driScreen; /* Back pointer to DRI screen */
 
     int		cpp;
     int		frontPitch;
diff --git a/src/mesa/drivers/dri/gamma/gamma_span.c b/src/mesa/drivers/dri/gamma/gamma_span.c
index cdaaac3f3a..3f0b81800c 100644
--- a/src/mesa/drivers/dri/gamma/gamma_span.c
+++ b/src/mesa/drivers/dri/gamma/gamma_span.c
@@ -10,8 +10,8 @@
 #define LOCAL_VARS							\
    gammaContextPtr gmesa = GAMMA_CONTEXT(ctx);				\
    gammaScreenPtr gammascrn = gmesa->gammaScreen;			\
-   __DRIscreenPrivate *sPriv = gmesa->driScreen;			\
-   __DRIdrawablePrivate *dPriv = gmesa->driDrawable;			\
+   __DRIscreen *sPriv = gmesa->driScreen;			\
+   __DRIdrawable *dPriv = gmesa->driDrawable;			\
    GLuint pitch = sPriv->fbWidth * gammascrn->cpp;		\
    GLuint height = dPriv->h;						\
    char *buf = (char *)(sPriv->pFB +					\
@@ -24,8 +24,8 @@
 /* FIXME! Depth/Stencil read/writes don't work ! */
 #define LOCAL_DEPTH_VARS				\
    gammaScreenPtr gammascrn = gmesa->gammaScreen;	\
-   __DRIdrawablePrivate *dPriv = gmesa->driDrawable;	\
-   __DRIscreenPrivate *sPriv = gmesa->driScreen;	\
+   __DRIdrawable *dPriv = gmesa->driDrawable;	\
+   __DRIscreen *sPriv = gmesa->driScreen;	\
    GLuint pitch = gammascrn->depthPitch;		\
    GLuint height = dPriv->h;				\
    char *buf = (char *)(sPriv->pFB +			\
diff --git a/src/mesa/drivers/dri/gamma/gamma_state.c b/src/mesa/drivers/dri/gamma/gamma_state.c
index 59272f9bc9..47df37466d 100644
--- a/src/mesa/drivers/dri/gamma/gamma_state.c
+++ b/src/mesa/drivers/dri/gamma/gamma_state.c
@@ -813,10 +813,10 @@ static void gammaUpdateMasks( GLcontext *ctx )
 
 
    GLuint mask = gammaPackColor( gmesa->gammaScreen->cpp,
-				ctx->Color.ColorMask[RCOMP],
-				ctx->Color.ColorMask[GCOMP],
-				ctx->Color.ColorMask[BCOMP],
-				ctx->Color.ColorMask[ACOMP] );
+				ctx->Color.ColorMask[0][RCOMP],
+				ctx->Color.ColorMask[0][GCOMP],
+				ctx->Color.ColorMask[0][BCOMP],
+				ctx->Color.ColorMask[0][ACOMP] );
 
    if (gmesa->gammaScreen->cpp == 2) mask |= mask << 16;
 
@@ -1070,7 +1070,7 @@ static void gammaDDReadBuffer( GLcontext *ctx, GLenum mode )
 void gammaUpdateWindow( GLcontext *ctx )
 {
    gammaContextPtr gmesa = GAMMA_CONTEXT(ctx);
-   __DRIdrawablePrivate *dPriv = gmesa->driDrawable;
+   __DRIdrawable *dPriv = gmesa->driDrawable;
    GLfloat xoffset = (GLfloat)dPriv->x;
    GLfloat yoffset = gmesa->driScreen->fbHeight - (GLfloat)dPriv->y - dPriv->h;
    const GLfloat *v = ctx->Viewport._WindowMap.m;
@@ -1109,7 +1109,7 @@ static void gammaDDDepthRange( GLcontext *ctx, GLclampd nearval,
 void gammaUpdateViewportOffset( GLcontext *ctx )
 {
    gammaContextPtr gmesa = GAMMA_CONTEXT(ctx);
-   __DRIdrawablePrivate *dPriv = gmesa->driDrawable;
+   __DRIdrawable *dPriv = gmesa->driDrawable;
    GLfloat xoffset = (GLfloat)dPriv->x;
    GLfloat yoffset = gmesa->driScreen->fbHeight - (GLfloat)dPriv->y - dPriv->h;
    const GLfloat *v = ctx->Viewport._WindowMap.m;
diff --git a/src/mesa/drivers/dri/gamma/gamma_tex.c b/src/mesa/drivers/dri/gamma/gamma_tex.c
index 0dad250e4d..694e5eba5b 100644
--- a/src/mesa/drivers/dri/gamma/gamma_tex.c
+++ b/src/mesa/drivers/dri/gamma/gamma_tex.c
@@ -145,7 +145,7 @@ static void gammaTexParameter( GLcontext *ctx, GLenum target,
       break;
   
    case GL_TEXTURE_BORDER_COLOR:
-      gammaSetTexBorderColor( gmesa, t, tObj->BorderColor );
+      gammaSetTexBorderColor( gmesa, t, tObj->BorderColor.f );
       break;
 
    case GL_TEXTURE_BASE_LEVEL:
@@ -349,7 +349,7 @@ static void gammaBindTexture( GLcontext *ctx, GLenum target,
 
 	 gammaSetTexWrapping( t, tObj->WrapS, tObj->WrapT );
 	 gammaSetTexFilter( gmesa, t, tObj->MinFilter, tObj->MagFilter, bias );
-	 gammaSetTexBorderColor( gmesa, t, tObj->BorderColor );
+	 gammaSetTexBorderColor( gmesa, t, tObj->BorderColor.f );
       }
 }
 
diff --git a/src/mesa/drivers/dri/gamma/gamma_xmesa.c b/src/mesa/drivers/dri/gamma/gamma_xmesa.c
index 7b5b53589c..e49ab5bae3 100644
--- a/src/mesa/drivers/dri/gamma/gamma_xmesa.c
+++ b/src/mesa/drivers/dri/gamma/gamma_xmesa.c
@@ -36,7 +36,7 @@
 #include "vbo/vbo.h"
 
 static GLboolean 
-gammaInitDriver(__DRIscreenPrivate *sPriv)
+gammaInitDriver(__DRIscreen *sPriv)
 {
     sPriv->private = (void *) gammaCreateScreen( sPriv );
 
@@ -49,7 +49,7 @@ gammaInitDriver(__DRIscreenPrivate *sPriv)
 }
 
 static void 
-gammaDestroyContext(__DRIcontextPrivate *driContextPriv)
+gammaDestroyContext(__DRIcontext *driContextPriv)
 {
     gammaContextPtr gmesa = (gammaContextPtr)driContextPriv->driverPrivate;
 
@@ -72,8 +72,8 @@ gammaDestroyContext(__DRIcontextPrivate *driContextPriv)
 
 
 static GLboolean
-gammaCreateBuffer( __DRIscreenPrivate *driScrnPriv,
-                   __DRIdrawablePrivate *driDrawPriv,
+gammaCreateBuffer( __DRIscreen *driScrnPriv,
+                   __DRIdrawable *driDrawPriv,
                    const __GLcontextModes *mesaVis,
                    GLboolean isPixmap )
 {
@@ -94,17 +94,17 @@ gammaCreateBuffer( __DRIscreenPrivate *driScrnPriv,
 
 
 static void
-gammaDestroyBuffer(__DRIdrawablePrivate *driDrawPriv)
+gammaDestroyBuffer(__DRIdrawable *driDrawPriv)
 {
    _mesa_reference_framebuffer((GLframebuffer **)(&(driDrawPriv->driverPrivate)), NULL);
 }
 
 static void
-gammaSwapBuffers( __DRIdrawablePrivate *dPriv )
+gammaSwapBuffers( __DRIdrawable *dPriv )
 {
    if (dPriv->driContextPriv && dPriv->driContextPriv->driverPrivate) {
     gammaContextPtr gmesa;
-    __DRIscreenPrivate *driScrnPriv;
+    __DRIscreen *driScrnPriv;
     GLcontext *ctx;
 
     gmesa = (gammaContextPtr) dPriv->driContextPriv->driverPrivate;
@@ -127,7 +127,7 @@ gammaSwapBuffers( __DRIdrawablePrivate *dPriv )
 	int i;
 	int nRect = dPriv->numClipRects;
 	drm_clip_rect_t *pRect = dPriv->pClipRects;
-	__DRIscreenPrivate *driScrnPriv = gmesa->driScreen;
+	__DRIscreen *driScrnPriv = gmesa->driScreen;
    	GLINTDRIPtr gDRIPriv = (GLINTDRIPtr)driScrnPriv->pDevPriv;
 
 	CHECK_DMA_BUFFER(gmesa, 2);
@@ -193,9 +193,9 @@ gammaSwapBuffers( __DRIdrawablePrivate *dPriv )
 }
 
 static GLboolean 
-gammaMakeCurrent(__DRIcontextPrivate *driContextPriv,
-		 __DRIdrawablePrivate *driDrawPriv,
-		 __DRIdrawablePrivate *driReadPriv)
+gammaMakeCurrent(__DRIcontext *driContextPriv,
+		 __DRIdrawable *driDrawPriv,
+		 __DRIdrawable *driReadPriv)
 {
     if (driContextPriv) {
 	GET_CURRENT_CONTEXT(ctx);
@@ -232,7 +232,7 @@ newGammaCtx->new_state |= GAMMA_NEW_WINDOW; /* FIXME */
 
 
 static GLboolean 
-gammaUnbindContext( __DRIcontextPrivate *driContextPriv )
+gammaUnbindContext( __DRIcontext *driContextPriv )
 {
    return GL_TRUE;
 }
@@ -254,12 +254,19 @@ const struct __DriverAPIRec driDriverAPI = {
 /*
  * This is the bootstrap function for the driver.
  * The __driCreateScreen name is the symbol that libGL.so fetches.
- * Return:  pointer to a __DRIscreenPrivate.
+ * Return:  pointer to a __DRIscreen.
  */
 void *__driCreateScreen(Display *dpy, int scrn, __DRIscreen *psc,
                         int numConfigs, __GLXvisualConfig *config)
 {
-   __DRIscreenPrivate *psp;
+   __DRIscreen *psp;
    psp = __driUtilCreateScreen(dpy, scrn, psc, numConfigs, config, &gammaAPI);
    return (void *) psp;
 }
+
+/* This is the table of extensions that the loader will dlsym() for. */
+PUBLIC const __DRIextension *__driDriverExtensions[] = {
+    &driCoreExtension.base,
+    &driLegacyExtension.base,
+    NULL
+};
diff --git a/src/mesa/drivers/dri/i810/i810context.c b/src/mesa/drivers/dri/i810/i810context.c
index 7311b2e765..bd9cfe5c0f 100644
--- a/src/mesa/drivers/dri/i810/i810context.c
+++ b/src/mesa/drivers/dri/i810/i810context.c
@@ -170,12 +170,12 @@ static const struct dri_debug_control debug_control[] =
 
 GLboolean
 i810CreateContext( const __GLcontextModes *mesaVis,
-                   __DRIcontextPrivate *driContextPriv,
+                   __DRIcontext *driContextPriv,
                    void *sharedContextPrivate )
 {
    GLcontext *ctx, *shareCtx;
    i810ContextPtr imesa;
-   __DRIscreenPrivate *sPriv = driContextPriv->driScreenPriv;
+   __DRIscreen *sPriv = driContextPriv->driScreenPriv;
    i810ScreenPrivate *i810Screen = (i810ScreenPrivate *)sPriv->private;
    I810SAREAPtr saPriv = (I810SAREAPtr)
       (((GLubyte *)sPriv->pSAREA) + i810Screen->sarea_priv_offset);
@@ -337,7 +337,7 @@ i810CreateContext( const __GLcontextModes *mesaVis,
 }
 
 void
-i810DestroyContext(__DRIcontextPrivate *driContextPriv)
+i810DestroyContext(__DRIcontext *driContextPriv)
 {
    i810ContextPtr imesa = (i810ContextPtr) driContextPriv->driverPrivate;
 
@@ -378,7 +378,7 @@ i810DestroyContext(__DRIcontextPrivate *driContextPriv)
 
 void i810XMesaSetFrontClipRects( i810ContextPtr imesa )
 {
-   __DRIdrawablePrivate *dPriv = imesa->driDrawable;
+   __DRIdrawable *dPriv = imesa->driDrawable;
 
    imesa->numClipRects = dPriv->numClipRects;
    imesa->pClipRects = dPriv->pClipRects;
@@ -392,7 +392,7 @@ void i810XMesaSetFrontClipRects( i810ContextPtr imesa )
 
 void i810XMesaSetBackClipRects( i810ContextPtr imesa )
 {
-   __DRIdrawablePrivate *dPriv = imesa->driDrawable;
+   __DRIdrawable *dPriv = imesa->driDrawable;
 
    if (imesa->sarea->pf_enabled == 0 && dPriv->numBackClipRects == 0)
    {
@@ -430,7 +430,7 @@ static void i810XMesaWindowMoved( i810ContextPtr imesa )
 
 
 GLboolean
-i810UnbindContext(__DRIcontextPrivate *driContextPriv)
+i810UnbindContext(__DRIcontext *driContextPriv)
 {
    i810ContextPtr imesa = (i810ContextPtr) driContextPriv->driverPrivate;
    if (imesa) {
@@ -444,9 +444,9 @@ i810UnbindContext(__DRIcontextPrivate *driContextPriv)
 
 
 GLboolean
-i810MakeCurrent(__DRIcontextPrivate *driContextPriv,
-                __DRIdrawablePrivate *driDrawPriv,
-                __DRIdrawablePrivate *driReadPriv)
+i810MakeCurrent(__DRIcontext *driContextPriv,
+                __DRIdrawable *driDrawPriv,
+                __DRIdrawable *driReadPriv)
 {
    if (driContextPriv) {
       i810ContextPtr imesa = (i810ContextPtr) driContextPriv->driverPrivate;
@@ -504,8 +504,8 @@ i810UpdatePageFlipping( i810ContextPtr imesa )
 
 void i810GetLock( i810ContextPtr imesa, GLuint flags )
 {
-   __DRIdrawablePrivate *dPriv = imesa->driDrawable;
-   __DRIscreenPrivate *sPriv = imesa->driScreen;
+   __DRIdrawable *dPriv = imesa->driDrawable;
+   __DRIscreen *sPriv = imesa->driScreen;
    I810SAREAPtr sarea = imesa->sarea;
    int me = imesa->hHWContext;
    unsigned i;
@@ -551,7 +551,7 @@ void i810GetLock( i810ContextPtr imesa, GLuint flags )
 
 
 void
-i810SwapBuffers( __DRIdrawablePrivate *dPriv )
+i810SwapBuffers( __DRIdrawable *dPriv )
 {
    if (dPriv->driContextPriv && dPriv->driContextPriv->driverPrivate) {
       i810ContextPtr imesa;
diff --git a/src/mesa/drivers/dri/i810/i810context.h b/src/mesa/drivers/dri/i810/i810context.h
index 4b8c71d7c6..19529db020 100644
--- a/src/mesa/drivers/dri/i810/i810context.h
+++ b/src/mesa/drivers/dri/i810/i810context.h
@@ -170,8 +170,8 @@ struct i810_context_t {
    drm_hw_lock_t *driHwLock;
    int driFd;
 
-   __DRIdrawablePrivate *driDrawable;
-   __DRIscreenPrivate *driScreen;
+   __DRIdrawable *driDrawable;
+   __DRIscreen *driScreen;
    i810ScreenPrivate *i810Screen; 
    I810SAREAPtr sarea;
 };
diff --git a/src/mesa/drivers/dri/i810/i810ioctl.c b/src/mesa/drivers/dri/i810/i810ioctl.c
index 3df9c2ac47..c631543d93 100644
--- a/src/mesa/drivers/dri/i810/i810ioctl.c
+++ b/src/mesa/drivers/dri/i810/i810ioctl.c
@@ -50,8 +50,8 @@ static drmBufPtr i810_get_buffer_ioctl( i810ContextPtr imesa )
 static void i810Clear( GLcontext *ctx, GLbitfield mask )
 {
    i810ContextPtr imesa = I810_CONTEXT( ctx );
-   __DRIdrawablePrivate *dPriv = imesa->driDrawable;
-   const GLuint colorMask = *((GLuint *) &ctx->Color.ColorMask);
+   __DRIdrawable *dPriv = imesa->driDrawable;
+   const GLuint colorMask = *((GLuint *) &ctx->Color.ColorMask[0]);
    drmI810Clear clear;
    unsigned int i;
 
@@ -149,7 +149,7 @@ static void i810Clear( GLcontext *ctx, GLbitfield mask )
 /*
  * Copy the back buffer to the front buffer. 
  */
-void i810CopyBuffer( const __DRIdrawablePrivate *dPriv ) 
+void i810CopyBuffer( const __DRIdrawable *dPriv ) 
 {
    i810ContextPtr imesa;
    drm_clip_rect_t *pbox;
@@ -197,7 +197,7 @@ void i810CopyBuffer( const __DRIdrawablePrivate *dPriv )
 /*
  * XXX implement when full-screen extension is done.
  */
-void i810PageFlip( const __DRIdrawablePrivate *dPriv ) 
+void i810PageFlip( const __DRIdrawable *dPriv ) 
 {
   i810ContextPtr imesa;
   int tmp, ret;
diff --git a/src/mesa/drivers/dri/i810/i810ioctl.h b/src/mesa/drivers/dri/i810/i810ioctl.h
index dfd6e21088..926e38ce51 100644
--- a/src/mesa/drivers/dri/i810/i810ioctl.h
+++ b/src/mesa/drivers/dri/i810/i810ioctl.h
@@ -14,8 +14,8 @@ void i810WaitAge( i810ContextPtr imesa, int age );
 void i810DmaFinish( i810ContextPtr imesa );
 void i810RegetLockQuiescent( i810ContextPtr imesa );
 void i810InitIoctlFuncs( struct dd_function_table *functions );
-void i810CopyBuffer( const __DRIdrawablePrivate *dpriv );
-void i810PageFlip( const __DRIdrawablePrivate *dpriv );
+void i810CopyBuffer( const __DRIdrawable *dpriv );
+void i810PageFlip( const __DRIdrawable *dpriv );
 int i810_check_copy(int fd);
 
 #define I810_STATECHANGE(imesa, flag)				\
diff --git a/src/mesa/drivers/dri/i810/i810screen.c b/src/mesa/drivers/dri/i810/i810screen.c
index 2f6b8631ff..2a30782afd 100644
--- a/src/mesa/drivers/dri/i810/i810screen.c
+++ b/src/mesa/drivers/dri/i810/i810screen.c
@@ -54,7 +54,7 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "GL/internal/dri_interface.h"
 
 static const __DRIconfig **
-i810FillInModes( __DRIscreenPrivate *psp,
+i810FillInModes( __DRIscreen *psp,
 		 unsigned pixel_bits, unsigned depth_bits,
 		 unsigned stencil_bits, GLboolean have_back_buffer )
 {
@@ -255,7 +255,7 @@ i810InitScreen(__DRIscreen *sPriv)
 }
 
 static void
-i810DestroyScreen(__DRIscreenPrivate *sPriv)
+i810DestroyScreen(__DRIscreen *sPriv)
 {
    i810ScreenPrivate *i810Screen = (i810ScreenPrivate *)sPriv->private;
 
@@ -274,8 +274,8 @@ i810DestroyScreen(__DRIscreenPrivate *sPriv)
  * Create a buffer which corresponds to the window.
  */
 static GLboolean
-i810CreateBuffer( __DRIscreenPrivate *driScrnPriv,
-                  __DRIdrawablePrivate *driDrawPriv,
+i810CreateBuffer( __DRIscreen *driScrnPriv,
+                  __DRIdrawable *driDrawPriv,
                   const __GLcontextModes *mesaVis,
                   GLboolean isPixmap )
 {
@@ -335,7 +335,7 @@ i810CreateBuffer( __DRIscreenPrivate *driScrnPriv,
 
 
 static void
-i810DestroyBuffer(__DRIdrawablePrivate *driDrawPriv)
+i810DestroyBuffer(__DRIdrawable *driDrawPriv)
 {
    _mesa_reference_framebuffer((GLframebuffer **)(&(driDrawPriv->driverPrivate)), NULL);
 }
@@ -356,3 +356,10 @@ const struct __DriverAPIRec driDriverAPI = {
    .WaitForSBC      = NULL,
    .SwapBuffersMSC  = NULL
 };
+
+/* This is the table of extensions that the loader will dlsym() for. */
+PUBLIC const __DRIextension *__driDriverExtensions[] = {
+    &driCoreExtension.base,
+    &driLegacyExtension.base,
+    NULL
+};
diff --git a/src/mesa/drivers/dri/i810/i810screen.h b/src/mesa/drivers/dri/i810/i810screen.h
index b29937665a..734e2fb002 100644
--- a/src/mesa/drivers/dri/i810/i810screen.h
+++ b/src/mesa/drivers/dri/i810/i810screen.h
@@ -71,7 +71,7 @@ typedef struct {
    int textureSize;
    int logTextureGranularity;
 
-   __DRIscreenPrivate *driScrnPriv;
+   __DRIscreen *driScrnPriv;
    drmBufMapPtr  bufs;
    unsigned int sarea_priv_offset;
 } i810ScreenPrivate;
@@ -79,21 +79,21 @@ typedef struct {
 
 extern GLboolean
 i810CreateContext( const __GLcontextModes *mesaVis,
-                   __DRIcontextPrivate *driContextPriv,
+                   __DRIcontext *driContextPriv,
                    void *sharedContextPrivate );
 
 extern void
-i810DestroyContext(__DRIcontextPrivate *driContextPriv);
+i810DestroyContext(__DRIcontext *driContextPriv);
 
 extern GLboolean
-i810UnbindContext(__DRIcontextPrivate *driContextPriv);
+i810UnbindContext(__DRIcontext *driContextPriv);
 
 extern GLboolean
-i810MakeCurrent(__DRIcontextPrivate *driContextPriv,
-                __DRIdrawablePrivate *driDrawPriv,
-                __DRIdrawablePrivate *driReadPriv);
+i810MakeCurrent(__DRIcontext *driContextPriv,
+                __DRIdrawable *driDrawPriv,
+                __DRIdrawable *driReadPriv);
 
 extern void
-i810SwapBuffers(__DRIdrawablePrivate *driDrawPriv);
+i810SwapBuffers(__DRIdrawable *driDrawPriv);
 
 #endif
diff --git a/src/mesa/drivers/dri/i810/i810span.c b/src/mesa/drivers/dri/i810/i810span.c
index 510723f445..6576f6745e 100644
--- a/src/mesa/drivers/dri/i810/i810span.c
+++ b/src/mesa/drivers/dri/i810/i810span.c
@@ -15,7 +15,7 @@
 
 #define LOCAL_VARS					\
    i810ContextPtr imesa = I810_CONTEXT(ctx);	        \
-   __DRIdrawablePrivate *dPriv = imesa->driDrawable;	\
+   __DRIdrawable *dPriv = imesa->driDrawable;	\
    driRenderbuffer *drb = (driRenderbuffer *) rb;	\
    GLuint pitch = drb->pitch;				\
    GLuint height = dPriv->h;				\
@@ -27,7 +27,7 @@
 
 #define LOCAL_DEPTH_VARS				\
    i810ContextPtr imesa = I810_CONTEXT(ctx);	        \
-   __DRIdrawablePrivate *dPriv = imesa->driDrawable;	\
+   __DRIdrawable *dPriv = imesa->driDrawable;	\
    driRenderbuffer *drb = (driRenderbuffer *) rb;	\
    GLuint pitch = drb->pitch;				\
    GLuint height = dPriv->h;				\
diff --git a/src/mesa/drivers/dri/i810/i810state.c b/src/mesa/drivers/dri/i810/i810state.c
index 1e7a6cfe47..642245c61c 100644
--- a/src/mesa/drivers/dri/i810/i810state.c
+++ b/src/mesa/drivers/dri/i810/i810state.c
@@ -641,7 +641,7 @@ static void i810Enable(GLcontext *ctx, GLenum cap, GLboolean state)
 
 void i810EmitDrawingRectangle( i810ContextPtr imesa )
 {
-   __DRIdrawablePrivate *dPriv = imesa->driDrawable;
+   __DRIdrawable *dPriv = imesa->driDrawable;
    i810ScreenPrivate *i810Screen = imesa->i810Screen;
    int x0 = imesa->drawX;
    int y0 = imesa->drawY;
diff --git a/src/mesa/drivers/dri/i810/i810tex.c b/src/mesa/drivers/dri/i810/i810tex.c
index 2f6978f5aa..e764644a6c 100644
--- a/src/mesa/drivers/dri/i810/i810tex.c
+++ b/src/mesa/drivers/dri/i810/i810tex.c
@@ -210,7 +210,7 @@ i810AllocTexObj( GLcontext *ctx, struct gl_texture_object *texObj )
       i810SetTexWrapping( t, texObj->WrapS, texObj->WrapT );
       /*i830SetTexMaxAnisotropy( t, texObj->MaxAnisotropy );*/
       i810SetTexFilter( imesa, t, texObj->MinFilter, texObj->MagFilter, bias );
-      i810SetTexBorderColor( t, texObj->BorderColor );
+      i810SetTexBorderColor( t, texObj->BorderColor.f );
    }
 
    return t;
@@ -251,7 +251,7 @@ static void i810TexParameter( GLcontext *ctx, GLenum target,
       break;
   
    case GL_TEXTURE_BORDER_COLOR:
-      i810SetTexBorderColor( t, tObj->BorderColor );
+      i810SetTexBorderColor( t, tObj->BorderColor.f );
       break;
 
    case GL_TEXTURE_BASE_LEVEL:
diff --git a/src/mesa/drivers/dri/i810/i810tex.h b/src/mesa/drivers/dri/i810/i810tex.h
index d980927030..28958dcb4b 100644
--- a/src/mesa/drivers/dri/i810/i810tex.h
+++ b/src/mesa/drivers/dri/i810/i810tex.h
@@ -29,7 +29,6 @@
 #include "main/mtypes.h"
 #include "main/mm.h"
 
-#include "i810context.h"
 #include "i810_3d_reg.h"
 #include "texmem.h"
 
diff --git a/src/mesa/drivers/dri/i810/i810tris.c b/src/mesa/drivers/dri/i810/i810tris.c
index b508496fb6..213ba541ce 100644
--- a/src/mesa/drivers/dri/i810/i810tris.c
+++ b/src/mesa/drivers/dri/i810/i810tris.c
@@ -270,7 +270,8 @@ do {							\
 
 #define LOCAL_VARS(n)							\
    i810ContextPtr imesa = I810_CONTEXT(ctx);				\
-   GLuint color[n], spec[n];						\
+   GLuint color[n] = { 0 };						\
+   GLuint spec[n] = { 0 };						\
    GLuint coloroffset = (imesa->vertex_size == 4 ? 3 : 4);		\
    GLboolean havespec = (imesa->vertex_size > 4);			\
    (void) color; (void) spec; (void) coloroffset; (void) havespec;
diff --git a/src/mesa/drivers/dri/i915/Makefile b/src/mesa/drivers/dri/i915/Makefile
index 37f15aa767..cf32476f40 100644
--- a/src/mesa/drivers/dri/i915/Makefile
+++ b/src/mesa/drivers/dri/i915/Makefile
@@ -34,7 +34,6 @@ DRIVER_SOURCES = \
 	intel_pixel_read.c \
 	intel_buffers.c \
 	intel_blit.c \
-	intel_swapbuffers.c \
 	i915_tex_layout.c \
 	i915_texstate.c \
 	i915_context.c \
@@ -64,7 +63,8 @@ DRIVER_DEFINES = -I../intel -I../intel/server -DI915 \
 	$(shell pkg-config libdrm --atleast-version=2.3.1 \
 				&& echo "-DDRM_VBLANK_FLIP=DRM_VBLANK_FLIP")
 
-DRI_LIB_DEPS += -ldrm_intel
+INCLUDES += $(INTEL_CFLAGS)
+DRI_LIB_DEPS += $(INTEL_LIBS)
 
 include ../Makefile.template
 
diff --git a/src/mesa/drivers/dri/i915/i830_context.c b/src/mesa/drivers/dri/i915/i830_context.c
index 840946f908..4cb6305988 100644
--- a/src/mesa/drivers/dri/i915/i830_context.c
+++ b/src/mesa/drivers/dri/i915/i830_context.c
@@ -53,7 +53,7 @@ extern const struct tnl_pipeline_stage *intel_pipeline[];
 
 GLboolean
 i830CreateContext(const __GLcontextModes * mesaVis,
-                  __DRIcontextPrivate * driContextPriv,
+                  __DRIcontext * driContextPriv,
                   void *sharedContextPrivate)
 {
    struct dd_function_table functions;
diff --git a/src/mesa/drivers/dri/i915/i830_context.h b/src/mesa/drivers/dri/i915/i830_context.h
index f73cbbf88b..592ae53976 100644
--- a/src/mesa/drivers/dri/i915/i830_context.h
+++ b/src/mesa/drivers/dri/i915/i830_context.h
@@ -178,7 +178,7 @@ i830_state_draw_region(struct intel_context *intel,
  */
 extern GLboolean
 i830CreateContext(const __GLcontextModes * mesaVis,
-                  __DRIcontextPrivate * driContextPriv,
+                  __DRIcontext * driContextPriv,
                   void *sharedContextPrivate);
 
 /* i830_tex.c, i830_texstate.c
diff --git a/src/mesa/drivers/dri/i915/i830_state.c b/src/mesa/drivers/dri/i915/i830_state.c
index 645ebe3057..acda7e70de 100644
--- a/src/mesa/drivers/dri/i915/i830_state.c
+++ b/src/mesa/drivers/dri/i915/i830_state.c
@@ -620,7 +620,7 @@ i830LineWidth(GLcontext * ctx, GLfloat widthf)
    DBG("%s\n", __FUNCTION__);
    
    width = (int) (widthf * 2);
-   CLAMP_SELF(width, 1, 15);
+   width = CLAMP(width, 1, 15);
 
    state5 = i830->state.Ctx[I830_CTXREG_STATE5] & ~FIXED_LINE_WIDTH_MASK;
    state5 |= (ENABLE_FIXED_LINE_WIDTH | FIXED_LINE_WIDTH(width));
@@ -639,7 +639,7 @@ i830PointSize(GLcontext * ctx, GLfloat size)
 
    DBG("%s\n", __FUNCTION__);
    
-   CLAMP_SELF(point_size, 1, 256);
+   point_size = CLAMP(point_size, 1, 256);
    I830_STATECHANGE(i830, I830_UPLOAD_CTX);
    i830->state.Ctx[I830_CTXREG_STATE5] &= ~FIXED_POINT_WIDTH_MASK;
    i830->state.Ctx[I830_CTXREG_STATE5] |= (ENABLE_FIXED_POINT_WIDTH |
diff --git a/src/mesa/drivers/dri/i915/i830_texstate.c b/src/mesa/drivers/dri/i915/i830_texstate.c
index f4bbb53b86..7525f9f2e0 100644
--- a/src/mesa/drivers/dri/i915/i830_texstate.c
+++ b/src/mesa/drivers/dri/i915/i830_texstate.c
@@ -27,6 +27,7 @@
 
 #include "main/mtypes.h"
 #include "main/enums.h"
+#include "main/colormac.h"
 
 #include "intel_mipmap_tree.h"
 #include "intel_tex.h"
@@ -55,10 +56,7 @@ translate_texture_format(GLuint mesa_format, GLuint internal_format)
    case MESA_FORMAT_ARGB4444:
       return MAPSURF_16BIT | MT_16BIT_ARGB4444;
    case MESA_FORMAT_ARGB8888:
-      if (internal_format == GL_RGB)
-	 return MAPSURF_32BIT | MT_32BIT_XRGB8888;
-      else
-	 return MAPSURF_32BIT | MT_32BIT_ARGB8888;
+      return MAPSURF_32BIT | MT_32BIT_ARGB8888;
    case MESA_FORMAT_XRGB8888:
       return MAPSURF_32BIT | MT_32BIT_XRGB8888;
    case MESA_FORMAT_YCBCR_REV:
@@ -306,16 +304,15 @@ i830_update_tex_unit(struct intel_context *intel, GLuint unit, GLuint ss3)
    }
 
    /* convert border color from float to ubyte */
-   CLAMPED_FLOAT_TO_UBYTE(border[0], tObj->BorderColor[0]);
-   CLAMPED_FLOAT_TO_UBYTE(border[1], tObj->BorderColor[1]);
-   CLAMPED_FLOAT_TO_UBYTE(border[2], tObj->BorderColor[2]);
-   CLAMPED_FLOAT_TO_UBYTE(border[3], tObj->BorderColor[3]);
-
-   state[I830_TEXREG_TM0S4] = INTEL_PACKCOLOR8888(border[0],
-                                                  border[1],
-                                                  border[2],
-                                                  border[3]);
-
+   CLAMPED_FLOAT_TO_UBYTE(border[0], tObj->BorderColor.f[0]);
+   CLAMPED_FLOAT_TO_UBYTE(border[1], tObj->BorderColor.f[1]);
+   CLAMPED_FLOAT_TO_UBYTE(border[2], tObj->BorderColor.f[2]);
+   CLAMPED_FLOAT_TO_UBYTE(border[3], tObj->BorderColor.f[3]);
+
+   state[I830_TEXREG_TM0S4] = PACK_COLOR_8888(border[3],
+					      border[0],
+					      border[1],
+					      border[2]);
 
    I830_ACTIVESTATE(i830, I830_UPLOAD_TEX(unit), GL_TRUE);
    /* memcmp was already disabled, but definitely won't work as the
diff --git a/src/mesa/drivers/dri/i915/i830_vtbl.c b/src/mesa/drivers/dri/i915/i830_vtbl.c
index 4133696129..4471ca2bbb 100644
--- a/src/mesa/drivers/dri/i915/i830_vtbl.c
+++ b/src/mesa/drivers/dri/i915/i830_vtbl.c
@@ -126,7 +126,7 @@ i830_render_start(struct intel_context *intel)
 
       for (i = 0; i < I830_TEX_UNITS; i++) {
          if (RENDERINPUTS_TEST(index_bitset, _TNL_ATTRIB_TEX(i))) {
-            GLuint sz = VB->TexCoordPtr[i]->size;
+            GLuint sz = VB->AttribPtr[_TNL_ATTRIB_TEX0 + i]->size;
             GLuint emit;
             GLuint mcs = (i830->state.Tex[i][I830_TEXREG_MCS] &
                           ~TEXCOORDTYPE_MASK);
@@ -298,7 +298,7 @@ i830_emit_invarient_state(struct intel_context *intel)
 {
    BATCH_LOCALS;
 
-   BEGIN_BATCH(29, IGNORE_CLIPRECTS);
+   BEGIN_BATCH(29);
 
    OUT_BATCH(_3DSTATE_DFLT_DIFFUSE_CMD);
    OUT_BATCH(0);
@@ -366,7 +366,7 @@ i830_emit_invarient_state(struct intel_context *intel)
 
 
 #define emit( intel, state, size )			\
-   intel_batchbuffer_data(intel->batch, state, size, IGNORE_CLIPRECTS )
+   intel_batchbuffer_data(intel->batch, state, size )
 
 static GLuint
 get_dirty(struct i830_hw_state *state)
@@ -429,13 +429,9 @@ i830_emit_state(struct intel_context *intel)
     * It might be better to talk about explicit places where
     * scheduling is allowed, rather than assume that it is whenever a
     * batchbuffer fills up.
-    *
-    * Set the space as LOOP_CLIPRECTS now, since that's what our primitives
-    * will be emitted under.
     */
    intel_batchbuffer_require_space(intel->batch,
-				   get_state_size(state) + INTEL_PRIM_EMIT_SIZE,
-				   LOOP_CLIPRECTS);
+				   get_state_size(state) + INTEL_PRIM_EMIT_SIZE);
    count = 0;
  again:
    aper_count = 0;
@@ -491,17 +487,14 @@ i830_emit_state(struct intel_context *intel)
    }
 
    if (dirty & I830_UPLOAD_BUFFERS) {
-      GLuint count = 9; 
+      GLuint count = 15;
 
       DBG("I830_UPLOAD_BUFFERS:\n");
 
       if (state->depth_region)
           count += 3;
 
-      if (intel->constant_cliprect)
-          count += 6;
-
-      BEGIN_BATCH(count, IGNORE_CLIPRECTS);
+      BEGIN_BATCH(count);
       OUT_BATCH(state->Buffer[I830_DESTREG_CBUFADDR0]);
       OUT_BATCH(state->Buffer[I830_DESTREG_CBUFADDR1]);
       OUT_RELOC(state->draw_region->buffer,
@@ -523,15 +516,13 @@ i830_emit_state(struct intel_context *intel)
       OUT_BATCH(state->Buffer[I830_DESTREG_SR1]);
       OUT_BATCH(state->Buffer[I830_DESTREG_SR2]);
 
-      if (intel->constant_cliprect) {
-	 assert(state->Buffer[I830_DESTREG_DRAWRECT0] != MI_NOOP);
-	 OUT_BATCH(state->Buffer[I830_DESTREG_DRAWRECT0]);
-	 OUT_BATCH(state->Buffer[I830_DESTREG_DRAWRECT1]);
-	 OUT_BATCH(state->Buffer[I830_DESTREG_DRAWRECT2]);
-	 OUT_BATCH(state->Buffer[I830_DESTREG_DRAWRECT3]);
-	 OUT_BATCH(state->Buffer[I830_DESTREG_DRAWRECT4]);
-	 OUT_BATCH(state->Buffer[I830_DESTREG_DRAWRECT5]);
-      }
+      assert(state->Buffer[I830_DESTREG_DRAWRECT0] != MI_NOOP);
+      OUT_BATCH(state->Buffer[I830_DESTREG_DRAWRECT0]);
+      OUT_BATCH(state->Buffer[I830_DESTREG_DRAWRECT1]);
+      OUT_BATCH(state->Buffer[I830_DESTREG_DRAWRECT2]);
+      OUT_BATCH(state->Buffer[I830_DESTREG_DRAWRECT3]);
+      OUT_BATCH(state->Buffer[I830_DESTREG_DRAWRECT4]);
+      OUT_BATCH(state->Buffer[I830_DESTREG_DRAWRECT5]);
       ADVANCE_BATCH();
    }
    
@@ -544,7 +535,7 @@ i830_emit_state(struct intel_context *intel)
       if ((dirty & I830_UPLOAD_TEX(i))) {
          DBG("I830_UPLOAD_TEX(%d):\n", i);
 
-         BEGIN_BATCH(I830_TEX_SETUP_SIZE + 1, IGNORE_CLIPRECTS);
+         BEGIN_BATCH(I830_TEX_SETUP_SIZE + 1);
          OUT_BATCH(state->Tex[i][I830_TEXREG_TM0LI]);
 
          if (state->tex_buffer[i]) {
@@ -645,7 +636,7 @@ i830_state_draw_region(struct intel_context *intel,
             DSTORG_VERT_BIAS(0x8) | DEPTH_IS_Z);    /* .5 */
 
    if (irb != NULL) {
-      switch (irb->texformat) {
+      switch (irb->Base.Format) {
       case MESA_FORMAT_ARGB8888:
       case MESA_FORMAT_XRGB8888:
 	 value |= DV_PF_8888;
@@ -661,7 +652,7 @@ i830_state_draw_region(struct intel_context *intel,
 	 break;
       default:
 	 _mesa_problem(ctx, "Bad renderbuffer format: %d\n",
-		       irb->texformat);
+		       irb->Base.Format);
       }
    }
 
@@ -673,23 +664,14 @@ i830_state_draw_region(struct intel_context *intel,
    }
    state->Buffer[I830_DESTREG_DV1] = value;
 
-   if (intel->constant_cliprect) {
-      state->Buffer[I830_DESTREG_DRAWRECT0] = _3DSTATE_DRAWRECT_INFO;
-      state->Buffer[I830_DESTREG_DRAWRECT1] = 0;
-      state->Buffer[I830_DESTREG_DRAWRECT2] = 0; /* xmin, ymin */
-      state->Buffer[I830_DESTREG_DRAWRECT3] =
-	 (ctx->DrawBuffer->Width & 0xffff) |
-	 (ctx->DrawBuffer->Height << 16);
-      state->Buffer[I830_DESTREG_DRAWRECT4] = 0; /* xoff, yoff */
-      state->Buffer[I830_DESTREG_DRAWRECT5] = 0;
-   } else {
-      state->Buffer[I830_DESTREG_DRAWRECT0] = MI_NOOP;
-      state->Buffer[I830_DESTREG_DRAWRECT1] = MI_NOOP;
-      state->Buffer[I830_DESTREG_DRAWRECT2] = MI_NOOP;
-      state->Buffer[I830_DESTREG_DRAWRECT3] = MI_NOOP;
-      state->Buffer[I830_DESTREG_DRAWRECT4] = MI_NOOP;
-      state->Buffer[I830_DESTREG_DRAWRECT5] = MI_NOOP;
-   }
+   state->Buffer[I830_DESTREG_DRAWRECT0] = _3DSTATE_DRAWRECT_INFO;
+   state->Buffer[I830_DESTREG_DRAWRECT1] = 0;
+   state->Buffer[I830_DESTREG_DRAWRECT2] = 0; /* xmin, ymin */
+   state->Buffer[I830_DESTREG_DRAWRECT3] =
+      (ctx->DrawBuffer->Width & 0xffff) |
+      (ctx->DrawBuffer->Height << 16);
+   state->Buffer[I830_DESTREG_DRAWRECT4] = 0; /* xoff, yoff */
+   state->Buffer[I830_DESTREG_DRAWRECT5] = 0;
 
    I830_STATECHANGE(i830, I830_UPLOAD_BUFFERS);
 
@@ -714,20 +696,8 @@ i830_new_batch(struct intel_context *intel)
 {
    struct i830_context *i830 = i830_context(&intel->ctx);
    i830->state.emitted = 0;
-
-   /* Check that we didn't just wrap our batchbuffer at a bad time. */
-   assert(!intel->no_batch_wrap);
-}
-
-
-
-static GLuint
-i830_flush_cmd(void)
-{
-   return MI_FLUSH | FLUSH_MAP_CACHE;
 }
 
-
 static void 
 i830_assert_not_dirty( struct intel_context *intel )
 {
@@ -753,7 +723,6 @@ i830InitVtbl(struct i830_context *i830)
    i830->intel.vtbl.reduced_primitive_state = i830_reduced_primitive_state;
    i830->intel.vtbl.set_draw_region = i830_set_draw_region;
    i830->intel.vtbl.update_texture_state = i830UpdateTextureState;
-   i830->intel.vtbl.flush_cmd = i830_flush_cmd;
    i830->intel.vtbl.render_start = i830_render_start;
    i830->intel.vtbl.render_prevalidate = i830_render_prevalidate;
    i830->intel.vtbl.assert_not_dirty = i830_assert_not_dirty;
diff --git a/src/mesa/drivers/dri/i915/i915_context.c b/src/mesa/drivers/dri/i915/i915_context.c
index 7d4c7cfbab..7c7711da09 100644
--- a/src/mesa/drivers/dri/i915/i915_context.c
+++ b/src/mesa/drivers/dri/i915/i915_context.c
@@ -100,7 +100,7 @@ extern const struct tnl_pipeline_stage *intel_pipeline[];
 
 GLboolean
 i915CreateContext(const __GLcontextModes * mesaVis,
-                  __DRIcontextPrivate * driContextPriv,
+                  __DRIcontext * driContextPriv,
                   void *sharedContextPrivate)
 {
    struct dd_function_table functions;
@@ -143,6 +143,9 @@ i915CreateContext(const __GLcontextModes * mesaVis,
    ctx->Const.MaxTextureImageUnits = I915_TEX_UNITS;
    ctx->Const.MaxTextureCoordUnits = I915_TEX_UNITS;
    ctx->Const.MaxVarying = I915_TEX_UNITS;
+   ctx->Const.MaxCombinedTextureImageUnits =
+      ctx->Const.MaxVertexTextureImageUnits +
+      ctx->Const.MaxTextureImageUnits;
 
    /* Advertise the full hardware capabilities.  The new memory
     * manager should cope much better with overload situations:
diff --git a/src/mesa/drivers/dri/i915/i915_context.h b/src/mesa/drivers/dri/i915/i915_context.h
index 082d614442..f55b551139 100644
--- a/src/mesa/drivers/dri/i915/i915_context.h
+++ b/src/mesa/drivers/dri/i915/i915_context.h
@@ -39,6 +39,7 @@
 #define I915_FALLBACK_LOGICOP		 0x20000
 #define I915_FALLBACK_POLYGON_SMOOTH	 0x40000
 #define I915_FALLBACK_POINT_SMOOTH	 0x80000
+#define I915_FALLBACK_POINT_SPRITE_COORD_ORIGIN	 0x100000
 
 #define I915_UPLOAD_CTX              0x1
 #define I915_UPLOAD_BUFFERS          0x2
@@ -317,7 +318,7 @@ do {									\
  * i915_context.c
  */
 extern GLboolean i915CreateContext(const __GLcontextModes * mesaVis,
-                                   __DRIcontextPrivate * driContextPriv,
+                                   __DRIcontext * driContextPriv,
                                    void *sharedContextPrivate);
 
 
diff --git a/src/mesa/drivers/dri/i915/i915_debug.c b/src/mesa/drivers/dri/i915/i915_debug.c
index f7bb7ea44c..fecfac3033 100644
--- a/src/mesa/drivers/dri/i915/i915_debug.c
+++ b/src/mesa/drivers/dri/i915/i915_debug.c
@@ -806,6 +806,7 @@ static GLboolean i915_debug_packet( struct debug_stream *stream )
       default:
 	 return debug(stream, "", 0);
       }
+      break;
    default:
       assert(0);
       return 0;
diff --git a/src/mesa/drivers/dri/i915/i915_fragprog.c b/src/mesa/drivers/dri/i915/i915_fragprog.c
index d9c61446f5..a273bd28ea 100644
--- a/src/mesa/drivers/dri/i915/i915_fragprog.c
+++ b/src/mesa/drivers/dri/i915/i915_fragprog.c
@@ -663,7 +663,7 @@ upload_program(struct i915_fragment_program *p)
 			 A0_MOV,
 			 get_result_vector(p, inst),
 			 get_result_flags(inst), 0,
-			 swizzle(src0, ZERO, ZERO, ZERO, ZERO), 0, 0);
+			 swizzle(tmp, ZERO, ZERO, ZERO, ZERO), 0, 0);
 
       case OPCODE_POW:
          src0 = src_vector(p, &inst->SrcReg[0], program);
@@ -1301,7 +1301,7 @@ i915ValidateFragmentProgram(struct i915_context *i915)
 
    for (i = 0; i < p->ctx->Const.MaxTextureCoordUnits; i++) {
       if (inputsRead & FRAG_BIT_TEX(i)) {
-         int sz = VB->TexCoordPtr[i]->size;
+         int sz = VB->AttribPtr[_TNL_ATTRIB_TEX0 + i]->size;
 
          s2 &= ~S2_TEXCOORD_FMT(i, S2_TEXCOORD_FMT0_MASK);
          s2 |= S2_TEXCOORD_FMT(i, SZ_TO_HW(sz));
diff --git a/src/mesa/drivers/dri/i915/i915_program.c b/src/mesa/drivers/dri/i915/i915_program.c
index e7908bd48f..3902c69097 100644
--- a/src/mesa/drivers/dri/i915/i915_program.c
+++ b/src/mesa/drivers/dri/i915/i915_program.c
@@ -245,7 +245,7 @@ GLuint i915_emit_texld( struct i915_fragment_program *p,
    }
    else {
       assert(GET_UREG_TYPE(dest) != REG_TYPE_CONST);
-      assert(dest = UREG(GET_UREG_TYPE(dest), GET_UREG_NR(dest)));
+      assert(dest == UREG(GET_UREG_TYPE(dest), GET_UREG_NR(dest)));
       /* Can't use unsaved temps for coords, as the phase boundary would result
        * in the contents becoming undefined.
        */
diff --git a/src/mesa/drivers/dri/i915/i915_state.c b/src/mesa/drivers/dri/i915/i915_state.c
index b60efea75b..9d7a9e1dfe 100644
--- a/src/mesa/drivers/dri/i915/i915_state.c
+++ b/src/mesa/drivers/dri/i915/i915_state.c
@@ -571,7 +571,7 @@ i915LineWidth(GLcontext * ctx, GLfloat widthf)
    DBG("%s\n", __FUNCTION__);
    
    width = (int) (widthf * 2);
-   CLAMP_SELF(width, 1, 0xf);
+   width = CLAMP(width, 1, 0xf);
    lis4 |= width << S4_LINE_WIDTH_SHIFT;
 
    if (lis4 != i915->state.Ctx[I915_CTXREG_LIS4]) {
@@ -585,11 +585,11 @@ i915PointSize(GLcontext * ctx, GLfloat size)
 {
    struct i915_context *i915 = I915_CONTEXT(ctx);
    int lis4 = i915->state.Ctx[I915_CTXREG_LIS4] & ~S4_POINT_WIDTH_MASK;
-   GLint point_size = (int) size;
+   GLint point_size = (int) round(size);
 
    DBG("%s\n", __FUNCTION__);
    
-   CLAMP_SELF(point_size, 1, 255);
+   point_size = CLAMP(point_size, 1, 255);
    lis4 |= point_size << S4_POINT_WIDTH_SHIFT;
 
    if (lis4 != i915->state.Ctx[I915_CTXREG_LIS4]) {
@@ -599,6 +599,24 @@ i915PointSize(GLcontext * ctx, GLfloat size)
 }
 
 
+static void
+i915PointParameterfv(GLcontext * ctx, GLenum pname, const GLfloat *params)
+{
+   struct i915_context *i915 = I915_CONTEXT(ctx);
+
+   switch (pname) {
+   case GL_POINT_SPRITE_COORD_ORIGIN:
+      /* This could be supported, but it would require modifying the fragment
+       * program to invert the y component of the texture coordinate by
+       * inserting a 'SUB tc.y, {1.0}.xxxx, tc' instruction.
+       */
+      FALLBACK(&i915->intel, I915_FALLBACK_POINT_SPRITE_COORD_ORIGIN,
+	       (params[0] != GL_UPPER_LEFT));
+      break;
+   }
+}
+
+
 /* =============================================================
  * Color masks
  */
@@ -939,6 +957,17 @@ i915Enable(GLcontext * ctx, GLenum cap, GLboolean state)
    case GL_POLYGON_SMOOTH:
       break;
 
+   case GL_POINT_SPRITE:
+      /* This state change is handled in i915_reduced_primitive_state because
+       * the hardware bit should only be set when rendering points.
+       */
+      I915_STATECHANGE(i915, I915_UPLOAD_CTX);
+      if (state)
+	 i915->state.Ctx[I915_CTXREG_LIS4] |= S4_SPRITE_POINT_ENABLE;
+      else
+	 i915->state.Ctx[I915_CTXREG_LIS4] &= ~S4_SPRITE_POINT_ENABLE;
+      break;
+
    case GL_POINT_SMOOTH:
       break;
 
@@ -1108,6 +1137,7 @@ i915InitStateFunctions(struct dd_function_table *functions)
    functions->LineWidth = i915LineWidth;
    functions->LogicOpcode = i915LogicOp;
    functions->PointSize = i915PointSize;
+   functions->PointParameterfv = i915PointParameterfv;
    functions->PolygonStipple = i915PolygonStipple;
    functions->Scissor = i915Scissor;
    functions->ShadeModel = i915ShadeModel;
diff --git a/src/mesa/drivers/dri/i915/i915_texstate.c b/src/mesa/drivers/dri/i915/i915_texstate.c
index d6689af53f..3ee4c8653a 100644
--- a/src/mesa/drivers/dri/i915/i915_texstate.c
+++ b/src/mesa/drivers/dri/i915/i915_texstate.c
@@ -28,6 +28,7 @@
 #include "main/mtypes.h"
 #include "main/enums.h"
 #include "main/macros.h"
+#include "main/colormac.h"
 
 #include "intel_mipmap_tree.h"
 #include "intel_tex.h"
@@ -56,10 +57,7 @@ translate_texture_format(gl_format mesa_format, GLuint internal_format,
    case MESA_FORMAT_ARGB4444:
       return MAPSURF_16BIT | MT_16BIT_ARGB4444;
    case MESA_FORMAT_ARGB8888:
-      if (internal_format == GL_RGB)
-	 return MAPSURF_32BIT | MT_32BIT_XRGB8888;
-      else
-	 return MAPSURF_32BIT | MT_32BIT_ARGB8888;
+      return MAPSURF_32BIT | MT_32BIT_ARGB8888;
    case MESA_FORMAT_XRGB8888:
       return MAPSURF_32BIT | MT_32BIT_XRGB8888;
    case MESA_FORMAT_YCBCR_REV:
@@ -141,6 +139,7 @@ i915_update_tex_unit(struct intel_context *intel, GLuint unit, GLuint ss3)
    GLuint *state = i915->state.Tex[unit], format, pitch;
    GLint lodbias, aniso = 0;
    GLubyte border[4];
+   GLfloat maxlod;
 
    memset(state, 0, sizeof(state));
 
@@ -178,18 +177,9 @@ i915_update_tex_unit(struct intel_context *intel, GLuint unit, GLuint ss3)
 
       pitch = intelObj->pitchOverride;
    } else {
-      GLuint dst_x, dst_y;
-
-      intel_miptree_get_image_offset(intelObj->mt, intelObj->firstLevel, 0, 0,
-				     &dst_x, &dst_y);
-
       dri_bo_reference(intelObj->mt->region->buffer);
       i915->state.tex_buffer[unit] = intelObj->mt->region->buffer;
-      /* XXX: This calculation is probably broken for tiled images with
-       * a non-page-aligned offset.
-       */
-      i915->state.tex_offset[unit] = (dst_x + dst_y * intelObj->mt->pitch) *
-	 intelObj->mt->cpp;
+      i915->state.tex_offset[unit] = 0; /* Always the origin of the miptree */
 
       format = translate_texture_format(firstImage->TexFormat,
 					firstImage->InternalFormat,
@@ -207,10 +197,15 @@ i915_update_tex_unit(struct intel_context *intel, GLuint unit, GLuint ss3)
 	 state[I915_TEXREG_MS3] |= MS3_TILE_WALK;
    }
 
+   /* We get one field with fraction bits for the maximum addressable
+    * (lowest resolution) LOD.  Use it to cover both MAX_LEVEL and
+    * MAX_LOD.
+    */
+   maxlod = MIN2(tObj->MaxLod, tObj->_MaxLevel - tObj->BaseLevel);
    state[I915_TEXREG_MS4] =
       ((((pitch / 4) - 1) << MS4_PITCH_SHIFT) |
        MS4_CUBE_FACE_ENA_MASK |
-       (U_FIXED(CLAMP(tObj->MaxLod, 0.0, 11.0), 2) << MS4_MAX_LOD_SHIFT) |
+       (U_FIXED(CLAMP(maxlod, 0.0, 11.0), 2) << MS4_MAX_LOD_SHIFT) |
        ((firstImage->Depth - 1) << MS4_VOLUME_DEPTH_SHIFT));
 
 
@@ -353,25 +348,25 @@ i915_update_tex_unit(struct intel_context *intel, GLuint unit, GLuint ss3)
    }
 
    /* convert border color from float to ubyte */
-   CLAMPED_FLOAT_TO_UBYTE(border[0], tObj->BorderColor[0]);
-   CLAMPED_FLOAT_TO_UBYTE(border[1], tObj->BorderColor[1]);
-   CLAMPED_FLOAT_TO_UBYTE(border[2], tObj->BorderColor[2]);
-   CLAMPED_FLOAT_TO_UBYTE(border[3], tObj->BorderColor[3]);
+   CLAMPED_FLOAT_TO_UBYTE(border[0], tObj->BorderColor.f[0]);
+   CLAMPED_FLOAT_TO_UBYTE(border[1], tObj->BorderColor.f[1]);
+   CLAMPED_FLOAT_TO_UBYTE(border[2], tObj->BorderColor.f[2]);
+   CLAMPED_FLOAT_TO_UBYTE(border[3], tObj->BorderColor.f[3]);
 
    if (firstImage->_BaseFormat == GL_DEPTH_COMPONENT) {
       /* GL specs that border color for depth textures is taken from the
        * R channel, while the hardware uses A.  Spam R into all the channels
        * for safety.
        */
-      state[I915_TEXREG_SS4] = INTEL_PACKCOLOR8888(border[0],
-						   border[0],
-						   border[0],
-						   border[0]);
+      state[I915_TEXREG_SS4] = PACK_COLOR_8888(border[0],
+					       border[0],
+					       border[0],
+					       border[0]);
    } else {
-      state[I915_TEXREG_SS4] = INTEL_PACKCOLOR8888(border[0],
-						   border[1],
-						   border[2],
-						   border[3]);
+      state[I915_TEXREG_SS4] = PACK_COLOR_8888(border[3],
+					       border[0],
+					       border[1],
+					       border[2]);
    }
 
 
diff --git a/src/mesa/drivers/dri/i915/i915_vtbl.c b/src/mesa/drivers/dri/i915/i915_vtbl.c
index 3c1b2dd0b0..266e6848c3 100644
--- a/src/mesa/drivers/dri/i915/i915_vtbl.c
+++ b/src/mesa/drivers/dri/i915/i915_vtbl.c
@@ -174,7 +174,7 @@ i915_emit_invarient_state(struct intel_context *intel)
 {
    BATCH_LOCALS;
 
-   BEGIN_BATCH(17, IGNORE_CLIPRECTS);
+   BEGIN_BATCH(17);
 
    OUT_BATCH(_3DSTATE_AA_CMD |
              AA_LINE_ECAAR_WIDTH_ENABLE |
@@ -220,7 +220,7 @@ i915_emit_invarient_state(struct intel_context *intel)
 
 
 #define emit(intel, state, size )		     \
-   intel_batchbuffer_data(intel->batch, state, size, IGNORE_CLIPRECTS )
+   intel_batchbuffer_data(intel->batch, state, size)
 
 static GLuint
 get_dirty(struct i915_hw_state *state)
@@ -301,13 +301,9 @@ i915_emit_state(struct intel_context *intel)
     * It might be better to talk about explicit places where
     * scheduling is allowed, rather than assume that it is whenever a
     * batchbuffer fills up.
-    *
-    * Set the space as LOOP_CLIPRECTS now, since that's what our primitives
-    * will be emitted under.
     */
    intel_batchbuffer_require_space(intel->batch,
-				   get_state_size(state) + INTEL_PRIM_EMIT_SIZE,
-				   LOOP_CLIPRECTS);
+				   get_state_size(state) + INTEL_PRIM_EMIT_SIZE);
    count = 0;
  again:
    aper_count = 0;
@@ -373,7 +369,7 @@ i915_emit_state(struct intel_context *intel)
    }
 
    if (dirty & I915_UPLOAD_BUFFERS) {
-      GLuint count = 9;
+      GLuint count = 15;
 
       if (INTEL_DEBUG & DEBUG_STATE)
          fprintf(stderr, "I915_UPLOAD_BUFFERS:\n");
@@ -381,10 +377,7 @@ i915_emit_state(struct intel_context *intel)
       if (state->depth_region)
           count += 3;
 
-      if (intel->constant_cliprect)
-          count += 6;
-
-      BEGIN_BATCH(count, IGNORE_CLIPRECTS);
+      BEGIN_BATCH(count);
       OUT_BATCH(state->Buffer[I915_DESTREG_CBUFADDR0]);
       OUT_BATCH(state->Buffer[I915_DESTREG_CBUFADDR1]);
       OUT_RELOC(state->draw_region->buffer,
@@ -406,15 +399,13 @@ i915_emit_state(struct intel_context *intel)
       OUT_BATCH(state->Buffer[I915_DESTREG_SR1]);
       OUT_BATCH(state->Buffer[I915_DESTREG_SR2]);
 
-      if (intel->constant_cliprect) {
-	 assert(state->Buffer[I915_DESTREG_DRAWRECT0] != MI_NOOP);
-	 OUT_BATCH(state->Buffer[I915_DESTREG_DRAWRECT0]);
-	 OUT_BATCH(state->Buffer[I915_DESTREG_DRAWRECT1]);
-	 OUT_BATCH(state->Buffer[I915_DESTREG_DRAWRECT2]);
-	 OUT_BATCH(state->Buffer[I915_DESTREG_DRAWRECT3]);
-	 OUT_BATCH(state->Buffer[I915_DESTREG_DRAWRECT4]);
-	 OUT_BATCH(state->Buffer[I915_DESTREG_DRAWRECT5]);
-      }
+      assert(state->Buffer[I915_DESTREG_DRAWRECT0] != MI_NOOP);
+      OUT_BATCH(state->Buffer[I915_DESTREG_DRAWRECT0]);
+      OUT_BATCH(state->Buffer[I915_DESTREG_DRAWRECT1]);
+      OUT_BATCH(state->Buffer[I915_DESTREG_DRAWRECT2]);
+      OUT_BATCH(state->Buffer[I915_DESTREG_DRAWRECT3]);
+      OUT_BATCH(state->Buffer[I915_DESTREG_DRAWRECT4]);
+      OUT_BATCH(state->Buffer[I915_DESTREG_DRAWRECT5]);
 
       ADVANCE_BATCH();
    }
@@ -441,7 +432,7 @@ i915_emit_state(struct intel_context *intel)
          if (dirty & I915_UPLOAD_TEX(i))
             nr++;
 
-      BEGIN_BATCH(2 + nr * 3, IGNORE_CLIPRECTS);
+      BEGIN_BATCH(2 + nr * 3);
       OUT_BATCH(_3DSTATE_MAP_STATE | (3 * nr));
       OUT_BATCH((dirty & I915_UPLOAD_TEX_ALL) >> I915_UPLOAD_TEX_0_SHIFT);
       for (i = 0; i < I915_TEX_UNITS; i++)
@@ -465,7 +456,7 @@ i915_emit_state(struct intel_context *intel)
          }
       ADVANCE_BATCH();
 
-      BEGIN_BATCH(2 + nr * 3, IGNORE_CLIPRECTS);
+      BEGIN_BATCH(2 + nr * 3);
       OUT_BATCH(_3DSTATE_SAMPLER_STATE | (3 * nr));
       OUT_BATCH((dirty & I915_UPLOAD_TEX_ALL) >> I915_UPLOAD_TEX_0_SHIFT);
       for (i = 0; i < I915_TEX_UNITS; i++)
@@ -587,7 +578,7 @@ i915_state_draw_region(struct intel_context *intel,
             DSTORG_VERT_BIAS(0x8) |     /* .5 */
             LOD_PRECLAMP_OGL | TEX_DEFAULT_COLOR_OGL);
    if (irb != NULL) {
-      switch (irb->texformat) {
+      switch (irb->Base.Format) {
       case MESA_FORMAT_ARGB8888:
       case MESA_FORMAT_XRGB8888:
 	 value |= DV_PF_8888;
@@ -603,7 +594,7 @@ i915_state_draw_region(struct intel_context *intel,
 	 break;
       default:
 	 _mesa_problem(ctx, "Bad renderbuffer format: %d\n",
-		       irb->texformat);
+		       irb->Base.Format);
       }
    }
 
@@ -611,7 +602,7 @@ i915_state_draw_region(struct intel_context *intel,
     * the value of this bit, the pipeline needs to be MI_FLUSHed.  And it
     * can only be set when a depth buffer is already defined.
     */
-   if (IS_945(intel->intelScreen->deviceID) && intel->use_early_z &&
+   if (intel->is_945 && intel->use_early_z &&
        depth_region->tiling != I915_TILING_NONE)
       value |= CLASSIC_EARLY_DEPTH;
 
@@ -623,23 +614,14 @@ i915_state_draw_region(struct intel_context *intel,
    }
    state->Buffer[I915_DESTREG_DV1] = value;
 
-   if (intel->constant_cliprect) {
-      state->Buffer[I915_DESTREG_DRAWRECT0] = _3DSTATE_DRAWRECT_INFO;
-      state->Buffer[I915_DESTREG_DRAWRECT1] = 0;
-      state->Buffer[I915_DESTREG_DRAWRECT2] = 0; /* xmin, ymin */
-      state->Buffer[I915_DESTREG_DRAWRECT3] =
-	 (ctx->DrawBuffer->Width & 0xffff) |
-	 (ctx->DrawBuffer->Height << 16);
-      state->Buffer[I915_DESTREG_DRAWRECT4] = 0; /* xoff, yoff */
-      state->Buffer[I915_DESTREG_DRAWRECT5] = 0;
-   } else {
-      state->Buffer[I915_DESTREG_DRAWRECT0] = MI_NOOP;
-      state->Buffer[I915_DESTREG_DRAWRECT1] = MI_NOOP;
-      state->Buffer[I915_DESTREG_DRAWRECT2] = MI_NOOP;
-      state->Buffer[I915_DESTREG_DRAWRECT3] = MI_NOOP;
-      state->Buffer[I915_DESTREG_DRAWRECT4] = MI_NOOP;
-      state->Buffer[I915_DESTREG_DRAWRECT5] = MI_NOOP;
-   }
+   state->Buffer[I915_DESTREG_DRAWRECT0] = _3DSTATE_DRAWRECT_INFO;
+   state->Buffer[I915_DESTREG_DRAWRECT1] = 0;
+   state->Buffer[I915_DESTREG_DRAWRECT2] = 0; /* xmin, ymin */
+   state->Buffer[I915_DESTREG_DRAWRECT3] =
+      (ctx->DrawBuffer->Width & 0xffff) |
+      (ctx->DrawBuffer->Height << 16);
+   state->Buffer[I915_DESTREG_DRAWRECT4] = 0; /* xoff, yoff */
+   state->Buffer[I915_DESTREG_DRAWRECT5] = 0;
 
    I915_STATECHANGE(i915, I915_UPLOAD_BUFFERS);
 }
@@ -667,15 +649,6 @@ i915_new_batch(struct intel_context *intel)
     * difficulties associated with them (physical address requirements).
     */
    i915->state.emitted = 0;
-
-   /* Check that we didn't just wrap our batchbuffer at a bad time. */
-   assert(!intel->no_batch_wrap);
-}
-
-static GLuint
-i915_flush_cmd(void)
-{
-   return MI_FLUSH | FLUSH_MAP_CACHE;
 }
 
 static void 
@@ -699,7 +672,6 @@ i915InitVtbl(struct i915_context *i915)
    i915->intel.vtbl.render_prevalidate = i915_render_prevalidate;
    i915->intel.vtbl.set_draw_region = i915_set_draw_region;
    i915->intel.vtbl.update_texture_state = i915UpdateTextureState;
-   i915->intel.vtbl.flush_cmd = i915_flush_cmd;
    i915->intel.vtbl.assert_not_dirty = i915_assert_not_dirty;
    i915->intel.vtbl.finish_batch = intel_finish_vb;
 }
diff --git a/src/mesa/drivers/dri/i915/intel_render.c b/src/mesa/drivers/dri/i915/intel_render.c
index 410052b3c2..ec209391ab 100644
--- a/src/mesa/drivers/dri/i915/intel_render.c
+++ b/src/mesa/drivers/dri/i915/intel_render.c
@@ -117,7 +117,7 @@ intelDmaPrimitive(struct intel_context *intel, GLenum prim)
    intel_set_prim(intel, hw_prim[prim]);
 }
 
-static inline GLuint intel_get_vb_max(struct intel_context *intel)
+static INLINE GLuint intel_get_vb_max(struct intel_context *intel)
 {
    GLuint ret;
 
@@ -129,7 +129,7 @@ static inline GLuint intel_get_vb_max(struct intel_context *intel)
    return ret;
 }
 
-static inline GLuint intel_get_current_max(struct intel_context *intel)
+static INLINE GLuint intel_get_current_max(struct intel_context *intel)
 {
 
    if (intel->intelScreen->no_vbo)
diff --git a/src/mesa/drivers/dri/i915/intel_swapbuffers.c b/src/mesa/drivers/dri/i915/intel_swapbuffers.c
deleted file mode 120000
index 148d5215aa..0000000000
--- a/src/mesa/drivers/dri/i915/intel_swapbuffers.c
+++ /dev/null
@@ -1 +0,0 @@
-../intel/intel_swapbuffers.c
-\ No newline at end of file
diff --git a/src/mesa/drivers/dri/i915/intel_tris.c b/src/mesa/drivers/dri/i915/intel_tris.c
index bc527aae47..e99baf8e0e 100644
--- a/src/mesa/drivers/dri/i915/intel_tris.c
+++ b/src/mesa/drivers/dri/i915/intel_tris.c
@@ -89,7 +89,6 @@ intel_flush_inline_primitive(struct intel_context *intel)
 
 static void intel_start_inline(struct intel_context *intel, uint32_t prim)
 {
-   uint32_t batch_flags = LOOP_CLIPRECTS;
    BATCH_LOCALS;
 
    intel->vtbl.emit_state(intel);
@@ -101,7 +100,7 @@ static void intel_start_inline(struct intel_context *intel, uint32_t prim)
    /* Emit a slot which will be filled with the inline primitive
     * command later.
     */
-   BEGIN_BATCH(2, batch_flags);
+   BEGIN_BATCH(2);
    OUT_BATCH(0);
 
    assert((intel->batch->dirty_state & (1<<1)) == 0);
@@ -221,7 +220,7 @@ void intel_flush_prim(struct intel_context *intel)
    intel->prim.count = 0;
    offset = intel->prim.start_offset;
    intel->prim.start_offset = intel->prim.current_offset;
-   if (!IS_9XX(intel->intelScreen->deviceID))
+   if (intel->gen < 3)
       intel->prim.start_offset = ALIGN(intel->prim.start_offset, 128);
    intel->prim.flush = NULL;
 
@@ -251,8 +250,8 @@ void intel_flush_prim(struct intel_context *intel)
 	  intel->vertex_size * 4);
 #endif
 
-   if (IS_9XX(intel->intelScreen->deviceID)) {
-      BEGIN_BATCH(5, LOOP_CLIPRECTS);
+   if (intel->gen >= 3) {
+      BEGIN_BATCH(5);
       OUT_BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 |
 		I1_LOAD_S(0) | I1_LOAD_S(1) | 1);
       assert((offset & !S0_VB_OFFSET_MASK) == 0);
@@ -270,7 +269,7 @@ void intel_flush_prim(struct intel_context *intel)
    } else {
       struct i830_context *i830 = i830_context(&intel->ctx);
 
-      BEGIN_BATCH(5, LOOP_CLIPRECTS);
+      BEGIN_BATCH(5);
       OUT_BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 |
 		I1_LOAD_S(0) | I1_LOAD_S(2) | 1);
       /* S0 */
@@ -1250,81 +1249,6 @@ union fi
    GLint i;
 };
 
-
-/**********************************************************************/
-/*             Used only with the metaops callbacks.                  */
-/**********************************************************************/
-static void
-intel_meta_draw_poly(struct intel_context *intel,
-                     GLuint n,
-                     GLfloat xy[][2],
-                     GLfloat z, GLuint color, GLfloat tex[][2])
-{
-   union fi *vb;
-   GLint i;
-   unsigned int saved_vertex_size = intel->vertex_size;
-
-   LOCK_HARDWARE(intel);
-
-   intel->vertex_size = 6;
-
-   /* All 3d primitives should be emitted with LOOP_CLIPRECTS,
-    * otherwise the drawing origin (DR4) might not be set correctly.
-    */
-   intel_set_prim(intel, PRIM3D_TRIFAN);
-   vb = (union fi *) intel_get_prim_space(intel, n);
-
-   for (i = 0; i < n; i++) {
-      vb[0].f = xy[i][0];
-      vb[1].f = xy[i][1];
-      vb[2].f = z;
-      vb[3].i = color;
-      vb[4].f = tex[i][0];
-      vb[5].f = tex[i][1];
-      vb += 6;
-   }
-
-   INTEL_FIREVERTICES(intel);
-
-   intel->vertex_size = saved_vertex_size;
-
-   UNLOCK_HARDWARE(intel);
-}
-
-static void
-intel_meta_draw_quad(struct intel_context *intel,
-                     GLfloat x0, GLfloat x1,
-                     GLfloat y0, GLfloat y1,
-                     GLfloat z,
-                     GLuint color,
-                     GLfloat s0, GLfloat s1, GLfloat t0, GLfloat t1)
-{
-   GLfloat xy[4][2];
-   GLfloat tex[4][2];
-
-   xy[0][0] = x0;
-   xy[0][1] = y0;
-   xy[1][0] = x1;
-   xy[1][1] = y0;
-   xy[2][0] = x1;
-   xy[2][1] = y1;
-   xy[3][0] = x0;
-   xy[3][1] = y1;
-
-   tex[0][0] = s0;
-   tex[0][1] = t0;
-   tex[1][0] = s1;
-   tex[1][1] = t0;
-   tex[2][0] = s1;
-   tex[2][1] = t1;
-   tex[3][0] = s0;
-   tex[3][1] = t1;
-
-   intel_meta_draw_poly(intel, 4, xy, z, color, tex);
-}
-
-
-
 /**********************************************************************/
 /*                            Initialization.                         */
 /**********************************************************************/
@@ -1333,7 +1257,6 @@ intel_meta_draw_quad(struct intel_context *intel,
 void
 intelInitTriFuncs(GLcontext * ctx)
 {
-   struct intel_context *intel = intel_context(ctx);
    TNLcontext *tnl = TNL_CONTEXT(ctx);
    static int firsttime = 1;
 
@@ -1350,6 +1273,4 @@ intelInitTriFuncs(GLcontext * ctx)
    tnl->Driver.Render.BuildVertices = _tnl_build_vertices;
    tnl->Driver.Render.CopyPV = _tnl_copy_pv;
    tnl->Driver.Render.Interp = _tnl_interp;
-
-   intel->vtbl.meta_draw_quad = intel_meta_draw_quad;
 }
diff --git a/src/mesa/drivers/dri/i965/Makefile b/src/mesa/drivers/dri/i965/Makefile
index 7a55333e89..7758a792fd 100644
--- a/src/mesa/drivers/dri/i965/Makefile
+++ b/src/mesa/drivers/dri/i965/Makefile
@@ -24,7 +24,6 @@ DRIVER_SOURCES = \
 	intel_pixel_draw.c \
 	intel_pixel_read.c \
 	intel_state.c \
-	intel_swapbuffers.c \
 	intel_syncobj.c \
 	intel_tex.c \
 	intel_tex_copy.c \
@@ -96,7 +95,8 @@ ASM_SOURCES =
 
 DRIVER_DEFINES = -I../intel -I../intel/server
 
-DRI_LIB_DEPS += -ldrm_intel
+INCLUDES += $(INTEL_CFLAGS)
+DRI_LIB_DEPS += $(INTEL_LIBS)
 
 include ../Makefile.template
 
diff --git a/src/mesa/drivers/dri/i965/brw_cc.c b/src/mesa/drivers/dri/i965/brw_cc.c
index 5cca605c3f..bac1c3a49c 100644
--- a/src/mesa/drivers/dri/i965/brw_cc.c
+++ b/src/mesa/drivers/dri/i965/brw_cc.c
@@ -34,6 +34,7 @@
 #include "brw_state.h"
 #include "brw_defines.h"
 #include "brw_util.h"
+#include "intel_fbo.h"
 #include "main/macros.h"
 #include "main/enums.h"
 
@@ -55,7 +56,8 @@ static void prepare_cc_vp( struct brw_context *brw )
    }
 
    dri_bo_unreference(brw->cc.vp_bo);
-   brw->cc.vp_bo = brw_cache_data( &brw->cache, BRW_CC_VP, &ccv, NULL, 0 );
+   brw->cc.vp_bo = brw_cache_data(&brw->cache, BRW_CC_VP, &ccv, sizeof(ccv),
+				  NULL, 0);
 }
 
 const struct brw_tracked_state brw_cc_vp = {
@@ -88,6 +90,28 @@ struct brw_cc_unit_key {
    GLenum depth_func;
 };
 
+/**
+ * Modify blend function to force destination alpha to 1.0
+ *
+ * If \c function specifies a blend function that uses destination alpha,
+ * replace it with a function that hard-wires destination alpha to 1.0.  This
+ * is used when rendering to xRGB targets.
+ */
+static GLenum
+fix_xRGB_alpha(GLenum function)
+{
+   switch (function) {
+   case GL_DST_ALPHA:
+      return GL_ONE;
+
+   case GL_ONE_MINUS_DST_ALPHA:
+   case GL_SRC_ALPHA_SATURATE:
+      return GL_ZERO;
+   }
+
+   return function;
+}
+
 static void
 cc_unit_populate_key(struct brw_context *brw, struct brw_cc_unit_key *key)
 {
@@ -131,6 +155,17 @@ cc_unit_populate_key(struct brw_context *brw, struct brw_cc_unit_key *key)
       key->blend_dst_rgb = ctx->Color.BlendDstRGB;
       key->blend_src_a = ctx->Color.BlendSrcA;
       key->blend_dst_a = ctx->Color.BlendDstA;
+
+      /* If the renderbuffer is XRGB, we have to frob the blend function to
+       * force the destination alpha to 1.0.  This means replacing GL_DST_ALPHA
+       * with GL_ONE and GL_ONE_MINUS_DST_ALPHA with GL_ZERO.
+       */
+      if (ctx->DrawBuffer->Visual.alphaBits == 0) {
+	 key->blend_src_rgb = fix_xRGB_alpha(key->blend_src_rgb);
+	 key->blend_src_a   = fix_xRGB_alpha(key->blend_src_a);
+	 key->blend_dst_rgb = fix_xRGB_alpha(key->blend_dst_rgb);
+	 key->blend_dst_a   = fix_xRGB_alpha(key->blend_dst_a);
+      }
    }
 
    key->alpha_enabled = ctx->Color.AlphaEnabled;
diff --git a/src/mesa/drivers/dri/i965/brw_clip.c b/src/mesa/drivers/dri/i965/brw_clip.c
index 20a927cf38..af1d975de9 100644
--- a/src/mesa/drivers/dri/i965/brw_clip.c
+++ b/src/mesa/drivers/dri/i965/brw_clip.c
@@ -50,6 +50,7 @@
 static void compile_clip_prog( struct brw_context *brw,
 			     struct brw_clip_prog_key *key )
 {
+   struct intel_context *intel = &brw->intel;
    struct brw_clip_compile c;
    const GLuint *program;
    GLuint program_size;
@@ -65,27 +66,26 @@ static void compile_clip_prog( struct brw_context *brw,
    c.func.single_program_flow = 1;
 
    c.key = *key;
-   c.need_ff_sync = BRW_IS_IGDNG(brw);
 
    /* Need to locate the two positions present in vertex + header.
     * These are currently hardcoded:
     */
    c.header_position_offset = ATTR_SIZE;
 
-   if (BRW_IS_IGDNG(brw))
+   if (intel->is_ironlake)
        delta = 3 * REG_SIZE;
    else
        delta = REG_SIZE;
 
    for (i = 0; i < VERT_RESULT_MAX; i++)
-      if (c.key.attrs & (1<<i)) {
+      if (c.key.attrs & BITFIELD64_BIT(i)) {
 	 c.offset[i] = delta;
 	 delta += ATTR_SIZE;
       }
 
    c.nr_attrs = brw_count_bits(c.key.attrs);
    
-   if (BRW_IS_IGDNG(brw))
+   if (intel->is_ironlake)
        c.nr_regs = (c.nr_attrs + 1) / 2 + 3;  /* are vertices packed, or reg-aligned? */
    else
        c.nr_regs = (c.nr_attrs + 1) / 2 + 1;  /* are vertices packed, or reg-aligned? */
@@ -143,7 +143,8 @@ static void compile_clip_prog( struct brw_context *brw,
  */
 static void upload_clip_prog(struct brw_context *brw)
 {
-   GLcontext *ctx = &brw->intel.ctx;
+   struct intel_context *intel = &brw->intel;
+   GLcontext *ctx = &intel->ctx;
    struct brw_clip_prog_key key;
 
    memset(&key, 0, sizeof(key));
@@ -156,10 +157,11 @@ static void upload_clip_prog(struct brw_context *brw)
    key.attrs = brw->vs.prog_data->outputs_written;
    /* _NEW_LIGHT */
    key.do_flat_shading = (ctx->Light.ShadeModel == GL_FLAT);
+   key.pv_first = (ctx->Light.ProvokingVertex == GL_FIRST_VERTEX_CONVENTION);
    /* _NEW_TRANSFORM */
    key.nr_userclip = brw_count_bits(ctx->Transform.ClipPlanesEnabled);
 
-   if (BRW_IS_IGDNG(brw))
+   if (intel->is_ironlake)
        key.clip_mode = BRW_CLIPMODE_KERNEL_CLIP;
    else
        key.clip_mode = BRW_CLIPMODE_NORMAL;
diff --git a/src/mesa/drivers/dri/i965/brw_clip.h b/src/mesa/drivers/dri/i965/brw_clip.h
index 957df441ab..d71bac7f61 100644
--- a/src/mesa/drivers/dri/i965/brw_clip.h
+++ b/src/mesa/drivers/dri/i965/brw_clip.h
@@ -42,22 +42,21 @@
  * up polygon offset and flatshading at this point:
  */
 struct brw_clip_prog_key {
-   GLuint attrs:32;		
+   GLbitfield64 attrs;
    GLuint primitive:4;
    GLuint nr_userclip:3;
    GLuint do_flat_shading:1;
+   GLuint pv_first:1;
    GLuint do_unfilled:1;
    GLuint fill_cw:2;		/* includes cull information */
    GLuint fill_ccw:2;		/* includes cull information */
    GLuint offset_cw:1;
    GLuint offset_ccw:1;
-   GLuint pad0:17;
-
    GLuint copy_bfc_cw:1;
    GLuint copy_bfc_ccw:1;
    GLuint clip_mode:3;
-   GLuint pad1:27;
-   
+   GLuint pad0:11;
+
    GLfloat offset_factor;
    GLfloat offset_units;
 };
@@ -119,7 +118,6 @@ struct brw_clip_compile {
 
    GLuint header_position_offset;
    GLuint offset[VERT_ATTRIB_MAX];
-   GLboolean need_ff_sync;
 };
 
 #define ATTR_SIZE  (4*4)
diff --git a/src/mesa/drivers/dri/i965/brw_clip_line.c b/src/mesa/drivers/dri/i965/brw_clip_line.c
index 048ca620fa..afc0b11049 100644
--- a/src/mesa/drivers/dri/i965/brw_clip_line.c
+++ b/src/mesa/drivers/dri/i965/brw_clip_line.c
@@ -46,6 +46,7 @@
 
 static void brw_clip_line_alloc_regs( struct brw_clip_compile *c )
 {
+   struct intel_context *intel = &c->func.brw->intel;
    GLuint i = 0,j;
 
    /* Register usage is static, precompute here:
@@ -85,7 +86,7 @@ static void brw_clip_line_alloc_regs( struct brw_clip_compile *c )
       i++;
    }
 
-   if (c->need_ff_sync) {
+   if (intel->needs_ff_sync) {
       c->reg.ff_sync = retype(brw_vec1_grf(i, 0), BRW_REGISTER_TYPE_UD);
       i++;
    }
@@ -126,6 +127,7 @@ static void brw_clip_line_alloc_regs( struct brw_clip_compile *c )
 static void clip_and_emit_line( struct brw_clip_compile *c )
 {
    struct brw_compile *p = &c->func;
+   struct brw_context *brw = p->brw;
    struct brw_indirect vtx0     = brw_indirect(0, 0);
    struct brw_indirect vtx1      = brw_indirect(1, 0);
    struct brw_indirect newvtx0   = brw_indirect(2, 0);
@@ -152,7 +154,7 @@ static void clip_and_emit_line( struct brw_clip_compile *c )
    brw_clip_init_clipmask(c);
 
    /* -ve rhw workaround */
-   if (BRW_IS_965(p->brw)) {
+   if (brw->has_negative_rhw_bug) {
       brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
       brw_AND(p, brw_null_reg(), get_element_ud(c->reg.R0, 2),
               brw_imm_ud(1<<20));
@@ -189,7 +191,7 @@ static void clip_and_emit_line( struct brw_clip_compile *c )
               * Both can be negative on GM965/G965 due to RHW workaround
               * if so, this object should be rejected.
               */
-             if (BRW_IS_965(p->brw)) {
+             if (brw->has_negative_rhw_bug) {
                  brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_LE, c->reg.dp0, brw_imm_f(0.0));
                  is_neg2 = brw_IF(p, BRW_EXECUTE_1);
                  {
@@ -214,7 +216,7 @@ static void clip_and_emit_line( struct brw_clip_compile *c )
 
              /* If both are positive, do nothing */
              /* Only on GM965/G965 */
-             if (BRW_IS_965(p->brw)) {
+             if (brw->has_negative_rhw_bug) {
                  brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_L, c->reg.dp0, brw_imm_f(0.0));
                  is_neg2 = brw_IF(p, BRW_EXECUTE_1);
              }
@@ -229,7 +231,7 @@ static void clip_and_emit_line( struct brw_clip_compile *c )
                  brw_set_predicate_control(p, BRW_PREDICATE_NONE);
              }
 
-             if (BRW_IS_965(p->brw)) {
+             if (brw->has_negative_rhw_bug) {
                  brw_ENDIF(p, is_neg2);
              }
          }
@@ -269,8 +271,12 @@ void brw_emit_line_clip( struct brw_clip_compile *c )
    brw_clip_line_alloc_regs(c);
    brw_clip_init_ff_sync(c);
 
-   if (c->key.do_flat_shading)
-      brw_clip_copy_colors(c, 0, 1);
+   if (c->key.do_flat_shading) {
+      if (c->key.pv_first)
+         brw_clip_copy_colors(c, 1, 0);
+      else
+         brw_clip_copy_colors(c, 0, 1);
+   }
                 
    clip_and_emit_line(c);
 }
diff --git a/src/mesa/drivers/dri/i965/brw_clip_state.c b/src/mesa/drivers/dri/i965/brw_clip_state.c
index 234b3744bf..c8f24a94e4 100644
--- a/src/mesa/drivers/dri/i965/brw_clip_state.c
+++ b/src/mesa/drivers/dri/i965/brw_clip_state.c
@@ -74,6 +74,7 @@ static dri_bo *
 clip_unit_create_from_key(struct brw_context *brw,
 			  struct brw_clip_unit_key *key)
 {
+   struct intel_context *intel = &brw->intel;
    struct brw_clip_unit_state clip;
    dri_bo *bo;
 
@@ -105,7 +106,7 @@ clip_unit_create_from_key(struct brw_context *brw,
       /* Although up to 16 concurrent Clip threads are allowed on IGDNG, 
        * only 2 threads can output VUEs at a time.
        */
-      if (BRW_IS_IGDNG(brw))
+      if (intel->is_ironlake)
          clip.thread4.max_threads = 16 - 1;        
       else
          clip.thread4.max_threads = 2 - 1;
@@ -130,7 +131,7 @@ clip_unit_create_from_key(struct brw_context *brw,
    clip.clip5.api_mode = BRW_CLIP_API_OGL;
    clip.clip5.clip_mode = key->clip_mode;
 
-   if (BRW_IS_G4X(brw))
+   if (intel->is_g4x)
       clip.clip5.negative_w_clip_test = 1;
 
    clip.clip6.clipper_viewport_state_ptr = 0;
diff --git a/src/mesa/drivers/dri/i965/brw_clip_tri.c b/src/mesa/drivers/dri/i965/brw_clip_tri.c
index 0efd77225e..cfbb8f2686 100644
--- a/src/mesa/drivers/dri/i965/brw_clip_tri.c
+++ b/src/mesa/drivers/dri/i965/brw_clip_tri.c
@@ -51,6 +51,7 @@ static void release_tmps( struct brw_clip_compile *c )
 void brw_clip_tri_alloc_regs( struct brw_clip_compile *c, 
 			      GLuint nr_verts )
 {
+   struct intel_context *intel = &c->func.brw->intel;
    GLuint i = 0,j;
 
    /* Register usage is static, precompute here:
@@ -78,7 +79,7 @@ void brw_clip_tri_alloc_regs( struct brw_clip_compile *c,
       for (j = 0; j < 3; j++) {
 	 GLuint delta = c->nr_attrs*16 + 32;
 
-         if (BRW_IS_IGDNG(c->func.brw))
+         if (intel->is_ironlake)
              delta = c->nr_attrs * 16 + 32 * 3;
 
 	 brw_MOV(&c->func, byte_offset(c->reg.vertex[j], delta), brw_imm_f(0));
@@ -119,7 +120,7 @@ void brw_clip_tri_alloc_regs( struct brw_clip_compile *c,
       i++;
    }
 
-   if (c->need_ff_sync) {
+   if (intel->needs_ff_sync) {
       c->reg.ff_sync = retype(brw_vec1_grf(i, 0), BRW_REGISTER_TYPE_UD);
       i++;
    }
@@ -188,14 +189,20 @@ void brw_clip_tri_flat_shade( struct brw_clip_compile *c )
 	   brw_imm_ud(_3DPRIM_POLYGON));
 
    is_poly = brw_IF(p, BRW_EXECUTE_1);
-   {   
+   {
       brw_clip_copy_colors(c, 1, 0);
       brw_clip_copy_colors(c, 2, 0);
    }
    is_poly = brw_ELSE(p, is_poly);
    {
-      brw_clip_copy_colors(c, 0, 2);
-      brw_clip_copy_colors(c, 1, 2);
+      if (c->key.pv_first) {
+         brw_clip_copy_colors(c, 1, 0);
+         brw_clip_copy_colors(c, 2, 0);
+      }
+      else {
+         brw_clip_copy_colors(c, 0, 2);
+         brw_clip_copy_colors(c, 1, 2);
+      }
    }
    brw_ENDIF(p, is_poly);
 }
@@ -565,6 +572,7 @@ void brw_emit_tri_clip( struct brw_clip_compile *c )
 {
    struct brw_instruction *neg_rhw;
    struct brw_compile *p = &c->func;
+   struct brw_context *brw = p->brw;
    brw_clip_tri_alloc_regs(c, 3 + c->key.nr_userclip + 6);
    brw_clip_tri_init_vertices(c);
    brw_clip_init_clipmask(c);
@@ -572,7 +580,7 @@ void brw_emit_tri_clip( struct brw_clip_compile *c )
 
    /* if -ve rhw workaround bit is set, 
       do cliptest */
-   if (BRW_IS_965(p->brw)) {
+   if (brw->has_negative_rhw_bug) {
       brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
       brw_AND(p, brw_null_reg(), get_element_ud(c->reg.R0, 2), 
               brw_imm_ud(1<<20));
diff --git a/src/mesa/drivers/dri/i965/brw_clip_util.c b/src/mesa/drivers/dri/i965/brw_clip_util.c
index 5a73abdfee..86fed59fa4 100644
--- a/src/mesa/drivers/dri/i965/brw_clip_util.c
+++ b/src/mesa/drivers/dri/i965/brw_clip_util.c
@@ -135,6 +135,7 @@ void brw_clip_interp_vertex( struct brw_clip_compile *c,
 			     GLboolean force_edgeflag)
 {
    struct brw_compile *p = &c->func;
+   struct intel_context *intel = &p->brw->intel;
    struct brw_reg tmp = get_tmp(c);
    GLuint i;
 
@@ -142,7 +143,7 @@ void brw_clip_interp_vertex( struct brw_clip_compile *c,
     */
    /*
     * After CLIP stage, only first 256 bits of the VUE are read
-    * back on IGDNG, so needn't change it
+    * back on Ironlake, so needn't change it
     */
    brw_copy_indirect_to_indirect(p, dest_ptr, v0_ptr, 1);
       
@@ -151,7 +152,7 @@ void brw_clip_interp_vertex( struct brw_clip_compile *c,
    for (i = 0; i < c->nr_attrs; i++) {
       GLuint delta = i*16 + 32;
 
-      if (BRW_IS_IGDNG(p->brw))
+      if (intel->is_ironlake)
           delta = i * 16 + 32 * 3;
 
       if (delta == c->offset[VERT_RESULT_EDGE]) {
@@ -185,7 +186,7 @@ void brw_clip_interp_vertex( struct brw_clip_compile *c,
    if (i & 1) {
       GLuint delta = i*16 + 32;
 
-      if (BRW_IS_IGDNG(p->brw))
+      if (intel->is_ironlake)
           delta = i * 16 + 32 * 3;
 
       brw_MOV(p, deref_4f(dest_ptr, delta), brw_imm_f(0));
@@ -359,7 +360,9 @@ void brw_clip_init_clipmask( struct brw_clip_compile *c )
 
 void brw_clip_ff_sync(struct brw_clip_compile *c)
 {
-    if (c->need_ff_sync) {
+    struct intel_context *intel = &c->func.brw->intel;
+
+    if (intel->needs_ff_sync) {
         struct brw_compile *p = &c->func;
         struct brw_instruction *need_ff_sync;
 
@@ -388,7 +391,9 @@ void brw_clip_ff_sync(struct brw_clip_compile *c)
 
 void brw_clip_init_ff_sync(struct brw_clip_compile *c)
 {
-    if (c->need_ff_sync) {
+    struct intel_context *intel = &c->func.brw->intel;
+
+    if (intel->needs_ff_sync) {
 	struct brw_compile *p = &c->func;
         
         brw_MOV(p, c->reg.ff_sync, brw_imm_ud(0));
diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
index 48685c087b..7bb15956b5 100644
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -77,7 +77,7 @@ static void brwInitDriverFunctions( struct dd_function_table *functions )
 }
 
 GLboolean brwCreateContext( const __GLcontextModes *mesaVis,
-			    __DRIcontextPrivate *driContextPriv,
+			    __DRIcontext *driContextPriv,
 			    void *sharedContextPrivate)
 {
    struct dd_function_table functions;
@@ -111,6 +111,9 @@ GLboolean brwCreateContext( const __GLcontextModes *mesaVis,
    ctx->Const.MaxTextureUnits = MIN2(ctx->Const.MaxTextureCoordUnits,
                                      ctx->Const.MaxTextureImageUnits);
    ctx->Const.MaxVertexTextureImageUnits = 0; /* no vertex shader textures */
+   ctx->Const.MaxCombinedTextureImageUnits =
+      ctx->Const.MaxVertexTextureImageUnits +
+      ctx->Const.MaxTextureImageUnits;
 
    /* Mesa limits textures to 4kx4k; it would be nice to fix that someday
     */
@@ -155,6 +158,38 @@ GLboolean brwCreateContext( const __GLcontextModes *mesaVis,
       MIN2(ctx->Const.FragmentProgram.MaxNativeParameters,
 	   ctx->Const.FragmentProgram.MaxEnvParams);
 
+   if (intel->is_ironlake || intel->is_g4x) {
+      brw->CMD_VF_STATISTICS = CMD_VF_STATISTICS_GM45;
+      brw->CMD_PIPELINE_SELECT = CMD_PIPELINE_SELECT_GM45;
+      brw->has_surface_tile_offset = GL_TRUE;
+      brw->has_compr4 = GL_TRUE;
+      brw->has_aa_line_parameters = GL_TRUE;
+   } else {
+      brw->CMD_VF_STATISTICS = CMD_VF_STATISTICS_965;
+      brw->CMD_PIPELINE_SELECT = CMD_PIPELINE_SELECT_965;
+   }
+
+   /* WM maximum threads is number of EUs times number of threads per EU. */
+   if (intel->is_ironlake) {
+      brw->urb.size = 1024;
+      brw->vs_max_threads = 72;
+      brw->wm_max_threads = 12 * 6;
+   } else if (intel->is_g4x) {
+      brw->urb.size = 384;
+      brw->vs_max_threads = 32;
+      brw->wm_max_threads = 10 * 5;
+   } else {
+      brw->urb.size = 256;
+      brw->vs_max_threads = 16;
+      brw->wm_max_threads = 8 * 4;
+      brw->has_negative_rhw_bug = GL_TRUE;
+   }
+
+   if (INTEL_DEBUG & DEBUG_SINGLE_THREAD) {
+      brw->vs_max_threads = 1;
+      brw->wm_max_threads = 1;
+   }
+
    brw_init_state( brw );
 
    brw->state.dirty.mesa = ~0;
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 59f9475b5a..0dd3087143 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -172,8 +172,8 @@ struct brw_fragment_program {
    GLuint id;  /**< serial no. to identify frag progs, never re-used */
    GLboolean isGLSL;  /**< really, any IF/LOOP/CONT/BREAK instructions */
 
-   dri_bo *const_buffer;    /** Program constant buffer/surface */
    GLboolean use_const_buffer;
+   dri_bo *const_buffer;    /** Program constant buffer/surface */
 
    /** for debugging, which texture units are referenced */
    GLbitfield tex_units_used;
@@ -231,7 +231,7 @@ struct brw_vs_prog_data {
    GLuint curb_read_length;
    GLuint urb_read_length;
    GLuint total_grf;
-   GLuint outputs_written;
+   GLbitfield64 outputs_written;
    GLuint nr_params;       /**< number of float params/constants */
 
    GLuint inputs_read;
@@ -320,7 +320,6 @@ struct brw_cache_item {
    GLuint nr_reloc_bufs;
 
    dri_bo *bo;
-   GLuint data_size;
 
    struct brw_cache_item *next;
 };   
@@ -333,7 +332,6 @@ struct brw_cache {
    struct brw_cache_item **items;
    GLuint size, n_items;
 
-   GLuint key_size[BRW_MAX_CACHE];		/* for fixed-size keys */
    GLuint aux_size[BRW_MAX_CACHE];
    char *name[BRW_MAX_CACHE];
 
@@ -413,23 +411,6 @@ struct brw_vertex_info {
    GLuint sizes[ATTRIB_BIT_DWORDS * 2]; /* sizes:2[VERT_ATTRIB_MAX] */
 };
 
-
-
-
-/* Cache for TNL programs.
- */
-struct brw_tnl_cache_item {
-   GLuint hash;
-   void *key;
-   void *data;
-   struct brw_tnl_cache_item *next;
-};
-
-struct brw_tnl_cache {
-   struct brw_tnl_cache_item **items;
-   GLuint size, n_items;
-};
-
 struct brw_query_object {
    struct gl_query_object Base;
 
@@ -457,8 +438,11 @@ struct brw_context
    GLuint primitive;
 
    GLboolean emit_state_always;
-   GLboolean no_batch_wrap;
-
+   GLboolean has_surface_tile_offset;
+   GLboolean has_compr4;
+   GLboolean has_negative_rhw_bug;
+   GLboolean has_aa_line_parameters;
+
    struct {
       struct brw_state_flags dirty;
 
@@ -534,6 +518,12 @@ struct brw_context
     */
    GLuint next_free_page;
 
+   /* hw-dependent 3DSTATE_VF_STATISTICS opcode */
+   uint32_t CMD_VF_STATISTICS;
+   /* hw-dependent 3DSTATE_PIPELINE_SELECT opcode */
+   uint32_t CMD_PIPELINE_SELECT;
+   int vs_max_threads;
+   int wm_max_threads;
 
    /* BRW_NEW_URB_ALLOCATIONS:
     */
@@ -561,6 +551,7 @@ struct brw_context
       GLuint clip_start;
       GLuint sf_start;
       GLuint cs_start;
+      GLuint size; /* Hardware URB size, in KB. */
    } urb;
 
    
@@ -688,7 +679,7 @@ void brwInitVtbl( struct brw_context *brw );
  * brw_context.c
  */
 GLboolean brwCreateContext( const __GLcontextModes *mesaVis,
-			    __DRIcontextPrivate *driContextPriv,
+			    __DRIcontext *driContextPriv,
 			    void *sharedContextPrivate);
 
 /*======================================================================
@@ -761,9 +752,5 @@ brw_fragment_program_const(const struct gl_fragment_program *p)
    return (const struct brw_fragment_program *) p;
 }
 
-
-
-#define DO_SETUP_BITS ((1<<(FRAG_ATTRIB_MAX)) - 1)
-
 #endif
 
diff --git a/src/mesa/drivers/dri/i965/brw_curbe.c b/src/mesa/drivers/dri/i965/brw_curbe.c
index 4be6c77aa1..190310afbb 100644
--- a/src/mesa/drivers/dri/i965/brw_curbe.c
+++ b/src/mesa/drivers/dri/i965/brw_curbe.c
@@ -130,7 +130,7 @@ static void calculate_curbe_offsets( struct brw_context *brw )
 const struct brw_tracked_state brw_curbe_offsets = {
    .dirty = {
       .mesa = _NEW_TRANSFORM,
-      .brw  = BRW_NEW_VERTEX_PROGRAM,
+      .brw  = BRW_NEW_VERTEX_PROGRAM | BRW_NEW_CONTEXT,
       .cache = CACHE_NEW_WM_PROG
    },
    .prepare = calculate_curbe_offsets
@@ -340,7 +340,7 @@ static void emit_constant_buffer(struct brw_context *brw)
    struct intel_context *intel = &brw->intel;
    GLuint sz = brw->curbe.total_size;
 
-   BEGIN_BATCH(2, IGNORE_CLIPRECTS);
+   BEGIN_BATCH(2);
    if (sz == 0) {
       OUT_BATCH((CMD_CONST_BUFFER << 16) | (2 - 2));
       OUT_BATCH(0);
diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index 78d457ad2b..ea0d7e05d4 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -673,18 +673,10 @@
 #define BRW_SAMPLER_MESSAGE_SIMD8_LD                  3
 #define BRW_SAMPLER_MESSAGE_SIMD16_LD                 3
 
-#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_IGDNG            0
-#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_IGDNG          0
-#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_IGDNG           0
-#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_IGDNG       1
-#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_BIAS_IGDNG     1
-#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS_IGDNG      1
-#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_IGDNG        2
-#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD_IGDNG      2
-#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD_IGDNG       2
-#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_COMPARE_IGDNG    3
-#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_COMPARE_IGDNG  3
-#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE_IGDNG   3
+#define BRW_SAMPLER_MESSAGE_SAMPLE_IGDNG            0
+#define BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_IGDNG       1
+#define BRW_SAMPLER_MESSAGE_SAMPLE_LOD_IGDNG        2
+#define BRW_SAMPLER_MESSAGE_SAMPLE_COMPARE_IGDNG    3
 
 /* for IGDNG only */
 #define BRW_SAMPLER_SIMD_MODE_SIMD4X2                   0
@@ -840,12 +832,4 @@
 
 #include "intel_chipset.h"
 
-#define BRW_IS_G4X(brw)         (IS_G4X((brw)->intel.intelScreen->deviceID))
-#define BRW_IS_IGDNG(brw)         (IS_IGDNG((brw)->intel.intelScreen->deviceID))
-#define BRW_IS_965(brw)         (!(BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw)))
-#define CMD_PIPELINE_SELECT(brw)        ((BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw)) ? CMD_PIPELINE_SELECT_GM45 : CMD_PIPELINE_SELECT_965)
-#define CMD_VF_STATISTICS(brw)          ((BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw)) ? CMD_VF_STATISTICS_GM45 : CMD_VF_STATISTICS_965)
-#define URB_SIZES(brw)                  (BRW_IS_IGDNG(brw) ? 1024 : \
-                                         (BRW_IS_G4X(brw) ? 384 : 256))  /* 512 bit units */
-
 #endif
diff --git a/src/mesa/drivers/dri/i965/brw_disasm.c b/src/mesa/drivers/dri/i965/brw_disasm.c
index 9fef230507..a8f6b993ac 100644
--- a/src/mesa/drivers/dri/i965/brw_disasm.c
+++ b/src/mesa/drivers/dri/i965/brw_disasm.c
@@ -239,7 +239,7 @@ char *imm_encoding[8] = {
     [2] = "UW",
     [3] = "W",
     [5] = "VF",
-    [5] = "V",
+    [6] = "V",
     [7] = "F"
 };
 
@@ -365,6 +365,7 @@ static int format (FILE *f, char *format, ...)
     va_start (args, format);
 
     vsnprintf (buf, sizeof (buf) - 1, format, args);
+    va_end (args);
     string (f, buf);
     return 0;
 }
diff --git a/src/mesa/drivers/dri/i965/brw_draw.c b/src/mesa/drivers/dri/i965/brw_draw.c
index 44bb7bd588..df281b27d5 100644
--- a/src/mesa/drivers/dri/i965/brw_draw.c
+++ b/src/mesa/drivers/dri/i965/brw_draw.c
@@ -145,7 +145,7 @@ static void brw_emit_prim(struct brw_context *brw,
    prim_packet.base_vert_location = prim->basevertex;
 
    /* Can't wrap here, since we rely on the validated state. */
-   brw->no_batch_wrap = GL_TRUE;
+   intel->no_batch_wrap = GL_TRUE;
 
    /* If we're set to always flush, do it before and after the primitive emit.
     * We want to catch both missed flushes that hurt instruction/state cache
@@ -153,21 +153,17 @@ static void brw_emit_prim(struct brw_context *brw,
     * the besides the draw code.
     */
    if (intel->always_flush_cache) {
-      BEGIN_BATCH(1, IGNORE_CLIPRECTS);
-      OUT_BATCH(intel->vtbl.flush_cmd());
-      ADVANCE_BATCH();
+      intel_batchbuffer_emit_mi_flush(intel->batch);
    }
    if (prim_packet.verts_per_instance) {
       intel_batchbuffer_data( brw->intel.batch, &prim_packet,
-			      sizeof(prim_packet), LOOP_CLIPRECTS);
+			      sizeof(prim_packet));
    }
    if (intel->always_flush_cache) {
-      BEGIN_BATCH(1, IGNORE_CLIPRECTS);
-      OUT_BATCH(intel->vtbl.flush_cmd());
-      ADVANCE_BATCH();
+      intel_batchbuffer_emit_mi_flush(intel->batch);
    }
 
-   brw->no_batch_wrap = GL_FALSE;
+   intel->no_batch_wrap = GL_FALSE;
 }
 
 static void brw_merge_inputs( struct brw_context *brw,
@@ -343,13 +339,6 @@ static GLboolean brw_try_draw_prims( GLcontext *ctx,
     * so can't access it earlier.
     */
 
-   LOCK_HARDWARE(intel);
-
-   if (!intel->constant_cliprect && intel->driDrawable->numClipRects == 0) {
-      UNLOCK_HARDWARE(intel);
-      return GL_TRUE;
-   }
-
    for (i = 0; i < nr_prims; i++) {
       uint32_t hw_prim;
 
@@ -360,8 +349,7 @@ static GLboolean brw_try_draw_prims( GLcontext *ctx,
        * an upper bound of how much we might emit in a single
        * brw_try_draw_prims().
        */
-      intel_batchbuffer_require_space(intel->batch, intel->batch->size / 4,
-				      LOOP_CLIPRECTS);
+      intel_batchbuffer_require_space(intel->batch, intel->batch->size / 4);
 
       hw_prim = brw_set_prim(brw, prim[i].mode);
 
@@ -408,7 +396,6 @@ static GLboolean brw_try_draw_prims( GLcontext *ctx,
    if (intel->always_flush_batch)
       intel_batchbuffer_flush(intel->batch);
  out:
-   UNLOCK_HARDWARE(intel);
 
    brw_state_cache_check_size(brw);
 
diff --git a/src/mesa/drivers/dri/i965/brw_draw_upload.c b/src/mesa/drivers/dri/i965/brw_draw_upload.c
index 0fefbd9d81..c773b71507 100644
--- a/src/mesa/drivers/dri/i965/brw_draw_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_draw_upload.c
@@ -243,14 +243,6 @@ static void wrap_buffers( struct brw_context *brw,
       dri_bo_unreference(brw->vb.upload.bo);
    brw->vb.upload.bo = dri_bo_alloc(brw->intel.bufmgr, "temporary VBO",
 				    size, 1);
-
-   /* Set the internal VBO\ to no-backing-store.  We only use them as a
-    * temporary within a brw_try_draw_prims while the lock is held.
-    */
-   /* DON'T DO THIS AS IF WE HAVE TO RE-ORG MEMORY WE NEED SOMEWHERE WITH
-      FAKE TO PUSH THIS STUFF */
-//   if (!brw->intel.ttm)
-//      dri_bo_fake_disable_backing_store(brw->vb.upload.bo, NULL, NULL);
 }
 
 static void get_space( struct brw_context *brw,
@@ -502,7 +494,7 @@ static void brw_emit_vertices(struct brw_context *brw)
     * a VE loads from them.
     */
    if (brw->vb.nr_enabled == 0) {
-      BEGIN_BATCH(3, IGNORE_CLIPRECTS);
+      BEGIN_BATCH(3);
       OUT_BATCH((CMD_VERTEX_ELEMENT << 16) | 1);
       OUT_BATCH((0 << BRW_VE0_INDEX_SHIFT) |
 		BRW_VE0_VALID |
@@ -522,7 +514,7 @@ static void brw_emit_vertices(struct brw_context *brw)
     * are interleaved or from the same VBO.  TBD if this makes a
     * performance difference.
     */
-   BEGIN_BATCH(1 + brw->vb.nr_enabled * 4, IGNORE_CLIPRECTS);
+   BEGIN_BATCH(1 + brw->vb.nr_enabled * 4);
    OUT_BATCH((CMD_VERTEX_BUFFER << 16) |
 	     ((1 + brw->vb.nr_enabled * 4) - 2));
 
@@ -535,24 +527,17 @@ static void brw_emit_vertices(struct brw_context *brw)
       OUT_RELOC(input->bo,
 		I915_GEM_DOMAIN_VERTEX, 0,
 		input->offset);
-      if (BRW_IS_IGDNG(brw)) {
-          if (input->stride) {
-              OUT_RELOC(input->bo,
-                        I915_GEM_DOMAIN_VERTEX, 0,
-                        input->offset + input->stride * input->count - 1);
-          } else {
-              assert(input->count == 1);
-              OUT_RELOC(input->bo,
-                        I915_GEM_DOMAIN_VERTEX, 0,
-                        input->offset + input->element_size - 1);
-          }
+      if (intel->is_ironlake) {
+	 OUT_RELOC(input->bo,
+		   I915_GEM_DOMAIN_VERTEX, 0,
+		   input->bo->size - 1);
       } else
           OUT_BATCH(input->stride ? input->count : 0);
       OUT_BATCH(0); /* Instance data step rate */
    }
    ADVANCE_BATCH();
 
-   BEGIN_BATCH(1 + brw->vb.nr_enabled * 2, IGNORE_CLIPRECTS);
+   BEGIN_BATCH(1 + brw->vb.nr_enabled * 2);
    OUT_BATCH((CMD_VERTEX_ELEMENT << 16) | ((1 + brw->vb.nr_enabled * 2) - 2));
    for (i = 0; i < brw->vb.nr_enabled; i++) {
       struct brw_vertex_element *input = brw->vb.enabled[i];
@@ -578,7 +563,7 @@ static void brw_emit_vertices(struct brw_context *brw)
 		(format << BRW_VE0_FORMAT_SHIFT) |
 		(0 << BRW_VE0_SRC_OFFSET_SHIFT));
 
-      if (BRW_IS_IGDNG(brw))
+      if (intel->is_ironlake)
           OUT_BATCH((comp0 << BRW_VE1_COMPONENT_0_SHIFT) |
                     (comp1 << BRW_VE1_COMPONENT_1_SHIFT) |
                     (comp2 << BRW_VE1_COMPONENT_2_SHIFT) |
@@ -719,7 +704,7 @@ static void brw_emit_index_buffer(struct brw_context *brw)
       ib.header.bits.index_format = get_index_type(index_buffer->type);
       ib.header.bits.cut_index_enable = 0;
 
-      BEGIN_BATCH(4, IGNORE_CLIPRECTS);
+      BEGIN_BATCH(4);
       OUT_BATCH( ib.header.dword );
       OUT_RELOC(brw->ib.bo,
 		I915_GEM_DOMAIN_VERTEX, 0,
diff --git a/src/mesa/drivers/dri/i965/brw_eu.h b/src/mesa/drivers/dri/i965/brw_eu.h
index 30603bdd0e..39eb88d7c2 100644
--- a/src/mesa/drivers/dri/i965/brw_eu.h
+++ b/src/mesa/drivers/dri/i965/brw_eu.h
@@ -170,11 +170,11 @@ static INLINE struct brw_reg brw_reg( GLuint file,
                                       GLuint writemask )
 {
    struct brw_reg reg;
-   if (type == BRW_GENERAL_REGISTER_FILE)
+   if (file == BRW_GENERAL_REGISTER_FILE)
       assert(nr < BRW_MAX_GRF);
-   else if (type == BRW_MESSAGE_REGISTER_FILE)
-      assert(nr < BRW_MAX_MRF);
-   else if (type == BRW_ARCHITECTURE_REGISTER_FILE)
+   else if (file == BRW_MESSAGE_REGISTER_FILE)
+      assert((nr & ~(1 << 7)) < BRW_MAX_MRF);
+   else if (file == BRW_ARCHITECTURE_REGISTER_FILE)
       assert(nr <= BRW_ARF_IP);
 
    reg.type = type;
@@ -538,7 +538,7 @@ static INLINE struct brw_reg brw_mask_reg( GLuint subnr )
 
 static INLINE struct brw_reg brw_message_reg( GLuint nr )
 {
-   assert(nr < BRW_MAX_MRF);
+   assert((nr & ~(1 << 7)) < BRW_MAX_MRF);
    return brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE,
 		       nr,
 		       0);
diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c
index 241cdc33f8..8d6ac00839 100644
--- a/src/mesa/drivers/dri/i965/brw_eu_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c
@@ -55,7 +55,8 @@ static void guess_execution_size( struct brw_instruction *insn,
 static void brw_set_dest( struct brw_instruction *insn,
 			  struct brw_reg dest )
 {
-   if (dest.type != BRW_ARCHITECTURE_REGISTER_FILE)
+   if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE &&
+       dest.file != BRW_MESSAGE_REGISTER_FILE)
       assert(dest.nr < 128);
 
    insn->bits1.da1.dest_reg_file = dest.file;
@@ -198,7 +199,7 @@ void brw_set_src1( struct brw_instruction *insn,
        * in the future:
        */
       assert (reg.address_mode == BRW_ADDRESS_DIRECT);
-      //assert (reg.file == BRW_GENERAL_REGISTER_FILE);
+      /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */
 
       if (insn->header.access_mode == BRW_ALIGN_1) {
 	 insn->bits3.da1.src1_subreg_nr = reg.subnr;
@@ -251,9 +252,10 @@ static void brw_set_math_message( struct brw_context *brw,
 				  GLboolean saturate,
 				  GLuint dataType )
 {
+   struct intel_context *intel = &brw->intel;
    brw_set_src1(insn, brw_imm_d(0));
 
-   if (BRW_IS_IGDNG(brw)) {
+   if (intel->is_ironlake) {
        insn->bits3.math_igdng.function = function;
        insn->bits3.math_igdng.int_type = integer_type;
        insn->bits3.math_igdng.precision = low_precision;
@@ -318,9 +320,10 @@ static void brw_set_urb_message( struct brw_context *brw,
 				 GLuint offset,
 				 GLuint swizzle_control )
 {
+    struct intel_context *intel = &brw->intel;
     brw_set_src1(insn, brw_imm_d(0));
 
-    if (BRW_IS_IGDNG(brw)) {
+    if (intel->is_ironlake) {
         insn->bits3.urb_igdng.opcode = 0;	/* ? */
         insn->bits3.urb_igdng.offset = offset;
         insn->bits3.urb_igdng.swizzle_control = swizzle_control;
@@ -357,9 +360,10 @@ static void brw_set_dp_write_message( struct brw_context *brw,
 				      GLuint response_length,
 				      GLuint end_of_thread )
 {
+   struct intel_context *intel = &brw->intel;
    brw_set_src1(insn, brw_imm_d(0));
 
-   if (BRW_IS_IGDNG(brw)) {
+   if (intel->is_ironlake) {
        insn->bits3.dp_write_igdng.binding_table_index = binding_table_index;
        insn->bits3.dp_write_igdng.msg_control = msg_control;
        insn->bits3.dp_write_igdng.pixel_scoreboard_clear = pixel_scoreboard_clear;
@@ -394,9 +398,10 @@ static void brw_set_dp_read_message( struct brw_context *brw,
 				      GLuint response_length,
 				      GLuint end_of_thread )
 {
+   struct intel_context *intel = &brw->intel;
    brw_set_src1(insn, brw_imm_d(0));
 
-   if (BRW_IS_IGDNG(brw)) {
+   if (intel->is_ironlake) {
        insn->bits3.dp_read_igdng.binding_table_index = binding_table_index;
        insn->bits3.dp_read_igdng.msg_control = msg_control;
        insn->bits3.dp_read_igdng.msg_type = msg_type;
@@ -432,10 +437,11 @@ static void brw_set_sampler_message(struct brw_context *brw,
                                     GLuint header_present,
                                     GLuint simd_mode)
 {
+   struct intel_context *intel = &brw->intel;
    assert(eot == 0);
    brw_set_src1(insn, brw_imm_d(0));
 
-   if (BRW_IS_IGDNG(brw)) {
+   if (intel->is_ironlake) {
       insn->bits3.sampler_igdng.binding_table_index = binding_table_index;
       insn->bits3.sampler_igdng.sampler = sampler;
       insn->bits3.sampler_igdng.msg_type = msg_type;
@@ -446,7 +452,7 @@ static void brw_set_sampler_message(struct brw_context *brw,
       insn->bits3.sampler_igdng.end_of_thread = eot;
       insn->bits2.send_igdng.sfid = BRW_MESSAGE_TARGET_SAMPLER;
       insn->bits2.send_igdng.end_of_thread = eot;
-   } else if (BRW_IS_G4X(brw)) {
+   } else if (intel->is_g4x) {
       insn->bits3.sampler_g4x.binding_table_index = binding_table_index;
       insn->bits3.sampler_g4x.sampler = sampler;
       insn->bits3.sampler_g4x.msg_type = msg_type;
@@ -647,10 +653,11 @@ struct brw_instruction *brw_IF(struct brw_compile *p, GLuint execute_size)
 struct brw_instruction *brw_ELSE(struct brw_compile *p, 
 				 struct brw_instruction *if_insn)
 {
+   struct intel_context *intel = &p->brw->intel;
    struct brw_instruction *insn;
    GLuint br = 1;
 
-   if (BRW_IS_IGDNG(p->brw))
+   if (intel->is_ironlake)
       br = 2;
 
    if (p->single_program_flow) {
@@ -689,9 +696,10 @@ struct brw_instruction *brw_ELSE(struct brw_compile *p,
 void brw_ENDIF(struct brw_compile *p, 
 	       struct brw_instruction *patch_insn)
 {
+   struct intel_context *intel = &p->brw->intel;
    GLuint br = 1;
 
-   if (BRW_IS_IGDNG(p->brw))
+   if (intel->is_ironlake)
       br = 2; 
  
    if (p->single_program_flow) {
@@ -802,10 +810,11 @@ struct brw_instruction *brw_DO(struct brw_compile *p, GLuint execute_size)
 struct brw_instruction *brw_WHILE(struct brw_compile *p, 
                                   struct brw_instruction *do_insn)
 {
+   struct intel_context *intel = &p->brw->intel;
    struct brw_instruction *insn;
    GLuint br = 1;
 
-   if (BRW_IS_IGDNG(p->brw))
+   if (intel->is_ironlake)
       br = 2;
 
    if (p->single_program_flow)
@@ -845,14 +854,15 @@ struct brw_instruction *brw_WHILE(struct brw_compile *p,
 void brw_land_fwd_jump(struct brw_compile *p, 
 		       struct brw_instruction *jmp_insn)
 {
+   struct intel_context *intel = &p->brw->intel;
    struct brw_instruction *landing = &p->store[p->nr_insn];
    GLuint jmpi = 1;
 
-   if (BRW_IS_IGDNG(p->brw))
+   if (intel->is_ironlake)
        jmpi = 2;
 
    assert(jmp_insn->header.opcode == BRW_OPCODE_JMPI);
-   assert(jmp_insn->bits1.da1.src1_reg_file = BRW_IMMEDIATE_VALUE);
+   assert(jmp_insn->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE);
 
    jmp_insn->bits3.ud = jmpi * ((landing - jmp_insn) - 1);
 }
diff --git a/src/mesa/drivers/dri/i965/brw_fallback.c b/src/mesa/drivers/dri/i965/brw_fallback.c
index 562a17844b..fe5c1ae279 100644
--- a/src/mesa/drivers/dri/i965/brw_fallback.c
+++ b/src/mesa/drivers/dri/i965/brw_fallback.c
@@ -47,7 +47,6 @@
 
 static GLboolean do_check_fallback(struct brw_context *brw)
 {
-   struct intel_context *intel = &brw->intel;
    GLcontext *ctx = &brw->intel.ctx;
    GLuint i;
 
@@ -86,8 +85,7 @@ static GLboolean do_check_fallback(struct brw_context *brw)
    }
 
    /* _NEW_BUFFERS */
-   if (IS_965(intel->intelScreen->deviceID) &&
-       !IS_G4X(intel->intelScreen->deviceID)) {
+   if (!brw->has_surface_tile_offset) {
       for (i = 0; i < ctx->DrawBuffer->_NumColorDrawBuffers; i++) {
 	 struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[i];
 	 struct intel_renderbuffer *irb = intel_renderbuffer(rb);
diff --git a/src/mesa/drivers/dri/i965/brw_gs.c b/src/mesa/drivers/dri/i965/brw_gs.c
index 48c2b9a41c..1bc3eccf49 100644
--- a/src/mesa/drivers/dri/i965/brw_gs.c
+++ b/src/mesa/drivers/dri/i965/brw_gs.c
@@ -47,6 +47,7 @@
 static void compile_gs_prog( struct brw_context *brw,
 			     struct brw_gs_prog_key *key )
 {
+   struct intel_context *intel = &brw->intel;
    struct brw_gs_compile c;
    const GLuint *program;
    GLuint program_size;
@@ -54,13 +55,12 @@ static void compile_gs_prog( struct brw_context *brw,
    memset(&c, 0, sizeof(c));
    
    c.key = *key;
-   c.need_ff_sync = BRW_IS_IGDNG(brw);
    /* Need to locate the two positions present in vertex + header.
     * These are currently hardcoded:
     */
    c.nr_attrs = brw_count_bits(c.key.attrs);
 
-   if (BRW_IS_IGDNG(brw))
+   if (intel->is_ironlake)
        c.nr_regs = (c.nr_attrs + 1) / 2 + 3;  /* are vertices packed, or reg-aligned? */
    else
        c.nr_regs = (c.nr_attrs + 1) / 2 + 1;  /* are vertices packed, or reg-aligned? */
@@ -85,10 +85,10 @@ static void compile_gs_prog( struct brw_context *brw,
     */
    switch (key->primitive) {
    case GL_QUADS:
-      brw_gs_quads( &c ); 
+      brw_gs_quads( &c, key );
       break;
    case GL_QUAD_STRIP:
-      brw_gs_quad_strip( &c );
+      brw_gs_quad_strip( &c, key );
       break;
    case GL_LINE_LOOP:
       brw_gs_lines( &c );
@@ -149,6 +149,7 @@ static const GLenum gs_prim[GL_POLYGON+1] = {
 static void populate_key( struct brw_context *brw,
 			  struct brw_gs_prog_key *key )
 {
+   GLcontext *ctx = &brw->intel.ctx;
    memset(key, 0, sizeof(*key));
 
    /* CACHE_NEW_VS_PROG */
@@ -158,6 +159,9 @@ static void populate_key( struct brw_context *brw,
    key->primitive = gs_prim[brw->primitive];
 
    key->hint_gs_always = 0;	/* debug code? */
+   
+   /* _NEW_LIGHT */
+   key->pv_first = (ctx->Light.ProvokingVertex == GL_FIRST_VERTEX_CONVENTION);
 
    key->need_gs_prog = (key->hint_gs_always ||
 			brw->primitive == GL_QUADS ||
@@ -193,7 +197,7 @@ static void prepare_gs_prog(struct brw_context *brw)
 
 const struct brw_tracked_state brw_gs_prog = {
    .dirty = {
-      .mesa  = 0,
+      .mesa  = _NEW_LIGHT,
       .brw   = BRW_NEW_PRIMITIVE,
       .cache = CACHE_NEW_VS_PROG
    },
diff --git a/src/mesa/drivers/dri/i965/brw_gs.h b/src/mesa/drivers/dri/i965/brw_gs.h
index bbb991ea2e..813b8d447a 100644
--- a/src/mesa/drivers/dri/i965/brw_gs.h
+++ b/src/mesa/drivers/dri/i965/brw_gs.h
@@ -40,11 +40,12 @@
 #define MAX_GS_VERTS (4)	     
 
 struct brw_gs_prog_key {
-   GLuint attrs:32;
+   GLbitfield64 attrs;
    GLuint primitive:4;
    GLuint hint_gs_always:1;
+   GLuint pv_first:1;
    GLuint need_gs_prog:1;
-   GLuint pad:26;
+   GLuint pad:25;
 };
 
 struct brw_gs_compile {
@@ -62,13 +63,12 @@ struct brw_gs_compile {
    GLuint nr_attrs;
    GLuint nr_regs;
    GLuint nr_bytes;
-   GLboolean need_ff_sync;
 };
 
 #define ATTR_SIZE  (4*4)
 
-void brw_gs_quads( struct brw_gs_compile *c );
-void brw_gs_quad_strip( struct brw_gs_compile *c );
+void brw_gs_quads( struct brw_gs_compile *c, struct brw_gs_prog_key *key );
+void brw_gs_quad_strip( struct brw_gs_compile *c, struct brw_gs_prog_key *key );
 void brw_gs_tris( struct brw_gs_compile *c );
 void brw_gs_lines( struct brw_gs_compile *c );
 void brw_gs_points( struct brw_gs_compile *c );
diff --git a/src/mesa/drivers/dri/i965/brw_gs_emit.c b/src/mesa/drivers/dri/i965/brw_gs_emit.c
index a9b2aa2eac..a81b972ef4 100644
--- a/src/mesa/drivers/dri/i965/brw_gs_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_gs_emit.c
@@ -120,38 +120,60 @@ static void brw_gs_ff_sync(struct brw_gs_compile *c, int num_prim)
 }
 
 
-void brw_gs_quads( struct brw_gs_compile *c )
+void brw_gs_quads( struct brw_gs_compile *c, struct brw_gs_prog_key *key )
 {
+   struct intel_context *intel = &c->func.brw->intel;
+
    brw_gs_alloc_regs(c, 4);
    
    /* Use polygons for correct edgeflag behaviour. Note that vertex 3
     * is the PV for quads, but vertex 0 for polygons:
     */
-   if (c->need_ff_sync)
-	   brw_gs_ff_sync(c, 1);    
-   brw_gs_emit_vue(c, c->reg.vertex[3], 0, ((_3DPRIM_POLYGON << 2) | R02_PRIM_START));
-   brw_gs_emit_vue(c, c->reg.vertex[0], 0, (_3DPRIM_POLYGON << 2));
-   brw_gs_emit_vue(c, c->reg.vertex[1], 0, (_3DPRIM_POLYGON << 2)); 
-   brw_gs_emit_vue(c, c->reg.vertex[2], 1, ((_3DPRIM_POLYGON << 2) | R02_PRIM_END));
+   if (intel->needs_ff_sync)
+	   brw_gs_ff_sync(c, 1);
+   if (key->pv_first) {
+      brw_gs_emit_vue(c, c->reg.vertex[0], 0, ((_3DPRIM_POLYGON << 2) | R02_PRIM_START));
+      brw_gs_emit_vue(c, c->reg.vertex[1], 0, (_3DPRIM_POLYGON << 2));
+      brw_gs_emit_vue(c, c->reg.vertex[2], 0, (_3DPRIM_POLYGON << 2));
+      brw_gs_emit_vue(c, c->reg.vertex[3], 1, ((_3DPRIM_POLYGON << 2) | R02_PRIM_END));
+   }
+   else {
+      brw_gs_emit_vue(c, c->reg.vertex[3], 0, ((_3DPRIM_POLYGON << 2) | R02_PRIM_START));
+      brw_gs_emit_vue(c, c->reg.vertex[0], 0, (_3DPRIM_POLYGON << 2));
+      brw_gs_emit_vue(c, c->reg.vertex[1], 0, (_3DPRIM_POLYGON << 2));
+      brw_gs_emit_vue(c, c->reg.vertex[2], 1, ((_3DPRIM_POLYGON << 2) | R02_PRIM_END));
+   }
 }
 
-void brw_gs_quad_strip( struct brw_gs_compile *c )
+void brw_gs_quad_strip( struct brw_gs_compile *c, struct brw_gs_prog_key *key )
 {
+   struct intel_context *intel = &c->func.brw->intel;
+
    brw_gs_alloc_regs(c, 4);
    
-   if (c->need_ff_sync)
+   if (intel->needs_ff_sync)
 	   brw_gs_ff_sync(c, 1);      
-   brw_gs_emit_vue(c, c->reg.vertex[2], 0, ((_3DPRIM_POLYGON << 2) | R02_PRIM_START));
-   brw_gs_emit_vue(c, c->reg.vertex[3], 0, (_3DPRIM_POLYGON << 2));
-   brw_gs_emit_vue(c, c->reg.vertex[0], 0, (_3DPRIM_POLYGON << 2)); 
-   brw_gs_emit_vue(c, c->reg.vertex[1], 1, ((_3DPRIM_POLYGON << 2) | R02_PRIM_END));
+   if (key->pv_first) {
+      brw_gs_emit_vue(c, c->reg.vertex[0], 0, ((_3DPRIM_POLYGON << 2) | R02_PRIM_START));
+      brw_gs_emit_vue(c, c->reg.vertex[1], 0, (_3DPRIM_POLYGON << 2));
+      brw_gs_emit_vue(c, c->reg.vertex[2], 0, (_3DPRIM_POLYGON << 2));
+      brw_gs_emit_vue(c, c->reg.vertex[3], 1, ((_3DPRIM_POLYGON << 2) | R02_PRIM_END));
+   }
+   else {
+      brw_gs_emit_vue(c, c->reg.vertex[2], 0, ((_3DPRIM_POLYGON << 2) | R02_PRIM_START));
+      brw_gs_emit_vue(c, c->reg.vertex[3], 0, (_3DPRIM_POLYGON << 2));
+      brw_gs_emit_vue(c, c->reg.vertex[0], 0, (_3DPRIM_POLYGON << 2));
+      brw_gs_emit_vue(c, c->reg.vertex[1], 1, ((_3DPRIM_POLYGON << 2) | R02_PRIM_END));
+   }
 }
 
 void brw_gs_tris( struct brw_gs_compile *c )
 {
+   struct intel_context *intel = &c->func.brw->intel;
+
    brw_gs_alloc_regs(c, 3);
 
-   if (c->need_ff_sync)
+   if (intel->needs_ff_sync)
 	   brw_gs_ff_sync(c, 1);      
    brw_gs_emit_vue(c, c->reg.vertex[0], 0, ((_3DPRIM_TRILIST << 2) | R02_PRIM_START));
    brw_gs_emit_vue(c, c->reg.vertex[1], 0, (_3DPRIM_TRILIST << 2));
@@ -160,9 +182,11 @@ void brw_gs_tris( struct brw_gs_compile *c )
 
 void brw_gs_lines( struct brw_gs_compile *c )
 {
+   struct intel_context *intel = &c->func.brw->intel;
+
    brw_gs_alloc_regs(c, 2);
 
-   if (c->need_ff_sync)
+   if (intel->needs_ff_sync)
 	   brw_gs_ff_sync(c, 1);      
    brw_gs_emit_vue(c, c->reg.vertex[0], 0, ((_3DPRIM_LINESTRIP << 2) | R02_PRIM_START));
    brw_gs_emit_vue(c, c->reg.vertex[1], 1, ((_3DPRIM_LINESTRIP << 2) | R02_PRIM_END));
@@ -170,9 +194,11 @@ void brw_gs_lines( struct brw_gs_compile *c )
 
 void brw_gs_points( struct brw_gs_compile *c )
 {
+   struct intel_context *intel = &c->func.brw->intel;
+
    brw_gs_alloc_regs(c, 1);
 
-   if (c->need_ff_sync)
+   if (intel->needs_ff_sync)
 	   brw_gs_ff_sync(c, 1);      
    brw_gs_emit_vue(c, c->reg.vertex[0], 1, ((_3DPRIM_POINTLIST << 2) | R02_PRIM_START | R02_PRIM_END));
 }
diff --git a/src/mesa/drivers/dri/i965/brw_gs_state.c b/src/mesa/drivers/dri/i965/brw_gs_state.c
index ed9d2ffe60..1af5790a67 100644
--- a/src/mesa/drivers/dri/i965/brw_gs_state.c
+++ b/src/mesa/drivers/dri/i965/brw_gs_state.c
@@ -72,6 +72,7 @@ gs_unit_populate_key(struct brw_context *brw, struct brw_gs_unit_key *key)
 static dri_bo *
 gs_unit_create_from_key(struct brw_context *brw, struct brw_gs_unit_key *key)
 {
+   struct intel_context *intel = &brw->intel;
    struct brw_gs_unit_state gs;
    dri_bo *bo;
 
@@ -98,7 +99,7 @@ gs_unit_create_from_key(struct brw_context *brw, struct brw_gs_unit_key *key)
    else
       gs.thread4.max_threads = 0;
 
-   if (BRW_IS_IGDNG(brw))
+   if (intel->is_ironlake)
       gs.thread4.rendering_enable = 1;
 
    if (INTEL_DEBUG & DEBUG_STATS)
diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c b/src/mesa/drivers/dri/i965/brw_misc_state.c
index ea71857548..7b70f787b7 100644
--- a/src/mesa/drivers/dri/i965/brw_misc_state.c
+++ b/src/mesa/drivers/dri/i965/brw_misc_state.c
@@ -66,7 +66,7 @@ static void upload_blend_constant_color(struct brw_context *brw)
 const struct brw_tracked_state brw_blend_constant_color = {
    .dirty = {
       .mesa = _NEW_COLOR,
-      .brw = 0,
+      .brw = BRW_NEW_CONTEXT,
       .cache = 0
    },
    .emit = upload_blend_constant_color
@@ -78,10 +78,7 @@ static void upload_drawing_rect(struct brw_context *brw)
    struct intel_context *intel = &brw->intel;
    GLcontext *ctx = &intel->ctx;
 
-   if (!intel->constant_cliprect)
-      return;
-
-   BEGIN_BATCH(4, NO_LOOP_CLIPRECTS);
+   BEGIN_BATCH(4);
    OUT_BATCH(_3DSTATE_DRAWRECT_INFO_I965);
    OUT_BATCH(0); /* xmin, ymin */
    OUT_BATCH(((ctx->DrawBuffer->Width - 1) & 0xffff) |
@@ -93,7 +90,7 @@ static void upload_drawing_rect(struct brw_context *brw)
 const struct brw_tracked_state brw_drawing_rect = {
    .dirty = {
       .mesa = _NEW_BUFFERS,
-      .brw = 0,
+      .brw = BRW_NEW_CONTEXT,
       .cache = 0
    },
    .emit = upload_drawing_rect
@@ -116,7 +113,7 @@ static void upload_binding_table_pointers(struct brw_context *brw)
 {
    struct intel_context *intel = &brw->intel;
 
-   BEGIN_BATCH(6, IGNORE_CLIPRECTS);
+   BEGIN_BATCH(6);
    OUT_BATCH(CMD_BINDING_TABLE_PTRS << 16 | (6 - 2));
    if (brw->vs.bind_bo != NULL)
       OUT_RELOC(brw->vs.bind_bo, I915_GEM_DOMAIN_SAMPLER, 0, 0); /* vs */
@@ -150,7 +147,7 @@ static void upload_pipelined_state_pointers(struct brw_context *brw )
 {
    struct intel_context *intel = &brw->intel;
 
-   BEGIN_BATCH(7, IGNORE_CLIPRECTS);
+   BEGIN_BATCH(7);
    OUT_BATCH(CMD_PIPELINED_STATE_POINTERS << 16 | (7 - 2));
    OUT_RELOC(brw->vs.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0);
    if (brw->gs.prog_active)
@@ -212,10 +209,10 @@ static void emit_depthbuffer(struct brw_context *brw)
 {
    struct intel_context *intel = &brw->intel;
    struct intel_region *region = brw->state.depth_region;
-   unsigned int len = (BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw)) ? 6 : 5;
+   unsigned int len = (intel->is_g4x || intel->is_ironlake) ? 6 : 5;
 
    if (region == NULL) {
-      BEGIN_BATCH(len, IGNORE_CLIPRECTS);
+      BEGIN_BATCH(len);
       OUT_BATCH(CMD_DEPTH_BUFFER << 16 | (len - 2));
       OUT_BATCH((BRW_DEPTHFORMAT_D32_FLOAT << 18) |
 		(BRW_SURFACE_NULL << 29));
@@ -223,7 +220,7 @@ static void emit_depthbuffer(struct brw_context *brw)
       OUT_BATCH(0);
       OUT_BATCH(0);
 
-      if (BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw))
+      if (intel->is_g4x || intel->is_ironlake)
          OUT_BATCH(0);
 
       ADVANCE_BATCH();
@@ -247,7 +244,7 @@ static void emit_depthbuffer(struct brw_context *brw)
 
       assert(region->tiling != I915_TILING_X);
 
-      BEGIN_BATCH(len, IGNORE_CLIPRECTS);
+      BEGIN_BATCH(len);
       OUT_BATCH(CMD_DEPTH_BUFFER << 16 | (len - 2));
       OUT_BATCH(((region->pitch * region->cpp) - 1) |
 		(format << 18) |
@@ -262,7 +259,7 @@ static void emit_depthbuffer(struct brw_context *brw)
 		((region->height - 1) << 19));
       OUT_BATCH(0);
 
-      if (BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw))
+      if (intel->is_g4x || intel->is_ironlake)
          OUT_BATCH(0);
 
       ADVANCE_BATCH();
@@ -317,7 +314,7 @@ static void upload_polygon_stipple(struct brw_context *brw)
 const struct brw_tracked_state brw_polygon_stipple = {
    .dirty = {
       .mesa = _NEW_POLYGONSTIPPLE,
-      .brw = 0,
+      .brw = BRW_NEW_CONTEXT,
       .cache = 0
    },
    .emit = upload_polygon_stipple
@@ -330,7 +327,7 @@ const struct brw_tracked_state brw_polygon_stipple = {
 
 static void upload_polygon_stipple_offset(struct brw_context *brw)
 {
-   __DRIdrawablePrivate *dPriv = brw->intel.driDrawable;
+   __DRIdrawable *dPriv = brw->intel.driDrawable;
    struct brw_polygon_stipple_offset bpso;
 
    memset(&bpso, 0, sizeof(bpso));
@@ -362,7 +359,7 @@ static void upload_polygon_stipple_offset(struct brw_context *brw)
 const struct brw_tracked_state brw_polygon_stipple_offset = {
    .dirty = {
       .mesa = _NEW_WINDOW_POS,
-      .brw = 0,
+      .brw = BRW_NEW_CONTEXT,
       .cache = 0
    },
    .emit = upload_polygon_stipple_offset
@@ -374,8 +371,8 @@ const struct brw_tracked_state brw_polygon_stipple_offset = {
 static void upload_aa_line_parameters(struct brw_context *brw)
 {
    struct brw_aa_line_parameters balp;
-   
-   if (BRW_IS_965(brw))
+
+   if (!brw->has_aa_line_parameters)
       return;
 
    /* use legacy aa line coverage computation */
@@ -425,7 +422,7 @@ static void upload_line_stipple(struct brw_context *brw)
 const struct brw_tracked_state brw_line_stipple = {
    .dirty = {
       .mesa = _NEW_LINE,
-      .brw = 0,
+      .brw = BRW_NEW_CONTEXT,
       .cache = 0
    },
    .emit = upload_line_stipple
@@ -444,7 +441,7 @@ static void upload_invarient_state( struct brw_context *brw )
       struct brw_pipeline_select ps;
 
       memset(&ps, 0, sizeof(ps));
-      ps.header.opcode = CMD_PIPELINE_SELECT(brw);
+      ps.header.opcode = brw->CMD_PIPELINE_SELECT;
       ps.header.pipeline_select = 0;
       BRW_BATCH_STRUCT(brw, &ps);
    }
@@ -480,7 +477,7 @@ static void upload_invarient_state( struct brw_context *brw )
       struct brw_vf_statistics vfs;
       memset(&vfs, 0, sizeof(vfs));
 
-      vfs.opcode = CMD_VF_STATISTICS(brw);
+      vfs.opcode = brw->CMD_VF_STATISTICS;
       if (INTEL_DEBUG & DEBUG_STATS)
 	 vfs.statistics_enable = 1; 
 
@@ -512,8 +509,8 @@ static void upload_state_base_address( struct brw_context *brw )
    /* Output the structure (brw_state_base_address) directly to the
     * batchbuffer, so we can emit relocations inline.
     */
-   if (BRW_IS_IGDNG(brw)) {
-       BEGIN_BATCH(8, IGNORE_CLIPRECTS);
+   if (intel->is_ironlake) {
+       BEGIN_BATCH(8);
        OUT_BATCH(CMD_STATE_BASE_ADDRESS << 16 | (8 - 2));
        OUT_BATCH(1); /* General state base address */
        OUT_BATCH(1); /* Surface state base address */
@@ -524,7 +521,7 @@ static void upload_state_base_address( struct brw_context *brw )
        OUT_BATCH(1); /* Instruction access upper bound */
        ADVANCE_BATCH();
    } else {
-       BEGIN_BATCH(6, IGNORE_CLIPRECTS);
+       BEGIN_BATCH(6);
        OUT_BATCH(CMD_STATE_BASE_ADDRESS << 16 | (6 - 2));
        OUT_BATCH(1); /* General state base address */
        OUT_BATCH(1); /* Surface state base address */
diff --git a/src/mesa/drivers/dri/i965/brw_queryobj.c b/src/mesa/drivers/dri/i965/brw_queryobj.c
index a195bc32b0..5399a74244 100644
--- a/src/mesa/drivers/dri/i965/brw_queryobj.c
+++ b/src/mesa/drivers/dri/i965/brw_queryobj.c
@@ -188,7 +188,7 @@ brw_emit_query_begin(struct brw_context *brw)
    if (brw->query.active || is_empty_list(&brw->query.active_head))
       return;
 
-   BEGIN_BATCH(4, IGNORE_CLIPRECTS);
+   BEGIN_BATCH(4);
    OUT_BATCH(_3DSTATE_PIPE_CONTROL |
 	     PIPE_CONTROL_DEPTH_STALL |
 	     PIPE_CONTROL_WRITE_DEPTH_COUNT);
@@ -227,7 +227,7 @@ brw_emit_query_end(struct brw_context *brw)
    if (!brw->query.active)
       return;
 
-   BEGIN_BATCH(4, IGNORE_CLIPRECTS);
+   BEGIN_BATCH(4);
    OUT_BATCH(_3DSTATE_PIPE_CONTROL |
 	     PIPE_CONTROL_DEPTH_STALL |
 	     PIPE_CONTROL_WRITE_DEPTH_COUNT);
diff --git a/src/mesa/drivers/dri/i965/brw_sf.c b/src/mesa/drivers/dri/i965/brw_sf.c
index f350cbd74e..968890f7fb 100644
--- a/src/mesa/drivers/dri/i965/brw_sf.c
+++ b/src/mesa/drivers/dri/i965/brw_sf.c
@@ -61,7 +61,7 @@ static void compile_sf_prog( struct brw_context *brw,
    c.key = *key;
    c.nr_attrs = brw_count_bits(c.key.attrs);
    c.nr_attr_regs = (c.nr_attrs+1)/2;
-   c.nr_setup_attrs = brw_count_bits(c.key.attrs & DO_SETUP_BITS);
+   c.nr_setup_attrs = brw_count_bits(c.key.attrs);
    c.nr_setup_regs = (c.nr_setup_attrs+1)/2;
 
    c.prog_data.urb_read_length = c.nr_attr_regs;
@@ -70,7 +70,7 @@ static void compile_sf_prog( struct brw_context *brw,
    /* Construct map from attribute number to position in the vertex.
     */
    for (i = idx = 0; i < VERT_RESULT_MAX; i++) 
-      if (c.key.attrs & (1<<i)) {
+      if (c.key.attrs & BITFIELD64_BIT(i)) {
 	 c.attr_to_idx[i] = idx;
 	 c.idx_to_attr[idx] = i;
 	 if (i >= VERT_RESULT_TEX0 && i <= VERT_RESULT_TEX7) {
@@ -147,7 +147,7 @@ static void upload_sf_prog(struct brw_context *brw)
        * edgeflag testing here, it is already done in the clip
        * program.
        */
-      if (key.attrs & (1<<VERT_RESULT_EDGE))
+      if (key.attrs & BITFIELD64_BIT(VERT_RESULT_EDGE))
 	 key.primitive = SF_UNFILLED_TRIS;
       else
 	 key.primitive = SF_TRIANGLES;
diff --git a/src/mesa/drivers/dri/i965/brw_sf.h b/src/mesa/drivers/dri/i965/brw_sf.h
index e835229a1d..0ba731fac9 100644
--- a/src/mesa/drivers/dri/i965/brw_sf.h
+++ b/src/mesa/drivers/dri/i965/brw_sf.h
@@ -45,7 +45,7 @@
 #define SF_UNFILLED_TRIS   3
 
 struct brw_sf_prog_key {
-   GLuint attrs:32;
+   GLbitfield64 attrs;
    GLuint primitive:2;
    GLuint do_twoside_color:1;
    GLuint do_flat_shading:1;
diff --git a/src/mesa/drivers/dri/i965/brw_sf_emit.c b/src/mesa/drivers/dri/i965/brw_sf_emit.c
index 561fcd501b..bb08055e3b 100644
--- a/src/mesa/drivers/dri/i965/brw_sf_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_sf_emit.c
@@ -56,7 +56,7 @@ static struct brw_reg get_vert_attr(struct brw_sf_compile *c,
 static GLboolean have_attr(struct brw_sf_compile *c,
 			   GLuint attr)
 {
-   return (c->key.attrs & (1<<attr)) ? 1 : 0;
+   return (c->key.attrs & BITFIELD64_BIT(attr)) ? 1 : 0;
 }
 
 /*********************************************************************** 
@@ -122,8 +122,8 @@ static void do_twoside_color( struct brw_sf_compile *c )
  * Flat shading
  */
 
-#define VERT_RESULT_COLOR_BITS ((1<<VERT_RESULT_COL0) | \
-                                 (1<<VERT_RESULT_COL1))
+#define VERT_RESULT_COLOR_BITS (BITFIELD64_BIT(VERT_RESULT_COL0) | \
+				BITFIELD64_BIT(VERT_RESULT_COL1))
 
 static void copy_colors( struct brw_sf_compile *c,
 		     struct brw_reg dst,
@@ -149,6 +149,7 @@ static void copy_colors( struct brw_sf_compile *c,
 static void do_flatshade_triangle( struct brw_sf_compile *c )
 {
    struct brw_compile *p = &c->func;
+   struct intel_context *intel = &p->brw->intel;
    struct brw_reg ip = brw_ip_reg();
    GLuint nr = brw_count_bits(c->key.attrs & VERT_RESULT_COLOR_BITS);
    GLuint jmpi = 1;
@@ -161,7 +162,7 @@ static void do_flatshade_triangle( struct brw_sf_compile *c )
    if (c->key.primitive == SF_UNFILLED_TRIS)
       return;
 
-   if (BRW_IS_IGDNG(p->brw))
+   if (intel->is_ironlake)
        jmpi = 2;
 
    brw_push_insn_state(p);
@@ -187,6 +188,7 @@ static void do_flatshade_triangle( struct brw_sf_compile *c )
 static void do_flatshade_line( struct brw_sf_compile *c )
 {
    struct brw_compile *p = &c->func;
+   struct intel_context *intel = &p->brw->intel;
    struct brw_reg ip = brw_ip_reg();
    GLuint nr = brw_count_bits(c->key.attrs & VERT_RESULT_COLOR_BITS);
    GLuint jmpi = 1;
@@ -199,7 +201,7 @@ static void do_flatshade_line( struct brw_sf_compile *c )
    if (c->key.primitive == SF_UNFILLED_TRIS)
       return;
 
-   if (BRW_IS_IGDNG(p->brw))
+   if (intel->is_ironlake)
        jmpi = 2;
 
    brw_push_insn_state(p);
@@ -312,8 +314,8 @@ static GLboolean calculate_masks( struct brw_sf_compile *c,
 				  GLushort *pc_linear)
 {
    GLboolean is_last_attr = (reg == c->nr_setup_regs - 1);
-   GLuint persp_mask;
-   GLuint linear_mask;
+   GLbitfield64 persp_mask;
+   GLbitfield64 linear_mask;
 
    if (c->key.do_flat_shading || c->key.linear_color)
       persp_mask = c->key.attrs & ~(FRAG_BIT_WPOS |
@@ -331,10 +333,10 @@ static GLboolean calculate_masks( struct brw_sf_compile *c,
    *pc_linear = 0;
    *pc = 0xf;
       
-   if (persp_mask & (1 << c->idx_to_attr[reg*2])) 
+   if (persp_mask & BITFIELD64_BIT(c->idx_to_attr[reg*2]))
       *pc_persp = 0xf;
 
-   if (linear_mask & (1 << c->idx_to_attr[reg*2])) 
+   if (linear_mask & BITFIELD64_BIT(c->idx_to_attr[reg*2]))
       *pc_linear = 0xf;
 
    /* Maybe only processs one attribute on the final round:
@@ -342,10 +344,10 @@ static GLboolean calculate_masks( struct brw_sf_compile *c,
    if (reg*2+1 < c->nr_setup_attrs) {
       *pc |= 0xf0;
 
-      if (persp_mask & (1 << c->idx_to_attr[reg*2+1])) 
+      if (persp_mask & BITFIELD64_BIT(c->idx_to_attr[reg*2+1]))
 	 *pc_persp |= 0xf0;
 
-      if (linear_mask & (1 << c->idx_to_attr[reg*2+1])) 
+      if (linear_mask & BITFIELD64_BIT(c->idx_to_attr[reg*2+1]))
 	 *pc_linear |= 0xf0;
    }
 
diff --git a/src/mesa/drivers/dri/i965/brw_sf_state.c b/src/mesa/drivers/dri/i965/brw_sf_state.c
index bc0f076073..09223b7cfb 100644
--- a/src/mesa/drivers/dri/i965/brw_sf_state.c
+++ b/src/mesa/drivers/dri/i965/brw_sf_state.c
@@ -93,7 +93,8 @@ static void upload_sf_vp(struct brw_context *brw)
    }
 
    dri_bo_unreference(brw->sf.vp_bo);
-   brw->sf.vp_bo = brw_cache_data( &brw->cache, BRW_SF_VP, &sfv, NULL, 0 );
+   brw->sf.vp_bo = brw_cache_data(&brw->cache, BRW_SF_VP, &sfv, sizeof(sfv),
+				  NULL, 0);
 }
 
 const struct brw_tracked_state brw_sf_vp = {
@@ -113,7 +114,8 @@ struct brw_sf_unit_key {
 
    unsigned int nr_urb_entries, urb_size, sfsize;
 
-   GLenum front_face, cull_face, provoking_vertex;
+   GLenum front_face, cull_face;
+   unsigned pv_first:1;
    unsigned scissor:1;
    unsigned line_smooth:1;
    unsigned point_sprite:1;
@@ -154,7 +156,7 @@ sf_unit_populate_key(struct brw_context *brw, struct brw_sf_unit_key *key)
    key->point_attenuated = ctx->Point._Attenuated;
 
    /* _NEW_LIGHT */
-   key->provoking_vertex = ctx->Light.ProvokingVertex;
+   key->pv_first = (ctx->Light.ProvokingVertex == GL_FIRST_VERTEX_CONVENTION);
 
    key->render_to_fbo = brw->intel.ctx.DrawBuffer->Name != 0;
 }
@@ -163,6 +165,7 @@ static dri_bo *
 sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key,
 			dri_bo **reloc_bufs)
 {
+   struct intel_context *intel = &brw->intel;
    struct brw_sf_unit_state sf;
    dri_bo *bo;
    int chipset_max_threads;
@@ -175,7 +178,7 @@ sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key,
 
    sf.thread3.dispatch_grf_start_reg = 3;
 
-   if (BRW_IS_IGDNG(brw))
+   if (intel->is_ironlake)
        sf.thread3.urb_entry_read_offset = 3;
    else
        sf.thread3.urb_entry_read_offset = 1;
@@ -185,10 +188,10 @@ sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key,
    sf.thread4.nr_urb_entries = key->nr_urb_entries;
    sf.thread4.urb_entry_allocation_size = key->sfsize - 1;
 
-   /* Each SF thread produces 1 PUE, and there can be up to 24(Pre-IGDNG) or 
-    * 48(IGDNG) threads 
+   /* Each SF thread produces 1 PUE, and there can be up to 24 (Pre-Ironlake) or
+    * 48 (Ironlake) threads.
     */
-   if (BRW_IS_IGDNG(brw))
+   if (intel->is_ironlake)
       chipset_max_threads = 48;
    else
       chipset_max_threads = 24;
@@ -287,7 +290,7 @@ sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key,
 
    /* might be BRW_NEW_PRIMITIVE if we have to adjust pv for polygons:
     */
-   if (key->provoking_vertex == GL_LAST_VERTEX_CONVENTION) {
+   if (!key->pv_first) {
       sf.sf7.trifan_pv = 2;
       sf.sf7.linestrip_pv = 1;
       sf.sf7.tristrip_pv = 2;
diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h
index d639656b9d..9c9d145c4b 100644
--- a/src/mesa/drivers/dri/i965/brw_state.h
+++ b/src/mesa/drivers/dri/i965/brw_state.h
@@ -35,7 +35,7 @@
 
 #include "brw_context.h"
 
-static inline void
+static INLINE void
 brw_add_validated_bo(struct brw_context *brw, dri_bo *bo)
 {
    assert(brw->state.validated_bo_count < ARRAY_SIZE(brw->state.validated_bos));
@@ -112,6 +112,7 @@ void brw_validate_state(struct brw_context *brw);
 void brw_upload_state(struct brw_context *brw);
 void brw_init_state(struct brw_context *brw);
 void brw_destroy_state(struct brw_context *brw);
+void brw_clear_validated_bos(struct brw_context *brw);
 
 /***********************************************************************
  * brw_state_cache.c
@@ -119,16 +120,10 @@ void brw_destroy_state(struct brw_context *brw);
 dri_bo *brw_cache_data(struct brw_cache *cache,
 		       enum brw_cache_id cache_id,
 		       const void *data,
+		       GLuint size,
 		       dri_bo **reloc_bufs,
 		       GLuint nr_reloc_bufs);
 
-dri_bo *brw_cache_data_sz(struct brw_cache *cache,
-			  enum brw_cache_id cache_id,
-			  const void *data,
-			  GLuint data_size,
-			  dri_bo **reloc_bufs,
-			  GLuint nr_reloc_bufs);
-
 dri_bo *brw_upload_cache( struct brw_cache *cache,
 			  enum brw_cache_id cache_id,
 			  const void *key,
@@ -156,7 +151,7 @@ void brw_state_cache_bo_delete(struct brw_cache *cache, dri_bo *bo);
 /***********************************************************************
  * brw_state_batch.c
  */
-#define BRW_BATCH_STRUCT(brw, s) intel_batchbuffer_data( brw->intel.batch, (s), sizeof(*(s)), IGNORE_CLIPRECTS)
+#define BRW_BATCH_STRUCT(brw, s) intel_batchbuffer_data( brw->intel.batch, (s), sizeof(*(s)))
 #define BRW_CACHED_BATCH_STRUCT(brw, s) brw_cached_batch_struct( brw, (s), sizeof(*(s)) )
 
 GLboolean brw_cached_batch_struct( struct brw_context *brw,
diff --git a/src/mesa/drivers/dri/i965/brw_state_batch.c b/src/mesa/drivers/dri/i965/brw_state_batch.c
index 7821898cf9..ed8120d617 100644
--- a/src/mesa/drivers/dri/i965/brw_state_batch.c
+++ b/src/mesa/drivers/dri/i965/brw_state_batch.c
@@ -48,7 +48,7 @@ GLboolean brw_cached_batch_struct( struct brw_context *brw,
    struct header *newheader = (struct header *)data;
 
    if (brw->emit_state_always) {
-      intel_batchbuffer_data(brw->intel.batch, data, sz, IGNORE_CLIPRECTS);
+      intel_batchbuffer_data(brw->intel.batch, data, sz);
       return GL_TRUE;
    }
 
@@ -75,7 +75,7 @@ GLboolean brw_cached_batch_struct( struct brw_context *brw,
 
  emit:
    memcpy(item->header, newheader, sz);
-   intel_batchbuffer_data(brw->intel.batch, data, sz, IGNORE_CLIPRECTS);
+   intel_batchbuffer_data(brw->intel.batch, data, sz);
    return GL_TRUE;
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_state_cache.c b/src/mesa/drivers/dri/i965/brw_state_cache.c
index c262e1db8b..e4c9ba7d87 100644
--- a/src/mesa/drivers/dri/i965/brw_state_cache.c
+++ b/src/mesa/drivers/dri/i965/brw_state_cache.c
@@ -245,7 +245,6 @@ brw_upload_cache( struct brw_cache *cache,
 
    item->bo = bo;
    dri_bo_reference(bo);
-   item->data_size = data_size;
 
    if (cache->n_items > cache->size * 1.5)
       rehash(cache);
@@ -275,15 +274,22 @@ brw_upload_cache( struct brw_cache *cache,
 
 
 /**
- * This doesn't really work with aux data.  Use search/upload instead
+ * Caches data_size bytes of data, with the data itself serving as the key.
+ *
+ * If nr_reloc_bufs is nonzero, brw_search_cache()/brw_upload_cache() would be
+ * better to use, as the potentially changing offsets in the data-used-as-key
+ * will result in excessive cache misses.
+ *
+ * If aux data is involved, use search/upload instead.
+
  */
 dri_bo *
-brw_cache_data_sz(struct brw_cache *cache,
-		  enum brw_cache_id cache_id,
-		  const void *data,
-		  GLuint data_size,
-		  dri_bo **reloc_bufs,
-		  GLuint nr_reloc_bufs)
+brw_cache_data(struct brw_cache *cache,
+	       enum brw_cache_id cache_id,
+	       const void *data,
+	       GLuint data_size,
+	       dri_bo **reloc_bufs,
+	       GLuint nr_reloc_bufs)
 {
    dri_bo *bo;
    struct brw_cache_item *item;
@@ -306,25 +312,6 @@ brw_cache_data_sz(struct brw_cache *cache,
    return bo;
 }
 
-
-/**
- * Wrapper around brw_cache_data_sz using the cache_id's canonical key size.
- *
- * If nr_reloc_bufs is nonzero, brw_search_cache()/brw_upload_cache() would be
- * better to use, as the potentially changing offsets in the data-used-as-key
- * will result in excessive cache misses.
- */
-dri_bo *
-brw_cache_data(struct brw_cache *cache,
-	       enum brw_cache_id cache_id,
-	       const void *data,
-	       dri_bo **reloc_bufs,
-	       GLuint nr_reloc_bufs)
-{
-   return brw_cache_data_sz(cache, cache_id, data, cache->key_size[cache_id],
-			    reloc_bufs, nr_reloc_bufs);
-}
-
 enum pool_type {
    DW_SURFACE_STATE,
    DW_GENERAL_STATE
@@ -335,11 +322,9 @@ static void
 brw_init_cache_id(struct brw_cache *cache,
                   const char *name,
                   enum brw_cache_id id,
-                  GLuint key_size,
                   GLuint aux_size)
 {
    cache->name[id] = strdup(name);
-   cache->key_size[id] = key_size;
    cache->aux_size[id] = aux_size;
 }
 
@@ -359,91 +344,76 @@ brw_init_non_surface_cache(struct brw_context *brw)
    brw_init_cache_id(cache,
 		     "CC_VP",
 		     BRW_CC_VP,
-		     sizeof(struct brw_cc_viewport),
 		     0);
 
    brw_init_cache_id(cache,
 		     "CC_UNIT",
 		     BRW_CC_UNIT,
-		     sizeof(struct brw_cc_unit_state),
 		     0);
 
    brw_init_cache_id(cache,
 		     "WM_PROG",
 		     BRW_WM_PROG,
-		     sizeof(struct brw_wm_prog_key),
 		     sizeof(struct brw_wm_prog_data));
 
    brw_init_cache_id(cache,
 		     "SAMPLER_DEFAULT_COLOR",
 		     BRW_SAMPLER_DEFAULT_COLOR,
-		     sizeof(struct brw_sampler_default_color),
 		     0);
 
    brw_init_cache_id(cache,
 		     "SAMPLER",
 		     BRW_SAMPLER,
-		     0,		/* variable key/data size */
 		     0);
 
    brw_init_cache_id(cache,
 		     "WM_UNIT",
 		     BRW_WM_UNIT,
-		     sizeof(struct brw_wm_unit_state),
 		     0);
 
    brw_init_cache_id(cache,
 		     "SF_PROG",
 		     BRW_SF_PROG,
-		     sizeof(struct brw_sf_prog_key),
 		     sizeof(struct brw_sf_prog_data));
 
    brw_init_cache_id(cache,
 		     "SF_VP",
 		     BRW_SF_VP,
-		     sizeof(struct brw_sf_viewport),
 		     0);
 
    brw_init_cache_id(cache,
 		     "SF_UNIT",
 		     BRW_SF_UNIT,
-		     sizeof(struct brw_sf_unit_state),
 		     0);
 
    brw_init_cache_id(cache,
 		     "VS_UNIT",
 		     BRW_VS_UNIT,
-		     sizeof(struct brw_vs_unit_state),
 		     0);
 
    brw_init_cache_id(cache,
 		     "VS_PROG",
 		     BRW_VS_PROG,
-		     sizeof(struct brw_vs_prog_key),
 		     sizeof(struct brw_vs_prog_data));
 
    brw_init_cache_id(cache,
 		     "CLIP_UNIT",
 		     BRW_CLIP_UNIT,
-		     sizeof(struct brw_clip_unit_state),
 		     0);
 
    brw_init_cache_id(cache,
 		     "CLIP_PROG",
 		     BRW_CLIP_PROG,
-		     sizeof(struct brw_clip_prog_key),
 		     sizeof(struct brw_clip_prog_data));
 
    brw_init_cache_id(cache,
 		     "GS_UNIT",
 		     BRW_GS_UNIT,
-		     sizeof(struct brw_gs_unit_state),
 		     0);
 
    brw_init_cache_id(cache,
 		     "GS_PROG",
 		     BRW_GS_PROG,
-		     sizeof(struct brw_gs_prog_key),
 		     sizeof(struct brw_gs_prog_data));
 }
 
@@ -463,13 +433,11 @@ brw_init_surface_cache(struct brw_context *brw)
    brw_init_cache_id(cache,
 		     "SS_SURFACE",
 		     BRW_SS_SURFACE,
-		     sizeof(struct brw_surface_state),
 		     0);
 
    brw_init_cache_id(cache,
 		     "SS_SURF_BIND",
 		     BRW_SS_SURF_BIND,
-		     0,
 		     0);
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c
index f4283bda1b..af8dfb4c15 100644
--- a/src/mesa/drivers/dri/i965/brw_state_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_state_upload.c
@@ -143,7 +143,7 @@ static void xor_states( struct brw_state_flags *result,
    result->cache = a->cache ^ b->cache;
 }
 
-static void
+void
 brw_clear_validated_bos(struct brw_context *brw)
 {
    int i;
diff --git a/src/mesa/drivers/dri/i965/brw_tex_layout.c b/src/mesa/drivers/dri/i965/brw_tex_layout.c
index e59e52ed86..64a9535282 100644
--- a/src/mesa/drivers/dri/i965/brw_tex_layout.c
+++ b/src/mesa/drivers/dri/i965/brw_tex_layout.c
@@ -49,7 +49,7 @@ GLboolean brw_miptree_layout(struct intel_context *intel,
 
    switch (mt->target) {
    case GL_TEXTURE_CUBE_MAP:
-      if (IS_IGDNG(intel->intelScreen->deviceID)) {
+      if (intel->is_ironlake) {
           GLuint align_h = 2, align_w = 4;
           GLuint level;
           GLuint x = 0;
diff --git a/src/mesa/drivers/dri/i965/brw_urb.c b/src/mesa/drivers/dri/i965/brw_urb.c
index 8c6f4355a6..f2cdb203b8 100644
--- a/src/mesa/drivers/dri/i965/brw_urb.c
+++ b/src/mesa/drivers/dri/i965/brw_urb.c
@@ -105,7 +105,8 @@ static GLboolean check_urb_layout( struct brw_context *brw )
    brw->urb.sf_start = brw->urb.clip_start + brw->urb.nr_clip_entries * brw->urb.vsize;
    brw->urb.cs_start = brw->urb.sf_start + brw->urb.nr_sf_entries * brw->urb.sfsize;
 
-   return brw->urb.cs_start + brw->urb.nr_cs_entries * brw->urb.csize <= URB_SIZES(brw);
+   return brw->urb.cs_start + brw->urb.nr_cs_entries *
+      brw->urb.csize <= brw->urb.size;
 }
 
 /* Most minimal update, forces re-emit of URB fence packet after GS
@@ -113,6 +114,7 @@ static GLboolean check_urb_layout( struct brw_context *brw )
  */
 static void recalculate_urb_fence( struct brw_context *brw )
 {
+   struct intel_context *intel = &brw->intel;
    GLuint csize = brw->curbe.total_size;
    GLuint vsize = brw->vs.prog_data->urb_entry_size;
    GLuint sfsize = brw->sf.prog_data->urb_entry_size;
@@ -146,7 +148,7 @@ static void recalculate_urb_fence( struct brw_context *brw )
 
       brw->urb.constrained = 0;
 
-      if (BRW_IS_IGDNG(brw)) {
+      if (intel->is_ironlake) {
          brw->urb.nr_vs_entries = 128;
          brw->urb.nr_sf_entries = 48;
          if (check_urb_layout(brw)) {
@@ -156,7 +158,7 @@ static void recalculate_urb_fence( struct brw_context *brw )
             brw->urb.nr_vs_entries = limits[VS].preferred_nr_entries;
             brw->urb.nr_sf_entries = limits[SF].preferred_nr_entries;
          }
-      } else if (BRW_IS_G4X(brw)) {
+      } else if (intel->is_g4x) {
 	 brw->urb.nr_vs_entries = 64;
 	 if (check_urb_layout(brw)) {
 	    goto done;
@@ -200,7 +202,7 @@ done:
 		      brw->urb.clip_start,
 		      brw->urb.sf_start,
 		      brw->urb.cs_start, 
-		      URB_SIZES(brw));
+		      brw->urb.size);
       
       brw->state.dirty.brw |= BRW_NEW_URB_FENCE;
    }
@@ -244,7 +246,7 @@ void brw_upload_urb_fence(struct brw_context *brw)
    uf.bits0.gs_fence  = brw->urb.clip_start; 
    uf.bits0.clp_fence = brw->urb.sf_start; 
    uf.bits1.sf_fence  = brw->urb.cs_start; 
-   uf.bits1.cs_fence  = URB_SIZES(brw);
+   uf.bits1.cs_fence  = brw->urb.size;
 
    BRW_BATCH_STRUCT(brw, &uf);
 }
diff --git a/src/mesa/drivers/dri/i965/brw_util.c b/src/mesa/drivers/dri/i965/brw_util.c
index ce21aa4869..bba9249d1b 100644
--- a/src/mesa/drivers/dri/i965/brw_util.c
+++ b/src/mesa/drivers/dri/i965/brw_util.c
@@ -35,7 +35,7 @@
 #include "brw_util.h"
 #include "brw_defines.h"
 
-GLuint brw_count_bits( GLuint val )
+GLuint brw_count_bits(uint64_t val)
 {
    GLuint i;
    for (i = 0; val ; val >>= 1)
diff --git a/src/mesa/drivers/dri/i965/brw_util.h b/src/mesa/drivers/dri/i965/brw_util.h
index 33e7cd87e4..04f3175d3e 100644
--- a/src/mesa/drivers/dri/i965/brw_util.h
+++ b/src/mesa/drivers/dri/i965/brw_util.h
@@ -35,7 +35,7 @@
 
 #include "main/mtypes.h"
 
-extern GLuint brw_count_bits( GLuint val );
+extern GLuint brw_count_bits(uint64_t val);
 extern GLuint brw_parameter_list_state_flags(struct gl_program_parameter_list *paramList);
 extern GLuint brw_translate_blend_factor( GLenum factor );
 extern GLuint brw_translate_blend_equation( GLenum mode );
diff --git a/src/mesa/drivers/dri/i965/brw_vs.c b/src/mesa/drivers/dri/i965/brw_vs.c
index f0c79efbd9..fd055e225e 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.c
+++ b/src/mesa/drivers/dri/i965/brw_vs.c
@@ -56,7 +56,7 @@ static void do_vs_prog( struct brw_context *brw,
    c.prog_data.inputs_read = vp->program.Base.InputsRead;
 
    if (c.key.copy_edgeflag) {
-      c.prog_data.outputs_written |= 1<<VERT_RESULT_EDGE;
+      c.prog_data.outputs_written |= BITFIELD64_BIT(VERT_RESULT_EDGE);
       c.prog_data.inputs_read |= 1<<VERT_ATTRIB_EDGEFLAG;
    }
 
diff --git a/src/mesa/drivers/dri/i965/brw_vs_emit.c b/src/mesa/drivers/dri/i965/brw_vs_emit.c
index 1638ef8111..1b84dd505f 100644
--- a/src/mesa/drivers/dri/i965/brw_vs_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_vs_emit.c
@@ -67,6 +67,7 @@ static void release_tmps( struct brw_vs_compile *c )
  */
 static void brw_vs_alloc_regs( struct brw_vs_compile *c )
 {
+   struct intel_context *intel = &c->func.brw->intel;
    GLuint i, reg = 0, mrf;
    int attributes_in_vue;
 
@@ -141,13 +142,13 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
    c->first_output = reg;
    c->first_overflow_output = 0;
 
-   if (BRW_IS_IGDNG(c->func.brw))
+   if (intel->is_ironlake)
        mrf = 8;
    else
        mrf = 4;
 
    for (i = 0; i < VERT_RESULT_MAX; i++) {
-      if (c->prog_data.outputs_written & (1 << i)) {
+      if (c->prog_data.outputs_written & BITFIELD64_BIT(i)) {
 	 c->nr_outputs++;
          assert(i < Elements(c->regs[PROGRAM_OUTPUT]));
 	 if (i == VERT_RESULT_HPOS) {
@@ -238,7 +239,7 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
     */
    attributes_in_vue = MAX2(c->nr_outputs, c->nr_inputs);
 
-   if (BRW_IS_IGDNG(c->func.brw))
+   if (intel->is_ironlake)
        c->prog_data.urb_entry_size = (attributes_in_vue + 6 + 3) / 4;
    else
        c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 3) / 4;
@@ -331,63 +332,76 @@ static void unalias3( struct brw_vs_compile *c,
    }
 }
 
-static void emit_sop( struct brw_compile *p,
+static void emit_sop( struct brw_vs_compile *c,
                       struct brw_reg dst,
                       struct brw_reg arg0,
                       struct brw_reg arg1, 
 		      GLuint cond)
 {
+   struct brw_compile *p = &c->func;
+
    brw_MOV(p, dst, brw_imm_f(0.0f));
    brw_CMP(p, brw_null_reg(), cond, arg0, arg1);
    brw_MOV(p, dst, brw_imm_f(1.0f));
    brw_set_predicate_control_flag_value(p, 0xff);
 }
 
-static void emit_seq( struct brw_compile *p,
+static void emit_seq( struct brw_vs_compile *c,
                       struct brw_reg dst,
                       struct brw_reg arg0,
                       struct brw_reg arg1 )
 {
-   emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_EQ);
+   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_EQ);
 }
 
-static void emit_sne( struct brw_compile *p,
+static void emit_sne( struct brw_vs_compile *c,
                       struct brw_reg dst,
                       struct brw_reg arg0,
                       struct brw_reg arg1 )
 {
-   emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_NEQ);
+   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_NEQ);
 }
-static void emit_slt( struct brw_compile *p, 
+static void emit_slt( struct brw_vs_compile *c,
 		      struct brw_reg dst,
 		      struct brw_reg arg0,
 		      struct brw_reg arg1 )
 {
-   emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_L);
+   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_L);
 }
 
-static void emit_sle( struct brw_compile *p, 
+static void emit_sle( struct brw_vs_compile *c,
 		      struct brw_reg dst,
 		      struct brw_reg arg0,
 		      struct brw_reg arg1 )
 {
-   emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_LE);
+   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_LE);
 }
 
-static void emit_sgt( struct brw_compile *p, 
+static void emit_sgt( struct brw_vs_compile *c,
 		      struct brw_reg dst,
 		      struct brw_reg arg0,
 		      struct brw_reg arg1 )
 {
-   emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_G);
+   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_G);
 }
 
-static void emit_sge( struct brw_compile *p, 
+static void emit_sge( struct brw_vs_compile *c,
 		      struct brw_reg dst,
 		      struct brw_reg arg0,
 		      struct brw_reg arg1 )
 {
-  emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_GE);
+  emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_GE);
+}
+
+static void emit_cmp( struct brw_compile *p,
+		      struct brw_reg dst,
+		      struct brw_reg arg0,
+		      struct brw_reg arg1,
+		      struct brw_reg arg2 )
+{
+   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, brw_imm_f(0));
+   brw_SEL(p, dst, arg1, arg2);
+   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 }
 
 static void emit_max( struct brw_compile *p, 
@@ -912,6 +926,7 @@ get_src_reg( struct brw_vs_compile *c,
    case PROGRAM_CONSTANT:
    case PROGRAM_UNIFORM:
    case PROGRAM_ENV_PARAM:
+   case PROGRAM_LOCAL_PARAM:
       if (c->vp->use_const_buffer) {
          return get_constant(c, inst, argIndex);
       }
@@ -930,7 +945,6 @@ get_src_reg( struct brw_vs_compile *c,
       /* this is a normal case since we loop over all three src args */
       return brw_null_reg();
 
-   case PROGRAM_LOCAL_PARAM: 
    case PROGRAM_WRITE_ONLY:
    default:
       assert(0);
@@ -1100,6 +1114,8 @@ static void emit_swz( struct brw_vs_compile *c,
 static void emit_vertex_write( struct brw_vs_compile *c)
 {
    struct brw_compile *p = &c->func;
+   struct brw_context *brw = p->brw;
+   struct intel_context *intel = &brw->intel;
    struct brw_reg m0 = brw_message_reg(0);
    struct brw_reg pos = c->regs[PROGRAM_OUTPUT][VERT_RESULT_HPOS];
    struct brw_reg ndc;
@@ -1122,8 +1138,8 @@ static void emit_vertex_write( struct brw_vs_compile *c)
    /* Update the header for point size, user clipping flags, and -ve rhw
     * workaround.
     */
-   if ((c->prog_data.outputs_written & (1<<VERT_RESULT_PSIZ)) ||
-       c->key.nr_userclip || BRW_IS_965(p->brw))
+   if ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
+       c->key.nr_userclip || brw->has_negative_rhw_bug)
    {
       struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
       GLuint i;
@@ -1132,7 +1148,7 @@ static void emit_vertex_write( struct brw_vs_compile *c)
 
       brw_set_access_mode(p, BRW_ALIGN_16);	
 
-      if (c->prog_data.outputs_written & (1<<VERT_RESULT_PSIZ)) {
+      if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
 	 struct brw_reg psiz = c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ];
 	 brw_MUL(p, brw_writemask(header1, WRITEMASK_W), brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
 	 brw_AND(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(0x7ff<<8));
@@ -1154,7 +1170,7 @@ static void emit_vertex_write( struct brw_vs_compile *c)
        * Later, clipping will detect ucp[6] and ensure the primitive is
        * clipped against all fixed planes.
        */
-      if (BRW_IS_965(p->brw)) {
+      if (brw->has_negative_rhw_bug) {
 	 brw_CMP(p,
 		 vec8(brw_null_reg()),
 		 BRW_CONDITIONAL_L,
@@ -1182,8 +1198,8 @@ static void emit_vertex_write( struct brw_vs_compile *c)
    brw_set_access_mode(p, BRW_ALIGN_1);
    brw_MOV(p, offset(m0, 2), ndc);
 
-   if (BRW_IS_IGDNG(p->brw)) {
-       /* There are 20 DWs (D0-D19) in VUE vertex header on IGDNG */
+   if (intel->is_ironlake) {
+       /* There are 20 DWs (D0-D19) in VUE vertex header on Ironlake */
        brw_MOV(p, offset(m0, 3), pos); /* a portion of vertex header */
        /* m4, m5 contain the distances from vertex to the user clip planeXXX. 
         * Seems it is useless for us.
@@ -1222,7 +1238,7 @@ static void emit_vertex_write( struct brw_vs_compile *c)
        */
       GLuint i, mrf = 0;
       for (i = c->first_overflow_output; i < VERT_RESULT_MAX; i++) {
-         if (c->prog_data.outputs_written & (1 << i)) {
+         if (c->prog_data.outputs_written & BITFIELD64_BIT(i)) {
             /* move from GRF to MRF */
             brw_MOV(p, brw_message_reg(4+mrf), c->regs[PROGRAM_OUTPUT][i]);
             mrf++;
@@ -1269,6 +1285,38 @@ post_vs_emit( struct brw_vs_compile *c,
    }
 }
 
+static GLboolean
+accumulator_contains(struct brw_vs_compile *c, struct brw_reg val)
+{
+   struct brw_compile *p = &c->func;
+   struct brw_instruction *prev_insn = &p->store[p->nr_insn - 1];
+
+   if (p->nr_insn == 0)
+      return GL_FALSE;
+
+   if (val.address_mode != BRW_ADDRESS_DIRECT)
+      return GL_FALSE;
+
+   switch (prev_insn->header.opcode) {
+   case BRW_OPCODE_MOV:
+   case BRW_OPCODE_MAC:
+   case BRW_OPCODE_MUL:
+      if (prev_insn->header.access_mode == BRW_ALIGN_16 &&
+	  prev_insn->header.execution_size == val.width &&
+	  prev_insn->bits1.da1.dest_reg_file == val.file &&
+	  prev_insn->bits1.da1.dest_reg_type == val.type &&
+	  prev_insn->bits1.da1.dest_address_mode == val.address_mode &&
+	  prev_insn->bits1.da1.dest_reg_nr == val.nr &&
+	  prev_insn->bits1.da16.dest_subreg_nr == val.subnr / 16 &&
+	  prev_insn->bits1.da16.dest_writemask == 0xf)
+	 return GL_TRUE;
+      else
+	 return GL_FALSE;
+   default:
+      return GL_FALSE;
+   }
+}
+
 static uint32_t
 get_predicate(const struct prog_instruction *inst)
 {
@@ -1314,6 +1362,7 @@ void brw_vs_emit(struct brw_vs_compile *c )
 #define MAX_LOOP_DEPTH 32
    struct brw_compile *p = &c->func;
    struct brw_context *brw = p->brw;
+   struct intel_context *intel = &brw->intel;
    const GLuint nr_insns = c->vp->program.Base.NumInstructions;
    GLuint insn, if_depth = 0, loop_depth = 0;
    GLuint end_offset = 0;
@@ -1447,9 +1496,13 @@ void brw_vs_emit(struct brw_vs_compile *c )
 	 unalias3(c, dst, args[0], args[1], args[2], emit_lrp_noalias);
 	 break;
       case OPCODE_MAD:
-	 brw_MOV(p, brw_acc_reg(), args[2]);
+	 if (!accumulator_contains(c, args[2]))
+	    brw_MOV(p, brw_acc_reg(), args[2]);
 	 brw_MAC(p, dst, args[0], args[1]);
 	 break;
+      case OPCODE_CMP:
+	 emit_cmp(p, dst, args[0], args[1], args[2]);
+	 break;
       case OPCODE_MAX:
 	 emit_max(p, dst, args[0], args[1]);
 	 break;
@@ -1473,25 +1526,25 @@ void brw_vs_emit(struct brw_vs_compile *c )
 	 break;
 
       case OPCODE_SEQ:
-         emit_seq(p, dst, args[0], args[1]);
+         unalias2(c, dst, args[0], args[1], emit_seq);
          break;
       case OPCODE_SIN:
 	 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, args[0], BRW_MATH_PRECISION_FULL);
 	 break;
       case OPCODE_SNE:
-         emit_sne(p, dst, args[0], args[1]);
+         unalias2(c, dst, args[0], args[1], emit_sne);
          break;
       case OPCODE_SGE:
-	 emit_sge(p, dst, args[0], args[1]);
+         unalias2(c, dst, args[0], args[1], emit_sge);
 	 break;
       case OPCODE_SGT:
-         emit_sgt(p, dst, args[0], args[1]);
+         unalias2(c, dst, args[0], args[1], emit_sgt);
          break;
       case OPCODE_SLT:
-	 emit_slt(p, dst, args[0], args[1]);
+         unalias2(c, dst, args[0], args[1], emit_slt);
 	 break;
       case OPCODE_SLE:
-         emit_sle(p, dst, args[0], args[1]);
+         unalias2(c, dst, args[0], args[1], emit_sle);
          break;
       case OPCODE_SUB:
 	 brw_ADD(p, dst, args[0], negate(args[1]));
@@ -1543,7 +1596,7 @@ void brw_vs_emit(struct brw_vs_compile *c )
 
             loop_depth--;
 
-	    if (BRW_IS_IGDNG(brw))
+	    if (intel->is_ironlake)
 	       br = 2;
 
             inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
diff --git a/src/mesa/drivers/dri/i965/brw_vs_state.c b/src/mesa/drivers/dri/i965/brw_vs_state.c
index d790ab6555..345ffa7ee1 100644
--- a/src/mesa/drivers/dri/i965/brw_vs_state.c
+++ b/src/mesa/drivers/dri/i965/brw_vs_state.c
@@ -82,9 +82,9 @@ vs_unit_populate_key(struct brw_context *brw, struct brw_vs_unit_key *key)
 static dri_bo *
 vs_unit_create_from_key(struct brw_context *brw, struct brw_vs_unit_key *key)
 {
+   struct intel_context *intel = &brw->intel;
    struct brw_vs_unit_state vs;
    dri_bo *bo;
-   int chipset_max_threads;
 
    memset(&vs, 0, sizeof(vs));
 
@@ -98,7 +98,7 @@ vs_unit_create_from_key(struct brw_context *brw, struct brw_vs_unit_key *key)
     */
    vs.thread1.single_program_flow = 0;
 
-   if (BRW_IS_IGDNG(brw))
+   if (intel->is_ironlake)
       vs.thread1.binding_table_entry_count = 0; /* hardware requirement */
    else
       vs.thread1.binding_table_entry_count = key->nr_surfaces;
@@ -109,24 +109,44 @@ vs_unit_create_from_key(struct brw_context *brw, struct brw_vs_unit_key *key)
    vs.thread3.urb_entry_read_offset = 0;
    vs.thread3.const_urb_entry_read_offset = key->curbe_offset * 2;
 
-   if (BRW_IS_IGDNG(brw))
-       vs.thread4.nr_urb_entries = key->nr_urb_entries >> 2;
-   else
-       vs.thread4.nr_urb_entries = key->nr_urb_entries;
+   if (intel->is_ironlake) {
+      switch (key->nr_urb_entries) {
+      case 8:
+      case 12:
+      case 16:
+      case 32:
+      case 64:
+      case 96:
+      case 128:
+      case 168:
+      case 192:
+      case 224:
+      case 256:
+	 vs.thread4.nr_urb_entries = key->nr_urb_entries >> 2;
+	 break;
+      default:
+	 assert(0);
+      }
+   } else {
+      switch (key->nr_urb_entries) {
+      case 8:
+      case 12:
+      case 16:
+      case 32:
+	 break;
+      case 64:
+	 assert(intel->is_g4x);
+	 break;
+      default:
+	 assert(0);
+      }
+      vs.thread4.nr_urb_entries = key->nr_urb_entries;
+   }
 
    vs.thread4.urb_entry_allocation_size = key->urb_size - 1;
 
-   if (BRW_IS_IGDNG(brw))
-      chipset_max_threads = 72;
-   else if (BRW_IS_G4X(brw))
-      chipset_max_threads = 32;
-   else
-      chipset_max_threads = 16;
    vs.thread4.max_threads = CLAMP(key->nr_urb_entries / 2,
-				  1, chipset_max_threads) - 1;
-
-   if (INTEL_DEBUG & DEBUG_SINGLE_THREAD)
-      vs.thread4.max_threads = 0;
+				  1, brw->vs_max_threads) - 1;
 
    /* No samplers for ARB_vp programs:
     */
diff --git a/src/mesa/drivers/dri/i965/brw_vs_surface_state.c b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
index 4fa3269bed..3bc9840a97 100644
--- a/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
@@ -52,6 +52,7 @@ brw_vs_update_constant_buffer(struct brw_context *brw)
    const struct gl_program_parameter_list *params = vp->program.Base.Parameters;
    const int size = params->NumParameters * 4 * sizeof(GLfloat);
    drm_intel_bo *const_buffer;
+   int i;
 
    /* BRW_NEW_VERTEX_PROGRAM */
    if (!vp->use_const_buffer)
@@ -61,7 +62,19 @@ brw_vs_update_constant_buffer(struct brw_context *brw)
 				     size, 64);
 
    /* _NEW_PROGRAM_CONSTANTS */
-   dri_bo_subdata(const_buffer, 0, size, params->ParameterValues);
+
+   /* Updates the ParameterValues[i] pointers for all parameters of the
+    * basic type of PROGRAM_STATE_VAR.
+    */
+   _mesa_load_state_parameters(&brw->intel.ctx, vp->program.Base.Parameters);
+
+   intel_bo_map_gtt_preferred(intel, const_buffer, GL_TRUE);
+   for (i = 0; i < params->NumParameters; i++) {
+      memcpy(const_buffer->virtual + i * 4 * sizeof(float),
+	     params->ParameterValues[i],
+	     4 * sizeof(float));
+   }
+   intel_bo_unmap_gtt_preferred(intel, const_buffer);
 
    return const_buffer;
 }
diff --git a/src/mesa/drivers/dri/i965/brw_vtbl.c b/src/mesa/drivers/dri/i965/brw_vtbl.c
index 124fde25fe..72749b3859 100644
--- a/src/mesa/drivers/dri/i965/brw_vtbl.c
+++ b/src/mesa/drivers/dri/i965/brw_vtbl.c
@@ -46,7 +46,7 @@
 #include "brw_state.h"
 #include "brw_fallback.h"
 #include "brw_vs.h"
-
+#include "brw_wm.h"
 
 static void
 dri_bo_release(dri_bo **bo)
@@ -66,8 +66,14 @@ static void brw_destroy_context( struct intel_context *intel )
 
    brw_destroy_state(brw);
    brw_draw_destroy( brw );
-
-   _mesa_free(brw->wm.compile_data);
+   brw_clear_validated_bos(brw);
+   if (brw->wm.compile_data) {
+      _mesa_free(brw->wm.compile_data->instruction);
+      _mesa_free(brw->wm.compile_data->vreg);
+      _mesa_free(brw->wm.compile_data->refs);
+      _mesa_free(brw->wm.compile_data->prog_instructions);
+      _mesa_free(brw->wm.compile_data);
+   }
 
    for (i = 0; i < brw->state.nr_color_regions; i++)
       intel_region_release(&brw->state.color_regions[i]);
@@ -144,9 +150,6 @@ static void brw_new_batch( struct intel_context *intel )
 {
    struct brw_context *brw = brw_context(&intel->ctx);
 
-   /* Check that we didn't just wrap our batchbuffer at a bad time. */
-   assert(!brw->no_batch_wrap);
-
    brw->curbe.need_new_bo = GL_TRUE;
 
    /* Mark all context state as needing to be re-emitted.
@@ -175,20 +178,6 @@ static void brw_note_fence( struct intel_context *intel, GLuint fence )
    brw_context(&intel->ctx)->state.dirty.brw |= BRW_NEW_FENCE;
 }
 
-/* called from intelWaitForIdle() and intelFlush()
- *
- * For now, just flush everything.  Could be smarter later.
- */
-static GLuint brw_flush_cmd( void )
-{
-   struct brw_mi_flush flush;
-   flush.opcode = CMD_MI_FLUSH;
-   flush.pad = 0;
-   flush.flags = BRW_FLUSH_STATE_CACHE;
-   return *(GLuint *)&flush;
-}
-
-
 static void brw_invalidate_state( struct intel_context *intel, GLuint new_state )
 {
    /* nothing */
@@ -209,6 +198,5 @@ void brwInitVtbl( struct brw_context *brw )
    brw->intel.vtbl.finish_batch = brw_finish_batch;
    brw->intel.vtbl.destroy = brw_destroy_context;
    brw->intel.vtbl.set_draw_region = brw_set_draw_region;
-   brw->intel.vtbl.flush_cmd = brw_flush_cmd;
    brw->intel.vtbl.debug_batch = brw_debug_batch;
 }
diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c
index 964ee104c2..6895f64410 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.c
+++ b/src/mesa/drivers/dri/i965/brw_wm.c
@@ -152,8 +152,21 @@ static void do_wm_prog( struct brw_context *brw,
           */
          return;
       }
+      c->instruction = _mesa_calloc(BRW_WM_MAX_INSN * sizeof(*c->instruction));
+      c->prog_instructions = _mesa_calloc(BRW_WM_MAX_INSN *
+					  sizeof(*c->prog_instructions));
+      c->vreg = _mesa_calloc(BRW_WM_MAX_VREG * sizeof(*c->vreg));
+      c->refs = _mesa_calloc(BRW_WM_MAX_REF * sizeof(*c->refs));
    } else {
+      void *instruction = c->instruction;
+      void *prog_instructions = c->prog_instructions;
+      void *vreg = c->vreg;
+      void *refs = c->refs;
       memset(c, 0, sizeof(*brw->wm.compile_data));
+      c->instruction = instruction;
+      c->prog_instructions = prog_instructions;
+      c->vreg = vreg;
+      c->refs = refs;
    }
    memcpy(&c->key, key, sizeof(*key));
 
@@ -217,7 +230,7 @@ static void brw_wm_populate_key( struct brw_context *brw,
        ctx->Color.AlphaEnabled)
       lookup |= IZ_PS_KILL_ALPHATEST_BIT;
 
-   if (fp->program.Base.OutputsWritten & (1<<FRAG_RESULT_DEPTH))
+   if (fp->program.Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
       lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
 
    /* _NEW_DEPTH */
@@ -333,7 +346,7 @@ static void brw_wm_populate_key( struct brw_context *brw,
    key->nr_color_regions = brw->state.nr_color_regions;
 
    /* CACHE_NEW_VS_PROG */
-   key->vp_outputs_written = brw->vs.prog_data->outputs_written & DO_SETUP_BITS;
+   key->vp_outputs_written = brw->vs.prog_data->outputs_written;
 
    /* The unique fragment program ID */
    key->program_string_id = fp->id;
diff --git a/src/mesa/drivers/dri/i965/brw_wm.h b/src/mesa/drivers/dri/i965/brw_wm.h
index 47aa4da306..b9b987ea70 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.h
+++ b/src/mesa/drivers/dri/i965/brw_wm.h
@@ -76,10 +76,10 @@ struct brw_wm_prog_key {
 
    GLushort tex_swizzles[BRW_MAX_TEX_UNIT];
 
-   GLuint program_string_id:32;
    GLushort origin_x, origin_y;
    GLushort drawable_height;
-   GLuint vp_outputs_written;
+   GLbitfield64 vp_outputs_written;
+   GLuint program_string_id:32;
 };
 
 
@@ -162,6 +162,8 @@ struct brw_wm_instruction {
 #define BRW_WM_MAX_CONST 256
 #define BRW_WM_MAX_SUBROUTINE 16
 
+/* Saturate flag, OR'ed into write masks alongside the WRITEMASK_* bits. */
+#define SATURATE (1<<5)
 
 
 /* New opcodes to track internal operations required for WM unit.
@@ -200,7 +202,7 @@ struct brw_wm_compile {
     * simplifying and adding instructions for interpolation and
     * framebuffer writes.
     */
-   struct prog_instruction prog_instructions[BRW_WM_MAX_INSN];
+   struct prog_instruction *prog_instructions;
    GLuint nr_fp_insns;
    GLuint fp_temp;
    GLuint fp_interp_emitted;
@@ -211,7 +213,7 @@ struct brw_wm_compile {
    struct prog_src_register pixel_w;
 
 
-   struct brw_wm_value vreg[BRW_WM_MAX_VREG];
+   struct brw_wm_value *vreg;
    GLuint nr_vreg;
 
    struct brw_wm_value creg[BRW_WM_MAX_PARAM];
@@ -228,10 +230,10 @@ struct brw_wm_compile {
    struct brw_wm_ref undef_ref;
    struct brw_wm_value undef_value;
 
-   struct brw_wm_ref refs[BRW_WM_MAX_REF];
+   struct brw_wm_ref *refs;
    GLuint nr_refs;
 
-   struct brw_wm_instruction instruction[BRW_WM_MAX_INSN];
+   struct brw_wm_instruction *instruction;
    GLuint nr_insns;
 
    struct brw_wm_constref constref[BRW_WM_MAX_CONST];
@@ -306,10 +308,141 @@ void brw_wm_lookup_iz( GLuint line_aa,
 GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp);
 void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c);
 
+/* brw_wm_emit.c */
+void emit_alu1(struct brw_compile *p,
+	       struct brw_instruction *(*func)(struct brw_compile *,
+					       struct brw_reg,
+					       struct brw_reg),
+	       const struct brw_reg *dst,
+	       GLuint mask,
+	       const struct brw_reg *arg0);
+void emit_alu2(struct brw_compile *p,
+	       struct brw_instruction *(*func)(struct brw_compile *,
+					       struct brw_reg,
+					       struct brw_reg,
+					       struct brw_reg),
+	       const struct brw_reg *dst,
+	       GLuint mask,
+	       const struct brw_reg *arg0,
+	       const struct brw_reg *arg1);
+void emit_cinterp(struct brw_compile *p,
+		  const struct brw_reg *dst,
+		  GLuint mask,
+		  const struct brw_reg *arg0);
 void emit_ddxy(struct brw_compile *p,
 	       const struct brw_reg *dst,
 	       GLuint mask,
 	       GLboolean is_ddx,
 	       const struct brw_reg *arg0);
+void emit_delta_xy(struct brw_compile *p,
+		   const struct brw_reg *dst,
+		   GLuint mask,
+		   const struct brw_reg *arg0);
+void emit_dp3(struct brw_compile *p,
+	      const struct brw_reg *dst,
+	      GLuint mask,
+	      const struct brw_reg *arg0,
+	      const struct brw_reg *arg1);
+void emit_dp4(struct brw_compile *p,
+	      const struct brw_reg *dst,
+	      GLuint mask,
+	      const struct brw_reg *arg0,
+	      const struct brw_reg *arg1);
+void emit_dph(struct brw_compile *p,
+	      const struct brw_reg *dst,
+	      GLuint mask,
+	      const struct brw_reg *arg0,
+	      const struct brw_reg *arg1);
+void emit_fb_write(struct brw_wm_compile *c,
+		   struct brw_reg *arg0,
+		   struct brw_reg *arg1,
+		   struct brw_reg *arg2,
+		   GLuint target,
+		   GLuint eot);
+void emit_frontfacing(struct brw_compile *p,
+		      const struct brw_reg *dst,
+		      GLuint mask);
+void emit_linterp(struct brw_compile *p,
+		  const struct brw_reg *dst,
+		  GLuint mask,
+		  const struct brw_reg *arg0,
+		  const struct brw_reg *deltas);
+void emit_lrp(struct brw_compile *p,
+	      const struct brw_reg *dst,
+	      GLuint mask,
+	      const struct brw_reg *arg0,
+	      const struct brw_reg *arg1,
+	      const struct brw_reg *arg2);
+void emit_mad(struct brw_compile *p,
+	      const struct brw_reg *dst,
+	      GLuint mask,
+	      const struct brw_reg *arg0,
+	      const struct brw_reg *arg1,
+	      const struct brw_reg *arg2);
+void emit_math1(struct brw_wm_compile *c,
+		GLuint function,
+		const struct brw_reg *dst,
+		GLuint mask,
+		const struct brw_reg *arg0);
+void emit_math2(struct brw_wm_compile *c,
+		GLuint function,
+		const struct brw_reg *dst,
+		GLuint mask,
+		const struct brw_reg *arg0,
+		const struct brw_reg *arg1);
+void emit_min(struct brw_compile *p,
+	      const struct brw_reg *dst,
+	      GLuint mask,
+	      const struct brw_reg *arg0,
+	      const struct brw_reg *arg1);
+void emit_max(struct brw_compile *p,
+	      const struct brw_reg *dst,
+	      GLuint mask,
+	      const struct brw_reg *arg0,
+	      const struct brw_reg *arg1);
+void emit_pinterp(struct brw_compile *p,
+		  const struct brw_reg *dst,
+		  GLuint mask,
+		  const struct brw_reg *arg0,
+		  const struct brw_reg *deltas,
+		  const struct brw_reg *w);
+void emit_pixel_xy(struct brw_wm_compile *c,
+		   const struct brw_reg *dst,
+		   GLuint mask);
+void emit_pixel_w(struct brw_wm_compile *c,
+		  const struct brw_reg *dst,
+		  GLuint mask,
+		  const struct brw_reg *arg0,
+		  const struct brw_reg *deltas);
+void emit_sop(struct brw_compile *p,
+	      const struct brw_reg *dst,
+	      GLuint mask,
+	      GLuint cond,
+	      const struct brw_reg *arg0,
+	      const struct brw_reg *arg1);
+void emit_tex(struct brw_wm_compile *c,
+	      struct brw_reg *dst,
+	      GLuint dst_flags,
+	      struct brw_reg *arg,
+	      struct brw_reg depth_payload,
+	      GLuint tex_idx,
+	      GLuint sampler,
+	      GLboolean shadow);
+void emit_txb(struct brw_wm_compile *c,
+	      struct brw_reg *dst,
+	      GLuint dst_flags,
+	      struct brw_reg *arg,
+	      struct brw_reg depth_payload,
+	      GLuint tex_idx,
+	      GLuint sampler);
+void emit_wpos_xy(struct brw_wm_compile *c,
+		  const struct brw_reg *dst,
+		  GLuint mask,
+		  const struct brw_reg *arg0);
+void emit_xpd(struct brw_compile *p,
+	      const struct brw_reg *dst,
+	      GLuint mask,
+	      const struct brw_reg *arg0,
+	      const struct brw_reg *arg1);
 
 #endif
diff --git a/src/mesa/drivers/dri/i965/brw_wm_emit.c b/src/mesa/drivers/dri/i965/brw_wm_emit.c
index bf80a2942a..f316e0cda4 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_emit.c
@@ -44,6 +44,7 @@ static INLINE struct brw_reg sechalf( struct brw_reg reg )
    return reg;
 }
 
+
 /* Payload R0:
  *
  * R0.0 -- pixel mask, one bit for each of 4 pixels in 4 tiles,
@@ -60,42 +61,50 @@ static INLINE struct brw_reg sechalf( struct brw_reg reg )
  * R1.8 -- ?
  */
 
-
-static void emit_pixel_xy(struct brw_compile *p,
-			  const struct brw_reg *dst,
-			  GLuint mask)
+void emit_pixel_xy(struct brw_wm_compile *c,
+		   const struct brw_reg *dst,
+		   GLuint mask)
 {
+   struct brw_compile *p = &c->func;
    struct brw_reg r1 = brw_vec1_grf(1, 0);
    struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
+   struct brw_reg dst0_uw, dst1_uw;
 
+   brw_push_insn_state(p);
    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 
+   if (c->dispatch_width == 16) {
+      dst0_uw = vec16(retype(dst[0], BRW_REGISTER_TYPE_UW));
+      dst1_uw = vec16(retype(dst[1], BRW_REGISTER_TYPE_UW));
+   } else {
+      dst0_uw = vec8(retype(dst[0], BRW_REGISTER_TYPE_UW));
+      dst1_uw = vec8(retype(dst[1], BRW_REGISTER_TYPE_UW));
+   }
+
    /* Calculate pixel centers by adding 1 or 0 to each of the
     * micro-tile coordinates passed in r1.
     */
    if (mask & WRITEMASK_X) {
       brw_ADD(p,
-	      vec16(retype(dst[0], BRW_REGISTER_TYPE_UW)),
+	      dst0_uw,
 	      stride(suboffset(r1_uw, 4), 2, 4, 0),
 	      brw_imm_v(0x10101010));
    }
 
    if (mask & WRITEMASK_Y) {
       brw_ADD(p,
-	      vec16(retype(dst[1], BRW_REGISTER_TYPE_UW)),
+	      dst1_uw,
 	      stride(suboffset(r1_uw,5), 2, 4, 0),
 	      brw_imm_v(0x11001100));
    }
-
-   brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
+   brw_pop_insn_state(p);
 }
 
 
-
-static void emit_delta_xy(struct brw_compile *p,
-			  const struct brw_reg *dst,
-			  GLuint mask,
-			  const struct brw_reg *arg0)
+void emit_delta_xy(struct brw_compile *p,
+		   const struct brw_reg *dst,
+		   GLuint mask,
+		   const struct brw_reg *arg0)
 {
    struct brw_reg r1 = brw_vec1_grf(1, 0);
 
@@ -118,10 +127,10 @@ static void emit_delta_xy(struct brw_compile *p,
    }
 }
 
-static void emit_wpos_xy(struct brw_wm_compile *c,
-			 const struct brw_reg *dst,
-			 GLuint mask,
-			 const struct brw_reg *arg0)
+void emit_wpos_xy(struct brw_wm_compile *c,
+		  const struct brw_reg *dst,
+		  GLuint mask,
+		  const struct brw_reg *arg0)
 {
    struct brw_compile *p = &c->func;
 
@@ -146,12 +155,14 @@ static void emit_wpos_xy(struct brw_wm_compile *c,
 }
 
 
-static void emit_pixel_w( struct brw_compile *p,
-			  const struct brw_reg *dst,
-			  GLuint mask,
-			  const struct brw_reg *arg0,
-			  const struct brw_reg *deltas)
+void emit_pixel_w(struct brw_wm_compile *c,
+		  const struct brw_reg *dst,
+		  GLuint mask,
+		  const struct brw_reg *arg0,
+		  const struct brw_reg *deltas)
 {
+   struct brw_compile *p = &c->func;
+
    /* Don't need this if all you are doing is interpolating color, for
     * instance.
     */
@@ -165,21 +176,29 @@ static void emit_pixel_w( struct brw_compile *p,
       brw_MAC(p, brw_message_reg(2), suboffset(interp3, 1), deltas[1]);
 
       /* Calc w */
-      brw_math_16( p, dst[3],
-		   BRW_MATH_FUNCTION_INV,
-		   BRW_MATH_SATURATE_NONE,
-		   2, brw_null_reg(),
-		   BRW_MATH_PRECISION_FULL);
+      if (c->dispatch_width == 16) {
+	 brw_math_16(p, dst[3],
+		     BRW_MATH_FUNCTION_INV,
+		     BRW_MATH_SATURATE_NONE,
+		     2, brw_null_reg(),
+		     BRW_MATH_PRECISION_FULL);
+      } else {
+	 brw_math(p, dst[3],
+		  BRW_MATH_FUNCTION_INV,
+		  BRW_MATH_SATURATE_NONE,
+		  2, brw_null_reg(),
+		  BRW_MATH_DATA_VECTOR,
+		  BRW_MATH_PRECISION_FULL);
+      }
    }
 }
 
 
-
-static void emit_linterp( struct brw_compile *p, 
-			 const struct brw_reg *dst,
-			 GLuint mask,
-			 const struct brw_reg *arg0,
-			 const struct brw_reg *deltas )
+void emit_linterp(struct brw_compile *p,
+		  const struct brw_reg *dst,
+		  GLuint mask,
+		  const struct brw_reg *arg0,
+		  const struct brw_reg *deltas)
 {
    struct brw_reg interp[4];
    GLuint nr = arg0[0].nr;
@@ -199,12 +218,12 @@ static void emit_linterp( struct brw_compile *p,
 }
 
 
-static void emit_pinterp( struct brw_compile *p, 
-			  const struct brw_reg *dst,
-			  GLuint mask,
-			  const struct brw_reg *arg0,
-			  const struct brw_reg *deltas,
-			  const struct brw_reg *w)
+void emit_pinterp(struct brw_compile *p,
+		  const struct brw_reg *dst,
+		  GLuint mask,
+		  const struct brw_reg *arg0,
+		  const struct brw_reg *deltas,
+		  const struct brw_reg *w)
 {
    struct brw_reg interp[4];
    GLuint nr = arg0[0].nr;
@@ -229,10 +248,10 @@ static void emit_pinterp( struct brw_compile *p,
 }
 
 
-static void emit_cinterp( struct brw_compile *p, 
-			 const struct brw_reg *dst,
-			 GLuint mask,
-			 const struct brw_reg *arg0 )
+void emit_cinterp(struct brw_compile *p,
+		  const struct brw_reg *dst,
+		  GLuint mask,
+		  const struct brw_reg *arg0)
 {
    struct brw_reg interp[4];
    GLuint nr = arg0[0].nr;
@@ -251,9 +270,9 @@ static void emit_cinterp( struct brw_compile *p,
 }
 
 /* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */
-static void emit_frontfacing( struct brw_compile *p,
-			      const struct brw_reg *dst,
-			      GLuint mask )
+void emit_frontfacing(struct brw_compile *p,
+		      const struct brw_reg *dst,
+		      GLuint mask)
 {
    struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
    GLuint i;
@@ -352,13 +371,13 @@ void emit_ddxy(struct brw_compile *p,
       brw_set_saturate(p, 0);
 }
 
-static void emit_alu1( struct brw_compile *p, 
-		       struct brw_instruction *(*func)(struct brw_compile *, 
-						       struct brw_reg, 
-						       struct brw_reg),
-		       const struct brw_reg *dst,
-		       GLuint mask,
-		       const struct brw_reg *arg0 )
+void emit_alu1(struct brw_compile *p,
+	       struct brw_instruction *(*func)(struct brw_compile *,
+					       struct brw_reg,
+					       struct brw_reg),
+	       const struct brw_reg *dst,
+	       GLuint mask,
+	       const struct brw_reg *arg0)
 {
    GLuint i;
 
@@ -376,15 +395,15 @@ static void emit_alu1( struct brw_compile *p,
 }
 
 
-static void emit_alu2( struct brw_compile *p, 
-		       struct brw_instruction *(*func)(struct brw_compile *, 
-						       struct brw_reg, 
-						       struct brw_reg, 
-						       struct brw_reg),
-		       const struct brw_reg *dst,
-		       GLuint mask,
-		       const struct brw_reg *arg0,
-		       const struct brw_reg *arg1 )
+void emit_alu2(struct brw_compile *p,
+	       struct brw_instruction *(*func)(struct brw_compile *,
+					       struct brw_reg,
+					       struct brw_reg,
+					       struct brw_reg),
+	       const struct brw_reg *dst,
+	       GLuint mask,
+	       const struct brw_reg *arg0,
+	       const struct brw_reg *arg1)
 {
    GLuint i;
 
@@ -402,12 +421,12 @@ static void emit_alu2( struct brw_compile *p,
 }
 
 
-static void emit_mad( struct brw_compile *p, 
-		      const struct brw_reg *dst,
-		      GLuint mask,
-		      const struct brw_reg *arg0,
-		      const struct brw_reg *arg1,
-		      const struct brw_reg *arg2 )
+void emit_mad(struct brw_compile *p,
+	      const struct brw_reg *dst,
+	      GLuint mask,
+	      const struct brw_reg *arg0,
+	      const struct brw_reg *arg1,
+	      const struct brw_reg *arg2)
 {
    GLuint i;
 
@@ -422,26 +441,12 @@ static void emit_mad( struct brw_compile *p,
    }
 }
 
-static void emit_trunc( struct brw_compile *p,
-		      const struct brw_reg *dst,
-		      GLuint mask,
-		      const struct brw_reg *arg0)
-{
-   GLuint i;
-
-   for (i = 0; i < 4; i++) {
-      if (mask & (1<<i)) {
-	 brw_RNDZ(p, dst[i], arg0[i]);
-      }
-   }
-}
-
-static void emit_lrp( struct brw_compile *p, 
-		      const struct brw_reg *dst,
-		      GLuint mask,
-		      const struct brw_reg *arg0,
-		      const struct brw_reg *arg1,
-		      const struct brw_reg *arg2 )
+void emit_lrp(struct brw_compile *p,
+	      const struct brw_reg *dst,
+	      GLuint mask,
+	      const struct brw_reg *arg0,
+	      const struct brw_reg *arg1,
+	      const struct brw_reg *arg2)
 {
    GLuint i;
 
@@ -461,21 +466,24 @@ static void emit_lrp( struct brw_compile *p,
    }
 }
 
-static void emit_sop( struct brw_compile *p, 
-		      const struct brw_reg *dst,
-		      GLuint mask,
-		      GLuint cond,
-		      const struct brw_reg *arg0,
-		      const struct brw_reg *arg1 )
+void emit_sop(struct brw_compile *p,
+	      const struct brw_reg *dst,
+	      GLuint mask,
+	      GLuint cond,
+	      const struct brw_reg *arg0,
+	      const struct brw_reg *arg1)
 {
    GLuint i;
 
    for (i = 0; i < 4; i++) {
       if (mask & (1<<i)) {	
-	 brw_MOV(p, dst[i], brw_imm_f(0));
+	 brw_push_insn_state(p);
 	 brw_CMP(p, brw_null_reg(), cond, arg0[i], arg1[i]);
+	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+	 brw_MOV(p, dst[i], brw_imm_f(0));
+	 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
 	 brw_MOV(p, dst[i], brw_imm_f(1.0));
-	 brw_set_predicate_control_flag_value(p, 0xff);
+	 brw_pop_insn_state(p);
       }
    }
 }
@@ -559,11 +567,11 @@ static void emit_cmp( struct brw_compile *p,
    }
 }
 
-static void emit_max( struct brw_compile *p, 
-		      const struct brw_reg *dst,
-		      GLuint mask,
-		      const struct brw_reg *arg0,
-		      const struct brw_reg *arg1 )
+void emit_max(struct brw_compile *p,
+	      const struct brw_reg *dst,
+	      GLuint mask,
+	      const struct brw_reg *arg0,
+	      const struct brw_reg *arg1)
 {
    GLuint i;
 
@@ -583,11 +591,11 @@ static void emit_max( struct brw_compile *p,
    }
 }
 
-static void emit_min( struct brw_compile *p, 
-		      const struct brw_reg *dst,
-		      GLuint mask,
-		      const struct brw_reg *arg0,
-		      const struct brw_reg *arg1 )
+void emit_min(struct brw_compile *p,
+	      const struct brw_reg *dst,
+	      GLuint mask,
+	      const struct brw_reg *arg0,
+	      const struct brw_reg *arg1)
 {
    GLuint i;
 
@@ -608,11 +616,11 @@ static void emit_min( struct brw_compile *p,
 }
 
 
-static void emit_dp3( struct brw_compile *p, 
-		      const struct brw_reg *dst,
-		      GLuint mask,
-		      const struct brw_reg *arg0,
-		      const struct brw_reg *arg1 )
+void emit_dp3(struct brw_compile *p,
+	      const struct brw_reg *dst,
+	      GLuint mask,
+	      const struct brw_reg *arg0,
+	      const struct brw_reg *arg1)
 {
    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 
@@ -630,11 +638,11 @@ static void emit_dp3( struct brw_compile *p,
 }
 
 
-static void emit_dp4( struct brw_compile *p, 
-		      const struct brw_reg *dst,
-		      GLuint mask,
-		      const struct brw_reg *arg0,
-		      const struct brw_reg *arg1 )
+void emit_dp4(struct brw_compile *p,
+	      const struct brw_reg *dst,
+	      GLuint mask,
+	      const struct brw_reg *arg0,
+	      const struct brw_reg *arg1)
 {
    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 
@@ -653,11 +661,11 @@ static void emit_dp4( struct brw_compile *p,
 }
 
 
-static void emit_dph( struct brw_compile *p, 
-		      const struct brw_reg *dst,
-		      GLuint mask,
-		      const struct brw_reg *arg0,
-		      const struct brw_reg *arg1 )
+void emit_dph(struct brw_compile *p,
+	      const struct brw_reg *dst,
+	      GLuint mask,
+	      const struct brw_reg *arg0,
+	      const struct brw_reg *arg1)
 {
    const int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
 
@@ -676,15 +684,15 @@ static void emit_dph( struct brw_compile *p,
 }
 
 
-static void emit_xpd( struct brw_compile *p, 
-		      const struct brw_reg *dst,
-		      GLuint mask,
-		      const struct brw_reg *arg0,
-		      const struct brw_reg *arg1 )
+void emit_xpd(struct brw_compile *p,
+	      const struct brw_reg *dst,
+	      GLuint mask,
+	      const struct brw_reg *arg0,
+	      const struct brw_reg *arg1)
 {
    GLuint i;
 
-   assert(!(mask & WRITEMASK_W) == WRITEMASK_X);
+   assert((mask & WRITEMASK_W) != WRITEMASK_W);
    
    for (i = 0 ; i < 3; i++) {
       if (mask & (1<<i)) {
@@ -701,41 +709,68 @@ static void emit_xpd( struct brw_compile *p,
 }
 
 
-static void emit_math1( struct brw_compile *p, 
-			GLuint function,
-			const struct brw_reg *dst,
-			GLuint mask,
-			const struct brw_reg *arg0 )
+void emit_math1(struct brw_wm_compile *c,
+		GLuint function,
+		const struct brw_reg *dst,
+		GLuint mask,
+		const struct brw_reg *arg0)
 {
+   struct brw_compile *p = &c->func;
    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
+   GLuint saturate = ((mask & SATURATE) ?
+		      BRW_MATH_SATURATE_SATURATE :
+		      BRW_MATH_SATURATE_NONE);
 
    if (!(mask & WRITEMASK_XYZW))
       return; /* Do not emit dead code */
 
    assert(is_power_of_two(mask & WRITEMASK_XYZW));
 
+   /* If compressed, this will write message reg 2,3 from arg0.x's 16
+    * channels.
+    */
    brw_MOV(p, brw_message_reg(2), arg0[0]);
 
    /* Send two messages to perform all 16 operations:
     */
-   brw_math_16(p, 
-	       dst[dst_chan],
+   brw_push_insn_state(p);
+   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+   brw_math(p,
+	    dst[dst_chan],
+	    function,
+	    saturate,
+	    2,
+	    brw_null_reg(),
+	    BRW_MATH_DATA_VECTOR,
+	    BRW_MATH_PRECISION_FULL);
+
+   if (c->dispatch_width == 16) {
+      brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
+      brw_math(p,
+	       offset(dst[dst_chan],1),
 	       function,
-	       (mask & SATURATE) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
-	       2,
+	       saturate,
+	       3,
 	       brw_null_reg(),
+	       BRW_MATH_DATA_VECTOR,
 	       BRW_MATH_PRECISION_FULL);
+   }
+   brw_pop_insn_state(p);
 }
 
 
-static void emit_math2( struct brw_compile *p, 
-			GLuint function,
-			const struct brw_reg *dst,
-			GLuint mask,
-			const struct brw_reg *arg0,
-			const struct brw_reg *arg1)
+void emit_math2(struct brw_wm_compile *c,
+		GLuint function,
+		const struct brw_reg *dst,
+		GLuint mask,
+		const struct brw_reg *arg0,
+		const struct brw_reg *arg1)
 {
+   struct brw_compile *p = &c->func;
    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
+   GLuint saturate = ((mask & SATURATE) ?
+		      BRW_MATH_SATURATE_SATURATE :
+		      BRW_MATH_SATURATE_NONE);
 
    if (!(mask & WRITEMASK_XYZW))
       return; /* Do not emit dead code */
@@ -746,173 +781,233 @@ static void emit_math2( struct brw_compile *p,
 
    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
    brw_MOV(p, brw_message_reg(2), arg0[0]);
-   brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
-   brw_MOV(p, brw_message_reg(4), sechalf(arg0[0]));
+   if (c->dispatch_width == 16) {
+      brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
+      brw_MOV(p, brw_message_reg(4), sechalf(arg0[0]));
+   }
 
    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
    brw_MOV(p, brw_message_reg(3), arg1[0]);
-   brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
-   brw_MOV(p, brw_message_reg(5), sechalf(arg1[0]));
+   if (c->dispatch_width == 16) {
+      brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
+      brw_MOV(p, brw_message_reg(5), sechalf(arg1[0]));
+   }
 
-   
-   /* Send two messages to perform all 16 operations:
-    */
    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
    brw_math(p, 
 	    dst[dst_chan],
 	    function,
-	    (mask & SATURATE) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
+	    saturate,
 	    2,
 	    brw_null_reg(),
 	    BRW_MATH_DATA_VECTOR,
 	    BRW_MATH_PRECISION_FULL);
 
-   brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
-   brw_math(p, 
-	    offset(dst[dst_chan],1),
-	    function,
-	    (mask & SATURATE) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
-	    4,
-	    brw_null_reg(),
-	    BRW_MATH_DATA_VECTOR,
-	    BRW_MATH_PRECISION_FULL);
-   
+   /* In 16-wide dispatch, a second message handles the second half:
+    */
+   if (c->dispatch_width == 16) {
+      brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
+      brw_math(p,
+	       offset(dst[dst_chan],1),
+	       function,
+	       saturate,
+	       4,
+	       brw_null_reg(),
+	       BRW_MATH_DATA_VECTOR,
+	       BRW_MATH_PRECISION_FULL);
+   }
    brw_pop_insn_state(p);
 }
-		     
 
 
-static void emit_tex( struct brw_wm_compile *c,
-		      const struct brw_wm_instruction *inst,
-		      struct brw_reg *dst,
-		      GLuint dst_flags,
-		      struct brw_reg *arg )
+void emit_tex(struct brw_wm_compile *c,
+	      struct brw_reg *dst,
+	      GLuint dst_flags,
+	      struct brw_reg *arg,
+	      struct brw_reg depth_payload,
+	      GLuint tex_idx,
+	      GLuint sampler,
+	      GLboolean shadow)
 {
    struct brw_compile *p = &c->func;
-   GLuint msgLength, responseLength;
-   GLuint i, nr;
+   struct intel_context *intel = &p->brw->intel;
+   struct brw_reg dst_retyped;
+   GLuint cur_mrf = 2, response_length;
+   GLuint i, nr_texcoords;
    GLuint emit;
    GLuint msg_type;
+   GLuint mrf_per_channel;
+   GLuint simd_mode;
+
+   if (c->dispatch_width == 16) {
+      mrf_per_channel = 2;
+      response_length = 8;
+      dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW);
+      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
+   } else {
+      mrf_per_channel = 1;
+      response_length = 4;
+      dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW);
+      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
+   }
 
    /* How many input regs are there?
     */
-   switch (inst->tex_idx) {
+   switch (tex_idx) {
    case TEXTURE_1D_INDEX:
       emit = WRITEMASK_X;
-      nr = 1;
+      nr_texcoords = 1;
       break;
    case TEXTURE_2D_INDEX:
    case TEXTURE_RECT_INDEX:
       emit = WRITEMASK_XY;
-      nr = 2;
+      nr_texcoords = 2;
       break;
    case TEXTURE_3D_INDEX:
    case TEXTURE_CUBE_INDEX:
       emit = WRITEMASK_XYZ;
-      nr = 3;
+      nr_texcoords = 3;
       break;
    default:
       /* unexpected target */
       abort();
    }
 
-   if (inst->tex_shadow) {
-      nr = 4;
-      emit |= WRITEMASK_W;
-   }
+   /* Pre-Ironlake, the 8-wide sampler always took u,v,r. */
+   if (!intel->is_ironlake && c->dispatch_width == 8)
+      nr_texcoords = 3;
 
-   msgLength = 1;
+   /* For shadow comparisons, we have to supply u,v,r. */
+   if (shadow)
+      nr_texcoords = 3;
 
-   for (i = 0; i < nr; i++) {
-      static const GLuint swz[4] = {0,1,2,2};
-      if (emit & (1<<i)) 
-	 brw_MOV(p, brw_message_reg(msgLength+1), arg[swz[i]]);
+   /* Emit the texcoords. */
+   for (i = 0; i < nr_texcoords; i++) {
+      if (emit & (1<<i))
+	 brw_MOV(p, brw_message_reg(cur_mrf), arg[i]);
       else
-	 brw_MOV(p, brw_message_reg(msgLength+1), brw_imm_f(0));
-      msgLength += 2;
+	 brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
+      cur_mrf += mrf_per_channel;
    }
 
-   responseLength = 8;		/* always */
+   /* Fill in the shadow comparison reference value. */
+   if (shadow) {
+      if (intel->is_ironlake) {
+	 /* Fill in the cube map array index value. */
+	 brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
+	 cur_mrf += mrf_per_channel;
+      } else if (c->dispatch_width == 8) {
+	 /* Fill in the LOD bias value. */
+	 brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
+	 cur_mrf += mrf_per_channel;
+      }
+      brw_MOV(p, brw_message_reg(cur_mrf), arg[2]);
+      cur_mrf += mrf_per_channel;
+   }
 
-   if (BRW_IS_IGDNG(p->brw)) {
-       if (inst->tex_shadow)
-           msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE_IGDNG;
-       else
-           msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_IGDNG;
+   if (intel->is_ironlake) {
+      if (shadow)
+	 msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_COMPARE_IGDNG;
+      else
+	 msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_IGDNG;
    } else {
-       if (inst->tex_shadow)
-           msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE;
-       else
-           msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE;
+      /* Note that G45 and older determine shadow compare and dispatch width
+       * from the message length for most messages.
+       */
+      if (c->dispatch_width == 16 && shadow)
+	 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE;
+      else
+	 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE;
    }
 
-   brw_SAMPLE(p, 
-	      retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW),
+   brw_SAMPLE(p,
+	      dst_retyped,
 	      1,
-	      retype(c->payload.depth[0].hw_reg, BRW_REGISTER_TYPE_UW),
-              SURF_INDEX_TEXTURE(inst->tex_unit),
-	      inst->tex_unit,	  /* sampler */
-	      inst->writemask,
-	      msg_type, 
-	      responseLength,
-	      msgLength,
-	      0,	
+	      retype(depth_payload, BRW_REGISTER_TYPE_UW),
+              SURF_INDEX_TEXTURE(sampler),
+	      sampler,
+	      dst_flags & WRITEMASK_XYZW,
+	      msg_type,
+	      response_length,
+	      cur_mrf - 1,
+	      0,
 	      1,
-	      BRW_SAMPLER_SIMD_MODE_SIMD16);	
+	      simd_mode);
 }
 
 
-static void emit_txb( struct brw_wm_compile *c,
-		      const struct brw_wm_instruction *inst,
-		      struct brw_reg *dst,
-		      GLuint dst_flags,
-		      struct brw_reg *arg )
+void emit_txb(struct brw_wm_compile *c,
+	      struct brw_reg *dst,
+	      GLuint dst_flags,
+	      struct brw_reg *arg,
+	      struct brw_reg depth_payload,
+	      GLuint tex_idx,
+	      GLuint sampler)
 {
    struct brw_compile *p = &c->func;
+   struct intel_context *intel = &p->brw->intel;
    GLuint msgLength;
    GLuint msg_type;
-   /* Shadow ignored for txb.
+   GLuint mrf_per_channel;
+   GLuint response_length;
+   struct brw_reg dst_retyped;
+
+   /* The G45 and older chipsets don't support 8-wide dispatch for LOD biased
+    * samples, so we'll use the 16-wide instruction, leave the second halves
+    * undefined, and trust the execution mask to keep the undefined pixels
+    * from mattering.
     */
-   switch (inst->tex_idx) {
+   if (c->dispatch_width == 16 || !intel->is_ironlake) {
+      if (intel->is_ironlake)
+	 msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_IGDNG;
+      else
+	 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
+      mrf_per_channel = 2;
+      dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW);
+      response_length = 8;
+   } else {
+      msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_IGDNG;
+      mrf_per_channel = 1;
+      dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW);
+      response_length = 4;
+   }
+
+   /* Shadow ignored for txb. */
+   switch (tex_idx) {
    case TEXTURE_1D_INDEX:
-      brw_MOV(p, brw_message_reg(2), arg[0]);
-      brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
-      brw_MOV(p, brw_message_reg(6), brw_imm_f(0));
+      brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
+      brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), brw_imm_f(0));
+      brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0));
       break;
    case TEXTURE_2D_INDEX:
    case TEXTURE_RECT_INDEX:
-      brw_MOV(p, brw_message_reg(2), arg[0]);
-      brw_MOV(p, brw_message_reg(4), arg[1]);
-      brw_MOV(p, brw_message_reg(6), brw_imm_f(0));
+      brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
+      brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]);
+      brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0));
       break;
    case TEXTURE_3D_INDEX:
    case TEXTURE_CUBE_INDEX:
-      brw_MOV(p, brw_message_reg(2), arg[0]);
-      brw_MOV(p, brw_message_reg(4), arg[1]);
-      brw_MOV(p, brw_message_reg(6), arg[2]);
+      brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
+      brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]);
+      brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), arg[2]);
       break;
    default:
       /* unexpected target */
       abort();
    }
 
-   brw_MOV(p, brw_message_reg(8), arg[3]);
-   msgLength = 9;
-
-   if (BRW_IS_IGDNG(p->brw))
-       msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS_IGDNG;
-   else
-       msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
+   brw_MOV(p, brw_message_reg(2 + 3 * mrf_per_channel), arg[3]);
+   msgLength = 2 + 4 * mrf_per_channel - 1;
 
    brw_SAMPLE(p, 
-	      retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW),
+	      dst_retyped,
 	      1,
-	      retype(c->payload.depth[0].hw_reg, BRW_REGISTER_TYPE_UW),
-              SURF_INDEX_TEXTURE(inst->tex_unit),
-	      inst->tex_unit,	  /* sampler */
-	      inst->writemask,
+	      retype(depth_payload, BRW_REGISTER_TYPE_UW),
+              SURF_INDEX_TEXTURE(sampler),
+	      sampler,
+	      dst_flags & WRITEMASK_XYZW,
 	      msg_type,
-	      8,		/* responseLength */
+	      response_length,
 	      msgLength,
 	      0,	
 	      1,
@@ -920,11 +1015,13 @@ static void emit_txb( struct brw_wm_compile *c,
 }
 
 
-static void emit_lit( struct brw_compile *p, 
-		      const struct brw_reg *dst,
-		      GLuint mask,
-		      const struct brw_reg *arg0 )
+static void emit_lit(struct brw_wm_compile *c,
+		     const struct brw_reg *dst,
+		     GLuint mask,
+		     const struct brw_reg *arg0)
 {
+   struct brw_compile *p = &c->func;
+
    assert((mask & WRITEMASK_XW) == 0);
 
    if (mask & WRITEMASK_Y) {
@@ -934,7 +1031,7 @@ static void emit_lit( struct brw_compile *p,
    }
 
    if (mask & WRITEMASK_Z) {
-      emit_math2(p, BRW_MATH_FUNCTION_POW,
+      emit_math2(c, BRW_MATH_FUNCTION_POW,
 		 &dst[2],
 		 WRITEMASK_X | (mask & SATURATE),
 		 &arg0[1],
@@ -989,7 +1086,7 @@ static void emit_kil_nv( struct brw_wm_compile *c )
 
    brw_push_insn_state(p);
    brw_set_mask_control(p, BRW_MASK_DISABLE);
-   brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); //IMASK
+   brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); /* IMASK */
    brw_AND(p, r0uw, c->emit_mask_reg, r0uw);
    brw_pop_insn_state(p);
 }
@@ -1001,7 +1098,13 @@ static void fire_fb_write( struct brw_wm_compile *c,
 			   GLuint eot )
 {
    struct brw_compile *p = &c->func;
-   
+   struct brw_reg dst;
+
+   if (c->dispatch_width == 16)
+      dst = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
+   else
+      dst = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
+
    /* Pass through control information:
     */
 /*  mov (8) m1.0<1>:ud   r1.0<8;8,1>:ud   { Align1 NoMask } */
@@ -1018,7 +1121,7 @@ static void fire_fb_write( struct brw_wm_compile *c,
    /* Send framebuffer write message: */
 /*  send (16) null.0<1>:uw m0               r0.0<8;8,1>:uw   0x85a04000:ud    { Align1 EOT } */
    brw_fb_WRITE(p,
-		retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW),
+		dst,
 		base_reg,
 		retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
 		target,		
@@ -1050,14 +1153,15 @@ static void emit_aa( struct brw_wm_compile *c,
  * \param arg1  the pass-through depth value
  * \param arg2  the shader-computed depth value
  */
-static void emit_fb_write( struct brw_wm_compile *c,
-			   struct brw_reg *arg0,
-			   struct brw_reg *arg1,
-			   struct brw_reg *arg2,
-			   GLuint target,
-			   GLuint eot)
+void emit_fb_write(struct brw_wm_compile *c,
+		   struct brw_reg *arg0,
+		   struct brw_reg *arg1,
+		   struct brw_reg *arg2,
+		   GLuint target,
+		   GLuint eot)
 {
    struct brw_compile *p = &c->func;
+   struct brw_context *brw = p->brw;
    GLuint nr = 2;
    GLuint channel;
 
@@ -1069,30 +1173,37 @@ static void emit_fb_write( struct brw_wm_compile *c,
    /* I don't really understand how this achieves the color interleave
     * (ie RGBARGBA) in the result:  [Do the saturation here]
     */
-   {
-      brw_push_insn_state(p);
-      
-      for (channel = 0; channel < 4; channel++) {
+   brw_push_insn_state(p);
+
+   for (channel = 0; channel < 4; channel++) {
+      if (c->dispatch_width == 16 && brw->has_compr4) {
+	 /* By setting the high bit of the MRF register number, we indicate
+	  * that we want COMPR4 mode - instead of doing the usual destination
+	  * + 1 for the second half we get destination + 4.
+	  */
+	 brw_MOV(p,
+		 brw_message_reg(nr + channel + (1 << 7)),
+		 arg0[channel]);
+      } else {
 	 /*  mov (8) m2.0<1>:ud   r28.0<8;8,1>:ud  { Align1 } */
 	 /*  mov (8) m6.0<1>:ud   r29.0<8;8,1>:ud  { Align1 SecHalf } */
-
 	 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 	 brw_MOV(p,
 		 brw_message_reg(nr + channel),
 		 arg0[channel]);
-       
-	 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
-	 brw_MOV(p,
-		 brw_message_reg(nr + channel + 4),
-		 sechalf(arg0[channel]));
-      }
 
-      /* skip over the regs populated above:
-       */
-      nr += 8;
-   
-      brw_pop_insn_state(p);
+	 if (c->dispatch_width == 16) {
+	    brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
+	    brw_MOV(p,
+		    brw_message_reg(nr + channel + 4),
+		    sechalf(arg0[channel]));
+	 }
+      }
    }
+   /* skip over the regs populated above:
+    */
+   nr += 8;
+   brw_pop_insn_state(p);
 
    if (c->key.source_depth_to_render_target)
    {
@@ -1142,7 +1253,7 @@ static void emit_fb_write( struct brw_wm_compile *c,
 	      get_element_ud(brw_vec8_grf(1,0), 6), 
 	      brw_imm_ud(1<<26)); 
 
-      jmp = brw_JMPI(p, ip, ip, brw_imm_d(0));
+      jmp = brw_JMPI(p, ip, ip, brw_imm_w(0));
       {
 	 emit_aa(c, arg1, 2);
 	 fire_fb_write(c, 0, nr, target, eot);
@@ -1156,7 +1267,6 @@ static void emit_fb_write( struct brw_wm_compile *c,
    }
 }
 
-
 /**
  * Move a GPR to scratch memory. 
  */
@@ -1294,7 +1404,7 @@ void brw_wm_emit( struct brw_wm_compile *c )
 	 /* Generated instructions for calculating triangle interpolants:
 	  */
       case WM_PIXELXY:
-	 emit_pixel_xy(p, dst, dst_flags);
+	 emit_pixel_xy(c, dst, dst_flags);
 	 break;
 
       case WM_DELTAXY:
@@ -1306,7 +1416,7 @@ void brw_wm_emit( struct brw_wm_compile *c )
 	 break;
 
       case WM_PIXELW:
-	 emit_pixel_w(p, dst, dst_flags, args[0], args[1]);
+	 emit_pixel_w(c, dst, dst_flags, args[0], args[1]);
 	 break;
 
       case WM_LINTERP:
@@ -1364,7 +1474,7 @@ void brw_wm_emit( struct brw_wm_compile *c )
 	 break;
 
       case OPCODE_TRUNC:
-	 emit_trunc(p, dst, dst_flags, args[0]);
+	 emit_alu1(p, brw_RNDZ, dst, dst_flags, args[0]);
 	 break;
 
       case OPCODE_LRP:
@@ -1391,27 +1501,27 @@ void brw_wm_emit( struct brw_wm_compile *c )
 	 /* Higher math functions:
 	  */
       case OPCODE_RCP:
-	 emit_math1(p, BRW_MATH_FUNCTION_INV, dst, dst_flags, args[0]);
+	 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, dst_flags, args[0]);
 	 break;
 
       case OPCODE_RSQ:
-	 emit_math1(p, BRW_MATH_FUNCTION_RSQ, dst, dst_flags, args[0]);
+	 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, dst_flags, args[0]);
 	 break;
 
       case OPCODE_SIN:
-	 emit_math1(p, BRW_MATH_FUNCTION_SIN, dst, dst_flags, args[0]);
+	 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, dst_flags, args[0]);
 	 break;
 
       case OPCODE_COS:
-	 emit_math1(p, BRW_MATH_FUNCTION_COS, dst, dst_flags, args[0]);
+	 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, dst_flags, args[0]);
 	 break;
 
       case OPCODE_EX2:
-	 emit_math1(p, BRW_MATH_FUNCTION_EXP, dst, dst_flags, args[0]);
+	 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, dst_flags, args[0]);
 	 break;
 
       case OPCODE_LG2:
-	 emit_math1(p, BRW_MATH_FUNCTION_LOG, dst, dst_flags, args[0]);
+	 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, dst_flags, args[0]);
 	 break;
 
       case OPCODE_SCS:
@@ -1419,13 +1529,13 @@ void brw_wm_emit( struct brw_wm_compile *c )
 	  * fixup for 16-element execution.
 	  */
 	 if (dst_flags & WRITEMASK_X)
-	    emit_math1(p, BRW_MATH_FUNCTION_COS, dst, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
+	    emit_math1(c, BRW_MATH_FUNCTION_COS, dst, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
 	 if (dst_flags & WRITEMASK_Y)
-	    emit_math1(p, BRW_MATH_FUNCTION_SIN, dst+1, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
+	    emit_math1(c, BRW_MATH_FUNCTION_SIN, dst+1, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
 	 break;
 
       case OPCODE_POW:
-	 emit_math2(p, BRW_MATH_FUNCTION_POW, dst, dst_flags, args[0], args[1]);
+	 emit_math2(c, BRW_MATH_FUNCTION_POW, dst, dst_flags, args[0], args[1]);
 	 break;
 
 	 /* Comparisons:
@@ -1463,17 +1573,20 @@ void brw_wm_emit( struct brw_wm_compile *c )
 	break;
 
       case OPCODE_LIT:
-	 emit_lit(p, dst, dst_flags, args[0]);
+	 emit_lit(c, dst, dst_flags, args[0]);
 	 break;
 
 	 /* Texturing operations:
 	  */
       case OPCODE_TEX:
-	 emit_tex(c, inst, dst, dst_flags, args[0]);
+	 emit_tex(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
+		  inst->tex_idx, inst->tex_unit,
+		  inst->tex_shadow);
 	 break;
 
       case OPCODE_TXB:
-	 emit_txb(c, inst, dst, dst_flags, args[0]);
+	 emit_txb(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
+		  inst->tex_idx, inst->tex_unit);
 	 break;
 
       case OPCODE_KIL:
diff --git a/src/mesa/drivers/dri/i965/brw_wm_fp.c b/src/mesa/drivers/dri/i965/brw_wm_fp.c
index 0e86d75dea..3737faf26f 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_fp.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_fp.c
@@ -138,7 +138,6 @@ static struct prog_dst_register dst_reg(GLuint file, GLuint idx)
    reg.CondMask = COND_TR;
    reg.CondSwizzle = 0;
    reg.CondSrc = 0;
-   reg.pad = 0;
    return reg;
 }
 
@@ -182,6 +181,8 @@ static void release_temp( struct brw_wm_compile *c, struct prog_dst_register tem
 static struct prog_instruction *get_fp_inst(struct brw_wm_compile *c)
 {
    assert(c->nr_fp_insns < BRW_WM_MAX_INSN);
+   memset(&c->prog_instructions[c->nr_fp_insns], 0,
+	  sizeof(*c->prog_instructions));
    return &c->prog_instructions[c->nr_fp_insns++];
 }
 
@@ -448,7 +449,6 @@ static void emit_interp( struct brw_wm_compile *c,
       break;
 
    case FRAG_ATTRIB_FACE:
-      /* XXX review/test this case */
       emit_op(c,
               WM_FRONTFACING,
               dst_mask(dst, WRITEMASK_X),
@@ -957,7 +957,7 @@ static void precalc_txp( struct brw_wm_compile *c,
 
 
 
-static void emit_fb_write( struct brw_wm_compile *c )
+static void emit_render_target_writes( struct brw_wm_compile *c )
 {
    struct prog_src_register payload_r0_depth = src_reg(PROGRAM_PAYLOAD, PAYLOAD_DEPTH);
    struct prog_src_register outdepth = src_reg(PROGRAM_OUTPUT, FRAG_RESULT_DEPTH);
@@ -985,7 +985,7 @@ static void emit_fb_write( struct brw_wm_compile *c )
    }
    else {
       /* if gl_FragData[0] is written, use it, else use gl_FragColor */
-      if (c->fp->program.Base.OutputsWritten & (1 << FRAG_RESULT_DATA0))
+      if (c->fp->program.Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DATA0))
          outcolor = src_reg(PROGRAM_OUTPUT, FRAG_RESULT_DATA0);
       else 
          outcolor = src_reg(PROGRAM_OUTPUT, FRAG_RESULT_COLOR);
@@ -1154,7 +1154,7 @@ void brw_wm_pass_fp( struct brw_wm_compile *c )
 	 out->DstReg.WriteMask = 0;
 	 break;
       case OPCODE_END:
-	 emit_fb_write(c);
+	 emit_render_target_writes(c);
 	 break;
       case OPCODE_PRINT:
 	 break;
diff --git a/src/mesa/drivers/dri/i965/brw_wm_glsl.c b/src/mesa/drivers/dri/i965/brw_wm_glsl.c
index 28d6d4eba5..fde83eea62 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_glsl.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_glsl.c
@@ -371,7 +371,7 @@ static void prealloc_reg(struct brw_wm_compile *c)
 	  for (j = 0; j < 4; j++)
 	     set_reg(c, PROGRAM_PAYLOAD, fp_input, j, reg);
        }
-       if (c->key.vp_outputs_written & (1 << i)) {
+       if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
 	  reg_index += 2;
        }
     }
@@ -550,42 +550,6 @@ static struct brw_reg get_src_reg(struct brw_wm_compile *c,
     }
 }
 
-
-/**
- * Same as \sa get_src_reg() but if the register is a literal, emit
- * a brw_reg encoding the literal.
- * Note that a brw instruction only allows one src operand to be a literal.
- * For instructions with more than one operand, only the second can be a
- * literal.  This means that we treat some literals as constants/uniforms
- * (which why PROGRAM_CONSTANT is checked in fetch_constants()).
- * 
- */
-static struct brw_reg get_src_reg_imm(struct brw_wm_compile *c, 
-                                      const struct prog_instruction *inst,
-                                      GLuint srcRegIndex, GLuint channel)
-{
-    const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
-    if (src->File == PROGRAM_CONSTANT) {
-       /* a literal */
-       const int component = GET_SWZ(src->Swizzle, channel);
-       const GLfloat *param =
-          c->fp->program.Base.Parameters->ParameterValues[src->Index];
-       GLfloat value = param[component];
-       if (src->Negate & (1 << channel))
-          value = -value;
-       if (src->Abs)
-          value = FABSF(value);
-#if 0
-       printf("  form immed value %f for chan %d\n", value, channel);
-#endif
-       return brw_imm_f(value);
-    }
-    else {
-       return get_src_reg(c, inst, srcRegIndex, channel);
-    }
-}
-
-
 /**
  * Subroutines are minimal support for resusable instruction sequences.
  * They are implemented as simply as possible to minimise overhead: there
@@ -650,542 +614,110 @@ static void invoke_subroutine( struct brw_wm_compile *c,
     }
 }
 
-static void emit_trunc( struct brw_wm_compile *c,
-                        const struct prog_instruction *inst)
-{
-    int i;
-    struct brw_compile *p = &c->func;
-    GLuint mask = inst->DstReg.WriteMask;
-    brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
-    for (i = 0; i < 4; i++) {
-	if (mask & (1<<i)) {
-	    struct brw_reg src, dst;
-	    dst = get_dst_reg(c, inst, i);
-	    src = get_src_reg(c, inst, 0, i);
-	    brw_RNDZ(p, dst, src);
-	}
-    }
-    brw_set_saturate(p, 0);
-}
-
-static void emit_mov( struct brw_wm_compile *c,
-                      const struct prog_instruction *inst)
-{
-    int i;
-    struct brw_compile *p = &c->func;
-    GLuint mask = inst->DstReg.WriteMask;
-    brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
-    for (i = 0; i < 4; i++) {
-	if (mask & (1<<i)) {
-	    struct brw_reg src, dst;
-	    dst = get_dst_reg(c, inst, i);
-            /* XXX some moves from immediate value don't work reliably!!! */
-            /*src = get_src_reg_imm(c, inst, 0, i);*/
-            src = get_src_reg(c, inst, 0, i);
-	    brw_MOV(p, dst, src);
-	}
-    }
-    brw_set_saturate(p, 0);
-}
-
-static void emit_pixel_xy(struct brw_wm_compile *c,
-                          const struct prog_instruction *inst)
-{
-    struct brw_reg r1 = brw_vec1_grf(1, 0);
-    struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
-
-    struct brw_reg dst0, dst1;
-    struct brw_compile *p = &c->func;
-    GLuint mask = inst->DstReg.WriteMask;
-
-    dst0 = get_dst_reg(c, inst, 0);
-    dst1 = get_dst_reg(c, inst, 1);
-    /* Calculate pixel centers by adding 1 or 0 to each of the
-     * micro-tile coordinates passed in r1.
-     */
-    if (mask & WRITEMASK_X) {
-	brw_ADD(p,
-		vec8(retype(dst0, BRW_REGISTER_TYPE_UW)),
-		stride(suboffset(r1_uw, 4), 2, 4, 0),
-		brw_imm_v(0x10101010));
-    }
-
-    if (mask & WRITEMASK_Y) {
-	brw_ADD(p,
-		vec8(retype(dst1, BRW_REGISTER_TYPE_UW)),
-		stride(suboffset(r1_uw, 5), 2, 4, 0),
-		brw_imm_v(0x11001100));
-    }
-}
-
-static void emit_delta_xy(struct brw_wm_compile *c,
-                          const struct prog_instruction *inst)
-{
-    struct brw_reg r1 = brw_vec1_grf(1, 0);
-    struct brw_reg dst0, dst1, src0, src1;
-    struct brw_compile *p = &c->func;
-    GLuint mask = inst->DstReg.WriteMask;
-
-    dst0 = get_dst_reg(c, inst, 0);
-    dst1 = get_dst_reg(c, inst, 1);
-    src0 = get_src_reg(c, inst, 0, 0);
-    src1 = get_src_reg(c, inst, 0, 1);
-    /* Calc delta X,Y by subtracting origin in r1 from the pixel
-     * centers.
-     */
-    if (mask & WRITEMASK_X) {
-	brw_ADD(p,
-		dst0,
-		retype(src0, BRW_REGISTER_TYPE_UW),
-		negate(r1));
-    }
-
-    if (mask & WRITEMASK_Y) {
-	brw_ADD(p,
-		dst1,
-		retype(src1, BRW_REGISTER_TYPE_UW),
-		negate(suboffset(r1,1)));
-
-    }
-}
-
-static void fire_fb_write( struct brw_wm_compile *c,
-                           GLuint base_reg,
-                           GLuint nr,
-                           GLuint target,
-                           GLuint eot)
-{
-    struct brw_compile *p = &c->func;
-    /* Pass through control information:
-     */
-    /*  mov (8) m1.0<1>:ud   r1.0<8;8,1>:ud   { Align1 NoMask } */
-    {
-	brw_push_insn_state(p);
-	brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
-	brw_MOV(p,
-		brw_message_reg(base_reg + 1),
-		brw_vec8_grf(1, 0));
-	brw_pop_insn_state(p);
-    }
-    /* Send framebuffer write message: */
-    brw_fb_WRITE(p,
-	    retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW),
-	    base_reg,
-	    retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
-	    target,              
-	    nr,
-	    0,
-	    eot);
-}
-
-static void emit_fb_write(struct brw_wm_compile *c,
-                          const struct prog_instruction *inst)
-{
-    struct brw_compile *p = &c->func;
-    int nr = 2;
-    int channel;
-    GLuint target, eot;
-    struct brw_reg src0;
-
-    /* Reserve a space for AA - may not be needed:
-     */
-    if (c->key.aa_dest_stencil_reg)
-	nr += 1;
-
-    brw_push_insn_state(p);
-    for (channel = 0; channel < 4; channel++) {
-        src0 = get_src_reg(c,  inst, 0, channel);
-        /*  mov (8) m2.0<1>:ud   r28.0<8;8,1>:ud  { Align1 } */
-        /*  mov (8) m6.0<1>:ud   r29.0<8;8,1>:ud  { Align1 SecHalf } */
-        brw_MOV(p, brw_message_reg(nr + channel), src0);
-    }
-    /* skip over the regs populated above: */
-    nr += 8;
-    brw_pop_insn_state(p);
-
-    if (c->key.source_depth_to_render_target) {
-       if (c->key.computes_depth) {
-          src0 = get_src_reg(c, inst, 2, 2);
-          brw_MOV(p, brw_message_reg(nr), src0);
-       }
-       else {
-          src0 = get_src_reg(c, inst, 1, 1);
-          brw_MOV(p, brw_message_reg(nr), src0);
-       }
-
-       nr += 2;
-    }
-
-    if (c->key.dest_depth_reg) {
-        const GLuint comp = c->key.dest_depth_reg / 2;
-        const GLuint off = c->key.dest_depth_reg % 2;
-
-        if (off != 0) {
-            /* XXX this code needs review/testing */
-            struct brw_reg arg1_0 = get_src_reg(c, inst, 1, comp);
-            struct brw_reg arg1_1 = get_src_reg(c, inst, 1, comp+1);
-
-            brw_push_insn_state(p);
-            brw_set_compression_control(p, BRW_COMPRESSION_NONE);
-
-            brw_MOV(p, brw_message_reg(nr), offset(arg1_0, 1));
-            /* 2nd half? */
-            brw_MOV(p, brw_message_reg(nr+1), arg1_1);
-            brw_pop_insn_state(p);
-        }
-        else
-        {
-            struct brw_reg src =  get_src_reg(c, inst, 1, 1);
-            brw_MOV(p, brw_message_reg(nr), src);
-        }
-        nr += 2;
-   }
-
-    target = INST_AUX_GET_TARGET(inst->Aux);
-    eot = inst->Aux & INST_AUX_EOT;
-    fire_fb_write(c, 0, nr, target, eot);
-}
-
-static void emit_pixel_w( struct brw_wm_compile *c,
-                          const struct prog_instruction *inst)
+/* Workaround for using brw_wm_emit.c's three-source emit functions, which
+ * expect destination regs to be uniquely written.  Any argument channel in
+ * the same register as a written destination channel is copied to a temporary
+ * first, for instructions which use their destination as a temporary.
+ */
+static void
+unalias3(struct brw_wm_compile *c,
+	 void (*func)(struct brw_compile *c,
+		      const struct brw_reg *dst,
+		      GLuint mask,
+		      const struct brw_reg *arg0,
+		      const struct brw_reg *arg1,
+		      const struct brw_reg *arg2),
+	 const struct brw_reg *dst,
+	 GLuint mask,
+	 const struct brw_reg *arg0,
+	 const struct brw_reg *arg1,
+	 const struct brw_reg *arg2)
 {
     struct brw_compile *p = &c->func;
-    GLuint mask = inst->DstReg.WriteMask;
-    if (mask & WRITEMASK_W) {
-	struct brw_reg dst, src0, delta0, delta1;
-	struct brw_reg interp3;
-
-	dst = get_dst_reg(c, inst, 3);
-	src0 = get_src_reg(c, inst, 0, 0);
-	delta0 = get_src_reg(c, inst, 1, 0);
-	delta1 = get_src_reg(c, inst, 1, 1);
-
-	interp3 = brw_vec1_grf(src0.nr+1, 4);
-	/* Calc 1/w - just linterp wpos[3] optimized by putting the
-	 * result straight into a message reg.
-	 */
-	brw_LINE(p, brw_null_reg(), interp3, delta0);
-	brw_MAC(p, brw_message_reg(2), suboffset(interp3, 1), delta1);
-
-	/* Calc w */
-	brw_math_16( p, dst,
-		BRW_MATH_FUNCTION_INV,
-		BRW_MATH_SATURATE_NONE,
-		2, brw_null_reg(),
-		BRW_MATH_PRECISION_FULL);
-    }
-}
+    struct brw_reg tmp_arg0[4], tmp_arg1[4], tmp_arg2[4];
+    int i, j;
+    int mark = mark_tmps(c);
 
-static void emit_linterp(struct brw_wm_compile *c,
-                         const struct prog_instruction *inst)
-{
-    struct brw_compile *p = &c->func;
-    GLuint mask = inst->DstReg.WriteMask;
-    struct brw_reg interp[4];
-    struct brw_reg dst, delta0, delta1;
-    struct brw_reg src0;
-    GLuint nr, i;
-
-    src0 = get_src_reg(c, inst, 0, 0);
-    delta0 = get_src_reg(c, inst, 1, 0);
-    delta1 = get_src_reg(c, inst, 1, 1);
-    nr = src0.nr;
-
-    interp[0] = brw_vec1_grf(nr, 0);
-    interp[1] = brw_vec1_grf(nr, 4);
-    interp[2] = brw_vec1_grf(nr+1, 0);
-    interp[3] = brw_vec1_grf(nr+1, 4);
-
-    for(i = 0; i < 4; i++ ) {
-	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i);
-	    brw_LINE(p, brw_null_reg(), interp[i], delta0);
-	    brw_MAC(p, dst, suboffset(interp[i],1), delta1);
-	}
+    for (j = 0; j < 4; j++) {
+	tmp_arg0[j] = arg0[j];
+	tmp_arg1[j] = arg1[j];
+	tmp_arg2[j] = arg2[j];
     }
-}
-
-static void emit_cinterp(struct brw_wm_compile *c,
-                         const struct prog_instruction *inst)
-{
-    struct brw_compile *p = &c->func;
-    GLuint mask = inst->DstReg.WriteMask;
-
-    struct brw_reg interp[4];
-    struct brw_reg dst, src0;
-    GLuint nr, i;
 
-    src0 = get_src_reg(c, inst, 0, 0);
-    nr = src0.nr;
-
-    interp[0] = brw_vec1_grf(nr, 0);
-    interp[1] = brw_vec1_grf(nr, 4);
-    interp[2] = brw_vec1_grf(nr+1, 0);
-    interp[3] = brw_vec1_grf(nr+1, 4);
-
-    for(i = 0; i < 4; i++ ) {
+    for (i = 0; i < 4; i++) {
 	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i);
-	    brw_MOV(p, dst, suboffset(interp[i],3));
+	    for (j = 0; j < 4; j++) {
+		if (arg0[j].file == dst[i].file &&
+		    dst[i].nr == arg0[j].nr) {
+		    tmp_arg0[j] = alloc_tmp(c);
+		    brw_MOV(p, tmp_arg0[j], arg0[j]);
+		}
+		if (arg1[j].file == dst[i].file &&
+		    dst[i].nr == arg1[j].nr) {
+		    tmp_arg1[j] = alloc_tmp(c);
+		    brw_MOV(p, tmp_arg1[j], arg1[j]);
+		}
+		if (arg2[j].file == dst[i].file &&
+		    dst[i].nr == arg2[j].nr) {
+		    tmp_arg2[j] = alloc_tmp(c);
+		    brw_MOV(p, tmp_arg2[j], arg2[j]);
+		}
+	    }
 	}
     }
-}
-
-static void emit_pinterp(struct brw_wm_compile *c,
-                         const struct prog_instruction *inst)
-{
-    struct brw_compile *p = &c->func;
-    GLuint mask = inst->DstReg.WriteMask;
 
-    struct brw_reg interp[4];
-    struct brw_reg dst, delta0, delta1;
-    struct brw_reg src0, w;
-    GLuint nr, i;
+    func(p, dst, mask, tmp_arg0, tmp_arg1, tmp_arg2);
 
-    src0 = get_src_reg(c, inst, 0, 0);
-    delta0 = get_src_reg(c, inst, 1, 0);
-    delta1 = get_src_reg(c, inst, 1, 1);
-    w = get_src_reg(c, inst, 2, 3);
-    nr = src0.nr;
-
-    interp[0] = brw_vec1_grf(nr, 0);
-    interp[1] = brw_vec1_grf(nr, 4);
-    interp[2] = brw_vec1_grf(nr+1, 0);
-    interp[3] = brw_vec1_grf(nr+1, 4);
-
-    for(i = 0; i < 4; i++ ) {
-	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i);
-	    brw_LINE(p, brw_null_reg(), interp[i], delta0);
-	    brw_MAC(p, dst, suboffset(interp[i],1), 
-		    delta1);
-	    brw_MUL(p, dst, dst, w);
-	}
-    }
+    release_tmps(c, mark);
 }
 
-/* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */
-static void emit_frontfacing(struct brw_wm_compile *c,
-			     const struct prog_instruction *inst)
+/* Workaround for using brw_wm_emit.c's two-source emit functions, which
+ * expect destination regs to be uniquely written.  Any argument channel in
+ * the same register as a written destination channel is copied to a temporary
+ * first, for instructions which use their destination as a temporary.
+ */
+static void
+unalias2(struct brw_wm_compile *c,
+	 void (*func)(struct brw_compile *c,
+		      const struct brw_reg *dst,
+		      GLuint mask,
+		      const struct brw_reg *arg0,
+		      const struct brw_reg *arg1),
+	 const struct brw_reg *dst,
+	 GLuint mask,
+	 const struct brw_reg *arg0,
+	 const struct brw_reg *arg1)
 {
     struct brw_compile *p = &c->func;
-    struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
-    struct brw_reg dst;
-    GLuint mask = inst->DstReg.WriteMask;
-    int i;
-
-    for (i = 0; i < 4; i++) {
-	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i);
-	    brw_MOV(p, dst, brw_imm_f(0.0));
-	}
-    }
+    struct brw_reg tmp_arg0[4], tmp_arg1[4];
+    int i, j;
+    int mark = mark_tmps(c);
 
-    /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
-     * us front face
-     */
-    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, r1_6ud, brw_imm_ud(1 << 31));
-    for (i = 0; i < 4; i++) {
-	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i);
-	    brw_MOV(p, dst, brw_imm_f(1.0));
-	}
+    for (j = 0; j < 4; j++) {
+	tmp_arg0[j] = arg0[j];
+	tmp_arg1[j] = arg1[j];
     }
-    brw_set_predicate_control_flag_value(p, 0xff);
-}
 
-static void emit_xpd(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
-{
-    int i;
-    struct brw_compile *p = &c->func;
-    GLuint mask = inst->DstReg.WriteMask;
     for (i = 0; i < 4; i++) {
-	GLuint i2 = (i+2)%3;
-	GLuint i1 = (i+1)%3;
 	if (mask & (1<<i)) {
-	    struct brw_reg src0, src1, dst;
-	    dst = get_dst_reg(c, inst, i);
-	    src0 = negate(get_src_reg(c, inst, 0, i2));
-	    src1 = get_src_reg_imm(c, inst, 1, i1);
-	    brw_MUL(p, brw_null_reg(), src0, src1);
-	    src0 = get_src_reg(c, inst, 0, i1);
-	    src1 = get_src_reg_imm(c, inst, 1, i2);
-	    brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
-	    brw_MAC(p, dst, src0, src1);
-	    brw_set_saturate(p, 0);
+	    for (j = 0; j < 4; j++) {
+		if (arg0[j].file == dst[i].file &&
+		    dst[i].nr == arg0[j].nr) {
+		    tmp_arg0[j] = alloc_tmp(c);
+		    brw_MOV(p, tmp_arg0[j], arg0[j]);
+		}
+		if (arg1[j].file == dst[i].file &&
+		    dst[i].nr == arg1[j].nr) {
+		    tmp_arg1[j] = alloc_tmp(c);
+		    brw_MOV(p, tmp_arg1[j], arg1[j]);
+		}
+	    }
 	}
     }
-    brw_set_saturate(p, 0);
-}
 
-static void emit_dp3(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
-{
-    struct brw_reg src0[3], src1[3], dst;
-    int i;
-    struct brw_compile *p = &c->func;
-    GLuint mask = inst->DstReg.WriteMask;
-    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
-
-    if (!(mask & WRITEMASK_XYZW))
-	return;
-
-    assert(is_power_of_two(mask & WRITEMASK_XYZW));
-
-    for (i = 0; i < 3; i++) {
-	src0[i] = get_src_reg(c, inst, 0, i);
-	src1[i] = get_src_reg_imm(c, inst, 1, i);
-    }
-
-    dst = get_dst_reg(c, inst, dst_chan);
-    brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
-    brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
-    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
-    brw_MAC(p, dst, src0[2], src1[2]);
-    brw_set_saturate(p, 0);
-}
-
-static void emit_dp4(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
-{
-    struct brw_reg src0[4], src1[4], dst;
-    int i;
-    struct brw_compile *p = &c->func;
-    GLuint mask = inst->DstReg.WriteMask;
-    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
-
-    if (!(mask & WRITEMASK_XYZW))
-	return;
-
-    assert(is_power_of_two(mask & WRITEMASK_XYZW));
-
-    for (i = 0; i < 4; i++) {
-	src0[i] = get_src_reg(c, inst, 0, i);
-	src1[i] = get_src_reg_imm(c, inst, 1, i);
-    }
-    dst = get_dst_reg(c, inst, dst_chan);
-    brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
-    brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
-    brw_MAC(p, brw_null_reg(), src0[2], src1[2]);
-    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
-    brw_MAC(p, dst, src0[3], src1[3]);
-    brw_set_saturate(p, 0);
-}
-
-static void emit_dph(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
-{
-    struct brw_reg src0[4], src1[4], dst;
-    int i;
-    struct brw_compile *p = &c->func;
-    GLuint mask = inst->DstReg.WriteMask;
-    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
-
-    if (!(mask & WRITEMASK_XYZW))
-	return;
-
-    assert(is_power_of_two(mask & WRITEMASK_XYZW));
-
-    for (i = 0; i < 4; i++) {
-	src0[i] = get_src_reg(c, inst, 0, i);
-	src1[i] = get_src_reg_imm(c, inst, 1, i);
-    }
-    dst = get_dst_reg(c, inst, dst_chan);
-    brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
-    brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
-    brw_MAC(p, dst, src0[2], src1[2]);
-    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
-    brw_ADD(p, dst, dst, src1[3]);
-    brw_set_saturate(p, 0);
-}
-
-/**
- * Emit a scalar instruction, like RCP, RSQ, LOG, EXP.
- * Note that the result of the function is smeared across the dest
- * register's X, Y, Z and W channels (subject to writemasking of course).
- */
-static void emit_math1(struct brw_wm_compile *c,
-                       const struct prog_instruction *inst, GLuint func)
-{
-    struct brw_compile *p = &c->func;
-    struct brw_reg src0, dst;
-    GLuint mask = inst->DstReg.WriteMask;
-    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
-
-    if (!(mask & WRITEMASK_XYZW))
-	return;
-
-    assert(is_power_of_two(mask & WRITEMASK_XYZW));
-
-    /* Get first component of source register */
-    dst = get_dst_reg(c, inst, dst_chan);
-    src0 = get_src_reg(c, inst, 0, 0);
-
-    brw_MOV(p, brw_message_reg(2), src0);
-    brw_math(p,
-             dst,
-             func,
-             (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
-             2,
-             brw_null_reg(),
-             BRW_MATH_DATA_VECTOR,
-             BRW_MATH_PRECISION_FULL);
-}
-
-static void emit_rcp(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
-{
-    emit_math1(c, inst, BRW_MATH_FUNCTION_INV);
-}
-
-static void emit_rsq(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
-{
-    emit_math1(c, inst, BRW_MATH_FUNCTION_RSQ);
-}
-
-static void emit_sin(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
-{
-    emit_math1(c, inst, BRW_MATH_FUNCTION_SIN);
-}
+    func(p, dst, mask, tmp_arg0, tmp_arg1);
 
-static void emit_cos(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
-{
-    emit_math1(c, inst, BRW_MATH_FUNCTION_COS);
-}
-
-static void emit_ex2(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
-{
-    emit_math1(c, inst, BRW_MATH_FUNCTION_EXP);
-}
-
-static void emit_lg2(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
-{
-    emit_math1(c, inst, BRW_MATH_FUNCTION_LOG);
-}
-
-static void emit_add(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
-{
-    struct brw_compile *p = &c->func;
-    struct brw_reg src0, src1, dst;
-    GLuint mask = inst->DstReg.WriteMask;
-    int i;
-    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
-    for (i = 0 ; i < 4; i++) {
-	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i);
-	    src0 = get_src_reg(c, inst, 0, i);
-	    src1 = get_src_reg_imm(c, inst, 1, i);
-	    brw_ADD(p, dst, src0, src1);
-	}
-    }
-    brw_set_saturate(p, 0);
+    release_tmps(c, mark);
 }
 
 static void emit_arl(struct brw_wm_compile *c,
@@ -1201,180 +733,6 @@ static void emit_arl(struct brw_wm_compile *c,
     brw_set_saturate(p, 0);
 }
 
-
-static void emit_mul(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
-{
-    struct brw_compile *p = &c->func;
-    struct brw_reg src0, src1, dst;
-    GLuint mask = inst->DstReg.WriteMask;
-    int i;
-    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
-    for (i = 0 ; i < 4; i++) {
-	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i);
-	    src0 = get_src_reg(c, inst, 0, i);
-	    src1 = get_src_reg_imm(c, inst, 1, i);
-	    brw_MUL(p, dst, src0, src1);
-	}
-    }
-    brw_set_saturate(p, 0);
-}
-
-static void emit_frc(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
-{
-    struct brw_compile *p = &c->func;
-    struct brw_reg src0, dst;
-    GLuint mask = inst->DstReg.WriteMask;
-    int i;
-    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
-    for (i = 0 ; i < 4; i++) {
-	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i);
-	    src0 = get_src_reg_imm(c, inst, 0, i);
-	    brw_FRC(p, dst, src0);
-	}
-    }
-    if (inst->SaturateMode != SATURATE_OFF)
-	brw_set_saturate(p, 0);
-}
-
-static void emit_flr(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
-{
-    struct brw_compile *p = &c->func;
-    struct brw_reg src0, dst;
-    GLuint mask = inst->DstReg.WriteMask;
-    int i;
-    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
-    for (i = 0 ; i < 4; i++) {
-	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i);
-	    src0 = get_src_reg_imm(c, inst, 0, i);
-	    brw_RNDD(p, dst, src0);
-	}
-    }
-    brw_set_saturate(p, 0);
-}
-
-
-static void emit_min_max(struct brw_wm_compile *c,
-                         const struct prog_instruction *inst)
-{
-    struct brw_compile *p = &c->func;
-    const GLuint mask = inst->DstReg.WriteMask;
-    const int mark = mark_tmps(c);
-    int i;
-    brw_push_insn_state(p);
-    for (i = 0; i < 4; i++) {
-	if (mask & (1<<i)) {
-            struct brw_reg real_dst = get_dst_reg(c, inst, i);
-	    struct brw_reg src0 = get_src_reg(c, inst, 0, i);
-	    struct brw_reg src1 = get_src_reg(c, inst, 1, i);
-            struct brw_reg dst;
-            /* if dst==src0 or dst==src1 we need to use a temp reg */
-            GLboolean use_temp = brw_same_reg(dst, src0) ||
-                                 brw_same_reg(dst, src1);
-            if (use_temp)
-               dst = alloc_tmp(c);
-            else
-               dst = real_dst;
-
-            /*
-            printf("  Min/max: dst %d  src0 %d  src1 %d\n",
-                   dst.nr, src0.nr, src1.nr);
-            */
-	    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
-	    brw_MOV(p, dst, src0);
-	    brw_set_saturate(p, 0);
-
-            if (inst->Opcode == OPCODE_MIN)
-               brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src1, src0);
-            else
-               brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, src1, src0);
-
-	    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
-	    brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
-	    brw_MOV(p, dst, src1);
-	    brw_set_saturate(p, 0);
-	    brw_set_predicate_control_flag_value(p, 0xff);
-            if (use_temp)
-               brw_MOV(p, real_dst, dst);
-	}
-    }
-    brw_pop_insn_state(p);
-    release_tmps(c, mark);
-}
-
-static void emit_pow(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
-{
-    struct brw_compile *p = &c->func;
-    struct brw_reg dst, src0, src1;
-    GLuint mask = inst->DstReg.WriteMask;
-    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
-
-    if (!(mask & WRITEMASK_XYZW))
-	return;
-
-    assert(is_power_of_two(mask & WRITEMASK_XYZW));
-
-    dst = get_dst_reg(c, inst, dst_chan);
-    src0 = get_src_reg_imm(c, inst, 0, 0);
-    src1 = get_src_reg_imm(c, inst, 1, 0);
-
-    brw_MOV(p, brw_message_reg(2), src0);
-    brw_MOV(p, brw_message_reg(3), src1);
-
-    brw_math(p,
-	    dst,
-	    BRW_MATH_FUNCTION_POW,
-	    (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
-	    2,
-	    brw_null_reg(),
-	    BRW_MATH_DATA_VECTOR,
-	    BRW_MATH_PRECISION_FULL);
-}
-
-static void emit_lrp(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
-{
-    struct brw_compile *p = &c->func;
-    GLuint mask = inst->DstReg.WriteMask;
-    struct brw_reg dst, tmp1, tmp2, src0, src1, src2;
-    int i;
-    int mark = mark_tmps(c);
-    for (i = 0; i < 4; i++) {
-	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i);
-	    src0 = get_src_reg(c, inst, 0, i);
-
-	    src1 = get_src_reg_imm(c, inst, 1, i);
-
-	    if (src1.nr == dst.nr) {
-		tmp1 = alloc_tmp(c);
-		brw_MOV(p, tmp1, src1);
-	    } else
-		tmp1 = src1;
-
-	    src2 = get_src_reg(c, inst, 2, i);
-	    if (src2.nr == dst.nr) {
-		tmp2 = alloc_tmp(c);
-		brw_MOV(p, tmp2, src2);
-	    } else
-		tmp2 = src2;
-
-	    brw_ADD(p, dst, negate(src0), brw_imm_f(1.0));
-	    brw_MUL(p, brw_null_reg(), dst, tmp2);
-	    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
-	    brw_MAC(p, dst, src0, tmp1);
-	    brw_set_saturate(p, 0);
-	}
-	release_tmps(c, mark);
-    }
-}
-
 /**
  * For GLSL shaders, this KIL will be unconditional.
  * It may be contained inside an IF/ENDIF structure of course.
@@ -1385,94 +743,11 @@ static void emit_kil(struct brw_wm_compile *c)
     struct brw_reg depth = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
     brw_push_insn_state(p);
     brw_set_mask_control(p, BRW_MASK_DISABLE);
-    brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); //IMASK
+    brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); /* IMASK */
     brw_AND(p, depth, c->emit_mask_reg, depth);
     brw_pop_insn_state(p);
 }
 
-static void emit_mad(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
-{
-    struct brw_compile *p = &c->func;
-    GLuint mask = inst->DstReg.WriteMask;
-    struct brw_reg dst, src0, src1, src2;
-    int i;
-
-    for (i = 0; i < 4; i++) {
-	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i);
-	    src0 = get_src_reg(c, inst, 0, i);
-	    src1 = get_src_reg_imm(c, inst, 1, i);
-	    src2 = get_src_reg_imm(c, inst, 2, i);
-	    brw_MUL(p, dst, src0, src1);
-
-	    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
-	    brw_ADD(p, dst, dst, src2);
-	    brw_set_saturate(p, 0);
-	}
-    }
-}
-
-static void emit_sop(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst, GLuint cond)
-{
-    struct brw_compile *p = &c->func;
-    GLuint mask = inst->DstReg.WriteMask;
-    struct brw_reg dst, src0, src1;
-    int i;
-
-    for (i = 0; i < 4; i++) {
-	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i);
-	    src0 = get_src_reg(c, inst, 0, i);
-	    src1 = get_src_reg_imm(c, inst, 1, i);
-	    brw_push_insn_state(p);
-	    brw_CMP(p, brw_null_reg(), cond, src0, src1);
-	    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-	    brw_MOV(p, dst, brw_imm_f(0.0));
-	    brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
-	    brw_MOV(p, dst, brw_imm_f(1.0));
-	    brw_pop_insn_state(p);
-	}
-    }
-}
-
-static void emit_slt(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
-{
-    emit_sop(c, inst, BRW_CONDITIONAL_L);
-}
-
-static void emit_sle(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
-{
-    emit_sop(c, inst, BRW_CONDITIONAL_LE);
-}
-
-static void emit_sgt(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
-{
-    emit_sop(c, inst, BRW_CONDITIONAL_G);
-}
-
-static void emit_sge(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
-{
-    emit_sop(c, inst, BRW_CONDITIONAL_GE);
-}
-
-static void emit_seq(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
-{
-    emit_sop(c, inst, BRW_CONDITIONAL_EQ);
-}
-
-static void emit_sne(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
-{
-    emit_sop(c, inst, BRW_CONDITIONAL_NEQ);
-}
-
 static INLINE struct brw_reg high_words( struct brw_reg reg )
 {
     return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_W ), 1 ),
@@ -2525,196 +1800,6 @@ static void emit_noise4( struct brw_wm_compile *c,
     
     release_tmps( c, mark );
 }
-    
-static void emit_wpos_xy(struct brw_wm_compile *c,
-                         const struct prog_instruction *inst)
-{
-    struct brw_compile *p = &c->func;
-    GLuint mask = inst->DstReg.WriteMask;
-    struct brw_reg src0[2], dst[2];
-
-    dst[0] = get_dst_reg(c, inst, 0);
-    dst[1] = get_dst_reg(c, inst, 1);
-
-    src0[0] = get_src_reg(c, inst, 0, 0);
-    src0[1] = get_src_reg(c, inst, 0, 1);
-
-    /* Calculate the pixel offset from window bottom left into destination
-     * X and Y channels.
-     */
-    if (mask & WRITEMASK_X) {
-	/* X' = X - origin_x */
-	brw_ADD(p,
-		dst[0],
-		retype(src0[0], BRW_REGISTER_TYPE_W),
-		brw_imm_d(0 - c->key.origin_x));
-    }
-
-    if (mask & WRITEMASK_Y) {
-	/* Y' = height - (Y - origin_y) = height + origin_y - Y */
-	brw_ADD(p,
-		dst[1],
-		negate(retype(src0[1], BRW_REGISTER_TYPE_W)),
-		brw_imm_d(c->key.origin_y + c->key.drawable_height - 1));
-    }
-}
-
-/* TODO
-   BIAS on SIMD8 not working yet...
- */	
-static void emit_txb(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
-{
-    struct brw_compile *p = &c->func;
-    struct brw_reg dst[4], src[4], payload_reg;
-    /* Note: TexSrcUnit was already looked up through SamplerTextures[] */
-    const GLuint unit = inst->TexSrcUnit;
-    GLuint i;
-    GLuint msg_type;
-
-    assert(unit < BRW_MAX_TEX_UNIT);
-
-    payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
-
-    for (i = 0; i < 4; i++) 
-	dst[i] = get_dst_reg(c, inst, i);
-    for (i = 0; i < 4; i++)
-	src[i] = get_src_reg(c, inst, 0, i);
-
-    switch (inst->TexSrcTarget) {
-	case TEXTURE_1D_INDEX:
-	    brw_MOV(p, brw_message_reg(2), src[0]);         /* s coord */
-	    brw_MOV(p, brw_message_reg(3), brw_imm_f(0));   /* t coord */
-	    brw_MOV(p, brw_message_reg(4), brw_imm_f(0));   /* r coord */
-	    break;
-	case TEXTURE_2D_INDEX:
-	case TEXTURE_RECT_INDEX:
-	    brw_MOV(p, brw_message_reg(2), src[0]);
-	    brw_MOV(p, brw_message_reg(3), src[1]);
-	    brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
-	    break;
-	case TEXTURE_3D_INDEX:
-	case TEXTURE_CUBE_INDEX:
-	    brw_MOV(p, brw_message_reg(2), src[0]);
-	    brw_MOV(p, brw_message_reg(3), src[1]);
-	    brw_MOV(p, brw_message_reg(4), src[2]);
-	    break;
-	default:
-            /* invalid target */
-            abort();
-    }
-    brw_MOV(p, brw_message_reg(5), src[3]);          /* bias */
-    brw_MOV(p, brw_message_reg(6), brw_imm_f(0));    /* ref (unused?) */
-
-    if (BRW_IS_IGDNG(p->brw)) {
-        msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_IGDNG;
-    } else {
-        /* Does it work well on SIMD8? */
-        msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
-    }
-
-    brw_SAMPLE(p,
-               retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW),  /* dest */
-               1,                                           /* msg_reg_nr */
-               retype(payload_reg, BRW_REGISTER_TYPE_UW),   /* src0 */
-               SURF_INDEX_TEXTURE(unit),
-               unit,                                        /* sampler */
-               inst->DstReg.WriteMask,                      /* writemask */
-               msg_type,                                    /* msg_type */
-               4,                                           /* response_length */
-               4,                                           /* msg_length */
-               0,                                           /* eot */
-               1,
-               BRW_SAMPLER_SIMD_MODE_SIMD8);	
-}
-
-
-static void emit_tex(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
-{
-    struct brw_compile *p = &c->func;
-    struct brw_reg dst[4], src[4], payload_reg;
-    /* Note: TexSrcUnit was already looked up through SamplerTextures[] */
-    const GLuint unit = inst->TexSrcUnit;
-    GLuint msg_len;
-    GLuint i, nr;
-    GLuint emit;
-    GLboolean shadow = (c->key.shadowtex_mask & (1<<unit)) ? 1 : 0;
-    GLuint msg_type;
-
-    assert(unit < BRW_MAX_TEX_UNIT);
-
-    payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
-
-    for (i = 0; i < 4; i++) 
-	dst[i] = get_dst_reg(c, inst, i);
-    for (i = 0; i < 4; i++)
-	src[i] = get_src_reg(c, inst, 0, i);
-
-    switch (inst->TexSrcTarget) {
-	case TEXTURE_1D_INDEX:
-	    emit = WRITEMASK_X;
-	    nr = 1;
-	    break;
-	case TEXTURE_2D_INDEX:
-	case TEXTURE_RECT_INDEX:
-	    emit = WRITEMASK_XY;
-	    nr = 2;
-	    break;
-	case TEXTURE_3D_INDEX:
-	case TEXTURE_CUBE_INDEX:
-	    emit = WRITEMASK_XYZ;
-	    nr = 3;
-	    break;
-	default:
-           /* invalid target */
-           abort();
-    }
-    msg_len = 1;
-
-    /* move/load S, T, R coords */
-    for (i = 0; i < nr; i++) {
-	static const GLuint swz[4] = {0,1,2,2};
-	if (emit & (1<<i))
-	    brw_MOV(p, brw_message_reg(msg_len+1), src[swz[i]]);
-	else
-	    brw_MOV(p, brw_message_reg(msg_len+1), brw_imm_f(0));
-	msg_len += 1;
-    }
-
-    if (shadow) {
-       brw_MOV(p, brw_message_reg(5), brw_imm_f(0));  /* lod / bias */
-       brw_MOV(p, brw_message_reg(6), src[2]);        /* ref value / R coord */
-    }
-
-    if (BRW_IS_IGDNG(p->brw)) {
-        if (shadow)
-            msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_COMPARE_IGDNG;
-        else
-            msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_IGDNG;
-    } else {
-        /* Does it work for shadow on SIMD8 ? */
-        msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
-    }
-    
-    brw_SAMPLE(p,
-               retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW), /* dest */
-               1,                                          /* msg_reg_nr */
-               retype(payload_reg, BRW_REGISTER_TYPE_UW),  /* src0 */
-               SURF_INDEX_TEXTURE(unit),
-               unit,                                       /* sampler */
-               inst->DstReg.WriteMask,                     /* writemask */
-               msg_type,                                   /* msg_type */
-               4,                                          /* response_length */
-               shadow ? 6 : 4,                             /* msg_length */
-               0,                                          /* eot */
-               1,
-               BRW_SAMPLER_SIMD_MODE_SIMD8);	
-
-    if (shadow)
-	brw_MOV(p, dst[3], brw_imm_f(1.0));
-}
-
 
 /**
  * Resolve subroutine calls after code emit is done.
@@ -2741,6 +1826,7 @@ get_argument_regs(struct brw_wm_compile *c,
 
 static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
 {
+   struct intel_context *intel = &brw->intel;
 #define MAX_IF_DEPTH 32
 #define MAX_LOOP_DEPTH 32
     struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH];
@@ -2771,137 +1857,153 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
         if (c->fp->use_const_buffer)
            fetch_constants(c, inst);
 
-	if (inst->CondUpdate)
-	    brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
-	else
-	    brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
+	if (inst->Opcode != OPCODE_ARL) {
+	   for (j = 0; j < 4; j++) {
+	      if (inst->DstReg.WriteMask & (1 << j))
+		 dst[j] = get_dst_reg(c, inst, j);
+	      else
+		 dst[j] = brw_null_reg();
+	   }
+	}
+	for (j = 0; j < brw_wm_nr_args(inst->Opcode); j++)
+	    get_argument_regs(c, inst, j, args[j], WRITEMASK_XYZW);
 
 	dst_flags = inst->DstReg.WriteMask;
 	if (inst->SaturateMode == SATURATE_ZERO_ONE)
 	    dst_flags |= SATURATE;
 
+	if (inst->CondUpdate)
+	    brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
+	else
+	    brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
+
 	switch (inst->Opcode) {
 	    case WM_PIXELXY:
-		emit_pixel_xy(c, inst);
+		emit_pixel_xy(c, dst, dst_flags);
 		break;
 	    case WM_DELTAXY: 
-		emit_delta_xy(c, inst);
+		emit_delta_xy(p, dst, dst_flags, args[0]);
 		break;
 	    case WM_PIXELW:
-		emit_pixel_w(c, inst);
+		emit_pixel_w(c, dst, dst_flags, args[0], args[1]);
 		break;	
 	    case WM_LINTERP:
-		emit_linterp(c, inst);
+		emit_linterp(p, dst, dst_flags, args[0], args[1]);
 		break;
 	    case WM_PINTERP:
-		emit_pinterp(c, inst);
+		emit_pinterp(p, dst, dst_flags, args[0], args[1], args[2]);
 		break;
 	    case WM_CINTERP:
-		emit_cinterp(c, inst);
+		emit_cinterp(p, dst, dst_flags, args[0]);
 		break;
 	    case WM_WPOSXY:
-		emit_wpos_xy(c, inst);
+		emit_wpos_xy(c, dst, dst_flags, args[0]);
 		break;
 	    case WM_FB_WRITE:
-		emit_fb_write(c, inst);
+		emit_fb_write(c, args[0], args[1], args[2],
+			      INST_AUX_GET_TARGET(inst->Aux),
+			      inst->Aux & INST_AUX_EOT);
 		break;
 	    case WM_FRONTFACING:
-		emit_frontfacing(c, inst);
+		emit_frontfacing(p, dst, dst_flags);
 		break;
 	    case OPCODE_ADD:
-		emit_add(c, inst);
+		emit_alu2(p, brw_ADD, dst, dst_flags, args[0], args[1]);
 		break;
 	    case OPCODE_ARL:
 		emit_arl(c, inst);
 		break;
 	    case OPCODE_FRC:
-		emit_frc(c, inst);
+		emit_alu1(p, brw_FRC, dst, dst_flags, args[0]);
 		break;
 	    case OPCODE_FLR:
-		emit_flr(c, inst);
+		emit_alu1(p, brw_RNDD, dst, dst_flags, args[0]);
 		break;
 	    case OPCODE_LRP:
-		emit_lrp(c, inst);
+		unalias3(c, emit_lrp,
+			 dst, dst_flags, args[0], args[1], args[2]);
 		break;
 	    case OPCODE_TRUNC:
-		emit_trunc(c, inst);
+		emit_alu1(p, brw_RNDZ, dst, dst_flags, args[0]);
 		break;
 	    case OPCODE_MOV:
 	    case OPCODE_SWZ:
-		emit_mov(c, inst);
+		emit_alu1(p, brw_MOV, dst, dst_flags, args[0]);
 		break;
 	    case OPCODE_DP3:
-		emit_dp3(c, inst);
+		emit_dp3(p, dst, dst_flags, args[0], args[1]);
 		break;
 	    case OPCODE_DP4:
-		emit_dp4(c, inst);
+		emit_dp4(p, dst, dst_flags, args[0], args[1]);
 		break;
 	    case OPCODE_XPD:
-		emit_xpd(c, inst);
+		emit_xpd(p, dst, dst_flags, args[0], args[1]);
 		break;
 	    case OPCODE_DPH:
-		emit_dph(c, inst);
+		emit_dph(p, dst, dst_flags, args[0], args[1]);
 		break;
 	    case OPCODE_RCP:
-		emit_rcp(c, inst);
+		emit_math1(c, BRW_MATH_FUNCTION_INV, dst, dst_flags, args[0]);
 		break;
 	    case OPCODE_RSQ:
-		emit_rsq(c, inst);
+		emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, dst_flags, args[0]);
 		break;
 	    case OPCODE_SIN:
-		emit_sin(c, inst);
+		emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, dst_flags, args[0]);
 		break;
 	    case OPCODE_COS:
-		emit_cos(c, inst);
+		emit_math1(c, BRW_MATH_FUNCTION_COS, dst, dst_flags, args[0]);
 		break;
 	    case OPCODE_EX2:
-		emit_ex2(c, inst);
+		emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, dst_flags, args[0]);
 		break;
 	    case OPCODE_LG2:
-		emit_lg2(c, inst);
+		emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, dst_flags, args[0]);
 		break;
 	    case OPCODE_MIN:	
+		unalias2(c, emit_min, dst, dst_flags, args[0], args[1]);
+		break;
 	    case OPCODE_MAX:	
-		emit_min_max(c, inst);
+		unalias2(c, emit_max, dst, dst_flags, args[0], args[1]);
 		break;
 	    case OPCODE_DDX:
 	    case OPCODE_DDY:
-		for (j = 0; j < 4; j++) {
-		    if (inst->DstReg.WriteMask & (1 << j))
-			dst[j] = get_dst_reg(c, inst, j);
-		    else
-			dst[j] = brw_null_reg();
-		}
-		get_argument_regs(c, inst, 0, args[0], WRITEMASK_XYZW);
 		emit_ddxy(p, dst, dst_flags, (inst->Opcode == OPCODE_DDX),
 			  args[0]);
                 break;
 	    case OPCODE_SLT:
-		emit_slt(c, inst);
+		emit_sop(p, dst, dst_flags,
+			 BRW_CONDITIONAL_L, args[0], args[1]);
 		break;
 	    case OPCODE_SLE:
-		emit_sle(c, inst);
+		emit_sop(p, dst, dst_flags,
+			 BRW_CONDITIONAL_LE, args[0], args[1]);
 		break;
 	    case OPCODE_SGT:
-		emit_sgt(c, inst);
+		emit_sop(p, dst, dst_flags,
+			 BRW_CONDITIONAL_G, args[0], args[1]);
 		break;
 	    case OPCODE_SGE:
-		emit_sge(c, inst);
+		emit_sop(p, dst, dst_flags,
+			 BRW_CONDITIONAL_GE, args[0], args[1]);
 		break;
 	    case OPCODE_SEQ:
-		emit_seq(c, inst);
+		emit_sop(p, dst, dst_flags,
+			 BRW_CONDITIONAL_EQ, args[0], args[1]);
 		break;
 	    case OPCODE_SNE:
-		emit_sne(c, inst);
+		emit_sop(p, dst, dst_flags,
+			 BRW_CONDITIONAL_NEQ, args[0], args[1]);
 		break;
 	    case OPCODE_MUL:
-		emit_mul(c, inst);
+		emit_alu2(p, brw_MUL, dst, dst_flags, args[0], args[1]);
 		break;
 	    case OPCODE_POW:
-		emit_pow(c, inst);
+		emit_math2(c, BRW_MATH_FUNCTION_POW,
+			   dst, dst_flags, args[0], args[1]);
 		break;
 	    case OPCODE_MAD:
-		emit_mad(c, inst);
+		emit_mad(p, dst, dst_flags, args[0], args[1], args[2]);
 		break;
 	    case OPCODE_NOISE1:
 		emit_noise1(c, inst);
@@ -2916,10 +2018,19 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
 		emit_noise4(c, inst);
 		break;
 	    case OPCODE_TEX:
-		emit_tex(c, inst);
+		emit_tex(c, dst, dst_flags, args[0],
+			 get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH,
+				 0, 1, 0, 0),
+			 inst->TexSrcTarget,
+			 inst->TexSrcUnit,
+			 (c->key.shadowtex_mask & (1 << inst->TexSrcUnit)) != 0);
 		break;
 	    case OPCODE_TXB:
-		emit_txb(c, inst);
+		emit_txb(c, dst, dst_flags, args[0],
+			 get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH,
+				 0, 1, 0, 0),
+			 inst->TexSrcTarget,
+			 c->fp->program.Base.SamplerUnits[inst->TexSrcUnit]);
 		break;
 	    case OPCODE_KIL_NV:
 		emit_kil(c);
@@ -2929,6 +2040,7 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
 		if_inst[if_depth++] = brw_IF(p, BRW_EXECUTE_8);
 		break;
 	    case OPCODE_ELSE:
+		assert(if_depth > 0);
 		if_inst[if_depth-1]  = brw_ELSE(p, if_inst[if_depth-1]);
 		break;
 	    case OPCODE_ENDIF:
@@ -2982,9 +2094,10 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
                   struct brw_instruction *inst0, *inst1;
                   GLuint br = 1;
 
-                  if (BRW_IS_IGDNG(brw))
+                  if (intel->is_ironlake)
                      br = 2;
- 
+
+		  assert(loop_depth > 0);
                   loop_depth--;
                   inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
                   /* patch all the BREAK/CONT instructions from last BGNLOOP */
diff --git a/src/mesa/drivers/dri/i965/brw_wm_pass0.c b/src/mesa/drivers/dri/i965/brw_wm_pass0.c
index 602b1351ef..ff4c082d5e 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_pass0.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_pass0.c
@@ -42,12 +42,14 @@
 static struct brw_wm_ref *get_ref( struct brw_wm_compile *c )
 {
    assert(c->nr_refs < BRW_WM_MAX_REF);
+   memset(&c->refs[c->nr_refs], 0, sizeof(*c->refs));
    return &c->refs[c->nr_refs++];
 }
 
 static struct brw_wm_value *get_value( struct brw_wm_compile *c)
 {
    assert(c->nr_refs < BRW_WM_MAX_VREG);
+   memset(&c->vreg[c->nr_vreg], 0, sizeof(*c->vreg));
    return &c->vreg[c->nr_vreg++];
 }
 
@@ -55,6 +57,7 @@ static struct brw_wm_value *get_value( struct brw_wm_compile *c)
 static struct brw_wm_instruction *get_instruction( struct brw_wm_compile *c )
 {
    assert(c->nr_insns < BRW_WM_MAX_INSN);
+   memset(&c->instruction[c->nr_insns], 0, sizeof(*c->instruction));
    return &c->instruction[c->nr_insns++];
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_wm_pass2.c b/src/mesa/drivers/dri/i965/brw_wm_pass2.c
index 6faea018fb..31303febf0 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_pass2.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_pass2.c
@@ -82,8 +82,8 @@ static void init_registers( struct brw_wm_compile *c )
    for (j = 0; j < c->nr_creg; j++) 
       prealloc_reg(c, &c->creg[j], i++);
 
-   for (j = 0; j < FRAG_ATTRIB_MAX; j++) {
-      if (c->key.vp_outputs_written & (1<<j)) {
+   for (j = 0; j < VERT_RESULT_MAX; j++) {
+      if (c->key.vp_outputs_written & BITFIELD64_BIT(j)) {
 	 int fp_index;
 
 	 if (j >= VERT_RESULT_VAR0)
diff --git a/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c b/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c
index 0acb027431..ad267a4e6a 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c
@@ -73,8 +73,8 @@ static dri_bo *upload_default_color( struct brw_context *brw,
 
    COPY_4V(sdc.color, color); 
    
-   return brw_cache_data( &brw->cache, BRW_SAMPLER_DEFAULT_COLOR, &sdc,
-			  NULL, 0 );
+   return brw_cache_data(&brw->cache, BRW_SAMPLER_DEFAULT_COLOR,
+			 &sdc, sizeof(sdc), NULL, 0);
 }
 
 
@@ -262,10 +262,10 @@ brw_wm_sampler_populate_key(struct brw_context *brw,
 	 dri_bo_unreference(brw->wm.sdc_bo[unit]);
 	 if (firstImage->_BaseFormat == GL_DEPTH_COMPONENT) {
 	    float bordercolor[4] = {
-	       texObj->BorderColor[0],
-	       texObj->BorderColor[0],
-	       texObj->BorderColor[0],
-	       texObj->BorderColor[0]
+	       texObj->BorderColor.f[0],
+	       texObj->BorderColor.f[0],
+	       texObj->BorderColor.f[0],
+	       texObj->BorderColor.f[0]
 	    };
 	    /* GL specs that border color for depth textures is taken from the
 	     * R channel, while the hardware uses A.  Spam R into all the
@@ -274,7 +274,7 @@ brw_wm_sampler_populate_key(struct brw_context *brw,
 	    brw->wm.sdc_bo[unit] = upload_default_color(brw, bordercolor);
 	 } else {
 	    brw->wm.sdc_bo[unit] = upload_default_color(brw,
-							texObj->BorderColor);
+							texObj->BorderColor.f);
 	 }
 	 key->sampler_count = unit + 1;
       }
diff --git a/src/mesa/drivers/dri/i965/brw_wm_state.c b/src/mesa/drivers/dri/i965/brw_wm_state.c
index 361f91292b..d3373ea79e 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_state.c
@@ -49,8 +49,6 @@ struct brw_wm_unit_key {
    unsigned int curbe_offset;
    unsigned int urb_size;
 
-   unsigned int max_threads;
-
    unsigned int nr_surfaces, sampler_count;
    GLboolean uses_depth, computes_depth, uses_kill, is_glsl;
    GLboolean polygon_stipple, stats_wm, line_stipple, offset_enable;
@@ -67,18 +65,6 @@ wm_unit_populate_key(struct brw_context *brw, struct brw_wm_unit_key *key)
 
    memset(key, 0, sizeof(*key));
 
-   if (INTEL_DEBUG & DEBUG_SINGLE_THREAD)
-      key->max_threads = 1;
-   else {
-      /* WM maximum threads is number of EUs times number of threads per EU. */
-      if (BRW_IS_IGDNG(brw))
-         key->max_threads = 12 * 6;
-      else if (BRW_IS_G4X(brw))
-	 key->max_threads = 10 * 5;
-      else
-	 key->max_threads = 8 * 4;
-   }
-
    /* CACHE_NEW_WM_PROG */
    key->total_grf = brw->wm.prog_data->total_grf;
    key->urb_entry_read_length = brw->wm.prog_data->urb_read_length;
@@ -106,7 +92,7 @@ wm_unit_populate_key(struct brw_context *brw, struct brw_wm_unit_key *key)
 
    /* as far as we can tell */
    key->computes_depth =
-      (fp->Base.OutputsWritten & (1 << FRAG_RESULT_DEPTH)) != 0;
+      (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) != 0;
    /* BRW_NEW_DEPTH_BUFFER
     * Override for NULL depthbuffer case, required by the Pixel Shader Computed
     * Depth field.
@@ -140,6 +126,7 @@ static dri_bo *
 wm_unit_create_from_key(struct brw_context *brw, struct brw_wm_unit_key *key,
 			dri_bo **reloc_bufs)
 {
+   struct intel_context *intel = &brw->intel;
    struct brw_wm_unit_state wm;
    dri_bo *bo;
 
@@ -150,7 +137,7 @@ wm_unit_create_from_key(struct brw_context *brw, struct brw_wm_unit_key *key,
    wm.thread1.depth_coef_urb_read_offset = 1;
    wm.thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754;
 
-   if (BRW_IS_IGDNG(brw))
+   if (intel->is_ironlake)
       wm.thread1.binding_table_entry_count = 0; /* hardware requirement */
    else
       wm.thread1.binding_table_entry_count = key->nr_surfaces;
@@ -170,7 +157,7 @@ wm_unit_create_from_key(struct brw_context *brw, struct brw_wm_unit_key *key,
    wm.thread3.const_urb_entry_read_length = key->curb_entry_read_length;
    wm.thread3.const_urb_entry_read_offset = key->curbe_offset * 2;
 
-   if (BRW_IS_IGDNG(brw)) 
+   if (intel->is_ironlake)
       wm.wm4.sampler_count = 0; /* hardware requirement */
    else
       wm.wm4.sampler_count = (key->sampler_count + 1) / 4;
@@ -191,7 +178,7 @@ wm_unit_create_from_key(struct brw_context *brw, struct brw_wm_unit_key *key,
    else
       wm.wm5.enable_16_pix = 1;
 
-   wm.wm5.max_threads = key->max_threads - 1;
+   wm.wm5.max_threads = brw->wm_max_threads - 1;
    wm.wm5.thread_dispatch_enable = 1;	/* AKA: color_write */
    wm.wm5.legacy_line_rast = 0;
    wm.wm5.legacy_global_depth_bias = 0;
@@ -268,7 +255,7 @@ static void upload_wm_unit( struct brw_context *brw )
     */
    assert(key.total_scratch <= 12 * 1024);
    if (key.total_scratch) {
-      GLuint total = key.total_scratch * key.max_threads;
+      GLuint total = key.total_scratch * brw->wm_max_threads;
 
       if (brw->wm.scratch_bo && total > brw->wm.scratch_bo->size) {
 	 dri_bo_unreference(brw->wm.scratch_bo);
diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
index 0bf735c0f2..f26cfabb7d 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
@@ -86,25 +86,22 @@ static GLuint translate_tex_format( gl_format mesa_format,
    case MESA_FORMAT_AL88:
       return BRW_SURFACEFORMAT_L8A8_UNORM;
 
+   case MESA_FORMAT_AL1616:
+      return BRW_SURFACEFORMAT_L16A16_UNORM;
+
    case MESA_FORMAT_RGB888:
       assert(0);		/* not supported for sampling */
       return BRW_SURFACEFORMAT_R8G8B8_UNORM;      
 
    case MESA_FORMAT_ARGB8888:
-      if (internal_format == GL_RGB)
-	 return BRW_SURFACEFORMAT_B8G8R8X8_UNORM;
-      else
-	 return BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
+      return BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
 
    case MESA_FORMAT_XRGB8888:
       return BRW_SURFACEFORMAT_B8G8R8X8_UNORM;
 
    case MESA_FORMAT_RGBA8888_REV:
       _mesa_problem(NULL, "unexpected format in i965:translate_tex_format()");
-      if (internal_format == GL_RGB)
-	 return BRW_SURFACEFORMAT_R8G8B8X8_UNORM;
-      else
-	 return BRW_SURFACEFORMAT_R8G8B8A8_UNORM;
+      return BRW_SURFACEFORMAT_R8G8B8A8_UNORM;
 
    case MESA_FORMAT_RGB565:
       return BRW_SURFACEFORMAT_B5G6R5_UNORM;
@@ -358,7 +355,10 @@ brw_create_constant_surface( struct brw_context *brw,
 			 NULL, NULL);
 
    if (key->bo) {
-      /* Emit relocation to surface contents */
+      /* Emit relocation to surface contents.  Section 5.1.1 of the gen4
+       * bspec ("Data Cache") says that the data cache does not exist as
+       * a separate cache and is just the sampler cache.
+       */
       dri_bo_emit_reloc(bo,
 			I915_GEM_DOMAIN_SAMPLER, 0,
 			0,
@@ -511,7 +511,8 @@ brw_update_renderbuffer_surface(struct brw_context *brw,
 				struct gl_renderbuffer *rb,
 				unsigned int unit)
 {
-   GLcontext *ctx = &brw->intel.ctx;
+   struct intel_context *intel = &brw->intel;;
+   GLcontext *ctx = &intel->ctx;
    dri_bo *region_bo = NULL;
    struct intel_renderbuffer *irb = intel_renderbuffer(rb);
    struct intel_region *region = irb ? irb->region : NULL;
@@ -522,7 +523,8 @@ brw_update_renderbuffer_surface(struct brw_context *brw,
       GLubyte color_mask[4];
       GLboolean color_blend;
       uint32_t tiling;
-      uint32_t draw_offset;
+      uint32_t draw_x;
+      uint32_t draw_y;
    } key;
 
    memset(&key, 0, sizeof(key));
@@ -531,12 +533,16 @@ brw_update_renderbuffer_surface(struct brw_context *brw,
       region_bo = region->buffer;
 
       key.surface_type = BRW_SURFACE_2D;
-      switch (irb->texformat) {
+      switch (irb->Base.Format) {
+      /* XRGB and ARGB are treated the same here because the chips in this
+       * family cannot render to XRGB targets.  This means that we have to
+       * mask writes to alpha (as glColorMask does) and reconfigure the alpha
+       * blending hardware to use GL_ONE (or GL_ZERO) for cases where
+       * GL_DST_ALPHA (or GL_ONE_MINUS_DST_ALPHA) is used.
+       */
       case MESA_FORMAT_ARGB8888:
-	 key.surface_format = BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
-	 break;
       case MESA_FORMAT_XRGB8888:
-	 key.surface_format = BRW_SURFACEFORMAT_B8G8R8X8_UNORM;
+	 key.surface_format = BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
 	 break;
       case MESA_FORMAT_RGB565:
 	 key.surface_format = BRW_SURFACEFORMAT_B5G6R5_UNORM;
@@ -548,7 +554,7 @@ brw_update_renderbuffer_surface(struct brw_context *brw,
 	 key.surface_format = BRW_SURFACEFORMAT_B4G4R4A4_UNORM;
 	 break;
       default:
-	 _mesa_problem(ctx, "Bad renderbuffer format: %d\n", irb->texformat);
+	 _mesa_problem(ctx, "Bad renderbuffer format: %d\n", irb->Base.Format);
       }
       key.tiling = region->tiling;
       if (brw->intel.intelScreen->driScrnPriv->dri2.enabled) {
@@ -560,7 +566,8 @@ brw_update_renderbuffer_surface(struct brw_context *brw,
       }
       key.pitch = region->pitch;
       key.cpp = region->cpp;
-      key.draw_offset = region->draw_offset; /* cur 3d or cube face offset */
+      key.draw_x = region->draw_x;
+      key.draw_y = region->draw_y;
    } else {
       key.surface_type = BRW_SURFACE_NULL;
       key.surface_format = BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
@@ -568,11 +575,19 @@ brw_update_renderbuffer_surface(struct brw_context *brw,
       key.width = 1;
       key.height = 1;
       key.cpp = 4;
-      key.draw_offset = 0;
+      key.draw_x = 0;
+      key.draw_y = 0;
    }
    /* _NEW_COLOR */
-   memcpy(key.color_mask, ctx->Color.ColorMask,
+   memcpy(key.color_mask, ctx->Color.ColorMask[0],
 	  sizeof(key.color_mask));
+
+   /* As mentioned above, disable writes to the alpha component when the
+    * renderbuffer is XRGB.
+    */
+   if (ctx->DrawBuffer->Visual.alphaBits == 0)
+     key.color_mask[3] = GL_FALSE;
+
    key.color_blend = (!ctx->Color._LogicOpEnabled &&
 		      ctx->Color.BlendEnabled);
 
@@ -591,25 +606,32 @@ brw_update_renderbuffer_surface(struct brw_context *brw,
       surf.ss0.surface_format = key.surface_format;
       surf.ss0.surface_type = key.surface_type;
       if (key.tiling == I915_TILING_NONE) {
-	 surf.ss1.base_addr = key.draw_offset;
+	 surf.ss1.base_addr = (key.draw_x + key.draw_y * key.pitch) * key.cpp;
       } else {
-	 uint32_t tile_offset = key.draw_offset % 4096;
-
-	 surf.ss1.base_addr = key.draw_offset - tile_offset;
-
-	 assert(BRW_IS_G4X(brw) || tile_offset == 0);
-	 if (BRW_IS_G4X(brw)) {
-	    if (key.tiling == I915_TILING_X) {
-	       /* Note that the low bits of these fields are missing, so
-		* there's the possibility of getting in trouble.
-		*/
-	       surf.ss5.x_offset = (tile_offset % 512) / key.cpp / 4;
-	       surf.ss5.y_offset = tile_offset / 512 / 2;
-	    } else {
-	       surf.ss5.x_offset = (tile_offset % 128) / key.cpp / 4;
-	       surf.ss5.y_offset = tile_offset / 128 / 2;
-	    }
+	 uint32_t tile_base, tile_x, tile_y;
+	 uint32_t pitch = key.pitch * key.cpp;
+
+	 if (key.tiling == I915_TILING_X) {
+	    tile_x = key.draw_x % (512 / key.cpp);
+	    tile_y = key.draw_y % 8;
+	    tile_base = ((key.draw_y / 8) * (8 * pitch));
+	    tile_base += (key.draw_x - tile_x) / (512 / key.cpp) * 4096;
+	 } else {
+	    /* Y */
+	    tile_x = key.draw_x % (128 / key.cpp);
+	    tile_y = key.draw_y % 32;
+	    tile_base = ((key.draw_y / 32) * (32 * pitch));
+	    tile_base += (key.draw_x - tile_x) / (128 / key.cpp) * 4096;
 	 }
+	 assert(intel->is_g4x || (tile_x == 0 && tile_y == 0));
+	 assert(tile_x % 4 == 0);
+	 assert(tile_y % 2 == 0);
+	 /* Note that the low bits of these fields are missing, so
+	  * there's the possibility of getting in trouble.
+	  */
+	 surf.ss1.base_addr = tile_base;
+	 surf.ss5.x_offset = tile_x / 4;
+	 surf.ss5.y_offset = tile_y / 2;
       }
       if (region_bo != NULL)
 	 surf.ss1.base_addr += region_bo->offset; /* reloc */
diff --git a/src/mesa/drivers/dri/i965/intel_swapbuffers.c b/src/mesa/drivers/dri/i965/intel_swapbuffers.c
deleted file mode 120000
index 148d5215aa..0000000000
--- a/src/mesa/drivers/dri/i965/intel_swapbuffers.c
+++ /dev/null
@@ -1 +0,0 @@
-../intel/intel_swapbuffers.c
-\ No newline at end of file
diff --git a/src/mesa/drivers/dri/intel/intel_batchbuffer.c b/src/mesa/drivers/dri/intel/intel_batchbuffer.c
index e94b8368cd..3a4b21a844 100644
--- a/src/mesa/drivers/dri/intel/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/intel/intel_batchbuffer.c
@@ -80,7 +80,7 @@ intel_batchbuffer_reset(struct intel_batchbuffer *batch)
       batch->buf = NULL;
    }
 
-   if (!batch->buffer && intel->ttm == GL_TRUE)
+   if (!batch->buffer)
       batch->buffer = malloc (intel->maxBatchSize);
 
    batch->buf = dri_bo_alloc(intel->bufmgr, "batchbuffer",
@@ -94,7 +94,7 @@ intel_batchbuffer_reset(struct intel_batchbuffer *batch)
    batch->size = intel->maxBatchSize;
    batch->ptr = batch->map;
    batch->dirty_state = ~0;
-   batch->cliprect_mode = IGNORE_CLIPRECTS;
+   batch->reserved_space = BATCH_RESERVED; /* keep room for end-of-batch commands */
 }
 
 struct intel_batchbuffer *
@@ -129,13 +128,10 @@ intel_batchbuffer_free(struct intel_batchbuffer *batch)
 /* TODO: Push this whole function into bufmgr.
  */
 static void
-do_flush_locked(struct intel_batchbuffer *batch,
-		GLuint used, GLboolean allow_unlock)
+do_flush_locked(struct intel_batchbuffer *batch, GLuint used)
 {
    struct intel_context *intel = batch->intel;
    int ret = 0;
-   unsigned int num_cliprects = 0;
-   struct drm_clip_rect *cliprects = NULL;
    int x_off = 0, y_off = 0;
 
    if (batch->buffer)
@@ -146,31 +142,7 @@ do_flush_locked(struct intel_batchbuffer *batch,
    batch->map = NULL;
    batch->ptr = NULL;
 
-
-   if (batch->cliprect_mode == LOOP_CLIPRECTS) {
-      intel_get_cliprects(intel, &cliprects, &num_cliprects, &x_off, &y_off);
-   }
-   /* Dispatch the batchbuffer, if it has some effect (nonzero cliprects).
-    * Can't short-circuit like this once we have hardware contexts, but we
-    * should always be in DRI2 mode by then anyway.
-    */
-   if ((batch->cliprect_mode != LOOP_CLIPRECTS ||
-	num_cliprects != 0) && !intel->no_hw) {
-      dri_bo_exec(batch->buf, used, cliprects, num_cliprects,
-		  (x_off & 0xffff) | (y_off << 16));
-   }
-
-   if (batch->cliprect_mode == LOOP_CLIPRECTS && num_cliprects == 0) {
-      if (allow_unlock) {
-	 /* If we are not doing any actual user-visible rendering,
-	  * do a sched_yield to keep the app from pegging the cpu while
-	  * achieving nothing.
-	  */
-         UNLOCK_HARDWARE(intel);
-         sched_yield();
-         LOCK_HARDWARE(intel);
-      }
-   }
+   ret = dri_bo_exec(batch->buf, used, NULL, 0, (x_off & 0xffff) | (y_off << 16));
 
    if (INTEL_DEBUG & DEBUG_BATCH) {
       dri_bo_map(batch->buf, GL_FALSE);
@@ -183,7 +155,6 @@ do_flush_locked(struct intel_batchbuffer *batch,
    }
 
    if (ret != 0) {
-      UNLOCK_HARDWARE(intel);
       exit(1);
    }
    intel->vtbl.new_batch(intel);
@@ -201,19 +172,17 @@ _intel_batchbuffer_flush(struct intel_batchbuffer *batch, const char *file,
       drm_intel_bo_reference(intel->first_post_swapbuffers_batch);
    }
 
-   if (used == 0) {
-      batch->cliprect_mode = IGNORE_CLIPRECTS;
+   if (used == 0)
       return;
-   }
 
    if (INTEL_DEBUG & DEBUG_BATCH)
       fprintf(stderr, "%s:%d: Batchbuffer flush with %db used\n", file, line,
 	      used);
 
+   batch->reserved_space = 0;
    /* Emit a flush if the bufmgr doesn't do it for us. */
-   if (intel->always_flush_cache || !intel->ttm) {
-      *(GLuint *) (batch->ptr) = intel->vtbl.flush_cmd();
-      batch->ptr += 4;
+   if (intel->always_flush_cache) {
+      intel_batchbuffer_emit_mi_flush(batch);
       used = batch->ptr - batch->map;
    }
 
@@ -244,12 +213,15 @@ _intel_batchbuffer_flush(struct intel_batchbuffer *batch, const char *file,
    if (intel->vtbl.finish_batch)
       intel->vtbl.finish_batch(intel);
 
+   /* Check that we didn't just wrap our batchbuffer at a bad time. */
+   assert(!intel->no_batch_wrap);
+
+   batch->reserved_space = BATCH_RESERVED;
+
    /* TODO: Just pass the relocation list and dma buffer up to the
     * kernel.
     */
-   LOCK_HARDWARE(intel);
-   do_flush_locked(batch, used, GL_FALSE);
-   UNLOCK_HARDWARE(intel);
+   do_flush_locked(batch, used);
 
    if (INTEL_DEBUG & DEBUG_SYNC) {
       fprintf(stderr, "waiting for idle\n");
@@ -291,11 +263,38 @@ intel_batchbuffer_emit_reloc(struct intel_batchbuffer *batch,
 
 void
 intel_batchbuffer_data(struct intel_batchbuffer *batch,
-                       const void *data, GLuint bytes,
-		       enum cliprect_mode cliprect_mode)
+                       const void *data, GLuint bytes)
 {
    assert((bytes & 3) == 0);
-   intel_batchbuffer_require_space(batch, bytes, cliprect_mode);
+   intel_batchbuffer_require_space(batch, bytes);
    __memcpy(batch->ptr, data, bytes);
    batch->ptr += bytes;
 }
+
+/* Emit a pipelined flush to either flush render and texture cache for
+ * reading from an FBO-drawn texture, or flush so that frontbuffer
+ * render appears on the screen in DRI1.
+ *
+ * This is also used for the always_flush_cache driconf debug option.
+ */
+void
+intel_batchbuffer_emit_mi_flush(struct intel_batchbuffer *batch)
+{
+   struct intel_context *intel = batch->intel;
+
+   if (intel->gen >= 4) {
+      BEGIN_BATCH(4);
+      OUT_BATCH(_3DSTATE_PIPE_CONTROL |
+		PIPE_CONTROL_INSTRUCTION_FLUSH |
+		PIPE_CONTROL_WRITE_FLUSH |
+		PIPE_CONTROL_NO_WRITE);
+      OUT_BATCH(0); /* write address */
+      OUT_BATCH(0); /* write data */
+      OUT_BATCH(0); /* write data */
+      ADVANCE_BATCH();
+   } else {
+      BEGIN_BATCH(1);
+      OUT_BATCH(MI_FLUSH);
+      ADVANCE_BATCH();
+   }
+}
diff --git a/src/mesa/drivers/dri/intel/intel_batchbuffer.h b/src/mesa/drivers/dri/intel/intel_batchbuffer.h
index d4899aab7f..b052b724d8 100644
--- a/src/mesa/drivers/dri/intel/intel_batchbuffer.h
+++ b/src/mesa/drivers/dri/intel/intel_batchbuffer.h
@@ -10,35 +10,6 @@
 #define BATCH_SZ 16384
 #define BATCH_RESERVED 16
 
-enum cliprect_mode {
-   /**
-    * Batchbuffer contents may be looped over per cliprect, but do not
-    * require it.
-    */
-   IGNORE_CLIPRECTS,
-   /**
-    * Batchbuffer contents require looping over per cliprect at batch submit
-    * time.
-    *
-    * This will be upgraded to NO_LOOP_CLIPRECTS when there's a single
-    * constant cliprect, as in DRI2 or FBO rendering.
-    */
-   LOOP_CLIPRECTS,
-   /**
-    * Batchbuffer contents contain drawing that should not be executed multiple
-    * times.
-    */
-   NO_LOOP_CLIPRECTS,
-   /**
-    * Batchbuffer contents contain drawing that already handles cliprects, such
-    * as 2D drawing to front/back/depth that doesn't respect DRAWING_RECTANGLE.
-    *
-    * Equivalent behavior to NO_LOOP_CLIPRECTS, but may not persist in batch
-    * outside of LOCK/UNLOCK.  This is upgraded to just NO_LOOP_CLIPRECTS when
-    * there's a constant cliprect, as in DRI2 or FBO rendering.
-    */
-   REFERENCES_CLIPRECTS
-};
 
 struct intel_batchbuffer
 {
@@ -51,8 +22,6 @@ struct intel_batchbuffer
    GLubyte *map;
    GLubyte *ptr;
 
-   enum cliprect_mode cliprect_mode;
-
    GLuint size;
 
    /** Tracking of BEGIN_BATCH()/OUT_BATCH()/ADVANCE_BATCH() debugging */
@@ -62,6 +31,7 @@ struct intel_batchbuffer
    } emit;
 
    GLuint dirty_state;
+   GLuint reserved_space;
 };
 
 struct intel_batchbuffer *intel_batchbuffer_alloc(struct intel_context
@@ -84,8 +54,7 @@ void intel_batchbuffer_reset(struct intel_batchbuffer *batch);
  * intel_buffer_dword() calls.
  */
 void intel_batchbuffer_data(struct intel_batchbuffer *batch,
-                            const void *data, GLuint bytes,
-			    enum cliprect_mode cliprect_mode);
+                            const void *data, GLuint bytes);
 
 void intel_batchbuffer_release_space(struct intel_batchbuffer *batch,
                                      GLuint bytes);
@@ -95,6 +64,7 @@ GLboolean intel_batchbuffer_emit_reloc(struct intel_batchbuffer *batch,
 				       uint32_t read_domains,
 				       uint32_t write_domain,
 				       uint32_t offset);
+void intel_batchbuffer_emit_mi_flush(struct intel_batchbuffer *batch);
 
 /* Inline functions - might actually be better off with these
  * non-inlined.  Certainly better off switching all command packets to
@@ -104,7 +74,7 @@ GLboolean intel_batchbuffer_emit_reloc(struct intel_batchbuffer *batch,
 static INLINE GLint
 intel_batchbuffer_space(struct intel_batchbuffer *batch)
 {
-   return (batch->size - BATCH_RESERVED) - (batch->ptr - batch->map);
+   return (batch->size - batch->reserved_space) - (batch->ptr - batch->map);
 }
 
 
@@ -119,36 +89,19 @@ intel_batchbuffer_emit_dword(struct intel_batchbuffer *batch, GLuint dword)
 
 static INLINE void
 intel_batchbuffer_require_space(struct intel_batchbuffer *batch,
-                                GLuint sz,
-				enum cliprect_mode cliprect_mode)
+                                GLuint sz)
 {
    assert(sz < batch->size - 8);
    if (intel_batchbuffer_space(batch) < sz)
       intel_batchbuffer_flush(batch);
-
-   if ((cliprect_mode == LOOP_CLIPRECTS ||
-	cliprect_mode == REFERENCES_CLIPRECTS) &&
-       batch->intel->constant_cliprect)
-      cliprect_mode = NO_LOOP_CLIPRECTS;
-
-   if (cliprect_mode != IGNORE_CLIPRECTS) {
-      if (batch->cliprect_mode == IGNORE_CLIPRECTS) {
-	 batch->cliprect_mode = cliprect_mode;
-      } else {
-	 if (batch->cliprect_mode != cliprect_mode) {
-	    intel_batchbuffer_flush(batch);
-	    batch->cliprect_mode = cliprect_mode;
-	 }
-      }
-   }
 }
 
 /* Here are the crusty old macros, to be removed:
  */
 #define BATCH_LOCALS
 
-#define BEGIN_BATCH(n, cliprect_mode) do {				\
-   intel_batchbuffer_require_space(intel->batch, (n)*4, cliprect_mode); \
+#define BEGIN_BATCH(n) do {				\
+   intel_batchbuffer_require_space(intel->batch, (n)*4); \
    assert(intel->batch->emit.start_ptr == NULL);			\
    intel->batch->emit.total = (n) * 4;					\
    intel->batch->emit.start_ptr = intel->batch->ptr;			\
@@ -173,12 +126,4 @@ intel_batchbuffer_require_space(struct intel_batchbuffer *batch,
    intel->batch->emit.start_ptr = NULL;					\
 } while(0)
 
-
-static INLINE void
-intel_batchbuffer_emit_mi_flush(struct intel_batchbuffer *batch)
-{
-   intel_batchbuffer_require_space(batch, 4, IGNORE_CLIPRECTS);
-   intel_batchbuffer_emit_dword(batch, MI_FLUSH);
-}
-
 #endif
diff --git a/src/mesa/drivers/dri/intel/intel_blit.c b/src/mesa/drivers/dri/intel/intel_blit.c
index 817223da41..55bee0084c 100644
--- a/src/mesa/drivers/dri/intel/intel_blit.c
+++ b/src/mesa/drivers/dri/intel/intel_blit.c
@@ -42,137 +42,6 @@
 
 #define FILE_DEBUG_FLAG DEBUG_BLIT
 
-/**
- * Copy the back color buffer to the front color buffer. 
- * Used for SwapBuffers().
- */
-void
-intelCopyBuffer(const __DRIdrawablePrivate * dPriv,
-                const drm_clip_rect_t * rect)
-{
-
-   struct intel_context *intel;
-   const intelScreenPrivate *intelScreen;
-
-   DBG("%s\n", __FUNCTION__);
-
-   assert(dPriv);
-
-   intel = intelScreenContext(dPriv->driScreenPriv->private);
-   if (!intel)
-      return;
-
-   intelScreen = intel->intelScreen;
-
-   /* The LOCK_HARDWARE is required for the cliprects.  Buffer offsets
-    * should work regardless.
-    */
-   LOCK_HARDWARE(intel);
-
-   if (dPriv && dPriv->numClipRects) {
-      struct intel_framebuffer *intel_fb = dPriv->driverPrivate;
-      struct intel_region *src, *dst;
-      int nbox = dPriv->numClipRects;
-      drm_clip_rect_t *pbox = dPriv->pClipRects;
-      int cpp;
-      int src_pitch, dst_pitch;
-      unsigned short src_x, src_y;
-      int BR13, CMD;
-      int i;
-      dri_bo *aper_array[3];
-
-      src = intel_get_rb_region(&intel_fb->Base, BUFFER_BACK_LEFT);
-      dst = intel_get_rb_region(&intel_fb->Base, BUFFER_FRONT_LEFT);
-
-      src_pitch = src->pitch * src->cpp;
-      dst_pitch = dst->pitch * dst->cpp;
-
-      cpp = src->cpp;
-
-      ASSERT(intel_fb);
-      ASSERT(intel_fb->Base.Name == 0);    /* Not a user-created FBO */
-      ASSERT(src);
-      ASSERT(dst);
-      ASSERT(src->cpp == dst->cpp);
-
-      if (cpp == 2) {
-	 BR13 = (0xCC << 16) | BR13_565;
-	 CMD = XY_SRC_COPY_BLT_CMD;
-      }
-      else {
-	 BR13 = (0xCC << 16) | BR13_8888;
-	 CMD = XY_SRC_COPY_BLT_CMD | XY_BLT_WRITE_ALPHA | XY_BLT_WRITE_RGB;
-      }
-
-      assert(src->tiling != I915_TILING_Y);
-      assert(dst->tiling != I915_TILING_Y);
-#ifndef I915
-      if (src->tiling != I915_TILING_NONE) {
-	 CMD |= XY_SRC_TILED;
-	 src_pitch /= 4;
-      }
-      if (dst->tiling != I915_TILING_NONE) {
-	 CMD |= XY_DST_TILED;
-	 dst_pitch /= 4;
-      }
-#endif
-      /* do space/cliprects check before going any further */
-      intel_batchbuffer_require_space(intel->batch, 8 * 4,
-				      REFERENCES_CLIPRECTS);
-   again:
-      aper_array[0] = intel->batch->buf;
-      aper_array[1] = dst->buffer;
-      aper_array[2] = src->buffer;
-
-      if (dri_bufmgr_check_aperture_space(aper_array, 3) != 0) {
-	intel_batchbuffer_flush(intel->batch);
-	goto again;
-      }
-
-      for (i = 0; i < nbox; i++, pbox++) {
-	 drm_clip_rect_t box = *pbox;
-
-	 if (rect) {
-	    if (!intel_intersect_cliprects(&box, &box, rect))
-	       continue;
-	 }
-
-	 if (box.x1 >= box.x2 ||
-	     box.y1 >= box.y2)
-	    continue;
-
-	 assert(box.x1 < box.x2);
-	 assert(box.y1 < box.y2);
-	 src_x = box.x1 - dPriv->x + dPriv->backX;
-	 src_y = box.y1 - dPriv->y + dPriv->backY;
-
-	 BEGIN_BATCH(8, REFERENCES_CLIPRECTS);
-	 OUT_BATCH(CMD);
-	 OUT_BATCH(BR13 | dst_pitch);
-	 OUT_BATCH((box.y1 << 16) | box.x1);
-	 OUT_BATCH((box.y2 << 16) | box.x2);
-
-	 OUT_RELOC(dst->buffer,
-		   I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-		   0);
-	 OUT_BATCH((src_y << 16) | src_x);
-	 OUT_BATCH(src_pitch);
-	 OUT_RELOC(src->buffer,
-		   I915_GEM_DOMAIN_RENDER, 0,
-		   0);
-	 ADVANCE_BATCH();
-      }
-
-      /* Flush the rendering and the batch so that the results all land on the
-       * screen in a timely fashion.
-       */
-      intel_batchbuffer_emit_mi_flush(intel->batch);
-      intel_batchbuffer_flush(intel->batch);
-   }
-
-   UNLOCK_HARDWARE(intel);
-}
-
 static GLuint translate_raster_op(GLenum logicop)
 {
    switch(logicop) {
@@ -248,7 +117,6 @@ intelEmitCopyBlit(struct intel_context *intel,
    } while (pass < 2);
 
    if (pass >= 2) {
-       LOCK_HARDWARE(intel);
        dri_bo_map(dst_buffer, GL_TRUE);
        dri_bo_map(src_buffer, GL_FALSE);
        _mesa_copy_rect((GLubyte *)dst_buffer->virtual + dst_offset,
@@ -262,12 +130,11 @@ intelEmitCopyBlit(struct intel_context *intel,
        
        dri_bo_unmap(src_buffer);
        dri_bo_unmap(dst_buffer);
-       UNLOCK_HARDWARE(intel);
 
        return GL_TRUE;
    }
 
-   intel_batchbuffer_require_space(intel->batch, 8 * 4, NO_LOOP_CLIPRECTS);
+   intel_batchbuffer_require_space(intel->batch, 8 * 4);
    DBG("%s src:buf(%p)/%d+%d %d,%d dst:buf(%p)/%d+%d %d,%d sz:%dx%d\n",
        __FUNCTION__,
        src_buffer, src_pitch, src_offset, src_x, src_y,
@@ -312,7 +179,7 @@ intelEmitCopyBlit(struct intel_context *intel,
    assert(dst_x < dst_x2);
    assert(dst_y < dst_y2);
 
-   BEGIN_BATCH(8, NO_LOOP_CLIPRECTS);
+   BEGIN_BATCH(8);
    OUT_BATCH(CMD);
    OUT_BATCH(BR13 | (uint16_t)dst_pitch);
    OUT_BATCH((dst_y << 16) | dst_x);
@@ -370,8 +237,6 @@ intelClearWithBlit(GLcontext *ctx, GLbitfield mask)
       skipBuffers = BUFFER_BIT_STENCIL;
    }
 
-   LOCK_HARDWARE(intel);
-
    intel_get_cliprects(intel, &cliprects, &num_cliprects, &x_off, &y_off);
    if (num_cliprects) {
       GLint cx, cy, cw, ch;
@@ -496,13 +361,14 @@ intelClearWithBlit(GLcontext *ctx, GLbitfield mask)
 		  CLAMPED_FLOAT_TO_UBYTE(clear[2], color[2]);
 		  CLAMPED_FLOAT_TO_UBYTE(clear[3], color[3]);
 
-		  switch (irb->texformat) {
+		  switch (irb->Base.Format) {
 		  case MESA_FORMAT_ARGB8888:
 		  case MESA_FORMAT_XRGB8888:
-		     clearVal = intel->ClearColor8888;
+		     clearVal = PACK_COLOR_8888(clear[3], clear[0],
+						clear[1], clear[2]);
 		     break;
 		  case MESA_FORMAT_RGB565:
-		     clearVal = intel->ClearColor565;
+		     clearVal = PACK_COLOR_565(clear[0], clear[1], clear[2]);
 		     break;
 		  case MESA_FORMAT_ARGB4444:
 		     clearVal = PACK_COLOR_4444(clear[3], clear[0],
@@ -514,7 +380,7 @@ intelClearWithBlit(GLcontext *ctx, GLbitfield mask)
 		     break;
 		  default:
 		     _mesa_problem(ctx, "Unexpected renderbuffer format: %d\n",
-				   irb->texformat);
+				   irb->Base.Format);
 		     clearVal = 0;
 		  }
 	       }
@@ -527,7 +393,7 @@ intelClearWithBlit(GLcontext *ctx, GLbitfield mask)
                assert(x1 < x2);
                assert(y1 < y2);
 
-               BEGIN_BATCH(6, REFERENCES_CLIPRECTS);
+               BEGIN_BATCH(6);
                OUT_BATCH(CMD);
                OUT_BATCH(BR13);
                OUT_BATCH((y1 << 16) | x1);
@@ -542,8 +408,6 @@ intelClearWithBlit(GLcontext *ctx, GLbitfield mask)
          }
       }
    }
-
-   UNLOCK_HARDWARE(intel);
 }
 
 GLboolean
@@ -585,8 +449,7 @@ intelEmitImmediateColorExpandBlit(struct intel_context *intel,
    intel_batchbuffer_require_space( intel->batch,
 				    (8 * 4) +
 				    (3 * 4) +
-				    dwords * 4,
-				    REFERENCES_CLIPRECTS );
+				    dwords * 4 );
 
    opcode = XY_SETUP_BLT_CMD;
    if (cpp == 4)
@@ -608,7 +471,7 @@ intelEmitImmediateColorExpandBlit(struct intel_context *intel,
    if (dst_tiling != I915_TILING_NONE)
       blit_cmd |= XY_DST_TILED;
 
-   BEGIN_BATCH(8 + 3, REFERENCES_CLIPRECTS);
+   BEGIN_BATCH(8 + 3);
    OUT_BATCH(opcode);
    OUT_BATCH(br13);
    OUT_BATCH((0 << 16) | 0); /* clip x1, y1 */
@@ -627,8 +490,7 @@ intelEmitImmediateColorExpandBlit(struct intel_context *intel,
 
    intel_batchbuffer_data( intel->batch,
 			   src_bits,
-			   dwords * 4,
-			   REFERENCES_CLIPRECTS );
+			   dwords * 4 );
 
    intel_batchbuffer_emit_mi_flush(intel->batch);
 
diff --git a/src/mesa/drivers/dri/intel/intel_blit.h b/src/mesa/drivers/dri/intel/intel_blit.h
index 240cb7cd1b..eb66fe0481 100644
--- a/src/mesa/drivers/dri/intel/intel_blit.h
+++ b/src/mesa/drivers/dri/intel/intel_blit.h
@@ -30,7 +30,7 @@
 
 #include "intel_context.h"
 
-extern void intelCopyBuffer(const __DRIdrawablePrivate * dpriv,
+extern void intelCopyBuffer(const __DRIdrawable * dpriv,
                             const drm_clip_rect_t * rect);
 
 extern void intelClearWithBlit(GLcontext * ctx, GLbitfield mask);
diff --git a/src/mesa/drivers/dri/intel/intel_buffer_objects.c b/src/mesa/drivers/dri/intel/intel_buffer_objects.c
index ea9d5a6276..3b7015b5ad 100644
--- a/src/mesa/drivers/dri/intel/intel_buffer_objects.c
+++ b/src/mesa/drivers/dri/intel/intel_buffer_objects.c
@@ -209,10 +209,23 @@ intel_bufferobj_subdata(GLcontext * ctx,
       memcpy((char *)intel_obj->sys_buffer + offset, data, size);
    else {
       /* Flush any existing batchbuffer that might reference this data. */
-      if (drm_intel_bo_references(intel->batch->buf, intel_obj->buffer))
-	 intelFlush(ctx);
+      if (drm_intel_bo_busy(intel_obj->buffer) ||
+	  drm_intel_bo_references(intel->batch->buf, intel_obj->buffer)) {
+	 drm_intel_bo *temp_bo;
 
-      dri_bo_subdata(intel_obj->buffer, offset, size, data);
+	 temp_bo = drm_intel_bo_alloc(intel->bufmgr, "subdata temp", size, 64);
+
+	 drm_intel_bo_subdata(temp_bo, 0, size, data);
+
+	 intel_emit_linear_blit(intel,
+				intel_obj->buffer, offset,
+				temp_bo, 0,
+				size);
+
+	 drm_intel_bo_unreference(temp_bo);
+      } else {
+	 dri_bo_subdata(intel_obj->buffer, offset, size, data);
+      }
    }
 }
 
@@ -255,6 +268,8 @@ intel_bufferobj_map(GLcontext * ctx,
 
    if (intel_obj->sys_buffer) {
       obj->Pointer = intel_obj->sys_buffer;
+      obj->Length = obj->Size;
+      obj->Offset = 0;
       return obj->Pointer;
    }
 
diff --git a/src/mesa/drivers/dri/intel/intel_buffers.c b/src/mesa/drivers/dri/intel/intel_buffers.c
index 6b12d484d8..7c4b79f743 100644
--- a/src/mesa/drivers/dri/intel/intel_buffers.c
+++ b/src/mesa/drivers/dri/intel/intel_buffers.c
@@ -102,33 +102,15 @@ intel_get_cliprects(struct intel_context *intel,
 		    unsigned int *num_cliprects,
 		    int *x_off, int *y_off)
 {
-   __DRIdrawablePrivate *dPriv = intel->driDrawable;
-
-   if (intel->constant_cliprect) {
-      /* FBO or DRI2 rendering, which can just use the fb's size. */
-      intel->fboRect.x1 = 0;
-      intel->fboRect.y1 = 0;
-      intel->fboRect.x2 = intel->ctx.DrawBuffer->Width;
-      intel->fboRect.y2 = intel->ctx.DrawBuffer->Height;
-
-      *cliprects = &intel->fboRect;
-      *num_cliprects = 1;
-      *x_off = 0;
-      *y_off = 0;
-   } else if (intel->front_cliprects || dPriv->numBackClipRects == 0) {
-      /* use the front clip rects */
-      *cliprects = dPriv->pClipRects;
-      *num_cliprects = dPriv->numClipRects;
-      *x_off = dPriv->x;
-      *y_off = dPriv->y;
-   }
-   else {
-      /* use the back clip rects */
-      *num_cliprects = dPriv->numBackClipRects;
-      *cliprects = dPriv->pBackClipRects;
-      *x_off = dPriv->backX;
-      *y_off = dPriv->backY;
-   }
+   intel->fboRect.x1 = 0;
+   intel->fboRect.y1 = 0;
+   intel->fboRect.x2 = intel->ctx.DrawBuffer->Width;
+   intel->fboRect.y2 = intel->ctx.DrawBuffer->Height;
+
+   *cliprects = &intel->fboRect;
+   *num_cliprects = 1;
+   *x_off = 0;
+   *y_off = 0;
 }
 
 
@@ -191,13 +173,17 @@ intel_draw_buffer(GLcontext * ctx, struct gl_framebuffer *fb)
       return;
    }
 
-   /*
-    * How many color buffers are we drawing into?
+   /* How many color buffers are we drawing into?
+    *
+    * If there are zero buffers or the buffer is too big, don't configure any
+    * regions for hardware drawing.  We'll fall back to software below.  Not
+    * having regions set makes some of the software fallback paths faster.
     */
-   if (fb->_NumColorDrawBuffers == 0) {
+   if ((fb->Width > ctx->Const.MaxRenderbufferSize)
+       || (fb->Height > ctx->Const.MaxRenderbufferSize)
+       || (fb->_NumColorDrawBuffers == 0)) {
       /* writing to 0  */
       colorRegions[0] = NULL;
-      intel->constant_cliprect = GL_TRUE;
    }
    else if (fb->_NumColorDrawBuffers > 1) {
        int i;
@@ -207,34 +193,23 @@ intel_draw_buffer(GLcontext * ctx, struct gl_framebuffer *fb)
            irb = intel_renderbuffer(fb->_ColorDrawBuffers[i]);
            colorRegions[i] = irb ? irb->region : NULL;
        }
-       intel->constant_cliprect = GL_TRUE;
    }
    else {
       /* Get the intel_renderbuffer for the single colorbuffer we're drawing
-       * into, and set up cliprects if it's a DRI1 window front buffer.
+       * into.
        */
       if (fb->Name == 0) {
-	 intel->constant_cliprect = intel->driScreen->dri2.enabled;
 	 /* drawing to window system buffer */
-	 if (fb->_ColorDrawBufferIndexes[0] == BUFFER_FRONT_LEFT) {
-	    if (!intel->constant_cliprect && !intel->front_cliprects)
-	       intel_batchbuffer_flush(intel->batch);
-	    intel->front_cliprects = GL_TRUE;
+	 if (fb->_ColorDrawBufferIndexes[0] == BUFFER_FRONT_LEFT)
 	    colorRegions[0] = intel_get_rb_region(fb, BUFFER_FRONT_LEFT);
-	 }
-	 else {
-	    if (!intel->constant_cliprect && intel->front_cliprects)
-	       intel_batchbuffer_flush(intel->batch);
-	    intel->front_cliprects = GL_FALSE;
+	 else
 	    colorRegions[0] = intel_get_rb_region(fb, BUFFER_BACK_LEFT);
-	 }
       }
       else {
 	 /* drawing to user-created FBO */
 	 struct intel_renderbuffer *irb;
 	 irb = intel_renderbuffer(fb->_ColorDrawBuffers[0]);
 	 colorRegions[0] = (irb && irb->region) ? irb->region : NULL;
-	 intel->constant_cliprect = GL_TRUE;
       }
    }
 
@@ -286,6 +261,12 @@ intel_draw_buffer(GLcontext * ctx, struct gl_framebuffer *fb)
       FALLBACK(intel, INTEL_FALLBACK_STENCIL_BUFFER, GL_FALSE);
    }
 
+   /* If we have a (packed) stencil buffer attached but no depth buffer,
+    * we still need to set up the shared depth/stencil state so we can use it.
+    */
+   if (depthRegion == NULL && irbStencil && irbStencil->region)
+      depthRegion = irbStencil->region;
+
    /*
     * Update depth and stencil test state
     */
diff --git a/src/mesa/drivers/dri/intel/intel_clear.c b/src/mesa/drivers/dri/intel/intel_clear.c
index fb62f0f430..956f2339ff 100644
--- a/src/mesa/drivers/dri/intel/intel_clear.c
+++ b/src/mesa/drivers/dri/intel/intel_clear.c
@@ -68,13 +68,17 @@ static void
 intelClear(GLcontext *ctx, GLbitfield mask)
 {
    struct intel_context *intel = intel_context(ctx);
-   const GLuint colorMask = *((GLuint *) & ctx->Color.ColorMask);
+   const GLuint colorMask = *((GLuint *) & ctx->Color.ColorMask[0]);
    GLbitfield tri_mask = 0;
    GLbitfield blit_mask = 0;
    GLbitfield swrast_mask = 0;
    struct gl_framebuffer *fb = ctx->DrawBuffer;
    GLuint i;
 
+   if (mask & (BUFFER_BIT_FRONT_LEFT | BUFFER_BIT_FRONT_RIGHT)) {
+      intel->front_buffer_dirty = GL_TRUE;
+   }
+
    if (0)
       fprintf(stderr, "%s\n", __FUNCTION__);
 
diff --git a/src/mesa/drivers/dri/intel/intel_context.c b/src/mesa/drivers/dri/intel/intel_context.c
index a7d94ced9a..3f6634c65a 100644
--- a/src/mesa/drivers/dri/intel/intel_context.c
+++ b/src/mesa/drivers/dri/intel/intel_context.c
@@ -55,10 +55,8 @@
 #include "intel_decode.h"
 #include "intel_bufmgr.h"
 #include "intel_screen.h"
-#include "intel_swapbuffers.h"
 
 #include "drirenderbuffer.h"
-#include "vblank.h"
 #include "utils.h"
 #include "xmlpool.h"            /* for symbolic values of enum-type options */
 
@@ -68,12 +66,10 @@ int INTEL_DEBUG = (0);
 #endif
 
 
-#define DRIVER_DATE                     "20090712 2009Q2 RC3"
+#define DRIVER_DATE                     "20091221 DEVELOPMENT"
 #define DRIVER_DATE_GEM                 "GEM " DRIVER_DATE
 
 
-static void intel_flush(GLcontext *ctx, GLboolean needs_mi_flush);
-
 static const GLubyte *
 intelGetString(GLcontext * ctx, GLenum name)
 {
@@ -176,9 +172,7 @@ intelGetString(GLcontext * ctx, GLenum name)
          break;
       }
 
-      (void) driGetRendererString(buffer, chipset, 
-				  (intel->ttm) ? DRIVER_DATE_GEM : DRIVER_DATE,
-				  0);
+      (void) driGetRendererString(buffer, chipset, DRIVER_DATE_GEM, 0);
       return (GLubyte *) buffer;
 
    default:
@@ -195,10 +189,11 @@ intel_bits_per_pixel(const struct intel_renderbuffer *rb)
 void
 intel_update_renderbuffers(__DRIcontext *context, __DRIdrawable *drawable)
 {
-   struct intel_framebuffer *intel_fb = drawable->driverPrivate;
+   struct gl_framebuffer *fb = drawable->driverPrivate;
    struct intel_renderbuffer *rb;
    struct intel_region *region, *depth_region;
    struct intel_context *intel = context->driverPrivate;
+   struct intel_renderbuffer *front_rb, *back_rb, *depth_rb, *stencil_rb;
    __DRIbuffer *buffers = NULL;
    __DRIscreen *screen;
    int i, count;
@@ -214,26 +209,25 @@ intel_update_renderbuffers(__DRIcontext *context, __DRIdrawable *drawable)
    if (screen->dri2.loader
        && (screen->dri2.loader->base.version > 2)
        && (screen->dri2.loader->getBuffersWithFormat != NULL)) {
-      struct intel_renderbuffer *depth_rb;
-      struct intel_renderbuffer *stencil_rb;
+
+      front_rb = intel_get_renderbuffer(fb, BUFFER_FRONT_LEFT);
+      back_rb = intel_get_renderbuffer(fb, BUFFER_BACK_LEFT);
+      depth_rb = intel_get_renderbuffer(fb, BUFFER_DEPTH);
+      stencil_rb = intel_get_renderbuffer(fb, BUFFER_STENCIL);
 
       i = 0;
       if ((intel->is_front_buffer_rendering ||
 	   intel->is_front_buffer_reading ||
-	   !intel_fb->color_rb[1])
-	   && intel_fb->color_rb[0]) {
+	   !back_rb) && front_rb) {
 	 attachments[i++] = __DRI_BUFFER_FRONT_LEFT;
-	 attachments[i++] = intel_bits_per_pixel(intel_fb->color_rb[0]);
+	 attachments[i++] = intel_bits_per_pixel(front_rb);
       }
 
-      if (intel_fb->color_rb[1]) {
+      if (back_rb) {
 	 attachments[i++] = __DRI_BUFFER_BACK_LEFT;
-	 attachments[i++] = intel_bits_per_pixel(intel_fb->color_rb[1]);
+	 attachments[i++] = intel_bits_per_pixel(back_rb);
       }
 
-      depth_rb = intel_get_renderbuffer(&intel_fb->Base, BUFFER_DEPTH);
-      stencil_rb = intel_get_renderbuffer(&intel_fb->Base, BUFFER_STENCIL);
-
       if ((depth_rb != NULL) && (stencil_rb != NULL)) {
 	 attachments[i++] = __DRI_BUFFER_DEPTH_STENCIL;
 	 attachments[i++] = intel_bits_per_pixel(depth_rb);
@@ -254,13 +248,13 @@ intel_update_renderbuffers(__DRIcontext *context, __DRIdrawable *drawable)
 						      drawable->loaderPrivate);
    } else if (screen->dri2.loader) {
       i = 0;
-      if (intel_fb->color_rb[0])
+      if (intel_get_renderbuffer(fb, BUFFER_FRONT_LEFT))
 	 attachments[i++] = __DRI_BUFFER_FRONT_LEFT;
-      if (intel_fb->color_rb[1])
+      if (intel_get_renderbuffer(fb, BUFFER_BACK_LEFT))
 	 attachments[i++] = __DRI_BUFFER_BACK_LEFT;
-      if (intel_get_renderbuffer(&intel_fb->Base, BUFFER_DEPTH))
+      if (intel_get_renderbuffer(fb, BUFFER_DEPTH))
 	 attachments[i++] = __DRI_BUFFER_DEPTH;
-      if (intel_get_renderbuffer(&intel_fb->Base, BUFFER_STENCIL))
+      if (intel_get_renderbuffer(fb, BUFFER_STENCIL))
 	 attachments[i++] = __DRI_BUFFER_STENCIL;
 
       buffers = (*screen->dri2.loader->getBuffers)(drawable,
@@ -293,32 +287,32 @@ intel_update_renderbuffers(__DRIcontext *context, __DRIdrawable *drawable)
    for (i = 0; i < count; i++) {
        switch (buffers[i].attachment) {
        case __DRI_BUFFER_FRONT_LEFT:
-	   rb = intel_fb->color_rb[0];
+	   rb = intel_get_renderbuffer(fb, BUFFER_FRONT_LEFT);
 	   region_name = "dri2 front buffer";
 	   break;
 
        case __DRI_BUFFER_FAKE_FRONT_LEFT:
-	   rb = intel_fb->color_rb[0];
+	   rb = intel_get_renderbuffer(fb, BUFFER_FRONT_LEFT);
 	   region_name = "dri2 fake front buffer";
 	   break;
 
        case __DRI_BUFFER_BACK_LEFT:
-	   rb = intel_fb->color_rb[1];
+	   rb = intel_get_renderbuffer(fb, BUFFER_BACK_LEFT);
 	   region_name = "dri2 back buffer";
 	   break;
 
        case __DRI_BUFFER_DEPTH:
-	   rb = intel_get_renderbuffer(&intel_fb->Base, BUFFER_DEPTH);
+	   rb = intel_get_renderbuffer(fb, BUFFER_DEPTH);
 	   region_name = "dri2 depth buffer";
 	   break;
 
        case __DRI_BUFFER_DEPTH_STENCIL:
-	   rb = intel_get_renderbuffer(&intel_fb->Base, BUFFER_DEPTH);
+	   rb = intel_get_renderbuffer(fb, BUFFER_DEPTH);
 	   region_name = "dri2 depth / stencil buffer";
 	   break;
 
        case __DRI_BUFFER_STENCIL:
-	   rb = intel_get_renderbuffer(&intel_fb->Base, BUFFER_STENCIL);
+	   rb = intel_get_renderbuffer(fb, BUFFER_STENCIL);
 	   region_name = "dri2 stencil buffer";
 	   break;
 
@@ -365,7 +359,7 @@ intel_update_renderbuffers(__DRIcontext *context, __DRIdrawable *drawable)
        intel_region_release(&region);
 
        if (buffers[i].attachment == __DRI_BUFFER_DEPTH_STENCIL) {
-	  rb = intel_get_renderbuffer(&intel_fb->Base, BUFFER_STENCIL);
+	  rb = intel_get_renderbuffer(fb, BUFFER_STENCIL);
 	  if (rb != NULL) {
 	     struct intel_region *stencil_region = NULL;
 
@@ -382,6 +376,7 @@ intel_update_renderbuffers(__DRIcontext *context, __DRIdrawable *drawable)
        }
    }
 
+   drawable->validBuffers = GL_TRUE;
    driUpdateFramebufferSize(&intel->ctx, drawable);
 }
 
@@ -393,9 +388,6 @@ intel_viewport(GLcontext *ctx, GLint x, GLint y, GLsizei w, GLsizei h)
     void (*old_viewport)(GLcontext *ctx, GLint x, GLint y,
 			 GLsizei w, GLsizei h);
 
-    if (!driContext->driScreenPriv->dri2.enabled)
-	return;
-
     if (!intel->meta.internal_viewport_call && ctx->DrawBuffer->Name == 0) {
        /* If we're rendering to the fake front buffer, make sure all the pending
 	* drawing has landed on the real front buffer.  Otherwise when we
@@ -414,7 +406,6 @@ intel_viewport(GLcontext *ctx, GLint x, GLint y, GLsizei w, GLsizei h)
     old_viewport = ctx->Driver.Viewport;
     ctx->Driver.Viewport = NULL;
     intel->driDrawable = driContext->driDrawablePriv;
-    intelWindowMoved(intel);
     intel_draw_buffer(ctx, intel->ctx.DrawBuffer);
     ctx->Driver.Viewport = old_viewport;
 }
@@ -469,7 +460,7 @@ intelInvalidateState(GLcontext * ctx, GLuint new_state)
       intel->vtbl.invalidate_state( intel, new_state );
 }
 
-static void
+void
 intel_flush(GLcontext *ctx, GLboolean needs_mi_flush)
 {
    struct intel_context *intel = intel_context(ctx);
@@ -477,16 +468,9 @@ intel_flush(GLcontext *ctx, GLboolean needs_mi_flush)
    if (intel->Fallback)
       _swrast_flush(ctx);
 
-   if (!IS_965(intel->intelScreen->deviceID))
+   if (intel->gen < 4)
       INTEL_FIREVERTICES(intel);
 
-   /* Emit a flush so that any frontbuffer rendering that might have occurred
-    * lands onscreen in a timely manner, even if the X Server doesn't trigger
-    * a flush for us.
-    */
-   if (!intel->driScreen->dri2.enabled && needs_mi_flush)
-      intel_batchbuffer_emit_mi_flush(intel->batch);
-
    if (intel->batch->map != intel->batch->ptr)
       intel_batchbuffer_flush(intel->batch);
 
@@ -592,15 +576,15 @@ intelInitDriverFunctions(struct dd_function_table *functions)
 GLboolean
 intelInitContext(struct intel_context *intel,
                  const __GLcontextModes * mesaVis,
-                 __DRIcontextPrivate * driContextPriv,
+                 __DRIcontext * driContextPriv,
                  void *sharedContextPrivate,
                  struct dd_function_table *functions)
 {
    GLcontext *ctx = &intel->ctx;
    GLcontext *shareCtx = (GLcontext *) sharedContextPrivate;
-   __DRIscreenPrivate *sPriv = driContextPriv->driScreenPriv;
+   __DRIscreen *sPriv = driContextPriv->driScreenPriv;
    intelScreenPrivate *intelScreen = (intelScreenPrivate *) sPriv->private;
-   int fthrottle_mode;
+   int bo_reuse_mode;
 
    if (!_mesa_initialize_context(&intel->ctx, mesaVis, shareCtx,
                                  functions, (void *) intel)) {
@@ -611,35 +595,46 @@ intelInitContext(struct intel_context *intel,
    driContextPriv->driverPrivate = intel;
    intel->intelScreen = intelScreen;
    intel->driScreen = sPriv;
-   intel->sarea = intelScreen->sarea;
    intel->driContext = driContextPriv;
-
-   /* Dri stuff */
-   intel->hHWContext = driContextPriv->hHWContext;
    intel->driFd = sPriv->fd;
-   intel->driHwLock = sPriv->lock;
+
+   if (IS_965(intel->intelScreen->deviceID)) {
+      intel->gen = 4;
+   } else if (IS_9XX(intel->intelScreen->deviceID)) {
+      intel->gen = 3;
+      if (IS_945(intel->intelScreen->deviceID)) {
+	 intel->is_945 = GL_TRUE;
+      }
+   } else {
+      intel->gen = 2;
+   }
+
+   if (IS_IGDNG(intel->intelScreen->deviceID)) {
+      intel->is_ironlake = GL_TRUE;
+      intel->needs_ff_sync = GL_TRUE;
+      intel->has_luminance_srgb = GL_TRUE;
+   } else if (IS_G4X(intel->intelScreen->deviceID)) {
+      intel->has_luminance_srgb = GL_TRUE;
+      intel->is_g4x = GL_TRUE;
+   }
 
    driParseConfigFiles(&intel->optionCache, &intelScreen->optionCache,
                        intel->driScreen->myNum,
-		       IS_965(intelScreen->deviceID) ? "i965" : "i915");
+		       (intel->gen >= 4) ? "i965" : "i915");
    if (intelScreen->deviceID == PCI_CHIP_I865_G)
       intel->maxBatchSize = 4096;
    else
       intel->maxBatchSize = BATCH_SZ;
 
    intel->bufmgr = intelScreen->bufmgr;
-   intel->ttm = intelScreen->ttm;
-   if (intel->ttm) {
-      int bo_reuse_mode;
 
-      bo_reuse_mode = driQueryOptioni(&intel->optionCache, "bo_reuse");
-      switch (bo_reuse_mode) {
-      case DRI_CONF_BO_REUSE_DISABLED:
-	 break;
-      case DRI_CONF_BO_REUSE_ALL:
-	 intel_bufmgr_gem_enable_reuse(intel->bufmgr);
-	 break;
-      }
+   bo_reuse_mode = driQueryOptioni(&intel->optionCache, "bo_reuse");
+   switch (bo_reuse_mode) {
+   case DRI_CONF_BO_REUSE_DISABLED:
+      break;
+   case DRI_CONF_BO_REUSE_ALL:
+      intel_bufmgr_gem_enable_reuse(intel->bufmgr);
+      break;
    }
 
    /* This doesn't yet catch all non-conformant rendering, but it's a
@@ -683,7 +678,7 @@ intelInitContext(struct intel_context *intel,
 
    meta_init_metaops(ctx, &intel->meta);
    ctx->Const.MaxColorAttachments = 4;  /* XXX FBO: review this */
-   if (IS_965(intelScreen->deviceID)) {
+   if (intel->gen >= 4) {
       if (MAX_WIDTH > 8192)
 	 ctx->Const.MaxRenderbufferSize = 8192;
    } else {
@@ -720,33 +715,22 @@ intelInitContext(struct intel_context *intel,
       break;
    }
 
-   if (IS_965(intelScreen->deviceID))
+   if (intel->gen >= 4)
       intel->polygon_offset_scale /= 0xffff;
 
    intel->RenderIndex = ~0;
 
-   fthrottle_mode = driQueryOptioni(&intel->optionCache, "fthrottle_mode");
-   intel->irqsEmitted = 0;
-
-   intel->do_irqs = (intel->intelScreen->irq_active &&
-                     fthrottle_mode == DRI_CONF_FTHROTTLE_IRQS);
-
-   intel->do_usleeps = (fthrottle_mode == DRI_CONF_FTHROTTLE_USLEEPS);
-
-   if (IS_965(intelScreen->deviceID) && !intel->intelScreen->irq_active) {
+   if (intel->gen >= 4 && !intel->intelScreen->irq_active) {
       _mesa_printf("IRQs not active.  Exiting\n");
       exit(1);
    }
 
-   intelInitExtensions(ctx, GL_FALSE);
+   intelInitExtensions(ctx);
 
    INTEL_DEBUG = driParseDebugString(getenv("INTEL_DEBUG"), debug_control);
    if (INTEL_DEBUG & DEBUG_BUFMGR)
       dri_bufmgr_set_debug(intel->bufmgr, GL_TRUE);
 
-   if (!sPriv->dri2.enabled)
-      intel_recreate_static_regions(intel);
-
    intel->batch = intel_batchbuffer_alloc(intel);
 
    intel_fbo_init(intel);
@@ -795,7 +779,7 @@ intelInitContext(struct intel_context *intel,
 }
 
 void
-intelDestroyContext(__DRIcontextPrivate * driContextPriv)
+intelDestroyContext(__DRIcontext * driContextPriv)
 {
    struct intel_context *intel =
       (struct intel_context *) driContextPriv->driverPrivate;
@@ -842,57 +826,6 @@ intelDestroyContext(__DRIcontextPrivate * driContextPriv)
           */
       }
 
-      /* XXX In intelMakeCurrent() below, the context's static regions are 
-       * referenced inside the frame buffer; it's listed as a hack,
-       * with a comment of "XXX FBO temporary fix-ups!", but
-       * as long as it's there, we should release the regions here.
-       * The do/while loop around the block is used to allow the
-       * "continue" statements inside the block to exit the block,
-       * to avoid many layers of "if" constructs.
-       */
-      do {
-         __DRIdrawablePrivate * driDrawPriv = intel->driDrawable;
-         struct intel_framebuffer *intel_fb;
-         struct intel_renderbuffer *irbDepth, *irbStencil;
-         if (!driDrawPriv) {
-            /* We're already detached from the drawable; exit this block. */
-            continue;
-         }
-         intel_fb = (struct intel_framebuffer *) driDrawPriv->driverPrivate;
-         if (!intel_fb) {
-            /* The frame buffer is already gone; exit this block. */
-            continue;
-         }
-         irbDepth = intel_get_renderbuffer(&intel_fb->Base, BUFFER_DEPTH);
-         irbStencil = intel_get_renderbuffer(&intel_fb->Base, BUFFER_STENCIL);
-
-         /* If the regions of the frame buffer still match the regions
-          * of the context, release them.  If they've changed somehow,
-          * leave them alone.
-          */
-         if (intel_fb->color_rb[0] && intel_fb->color_rb[0]->region == intel->front_region) {
-	    intel_renderbuffer_set_region(intel_fb->color_rb[0], NULL);
-         }
-         if (intel_fb->color_rb[1] && intel_fb->color_rb[1]->region == intel->back_region) {
-	    intel_renderbuffer_set_region(intel_fb->color_rb[1], NULL);
-         }
-
-         if (irbDepth && irbDepth->region == intel->depth_region) {
-	    intel_renderbuffer_set_region(irbDepth, NULL);
-         }
-         /* Usually, the stencil buffer is the same as the depth buffer;
-          * but they're handled separately in MakeCurrent, so we'll
-          * handle them separately here.
-          */
-         if (irbStencil && irbStencil->region == intel->depth_region) {
-	    intel_renderbuffer_set_region(irbStencil, NULL);
-         }
-      } while (0);
-
-      intel_region_release(&intel->front_region);
-      intel_region_release(&intel->back_region);
-      intel_region_release(&intel->depth_region);
-
       driDestroyOptionCache(&intel->optionCache);
 
       /* free the Mesa context */
@@ -904,7 +837,7 @@ intelDestroyContext(__DRIcontextPrivate * driContextPriv)
 }
 
 GLboolean
-intelUnbindContext(__DRIcontextPrivate * driContextPriv)
+intelUnbindContext(__DRIcontext * driContextPriv)
 {
    struct intel_context *intel =
       (struct intel_context *) driContextPriv->driverPrivate;
@@ -918,11 +851,10 @@ intelUnbindContext(__DRIcontextPrivate * driContextPriv)
 }
 
 GLboolean
-intelMakeCurrent(__DRIcontextPrivate * driContextPriv,
-                 __DRIdrawablePrivate * driDrawPriv,
-                 __DRIdrawablePrivate * driReadPriv)
+intelMakeCurrent(__DRIcontext * driContextPriv,
+                 __DRIdrawable * driDrawPriv,
+                 __DRIdrawable * driReadPriv)
 {
-   __DRIscreenPrivate *psp = driDrawPriv->driScreenPriv;
    struct intel_context *intel;
    GET_CURRENT_CONTEXT(curCtx);
 
@@ -940,41 +872,12 @@ intelMakeCurrent(__DRIcontextPrivate * driContextPriv,
    }
 
    if (driContextPriv) {
-      struct intel_framebuffer *intel_fb =
-	 (struct intel_framebuffer *) driDrawPriv->driverPrivate;
-      GLframebuffer *readFb = (GLframebuffer *) driReadPriv->driverPrivate;
+      struct gl_framebuffer *fb = driDrawPriv->driverPrivate;
+      struct gl_framebuffer *readFb = driReadPriv->driverPrivate;
  
-      if (driContextPriv->driScreenPriv->dri2.enabled) {     
-          intel_update_renderbuffers(driContextPriv, driDrawPriv);
-          if (driDrawPriv != driReadPriv)
-              intel_update_renderbuffers(driContextPriv, driReadPriv);
-      } else {
-          /* XXX FBO temporary fix-ups!  These are released in 
-           * intelDextroyContext(), above.  Changes here should be
-           * reflected there.
-           */
-          /* if the renderbuffers don't have regions, init them from the context */
-         struct intel_renderbuffer *irbDepth
-            = intel_get_renderbuffer(&intel_fb->Base, BUFFER_DEPTH);
-         struct intel_renderbuffer *irbStencil
-            = intel_get_renderbuffer(&intel_fb->Base, BUFFER_STENCIL);
-
-         if (intel_fb->color_rb[0]) {
-	    intel_renderbuffer_set_region(intel_fb->color_rb[0],
-					  intel->front_region);
-         }
-         if (intel_fb->color_rb[1]) {
-	    intel_renderbuffer_set_region(intel_fb->color_rb[1],
-					  intel->back_region);
-         }
-
-         if (irbDepth) {
-	    intel_renderbuffer_set_region(irbDepth, intel->depth_region);
-         }
-         if (irbStencil) {
-	    intel_renderbuffer_set_region(irbStencil, intel->depth_region);
-         }
-      }
+      intel_update_renderbuffers(driContextPriv, driDrawPriv);
+      if (driDrawPriv != driReadPriv)
+	 intel_update_renderbuffers(driContextPriv, driReadPriv);
 
       /* set GLframebuffer size to match window, if needed */
       driUpdateFramebufferSize(&intel->ctx, driDrawPriv);
@@ -983,37 +886,10 @@ intelMakeCurrent(__DRIcontextPrivate * driContextPriv,
 	 driUpdateFramebufferSize(&intel->ctx, driReadPriv);
       }
 
-      _mesa_make_current(&intel->ctx, &intel_fb->Base, readFb);
-
+      _mesa_make_current(&intel->ctx, fb, readFb);
       intel->driReadDrawable = driReadPriv;
-
-      if (intel->driDrawable != driDrawPriv) {
-         if (driDrawPriv->swap_interval == (unsigned)-1) {
-            int i;
-
-            driDrawPriv->vblFlags = (intel->intelScreen->irq_active != 0)
-               ? driGetDefaultVBlankFlags(&intel->optionCache)
-               : VBLANK_FLAG_NO_IRQ;
-
-            /* Prevent error printf if one crtc is disabled, this will
-             * be properly calculated in intelWindowMoved() next.
-             */
-            driDrawPriv->vblFlags = intelFixupVblank(intel, driDrawPriv);
-
-            (*psp->systemTime->getUST) (&intel_fb->swap_ust);
-            driDrawableInitVBlank(driDrawPriv);
-            intel_fb->vbl_waited = driDrawPriv->vblSeq;
-
-            for (i = 0; i < 2; i++) {
-               if (intel_fb->color_rb[i])
-                  intel_fb->color_rb[i]->vbl_pending = driDrawPriv->vblSeq;
-            }
-         }
-         intel->driDrawable = driDrawPriv;
-         intelWindowMoved(intel);
-      }
-
-      intel_draw_buffer(&intel->ctx, &intel_fb->Base);
+      intel->driDrawable = driDrawPriv;
+      intel_draw_buffer(&intel->ctx, fb);
    }
    else {
       _mesa_make_current(NULL, NULL, NULL);
@@ -1021,143 +897,3 @@ intelMakeCurrent(__DRIcontextPrivate * driContextPriv,
 
    return GL_TRUE;
 }
-
-static void
-intelContendedLock(struct intel_context *intel, GLuint flags)
-{
-   __DRIdrawablePrivate *dPriv = intel->driDrawable;
-   __DRIscreenPrivate *sPriv = intel->driScreen;
-   volatile drm_i915_sarea_t *sarea = intel->sarea;
-   int me = intel->hHWContext;
-
-   drmGetLock(intel->driFd, intel->hHWContext, flags);
-
-   if (INTEL_DEBUG & DEBUG_LOCK)
-      _mesa_printf("%s - got contended lock\n", __progname);
-
-   /* If the window moved, may need to set a new cliprect now.
-    *
-    * NOTE: This releases and regains the hw lock, so all state
-    * checking must be done *after* this call:
-    */
-   if (dPriv)
-       DRI_VALIDATE_DRAWABLE_INFO(sPriv, dPriv);
-
-   if (sarea && sarea->ctxOwner != me) {
-      if (INTEL_DEBUG & DEBUG_BUFMGR) {
-	 fprintf(stderr, "Lost Context: sarea->ctxOwner %x me %x\n",
-		 sarea->ctxOwner, me);
-      }
-      sarea->ctxOwner = me;
-   }
-
-   /* If the last consumer of the texture memory wasn't us, notify the fake
-    * bufmgr and record the new owner.  We should have the memory shared
-    * between contexts of a single fake bufmgr, but this will at least make
-    * things correct for now.
-    */
-   if (!intel->ttm && sarea->texAge != intel->hHWContext) {
-      sarea->texAge = intel->hHWContext;
-      intel_bufmgr_fake_contended_lock_take(intel->bufmgr);
-      if (INTEL_DEBUG & DEBUG_BATCH)
-	 intel_decode_context_reset();
-      if (INTEL_DEBUG & DEBUG_BUFMGR)
-	 fprintf(stderr, "Lost Textures: sarea->texAge %x hw context %x\n",
-		 sarea->ctxOwner, intel->hHWContext);
-   }
-
-   /* Drawable changed?
-    */
-   if (dPriv && intel->lastStamp != dPriv->lastStamp) {
-       intelWindowMoved(intel);
-       intel->lastStamp = dPriv->lastStamp;
-   }
-}
-
-
-_glthread_DECLARE_STATIC_MUTEX(lockMutex);
-
-/* Lock the hardware and validate our state.  
- */
-void LOCK_HARDWARE( struct intel_context *intel )
-{
-    __DRIdrawable *dPriv = intel->driDrawable;
-    __DRIscreen *sPriv = intel->driScreen;
-    char __ret = 0;
-    struct intel_framebuffer *intel_fb = NULL;
-    struct intel_renderbuffer *intel_rb = NULL;
-
-    intel->locked++;
-    if (intel->locked >= 2)
-       return;
-
-    if (!sPriv->dri2.enabled)
-       _glthread_LOCK_MUTEX(lockMutex);
-
-    if (intel->driDrawable) {
-       intel_fb = intel->driDrawable->driverPrivate;
-
-       if (intel_fb)
-	  intel_rb =
-	     intel_get_renderbuffer(&intel_fb->Base,
-				    intel_fb->Base._ColorDrawBufferIndexes[0]);
-    }
-
-    if (intel_rb && dPriv->vblFlags &&
-	!(dPriv->vblFlags & VBLANK_FLAG_NO_IRQ) &&
-	(intel_fb->vbl_waited - intel_rb->vbl_pending) > (1<<23)) {
-	drmVBlank vbl;
-
-	vbl.request.type = DRM_VBLANK_ABSOLUTE;
-
-	if ( dPriv->vblFlags & VBLANK_FLAG_SECONDARY ) {
-	    vbl.request.type |= DRM_VBLANK_SECONDARY;
-	}
-
-	vbl.request.sequence = intel_rb->vbl_pending;
-	drmWaitVBlank(intel->driFd, &vbl);
-	intel_fb->vbl_waited = vbl.reply.sequence;
-    }
-
-    if (!sPriv->dri2.enabled) {
-	DRM_CAS(intel->driHwLock, intel->hHWContext,
-		(DRM_LOCK_HELD|intel->hHWContext), __ret);
-
-	if (__ret)
-	    intelContendedLock( intel, 0 );
-    }
-
-
-    if (INTEL_DEBUG & DEBUG_LOCK)
-      _mesa_printf("%s - locked\n", __progname);
-}
-
-
-/* Unlock the hardware using the global current context 
- */
-void UNLOCK_HARDWARE( struct intel_context *intel )
-{
-    __DRIscreen *sPriv = intel->driScreen;
-
-   intel->locked--;
-   if (intel->locked > 0)
-      return;
-
-   assert(intel->locked == 0);
-
-   if (!sPriv->dri2.enabled) {
-      DRM_UNLOCK(intel->driFd, intel->driHwLock, intel->hHWContext);
-      _glthread_UNLOCK_MUTEX(lockMutex);
-   }
-
-   if (INTEL_DEBUG & DEBUG_LOCK)
-      _mesa_printf("%s - unlocked\n", __progname);
-
-   /**
-    * Nothing should be left in batch outside of LOCK/UNLOCK which references
-    * cliprects.
-    */
-   if (intel->batch->cliprect_mode == REFERENCES_CLIPRECTS)
-      intel_batchbuffer_flush(intel->batch);
-}
-
diff --git a/src/mesa/drivers/dri/intel/intel_context.h b/src/mesa/drivers/dri/intel/intel_context.h
index 356fa4d1e5..07207bfbec 100644
--- a/src/mesa/drivers/dri/intel/intel_context.h
+++ b/src/mesa/drivers/dri/intel/intel_context.h
@@ -117,8 +117,6 @@ struct intel_context
                                struct intel_region * depth_region,
 			       GLuint num_regions);
 
-      GLuint (*flush_cmd) (void);
-
       void (*reduced_primitive_state) (struct intel_context * intel,
                                        GLenum rprim);
 
@@ -137,14 +135,6 @@ struct intel_context
                                 struct intel_region * draw_region,
                                 struct intel_region * depth_region);
 
-      void (*meta_draw_quad)(struct intel_context *intel,
-			     GLfloat x0, GLfloat x1,
-			     GLfloat y0, GLfloat y1,
-			     GLfloat z,
-			     GLuint color, /* ARGB32 */
-			     GLfloat s0, GLfloat s1,
-			     GLfloat t0, GLfloat t1);
-
       void (*meta_color_mask) (struct intel_context * intel, GLboolean);
 
       void (*meta_stencil_replace) (struct intel_context * intel,
@@ -176,27 +166,27 @@ struct intel_context
 
    struct dri_metaops meta;
 
-   GLint refcount;
    GLbitfield Fallback;  /**< mask of INTEL_FALLBACK_x bits */
    GLuint NewGLState;
 
    dri_bufmgr *bufmgr;
    unsigned int maxBatchSize;
 
-   struct intel_region *front_region;
-   struct intel_region *back_region;
-   struct intel_region *depth_region;
-
    /**
-    * This value indicates that the kernel memory manager is being used
-    * instead of the fake client-side memory manager.
+    * Generation number of the hardware: 2 is 8xx, 3 is 9xx pre-965, 4 is 965.
     */
-   GLboolean ttm;
+   int gen;
+   GLboolean needs_ff_sync;
+   GLboolean is_ironlake;
+   GLboolean is_g4x;
+   GLboolean is_945;
+   GLboolean has_luminance_srgb;
+
+   int urb_size;
 
    struct intel_batchbuffer *batch;
    drm_intel_bo *first_post_swapbuffers_batch;
    GLboolean no_batch_wrap;
-   unsigned batch_id;
 
    struct
    {
@@ -216,10 +206,6 @@ struct intel_context
    char *prevLockFile;
    int prevLockLine;
 
-   GLuint ClearColor565;
-   GLuint ClearColor8888;
-
-
    /* Offsets of fields within the current vertex:
     */
    GLuint coloroffset;
@@ -236,6 +222,7 @@ struct intel_context
    GLboolean hw_stipple;
    GLboolean depth_buffer_is_float;
    GLboolean no_rast;
+   GLboolean no_hw;
    GLboolean always_flush_batch;
    GLboolean always_flush_cache;
 
@@ -261,19 +248,6 @@ struct intel_context
    intel_tri_func draw_tri;
 
    /**
-    * Set to true if a single constant cliprect should be used in the
-    * batchbuffer.  Otherwise, cliprects must be calculated at batchbuffer
-    * flush time while the lock is held.
-    */
-   GLboolean constant_cliprect;
-
-   /**
-    * In !constant_cliprect mode, set to true if the front cliprects should be
-    * used instead of back.
-    */
-   GLboolean front_cliprects;
-
-   /**
     * Set if rendering has occured to the drawable's front buffer.
     *
     * This is used in the DRI2 case to detect that glFlush should also copy
@@ -301,48 +275,23 @@ struct intel_context
    GLboolean use_early_z;
    drm_clip_rect_t fboRect;     /**< cliprect for FBO rendering */
 
-   int perf_boxes;
-
-   GLuint do_usleeps;
-   int do_irqs;
-   GLuint irqsEmitted;
-
-   GLboolean scissor;
    drm_clip_rect_t draw_rect;
    drm_clip_rect_t scissor_rect;
 
-   drm_context_t hHWContext;
-   drmLock *driHwLock;
    int driFd;
 
-   __DRIcontextPrivate *driContext;
-   __DRIdrawablePrivate *driDrawable;
-   __DRIdrawablePrivate *driReadDrawable;
-   __DRIscreenPrivate *driScreen;
+   __DRIcontext *driContext;
+   __DRIdrawable *driDrawable;
+   __DRIdrawable *driReadDrawable;
+   __DRIscreen *driScreen;
    intelScreenPrivate *intelScreen;
-   volatile drm_i915_sarea_t *sarea;
-
-   GLuint lastStamp;
-
-   GLboolean no_hw;
 
    /**
     * Configuration cache
     */
    driOptionCache optionCache;
-
-   int64_t swap_ust;
-   int64_t swap_missed_ust;
-
-   GLuint swap_count;
-   GLuint swap_missed_count;
 };
 
-/* These are functions now:
- */
-void LOCK_HARDWARE( struct intel_context *intel );
-void UNLOCK_HARDWARE( struct intel_context *intel );
-
 extern char *__progname;
 
 
@@ -353,14 +302,14 @@ extern char *__progname;
 #define ALIGN(value, alignment)  ((value + alignment - 1) & ~(alignment - 1))
 #define IS_POWER_OF_TWO(val) (((val) & (val - 1)) == 0)
 
-static inline uint32_t
+static INLINE uint32_t
 U_FIXED(float value, uint32_t frac_bits)
 {
    value *= (1 << frac_bits);
    return value < 0 ? 0 : value;
 }
 
-static inline uint32_t
+static INLINE uint32_t
 S_FIXED(float value, uint32_t frac_bits)
 {
    return value * (1 << frac_bits);
@@ -373,29 +322,6 @@ do {						\
 } while (0)
 
 /* ================================================================
- * Color packing:
- */
-
-#define INTEL_PACKCOLOR4444(r,g,b,a) \
-  ((((a) & 0xf0) << 8) | (((r) & 0xf0) << 4) | ((g) & 0xf0) | ((b) >> 4))
-
-#define INTEL_PACKCOLOR1555(r,g,b,a) \
-  ((((r) & 0xf8) << 7) | (((g) & 0xf8) << 2) | (((b) & 0xf8) >> 3) | \
-    ((a) ? 0x8000 : 0))
-
-#define INTEL_PACKCOLOR565(r,g,b) \
-  ((((r) & 0xf8) << 8) | (((g) & 0xfc) << 3) | (((b) & 0xf8) >> 3))
-
-#define INTEL_PACKCOLOR8888(r,g,b,a) \
-  ((a<<24) | (r<<16) | (g<<8) | b)
-
-#define INTEL_PACKCOLOR(format, r,  g,  b, a)		\
-(format == DV_PF_555 ? INTEL_PACKCOLOR1555(r,g,b,a) :	\
- (format == DV_PF_565 ? INTEL_PACKCOLOR565(r,g,b) :	\
-  (format == DV_PF_8888 ? INTEL_PACKCOLOR8888(r,g,b,a) :	\
-   0)))
-
-/* ================================================================
  * From linux kernel i386 header files, copes with odd sizes better
  * than COPY_DWORDS would:
  * XXX Put this in src/mesa/main/imports.h ???
@@ -480,14 +406,13 @@ extern int INTEL_DEBUG;
 
 extern GLboolean intelInitContext(struct intel_context *intel,
                                   const __GLcontextModes * mesaVis,
-                                  __DRIcontextPrivate * driContextPriv,
+                                  __DRIcontext * driContextPriv,
                                   void *sharedContextPrivate,
                                   struct dd_function_table *functions);
 
-extern void intelGetLock(struct intel_context *intel, GLuint flags);
-
 extern void intelFinish(GLcontext * ctx);
 extern void intelFlush(GLcontext * ctx);
+extern void intel_flush(GLcontext * ctx, GLboolean needs_mi_flush);
 
 extern void intelInitDriverFunctions(struct dd_function_table *functions);
 
@@ -587,4 +512,25 @@ is_power_of_two(uint32_t value)
    return (value & (value - 1)) == 0;
 }
 
+static INLINE void
+intel_bo_map_gtt_preferred(struct intel_context *intel,
+			   drm_intel_bo *bo,
+			   GLboolean write)
+{
+   if (intel->intelScreen->kernel_exec_fencing)
+      drm_intel_gem_bo_map_gtt(bo);
+   else
+      drm_intel_bo_map(bo, write);
+}
+
+static INLINE void
+intel_bo_unmap_gtt_preferred(struct intel_context *intel,
+			     drm_intel_bo *bo)
+{
+   if (intel->intelScreen->kernel_exec_fencing)
+      drm_intel_gem_bo_unmap_gtt(bo);
+   else
+      drm_intel_bo_unmap(bo);
+}
+
 #endif
diff --git a/src/mesa/drivers/dri/intel/intel_extensions.c b/src/mesa/drivers/dri/intel/intel_extensions.c
index b6754c9fcb..5ac5ce10af 100644
--- a/src/mesa/drivers/dri/intel/intel_extensions.c
+++ b/src/mesa/drivers/dri/intel/intel_extensions.c
@@ -79,6 +79,7 @@ static const struct dri_extension card_extensions[] = {
    { "GL_ARB_half_float_pixel",           NULL },
    { "GL_ARB_map_buffer_range",           GL_ARB_map_buffer_range_functions },
    { "GL_ARB_multitexture",               NULL },
+   { "GL_ARB_pixel_buffer_object",      NULL },
    { "GL_ARB_point_parameters",           GL_ARB_point_parameters_functions },
    { "GL_ARB_point_sprite",               NULL },
    { "GL_ARB_shader_objects",             GL_ARB_shader_objects_functions },
@@ -104,6 +105,8 @@ static const struct dri_extension card_extensions[] = {
    { "GL_EXT_blend_logic_op",             NULL },
    { "GL_EXT_blend_subtract",             NULL },
    { "GL_EXT_cull_vertex",                GL_EXT_cull_vertex_functions },
+   { "GL_EXT_framebuffer_blit",         GL_EXT_framebuffer_blit_functions },
+   { "GL_EXT_framebuffer_object",       GL_EXT_framebuffer_object_functions },
    { "GL_EXT_fog_coord",                  GL_EXT_fog_coord_functions },
    { "GL_EXT_gpu_program_parameters",     GL_EXT_gpu_program_parameters_functions },
    { "GL_EXT_packed_depth_stencil",       NULL },
@@ -121,7 +124,6 @@ static const struct dri_extension card_extensions[] = {
    { "GL_MESA_pack_invert",               NULL },
    { "GL_MESA_ycbcr_texture",             NULL },
    { "GL_NV_blend_square",                NULL },
-   { "GL_NV_point_sprite",                GL_NV_point_sprite_functions },
    { "GL_NV_vertex_program",              GL_NV_vertex_program_functions },
    { "GL_NV_vertex_program1_1",           NULL },
    { "GL_SGIS_generate_mipmap",           NULL },
@@ -176,14 +178,6 @@ static const struct dri_extension arb_oq_extensions[] = {
    { NULL, NULL }
 };
 
-
-static const struct dri_extension ttm_extensions[] = {
-   { "GL_ARB_pixel_buffer_object",      NULL },
-   { "GL_EXT_framebuffer_blit",         GL_EXT_framebuffer_blit_functions },
-   { "GL_EXT_framebuffer_object",       GL_EXT_framebuffer_object_functions },
-   { NULL, NULL }
-};
-
 static const struct dri_extension fragment_shader_extensions[] = {
    { "GL_ARB_fragment_shader",            NULL },
    { NULL, NULL }
@@ -194,31 +188,24 @@ static const struct dri_extension fragment_shader_extensions[] = {
  * extensions for a context.
  */
 void
-intelInitExtensions(GLcontext *ctx, GLboolean enable_imaging)
+intelInitExtensions(GLcontext *ctx)
 {
-   struct intel_context *intel = ctx?intel_context(ctx):NULL;
+   struct intel_context *intel = intel_context(ctx);
 
    /* Disable imaging extension until convolution is working in teximage paths.
     */
-   enable_imaging = GL_FALSE;
-
-   driInitExtensions(ctx, card_extensions, enable_imaging);
-
-   if (intel == NULL || intel->ttm)
-      driInitExtensions(ctx, ttm_extensions, GL_FALSE);
+   driInitExtensions(ctx, card_extensions, GL_FALSE);
 
-   if (intel == NULL || IS_965(intel->intelScreen->deviceID))
+   if (intel->gen >= 4)
       driInitExtensions(ctx, brw_extensions, GL_FALSE);
 
-   if (intel == NULL || IS_915(intel->intelScreen->deviceID)
-       || IS_945(intel->intelScreen->deviceID)) {
+   if (intel->gen == 3) {
       driInitExtensions(ctx, i915_extensions, GL_FALSE);
 
-      if (intel == NULL || driQueryOptionb(&intel->optionCache, "fragment_shader"))
+      if (driQueryOptionb(&intel->optionCache, "fragment_shader"))
 	 driInitExtensions(ctx, fragment_shader_extensions, GL_FALSE);
 
-      if (intel == NULL || driQueryOptionb(&intel->optionCache,
-					   "stub_occlusion_query"))
+      if (driQueryOptionb(&intel->optionCache, "stub_occlusion_query"))
 	 driInitExtensions(ctx, arb_oq_extensions, GL_FALSE);
    }
 }
diff --git a/src/mesa/drivers/dri/intel/intel_extensions.h b/src/mesa/drivers/dri/intel/intel_extensions.h
index 97147ecdb0..e78e07356e 100644
--- a/src/mesa/drivers/dri/intel/intel_extensions.h
+++ b/src/mesa/drivers/dri/intel/intel_extensions.h
@@ -30,7 +30,10 @@
 
 
 extern void
-intelInitExtensions(GLcontext *ctx, GLboolean enable_imaging);
+intelInitExtensions(GLcontext *ctx);
+
+extern void
+intelFlushDrawable(__DRIdrawable *drawable);
 
 
 #endif
diff --git a/src/mesa/drivers/dri/intel/intel_fbo.c b/src/mesa/drivers/dri/intel/intel_fbo.c
index d8ac4d3663..d58ffd95fa 100644
--- a/src/mesa/drivers/dri/intel/intel_fbo.c
+++ b/src/mesa/drivers/dri/intel/intel_fbo.c
@@ -37,6 +37,7 @@
 #include "drivers/common/meta.h"
 
 #include "intel_context.h"
+#include "intel_batchbuffer.h"
 #include "intel_buffers.h"
 #include "intel_fbo.h"
 #include "intel_mipmap_tree.h"
@@ -105,8 +106,8 @@ intel_alloc_renderbuffer_storage(GLcontext * ctx, struct gl_renderbuffer *rb,
 {
    struct intel_context *intel = intel_context(ctx);
    struct intel_renderbuffer *irb = intel_renderbuffer(rb);
-   GLboolean softwareBuffer = GL_FALSE;
    int cpp;
+   GLuint pitch;
 
    ASSERT(rb->Name != 0);
 
@@ -116,18 +117,14 @@ intel_alloc_renderbuffer_storage(GLcontext * ctx, struct gl_renderbuffer *rb,
    case GL_RGB5:
       rb->Format = MESA_FORMAT_RGB565;
       rb->DataType = GL_UNSIGNED_BYTE;
-      irb->texformat = MESA_FORMAT_RGB565;
-      cpp = 2;
       break;
    case GL_RGB:
    case GL_RGB8:
    case GL_RGB10:
    case GL_RGB12:
    case GL_RGB16:
-      rb->Format = MESA_FORMAT_ARGB8888;
+      rb->Format = MESA_FORMAT_XRGB8888;
       rb->DataType = GL_UNSIGNED_BYTE;
-      irb->texformat = MESA_FORMAT_ARGB8888; /* XXX: Need xrgb8888 */
-      cpp = 4;
       break;
    case GL_RGBA:
    case GL_RGBA2:
@@ -139,8 +136,6 @@ intel_alloc_renderbuffer_storage(GLcontext * ctx, struct gl_renderbuffer *rb,
    case GL_RGBA16:
       rb->Format = MESA_FORMAT_ARGB8888;
       rb->DataType = GL_UNSIGNED_BYTE;
-      irb->texformat = MESA_FORMAT_ARGB8888;
-      cpp = 4;
       break;
    case GL_STENCIL_INDEX:
    case GL_STENCIL_INDEX1_EXT:
@@ -150,29 +145,21 @@ intel_alloc_renderbuffer_storage(GLcontext * ctx, struct gl_renderbuffer *rb,
       /* alloc a depth+stencil buffer */
       rb->Format = MESA_FORMAT_S8_Z24;
       rb->DataType = GL_UNSIGNED_INT_24_8_EXT;
-      cpp = 4;
-      irb->texformat = MESA_FORMAT_S8_Z24;
       break;
    case GL_DEPTH_COMPONENT16:
       rb->Format = MESA_FORMAT_Z16;
       rb->DataType = GL_UNSIGNED_SHORT;
-      cpp = 2;
-      irb->texformat = MESA_FORMAT_Z16;
       break;
    case GL_DEPTH_COMPONENT:
    case GL_DEPTH_COMPONENT24:
    case GL_DEPTH_COMPONENT32:
       rb->Format = MESA_FORMAT_S8_Z24;
       rb->DataType = GL_UNSIGNED_INT_24_8_EXT;
-      cpp = 4;
-      irb->texformat = MESA_FORMAT_S8_Z24;
       break;
    case GL_DEPTH_STENCIL_EXT:
    case GL_DEPTH24_STENCIL8_EXT:
       rb->Format = MESA_FORMAT_S8_Z24;
       rb->DataType = GL_UNSIGNED_INT_24_8_EXT;
-      cpp = 4;
-      irb->texformat = MESA_FORMAT_S8_Z24;
       break;
    default:
       _mesa_problem(ctx,
@@ -181,6 +168,7 @@ intel_alloc_renderbuffer_storage(GLcontext * ctx, struct gl_renderbuffer *rb,
    }
 
    rb->_BaseFormat = _mesa_base_fbo_format(ctx, internalFormat);
+   cpp = _mesa_get_format_bytes(rb->Format);
 
    intelFlush(ctx);
 
@@ -190,32 +178,25 @@ intel_alloc_renderbuffer_storage(GLcontext * ctx, struct gl_renderbuffer *rb,
    }
 
    /* allocate new memory region/renderbuffer */
-   if (softwareBuffer) {
-      return _mesa_soft_renderbuffer_storage(ctx, rb, internalFormat,
-                                             width, height);
-   }
-   else {
-      /* Choose a pitch to match hardware requirements:
-       */
-      GLuint pitch = ((cpp * width + 63) & ~63) / cpp;
 
-      /* alloc hardware renderbuffer */
-      DBG("Allocating %d x %d Intel RBO (pitch %d)\n", width,
-	  height, pitch);
+   /* Choose a pitch to match hardware requirements:
+    */
+   pitch = ((cpp * width + 63) & ~63) / cpp;
+
+   /* alloc hardware renderbuffer */
+   DBG("Allocating %d x %d Intel RBO (pitch %d)\n", width, height, pitch);
 
-      irb->region = intel_region_alloc(intel, I915_TILING_NONE,
-				       cpp, width, height, pitch,
-				       GL_TRUE);
-      if (!irb->region)
-         return GL_FALSE;       /* out of memory? */
+   irb->region = intel_region_alloc(intel, I915_TILING_NONE, cpp,
+				    width, height, pitch, GL_TRUE);
+   if (!irb->region)
+      return GL_FALSE;       /* out of memory? */
 
-      ASSERT(irb->region->buffer);
+   ASSERT(irb->region->buffer);
 
-      rb->Width = width;
-      rb->Height = height;
+   rb->Width = width;
+   rb->Height = height;
 
-      return GL_TRUE;
-   }
+   return GL_TRUE;
 }
 
 
@@ -241,7 +222,6 @@ static void
 intel_resize_buffers(GLcontext *ctx, struct gl_framebuffer *fb,
 		     GLuint width, GLuint height)
 {
-   struct intel_framebuffer *intel_fb = (struct intel_framebuffer*)fb;
    int i;
 
    _mesa_resize_framebuffer(ctx, fb, width, height);
@@ -252,9 +232,10 @@ intel_resize_buffers(GLcontext *ctx, struct gl_framebuffer *fb,
       return;
    }
 
+
    /* Make sure all window system renderbuffers are up to date */
-   for (i = 0; i < 2; i++) {
-      struct gl_renderbuffer *rb = &intel_fb->color_rb[i]->Base;
+   for (i = BUFFER_FRONT_LEFT; i <= BUFFER_BACK_RIGHT; i++) {
+      struct gl_renderbuffer *rb = fb->Attachment[i].Renderbuffer;
 
       /* only resize if size is changing */
       if (rb && (rb->Width != width || rb->Height != height)) {
@@ -297,7 +278,6 @@ intel_create_renderbuffer(gl_format format)
    GET_CURRENT_CONTEXT(ctx);
 
    struct intel_renderbuffer *irb;
-   const GLuint name = 0;
 
    irb = CALLOC_STRUCT(intel_renderbuffer);
    if (!irb) {
@@ -305,7 +285,7 @@ intel_create_renderbuffer(gl_format format)
       return NULL;
    }
 
-   _mesa_init_renderbuffer(&irb->Base, name);
+   _mesa_init_renderbuffer(&irb->Base, 0);
    irb->Base.ClassID = INTEL_RB_CLASS;
 
    switch (format) {
@@ -314,10 +294,6 @@ intel_create_renderbuffer(gl_format format)
       irb->Base.DataType = GL_UNSIGNED_BYTE;
       break;
    case MESA_FORMAT_XRGB8888:
-      /* XXX this is a hack since XRGB surfaces don't seem to work
-       * properly yet.  Reading the alpha channel returns 0 instead of 1.
-       */
-      format = MESA_FORMAT_ARGB8888;
       irb->Base._BaseFormat = GL_RGB;
       irb->Base.DataType = GL_UNSIGNED_BYTE;
       break;
@@ -346,7 +322,6 @@ intel_create_renderbuffer(gl_format format)
 
    irb->Base.Format = format;
    irb->Base.InternalFormat = irb->Base._BaseFormat;
-   irb->texformat = format;
 
    /* intel-specific methods */
    irb->Base.Delete = intel_delete_renderbuffer;
@@ -423,9 +398,6 @@ static GLboolean
 intel_update_wrapper(GLcontext *ctx, struct intel_renderbuffer *irb, 
 		     struct gl_texture_image *texImage)
 {
-   irb->texformat = texImage->TexFormat;
-   gl_format texFormat;
-
    if (texImage->TexFormat == MESA_FORMAT_ARGB8888) {
       irb->Base.DataType = GL_UNSIGNED_BYTE;
       DBG("Render to RGBA8 texture OK\n");
@@ -455,14 +427,13 @@ intel_update_wrapper(GLcontext *ctx, struct intel_renderbuffer *irb,
       DBG("Render to DEPTH_STENCIL texture OK\n");
    }
    else {
-      DBG("Render to texture BAD FORMAT %d\n", texImage->TexFormat);
+      DBG("Render to texture BAD FORMAT %s\n",
+	  _mesa_get_format_name(texImage->TexFormat));
       return GL_FALSE;
    }
 
    irb->Base.Format = texImage->TexFormat;
 
-   texFormat = texImage->TexFormat;
-
    irb->Base.InternalFormat = texImage->InternalFormat;
    irb->Base._BaseFormat = _mesa_base_fbo_format(ctx, irb->Base.InternalFormat);
    irb->Base.Width = texImage->Width;
@@ -577,6 +548,7 @@ intel_render_texture(GLcontext * ctx,
 					   dst_x) * intel_image->mt->cpp;
    intel_image->mt->region->draw_x = dst_x;
    intel_image->mt->region->draw_y = dst_y;
+   intel_image->used_as_render_target = GL_TRUE;
 
    /* update drawing region, etc */
    intel_draw_buffer(ctx, fb);
@@ -590,19 +562,23 @@ static void
 intel_finish_render_texture(GLcontext * ctx,
                             struct gl_renderbuffer_attachment *att)
 {
-   /* no-op
-    * Previously we released the renderbuffer's intel_region but
-    * that's not necessary and actually caused problems when trying
-    * to do a glRead/CopyPixels from the renderbuffer later.
-    * The region will be released later if the texture is replaced
-    * or the renderbuffer deleted.
-    *
-    * The intention of this driver hook is more of a "done rendering
-    * to texture, please re-twiddle/etc if necessary".
+   struct intel_context *intel = intel_context(ctx);
+   struct gl_texture_object *tex_obj = att->Texture;
+   struct gl_texture_image *image =
+      tex_obj->Image[att->CubeMapFace][att->TextureLevel];
+   struct intel_texture_image *intel_image = intel_texture_image(image);
+
+   /* Flag that this image may now be validated into the object's miptree. */
+   if (intel_image)
+      intel_image->used_as_render_target = GL_FALSE;
+
+   /* We have (probably) rendered to the texture and will (likely) use it
+    * in the texture domain later on in this batchbuffer, so flush the
+    * batch.  We still wish libdrm tracked domains in a batch like GEM.
     */
+   intel_batchbuffer_emit_mi_flush(intel->batch);
 }
 
-
 /**
  * Do additional "completeness" testing of a framebuffer object.
  */
@@ -615,11 +591,21 @@ intel_validate_framebuffer(GLcontext *ctx, struct gl_framebuffer *fb)
       intel_get_renderbuffer(fb, BUFFER_STENCIL);
    int i;
 
-   if (stencilRb && stencilRb != depthRb) {
-      /* we only support combined depth/stencil buffers, not separate
-       * stencil buffers.
-       */
-      fb->_Status = GL_FRAMEBUFFER_UNSUPPORTED_EXT;
+   if (depthRb && stencilRb && stencilRb != depthRb) {
+      if (fb->Attachment[BUFFER_DEPTH].Type == GL_TEXTURE &&
+	  fb->Attachment[BUFFER_STENCIL].Type == GL_TEXTURE &&
+	  (fb->Attachment[BUFFER_DEPTH].Texture->Name ==
+	   fb->Attachment[BUFFER_STENCIL].Texture->Name)) {
+	 /* OK: both attachments of fb name the same texture object */
+      } else {
+	 /* we only support combined depth/stencil buffers, not separate
+	  * stencil buffers.
+	  */
+	 DBG("Only supports combined depth/stencil (found %s, %s)\n",
+	     depthRb ? _mesa_get_format_name(depthRb->Base.Format): "NULL",
+	     stencilRb ? _mesa_get_format_name(stencilRb->Base.Format): "NULL");
+	 fb->_Status = GL_FRAMEBUFFER_UNSUPPORTED_EXT;
+      }
    }
 
    for (i = 0; i < ctx->Const.MaxDrawBuffers; i++) {
@@ -630,11 +616,12 @@ intel_validate_framebuffer(GLcontext *ctx, struct gl_framebuffer *fb)
 	 continue;
 
       if (irb == NULL) {
+	 DBG("software rendering renderbuffer\n");
 	 fb->_Status = GL_FRAMEBUFFER_UNSUPPORTED_EXT;
 	 continue;
       }
 
-      switch (irb->texformat) {
+      switch (irb->Base.Format) {
       case MESA_FORMAT_ARGB8888:
       case MESA_FORMAT_XRGB8888:
       case MESA_FORMAT_RGB565:
diff --git a/src/mesa/drivers/dri/intel/intel_fbo.h b/src/mesa/drivers/dri/intel/intel_fbo.h
index 50a8a95985..586dbbbb25 100644
--- a/src/mesa/drivers/dri/intel/intel_fbo.h
+++ b/src/mesa/drivers/dri/intel/intel_fbo.h
@@ -34,27 +34,6 @@
 struct intel_context;
 
 /**
- * Intel framebuffer, derived from gl_framebuffer.
- */
-struct intel_framebuffer
-{
-   struct gl_framebuffer Base;
-
-   struct intel_renderbuffer *color_rb[2];
-
-   /* VBI
-    */
-   GLuint vbl_waited;
-
-   int64_t swap_ust;
-   int64_t swap_missed_ust;
-
-   GLuint swap_count;
-   GLuint swap_missed_count;
-};
-
-
-/**
  * Intel renderbuffer, derived from gl_renderbuffer.
  */
 struct intel_renderbuffer
@@ -62,10 +41,6 @@ struct intel_renderbuffer
    struct gl_renderbuffer Base;
    struct intel_region *region;
 
-   gl_format texformat;
-
-   GLuint vbl_pending;   /**< vblank sequence number of pending flip */
-
    uint8_t *span_cache;
    unsigned long span_cache_offset;
 };
@@ -123,7 +98,7 @@ intel_fbo_init(struct intel_context *intel);
 
 
 extern void
-intel_flip_renderbuffers(struct intel_framebuffer *intel_fb);
+intel_flip_renderbuffers(struct gl_framebuffer *fb);
 
 
 static INLINE struct intel_region *
diff --git a/src/mesa/drivers/dri/intel/intel_mipmap_tree.c b/src/mesa/drivers/dri/intel/intel_mipmap_tree.c
index 3996c100a5..82e4150c6a 100644
--- a/src/mesa/drivers/dri/intel/intel_mipmap_tree.c
+++ b/src/mesa/drivers/dri/intel/intel_mipmap_tree.c
@@ -87,7 +87,7 @@ intel_miptree_create_internal(struct intel_context *intel,
    mt->pitch = 0;
 
 #ifdef I915
-   if (IS_945(intel->intelScreen->deviceID))
+   if (intel->is_945)
       ok = i945_miptree_layout(intel, mt, tiling);
    else
       ok = i915_miptree_layout(intel, mt, tiling);
@@ -122,7 +122,7 @@ intel_miptree_create(struct intel_context *intel,
 
    if (intel->use_texture_tiling && compress_byte == 0 &&
        intel->intelScreen->kernel_exec_fencing) {
-      if (IS_965(intel->intelScreen->deviceID) &&
+      if (intel->gen >= 4 &&
 	  (base_format == GL_DEPTH_COMPONENT ||
 	   base_format == GL_DEPTH_STENCIL_EXT))
 	 tiling = I915_TILING_Y;
@@ -224,16 +224,12 @@ int intel_miptree_pitch_align (struct intel_context *intel,
    if (!mt->compressed) {
       int pitch_align;
 
-      if (intel->ttm) {
-	 /* XXX: Align pitch to multiple of 64 bytes for now to allow
-	  * render-to-texture to work in all cases. This should probably be
-	  * replaced at some point by some scheme to only do this when really
-	  * necessary.
-	  */
-	 pitch_align = 64;
-      } else {
-	 pitch_align = 4;
-      }
+      /* XXX: Align pitch to multiple of 64 bytes for now to allow
+       * render-to-texture to work in all cases. This should probably be
+       * replaced at some point by some scheme to only do this when really
+       * necessary.
+       */
+      pitch_align = 64;
 
       if (tiling == I915_TILING_X)
 	 pitch_align = 512;
@@ -315,17 +311,14 @@ intel_miptree_release(struct intel_context *intel,
  */
 GLboolean
 intel_miptree_match_image(struct intel_mipmap_tree *mt,
-                          struct gl_texture_image *image,
-                          GLuint face, GLuint level)
+                          struct gl_texture_image *image)
 {
    GLboolean isCompressed = _mesa_is_format_compressed(image->TexFormat);
+   struct intel_texture_image *intelImage = intel_texture_image(image);
+   GLuint level = intelImage->level;
 
-   /* Images with borders are never pulled into mipmap trees. 
-    */
-   if (image->Border ||
-       ((image->_BaseFormat == GL_DEPTH_COMPONENT) &&
-        ((image->TexObject->WrapS == GL_CLAMP_TO_BORDER) ||
-         (image->TexObject->WrapT == GL_CLAMP_TO_BORDER)))) 
+   /* Images with borders are never pulled into mipmap trees. */
+   if (image->Border)
       return GL_FALSE;
 
    if (image->InternalFormat != mt->internal_format ||
diff --git a/src/mesa/drivers/dri/intel/intel_mipmap_tree.h b/src/mesa/drivers/dri/intel/intel_mipmap_tree.h
index 3bce54daa1..b19c548def 100644
--- a/src/mesa/drivers/dri/intel/intel_mipmap_tree.h
+++ b/src/mesa/drivers/dri/intel/intel_mipmap_tree.h
@@ -165,8 +165,7 @@ void intel_miptree_release(struct intel_context *intel,
 /* Check if an image fits an existing mipmap tree layout
  */
 GLboolean intel_miptree_match_image(struct intel_mipmap_tree *mt,
-                                    struct gl_texture_image *image,
-                                    GLuint face, GLuint level);
+                                    struct gl_texture_image *image);
 
 /* Return a pointer to an image within a tree.  Return image stride as
  * well.
diff --git a/src/mesa/drivers/dri/intel/intel_pixel.c b/src/mesa/drivers/dri/intel/intel_pixel.c
index 993e427a99..5142f3dcd9 100644
--- a/src/mesa/drivers/dri/intel/intel_pixel.c
+++ b/src/mesa/drivers/dri/intel/intel_pixel.c
@@ -88,10 +88,10 @@ intel_check_blit_fragment_ops(GLcontext * ctx, GLboolean src_alpha_is_one)
       return GL_FALSE;
    }
 
-   if (!(ctx->Color.ColorMask[0] &&
-	 ctx->Color.ColorMask[1] &&
-	 ctx->Color.ColorMask[2] &&
-	 ctx->Color.ColorMask[3])) {
+   if (!(ctx->Color.ColorMask[0][0] &&
+	 ctx->Color.ColorMask[0][1] &&
+	 ctx->Color.ColorMask[0][2] &&
+	 ctx->Color.ColorMask[0][3])) {
       DBG("fallback due to color masking\n");
       return GL_FALSE;
    }
diff --git a/src/mesa/drivers/dri/intel/intel_pixel_bitmap.c b/src/mesa/drivers/dri/intel/intel_pixel_bitmap.c
index 99330b6ddf..85e5ad2cdd 100644
--- a/src/mesa/drivers/dri/intel/intel_pixel_bitmap.c
+++ b/src/mesa/drivers/dri/intel/intel_pixel_bitmap.c
@@ -32,6 +32,7 @@
 #include "main/mtypes.h"
 #include "main/macros.h"
 #include "main/bufferobj.h"
+#include "main/polygon.h"
 #include "main/pixelstore.h"
 #include "main/polygon.h"
 #include "main/state.h"
@@ -165,7 +166,7 @@ static GLuint get_bitmap_rect(GLsizei width, GLsizei height,
  * Returns the low Y value of the vertical range given, flipped according to
  * whether the framebuffer is or not.
  */
-static inline int
+static INLINE int
 y_flip(struct gl_framebuffer *fb, int y, int height)
 {
    if (fb->Name != 0)
@@ -228,16 +229,13 @@ do_blit_bitmap( GLcontext *ctx,
    UNCLAMPED_FLOAT_TO_UBYTE(ubcolor[3], tmpColor[3]);
 
    if (dst->cpp == 2)
-      color = INTEL_PACKCOLOR565(ubcolor[0], ubcolor[1], ubcolor[2]);
+      color = PACK_COLOR_565(ubcolor[0], ubcolor[1], ubcolor[2]);
    else
-      color = INTEL_PACKCOLOR8888(ubcolor[0], ubcolor[1],
-				  ubcolor[2], ubcolor[3]);
+      color = PACK_COLOR_8888(ubcolor[3], ubcolor[0], ubcolor[1], ubcolor[2]);
 
    if (!intel_check_blit_fragment_ops(ctx, tmpColor[3] == 1.0F))
       return GL_FALSE;
 
-   LOCK_HARDWARE(intel);
-
    intel_get_cliprects(intel, &cliprects, &num_cliprects, &x_off, &y_off);
    if (num_cliprects != 0) {
       GLuint i;
@@ -325,7 +323,6 @@ do_blit_bitmap( GLcontext *ctx,
       }
    }
 out:
-   UNLOCK_HARDWARE(intel);
 
    if (INTEL_DEBUG & DEBUG_SYNC)
       intel_batchbuffer_flush(intel->batch);
@@ -336,6 +333,8 @@ out:
                               unpack->BufferObj);
    }
 
+   intel_check_front_buffer_rendering(intel);
+
    return GL_TRUE;
 }
 
@@ -502,6 +501,7 @@ intel_texture_bitmap(GLcontext * ctx,
    meta_restore_fragment_program(&intel->meta);
    meta_restore_vertex_program(&intel->meta);
 
+   _mesa_ActiveTextureARB(GL_TEXTURE0_ARB + old_active_texture);
    _mesa_PopClientAttrib();
    _mesa_PopAttrib();
 
diff --git a/src/mesa/drivers/dri/intel/intel_pixel_copy.c b/src/mesa/drivers/dri/intel/intel_pixel_copy.c
index f058b3c8e4..e002516cdd 100644
--- a/src/mesa/drivers/dri/intel/intel_pixel_copy.c
+++ b/src/mesa/drivers/dri/intel/intel_pixel_copy.c
@@ -35,28 +35,33 @@
 #include "intel_buffers.h"
 #include "intel_regions.h"
 #include "intel_pixel.h"
+#include "intel_fbo.h"
 
 #define FILE_DEBUG_FLAG DEBUG_PIXEL
 
 static struct intel_region *
 copypix_src_region(struct intel_context *intel, GLenum type)
 {
+   struct intel_renderbuffer *depth;
+
+   /* intel_get_renderbuffer() may return NULL; check before use. */
+   depth = intel_get_renderbuffer(intel->ctx.DrawBuffer, BUFFER_DEPTH);
+
    switch (type) {
    case GL_COLOR:
       return intel_readbuf_region(intel);
    case GL_DEPTH:
-      /* Don't think this is really possible execpt at 16bpp, when we have no stencil.
-       */
-      if (intel->depth_region && intel->depth_region->cpp == 2)
-         return intel->depth_region;
+      /* Don't think this is really possible except at 16bpp, when we
+       * have no stencil.  Otherwise fall through to the break below. */
+      if (depth && depth->region && depth->region->cpp == 2)
+         return depth->region;
    case GL_STENCIL:
-      /* Don't think this is really possible. 
-       */
+      /* Don't think this is really possible. */
       break;
    case GL_DEPTH_STENCIL_EXT:
       /* Does it matter whether it is stencil/depth or depth/stencil?
        */
-      return intel->depth_region;
+      return depth ? depth->region : NULL;
    default:
       break;
    }
@@ -83,10 +88,10 @@ intel_check_copypixel_blit_fragment_ops(GLcontext * ctx)
             ctx->Depth.Test ||
             ctx->Fog.Enabled ||
             ctx->Stencil._Enabled ||
-            !ctx->Color.ColorMask[0] ||
-            !ctx->Color.ColorMask[1] ||
-            !ctx->Color.ColorMask[2] ||
-            !ctx->Color.ColorMask[3] ||
+            !ctx->Color.ColorMask[0][0] ||
+            !ctx->Color.ColorMask[0][1] ||
+            !ctx->Color.ColorMask[0][2] ||
+            !ctx->Color.ColorMask[0][3] ||
             ctx->Texture._EnabledUnits ||
 	    ctx->FragmentProgram._Enabled ||
 	    ctx->Color.BlendEnabled);
@@ -134,8 +139,6 @@ do_blit_copypixels(GLcontext * ctx,
 
    intelFlush(&intel->ctx);
 
-   LOCK_HARDWARE(intel);
-
    intel_get_cliprects(intel, &cliprects, &num_cliprects, &x_off, &y_off);
    if (num_cliprects != 0) {
       GLint delta_x;
@@ -214,13 +217,13 @@ do_blit_copypixels(GLcontext * ctx,
 				ctx->Color.ColorLogicOpEnabled ?
 				ctx->Color.LogicOp : GL_COPY)) {
 	    DBG("%s: blit failure\n", __FUNCTION__);
-	    UNLOCK_HARDWARE(intel);
 	    return GL_FALSE;
 	 }
       }
    }
 out:
-   UNLOCK_HARDWARE(intel);
+
+   intel_check_front_buffer_rendering(intel);
 
    DBG("%s: success\n", __FUNCTION__);
    return GL_TRUE;
diff --git a/src/mesa/drivers/dri/intel/intel_pixel_draw.c b/src/mesa/drivers/dri/intel/intel_pixel_draw.c
index 9b382e3622..b870e9315e 100644
--- a/src/mesa/drivers/dri/intel/intel_pixel_draw.c
+++ b/src/mesa/drivers/dri/intel/intel_pixel_draw.c
@@ -69,7 +69,6 @@ intel_stencil_drawpixels(GLcontext * ctx,
    GLfloat vertices[4][2];
    struct intel_renderbuffer *irb;
    struct intel_renderbuffer *depth_irb;
-   struct gl_renderbuffer *rb;
    struct gl_pixelstore_attrib old_unpack;
    GLstencil *stencil_pixels;
    int row, y1, y2;
@@ -170,7 +169,6 @@ intel_stencil_drawpixels(GLcontext * ctx,
     */
    depth_irb = intel_get_renderbuffer(ctx->DrawBuffer, BUFFER_DEPTH);
    irb = intel_create_renderbuffer(MESA_FORMAT_ARGB8888);
-   rb = &irb->Base;
    irb->Base.Width = depth_irb->Base.Width;
    irb->Base.Height = depth_irb->Base.Height;
    intel_renderbuffer_set_region(irb, depth_irb->region);
diff --git a/src/mesa/drivers/dri/intel/intel_pixel_read.c b/src/mesa/drivers/dri/intel/intel_pixel_read.c
index 4707500180..9c0fdc6067 100644
--- a/src/mesa/drivers/dri/intel/intel_pixel_read.c
+++ b/src/mesa/drivers/dri/intel/intel_pixel_read.c
@@ -77,7 +77,7 @@ do_texture_readpixels(GLcontext * ctx,
    struct intel_context *intel = intel_context(ctx);
    intelScreenPrivate *screen = intel->intelScreen;
    GLint pitch = pack->RowLength ? pack->RowLength : width;
-   __DRIdrawablePrivate *dPriv = intel->driDrawable;
+   __DRIdrawable *dPriv = intel->driDrawable;
    int textureFormat;
    GLenum glTextureFormat;
    int destFormat, depthFormat, destPitch;
@@ -105,15 +105,12 @@ do_texture_readpixels(GLcontext * ctx,
       return GL_FALSE;
    }
 
-   LOCK_HARDWARE(intel);
-
    if (intel->driDrawable->numClipRects) {
       intel->vtbl.install_meta_state(intel);
       intel->vtbl.meta_no_depth_write(intel);
       intel->vtbl.meta_no_stencil_write(intel);
 
       if (!driClipRectToFramebuffer(ctx->ReadBuffer, &x, &y, &width, &height)) {
-         UNLOCK_HARDWARE(intel);
          SET_STATE(i830, state);
          if (INTEL_DEBUG & DEBUG_PIXEL)
             fprintf(stderr, "%s: cliprect failed\n", __FUNCTION__);
@@ -150,7 +147,6 @@ do_texture_readpixels(GLcontext * ctx,
 
       intel->vtbl.leave_meta_state(intel);
    }
-   UNLOCK_HARDWARE(intel);
 
    intel_region_wait_fence(ctx, dest_region);   /* required by GL */
    return GL_TRUE;
@@ -224,7 +220,6 @@ do_blit_readpixels(GLcontext * ctx,
     * fire with lock held to guarentee cliprects are correct.
     */
    intelFlush(&intel->ctx);
-   LOCK_HARDWARE(intel);
 
    if (intel->driReadDrawable->numClipRects) {
       GLboolean all = (width * height * src->cpp == dst->Base.Size &&
@@ -233,7 +228,7 @@ do_blit_readpixels(GLcontext * ctx,
       dri_bo *dst_buffer = intel_bufferobj_buffer(intel, dst,
 						  all ? INTEL_WRITE_FULL :
 						  INTEL_WRITE_PART);
-      __DRIdrawablePrivate *dPriv = intel->driReadDrawable;
+      __DRIdrawable *dPriv = intel->driReadDrawable;
       int nbox = dPriv->numClipRects;
       drm_clip_rect_t *box = dPriv->pClipRects;
       drm_clip_rect_t rect;
@@ -261,12 +256,10 @@ do_blit_readpixels(GLcontext * ctx,
 				rect.y2 - src_rect.y2,
 				rect.x2 - rect.x1, rect.y2 - rect.y1,
 				GL_COPY)) {
-	    UNLOCK_HARDWARE(intel);
 	    return GL_FALSE;
 	 }
       }
    }
-   UNLOCK_HARDWARE(intel);
 
    if (INTEL_DEBUG & DEBUG_PIXEL)
       _mesa_printf("%s - DONE\n", __FUNCTION__);
@@ -285,11 +278,11 @@ intelReadPixels(GLcontext * ctx,
 
    intelFlush(ctx);
 
-#ifdef I915
    if (do_blit_readpixels
        (ctx, x, y, width, height, format, type, pack, pixels))
       return;
 
+#ifdef I915
    if (do_texture_readpixels
        (ctx, x, y, width, height, format, type, pack, pixels))
       return;
diff --git a/src/mesa/drivers/dri/intel/intel_regions.c b/src/mesa/drivers/dri/intel/intel_regions.c
index a86c66a844..61aefa01b8 100644
--- a/src/mesa/drivers/dri/intel/intel_regions.c
+++ b/src/mesa/drivers/dri/intel/intel_regions.c
@@ -362,14 +362,12 @@ intel_region_data(struct intel_context *intel,
          intel_region_cow(intel, dst);
    }
 
-   LOCK_HARDWARE(intel);
    _mesa_copy_rect(intel_region_map(intel, dst) + dst_offset,
                    dst->cpp,
                    dst->pitch,
                    dstx, dsty, width, height, src, src_pitch, srcx, srcy);
 
    intel_region_unmap(intel, dst);
-   UNLOCK_HARDWARE(intel);
 }
 
 /* Copy rectangular sub-regions. Need better logic about when to
@@ -485,7 +483,6 @@ intel_region_cow(struct intel_context *intel, struct intel_region *region)
    /* Now blit from the texture buffer to the new buffer: 
     */
 
-   LOCK_HARDWARE(intel);
    ok = intelEmitCopyBlit(intel,
                           region->cpp,
                           region->pitch, pbo->buffer, 0, region->tiling,
@@ -494,7 +491,6 @@ intel_region_cow(struct intel_context *intel, struct intel_region *region)
                           region->pitch, region->height,
                           GL_COPY);
    assert(ok);
-   UNLOCK_HARDWARE(intel);
 }
 
 dri_bo *
@@ -510,126 +506,3 @@ intel_region_buffer(struct intel_context *intel,
 
    return region->buffer;
 }
-
-static struct intel_region *
-intel_recreate_static(struct intel_context *intel,
-		      const char *name,
-		      struct intel_region *region,
-		      intelRegion *region_desc)
-{
-   intelScreenPrivate *intelScreen = intel->intelScreen;
-   int ret;
-
-   if (region == NULL) {
-      region = calloc(sizeof(*region), 1);
-      region->refcount = 1;
-      _DBG("%s creating new region %p\n", __FUNCTION__, region);
-   }
-   else {
-      _DBG("%s %p\n", __FUNCTION__, region);
-   }
-
-   if (intel->ctx.Visual.rgbBits == 24)
-      region->cpp = 4;
-   else
-      region->cpp = intel->ctx.Visual.rgbBits / 8;
-   region->pitch = intelScreen->pitch;
-   region->width = intelScreen->width;
-   region->height = intelScreen->height;
-
-   if (region->buffer != NULL) {
-      dri_bo_unreference(region->buffer);
-      region->buffer = NULL;
-   }
-
-   if (intel->ttm) {
-      assert(region_desc->bo_handle != -1);
-      region->buffer = intel_bo_gem_create_from_name(intel->bufmgr,
-						     name,
-						     region_desc->bo_handle);
-
-      ret = dri_bo_get_tiling(region->buffer, &region->tiling,
-			      &region->bit_6_swizzle);
-      if (ret != 0) {
-	 fprintf(stderr, "Couldn't get tiling of buffer %d (%s): %s\n",
-		 region_desc->bo_handle, name, strerror(-ret));
-	 intel_region_release(&region);
-	 return NULL;
-      }
-   } else {
-      if (region->classic_map != NULL) {
-	 drmUnmap(region->classic_map,
-		  region->pitch * region->cpp * region->height);
-	 region->classic_map = NULL;
-      }
-      ret = drmMap(intel->driFd, region_desc->handle,
-		   region->pitch * region->cpp * region->height,
-		   &region->classic_map);
-      if (ret != 0) {
-	 fprintf(stderr, "Failed to drmMap %s buffer\n", name);
-	 free(region);
-	 return NULL;
-      }
-
-      region->buffer = intel_bo_fake_alloc_static(intel->bufmgr,
-						  name,
-						  region_desc->offset,
-						  region->pitch * region->cpp *
-						  region->height,
-						  region->classic_map);
-
-      /* The sarea just gives us a boolean for whether it's tiled or not,
-       * instead of which tiling mode it is.  Guess.
-       */
-      if (region_desc->tiled) {
-	 if (IS_965(intel->intelScreen->deviceID) &&
-	     region_desc == &intelScreen->depth)
-	    region->tiling = I915_TILING_Y;
-	 else
-	    region->tiling = I915_TILING_X;
-      } else {
-	 region->tiling = I915_TILING_NONE;
-      }
-
-      region->bit_6_swizzle = I915_BIT_6_SWIZZLE_NONE;
-   }
-
-   assert(region->buffer != NULL);
-
-   return region;
-}
-
-/**
- * Create intel_region structs to describe the static front, back, and depth
- * buffers created by the xserver.
- *
- * Although FBO's mean we now no longer use these as render targets in
- * all circumstances, they won't go away until the back and depth
- * buffers become private, and the front buffer will remain even then.
- *
- * Note that these don't allocate video memory, just describe
- * allocations alread made by the X server.
- */
-void
-intel_recreate_static_regions(struct intel_context *intel)
-{
-   intelScreenPrivate *intelScreen = intel->intelScreen;
-
-   intel->front_region =
-      intel_recreate_static(intel, "front",
-			    intel->front_region,
-			    &intelScreen->front);
-
-   intel->back_region =
-      intel_recreate_static(intel, "back",
-			    intel->back_region,
-			    &intelScreen->back);
-
-   /* Still assumes front.cpp == depth.cpp.  We can kill this when we move to
-    * private buffers.
-    */
-   intel->depth_region =
-      intel_recreate_static(intel, "depth",
-			    intel->depth_region,
-			    &intelScreen->depth);
-}
diff --git a/src/mesa/drivers/dri/intel/intel_screen.c b/src/mesa/drivers/dri/intel/intel_screen.c
index 789135b49f..e240957197 100644
--- a/src/mesa/drivers/dri/intel/intel_screen.c
+++ b/src/mesa/drivers/dri/intel/intel_screen.c
@@ -31,7 +31,6 @@
 #include "main/renderbuffer.h"
 
 #include "utils.h"
-#include "vblank.h"
 #include "xmlpool.h"
 
 #include "intel_batchbuffer.h"
@@ -41,7 +40,6 @@
 #include "intel_extensions.h"
 #include "intel_fbo.h"
 #include "intel_regions.h"
-#include "intel_swapbuffers.h"
 #include "intel_screen.h"
 #include "intel_span.h"
 #include "intel_tex.h"
@@ -57,7 +55,6 @@
 PUBLIC const char __driConfigOptions[] =
    DRI_CONF_BEGIN
    DRI_CONF_SECTION_PERFORMANCE
-      DRI_CONF_FTHROTTLE_MODE(DRI_CONF_FTHROTTLE_IRQS)
       DRI_CONF_VBLANK_MODE(DRI_CONF_VBLANK_ALWAYS_SYNC)
       /* Options correspond to DRI_CONF_BO_REUSE_DISABLED,
        * DRI_CONF_BO_REUSE_ALL
@@ -99,157 +96,58 @@ PUBLIC const char __driConfigOptions[] =
    DRI_CONF_SECTION_END
 DRI_CONF_END;
 
-const GLuint __driNConfigOptions = 12;
+const GLuint __driNConfigOptions = 11;
 
 #ifdef USE_NEW_INTERFACE
 static PFNGLXCREATECONTEXTMODES create_context_modes = NULL;
 #endif /*USE_NEW_INTERFACE */
 
-/**
- * Map all the memory regions described by the screen.
- * \return GL_TRUE if success, GL_FALSE if error.
- */
-GLboolean
-intelMapScreenRegions(__DRIscreenPrivate * sPriv)
-{
-   intelScreenPrivate *intelScreen = (intelScreenPrivate *) sPriv->private;
-
-   if (0)
-      _mesa_printf("TEX 0x%08x ", intelScreen->tex.handle);
-   if (intelScreen->tex.size != 0) {
-      if (drmMap(sPriv->fd,
-		 intelScreen->tex.handle,
-		 intelScreen->tex.size,
-		 (drmAddress *) & intelScreen->tex.map) != 0) {
-	 intelUnmapScreenRegions(intelScreen);
-	 return GL_FALSE;
-      }
-   }
-
-   return GL_TRUE;
-}
-
-void
-intelUnmapScreenRegions(intelScreenPrivate * intelScreen)
-{
-   if (intelScreen->tex.map) {
-      drmUnmap(intelScreen->tex.map, intelScreen->tex.size);
-      intelScreen->tex.map = NULL;
-   }
-}
+static const __DRItexOffsetExtension intelTexOffsetExtension = {
+   { __DRI_TEX_OFFSET },
+   intelSetTexOffset,
+};
 
+static const __DRItexBufferExtension intelTexBufferExtension = {
+    { __DRI_TEX_BUFFER, __DRI_TEX_BUFFER_VERSION },
+   intelSetTexBuffer,
+   intelSetTexBuffer2,
+};
 
 static void
-intelPrintDRIInfo(intelScreenPrivate * intelScreen,
-                  __DRIscreenPrivate * sPriv, I830DRIPtr gDRIPriv)
+intelDRI2Flush(__DRIdrawable *drawable)
 {
-   fprintf(stderr, "*** Front size:   0x%x  offset: 0x%x  pitch: %d\n",
-           intelScreen->front.size, intelScreen->front.offset,
-           intelScreen->pitch);
-   fprintf(stderr, "*** Back size:    0x%x  offset: 0x%x  pitch: %d\n",
-           intelScreen->back.size, intelScreen->back.offset,
-           intelScreen->pitch);
-   fprintf(stderr, "*** Depth size:   0x%x  offset: 0x%x  pitch: %d\n",
-           intelScreen->depth.size, intelScreen->depth.offset,
-           intelScreen->pitch);
-   fprintf(stderr, "*** Texture size: 0x%x  offset: 0x%x\n",
-           intelScreen->tex.size, intelScreen->tex.offset);
-   fprintf(stderr, "*** Memory : 0x%x\n", gDRIPriv->mem);
-}
+   struct intel_context *intel = drawable->driContextPriv->driverPrivate;
 
+   if (intel->gen < 4)
+      INTEL_FIREVERTICES(intel);
 
-static void
-intelPrintSAREA(const drm_i915_sarea_t * sarea)
-{
-   fprintf(stderr, "SAREA: sarea width %d  height %d\n", sarea->width,
-           sarea->height);
-   fprintf(stderr, "SAREA: pitch: %d\n", sarea->pitch);
-   fprintf(stderr,
-           "SAREA: front offset: 0x%08x  size: 0x%x  handle: 0x%x tiled: %d\n",
-           sarea->front_offset, sarea->front_size,
-           (unsigned) sarea->front_handle, sarea->front_tiled);
-   fprintf(stderr,
-           "SAREA: back  offset: 0x%08x  size: 0x%x  handle: 0x%x tiled: %d\n",
-           sarea->back_offset, sarea->back_size,
-           (unsigned) sarea->back_handle, sarea->back_tiled);
-   fprintf(stderr, "SAREA: depth offset: 0x%08x  size: 0x%x  handle: 0x%x tiled: %d\n",
-           sarea->depth_offset, sarea->depth_size,
-           (unsigned) sarea->depth_handle, sarea->depth_tiled);
-   fprintf(stderr, "SAREA: tex   offset: 0x%08x  size: 0x%x  handle: 0x%x\n",
-           sarea->tex_offset, sarea->tex_size, (unsigned) sarea->tex_handle);
+   if (intel->batch->map != intel->batch->ptr)
+      intel_batchbuffer_flush(intel->batch);
 }
 
-
-/**
- * A number of the screen parameters are obtained/computed from
- * information in the SAREA.  This function updates those parameters.
- */
 static void
-intelUpdateScreenFromSAREA(intelScreenPrivate * intelScreen,
-                           drm_i915_sarea_t * sarea)
+intelDRI2FlushInvalidate(__DRIdrawable *drawable)
 {
-   intelScreen->width = sarea->width;
-   intelScreen->height = sarea->height;
-   intelScreen->pitch = sarea->pitch;
-
-   intelScreen->front.offset = sarea->front_offset;
-   intelScreen->front.handle = sarea->front_handle;
-   intelScreen->front.size = sarea->front_size;
-   intelScreen->front.tiled = sarea->front_tiled;
-
-   intelScreen->back.offset = sarea->back_offset;
-   intelScreen->back.handle = sarea->back_handle;
-   intelScreen->back.size = sarea->back_size;
-   intelScreen->back.tiled = sarea->back_tiled;
-
-   intelScreen->depth.offset = sarea->depth_offset;
-   intelScreen->depth.handle = sarea->depth_handle;
-   intelScreen->depth.size = sarea->depth_size;
-   intelScreen->depth.tiled = sarea->depth_tiled;
-
-   if (intelScreen->driScrnPriv->ddx_version.minor >= 9) {
-      intelScreen->front.bo_handle = sarea->front_bo_handle;
-      intelScreen->back.bo_handle = sarea->back_bo_handle;
-      intelScreen->depth.bo_handle = sarea->depth_bo_handle;
-   } else {
-      intelScreen->front.bo_handle = -1;
-      intelScreen->back.bo_handle = -1;
-      intelScreen->depth.bo_handle = -1;
-   }
-
-   intelScreen->tex.offset = sarea->tex_offset;
-   intelScreen->logTextureGranularity = sarea->log_tex_granularity;
-   intelScreen->tex.handle = sarea->tex_handle;
-   intelScreen->tex.size = sarea->tex_size;
-
-   if (0)
-      intelPrintSAREA(sarea);
+   intelDRI2Flush(drawable);
+   drawable->validBuffers = GL_FALSE;
 }
 
-static const __DRItexOffsetExtension intelTexOffsetExtension = {
-   { __DRI_TEX_OFFSET },
-   intelSetTexOffset,
-};
-
-static const __DRItexBufferExtension intelTexBufferExtension = {
-    { __DRI_TEX_BUFFER, __DRI_TEX_BUFFER_VERSION },
-   intelSetTexBuffer,
-   intelSetTexBuffer2,
+static const struct __DRI2flushExtensionRec intelFlushExtension = {
+    { __DRI2_FLUSH, __DRI2_FLUSH_VERSION },
+    intelDRI2Flush,
+    intelDRI2FlushInvalidate,
 };
 
 static const __DRIextension *intelScreenExtensions[] = {
     &driReadDrawableExtension,
-    &driCopySubBufferExtension.base,
-    &driSwapControlExtension.base,
-    &driFrameTrackingExtension.base,
-    &driMediaStreamCounterExtension.base,
     &intelTexOffsetExtension.base,
     &intelTexBufferExtension.base,
+    &intelFlushExtension.base,
     NULL
 };
 
 static GLboolean
-intel_get_param(__DRIscreenPrivate *psp, int param, int *value)
+intel_get_param(__DRIscreen *psp, int param, int *value)
 {
    int ret;
    struct drm_i915_getparam gp;
@@ -266,68 +164,12 @@ intel_get_param(__DRIscreenPrivate *psp, int param, int *value)
    return GL_TRUE;
 }
 
-static GLboolean intelInitDriver(__DRIscreenPrivate *sPriv)
-{
-   intelScreenPrivate *intelScreen;
-   I830DRIPtr gDRIPriv = (I830DRIPtr) sPriv->pDevPriv;
-   drm_i915_sarea_t *sarea;
-
-   if (sPriv->devPrivSize != sizeof(I830DRIRec)) {
-      fprintf(stderr,
-              "\nERROR!  sizeof(I830DRIRec) does not match passed size from device driver\n");
-      return GL_FALSE;
-   }
-
-   /* Allocate the private area */
-   intelScreen = (intelScreenPrivate *) CALLOC(sizeof(intelScreenPrivate));
-   if (!intelScreen) {
-      fprintf(stderr, "\nERROR!  Allocating private area failed\n");
-      return GL_FALSE;
-   }
-   /* parse information in __driConfigOptions */
-   driParseOptionInfo(&intelScreen->optionCache,
-                      __driConfigOptions, __driNConfigOptions);
-
-   intelScreen->driScrnPriv = sPriv;
-   sPriv->private = (void *) intelScreen;
-   sarea = (drm_i915_sarea_t *)
-      (((GLubyte *) sPriv->pSAREA) + gDRIPriv->sarea_priv_offset);
-   intelScreen->sarea = sarea;
-
-   intelScreen->deviceID = gDRIPriv->deviceID;
-
-   intelUpdateScreenFromSAREA(intelScreen, sarea);
-
-   if (!intelMapScreenRegions(sPriv)) {
-      fprintf(stderr, "\nERROR!  mapping regions\n");
-      _mesa_free(intelScreen);
-      sPriv->private = NULL;
-      return GL_FALSE;
-   }
-
-   if (0)
-      intelPrintDRIInfo(intelScreen, sPriv, gDRIPriv);
-
-   intelScreen->drmMinor = sPriv->drm_version.minor;
-
-   /* Determine if IRQs are active? */
-   if (!intel_get_param(sPriv, I915_PARAM_IRQ_ACTIVE,
-			&intelScreen->irq_active))
-      return GL_FALSE;
-
-   sPriv->extensions = intelScreenExtensions;
-
-   return GL_TRUE;
-}
-
-
 static void
-intelDestroyScreen(__DRIscreenPrivate * sPriv)
+intelDestroyScreen(__DRIscreen * sPriv)
 {
    intelScreenPrivate *intelScreen = (intelScreenPrivate *) sPriv->private;
 
    dri_bufmgr_destroy(intelScreen->bufmgr);
-   intelUnmapScreenRegions(intelScreen);
    driDestroyOptionInfo(&intelScreen->optionCache);
 
    FREE(intelScreen);
@@ -339,10 +181,12 @@ intelDestroyScreen(__DRIscreenPrivate * sPriv)
  * This is called when we need to set up GL rendering to a new X window.
  */
 static GLboolean
-intelCreateBuffer(__DRIscreenPrivate * driScrnPriv,
-                  __DRIdrawablePrivate * driDrawPriv,
+intelCreateBuffer(__DRIscreen * driScrnPriv,
+                  __DRIdrawable * driDrawPriv,
                   const __GLcontextModes * mesaVis, GLboolean isPixmap)
 {
+   struct intel_renderbuffer *rb;
+
    if (isPixmap) {
       return GL_FALSE;          /* not implemented */
    }
@@ -351,12 +195,12 @@ intelCreateBuffer(__DRIscreenPrivate * driScrnPriv,
                              mesaVis->depthBits != 24);
       gl_format rgbFormat;
 
-      struct intel_framebuffer *intel_fb = CALLOC_STRUCT(intel_framebuffer);
+      struct gl_framebuffer *fb = CALLOC_STRUCT(gl_framebuffer);
 
-      if (!intel_fb)
+      if (!fb)
 	 return GL_FALSE;
 
-      _mesa_initialize_framebuffer(&intel_fb->Base, mesaVis);
+      _mesa_initialize_framebuffer(fb, mesaVis);
 
       if (mesaVis->redBits == 5)
 	 rgbFormat = MESA_FORMAT_RGB565;
@@ -366,16 +210,12 @@ intelCreateBuffer(__DRIscreenPrivate * driScrnPriv,
 	 rgbFormat = MESA_FORMAT_ARGB8888;
 
       /* setup the hardware-based renderbuffers */
-      intel_fb->color_rb[0] = intel_create_renderbuffer(rgbFormat);
-      _mesa_add_renderbuffer(&intel_fb->Base, BUFFER_FRONT_LEFT,
-			     &intel_fb->color_rb[0]->Base);
+      rb = intel_create_renderbuffer(rgbFormat);
+      _mesa_add_renderbuffer(fb, BUFFER_FRONT_LEFT, &rb->Base);
 
       if (mesaVis->doubleBufferMode) {
-	 intel_fb->color_rb[1] = intel_create_renderbuffer(rgbFormat);
-
-         _mesa_add_renderbuffer(&intel_fb->Base, BUFFER_BACK_LEFT,
-				&intel_fb->color_rb[1]->Base);
-
+	 rb = intel_create_renderbuffer(rgbFormat);
+         _mesa_add_renderbuffer(fb, BUFFER_BACK_LEFT, &rb->Base);
       }
 
       if (mesaVis->depthBits == 24) {
@@ -384,115 +224,63 @@ intelCreateBuffer(__DRIscreenPrivate * driScrnPriv,
 	    struct intel_renderbuffer *depthStencilRb
 	       = intel_create_renderbuffer(MESA_FORMAT_S8_Z24);
 	    /* note: bind RB to two attachment points */
-	    _mesa_add_renderbuffer(&intel_fb->Base, BUFFER_DEPTH,
-				   &depthStencilRb->Base);
-	    _mesa_add_renderbuffer(&intel_fb->Base, BUFFER_STENCIL,
-				   &depthStencilRb->Base);
+	    _mesa_add_renderbuffer(fb, BUFFER_DEPTH, &depthStencilRb->Base);
+	    _mesa_add_renderbuffer(fb, BUFFER_STENCIL, &depthStencilRb->Base);
 	 } else {
 	    struct intel_renderbuffer *depthRb
 	       = intel_create_renderbuffer(MESA_FORMAT_X8_Z24);
-	    _mesa_add_renderbuffer(&intel_fb->Base, BUFFER_DEPTH,
-				   &depthRb->Base);
+	    _mesa_add_renderbuffer(fb, BUFFER_DEPTH, &depthRb->Base);
 	 }
       }
       else if (mesaVis->depthBits == 16) {
          /* just 16-bit depth buffer, no hw stencil */
          struct intel_renderbuffer *depthRb
 	    = intel_create_renderbuffer(MESA_FORMAT_Z16);
-         _mesa_add_renderbuffer(&intel_fb->Base, BUFFER_DEPTH, &depthRb->Base);
+         _mesa_add_renderbuffer(fb, BUFFER_DEPTH, &depthRb->Base);
       }
 
       /* now add any/all software-based renderbuffers we may need */
-      _mesa_add_soft_renderbuffers(&intel_fb->Base,
+      _mesa_add_soft_renderbuffers(fb,
                                    GL_FALSE, /* never sw color */
                                    GL_FALSE, /* never sw depth */
                                    swStencil, mesaVis->accumRedBits > 0,
                                    GL_FALSE, /* never sw alpha */
                                    GL_FALSE  /* never sw aux */ );
-      driDrawPriv->driverPrivate = (void *) intel_fb;
+      driDrawPriv->driverPrivate = fb;
 
       return GL_TRUE;
    }
 }
 
 static void
-intelDestroyBuffer(__DRIdrawablePrivate * driDrawPriv)
-{
-   struct intel_framebuffer *intel_fb = driDrawPriv->driverPrivate;
-   struct intel_renderbuffer *depth_rb;
-   struct intel_renderbuffer *stencil_rb;
-
-   if (intel_fb) {
-      if (intel_fb->color_rb[0]) {
-         intel_renderbuffer_set_region(intel_fb->color_rb[0], NULL);
-      }
-
-      if (intel_fb->color_rb[1]) {
-         intel_renderbuffer_set_region(intel_fb->color_rb[1], NULL);
-      }
-
-      depth_rb = intel_get_renderbuffer(&intel_fb->Base, BUFFER_DEPTH);
-      if (depth_rb) {
-         intel_renderbuffer_set_region(depth_rb, NULL);
-      }
-
-      stencil_rb = intel_get_renderbuffer(&intel_fb->Base, BUFFER_STENCIL);
-      if (stencil_rb) {
-         intel_renderbuffer_set_region(stencil_rb, NULL);
-      }
-   }
-
-   _mesa_reference_framebuffer((GLframebuffer **)(&(driDrawPriv->driverPrivate)), NULL);
-}
-
-
-/**
- * Get information about previous buffer swaps.
- */
-static int
-intelGetSwapInfo(__DRIdrawablePrivate * dPriv, __DRIswapInfo * sInfo)
+intelDestroyBuffer(__DRIdrawable * driDrawPriv)
 {
-   struct intel_framebuffer *intel_fb;
-
-   if ((dPriv == NULL) || (dPriv->driverPrivate == NULL)
-       || (sInfo == NULL)) {
-      return -1;
-   }
-
-   intel_fb = dPriv->driverPrivate;
-   sInfo->swap_count = intel_fb->swap_count;
-   sInfo->swap_ust = intel_fb->swap_ust;
-   sInfo->swap_missed_count = intel_fb->swap_missed_count;
-
-   sInfo->swap_missed_usage = (sInfo->swap_missed_count != 0)
-      ? driCalculateSwapUsage(dPriv, 0, intel_fb->swap_missed_ust)
-      : 0.0;
-
-   return 0;
+    struct gl_framebuffer *fb = driDrawPriv->driverPrivate;
+  
+    _mesa_reference_framebuffer(&fb, NULL);
 }
 
-
 /* There are probably better ways to do this, such as an
  * init-designated function to register chipids and createcontext
  * functions.
  */
 extern GLboolean i830CreateContext(const __GLcontextModes * mesaVis,
-                                   __DRIcontextPrivate * driContextPriv,
+                                   __DRIcontext * driContextPriv,
                                    void *sharedContextPrivate);
 
 extern GLboolean i915CreateContext(const __GLcontextModes * mesaVis,
-                                   __DRIcontextPrivate * driContextPriv,
+                                   __DRIcontext * driContextPriv,
                                    void *sharedContextPrivate);
 extern GLboolean brwCreateContext(const __GLcontextModes * mesaVis,
-				  __DRIcontextPrivate * driContextPriv,
+				  __DRIcontext * driContextPriv,
 				  void *sharedContextPrivate);
 
 static GLboolean
 intelCreateContext(const __GLcontextModes * mesaVis,
-                   __DRIcontextPrivate * driContextPriv,
+                   __DRIcontext * driContextPriv,
                    void *sharedContextPrivate)
 {
-   __DRIscreenPrivate *sPriv = driContextPriv->driScreenPriv;
+   __DRIscreen *sPriv = driContextPriv->driScreenPriv;
    intelScreenPrivate *intelScreen = (intelScreenPrivate *) sPriv->private;
 
 #ifdef I915
@@ -513,152 +301,20 @@ intelCreateContext(const __GLcontextModes * mesaVis,
    return GL_FALSE;
 }
 
-
-static __DRIconfig **
-intelFillInModes(__DRIscreenPrivate *psp,
-		 unsigned pixel_bits, unsigned depth_bits,
-                 unsigned stencil_bits, GLboolean have_back_buffer)
-{
-   __DRIconfig **configs;
-   __GLcontextModes *m;
-   unsigned depth_buffer_factor;
-   unsigned back_buffer_factor;
-   int i;
-
-   /* GLX_SWAP_COPY_OML is only supported because the Intel driver doesn't
-    * support pageflipping at all.
-    */
-   static const GLenum back_buffer_modes[] = {
-      GLX_NONE, GLX_SWAP_UNDEFINED_OML, GLX_SWAP_COPY_OML
-   };
-
-   uint8_t depth_bits_array[3];
-   uint8_t stencil_bits_array[3];
-   uint8_t msaa_samples_array[1];
-
-   depth_bits_array[0] = 0;
-   depth_bits_array[1] = depth_bits;
-   depth_bits_array[2] = depth_bits;
-
-   /* Just like with the accumulation buffer, always provide some modes
-    * with a stencil buffer.  It will be a sw fallback, but some apps won't
-    * care about that.
-    */
-   stencil_bits_array[0] = 0;
-   stencil_bits_array[1] = 0;
-   if (depth_bits == 24)
-      stencil_bits_array[1] = (stencil_bits == 0) ? 8 : stencil_bits;
-
-   stencil_bits_array[2] = (stencil_bits == 0) ? 8 : stencil_bits;
-
-   msaa_samples_array[0] = 0;
-
-   depth_buffer_factor = ((depth_bits != 0) || (stencil_bits != 0)) ? 3 : 1;
-   back_buffer_factor = (have_back_buffer) ? 3 : 1;
-
-   if (pixel_bits == 16) {
-      configs = driCreateConfigs(GL_RGB, GL_UNSIGNED_SHORT_5_6_5,
-				 depth_bits_array, stencil_bits_array,
-				 depth_buffer_factor, back_buffer_modes,
-				 back_buffer_factor,
-				 msaa_samples_array, 1);
-   }
-   else {
-      __DRIconfig **configs_a8r8g8b8;
-      __DRIconfig **configs_x8r8g8b8;
-
-      configs_a8r8g8b8 = driCreateConfigs(GL_BGRA, GL_UNSIGNED_INT_8_8_8_8_REV,
-					  depth_bits_array,
-					  stencil_bits_array,
-					  depth_buffer_factor,
-					  back_buffer_modes,
-					  back_buffer_factor,
-					  msaa_samples_array, 1);
-      configs_x8r8g8b8 = driCreateConfigs(GL_BGR, GL_UNSIGNED_INT_8_8_8_8_REV,
-					  depth_bits_array,
-					  stencil_bits_array,
-					  depth_buffer_factor,
-					  back_buffer_modes,
-					  back_buffer_factor,
-					  msaa_samples_array, 1);
-      configs = driConcatConfigs(configs_a8r8g8b8, configs_x8r8g8b8);
-   }
-
-   if (configs == NULL) {
-    fprintf(stderr, "[%s:%u] Error creating FBConfig!\n", __func__,
-              __LINE__);
-      return NULL;
-   }
-
-   /* Mark the visual as slow if there are "fake" stencil bits.
-    */
-   for (i = 0; configs[i]; i++) {
-      m = &configs[i]->modes;
-      if ((m->stencilBits != 0) && (m->stencilBits != stencil_bits)) {
-         m->visualRating = GLX_SLOW_CONFIG;
-      }
-   }
-
-   return configs;
-}
-
 static GLboolean
 intel_init_bufmgr(intelScreenPrivate *intelScreen)
 {
-   GLboolean gem_disable = getenv("INTEL_NO_GEM") != NULL;
-   int gem_kernel = 0;
-   GLboolean gem_supported;
-   struct drm_i915_getparam gp;
-   __DRIscreenPrivate *spriv = intelScreen->driScrnPriv;
+   __DRIscreen *spriv = intelScreen->driScrnPriv;
    int num_fences = 0;
 
    intelScreen->no_hw = getenv("INTEL_NO_HW") != NULL;
 
-   gp.param = I915_PARAM_HAS_GEM;
-   gp.value = &gem_kernel;
-
-   (void) drmCommandWriteRead(spriv->fd, DRM_I915_GETPARAM, &gp, sizeof(gp));
-
-   /* If we've got a new enough DDX that's initializing GEM and giving us
-    * object handles for the shared buffers, use that.
-    */
-   intelScreen->ttm = GL_FALSE;
-   if (intelScreen->driScrnPriv->dri2.enabled)
-       gem_supported = GL_TRUE;
-   else if (intelScreen->driScrnPriv->ddx_version.minor >= 9 &&
-	    gem_kernel &&
-	    intelScreen->front.bo_handle != -1)
-       gem_supported = GL_TRUE;
-   else
-       gem_supported = GL_FALSE;
-
-   if (!gem_disable && gem_supported) {
-      intelScreen->bufmgr = intel_bufmgr_gem_init(spriv->fd, BATCH_SZ);
-      if (intelScreen->bufmgr != NULL)
-	 intelScreen->ttm = GL_TRUE;
-   }
+   intelScreen->bufmgr = intel_bufmgr_gem_init(spriv->fd, BATCH_SZ);
    /* Otherwise, use the classic buffer manager. */
    if (intelScreen->bufmgr == NULL) {
-      if (gem_disable) {
-	 _mesa_warning(NULL, "GEM disabled.  Using classic.");
-      } else {
-	 _mesa_warning(NULL,
-                       "Failed to initialize GEM.  Falling back to classic.");
-      }
-
-      if (intelScreen->tex.size == 0) {
-	 fprintf(stderr, "[%s:%u] Error initializing buffer manager.\n",
-		 __func__, __LINE__);
-	 return GL_FALSE;
-      }
-
-      intelScreen->bufmgr =
-	 intel_bufmgr_fake_init(spriv->fd,
-				intelScreen->tex.offset,
-				intelScreen->tex.map,
-				intelScreen->tex.size,
-				(unsigned int * volatile)
-				&intelScreen->sarea->last_dispatch);
+      fprintf(stderr, "[%s:%u] Error initializing buffer manager.\n",
+	      __func__, __LINE__);
+      return GL_FALSE;
    }
 
    if (intel_get_param(spriv, I915_PARAM_NUM_FENCES_AVAIL, &num_fences))
@@ -671,78 +327,20 @@ intel_init_bufmgr(intelScreenPrivate *intelScreen)
 
 /**
  * This is the driver specific part of the createNewScreen entry point.
- * Called when using legacy DRI.
- * 
- * \todo maybe fold this into intelInitDriver
- *
- * \return the __GLcontextModes supported by this driver
- */
-static const __DRIconfig **intelInitScreen(__DRIscreenPrivate *psp)
-{
-   intelScreenPrivate *intelScreen;
-#ifdef I915
-   static const __DRIversion ddx_expected = { 1, 5, 0 };
-#else
-   static const __DRIversion ddx_expected = { 1, 6, 0 };
-#endif
-   static const __DRIversion dri_expected = { 4, 0, 0 };
-   static const __DRIversion drm_expected = { 1, 5, 0 };
-   I830DRIPtr dri_priv = (I830DRIPtr) psp->pDevPriv;
-
-   if (!driCheckDriDdxDrmVersions2("i915",
-                                   &psp->dri_version, &dri_expected,
-                                   &psp->ddx_version, &ddx_expected,
-                                   &psp->drm_version, &drm_expected)) {
-      return NULL;
-   }
-
-   if (!intelInitDriver(psp))
-       return NULL;
-
-   psp->extensions = intelScreenExtensions;
-
-   intelScreen = psp->private;
-   if (!intel_init_bufmgr(intelScreen))
-       return GL_FALSE;
-
-   return (const __DRIconfig **)
-       intelFillInModes(psp, dri_priv->cpp * 8,
-			(dri_priv->cpp == 2) ? 16 : 24,
-			(dri_priv->cpp == 2) ? 0  : 8, 1);
-}
-
-struct intel_context *intelScreenContext(intelScreenPrivate *intelScreen)
-{
-  /*
-   * This should probably change to have the screen allocate a dummy
-   * context at screen creation. For now just use the current context.
-   */
-
-  GET_CURRENT_CONTEXT(ctx);
-  if (ctx == NULL) {
-     _mesa_problem(NULL, "No current context in intelScreenContext\n");
-     return NULL;
-  }
-  return intel_context(ctx);
-}
-
-/**
- * This is the driver specific part of the createNewScreen entry point.
  * Called when using DRI2.
  *
  * \return the __GLcontextModes supported by this driver
  */
 static const
-__DRIconfig **intelInitScreen2(__DRIscreenPrivate *psp)
+__DRIconfig **intelInitScreen2(__DRIscreen *psp)
 {
    intelScreenPrivate *intelScreen;
    GLenum fb_format[3];
    GLenum fb_type[3];
-   /* GLX_SWAP_COPY_OML is only supported because the Intel driver doesn't
-    * support pageflipping at all.
-    */
+
    static const GLenum back_buffer_modes[] = {
-      GLX_NONE, GLX_SWAP_UNDEFINED_OML, GLX_SWAP_COPY_OML
+       GLX_NONE, GLX_SWAP_UNDEFINED_OML,
+       GLX_SWAP_EXCHANGE_OML, GLX_SWAP_COPY_OML
    };
    uint8_t depth_bits[4], stencil_bits[4], msaa_samples_array[1];
    int color;
@@ -842,19 +440,19 @@ __DRIconfig **intelInitScreen2(__DRIscreenPrivate *psp)
 }
 
 const struct __DriverAPIRec driDriverAPI = {
-   .InitScreen		 = intelInitScreen,
    .DestroyScreen	 = intelDestroyScreen,
    .CreateContext	 = intelCreateContext,
    .DestroyContext	 = intelDestroyContext,
    .CreateBuffer	 = intelCreateBuffer,
    .DestroyBuffer	 = intelDestroyBuffer,
-   .SwapBuffers		 = intelSwapBuffers,
    .MakeCurrent		 = intelMakeCurrent,
    .UnbindContext	 = intelUnbindContext,
-   .GetSwapInfo		 = intelGetSwapInfo,
-   .GetDrawableMSC	 = driDrawableGetMSC32,
-   .WaitForMSC		 = driWaitForMSC32,
-   .CopySubBuffer	 = intelCopySubBuffer,
-
    .InitScreen2		 = intelInitScreen2,
 };
+
+/* This is the table of extensions that the loader will dlsym() for. */
+PUBLIC const __DRIextension *__driDriverExtensions[] = {
+    &driCoreExtension.base,
+    &driDRI2Extension.base,
+    NULL
+};
diff --git a/src/mesa/drivers/dri/intel/intel_screen.h b/src/mesa/drivers/dri/intel/intel_screen.h
index a9b9e109a6..e87e306d86 100644
--- a/src/mesa/drivers/dri/intel/intel_screen.h
+++ b/src/mesa/drivers/dri/intel/intel_screen.h
@@ -66,7 +66,7 @@ typedef struct
 
    int logTextureGranularity;
 
-   __DRIscreenPrivate *driScrnPriv;
+   __DRIscreen *driScrnPriv;
 
    volatile drm_i915_sarea_t *sarea;
 
@@ -77,7 +77,6 @@ typedef struct
    GLboolean no_hw;
 
    GLboolean no_vbo;
-   int ttm;
    dri_bufmgr *bufmgr;
    GLboolean kernel_exec_fencing;
 
@@ -89,18 +88,18 @@ typedef struct
 
 
 
-extern GLboolean intelMapScreenRegions(__DRIscreenPrivate * sPriv);
+extern GLboolean intelMapScreenRegions(__DRIscreen * sPriv);
 
 extern void intelUnmapScreenRegions(intelScreenPrivate * intelScreen);
 
-extern void intelDestroyContext(__DRIcontextPrivate * driContextPriv);
+extern void intelDestroyContext(__DRIcontext * driContextPriv);
 
-extern GLboolean intelUnbindContext(__DRIcontextPrivate * driContextPriv);
+extern GLboolean intelUnbindContext(__DRIcontext * driContextPriv);
 
 extern GLboolean
-intelMakeCurrent(__DRIcontextPrivate * driContextPriv,
-                 __DRIdrawablePrivate * driDrawPriv,
-                 __DRIdrawablePrivate * driReadPriv);
+intelMakeCurrent(__DRIcontext * driContextPriv,
+                 __DRIdrawable * driDrawPriv,
+                 __DRIdrawable * driReadPriv);
 
 extern struct intel_context *intelScreenContext(intelScreenPrivate *intelScreen);
 
diff --git a/src/mesa/drivers/dri/intel/intel_span.c b/src/mesa/drivers/dri/intel/intel_span.c
index bab13e3665..605734d8e5 100644
--- a/src/mesa/drivers/dri/intel/intel_span.c
+++ b/src/mesa/drivers/dri/intel/intel_span.c
@@ -292,7 +292,6 @@ static uint32_t y_tile_swizzle(struct intel_renderbuffer *irb,
 
 #define Y_FLIP(_y) ((_y) * yScale + yBias)
 
-/* XXX with GEM, these need to tell the kernel */
 #define HW_LOCK()
 
 #define HW_UNLOCK()
@@ -335,7 +334,7 @@ static uint32_t y_tile_swizzle(struct intel_renderbuffer *irb,
 #include "intel_spantmp.h"
 
 /* x8r8g8b8 color span and pixel functions */
-#define INTEL_PIXEL_FMT GL_BGRA
+#define INTEL_PIXEL_FMT GL_BGR
 #define INTEL_PIXEL_TYPE GL_UNSIGNED_INT_8_8_8_8_REV
 #define INTEL_READ_VALUE(offset) pread_xrgb8888(irb, offset)
 #define INTEL_WRITE_VALUE(offset, v) pwrite_xrgb8888(irb, offset, v)
@@ -518,7 +517,6 @@ intelSpanRenderStart(GLcontext * ctx)
    GLuint i;
 
    intelFlush(&intel->ctx);
-   LOCK_HARDWARE(intel);
 
    for (i = 0; i < ctx->Const.MaxTextureImageUnits; i++) {
       if (ctx->Texture.Unit[i]._ReallyEnabled) {
@@ -554,8 +552,6 @@ intelSpanRenderFinish(GLcontext * ctx)
    intel_map_unmap_framebuffer(intel, ctx->DrawBuffer, GL_FALSE);
    if (ctx->ReadBuffer != ctx->DrawBuffer)
       intel_map_unmap_framebuffer(intel, ctx->ReadBuffer, GL_FALSE);
-
-   UNLOCK_HARDWARE(intel);
 }
 
 
@@ -614,18 +610,10 @@ intel_set_span_functions(struct intel_context *intel,
 			 struct gl_renderbuffer *rb)
 {
    struct intel_renderbuffer *irb = (struct intel_renderbuffer *) rb;
-   uint32_t tiling;
-
-   /* If in GEM mode, we need to do the tile address swizzling ourselves,
-    * instead of the fence registers handling it.
-    */
-   if (intel->ttm)
-      tiling = irb->region->tiling;
-   else
-      tiling = I915_TILING_NONE;
+   uint32_t tiling = irb->region->tiling;
 
    if (intel->intelScreen->kernel_exec_fencing) {
-      switch (irb->texformat) {
+      switch (irb->Base.Format) {
       case MESA_FORMAT_RGB565:
 	 intel_gttmap_InitPointers_RGB565(rb);
 	 break;
@@ -639,13 +627,7 @@ intel_set_span_functions(struct intel_context *intel,
          intel_gttmap_InitPointers_xRGB8888(rb);
 	 break;
       case MESA_FORMAT_ARGB8888:
-	 if (rb->_BaseFormat == GL_RGB) {
-	    /* XXX remove this code someday when we enable XRGB surfaces */
-	    /* 8888 RGBx */
-	    intel_gttmap_InitPointers_xRGB8888(rb);
-	 } else {
-	    intel_gttmap_InitPointers_ARGB8888(rb);
-	 }
+	 intel_gttmap_InitPointers_ARGB8888(rb);
 	 break;
       case MESA_FORMAT_Z16:
 	 intel_gttmap_InitDepthPointers_z16(rb);
@@ -668,13 +650,16 @@ intel_set_span_functions(struct intel_context *intel,
       default:
 	 _mesa_problem(NULL,
 		       "Unexpected MesaFormat %d in intelSetSpanFunctions",
-		       irb->texformat);
+		       irb->Base.Format);
 	 break;
       }
       return;
    }
 
-   switch (irb->texformat) {
+   /* If in GEM mode, we need to do the tile address swizzling ourselves,
+    * instead of the fence registers handling it.
+    */
+   switch (irb->Base.Format) {
    case MESA_FORMAT_RGB565:
       switch (tiling) {
       case I915_TILING_NONE:
@@ -732,35 +717,18 @@ intel_set_span_functions(struct intel_context *intel,
       }
       break;
    case MESA_FORMAT_ARGB8888:
-      if (rb->_BaseFormat == GL_RGB) {
-         /* XXX remove this code someday when we enable XRGB surfaces */
-	 /* 8888 RGBx */
-	 switch (tiling) {
-	 case I915_TILING_NONE:
-	 default:
-	    intelInitPointers_xRGB8888(rb);
-	    break;
-	 case I915_TILING_X:
-	    intel_XTile_InitPointers_xRGB8888(rb);
-	    break;
-	 case I915_TILING_Y:
-	    intel_YTile_InitPointers_xRGB8888(rb);
-	    break;
-	 }
-      } else {
-	 /* 8888 RGBA */
-	 switch (tiling) {
-	 case I915_TILING_NONE:
-	 default:
-	    intelInitPointers_ARGB8888(rb);
-	    break;
-	 case I915_TILING_X:
-	    intel_XTile_InitPointers_ARGB8888(rb);
-	    break;
-	 case I915_TILING_Y:
-	    intel_YTile_InitPointers_ARGB8888(rb);
-	    break;
-	 }
+      /* 8888 RGBA */
+      switch (tiling) {
+      case I915_TILING_NONE:
+      default:
+	 intelInitPointers_ARGB8888(rb);
+	 break;
+      case I915_TILING_X:
+	 intel_XTile_InitPointers_ARGB8888(rb);
+	 break;
+      case I915_TILING_Y:
+	 intel_YTile_InitPointers_ARGB8888(rb);
+	 break;
       }
       break;
    case MESA_FORMAT_Z16:
diff --git a/src/mesa/drivers/dri/intel/intel_state.c b/src/mesa/drivers/dri/intel/intel_state.c
index 4ee742377d..aefae53eb2 100644
--- a/src/mesa/drivers/dri/intel/intel_state.c
+++ b/src/mesa/drivers/dri/intel/intel_state.c
@@ -196,25 +196,6 @@ intel_translate_logic_op(GLenum opcode)
    }
 }
 
-
-static void
-intelClearColor(GLcontext *ctx, const GLfloat color[4])
-{
-   struct intel_context *intel = intel_context(ctx);
-   GLubyte clear[4];
-
-   CLAMPED_FLOAT_TO_UBYTE(clear[0], color[0]);
-   CLAMPED_FLOAT_TO_UBYTE(clear[1], color[1]);
-   CLAMPED_FLOAT_TO_UBYTE(clear[2], color[2]);
-   CLAMPED_FLOAT_TO_UBYTE(clear[3], color[3]);
-
-   /* compute both 32 and 16-bit clear values */
-   intel->ClearColor8888 = INTEL_PACKCOLOR8888(clear[0], clear[1],
-                                               clear[2], clear[3]);
-   intel->ClearColor565 = INTEL_PACKCOLOR565(clear[0], clear[1], clear[2]);
-}
-
-
 /* Fallback to swrast for select and feedback.
  */
 static void
@@ -229,5 +210,4 @@ void
 intelInitStateFuncs(struct dd_function_table *functions)
 {
    functions->RenderMode = intelRenderMode;
-   functions->ClearColor = intelClearColor;
 }
diff --git a/src/mesa/drivers/dri/intel/intel_swapbuffers.c b/src/mesa/drivers/dri/intel/intel_swapbuffers.c
deleted file mode 100644
index 7d035b9f6e..0000000000
--- a/src/mesa/drivers/dri/intel/intel_swapbuffers.c
+++ /dev/null
@@ -1,248 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-#include "intel_blit.h"
-#include "intel_buffers.h"
-#include "intel_swapbuffers.h"
-#include "intel_fbo.h"
-#include "intel_batchbuffer.h"
-#include "drirenderbuffer.h"
-#include "vblank.h"
-#include "i915_drm.h"
-
-
-
-/*
- * Correct a drawablePrivate's set of vblank flags WRT the current context.
- * When considering multiple crtcs.
- */
-GLuint
-intelFixupVblank(struct intel_context *intel, __DRIdrawablePrivate *dPriv)
-{
-   if (!intel->intelScreen->driScrnPriv->dri2.enabled &&
-       intel->intelScreen->driScrnPriv->ddx_version.minor >= 7) {
-      volatile drm_i915_sarea_t *sarea = intel->sarea;
-      drm_clip_rect_t drw_rect = { .x1 = dPriv->x, .x2 = dPriv->x + dPriv->w,
-				   .y1 = dPriv->y, .y2 = dPriv->y + dPriv->h };
-      drm_clip_rect_t planeA_rect = { .x1 = sarea->planeA_x, .y1 = sarea->planeA_y,
-				     .x2 = sarea->planeA_x + sarea->planeA_w,
-				     .y2 = sarea->planeA_y + sarea->planeA_h };
-      drm_clip_rect_t planeB_rect = { .x1 = sarea->planeB_x, .y1 = sarea->planeB_y,
-				     .x2 = sarea->planeB_x + sarea->planeB_w,
-				     .y2 = sarea->planeB_y + sarea->planeB_h };
-      GLint areaA = driIntersectArea( drw_rect, planeA_rect );
-      GLint areaB = driIntersectArea( drw_rect, planeB_rect );
-      GLuint flags = dPriv->vblFlags;
-
-      /* Update vblank info
-       */
-      if (areaB > areaA || (areaA == areaB && areaB > 0)) {
-	 flags = dPriv->vblFlags | VBLANK_FLAG_SECONDARY;
-      } else {
-	 flags = dPriv->vblFlags & ~VBLANK_FLAG_SECONDARY;
-      }
-
-      /* Do the stupid test: Is one of them actually disabled?
-       */
-      if (sarea->planeA_w == 0 || sarea->planeA_h == 0) {
-	 flags = dPriv->vblFlags | VBLANK_FLAG_SECONDARY;
-      } else if (sarea->planeB_w == 0 || sarea->planeB_h == 0) {
-	 flags = dPriv->vblFlags & ~VBLANK_FLAG_SECONDARY;
-      }
-
-      return flags;
-   } else {
-      return dPriv->vblFlags & ~VBLANK_FLAG_SECONDARY;
-   }
-}
-
-
-/**
- * Called from driSwapBuffers()
- */
-void
-intelSwapBuffers(__DRIdrawablePrivate * dPriv)
-{
-   __DRIscreenPrivate *psp = dPriv->driScreenPriv;
-
-   if (dPriv->driContextPriv && dPriv->driContextPriv->driverPrivate) {
-      GET_CURRENT_CONTEXT(ctx);
-      struct intel_context *intel;
-
-      if (ctx == NULL)
-	 return;
-
-      intel = intel_context(ctx);
-
-      if (ctx->Visual.doubleBufferMode) {
-	 GLboolean missed_target;
-	 struct intel_framebuffer *intel_fb = dPriv->driverPrivate;
-	 int64_t ust;
-         
-	 _mesa_notifySwapBuffers(ctx);  /* flush pending rendering comands */
-
-	/*
-	 * The old swapping ioctl was incredibly racy, just wait for vblank
-	 * and do the swap ourselves.
-	 */
-	 driWaitForVBlank(dPriv, &missed_target);
-
-	 /*
-	  * Update each buffer's vbl_pending so we don't get too out of
-	  * sync
-	  */
-	 intel_get_renderbuffer(&intel_fb->Base,
-		   		BUFFER_BACK_LEFT)->vbl_pending = dPriv->vblSeq;
-         intel_get_renderbuffer(&intel_fb->Base,
-		   		BUFFER_FRONT_LEFT)->vbl_pending = dPriv->vblSeq;
-
-	 intelCopyBuffer(dPriv, NULL);
-
-	 intel_fb->swap_count++;
-	 (*psp->systemTime->getUST) (&ust);
-	 if (missed_target) {
-	    intel_fb->swap_missed_count++;
-	    intel_fb->swap_missed_ust = ust - intel_fb->swap_ust;
-	 }
-
-	 intel_fb->swap_ust = ust;
-      }
-      drmCommandNone(intel->driFd, DRM_I915_GEM_THROTTLE);
-   }
-   else {
-      /* XXX this shouldn't be an error but we can't handle it for now */
-      fprintf(stderr, "%s: drawable has no context!\n", __FUNCTION__);
-   }
-}
-
-
-/**
- * Called from driCopySubBuffer()
- */
-void
-intelCopySubBuffer(__DRIdrawablePrivate * dPriv, int x, int y, int w, int h)
-{
-   if (dPriv->driContextPriv && dPriv->driContextPriv->driverPrivate) {
-      struct intel_context *intel =
-         (struct intel_context *) dPriv->driContextPriv->driverPrivate;
-      GLcontext *ctx = &intel->ctx;
-
-      if (ctx->Visual.doubleBufferMode) {
-         drm_clip_rect_t rect;
-         rect.x1 = x + dPriv->x;
-         rect.y1 = (dPriv->h - y - h) + dPriv->y;
-         rect.x2 = rect.x1 + w;
-         rect.y2 = rect.y1 + h;
-         _mesa_notifySwapBuffers(ctx);  /* flush pending rendering comands */
-         intelCopyBuffer(dPriv, &rect);
-      }
-   }
-   else {
-      /* XXX this shouldn't be an error but we can't handle it for now */
-      fprintf(stderr, "%s: drawable has no context!\n", __FUNCTION__);
-   }
-}
-
-
-/**
- * This will be called whenever the currently bound window is moved/resized.
- * XXX: actually, it seems to NOT be called when the window is only moved (BP).
- */
-void
-intelWindowMoved(struct intel_context *intel)
-{
-   GLcontext *ctx = &intel->ctx;
-   __DRIdrawablePrivate *dPriv = intel->driDrawable;
-   struct intel_framebuffer *intel_fb = dPriv->driverPrivate;
-
-   if (!intel->intelScreen->driScrnPriv->dri2.enabled &&
-       intel->intelScreen->driScrnPriv->ddx_version.minor >= 7) {
-      GLuint flags = intelFixupVblank(intel, dPriv);
-
-      /* Check to see if we changed pipes */
-      if (flags != dPriv->vblFlags && dPriv->vblFlags &&
-	  !(dPriv->vblFlags & VBLANK_FLAG_NO_IRQ)) {
-	 int64_t count;
-	 drmVBlank vbl;
-	 int i;
-
-	 /*
-	  * Deal with page flipping
-	  */
-	 vbl.request.type = DRM_VBLANK_ABSOLUTE;
-
-	 if ( dPriv->vblFlags & VBLANK_FLAG_SECONDARY ) {
-	    vbl.request.type |= DRM_VBLANK_SECONDARY;
-	 }
-
-	 for (i = 0; i < 2; i++) {
-	    if (!intel_fb->color_rb[i] ||
-		(intel_fb->vbl_waited - intel_fb->color_rb[i]->vbl_pending) <=
-		(1<<23))
-	       continue;
-
-	    vbl.request.sequence = intel_fb->color_rb[i]->vbl_pending;
-	    drmWaitVBlank(intel->driFd, &vbl);
-	 }
-
-	 /*
-	  * Update msc_base from old pipe
-	  */
-	 driDrawableGetMSC32(dPriv->driScreenPriv, dPriv, &count);
-	 dPriv->msc_base = count;
-	 /*
-	  * Then get new vblank_base and vblSeq values
-	  */
-	 dPriv->vblFlags = flags;
-	 driGetCurrentVBlank(dPriv);
-	 dPriv->vblank_base = dPriv->vblSeq;
-
-	 intel_fb->vbl_waited = dPriv->vblSeq;
-
-	 for (i = 0; i < 2; i++) {
-	    if (intel_fb->color_rb[i])
-	       intel_fb->color_rb[i]->vbl_pending = intel_fb->vbl_waited;
-	 }
-      }
-   } else {
-      dPriv->vblFlags &= ~VBLANK_FLAG_SECONDARY;
-   }
-
-   /* Update Mesa's notion of window size */
-   driUpdateFramebufferSize(ctx, dPriv);
-   intel_fb->Base.Initialized = GL_TRUE; /* XXX remove someday */
-
-   /* Update hardware scissor */
-   if (ctx->Driver.Scissor != NULL) {
-      ctx->Driver.Scissor(ctx, ctx->Scissor.X, ctx->Scissor.Y,
-			  ctx->Scissor.Width, ctx->Scissor.Height);
-   }
-
-   /* Re-calculate viewport related state */
-   if (ctx->Driver.DepthRange != NULL)
-      ctx->Driver.DepthRange( ctx, ctx->Viewport.Near, ctx->Viewport.Far );
-}
diff --git a/src/mesa/drivers/dri/intel/intel_swapbuffers.h b/src/mesa/drivers/dri/intel/intel_swapbuffers.h
deleted file mode 100644
index 75bb6242ff..0000000000
--- a/src/mesa/drivers/dri/intel/intel_swapbuffers.h
+++ /dev/null
@@ -1,52 +0,0 @@
-
-/**************************************************************************
- * 
- * Copyright 2006 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-#ifndef INTEL_SWAPBUFFERS_H
-#define INTEL_SWAPBUFFERS_H
-
-#include "dri_util.h"
-#include "drm.h"
-
-struct intel_context;
-struct intel_framebuffer;
-
-
-extern void
-intelSwapBuffers(__DRIdrawablePrivate * dPriv);
-
-extern void
-intelCopySubBuffer(__DRIdrawablePrivate * dPriv, int x, int y, int w, int h);
-
-extern GLuint
-intelFixupVblank(struct intel_context *intel, __DRIdrawablePrivate *dPriv);
-
-extern void
-intelWindowMoved(struct intel_context *intel);
-
-
-#endif /* INTEL_SWAPBUFFERS_H */
diff --git a/src/mesa/drivers/dri/intel/intel_tex_copy.c b/src/mesa/drivers/dri/intel/intel_tex_copy.c
index bb21dd5ed9..d8e71093c4 100644
--- a/src/mesa/drivers/dri/intel/intel_tex_copy.c
+++ b/src/mesa/drivers/dri/intel/intel_tex_copy.c
@@ -109,8 +109,7 @@ do_copy_texsubimage(struct intel_context *intel,
       return GL_FALSE;
    }
 
-   intelFlush(ctx);
-   LOCK_HARDWARE(intel);
+   /* intelFlush(ctx); */
    {
       drm_intel_bo *dst_bo = intel_region_buffer(intel,
 						 intelImage->mt->region,
@@ -132,13 +131,12 @@ do_copy_texsubimage(struct intel_context *intel,
 
       /* Can't blit to tiled buffers with non-tile-aligned offset. */
       if (intelImage->mt->region->tiling == I915_TILING_Y) {
-	 UNLOCK_HARDWARE(intel);
 	 return GL_FALSE;
       }
 
       if (ctx->ReadBuffer->Name == 0) {
 	 /* reading from a window, adjust x, y */
-	 const __DRIdrawablePrivate *dPriv = intel->driReadDrawable;
+	 const __DRIdrawable *dPriv = intel->driReadDrawable;
 	 y = dPriv->y + (dPriv->h - (y + height));
 	 x += dPriv->x;
 
@@ -160,22 +158,20 @@ do_copy_texsubimage(struct intel_context *intel,
 			     intelImage->mt->cpp,
 			     src_pitch,
 			     src->buffer,
-			     src->draw_offset,
+			     0,
 			     src->tiling,
 			     intelImage->mt->pitch,
 			     dst_bo,
 			     0,
 			     intelImage->mt->region->tiling,
-			     x, y, image_x + dstx, image_y + dsty,
+			     src->draw_x + x, src->draw_y + y,
+			     image_x + dstx, image_y + dsty,
 			     width, height,
 			     GL_COPY)) {
-	 UNLOCK_HARDWARE(intel);
 	 return GL_FALSE;
       }
    }
 
-   UNLOCK_HARDWARE(intel);
-
    return GL_TRUE;
 }
 
@@ -221,6 +217,8 @@ intelCopyTexImage1D(GLcontext * ctx, GLenum target, GLint level,
    return;
 
  fail:
+   if (INTEL_DEBUG & DEBUG_FALLBACKS)
+      fprintf(stderr, "%s - fallback to swrast\n", __FUNCTION__);
    _mesa_meta_CopyTexImage1D(ctx, target, level, internalFormat, x, y,
                              width, border);
 }
@@ -268,6 +266,8 @@ intelCopyTexImage2D(GLcontext * ctx, GLenum target, GLint level,
    return;
 
  fail:
+   if (INTEL_DEBUG & DEBUG_FALLBACKS)
+      fprintf(stderr, "%s - fallback to swrast\n", __FUNCTION__);
    _mesa_meta_CopyTexImage2D(ctx, target, level, internalFormat, x, y,
                              width, height, border);
 }
@@ -292,6 +292,8 @@ intelCopyTexSubImage1D(GLcontext * ctx, GLenum target, GLint level,
    if (!do_copy_texsubimage(intel_context(ctx), target,
                             intel_texture_image(texImage),
                             internalFormat, xoffset, 0, x, y, width, 1)) {
+      if (INTEL_DEBUG & DEBUG_FALLBACKS)
+         fprintf(stderr, "%s - fallback to swrast\n", __FUNCTION__);
       _mesa_meta_CopyTexSubImage1D(ctx, target, level, xoffset, x, y, width);
    }
 }
@@ -317,8 +319,8 @@ intelCopyTexSubImage2D(GLcontext * ctx, GLenum target, GLint level,
                             internalFormat,
                             xoffset, yoffset, x, y, width, height)) {
 
-      DBG("%s - fallback to _mesa_meta_CopyTexSubImage2D\n", __FUNCTION__);
-
+      if (INTEL_DEBUG & DEBUG_FALLBACKS)
+         fprintf(stderr, "%s - fallback to swrast\n", __FUNCTION__);
       _mesa_meta_CopyTexSubImage2D(ctx, target, level,
                                    xoffset, yoffset, x, y, width, height);
    }
diff --git a/src/mesa/drivers/dri/intel/intel_tex_format.c b/src/mesa/drivers/dri/intel/intel_tex_format.c
index f37a545c7f..a7c6c45ffe 100644
--- a/src/mesa/drivers/dri/intel/intel_tex_format.c
+++ b/src/mesa/drivers/dri/intel/intel_tex_format.c
@@ -50,8 +50,7 @@ intelChooseTextureFormat(GLcontext * ctx, GLint internalFormat,
       if (format == GL_RGB && type == GL_UNSIGNED_SHORT_5_6_5) {
          return MESA_FORMAT_RGB565;
       }
-      /* XXX use MESA_FORMAT_XRGB8888 someday */
-      return do32bpt ? MESA_FORMAT_ARGB8888 : MESA_FORMAT_RGB565;
+      return do32bpt ? MESA_FORMAT_XRGB8888 : MESA_FORMAT_RGB565;
 
    case GL_RGBA8:
    case GL_RGB10_A2:
@@ -70,8 +69,7 @@ intelChooseTextureFormat(GLcontext * ctx, GLint internalFormat,
    case GL_RGB10:
    case GL_RGB12:
    case GL_RGB16:
-      /* XXX use MESA_FORMAT_XRGB8888 someday */
-      return MESA_FORMAT_ARGB8888;
+      return MESA_FORMAT_XRGB8888;
 
    case GL_RGB5:
    case GL_RGB4:
@@ -95,14 +93,20 @@ intelChooseTextureFormat(GLcontext * ctx, GLint internalFormat,
    case GL_COMPRESSED_LUMINANCE:
       return MESA_FORMAT_L8;
 
+   case GL_LUMINANCE12_ALPHA4:
+   case GL_LUMINANCE12_ALPHA12:
+   case GL_LUMINANCE16_ALPHA16:
+#ifndef I915
+      return MESA_FORMAT_AL1616;
+#else
+      /* FALLTHROUGH */
+#endif
+
    case 2:
    case GL_LUMINANCE_ALPHA:
    case GL_LUMINANCE4_ALPHA4:
    case GL_LUMINANCE6_ALPHA2:
    case GL_LUMINANCE8_ALPHA8:
-   case GL_LUMINANCE12_ALPHA4:
-   case GL_LUMINANCE12_ALPHA12:
-   case GL_LUMINANCE16_ALPHA16:
    case GL_COMPRESSED_LUMINANCE_ALPHA:
       return MESA_FORMAT_AL88;
 
@@ -169,13 +173,13 @@ intelChooseTextureFormat(GLcontext * ctx, GLint internalFormat,
       return MESA_FORMAT_SARGB8;
    case GL_SLUMINANCE_EXT:
    case GL_SLUMINANCE8_EXT:
-      if (IS_G4X(intel->intelScreen->deviceID))
+      if (intel->has_luminance_srgb)
          return MESA_FORMAT_SL8;
       else
          return MESA_FORMAT_SARGB8;
    case GL_SLUMINANCE_ALPHA_EXT:
    case GL_SLUMINANCE8_ALPHA8_EXT:
-      if (IS_G4X(intel->intelScreen->deviceID))
+      if (intel->has_luminance_srgb)
          return MESA_FORMAT_SLA8;
       else
          return MESA_FORMAT_SARGB8;
diff --git a/src/mesa/drivers/dri/intel/intel_tex_image.c b/src/mesa/drivers/dri/intel/intel_tex_image.c
index 3412e761ca..6f41eafd0e 100644
--- a/src/mesa/drivers/dri/intel/intel_tex_image.c
+++ b/src/mesa/drivers/dri/intel/intel_tex_image.c
@@ -115,7 +115,8 @@ guess_and_alloc_mipmap_tree(struct intel_context *intel,
     */
    if ((intelObj->base.MinFilter == GL_NEAREST ||
         intelObj->base.MinFilter == GL_LINEAR) &&
-       intelImage->level == firstLevel) {
+       intelImage->level == firstLevel &&
+       (intel->gen < 4 || firstLevel == 0)) {
       lastLevel = firstLevel;
    }
    else {
@@ -234,7 +235,6 @@ try_pbo_upload(struct intel_context *intel,
 
    if (drm_intel_bo_references(intel->batch->buf, dst_buffer))
       intelFlush(&intel->ctx);
-   LOCK_HARDWARE(intel);
    {
       dri_bo *src_buffer = intel_bufferobj_buffer(intel, pbo, INTEL_READ);
 
@@ -244,11 +244,9 @@ try_pbo_upload(struct intel_context *intel,
 			     dst_stride, dst_buffer, 0, GL_FALSE,
 			     0, 0, dst_x, dst_y, width, height,
 			     GL_COPY)) {
-	 UNLOCK_HARDWARE(intel);
 	 return GL_FALSE;
       }
    }
-   UNLOCK_HARDWARE(intel);
 
    return GL_TRUE;
 }
@@ -368,8 +366,7 @@ intelTexImage(GLcontext * ctx,
        intelObj->mt->first_level == level &&
        intelObj->mt->last_level == level &&
        intelObj->mt->target != GL_TEXTURE_CUBE_MAP_ARB &&
-       !intel_miptree_match_image(intelObj->mt, &intelImage->base,
-                                  intelImage->face, intelImage->level)) {
+       !intel_miptree_match_image(intelObj->mt, &intelImage->base)) {
 
       DBG("release it\n");
       intel_miptree_release(intel, &intelObj->mt);
@@ -386,8 +383,7 @@ intelTexImage(GLcontext * ctx,
    assert(!intelImage->mt);
 
    if (intelObj->mt &&
-       intel_miptree_match_image(intelObj->mt, &intelImage->base,
-                                 intelImage->face, intelImage->level)) {
+       intel_miptree_match_image(intelObj->mt, &intelImage->base)) {
 
       intel_miptree_reference(&intelImage->mt, intelObj->mt);
       assert(intelImage->mt);
@@ -470,8 +466,6 @@ intelTexImage(GLcontext * ctx,
 					   pixels, unpack, "glTexImage");
    }
 
-   LOCK_HARDWARE(intel);
-
    if (intelImage->mt) {
       if (pixels != NULL) {
 	 /* Flush any queued rendering with the texture before mapping. */
@@ -552,8 +546,6 @@ intelTexImage(GLcontext * ctx,
          intel_miptree_image_unmap(intel, intelImage->mt);
       texImage->Data = NULL;
    }
-
-   UNLOCK_HARDWARE(intel);
 }
 
 
@@ -733,27 +725,27 @@ intelSetTexBuffer2(__DRIcontext *pDRICtx, GLint target,
 		   GLint glx_texture_format,
 		   __DRIdrawable *dPriv)
 {
-   struct intel_framebuffer *intel_fb = dPriv->driverPrivate;
+   struct gl_framebuffer *fb = dPriv->driverPrivate;
    struct intel_context *intel = pDRICtx->driverPrivate;
+   GLcontext *ctx = &intel->ctx;
    struct intel_texture_object *intelObj;
    struct intel_texture_image *intelImage;
    struct intel_mipmap_tree *mt;
    struct intel_renderbuffer *rb;
-   struct gl_texture_unit *texUnit;
    struct gl_texture_object *texObj;
    struct gl_texture_image *texImage;
    int level = 0, internalFormat;
 
-   texUnit = &intel->ctx.Texture.Unit[intel->ctx.Texture.CurrentUnit];
-   texObj = _mesa_select_tex_object(&intel->ctx, texUnit, target);
+   texObj = _mesa_get_current_tex_object(ctx, target);
    intelObj = intel_texture_object(texObj);
 
    if (!intelObj)
       return;
 
-   intel_update_renderbuffers(pDRICtx, dPriv);
+   if (!dPriv->validBuffers)
+      intel_update_renderbuffers(pDRICtx, dPriv);
 
-   rb = intel_fb->color_rb[0];
+   rb = intel_get_renderbuffer(fb, BUFFER_FRONT_LEFT);
    /* If the region isn't set, then intel_update_renderbuffers was unable
     * to get the buffers for the drawable.
     */
@@ -797,8 +789,7 @@ intelSetTexBuffer2(__DRIcontext *pDRICtx, GLint target,
    texImage->RowStride = rb->region->pitch;
    intel_miptree_reference(&intelImage->mt, intelObj->mt);
 
-   if (!intel_miptree_match_image(intelObj->mt, &intelImage->base,
-				  intelImage->face, intelImage->level)) {
+   if (!intel_miptree_match_image(intelObj->mt, &intelImage->base)) {
 	   fprintf(stderr, "miptree doesn't match image\n");
    }
 
diff --git a/src/mesa/drivers/dri/intel/intel_tex_obj.h b/src/mesa/drivers/dri/intel/intel_tex_obj.h
index 5a93461525..3ad10d3d23 100644
--- a/src/mesa/drivers/dri/intel/intel_tex_obj.h
+++ b/src/mesa/drivers/dri/intel/intel_tex_obj.h
@@ -66,6 +66,7 @@ struct intel_texture_image
     * Else there is no image data.
     */
    struct intel_mipmap_tree *mt;
+   GLboolean used_as_render_target;
 };
 
 static INLINE struct intel_texture_object *
diff --git a/src/mesa/drivers/dri/intel/intel_tex_subimage.c b/src/mesa/drivers/dri/intel/intel_tex_subimage.c
index 1f68208266..7f1dc89022 100644
--- a/src/mesa/drivers/dri/intel/intel_tex_subimage.c
+++ b/src/mesa/drivers/dri/intel/intel_tex_subimage.c
@@ -72,8 +72,6 @@ intelTexSubimage(GLcontext * ctx,
    if (!pixels)
       return;
 
-   LOCK_HARDWARE(intel);
-
    /* Map buffer if necessary.  Need to lock to prevent other contexts
     * from uploading the buffer under us.
     */
@@ -129,8 +127,6 @@ intelTexSubimage(GLcontext * ctx,
       intel_miptree_image_unmap(intel, intelImage->mt);
       texImage->Data = NULL;
    }
-
-   UNLOCK_HARDWARE(intel);
 }
 
 
diff --git a/src/mesa/drivers/dri/intel/intel_tex_validate.c b/src/mesa/drivers/dri/intel/intel_tex_validate.c
index 504993989a..c9a24ac398 100644
--- a/src/mesa/drivers/dri/intel/intel_tex_validate.c
+++ b/src/mesa/drivers/dri/intel/intel_tex_validate.c
@@ -42,7 +42,7 @@ intel_calculate_first_last_level(struct intel_context *intel,
          firstLevel = lastLevel = tObj->BaseLevel;
       }
       else {
-	 if (!IS_9XX(intel->intelScreen->deviceID)) {
+	 if (intel->gen == 2) {
 	    firstLevel = tObj->BaseLevel + (GLint) (tObj->MinLod + 0.5);
 	    firstLevel = MAX2(firstLevel, tObj->BaseLevel);
 	    firstLevel = MIN2(firstLevel, tObj->BaseLevel + baseImage->MaxLog2);
@@ -138,8 +138,7 @@ intel_finalize_mipmap_tree(struct intel_context *intel, GLuint unit)
    /* What levels must the tree include at a minimum?
     */
    intel_calculate_first_last_level(intel, intelObj);
-   firstImage =
-      intel_texture_image(intelObj->base.Image[0][intelObj->firstLevel]);
+   firstImage = intel_texture_image(tObj->Image[0][intelObj->firstLevel]);
 
    /* Fallback case:
     */
@@ -223,8 +222,13 @@ intel_finalize_mipmap_tree(struct intel_context *intel, GLuint unit)
             intel_texture_image(intelObj->base.Image[face][i]);
 
          /* Need to import images in main memory or held in other trees.
+	  * If the image is a render target, its data doesn't need to be in
+	  * the object tree (otherwise we'd be FBO incomplete), and we must
+	  * keep tracking the image's own miptree as still needing to be
+	  * pulled in, or we'll lose the rendering that's been done to it.
           */
-         if (intelObj->mt != intelImage->mt) {
+         if (intelObj->mt != intelImage->mt &&
+	     !intelImage->used_as_render_target) {
             copy_image_data_to_tree(intel, intelObj, intelImage);
          }
       }
diff --git a/src/mesa/drivers/dri/mach64/mach64_context.c b/src/mesa/drivers/dri/mach64/mach64_context.c
index 2bca293b3c..3b4ef7ffd8 100644
--- a/src/mesa/drivers/dri/mach64/mach64_context.c
+++ b/src/mesa/drivers/dri/mach64/mach64_context.c
@@ -89,11 +89,11 @@ static const struct dri_extension card_extensions[] =
 /* Create the device specific context.
   */
 GLboolean mach64CreateContext( const __GLcontextModes *glVisual,
-			       __DRIcontextPrivate *driContextPriv,
+			       __DRIcontext *driContextPriv,
                                void *sharedContextPrivate )
 {
    GLcontext *ctx, *shareCtx;
-   __DRIscreenPrivate *driScreen = driContextPriv->driScreenPriv;
+   __DRIscreen *driScreen = driContextPriv->driScreenPriv;
    struct dd_function_table functions;
    mach64ContextPtr mmesa;
    mach64ScreenPtr mach64Screen;
@@ -260,7 +260,7 @@ GLboolean mach64CreateContext( const __GLcontextModes *glVisual,
 
 /* Destroy the device specific context.
  */
-void mach64DestroyContext( __DRIcontextPrivate *driContextPriv  )
+void mach64DestroyContext( __DRIcontext *driContextPriv  )
 {
    mach64ContextPtr mmesa = (mach64ContextPtr) driContextPriv->driverPrivate;
 
@@ -307,9 +307,9 @@ void mach64DestroyContext( __DRIcontextPrivate *driContextPriv  )
  * buffer `b'.
  */
 GLboolean
-mach64MakeCurrent( __DRIcontextPrivate *driContextPriv,
-                 __DRIdrawablePrivate *driDrawPriv,
-                 __DRIdrawablePrivate *driReadPriv )
+mach64MakeCurrent( __DRIcontext *driContextPriv,
+                 __DRIdrawable *driDrawPriv,
+                 __DRIdrawable *driReadPriv )
 {
    if ( driContextPriv ) {
       GET_CURRENT_CONTEXT(ctx);
@@ -352,7 +352,7 @@ mach64MakeCurrent( __DRIcontextPrivate *driContextPriv,
 /* Force the context `c' to be unbound from its buffer.
  */
 GLboolean
-mach64UnbindContext( __DRIcontextPrivate *driContextPriv )
+mach64UnbindContext( __DRIcontext *driContextPriv )
 {
    return GL_TRUE;
 }
diff --git a/src/mesa/drivers/dri/mach64/mach64_context.h b/src/mesa/drivers/dri/mach64/mach64_context.h
index 854751626d..18fc859d01 100644
--- a/src/mesa/drivers/dri/mach64/mach64_context.h
+++ b/src/mesa/drivers/dri/mach64/mach64_context.h
@@ -232,9 +232,9 @@ struct mach64_context {
 
    /* Mirrors of some DRI state
     */
-   __DRIcontextPrivate	*driContext;	/* DRI context */
-   __DRIscreenPrivate	*driScreen;	/* DRI screen */
-   __DRIdrawablePrivate	*driDrawable;	/* DRI drawable bound to this ctx */
+   __DRIcontext	*driContext;	/* DRI context */
+   __DRIscreen	*driScreen;	/* DRI screen */
+   __DRIdrawable	*driDrawable;	/* DRI drawable bound to this ctx */
 
    unsigned int lastStamp;		/* mirror driDrawable->lastStamp */
 
@@ -274,16 +274,16 @@ struct mach64_context {
 
 
 extern GLboolean mach64CreateContext( const __GLcontextModes *glVisual,
-				      __DRIcontextPrivate *driContextPriv,
+				      __DRIcontext *driContextPriv,
                                       void *sharedContextPrivate );
 
-extern void mach64DestroyContext( __DRIcontextPrivate * );
+extern void mach64DestroyContext( __DRIcontext * );
 
-extern GLboolean mach64MakeCurrent( __DRIcontextPrivate *driContextPriv,
-                                    __DRIdrawablePrivate *driDrawPriv,
-                                    __DRIdrawablePrivate *driReadPriv );
+extern GLboolean mach64MakeCurrent( __DRIcontext *driContextPriv,
+                                    __DRIdrawable *driDrawPriv,
+                                    __DRIdrawable *driReadPriv );
 
-extern GLboolean mach64UnbindContext( __DRIcontextPrivate *driContextPriv );
+extern GLboolean mach64UnbindContext( __DRIcontext *driContextPriv );
 
 /* ================================================================
  * Byte ordering
diff --git a/src/mesa/drivers/dri/mach64/mach64_ioctl.c b/src/mesa/drivers/dri/mach64/mach64_ioctl.c
index ef5c0625c3..03587c44fd 100644
--- a/src/mesa/drivers/dri/mach64/mach64_ioctl.c
+++ b/src/mesa/drivers/dri/mach64/mach64_ioctl.c
@@ -279,7 +279,7 @@ static int mach64WaitForFrameCompletion( mach64ContextPtr mmesa )
 
 /* Copy the back color buffer to the front color buffer.
  */
-void mach64CopyBuffer( __DRIdrawablePrivate *dPriv )
+void mach64CopyBuffer( __DRIdrawable *dPriv )
 {
    mach64ContextPtr mmesa;
    GLint nbox, i, ret;
@@ -668,7 +668,7 @@ void mach64PerformanceBoxesLocked( mach64ContextPtr mmesa )
 static void mach64DDClear( GLcontext *ctx, GLbitfield mask )
 {
    mach64ContextPtr mmesa = MACH64_CONTEXT( ctx );
-   __DRIdrawablePrivate *dPriv = mmesa->driDrawable;
+   __DRIdrawable *dPriv = mmesa->driDrawable;
    drm_mach64_clear_t clear;
    GLuint flags = 0;
    GLint i;
diff --git a/src/mesa/drivers/dri/mach64/mach64_ioctl.h b/src/mesa/drivers/dri/mach64/mach64_ioctl.h
index 6ef9bc0bca..1ffda1932f 100644
--- a/src/mesa/drivers/dri/mach64/mach64_ioctl.h
+++ b/src/mesa/drivers/dri/mach64/mach64_ioctl.h
@@ -78,7 +78,7 @@ extern void mach64FireBlitLocked( mach64ContextPtr mmesa, void *buffer,
 				  GLint offset, GLint pitch, GLint format,
 				  GLint x, GLint y, GLint width, GLint height );
 
-extern void mach64CopyBuffer( __DRIdrawablePrivate *dPriv );
+extern void mach64CopyBuffer( __DRIdrawable *dPriv );
 #if ENABLE_PERF_BOXES
 extern void mach64PerformanceCounters( mach64ContextPtr mmesa );
 extern void mach64PerformanceBoxesLocked( mach64ContextPtr mmesa );
diff --git a/src/mesa/drivers/dri/mach64/mach64_lock.c b/src/mesa/drivers/dri/mach64/mach64_lock.c
index d018ba4174..8653c77da5 100644
--- a/src/mesa/drivers/dri/mach64/mach64_lock.c
+++ b/src/mesa/drivers/dri/mach64/mach64_lock.c
@@ -51,8 +51,8 @@ int   prevLockLine = 0;
  */
 void mach64GetLock( mach64ContextPtr mmesa, GLuint flags )
 {
-   __DRIdrawablePrivate *dPriv = mmesa->driDrawable;
-   __DRIscreenPrivate *sPriv = mmesa->driScreen;
+   __DRIdrawable *dPriv = mmesa->driDrawable;
+   __DRIscreen *sPriv = mmesa->driScreen;
    drm_mach64_sarea_t *sarea = mmesa->sarea;
    int i;
 
diff --git a/src/mesa/drivers/dri/mach64/mach64_native_vb.c b/src/mesa/drivers/dri/mach64/mach64_native_vb.c
index 99f1a14e17..816682ec5f 100644
--- a/src/mesa/drivers/dri/mach64/mach64_native_vb.c
+++ b/src/mesa/drivers/dri/mach64/mach64_native_vb.c
@@ -207,19 +207,19 @@ INTERP_QUALIFIER void TAG(interp_extras)( GLcontext *ctx,
    LOCALVARS
    struct vertex_buffer *VB = &TNL_CONTEXT(ctx)->vb;
 
-   if (VB->ColorPtr[1]) {
-      assert(VB->ColorPtr[1]->stride == 4 * sizeof(GLfloat));
+   if (VB->BackfaceColorPtr) {
+      assert(VB->BackfaceColorPtr->stride == 4 * sizeof(GLfloat));
       
       INTERP_4F( t,
-		    GET_COLOR(VB->ColorPtr[1], dst),
-		    GET_COLOR(VB->ColorPtr[1], out),
-		    GET_COLOR(VB->ColorPtr[1], in) );
+		 GET_COLOR(VB->BackfaceColorPtr, dst),
+		 GET_COLOR(VB->BackfaceColorPtr, out),
+		 GET_COLOR(VB->BackfaceColorPtr, in) );
 
-      if (VB->SecondaryColorPtr[1]) {
+      if (VB->BackfaceSecondaryColorPtr) {
 	 INTERP_3F( t,
-		       GET_COLOR(VB->SecondaryColorPtr[1], dst),
-		       GET_COLOR(VB->SecondaryColorPtr[1], out),
-		       GET_COLOR(VB->SecondaryColorPtr[1], in) );
+		    GET_COLOR(VB->BackfaceSecondaryColorPtr, dst),
+		    GET_COLOR(VB->BackfaceSecondaryColorPtr, out),
+		    GET_COLOR(VB->BackfaceSecondaryColorPtr, in) );
       }
    }
 
@@ -236,13 +236,13 @@ INTERP_QUALIFIER void TAG(copy_pv_extras)( GLcontext *ctx,
    LOCALVARS
       struct vertex_buffer *VB = &TNL_CONTEXT(ctx)->vb;
 
-   if (VB->ColorPtr[1]) {
-      COPY_4FV( GET_COLOR(VB->ColorPtr[1], dst), 
-		GET_COLOR(VB->ColorPtr[1], src) );
+   if (VB->BackfaceColorPtr) {
+      COPY_4FV( GET_COLOR(VB->BackfaceColorPtr, dst),
+		GET_COLOR(VB->BackfaceColorPtr, src) );
 
-      if (VB->SecondaryColorPtr[1]) {
-	 COPY_4FV( GET_COLOR(VB->SecondaryColorPtr[1], dst), 
-		   GET_COLOR(VB->SecondaryColorPtr[1], src) );
+      if (VB->BackfaceSecondaryColorPtr) {
+	 COPY_4FV( GET_COLOR(VB->BackfaceSecondaryColorPtr, dst),
+		   GET_COLOR(VB->BackfaceSecondaryColorPtr, src) );
       }
    }
 
diff --git a/src/mesa/drivers/dri/mach64/mach64_native_vbtmp.h b/src/mesa/drivers/dri/mach64/mach64_native_vbtmp.h
index 684f2acc89..6e5fa3520e 100644
--- a/src/mesa/drivers/dri/mach64/mach64_native_vbtmp.h
+++ b/src/mesa/drivers/dri/mach64/mach64_native_vbtmp.h
@@ -103,10 +103,10 @@ static void TAG(emit)( GLcontext *ctx,
 #if DO_TEX1
    {
       const GLuint t1 = GET_TEXSOURCE(1);
-      tc1 = VB->TexCoordPtr[t1]->data;
-      tc1_stride = VB->TexCoordPtr[t1]->stride;
+      tc1 = VB->AttribPtr[_TNL_ATTRIB_TEX0 + t1]->data;
+      tc1_stride = VB->AttribPtr[_TNL_ATTRIB_TEX0 + t1]->stride;
 #if DO_PTEX
-      tc1_size = VB->TexCoordPtr[t1]->size;
+      tc1_size = VB->AttribPtr[_TNL_ATTRIB_TEX0 + t1]->size;
 #endif
    }
 #endif
@@ -114,18 +114,18 @@ static void TAG(emit)( GLcontext *ctx,
 #if DO_TEX0
    {
       const GLuint t0 = GET_TEXSOURCE(0);
-      tc0 = VB->TexCoordPtr[t0]->data;
-      tc0_stride = VB->TexCoordPtr[t0]->stride;
+      tc0 = VB->AttribPtr[_TNL_ATTRIB_TEX0 + t0]->data;
+      tc0_stride = VB->AttribPtr[_TNL_ATTRIB_TEX0 + t0]->stride;
 #if DO_PTEX
-      tc0_size = VB->TexCoordPtr[t0]->size;
+      tc0_size = VB->AttribPtr[_TNL_ATTRIB_TEX0 + t0]->size;
 #endif
    }
 #endif
 
 #if DO_SPEC
-   if (VB->SecondaryColorPtr[0]) {
-      spec = VB->SecondaryColorPtr[0]->data;
-      spec_stride = VB->SecondaryColorPtr[0]->stride;
+   if (VB->AttribPtr[_TNL_ATTRIB_COLOR1]) {
+      spec = VB->AttribPtr[_TNL_ATTRIB_COLOR1]->data;
+      spec_stride = VB->AttribPtr[_TNL_ATTRIB_COLOR1]->stride;
    } else {
       spec = (GLfloat (*)[4])ctx->Current.Attrib[VERT_ATTRIB_COLOR1];
       spec_stride = 0;
@@ -133,9 +133,9 @@ static void TAG(emit)( GLcontext *ctx,
 #endif
 
 #if DO_FOG
-   if (VB->FogCoordPtr) {
-      fog = VB->FogCoordPtr->data;
-      fog_stride = VB->FogCoordPtr->stride;
+   if (VB->AttribPtr[_TNL_ATTRIB_FOG]) {
+      fog = VB->AttribPtr[_TNL_ATTRIB_FOG]->data;
+      fog_stride = VB->AttribPtr[_TNL_ATTRIB_FOG]->stride;
    } else {
       static GLfloat tmp[4] = {0, 0, 0, 0};
       fog = &tmp;
@@ -144,8 +144,8 @@ static void TAG(emit)( GLcontext *ctx,
 #endif
 
 #if DO_RGBA
-   col = VB->ColorPtr[0]->data;
-   col_stride = VB->ColorPtr[0]->stride;
+   col = VB->AttribPtr[_TNL_ATTRIB_COLOR0]->data;
+   col_stride = VB->AttribPtr[_TNL_ATTRIB_COLOR0]->stride;
 #endif
 
    coord = VB->NdcPtr->data;
@@ -319,8 +319,8 @@ static GLboolean TAG(check_tex_sizes)( GLcontext *ctx )
 
    /* Force 'missing' texcoords to something valid.
     */
-   if (DO_TEX1 && VB->TexCoordPtr[0] == 0)
-      VB->TexCoordPtr[0] = VB->TexCoordPtr[1];
+   if (DO_TEX1 && VB->AttribPtr[_TNL_ATTRIB_TEX0] == 0)
+      VB->AttribPtr[_TNL_ATTRIB_TEX0] = VB->AttribPtr[_TNL_ATTRIB_TEX1];
 
    if (DO_PTEX)
       return GL_TRUE;
@@ -328,12 +328,12 @@ static GLboolean TAG(check_tex_sizes)( GLcontext *ctx )
    /* No hardware support for projective texture.  Can fake it for
     * TEX0 only.
     */
-   if ((DO_TEX1 && VB->TexCoordPtr[GET_TEXSOURCE(1)]->size == 4)) {
+   if ((DO_TEX1 && VB->AttribPtr[_TNL_ATTRIB_TEX0 + GET_TEXSOURCE(1)]->size == 4)) {
       PTEX_FALLBACK();
       return GL_FALSE;
    }
 
-   if (DO_TEX0 && VB->TexCoordPtr[GET_TEXSOURCE(0)]->size == 4) {
+   if (DO_TEX0 && VB->AttribPtr[_TNL_ATTRIB_TEX0 + GET_TEXSOURCE(0)]->size == 4) {
       if (DO_TEX1) {
 	 PTEX_FALLBACK();
       }
diff --git a/src/mesa/drivers/dri/mach64/mach64_screen.c b/src/mesa/drivers/dri/mach64/mach64_screen.c
index 3b19cf5333..1ed3b0b70e 100644
--- a/src/mesa/drivers/dri/mach64/mach64_screen.c
+++ b/src/mesa/drivers/dri/mach64/mach64_screen.c
@@ -68,7 +68,7 @@ static const GLuint __driNConfigOptions = 2;
 #endif
 
 static const __DRIconfig **
-mach64FillInModes( __DRIscreenPrivate *psp,
+mach64FillInModes( __DRIscreen *psp,
 		   unsigned pixel_bits, unsigned depth_bits,
 		   unsigned stencil_bits, GLboolean have_back_buffer )
 {
@@ -144,7 +144,7 @@ mach64FillInModes( __DRIscreenPrivate *psp,
 /* Create the device specific screen private data struct.
  */
 static mach64ScreenRec *
-mach64CreateScreen( __DRIscreenPrivate *sPriv )
+mach64CreateScreen( __DRIscreen *sPriv )
 {
    mach64ScreenPtr mach64Screen;
    ATIDRIPtr serverInfo = (ATIDRIPtr)sPriv->pDevPriv;
@@ -272,7 +272,7 @@ mach64CreateScreen( __DRIscreenPrivate *sPriv )
 /* Destroy the device specific screen private data struct.
  */
 static void
-mach64DestroyScreen( __DRIscreenPrivate *driScreen )
+mach64DestroyScreen( __DRIscreen *driScreen )
 {
    mach64ScreenRec *mach64Screen = (mach64ScreenRec *) driScreen->private;
 
@@ -299,8 +299,8 @@ mach64DestroyScreen( __DRIscreenPrivate *driScreen )
  * data.
  */
 static GLboolean
-mach64CreateBuffer( __DRIscreenPrivate *driScrnPriv,
-		    __DRIdrawablePrivate *driDrawPriv,
+mach64CreateBuffer( __DRIscreen *driScrnPriv,
+		    __DRIdrawable *driDrawPriv,
 		    const __GLcontextModes *mesaVis,
 		    GLboolean isPixmap )
 {
@@ -370,7 +370,7 @@ mach64CreateBuffer( __DRIscreenPrivate *driScrnPriv,
 
 
 static void
-mach64DestroyBuffer(__DRIdrawablePrivate *driDrawPriv)
+mach64DestroyBuffer(__DRIdrawable *driDrawPriv)
 {
    _mesa_reference_framebuffer((GLframebuffer **)(&(driDrawPriv->driverPrivate)), NULL);
 }
@@ -378,7 +378,7 @@ mach64DestroyBuffer(__DRIdrawablePrivate *driDrawPriv)
 
 /* Copy the back color buffer to the front color buffer */
 static void
-mach64SwapBuffers(__DRIdrawablePrivate *dPriv)
+mach64SwapBuffers(__DRIdrawable *dPriv)
 {
    if (dPriv->driContextPriv && dPriv->driContextPriv->driverPrivate) {
       mach64ContextPtr mmesa;
@@ -400,7 +400,7 @@ mach64SwapBuffers(__DRIdrawablePrivate *dPriv)
 /* Initialize the driver specific screen private data.
  */
 static GLboolean
-mach64InitDriver( __DRIscreenPrivate *driScreen )
+mach64InitDriver( __DRIscreen *driScreen )
 {
    driScreen->private = (void *) mach64CreateScreen( driScreen );
 
@@ -420,7 +420,7 @@ mach64InitDriver( __DRIscreenPrivate *driScreen )
  * \return the __GLcontextModes supported by this driver
  */
 static const __DRIconfig **
-mach64InitScreen(__DRIscreenPrivate *psp)
+mach64InitScreen(__DRIscreen *psp)
 {
    static const __DRIversion ddx_expected = { 6, 4, 0 };
    static const __DRIversion dri_expected = { 4, 0, 0 };
@@ -457,3 +457,9 @@ const struct __DriverAPIRec driDriverAPI = {
    .SwapBuffersMSC  = NULL
 };
 
+/* Extension table exported for the loader, which looks this symbol up with dlsym(). */
+PUBLIC const __DRIextension *__driDriverExtensions[] = {
+    &driCoreExtension.base,
+    &driLegacyExtension.base,
+    NULL
+};
diff --git a/src/mesa/drivers/dri/mach64/mach64_screen.h b/src/mesa/drivers/dri/mach64/mach64_screen.h
index be5e29a3e5..1966809c03 100644
--- a/src/mesa/drivers/dri/mach64/mach64_screen.h
+++ b/src/mesa/drivers/dri/mach64/mach64_screen.h
@@ -70,7 +70,7 @@ typedef struct {
 
    drmBufMapPtr buffers;
 
-   __DRIscreenPrivate *driScreen;
+   __DRIscreen *driScreen;
 
    driOptionCache optionCache;
 
diff --git a/src/mesa/drivers/dri/mach64/mach64_span.c b/src/mesa/drivers/dri/mach64/mach64_span.c
index 500319e0e3..b4ba2a41c9 100644
--- a/src/mesa/drivers/dri/mach64/mach64_span.c
+++ b/src/mesa/drivers/dri/mach64/mach64_span.c
@@ -40,8 +40,8 @@
 
 #define LOCAL_VARS							\
    mach64ContextPtr mmesa = MACH64_CONTEXT(ctx);			\
-   __DRIscreenPrivate *sPriv = mmesa->driScreen;			\
-   __DRIdrawablePrivate *dPriv = mmesa->driDrawable;			\
+   __DRIscreen *sPriv = mmesa->driScreen;			\
+   __DRIdrawable *dPriv = mmesa->driDrawable;			\
    driRenderbuffer *drb = (driRenderbuffer *) rb;			\
    GLuint height = dPriv->h;						\
    GLushort p;								\
@@ -49,8 +49,8 @@
 
 #define LOCAL_DEPTH_VARS						\
    mach64ContextPtr mmesa = MACH64_CONTEXT(ctx);			\
-   __DRIdrawablePrivate *dPriv = mmesa->driDrawable;			\
-   __DRIscreenPrivate *driScreen = mmesa->driScreen;			\
+   __DRIdrawable *dPriv = mmesa->driDrawable;			\
+   __DRIscreen *driScreen = mmesa->driScreen;			\
    driRenderbuffer *drb = (driRenderbuffer *) rb;			\
    GLuint height = dPriv->h;						\
    char *buf = (char *)(driScreen->pFB + drb->offset +			\
diff --git a/src/mesa/drivers/dri/mach64/mach64_state.c b/src/mesa/drivers/dri/mach64/mach64_state.c
index 3a023187ce..df7cbc8670 100644
--- a/src/mesa/drivers/dri/mach64/mach64_state.c
+++ b/src/mesa/drivers/dri/mach64/mach64_state.c
@@ -388,7 +388,7 @@ static void mach64UpdateClipping( GLcontext *ctx )
    mach64ScreenPtr mach64Screen = mmesa->mach64Screen;
 
    if ( mmesa->driDrawable ) {
-      __DRIdrawablePrivate *drawable = mmesa->driDrawable;
+      __DRIdrawable *drawable = mmesa->driDrawable;
       int x1 = 0;
       int y1 = 0;
       int x2 = drawable->w - 1;
@@ -527,10 +527,10 @@ static void mach64UpdateMasks( GLcontext *ctx )
    /* mach64 can't color mask with alpha blending enabled */
    if ( !ctx->Color.BlendEnabled ) {
       mask = mach64PackColor( mmesa->mach64Screen->cpp,
-			      ctx->Color.ColorMask[RCOMP],
-			      ctx->Color.ColorMask[GCOMP],
-			      ctx->Color.ColorMask[BCOMP],
-			      ctx->Color.ColorMask[ACOMP] );
+			      ctx->Color.ColorMask[0][RCOMP],
+			      ctx->Color.ColorMask[0][GCOMP],
+			      ctx->Color.ColorMask[0][BCOMP],
+			      ctx->Color.ColorMask[0][ACOMP] );
    }
 
    if ( mmesa->setup.dp_write_mask != mask ) {
@@ -689,7 +689,7 @@ static void mach64DDLogicOpCode( GLcontext *ctx, GLenum opcode )
 void mach64SetCliprects( GLcontext *ctx, GLenum mode )
 {
    mach64ContextPtr mmesa = MACH64_CONTEXT(ctx);
-   __DRIdrawablePrivate *dPriv = mmesa->driDrawable;
+   __DRIdrawable *dPriv = mmesa->driDrawable;
 
    switch ( mode ) {
    case GL_FRONT_LEFT:
diff --git a/src/mesa/drivers/dri/mach64/mach64_tex.c b/src/mesa/drivers/dri/mach64/mach64_tex.c
index a757362b11..6627d3c38a 100644
--- a/src/mesa/drivers/dri/mach64/mach64_tex.c
+++ b/src/mesa/drivers/dri/mach64/mach64_tex.c
@@ -130,7 +130,7 @@ mach64AllocTexObj( struct gl_texture_object *texObj )
 
    mach64SetTexWrap( t, texObj->WrapS, texObj->WrapT );
    mach64SetTexFilter( t, texObj->MinFilter, texObj->MagFilter );
-   mach64SetTexBorderColor( t, texObj->BorderColor );
+   mach64SetTexBorderColor( t, texObj->BorderColor.f );
 
    return t;
 }
@@ -470,7 +470,7 @@ static void mach64DDTexParameter( GLcontext *ctx, GLenum target,
 
    case GL_TEXTURE_BORDER_COLOR:
       if ( t->base.bound ) FLUSH_BATCH( mmesa );
-      mach64SetTexBorderColor( t, tObj->BorderColor );
+      mach64SetTexBorderColor( t, tObj->BorderColor.f );
       break;
 
    case GL_TEXTURE_BASE_LEVEL:
@@ -565,7 +565,6 @@ void mach64InitTextureFuncs( struct dd_function_table *functions )
    functions->IsTextureResident		= driIsTextureResident;
 
    functions->UpdateTexturePalette	= NULL;
-   functions->ActiveTexture		= NULL;
 
    driInitTextureFormats();
 }
diff --git a/src/mesa/drivers/dri/mach64/mach64_tris.c b/src/mesa/drivers/dri/mach64/mach64_tris.c
index f2e8e2e3ae..c2a0adfef0 100644
--- a/src/mesa/drivers/dri/mach64/mach64_tris.c
+++ b/src/mesa/drivers/dri/mach64/mach64_tris.c
@@ -1297,7 +1297,8 @@ do {						\
 
 #define LOCAL_VARS(n)						\
    mach64ContextPtr mmesa = MACH64_CONTEXT(ctx);		\
-   GLuint color[n], spec[n];					\
+   GLuint color[n] = { 0 };					\
+   GLuint spec[n] = { 0 };					\
    GLuint vertex_size = mmesa->vertex_size;			\
    const GLuint xyoffset = 9;					\
    const GLuint coloroffset = 8;				\
diff --git a/src/mesa/drivers/dri/mach64/mach64_vbtmp.h b/src/mesa/drivers/dri/mach64/mach64_vbtmp.h
index 938804af9e..60bfab8f6d 100644
--- a/src/mesa/drivers/dri/mach64/mach64_vbtmp.h
+++ b/src/mesa/drivers/dri/mach64/mach64_vbtmp.h
@@ -156,53 +156,53 @@ static void TAG(emit)( GLcontext *ctx,
 
    if (DO_TEX3) {
       const GLuint t3 = GET_TEXSOURCE(3);
-      tc3 = VB->TexCoordPtr[t3]->data;
-      tc3_stride = VB->TexCoordPtr[t3]->stride;
+      tc3 = VB->AttribPtr[_TNL_ATTRIB_TEX0 + t3]->data;
+      tc3_stride = VB->AttribPtr[_TNL_ATTRIB_TEX0 + t3]->stride;
       if (DO_PTEX)
-	 tc3_size = VB->TexCoordPtr[t3]->size;
+	 tc3_size = VB->AttribPtr[_TNL_ATTRIB_TEX0 + t3]->size;
    }
 
    if (DO_TEX2) {
       const GLuint t2 = GET_TEXSOURCE(2);
-      tc2 = VB->TexCoordPtr[t2]->data;
-      tc2_stride = VB->TexCoordPtr[t2]->stride;
+      tc2 = VB->AttribPtr[_TNL_ATTRIB_TEX0 + t2]->data;
+      tc2_stride = VB->AttribPtr[_TNL_ATTRIB_TEX0 + t2]->stride;
       if (DO_PTEX)
-	 tc2_size = VB->TexCoordPtr[t2]->size;
+	 tc2_size = VB->AttribPtr[_TNL_ATTRIB_TEX0 + t2]->size;
    }
 
    if (DO_TEX1) {
       const GLuint t1 = GET_TEXSOURCE(1);
-      tc1 = VB->TexCoordPtr[t1]->data;
-      tc1_stride = VB->TexCoordPtr[t1]->stride;
+      tc1 = VB->AttribPtr[_TNL_ATTRIB_TEX0 + t1]->data;
+      tc1_stride = VB->AttribPtr[_TNL_ATTRIB_TEX0 + t1]->stride;
       if (DO_PTEX)
-	 tc1_size = VB->TexCoordPtr[t1]->size;
+	 tc1_size = VB->AttribPtr[_TNL_ATTRIB_TEX0 + t1]->size;
    }
 
    if (DO_TEX0) {
       const GLuint t0 = GET_TEXSOURCE(0);
-      tc0_stride = VB->TexCoordPtr[t0]->stride;
-      tc0 = VB->TexCoordPtr[t0]->data;
+      tc0_stride = VB->AttribPtr[_TNL_ATTRIB_TEX0 + t0]->stride;
+      tc0 = VB->AttribPtr[_TNL_ATTRIB_TEX0 + t0]->data;
       if (DO_PTEX) 
-	 tc0_size = VB->TexCoordPtr[t0]->size;
+	 tc0_size = VB->AttribPtr[_TNL_ATTRIB_TEX0 + t0]->size;
    }
 
    if (DO_RGBA) {
-      col = VB->ColorPtr[0]->data;
-      col_stride = VB->ColorPtr[0]->stride;
+      col = VB->AttribPtr[_TNL_ATTRIB_COLOR0]->data;
+      col_stride = VB->AttribPtr[_TNL_ATTRIB_COLOR0]->stride;
    }
 
    if (DO_SPEC) {
-      spec = VB->SecondaryColorPtr[0]->data;
-      spec_stride = VB->SecondaryColorPtr[0]->stride;
+      spec = VB->AttribPtr[_TNL_ATTRIB_COLOR1]->data;
+      spec_stride = VB->AttribPtr[_TNL_ATTRIB_COLOR1]->stride;
    } else {
       spec = (GLfloat (*)[4])ctx->Current.Attrib[VERT_ATTRIB_COLOR1];
       spec_stride = 0;
    }
 
    if (DO_FOG) {
-      if (VB->FogCoordPtr) {
-	 fog = VB->FogCoordPtr->data;
-	 fog_stride = VB->FogCoordPtr->stride;
+      if (VB->AttribPtr[_TNL_ATTRIB_FOG]) {
+	 fog = VB->AttribPtr[_TNL_ATTRIB_FOG]->data;
+	 fog_stride = VB->AttribPtr[_TNL_ATTRIB_FOG]->stride;
       } else {
 	 static GLfloat tmp[4] = {0, 0, 0, 0};
 	 fog = &tmp;
@@ -384,8 +384,8 @@ static void TAG(emit)( GLcontext *ctx, GLuint start, GLuint end,
 
    ASSERT(stride == 4);
 
-   col = VB->ColorPtr[0]->data;
-   col_stride = VB->ColorPtr[0]->stride;
+   col = VB->AttribPtr[_TNL_ATTRIB_COLOR0]->data;
+   col_stride = VB->AttribPtr[_TNL_ATTRIB_COLOR0]->stride;
 
    /* Pack what's left into a 4-dword vertex.  Color is in a different
     * place, and there is no 'w' coordinate.
@@ -432,8 +432,8 @@ static void TAG(emit)( GLcontext *ctx, GLuint start, GLuint end,
    GLfloat *v = (GLfloat *)dest;
    int i;
 
-   col = VB->ColorPtr[0]->data;
-   col_stride = VB->ColorPtr[0]->stride;
+   col = VB->AttribPtr[_TNL_ATTRIB_COLOR0]->data;
+   col_stride = VB->AttribPtr[_TNL_ATTRIB_COLOR0]->stride;
 
    if (start)
       STRIDE_4F(col, col_stride * start);
@@ -473,22 +473,22 @@ static GLboolean TAG(check_tex_sizes)( GLcontext *ctx )
 
    /* Force 'missing' texcoords to something valid.
     */
-   if (DO_TEX3 && VB->TexCoordPtr[2] == 0)
-      VB->TexCoordPtr[2] = VB->TexCoordPtr[3];
+   if (DO_TEX3 && VB->AttribPtr[_TNL_ATTRIB_TEX2] == 0)
+      VB->AttribPtr[_TNL_ATTRIB_TEX2] = VB->AttribPtr[_TNL_ATTRIB_TEX3];
 
-   if (DO_TEX2 && VB->TexCoordPtr[1] == 0)
-      VB->TexCoordPtr[1] = VB->TexCoordPtr[2];
+   if (DO_TEX2 && VB->AttribPtr[_TNL_ATTRIB_TEX1] == 0)
+      VB->AttribPtr[_TNL_ATTRIB_TEX1] = VB->AttribPtr[_TNL_ATTRIB_TEX2];
 
-   if (DO_TEX1 && VB->TexCoordPtr[0] == 0)
-      VB->TexCoordPtr[0] = VB->TexCoordPtr[1];
+   if (DO_TEX1 && VB->AttribPtr[_TNL_ATTRIB_TEX0] == 0)
+      VB->AttribPtr[_TNL_ATTRIB_TEX0] = VB->AttribPtr[_TNL_ATTRIB_TEX1];
 
    if (DO_PTEX)
       return GL_TRUE;
    
-   if ((DO_TEX3 && VB->TexCoordPtr[GET_TEXSOURCE(3)]->size == 4) ||
-       (DO_TEX2 && VB->TexCoordPtr[GET_TEXSOURCE(2)]->size == 4) ||
-       (DO_TEX1 && VB->TexCoordPtr[GET_TEXSOURCE(1)]->size == 4) ||
-       (DO_TEX0 && VB->TexCoordPtr[GET_TEXSOURCE(0)]->size == 4))
+   if ((DO_TEX3 && VB->AttribPtr[_TNL_ATTRIB_TEX0 + GET_TEXSOURCE(3)]->size == 4) ||
+       (DO_TEX2 && VB->AttribPtr[_TNL_ATTRIB_TEX0 + GET_TEXSOURCE(2)]->size == 4) ||
+       (DO_TEX1 && VB->AttribPtr[_TNL_ATTRIB_TEX0 + GET_TEXSOURCE(1)]->size == 4) ||
+       (DO_TEX0 && VB->AttribPtr[_TNL_ATTRIB_TEX0 + GET_TEXSOURCE(0)]->size == 4))
       return GL_FALSE;
 
    return GL_TRUE;
@@ -501,14 +501,14 @@ static GLboolean TAG(check_tex_sizes)( GLcontext *ctx )
 
    /* Force 'missing' texcoords to something valid.
     */
-   if (DO_TEX3 && VB->TexCoordPtr[2] == 0)
-      VB->TexCoordPtr[2] = VB->TexCoordPtr[3];
+   if (DO_TEX3 && VB->AttribPtr[_TNL_ATTRIB_TEX2] == 0)
+      VB->AttribPtr[_TNL_ATTRIB_TEX2] = VB->AttribPtr[_TNL_ATTRIB_TEX3];
 
-   if (DO_TEX2 && VB->TexCoordPtr[1] == 0)
-      VB->TexCoordPtr[1] = VB->TexCoordPtr[2];
+   if (DO_TEX2 && VB->AttribPtr[_TNL_ATTRIB_TEX1] == 0)
+      VB->AttribPtr[_TNL_ATTRIB_TEX1] = VB->AttribPtr[_TNL_ATTRIB_TEX2];
 
-   if (DO_TEX1 && VB->TexCoordPtr[0] == 0)
-      VB->TexCoordPtr[0] = VB->TexCoordPtr[1];
+   if (DO_TEX1 && VB->AttribPtr[_TNL_ATTRIB_TEX0] == 0)
+      VB->AttribPtr[_TNL_ATTRIB_TEX0] = VB->AttribPtr[_TNL_ATTRIB_TEX1];
 
    if (DO_PTEX)
       return GL_TRUE;
@@ -516,14 +516,14 @@ static GLboolean TAG(check_tex_sizes)( GLcontext *ctx )
    /* No hardware support for projective texture.  Can fake it for
     * TEX0 only.
     */
-   if ((DO_TEX3 && VB->TexCoordPtr[GET_TEXSOURCE(3)]->size == 4) ||
-       (DO_TEX2 && VB->TexCoordPtr[GET_TEXSOURCE(2)]->size == 4) ||
-       (DO_TEX1 && VB->TexCoordPtr[GET_TEXSOURCE(1)]->size == 4)) {
+   if ((DO_TEX3 && VB->AttribPtr[_TNL_ATTRIB_TEX0 + GET_TEXSOURCE(3)]->size == 4) ||
+       (DO_TEX2 && VB->AttribPtr[_TNL_ATTRIB_TEX0 + GET_TEXSOURCE(2)]->size == 4) ||
+       (DO_TEX1 && VB->AttribPtr[_TNL_ATTRIB_TEX0 + GET_TEXSOURCE(1)]->size == 4)) {
       PTEX_FALLBACK();
       return GL_FALSE;
    }
 
-   if (DO_TEX0 && VB->TexCoordPtr[GET_TEXSOURCE(0)]->size == 4) {
+   if (DO_TEX0 && VB->AttribPtr[_TNL_ATTRIB_TEX0 + GET_TEXSOURCE(0)]->size == 4) {
       if (DO_TEX1 || DO_TEX2 || DO_TEX3) {
 	 PTEX_FALLBACK();
       }
diff --git a/src/mesa/drivers/dri/mga/mga_xmesa.c b/src/mesa/drivers/dri/mga/mga_xmesa.c
index 2c7f50c498..f835cb8bd6 100644
--- a/src/mesa/drivers/dri/mga/mga_xmesa.c
+++ b/src/mesa/drivers/dri/mga/mga_xmesa.c
@@ -108,7 +108,7 @@ int MGA_DEBUG = 0;
 #endif
 
 static const __DRIconfig **
-mgaFillInModes( __DRIscreenPrivate *psp,
+mgaFillInModes( __DRIscreen *psp,
 		unsigned pixel_bits, unsigned depth_bits,
 		unsigned stencil_bits, GLboolean have_back_buffer )
 {
@@ -190,7 +190,7 @@ const __DRIextension *mgaScreenExtensions[] = {
 };
 
 static GLboolean
-mgaInitDriver(__DRIscreenPrivate *sPriv)
+mgaInitDriver(__DRIscreen *sPriv)
 {
    mgaScreenPrivate *mgaScreen;
    MGADRIPtr         serverInfo = (MGADRIPtr)sPriv->pDevPriv;
@@ -332,7 +332,7 @@ mgaInitDriver(__DRIscreenPrivate *sPriv)
 
 
 static void
-mgaDestroyScreen(__DRIscreenPrivate *sPriv)
+mgaDestroyScreen(__DRIscreen *sPriv)
 {
    mgaScreenPrivate *mgaScreen = (mgaScreenPrivate *) sPriv->private;
 
@@ -426,14 +426,14 @@ static const struct dri_debug_control debug_control[] =
 
 static GLboolean
 mgaCreateContext( const __GLcontextModes *mesaVis,
-                  __DRIcontextPrivate *driContextPriv,
+                  __DRIcontext *driContextPriv,
                   void *sharedContextPrivate )
 {
    int i;
    unsigned   maxlevels;
    GLcontext *ctx, *shareCtx;
    mgaContextPtr mmesa;
-   __DRIscreenPrivate *sPriv = driContextPriv->driScreenPriv;
+   __DRIscreen *sPriv = driContextPriv->driScreenPriv;
    mgaScreenPrivate *mgaScreen = (mgaScreenPrivate *)sPriv->private;
    drm_mga_sarea_t *saPriv = (drm_mga_sarea_t *)(((char*)sPriv->pSAREA)+
 					      mgaScreen->sarea_priv_offset);
@@ -645,7 +645,7 @@ mgaCreateContext( const __GLcontextModes *mesaVis,
 }
 
 static void
-mgaDestroyContext(__DRIcontextPrivate *driContextPriv)
+mgaDestroyContext(__DRIcontext *driContextPriv)
 {
    mgaContextPtr mmesa = (mgaContextPtr) driContextPriv->driverPrivate;
 
@@ -697,8 +697,8 @@ mgaDestroyContext(__DRIcontextPrivate *driContextPriv)
 
 
 static GLboolean
-mgaCreateBuffer( __DRIscreenPrivate *driScrnPriv,
-                 __DRIdrawablePrivate *driDrawPriv,
+mgaCreateBuffer( __DRIscreen *driScrnPriv,
+                 __DRIdrawable *driDrawPriv,
                  const __GLcontextModes *mesaVis,
                  GLboolean isPixmap )
 {
@@ -814,13 +814,13 @@ mgaCreateBuffer( __DRIscreenPrivate *driScrnPriv,
 
 
 static void
-mgaDestroyBuffer(__DRIdrawablePrivate *driDrawPriv)
+mgaDestroyBuffer(__DRIdrawable *driDrawPriv)
 {
    _mesa_reference_framebuffer((GLframebuffer **)(&(driDrawPriv->driverPrivate)), NULL);
 }
 
 static void
-mgaSwapBuffers(__DRIdrawablePrivate *dPriv)
+mgaSwapBuffers(__DRIdrawable *dPriv)
 {
    if (dPriv->driContextPriv && dPriv->driContextPriv->driverPrivate) {
       mgaContextPtr mmesa;
@@ -839,7 +839,7 @@ mgaSwapBuffers(__DRIdrawablePrivate *dPriv)
 }
 
 static GLboolean
-mgaUnbindContext(__DRIcontextPrivate *driContextPriv)
+mgaUnbindContext(__DRIcontext *driContextPriv)
 {
    mgaContextPtr mmesa = (mgaContextPtr) driContextPriv->driverPrivate;
    if (mmesa)
@@ -855,9 +855,9 @@ mgaUnbindContext(__DRIcontextPrivate *driContextPriv)
  * But why are we doing context initialization here???
  */
 static GLboolean
-mgaMakeCurrent(__DRIcontextPrivate *driContextPriv,
-               __DRIdrawablePrivate *driDrawPriv,
-               __DRIdrawablePrivate *driReadPriv)
+mgaMakeCurrent(__DRIcontext *driContextPriv,
+               __DRIdrawable *driDrawPriv,
+               __DRIdrawable *driReadPriv)
 {
    if (driContextPriv) {
       mgaContextPtr mmesa = (mgaContextPtr) driContextPriv->driverPrivate;
@@ -892,7 +892,7 @@ mgaMakeCurrent(__DRIcontextPrivate *driContextPriv,
 
 void mgaGetLock( mgaContextPtr mmesa, GLuint flags )
 {
-   __DRIdrawablePrivate *dPriv = mmesa->driDrawable;
+   __DRIdrawable *dPriv = mmesa->driDrawable;
    drm_mga_sarea_t *sarea = mmesa->sarea;
    int me = mmesa->hHWContext;
    int i;
@@ -960,7 +960,7 @@ static const __DRIconfig **mgaInitScreen(__DRIscreen *psp)
  * Get information about previous buffer swaps.
  */
 static int
-getSwapInfo( __DRIdrawablePrivate *dPriv, __DRIswapInfo * sInfo )
+getSwapInfo( __DRIdrawable *dPriv, __DRIswapInfo * sInfo )
 {
    mgaContextPtr  mmesa;
 
@@ -998,3 +998,10 @@ const struct __DriverAPIRec driDriverAPI = {
    .WaitForSBC      = NULL,
    .SwapBuffersMSC  = NULL
 };
+
+/* Extension table exported for the loader, which looks this symbol up with dlsym(). */
+PUBLIC const __DRIextension *__driDriverExtensions[] = {
+    &driCoreExtension.base,
+    &driLegacyExtension.base,
+    NULL
+};
diff --git a/src/mesa/drivers/dri/mga/mga_xmesa.h b/src/mesa/drivers/dri/mga/mga_xmesa.h
index 07c22bd596..aee146090c 100644
--- a/src/mesa/drivers/dri/mga/mga_xmesa.h
+++ b/src/mesa/drivers/dri/mga/mga_xmesa.h
@@ -67,7 +67,7 @@ typedef struct mga_screen_private_s {
    char *texVirtual[MGA_NR_TEX_HEAPS];
 
 
-   __DRIscreenPrivate *sPriv;
+   __DRIscreen *sPriv;
    drmBufMapPtr  bufs;
 
    drmRegion mmio;
diff --git a/src/mesa/drivers/dri/mga/mgacontext.h b/src/mesa/drivers/dri/mga/mgacontext.h
index 30640a29b3..4141565931 100644
--- a/src/mesa/drivers/dri/mga/mgacontext.h
+++ b/src/mesa/drivers/dri/mga/mgacontext.h
@@ -294,10 +294,10 @@ struct mga_context_t {
    drm_context_t hHWContext;
    drm_hw_lock_t *driHwLock;
    int driFd;
-   __DRIdrawablePrivate *driDrawable;
-   __DRIdrawablePrivate *driReadable;
+   __DRIdrawable *driDrawable;
+   __DRIdrawable *driReadable;
 
-   __DRIscreenPrivate *driScreen;
+   __DRIscreen *driScreen;
    struct mga_screen_private_s *mgaScreen;
    drm_mga_sarea_t *sarea;
 
diff --git a/src/mesa/drivers/dri/mga/mgaioctl.c b/src/mesa/drivers/dri/mga/mgaioctl.c
index 4438bad920..8ce5d802ab 100644
--- a/src/mesa/drivers/dri/mga/mgaioctl.c
+++ b/src/mesa/drivers/dri/mga/mgaioctl.c
@@ -207,7 +207,7 @@ static void
 mgaClear( GLcontext *ctx, GLbitfield mask )
 {
    mgaContextPtr mmesa = MGA_CONTEXT(ctx);
-   __DRIdrawablePrivate *dPriv = mmesa->driDrawable;
+   __DRIdrawable *dPriv = mmesa->driDrawable;
    GLuint flags = 0;
    GLuint clear_color = mmesa->ClearColor;
    GLuint clear_depth = 0;
@@ -409,7 +409,7 @@ static void mgaWaitForFrameCompletion( mgaContextPtr mmesa )
 /*
  * Copy the back buffer to the front buffer.
  */
-void mgaCopyBuffer( __DRIdrawablePrivate *dPriv )
+void mgaCopyBuffer( __DRIdrawable *dPriv )
 {
    mgaContextPtr mmesa;
    drm_clip_rect_t *pbox;
@@ -417,7 +417,7 @@ void mgaCopyBuffer( __DRIdrawablePrivate *dPriv )
    GLint ret;
    GLint i;
    GLboolean   missed_target;
-   __DRIscreenPrivate *psp = dPriv->driScreenPriv;
+   __DRIscreen *psp = dPriv->driScreenPriv;
 
    assert(dPriv);
    assert(dPriv->driContextPriv);
diff --git a/src/mesa/drivers/dri/mga/mgaioctl.h b/src/mesa/drivers/dri/mga/mgaioctl.h
index dbc823de80..7a8660d203 100644
--- a/src/mesa/drivers/dri/mga/mgaioctl.h
+++ b/src/mesa/drivers/dri/mga/mgaioctl.h
@@ -32,7 +32,7 @@
 #include "mgacontext.h"
 #include "mga_xmesa.h"
 
-void mgaCopyBuffer( __DRIdrawablePrivate *dPriv );
+void mgaCopyBuffer( __DRIdrawable *dPriv );
 void mgaWaitForVBlank( mgaContextPtr mmesa );
 
 void mgaGetILoadBufferLocked( mgaContextPtr mmesa );
diff --git a/src/mesa/drivers/dri/mga/mgapixel.c b/src/mesa/drivers/dri/mga/mgapixel.c
index 977dfa0b76..69415f8a83 100644
--- a/src/mesa/drivers/dri/mga/mgapixel.c
+++ b/src/mesa/drivers/dri/mga/mgapixel.c
@@ -134,10 +134,10 @@ check_color_per_fragment_ops( const GLcontext *ctx )
 		    ctx->Fog.Enabled ||
 		    ctx->Scissor.Enabled ||
 		    ctx->Stencil._Enabled ||
-		    !ctx->Color.ColorMask[0] ||
-		    !ctx->Color.ColorMask[1] ||
-		    !ctx->Color.ColorMask[2] ||
-		    !ctx->Color.ColorMask[3] ||
+		    !ctx->Color.ColorMask[0][0] ||
+		    !ctx->Color.ColorMask[0][1] ||
+		    !ctx->Color.ColorMask[0][2] ||
+		    !ctx->Color.ColorMask[0][3] ||
 		    ctx->Color.ColorLogicOpEnabled ||
 		    ctx->Texture._EnabledUnits
            ) &&
@@ -150,10 +150,10 @@ static GLboolean
 check_depth_per_fragment_ops( const GLcontext *ctx )
 {
    return ( ctx->Current.RasterPosValid &&
-	    ctx->Color.ColorMask[RCOMP] == 0 &&
-	    ctx->Color.ColorMask[BCOMP] == 0 &&
-	    ctx->Color.ColorMask[GCOMP] == 0 &&
-	    ctx->Color.ColorMask[ACOMP] == 0 &&
+	    ctx->Color.ColorMask[0][RCOMP] == 0 &&
+	    ctx->Color.ColorMask[0][BCOMP] == 0 &&
+	    ctx->Color.ColorMask[0][GCOMP] == 0 &&
+	    ctx->Color.ColorMask[0][ACOMP] == 0 &&
 	    ctx->Pixel.ZoomX == 1.0F &&
 	    ( ctx->Pixel.ZoomY == 1.0F || ctx->Pixel.ZoomY == -1.0F ) );
 }
@@ -299,7 +299,7 @@ mgaTryReadPixels( GLcontext *ctx,
 
 #if 0
    {
-      __DRIdrawablePrivate *dPriv = mmesa->driDrawable;
+      __DRIdrawable *dPriv = mmesa->driDrawable;
       int nbox, retcode, i;
 
       UPDATE_LOCK( mmesa, DRM_LOCK_FLUSH | DRM_LOCK_QUIESCENT );
@@ -399,7 +399,7 @@ static void do_draw_pix( GLcontext *ctx,
 #if 0
    mgaContextPtr mmesa = MGA_CONTEXT(ctx);
    drmMGABlit blit;
-   __DRIdrawablePrivate *dPriv = mmesa->driDrawable;
+   __DRIdrawable *dPriv = mmesa->driDrawable;
    drm_clip_rect_t pbox = dPriv->pClipRects;
    int nbox = dPriv->numClipRects;
    int retcode, i;
@@ -525,10 +525,10 @@ mgaTryDrawPixels( GLcontext *ctx,
 	      mmesa->mgaScreen->backOffset);
 
       planemask = mgaPackColor(cpp,
-			       ctx->Color.ColorMask[RCOMP],
-			       ctx->Color.ColorMask[GCOMP],
-			       ctx->Color.ColorMask[BCOMP],
-			       ctx->Color.ColorMask[ACOMP]);
+			       ctx->Color.ColorMask[0][RCOMP],
+			       ctx->Color.ColorMask[0][GCOMP],
+			       ctx->Color.ColorMask[0][BCOMP],
+			       ctx->Color.ColorMask[0][ACOMP]);
 
       if (cpp == 2)
 	 planemask |= planemask << 16;
diff --git a/src/mesa/drivers/dri/mga/mgaspan.c b/src/mesa/drivers/dri/mga/mgaspan.c
index 2ff1cac8e2..10606c152c 100644
--- a/src/mesa/drivers/dri/mga/mgaspan.c
+++ b/src/mesa/drivers/dri/mga/mgaspan.c
@@ -36,9 +36,9 @@
 
 #define LOCAL_VARS					\
    mgaContextPtr mmesa = MGA_CONTEXT(ctx);		\
-   __DRIscreenPrivate *sPriv = mmesa->driScreen;	\
+   __DRIscreen *sPriv = mmesa->driScreen;	\
    driRenderbuffer *drb = (driRenderbuffer *) rb;	\
-   const __DRIdrawablePrivate *dPriv = drb->dPriv;	\
+   const __DRIdrawable *dPriv = drb->dPriv;	\
    GLuint pitch = drb->pitch;				\
    GLuint height = dPriv->h;				\
    char *buf = (char *)(sPriv->pFB +			\
@@ -52,9 +52,9 @@
 
 #define LOCAL_DEPTH_VARS						\
    mgaContextPtr mmesa = MGA_CONTEXT(ctx);				\
-   __DRIscreenPrivate *sPriv = mmesa->driScreen;			\
+   __DRIscreen *sPriv = mmesa->driScreen;			\
    driRenderbuffer *drb = (driRenderbuffer *) rb;			\
-   const __DRIdrawablePrivate *dPriv = drb->dPriv;			\
+   const __DRIdrawable *dPriv = drb->dPriv;			\
    GLuint pitch = drb->pitch;						\
    GLuint height = dPriv->h;						\
    char *buf = (char *)(sPriv->pFB +					\
diff --git a/src/mesa/drivers/dri/mga/mgastate.c b/src/mesa/drivers/dri/mga/mgastate.c
index 7c830ec097..0253044761 100644
--- a/src/mesa/drivers/dri/mga/mgastate.c
+++ b/src/mesa/drivers/dri/mga/mgastate.c
@@ -374,13 +374,11 @@ static void mgaDDColorMask(GLcontext *ctx,
 {
    mgaContextPtr mmesa = MGA_CONTEXT( ctx );
    mgaScreenPrivate *mgaScreen = mmesa->mgaScreen;
-
-
    GLuint mask = mgaPackColor(mgaScreen->cpp,
-			      ctx->Color.ColorMask[RCOMP],
-			      ctx->Color.ColorMask[GCOMP],
-			      ctx->Color.ColorMask[BCOMP],
-			      ctx->Color.ColorMask[ACOMP]);
+			      ctx->Color.ColorMask[0][RCOMP],
+			      ctx->Color.ColorMask[0][GCOMP],
+			      ctx->Color.ColorMask[0][BCOMP],
+			      ctx->Color.ColorMask[0][ACOMP]);
 
    if (mgaScreen->cpp == 2)
       mask = mask | (mask << 16);
@@ -748,7 +746,7 @@ static void mgaDDLogicOp( GLcontext *ctx, GLenum opcode )
 
 static void mga_set_cliprects(mgaContextPtr mmesa)
 {
-   __DRIdrawablePrivate *driDrawable = mmesa->driDrawable;
+   __DRIdrawable *driDrawable = mmesa->driDrawable;
 
    if ((mmesa->draw_buffer != MGA_FRONT)
        || (driDrawable->numBackClipRects == 0)) {
@@ -776,8 +774,8 @@ static void mga_set_cliprects(mgaContextPtr mmesa)
 
 void mgaUpdateRects( mgaContextPtr mmesa, GLuint buffers )
 {
-   __DRIdrawablePrivate *const driDrawable = mmesa->driDrawable;
-   __DRIdrawablePrivate *const driReadable = mmesa->driReadable;
+   __DRIdrawable *const driDrawable = mmesa->driDrawable;
+   __DRIdrawable *const driReadable = mmesa->driReadable;
 
    mmesa->dirty_cliprects = 0;	
 
diff --git a/src/mesa/drivers/dri/mga/mgatex.c b/src/mesa/drivers/dri/mga/mgatex.c
index 9163371b33..62a9317cd4 100644
--- a/src/mesa/drivers/dri/mga/mgatex.c
+++ b/src/mesa/drivers/dri/mga/mgatex.c
@@ -332,7 +332,7 @@ mgaAllocTexObj( struct gl_texture_object *tObj )
 
       mgaSetTexWrapping( t, tObj->WrapS, tObj->WrapT );
       mgaSetTexFilter( t, tObj->MinFilter, tObj->MagFilter );
-      mgaSetTexBorderColor( t, tObj->BorderColor );
+      mgaSetTexBorderColor( t, tObj->BorderColor.f );
    }
 
    return( t );
@@ -461,7 +461,7 @@ mgaTexParameter( GLcontext *ctx, GLenum target,
 
    case GL_TEXTURE_BORDER_COLOR:
       FLUSH_BATCH(mmesa);
-      mgaSetTexBorderColor(t, tObj->BorderColor);
+      mgaSetTexBorderColor(t, tObj->BorderColor.f);
       break;
 
    case GL_TEXTURE_BASE_LEVEL:
diff --git a/src/mesa/drivers/dri/mga/mgatris.c b/src/mesa/drivers/dri/mga/mgatris.c
index b93a21c3ac..c1bcd4b853 100644
--- a/src/mesa/drivers/dri/mga/mgatris.c
+++ b/src/mesa/drivers/dri/mga/mgatris.c
@@ -397,7 +397,8 @@ do {						\
 
 #define LOCAL_VARS(n)					\
    mgaContextPtr mmesa = MGA_CONTEXT(ctx);		\
-   GLuint color[n], spec[n];				\
+   GLuint color[n] = { 0 };				\
+   GLuint spec[n] = { 0 };				\
    (void) color; (void) spec;
 
 
diff --git a/src/mesa/drivers/dri/r128/r128_context.c b/src/mesa/drivers/dri/r128/r128_context.c
index 0b250876c5..e389e1c87b 100644
--- a/src/mesa/drivers/dri/r128/r128_context.c
+++ b/src/mesa/drivers/dri/r128/r128_context.c
@@ -101,11 +101,11 @@ static const struct dri_debug_control debug_control[] =
 /* Create the device specific context.
  */
 GLboolean r128CreateContext( const __GLcontextModes *glVisual,
-			     __DRIcontextPrivate *driContextPriv,
+			     __DRIcontext *driContextPriv,
                              void *sharedContextPrivate )
 {
    GLcontext *ctx, *shareCtx;
-   __DRIscreenPrivate *sPriv = driContextPriv->driScreenPriv;
+   __DRIscreen *sPriv = driContextPriv->driScreenPriv;
    struct dd_function_table functions;
    r128ContextPtr rmesa;
    r128ScreenPtr r128scrn;
@@ -274,7 +274,7 @@ GLboolean r128CreateContext( const __GLcontextModes *glVisual,
 
 /* Destroy the device specific context.
  */
-void r128DestroyContext( __DRIcontextPrivate *driContextPriv  )
+void r128DestroyContext( __DRIcontext *driContextPriv  )
 {
    r128ContextPtr rmesa = (r128ContextPtr) driContextPriv->driverPrivate;
 
@@ -325,9 +325,9 @@ void r128DestroyContext( __DRIcontextPrivate *driContextPriv  )
  * buffer `b'.
  */
 GLboolean
-r128MakeCurrent( __DRIcontextPrivate *driContextPriv,
-                 __DRIdrawablePrivate *driDrawPriv,
-                 __DRIdrawablePrivate *driReadPriv )
+r128MakeCurrent( __DRIcontext *driContextPriv,
+                 __DRIdrawable *driDrawPriv,
+                 __DRIdrawable *driReadPriv )
 {
    if ( driContextPriv ) {
       GET_CURRENT_CONTEXT(ctx);
@@ -364,7 +364,7 @@ r128MakeCurrent( __DRIcontextPrivate *driContextPriv,
 /* Force the context `c' to be unbound from its buffer.
  */
 GLboolean
-r128UnbindContext( __DRIcontextPrivate *driContextPriv )
+r128UnbindContext( __DRIcontext *driContextPriv )
 {
    return GL_TRUE;
 }
diff --git a/src/mesa/drivers/dri/r128/r128_context.h b/src/mesa/drivers/dri/r128/r128_context.h
index 0e10209a6a..65f845c115 100644
--- a/src/mesa/drivers/dri/r128/r128_context.h
+++ b/src/mesa/drivers/dri/r128/r128_context.h
@@ -186,9 +186,9 @@ struct r128_context {
 
    /* Mirrors of some DRI state
     */
-   __DRIcontextPrivate	*driContext;	/* DRI context */
-   __DRIscreenPrivate	*driScreen;	/* DRI screen */
-   __DRIdrawablePrivate	*driDrawable;	/* DRI drawable bound to this ctx */
+   __DRIcontext	*driContext;	/* DRI context */
+   __DRIscreen	*driScreen;	/* DRI screen */
+   __DRIdrawable	*driDrawable;	/* DRI drawable bound to this ctx */
 
    unsigned int lastStamp;	        /* mirror driDrawable->lastStamp */
 
@@ -225,16 +225,16 @@ struct r128_context {
 
 
 extern GLboolean r128CreateContext( const __GLcontextModes *glVisual,
-				    __DRIcontextPrivate *driContextPriv,
+				    __DRIcontext *driContextPriv,
                                     void *sharedContextPrivate );
 
-extern void r128DestroyContext( __DRIcontextPrivate * );
+extern void r128DestroyContext( __DRIcontext * );
 
-extern GLboolean r128MakeCurrent( __DRIcontextPrivate *driContextPriv,
-                                  __DRIdrawablePrivate *driDrawPriv,
-                                  __DRIdrawablePrivate *driReadPriv );
+extern GLboolean r128MakeCurrent( __DRIcontext *driContextPriv,
+                                  __DRIdrawable *driDrawPriv,
+                                  __DRIdrawable *driReadPriv );
 
-extern GLboolean r128UnbindContext( __DRIcontextPrivate *driContextPriv );
+extern GLboolean r128UnbindContext( __DRIcontext *driContextPriv );
 
 /* ================================================================
  * Debugging:
diff --git a/src/mesa/drivers/dri/r128/r128_ioctl.c b/src/mesa/drivers/dri/r128/r128_ioctl.c
index 84ac3d9f79..56758d971c 100644
--- a/src/mesa/drivers/dri/r128/r128_ioctl.c
+++ b/src/mesa/drivers/dri/r128/r128_ioctl.c
@@ -248,7 +248,7 @@ static int r128WaitForFrameCompletion( r128ContextPtr rmesa )
 
 /* Copy the back color buffer to the front color buffer.
  */
-void r128CopyBuffer( __DRIdrawablePrivate *dPriv )
+void r128CopyBuffer( __DRIdrawable *dPriv )
 {
    r128ContextPtr rmesa;
    GLint nbox, i, ret;
@@ -327,7 +327,7 @@ void r128CopyBuffer( __DRIdrawablePrivate *dPriv )
 #endif
 }
 
-void r128PageFlip( __DRIdrawablePrivate *dPriv )
+void r128PageFlip( __DRIdrawable *dPriv )
 {
    r128ContextPtr rmesa;
    GLint ret;
@@ -401,7 +401,7 @@ void r128PageFlip( __DRIdrawablePrivate *dPriv )
 static void r128Clear( GLcontext *ctx, GLbitfield mask )
 {
    r128ContextPtr rmesa = R128_CONTEXT(ctx);
-   __DRIdrawablePrivate *dPriv = rmesa->driDrawable;
+   __DRIdrawable *dPriv = rmesa->driDrawable;
    drm_r128_clear_t clear;
    GLuint flags = 0;
    GLint i;
diff --git a/src/mesa/drivers/dri/r128/r128_ioctl.h b/src/mesa/drivers/dri/r128/r128_ioctl.h
index 4b0c9cdc7f..84ace900ee 100644
--- a/src/mesa/drivers/dri/r128/r128_ioctl.h
+++ b/src/mesa/drivers/dri/r128/r128_ioctl.h
@@ -85,8 +85,8 @@ extern void r128ReadDepthSpanLocked( r128ContextPtr rmesa,
 extern void r128ReadDepthPixelsLocked( r128ContextPtr rmesa, GLuint n,
 				       const GLint x[], const GLint y[] );
 
-extern void r128CopyBuffer( __DRIdrawablePrivate *dPriv );
-extern void r128PageFlip( __DRIdrawablePrivate *dPriv );
+extern void r128CopyBuffer( __DRIdrawable *dPriv );
+extern void r128PageFlip( __DRIdrawable *dPriv );
 void r128WaitForVBlank( r128ContextPtr rmesa );
 
 extern void r128WaitForIdleLocked( r128ContextPtr rmesa );
diff --git a/src/mesa/drivers/dri/r128/r128_lock.c b/src/mesa/drivers/dri/r128/r128_lock.c
index 81488a2742..9bc3515b5a 100644
--- a/src/mesa/drivers/dri/r128/r128_lock.c
+++ b/src/mesa/drivers/dri/r128/r128_lock.c
@@ -68,8 +68,8 @@ r128UpdatePageFlipping( r128ContextPtr rmesa )
  */
 void r128GetLock( r128ContextPtr rmesa, GLuint flags )
 {
-   __DRIdrawablePrivate *dPriv = rmesa->driDrawable;
-   __DRIscreenPrivate *sPriv = rmesa->driScreen;
+   __DRIdrawable *dPriv = rmesa->driDrawable;
+   __DRIscreen *sPriv = rmesa->driScreen;
    drm_r128_sarea_t *sarea = rmesa->sarea;
    int i;
 
diff --git a/src/mesa/drivers/dri/r128/r128_screen.c b/src/mesa/drivers/dri/r128/r128_screen.c
index 9da3b5fb73..80b265811e 100644
--- a/src/mesa/drivers/dri/r128/r128_screen.c
+++ b/src/mesa/drivers/dri/r128/r128_screen.c
@@ -91,7 +91,7 @@ static const GLuint __driNConfigOptions = 3;
 /* Create the device specific screen private data struct.
  */
 static r128ScreenPtr
-r128CreateScreen( __DRIscreenPrivate *sPriv )
+r128CreateScreen( __DRIscreen *sPriv )
 {
    r128ScreenPtr r128Screen;
    R128DRIPtr r128DRIPriv = (R128DRIPtr)sPriv->pDevPriv;
@@ -236,7 +236,7 @@ r128CreateScreen( __DRIscreenPrivate *sPriv )
 /* Destroy the device specific screen private data struct.
  */
 static void
-r128DestroyScreen( __DRIscreenPrivate *sPriv )
+r128DestroyScreen( __DRIscreen *sPriv )
 {
    r128ScreenPtr r128Screen = (r128ScreenPtr)sPriv->private;
 
@@ -262,8 +262,8 @@ r128DestroyScreen( __DRIscreenPrivate *sPriv )
  * data.
  */
 static GLboolean
-r128CreateBuffer( __DRIscreenPrivate *driScrnPriv,
-                  __DRIdrawablePrivate *driDrawPriv,
+r128CreateBuffer( __DRIscreen *driScrnPriv,
+                  __DRIdrawable *driDrawPriv,
                   const __GLcontextModes *mesaVis,
                   GLboolean isPixmap )
 {
@@ -349,7 +349,7 @@ r128CreateBuffer( __DRIscreenPrivate *driScrnPriv,
 
 
 static void
-r128DestroyBuffer(__DRIdrawablePrivate *driDrawPriv)
+r128DestroyBuffer(__DRIdrawable *driDrawPriv)
 {
    _mesa_reference_framebuffer((GLframebuffer **)(&(driDrawPriv->driverPrivate)), NULL);
 }
@@ -357,7 +357,7 @@ r128DestroyBuffer(__DRIdrawablePrivate *driDrawPriv)
 
 /* Copy the back color buffer to the front color buffer */
 static void
-r128SwapBuffers(__DRIdrawablePrivate *dPriv)
+r128SwapBuffers(__DRIdrawable *dPriv)
 {
    if (dPriv->driContextPriv && dPriv->driContextPriv->driverPrivate) {
       r128ContextPtr rmesa;
@@ -384,7 +384,7 @@ r128SwapBuffers(__DRIdrawablePrivate *dPriv)
 /* Initialize the driver specific screen private data.
  */
 static GLboolean
-r128InitDriver( __DRIscreenPrivate *sPriv )
+r128InitDriver( __DRIscreen *sPriv )
 {
    sPriv->private = (void *) r128CreateScreen( sPriv );
 
@@ -397,7 +397,7 @@ r128InitDriver( __DRIscreenPrivate *sPriv )
 }
 
 static const __DRIconfig **
-r128FillInModes( __DRIscreenPrivate *psp,
+r128FillInModes( __DRIscreen *psp,
 		 unsigned pixel_bits, unsigned depth_bits,
 		 unsigned stencil_bits, GLboolean have_back_buffer )
 {
@@ -478,7 +478,7 @@ r128FillInModes( __DRIscreenPrivate *psp,
  * \return the __GLcontextModes supported by this driver
  */
 static const __DRIconfig **
-r128InitScreen(__DRIscreenPrivate *psp)
+r128InitScreen(__DRIscreen *psp)
 {
    static const __DRIversion ddx_expected = { 4, 0, 0 };
    static const __DRIversion dri_expected = { 4, 0, 0 };
@@ -517,3 +517,10 @@ const struct __DriverAPIRec driDriverAPI = {
    .WaitForSBC      = NULL,
    .SwapBuffersMSC  = NULL
 };
+
+/* This is the table of extensions that the loader will dlsym() for. */
+PUBLIC const __DRIextension *__driDriverExtensions[] = {
+    &driCoreExtension.base,
+    &driLegacyExtension.base,
+    NULL
+};
diff --git a/src/mesa/drivers/dri/r128/r128_screen.h b/src/mesa/drivers/dri/r128/r128_screen.h
index e2fa1677c9..8d450adff3 100644
--- a/src/mesa/drivers/dri/r128/r128_screen.h
+++ b/src/mesa/drivers/dri/r128/r128_screen.h
@@ -71,7 +71,7 @@ typedef struct {
 
    drmBufMapPtr buffers;
 
-   __DRIscreenPrivate *driScreen;
+   __DRIscreen *driScreen;
    unsigned int sarea_priv_offset;
 
    /* Configuration cache with default values for all contexts */
diff --git a/src/mesa/drivers/dri/r128/r128_span.c b/src/mesa/drivers/dri/r128/r128_span.c
index d238cc3c94..0413e5b4f1 100644
--- a/src/mesa/drivers/dri/r128/r128_span.c
+++ b/src/mesa/drivers/dri/r128/r128_span.c
@@ -50,8 +50,8 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 #define LOCAL_VARS							\
    r128ContextPtr rmesa = R128_CONTEXT(ctx);				\
-   __DRIscreenPrivate *sPriv = rmesa->driScreen;			\
-   __DRIdrawablePrivate *dPriv = rmesa->driDrawable;			\
+   __DRIscreen *sPriv = rmesa->driScreen;			\
+   __DRIdrawable *dPriv = rmesa->driDrawable;			\
    driRenderbuffer *drb = (driRenderbuffer *) rb;			\
    GLuint height = dPriv->h;						\
    GLuint p;								\
@@ -60,8 +60,8 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define LOCAL_DEPTH_VARS						\
    r128ContextPtr rmesa = R128_CONTEXT(ctx);				\
    r128ScreenPtr r128scrn = rmesa->r128Screen;				\
-   __DRIscreenPrivate *sPriv = rmesa->driScreen;			\
-   __DRIdrawablePrivate *dPriv = rmesa->driDrawable;			\
+   __DRIscreen *sPriv = rmesa->driScreen;			\
+   __DRIdrawable *dPriv = rmesa->driDrawable;			\
    GLuint height = dPriv->h;						\
    (void) r128scrn; (void) sPriv; (void) height
 
diff --git a/src/mesa/drivers/dri/r128/r128_state.c b/src/mesa/drivers/dri/r128/r128_state.c
index 4ae7bf5b97..2254a7a4ff 100644
--- a/src/mesa/drivers/dri/r128/r128_state.c
+++ b/src/mesa/drivers/dri/r128/r128_state.c
@@ -572,7 +572,7 @@ static void r128UpdateClipping( GLcontext *ctx )
    r128ContextPtr rmesa = R128_CONTEXT(ctx);
 
    if ( rmesa->driDrawable ) {
-      __DRIdrawablePrivate *drawable = rmesa->driDrawable;
+      __DRIdrawable *drawable = rmesa->driDrawable;
       int x1 = 0;
       int y1 = 0;
       int x2 = drawable->w - 1;
@@ -702,10 +702,10 @@ static void r128UpdateMasks( GLcontext *ctx )
    r128ContextPtr rmesa = R128_CONTEXT(ctx);
 
    GLuint mask = r128PackColor( rmesa->r128Screen->cpp,
-				ctx->Color.ColorMask[RCOMP],
-				ctx->Color.ColorMask[GCOMP],
-				ctx->Color.ColorMask[BCOMP],
-				ctx->Color.ColorMask[ACOMP] );
+				ctx->Color.ColorMask[0][RCOMP],
+				ctx->Color.ColorMask[0][GCOMP],
+				ctx->Color.ColorMask[0][BCOMP],
+				ctx->Color.ColorMask[0][ACOMP] );
 
    if ( rmesa->setup.plane_3d_mask_c != mask ) {
       rmesa->setup.plane_3d_mask_c = mask;
diff --git a/src/mesa/drivers/dri/r128/r128_tex.c b/src/mesa/drivers/dri/r128/r128_tex.c
index 0a1207fb89..f1be7cc1c4 100644
--- a/src/mesa/drivers/dri/r128/r128_tex.c
+++ b/src/mesa/drivers/dri/r128/r128_tex.c
@@ -169,7 +169,7 @@ static r128TexObjPtr r128AllocTexObj( struct gl_texture_object *texObj )
 
       r128SetTexWrap( t, texObj->WrapS, texObj->WrapT );
       r128SetTexFilter( t, texObj->MinFilter, texObj->MagFilter );
-      r128SetTexBorderColor( t, texObj->BorderColor );
+      r128SetTexBorderColor( t, texObj->BorderColor.f );
    }
 
    return t;
@@ -535,7 +535,7 @@ static void r128TexParameter( GLcontext *ctx, GLenum target,
 
    case GL_TEXTURE_BORDER_COLOR:
       if ( t->base.bound ) FLUSH_BATCH( rmesa );
-      r128SetTexBorderColor( t, tObj->BorderColor );
+      r128SetTexBorderColor( t, tObj->BorderColor.f );
       break;
 
    case GL_TEXTURE_BASE_LEVEL:
diff --git a/src/mesa/drivers/dri/r128/r128_tris.c b/src/mesa/drivers/dri/r128/r128_tris.c
index 5b91271d74..86d4717b05 100644
--- a/src/mesa/drivers/dri/r128/r128_tris.c
+++ b/src/mesa/drivers/dri/r128/r128_tris.c
@@ -216,7 +216,8 @@ do {							\
 
 #define LOCAL_VARS(n)						\
    r128ContextPtr rmesa = R128_CONTEXT(ctx);			\
-   GLuint color[n], spec[n];					\
+   GLuint color[n] = { 0 };					\
+   GLuint spec[n] = { 0 };					\
    GLuint coloroffset = rmesa->coloroffset;			\
    GLuint specoffset = rmesa->specoffset;			\
    GLboolean havespec = (rmesa->specoffset != 0);		\
@@ -650,12 +651,12 @@ static void r128RenderStart( GLcontext *ctx )
    }
 
    if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_TEX(rmesa->tmu_source[0]) )) {
-      if ( VB->TexCoordPtr[rmesa->tmu_source[0]]->size > 2 )
+      if ( VB->AttribPtr[_TNL_ATTRIB_TEX0 + rmesa->tmu_source[0]]->size > 2 )
 	 fallback_projtex = GL_TRUE;
       EMIT_ATTR( _TNL_ATTRIB_TEX0, EMIT_2F, R128_CCE_VC_FRMT_S_T, 8 );
    }
    if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_TEX(rmesa->tmu_source[1]) )) {
-      if ( VB->TexCoordPtr[rmesa->tmu_source[1]]->size > 2 )
+      if ( VB->AttribPtr[_TNL_ATTRIB_TEX0 + rmesa->tmu_source[1]]->size > 2 )
 	 fallback_projtex = GL_TRUE;
       EMIT_ATTR( _TNL_ATTRIB_TEX1, EMIT_2F, R128_CCE_VC_FRMT_S2_T2, 8 );
    }
diff --git a/src/mesa/drivers/dri/r200/Makefile b/src/mesa/drivers/dri/r200/Makefile
index 776f1e3f3f..8212dc1203 100644
--- a/src/mesa/drivers/dri/r200/Makefile
+++ b/src/mesa/drivers/dri/r200/Makefile
@@ -14,7 +14,7 @@ EGL_SOURCES = server/radeon_egl.c
 endif
 
 ifeq ($(RADEON_LDFLAGS),)
-CS_SOURCES = radeon_cs_space_drm.c
+CS_SOURCES = radeon_cs_space_drm.c radeon_bo.c radeon_cs.c
 endif
 
 RADEON_COMMON_SOURCES = \
diff --git a/src/mesa/drivers/dri/r200/r200_context.c b/src/mesa/drivers/dri/r200/r200_context.c
index e3ae839235..f34e319222 100644
--- a/src/mesa/drivers/dri/r200/r200_context.c
+++ b/src/mesa/drivers/dri/r200/r200_context.c
@@ -274,10 +274,10 @@ static void r200_init_vtbl(radeonContextPtr radeon)
 /* Create the device specific rendering context.
  */
 GLboolean r200CreateContext( const __GLcontextModes *glVisual,
-			     __DRIcontextPrivate *driContextPriv,
+			     __DRIcontext *driContextPriv,
 			     void *sharedContextPrivate)
 {
-   __DRIscreenPrivate *sPriv = driContextPriv->driScreenPriv;
+   __DRIscreen *sPriv = driContextPriv->driScreenPriv;
    radeonScreenPtr screen = (radeonScreenPtr)(sPriv->private);
    struct dd_function_table functions;
    r200ContextPtr rmesa;
@@ -325,9 +325,9 @@ GLboolean r200CreateContext( const __GLcontextModes *glVisual,
    _mesa_init_driver_functions(&functions);
    r200InitDriverFuncs(&functions);
    r200InitIoctlFuncs(&functions);
-   r200InitStateFuncs(&functions, screen->kernel_mm);
+   r200InitStateFuncs(&functions);
    r200InitTextureFuncs(&functions);
-   r200InitShaderFuncs(&functions); 
+   r200InitShaderFuncs(&functions);
    radeonInitQueryObjFunctions(&functions);
 
    if (!radeonInitContext(&rmesa->radeon, &functions,
@@ -496,7 +496,7 @@ GLboolean r200CreateContext( const __GLcontextModes *glVisual,
 }
 
 
-void r200DestroyContext( __DRIcontextPrivate *driContextPriv )
+void r200DestroyContext( __DRIcontext *driContextPriv )
 {
 	int i;
 	r200ContextPtr rmesa = (r200ContextPtr)driContextPriv->driverPrivate;
diff --git a/src/mesa/drivers/dri/r200/r200_context.h b/src/mesa/drivers/dri/r200/r200_context.h
index 246f98c6dc..17e4d8962e 100644
--- a/src/mesa/drivers/dri/r200/r200_context.h
+++ b/src/mesa/drivers/dri/r200/r200_context.h
@@ -636,14 +636,14 @@ struct r200_context {
 #define R200_CONTEXT(ctx)		((r200ContextPtr)(ctx->DriverCtx))
 
 
-extern void r200DestroyContext( __DRIcontextPrivate *driContextPriv );
+extern void r200DestroyContext( __DRIcontext *driContextPriv );
 extern GLboolean r200CreateContext( const __GLcontextModes *glVisual,
-				    __DRIcontextPrivate *driContextPriv,
+				    __DRIcontext *driContextPriv,
 				    void *sharedContextPrivate);
-extern GLboolean r200MakeCurrent( __DRIcontextPrivate *driContextPriv,
-				  __DRIdrawablePrivate *driDrawPriv,
-				  __DRIdrawablePrivate *driReadPriv );
-extern GLboolean r200UnbindContext( __DRIcontextPrivate *driContextPriv );
+extern GLboolean r200MakeCurrent( __DRIcontext *driContextPriv,
+				  __DRIdrawable *driDrawPriv,
+				  __DRIdrawable *driReadPriv );
+extern GLboolean r200UnbindContext( __DRIcontext *driContextPriv );
 
 /* ================================================================
  * Debugging:
diff --git a/src/mesa/drivers/dri/r200/r200_ioctl.c b/src/mesa/drivers/dri/r200/r200_ioctl.c
index b238adb972..66c5d3655a 100644
--- a/src/mesa/drivers/dri/r200/r200_ioctl.c
+++ b/src/mesa/drivers/dri/r200/r200_ioctl.c
@@ -61,7 +61,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 static void r200KernelClear(GLcontext *ctx, GLuint flags)
 {
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
-   __DRIdrawablePrivate *dPriv = radeon_get_drawable(&rmesa->radeon);
+   __DRIdrawable *dPriv = radeon_get_drawable(&rmesa->radeon);
    GLint cx, cy, cw, ch, ret;
    GLuint i;
 
@@ -185,7 +185,7 @@ static void r200KernelClear(GLcontext *ctx, GLuint flags)
 static void r200Clear( GLcontext *ctx, GLbitfield mask )
 {
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
-   __DRIdrawablePrivate *dPriv = radeon_get_drawable(&rmesa->radeon);
+   __DRIdrawable *dPriv = radeon_get_drawable(&rmesa->radeon);
    GLuint flags = 0;
    GLuint color_mask = 0;
    GLuint orig_mask = mask;
diff --git a/src/mesa/drivers/dri/r200/r200_maos_arrays.c b/src/mesa/drivers/dri/r200/r200_maos_arrays.c
index 383a0c4b0d..249c0bbc11 100644
--- a/src/mesa/drivers/dri/r200/r200_maos_arrays.c
+++ b/src/mesa/drivers/dri/r200/r200_maos_arrays.c
@@ -90,12 +90,14 @@ static void r200_emit_vecfog(GLcontext *ctx, struct radeon_aos *aos,
 	aos->components = size;
 	aos->count = count;
 
+	radeon_bo_map(aos->bo, 1);
 	out = (uint32_t*)((char*)aos->bo->ptr + aos->offset);
 	for (i = 0; i < count; i++) {
 	  out[0] = r200ComputeFogBlendFactor( ctx, *(GLfloat *)data );
 	  out++;
 	  data += stride;
 	}
+	radeon_bo_unmap(aos->bo);
 }
 
 /* Emit any changed arrays to new GART memory, re-emit a packet to
diff --git a/src/mesa/drivers/dri/r200/r200_pixel.c b/src/mesa/drivers/dri/r200/r200_pixel.c
index 95773871e0..bfb7e2a2ed 100644
--- a/src/mesa/drivers/dri/r200/r200_pixel.c
+++ b/src/mesa/drivers/dri/r200/r200_pixel.c
@@ -88,10 +88,10 @@ check_color_per_fragment_ops( const GLcontext *ctx )
 		    ctx->Fog.Enabled ||
 		    ctx->Scissor.Enabled ||
 		    ctx->Stencil._Enabled ||
-		    !ctx->Color.ColorMask[0] ||
-		    !ctx->Color.ColorMask[1] ||
-		    !ctx->Color.ColorMask[2] ||
-		    !ctx->Color.ColorMask[3] ||
+		    !ctx->Color.ColorMask[0][0] ||
+		    !ctx->Color.ColorMask[0][1] ||
+		    !ctx->Color.ColorMask[0][2] ||
+		    !ctx->Color.ColorMask[0][3] ||
 		    ctx->Color.ColorLogicOpEnabled ||
 		    ctx->Texture._EnabledUnits
            ) &&
@@ -214,7 +214,7 @@ r200TryReadPixels( GLcontext *ctx,
    }
 
    {
-      __DRIdrawablePrivate *dPriv = rmesa->radeon.dri.drawable;
+      __DRIdrawable *dPriv = rmesa->radeon.dri.drawable;
       driRenderbuffer *drb = (driRenderbuffer *) ctx->ReadBuffer->_ColorReadBuffer;
       int nbox = dPriv->numClipRects;
       int src_offset = drb->offset
@@ -298,7 +298,7 @@ static void do_draw_pix( GLcontext *ctx,
 
 #if 0
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
-   __DRIdrawablePrivate *dPriv = radeon_get_drawable(&rmesa->radeon);
+   __DRIdrawable *dPriv = radeon_get_drawable(&rmesa->radeon);
    drm_clip_rect_t *box = dPriv->pClipRects;
    struct gl_renderbuffer *rb = ctx->ReadBuffer->_ColorDrawBuffers[0];
    driRenderbuffer *drb = (driRenderbuffer *) rb;
@@ -400,10 +400,10 @@ r200TryDrawPixels( GLcontext *ctx,
    case GL_RGBA:
    case GL_BGRA:
       planemask = radeonPackColor(cpp,
-				ctx->Color.ColorMask[RCOMP],
-				ctx->Color.ColorMask[GCOMP],
-				ctx->Color.ColorMask[BCOMP],
-				ctx->Color.ColorMask[ACOMP]);
+				ctx->Color.ColorMask[0][RCOMP],
+				ctx->Color.ColorMask[0][GCOMP],
+				ctx->Color.ColorMask[0][BCOMP],
+				ctx->Color.ColorMask[0][ACOMP]);
 
       if (cpp == 2)
 	 planemask |= planemask << 16;
diff --git a/src/mesa/drivers/dri/r200/r200_state.c b/src/mesa/drivers/dri/r200/r200_state.c
index d28e96d9d9..7fe482fe15 100644
--- a/src/mesa/drivers/dri/r200/r200_state.c
+++ b/src/mesa/drivers/dri/r200/r200_state.c
@@ -721,10 +721,10 @@ static void r200ColorMask( GLcontext *ctx,
    if (!rrb)
      return;
    mask = radeonPackColor( rrb->cpp,
-			   ctx->Color.ColorMask[RCOMP],
-			   ctx->Color.ColorMask[GCOMP],
-			   ctx->Color.ColorMask[BCOMP],
-			   ctx->Color.ColorMask[ACOMP] );
+			   ctx->Color.ColorMask[0][RCOMP],
+			   ctx->Color.ColorMask[0][GCOMP],
+			   ctx->Color.ColorMask[0][BCOMP],
+			   ctx->Color.ColorMask[0][ACOMP] );
 
 
    if (!(r && g && b && a))
@@ -1585,7 +1585,7 @@ static void r200ClearStencil( GLcontext *ctx, GLint s )
 void r200UpdateWindow( GLcontext *ctx )
 {
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
-   __DRIdrawablePrivate *dPriv = radeon_get_drawable(&rmesa->radeon);
+   __DRIdrawable *dPriv = radeon_get_drawable(&rmesa->radeon);
    GLfloat xoffset = dPriv ? (GLfloat) dPriv->x : 0;
    GLfloat yoffset = dPriv ? (GLfloat) dPriv->y + dPriv->h : 0;
    const GLfloat *v = ctx->Viewport._WindowMap.m;
@@ -1665,7 +1665,7 @@ static void r200DepthRange( GLcontext *ctx, GLclampd nearval,
 void r200UpdateViewportOffset( GLcontext *ctx )
 {
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
-   __DRIdrawablePrivate *dPriv = radeon_get_drawable(&rmesa->radeon);
+   __DRIdrawable *dPriv = radeon_get_drawable(&rmesa->radeon);
    GLfloat xoffset = (GLfloat)dPriv->x;
    GLfloat yoffset = (GLfloat)dPriv->y + dPriv->h;
    const GLfloat *v = ctx->Viewport._WindowMap.m;
@@ -2476,7 +2476,7 @@ static void r200PolygonStipple( GLcontext *ctx, const GLubyte *mask )
 }
 /* Initialize the driver's state functions.
  */
-void r200InitStateFuncs( struct dd_function_table *functions, GLboolean dri2 )
+void r200InitStateFuncs( struct dd_function_table *functions )
 {
    functions->UpdateState		= r200InvalidateState;
    functions->LightingSpaceChange	= r200LightingSpaceChange;
@@ -2510,10 +2510,7 @@ void r200InitStateFuncs( struct dd_function_table *functions, GLboolean dri2 )
    functions->LogicOpcode		= r200LogicOpCode;
    functions->PolygonMode		= r200PolygonMode;
    functions->PolygonOffset		= r200PolygonOffset;
-   if (dri2)
-      functions->PolygonStipple		= r200PolygonStipple;
-   else
-      functions->PolygonStipple		= radeonPolygonStipplePreKMS;
+   functions->PolygonStipple		= r200PolygonStipple;
    functions->PointParameterfv		= r200PointParameter;
    functions->PointSize			= r200PointSize;
    functions->RenderMode		= r200RenderMode;
diff --git a/src/mesa/drivers/dri/r200/r200_state.h b/src/mesa/drivers/dri/r200/r200_state.h
index 9c62f0a644..7b9b0c106a 100644
--- a/src/mesa/drivers/dri/r200/r200_state.h
+++ b/src/mesa/drivers/dri/r200/r200_state.h
@@ -38,7 +38,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "r200_context.h"
 
 extern void r200InitState( r200ContextPtr rmesa );
-extern void r200InitStateFuncs( struct dd_function_table *functions, GLboolean dri2 );
+extern void r200InitStateFuncs( struct dd_function_table *functions );
 extern void r200InitTnlFuncs( GLcontext *ctx );
 
 extern void r200UpdateMaterial( GLcontext *ctx );
diff --git a/src/mesa/drivers/dri/r200/r200_state_init.c b/src/mesa/drivers/dri/r200/r200_state_init.c
index 68bfeea701..6c5a0b79ee 100644
--- a/src/mesa/drivers/dri/r200/r200_state_init.c
+++ b/src/mesa/drivers/dri/r200/r200_state_init.c
@@ -640,7 +640,7 @@ static void tex_emit(GLcontext *ctx, struct radeon_state_atom *atom)
    OUT_BATCH_TABLE(atom->cmd, 10);
 
    if (t && t->mt && !t->image_override) {
-     OUT_BATCH_RELOC(t->tile_bits, t->mt->bo, 0,
+     OUT_BATCH_RELOC(t->tile_bits, t->mt->bo, get_base_teximage_offset(t),
 		  RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0);
    } else if (!t) {
      /* workaround for old CS mechanism */
@@ -887,10 +887,8 @@ void r200InitState( r200ContextPtr rmesa )
          }
       }
    }
-   /* polygon stipple is done with irq for non-kms */
-   if (rmesa->radeon.radeonScreen->kernel_mm) {
-       ALLOC_STATE( stp, always, STP_STATE_SIZE, "STP/stp", 0 );
-   }
+
+   ALLOC_STATE( stp, always, STP_STATE_SIZE, "STP/stp", 0 );
 
    for (i = 0; i < 6; i++)
       if (rmesa->radeon.radeonScreen->kernel_mm)
@@ -1122,12 +1120,11 @@ void r200InitState( r200ContextPtr rmesa )
    rmesa->hw.sci.cmd[SCI_CMD_1] = CP_PACKET0(R200_RE_TOP_LEFT, 0);
    rmesa->hw.sci.cmd[SCI_CMD_2] = CP_PACKET0(R200_RE_WIDTH_HEIGHT, 0);
 
-   if (rmesa->radeon.radeonScreen->kernel_mm) {
-
-	rmesa->hw.stp.cmd[STP_CMD_0] = CP_PACKET0(RADEON_RE_STIPPLE_ADDR, 0);
-	rmesa->hw.stp.cmd[STP_DATA_0] = 0;
-	rmesa->hw.stp.cmd[STP_CMD_1] = CP_PACKET0_ONE(RADEON_RE_STIPPLE_DATA, 31);
+   rmesa->hw.stp.cmd[STP_CMD_0] = CP_PACKET0(RADEON_RE_STIPPLE_ADDR, 0);
+   rmesa->hw.stp.cmd[STP_DATA_0] = 0;
+   rmesa->hw.stp.cmd[STP_CMD_1] = CP_PACKET0_ONE(RADEON_RE_STIPPLE_DATA, 31);
 
+   if (rmesa->radeon.radeonScreen->kernel_mm) {
         rmesa->hw.mtl[0].emit = mtl_emit;
         rmesa->hw.mtl[1].emit = mtl_emit;
 
diff --git a/src/mesa/drivers/dri/r200/r200_swtcl.c b/src/mesa/drivers/dri/r200/r200_swtcl.c
index 240fb45078..4596912ddc 100644
--- a/src/mesa/drivers/dri/r200/r200_swtcl.c
+++ b/src/mesa/drivers/dri/r200/r200_swtcl.c
@@ -168,7 +168,7 @@ static void r200SetVertexFormat( GLcontext *ctx )
 
       for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
 	 if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_TEX(i) )) {
-	    GLuint sz = VB->TexCoordPtr[i]->size;
+	    GLuint sz = VB->AttribPtr[_TNL_ATTRIB_TEX0 + i]->size;
 
 	    fmt_1 |= sz << (3 * i);
 	    EMIT_ATTR( _TNL_ATTRIB_TEX0+i, EMIT_1F + sz - 1, 0 );
@@ -297,7 +297,7 @@ void r200_swtcl_flush(GLcontext *ctx, uint32_t current_offset)
    radeonEmitState(&rmesa->radeon);
    r200EmitVertexAOS( rmesa,
 		      rmesa->radeon.swtcl.vertex_size,
-		      first_elem(&rmesa->radeon.dma.reserved)->bo,
+		      rmesa->radeon.swtcl.bo,
 		      current_offset);
 
 
diff --git a/src/mesa/drivers/dri/r200/r200_tcl.c b/src/mesa/drivers/dri/r200/r200_tcl.c
index c702910ef2..e7d48a7f29 100644
--- a/src/mesa/drivers/dri/r200/r200_tcl.c
+++ b/src/mesa/drivers/dri/r200/r200_tcl.c
@@ -509,25 +509,26 @@ static GLboolean r200_run_tcl_render( GLcontext *ctx,
 	 prog to a not enabled output however, so just don't mess with it.
 	 We only need to change compsel. */
       GLuint out_compsel = 0;
-      GLuint vp_out = rmesa->curr_vp_hw->mesa_program.Base.OutputsWritten;
+      const GLbitfield64 vp_out =
+	 rmesa->curr_vp_hw->mesa_program.Base.OutputsWritten;
 
       vimap_rev = &rmesa->curr_vp_hw->inputmap_rev[0];
-      assert(vp_out & (1 << VERT_RESULT_HPOS));
+      assert(vp_out & BITFIELD64_BIT(VERT_RESULT_HPOS));
       out_compsel = R200_OUTPUT_XYZW;
-      if (vp_out & (1 << VERT_RESULT_COL0)) {
+      if (vp_out & BITFIELD64_BIT(VERT_RESULT_COL0)) {
 	 out_compsel |= R200_OUTPUT_COLOR_0;
       }
-      if (vp_out & (1 << VERT_RESULT_COL1)) {
+      if (vp_out & BITFIELD64_BIT(VERT_RESULT_COL1)) {
 	 out_compsel |= R200_OUTPUT_COLOR_1;
       }
-      if (vp_out & (1 << VERT_RESULT_FOGC)) {
+      if (vp_out & BITFIELD64_BIT(VERT_RESULT_FOGC)) {
          out_compsel |= R200_OUTPUT_DISCRETE_FOG;
       }
-      if (vp_out & (1 << VERT_RESULT_PSIZ)) {
+      if (vp_out & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
 	 out_compsel |= R200_OUTPUT_PT_SIZE;
       }
       for (i = VERT_RESULT_TEX0; i < VERT_RESULT_TEX6; i++) {
-	 if (vp_out & (1 << i)) {
+	 if (vp_out & BITFIELD64_BIT(i)) {
 	    out_compsel |= R200_OUTPUT_TEX_0 << (i - VERT_RESULT_TEX0);
 	 }
       }
diff --git a/src/mesa/drivers/dri/r200/r200_tex.c b/src/mesa/drivers/dri/r200/r200_tex.c
index 5a21a8b9c5..5b87ba6ccd 100644
--- a/src/mesa/drivers/dri/r200/r200_tex.c
+++ b/src/mesa/drivers/dri/r200/r200_tex.c
@@ -378,23 +378,14 @@ static void r200TexParameter( GLcontext *ctx, GLenum target,
       break;
 
    case GL_TEXTURE_BORDER_COLOR:
-      r200SetTexBorderColor( t, texObj->BorderColor );
+      r200SetTexBorderColor( t, texObj->BorderColor.f );
       break;
 
    case GL_TEXTURE_BASE_LEVEL:
    case GL_TEXTURE_MAX_LEVEL:
    case GL_TEXTURE_MIN_LOD:
    case GL_TEXTURE_MAX_LOD:
-      /* This isn't the most efficient solution but there doesn't appear to
-       * be a nice alternative.  Since there's no LOD clamping,
-       * we just have to rely on loading the right subset of mipmap levels
-       * to simulate a clamped LOD.
-       */
-      if (t->mt) {
-         radeon_miptree_unreference(t->mt);
-	 t->mt = 0;
-	 t->validated = GL_FALSE;
-      }
+      t->validated = GL_FALSE;
       break;
 
    default:
@@ -413,7 +404,7 @@ static void r200DeleteTexture(GLcontext * ctx, struct gl_texture_object *texObj)
 	      (void *)texObj,
 	      _mesa_lookup_enum_by_nr(texObj->Target));
    }
-   
+
    if (rmesa) {
       int i;
       radeon_firevertices(&rmesa->radeon);
@@ -425,11 +416,9 @@ static void r200DeleteTexture(GLcontext * ctx, struct gl_texture_object *texObj)
 	 }
       }      
    }
-   
-   if (t->mt) {
-      radeon_miptree_unreference(t->mt);
-      t->mt = 0;
-   }
+
+   radeon_miptree_unreference(&t->mt);
+
    _mesa_delete_texture_object(ctx, texObj);
 }
 
@@ -481,7 +470,7 @@ static struct gl_texture_object *r200NewTextureObject(GLcontext * ctx,
    r200SetTexWrap( t, t->base.WrapS, t->base.WrapT, t->base.WrapR );
    r200SetTexMaxAnisotropy( t, t->base.MaxAnisotropy );
    r200SetTexFilter(t, t->base.MinFilter, t->base.MagFilter);
-   r200SetTexBorderColor(t, t->base.BorderColor);
+   r200SetTexBorderColor(t, t->base.BorderColor.f);
 
    return &t->base;
 }
diff --git a/src/mesa/drivers/dri/r200/r200_texstate.c b/src/mesa/drivers/dri/r200/r200_texstate.c
index 7d0afa1add..e2f9cf0ea8 100644
--- a/src/mesa/drivers/dri/r200/r200_texstate.c
+++ b/src/mesa/drivers/dri/r200/r200_texstate.c
@@ -797,24 +797,13 @@ void r200SetTexBuffer2(__DRIcontext *pDRICtx, GLint target, GLint glx_texture_fo
     	    return;
     	}
 
-	radeon_update_renderbuffers(pDRICtx, dPriv);
-	/* back & depth buffer are useless free them right away */
-	rb = (void*)rfb->base.Attachment[BUFFER_DEPTH].Renderbuffer;
-	if (rb && rb->bo) {
-		radeon_bo_unref(rb->bo);
-        rb->bo = NULL;
-	}
-	rb = (void*)rfb->base.Attachment[BUFFER_BACK_LEFT].Renderbuffer;
-	if (rb && rb->bo) {
-		radeon_bo_unref(rb->bo);
-		rb->bo = NULL;
-	}
+	radeon_update_renderbuffers(pDRICtx, dPriv, GL_TRUE);
 	rb = rfb->color_rb[0];
 	if (rb->bo == NULL) {
 		/* Failed to BO for the buffer */
 		return;
 	}
-	
+
 	_mesa_lock_texture(radeon->glCtx, texObj);
 	if (t->bo) {
 		radeon_bo_unref(t->bo);
@@ -824,14 +813,10 @@ void r200SetTexBuffer2(__DRIcontext *pDRICtx, GLint target, GLint glx_texture_fo
 		radeon_bo_unref(rImage->bo);
 		rImage->bo = NULL;
 	}
-	if (t->mt) {
-		radeon_miptree_unreference(t->mt);
-		t->mt = NULL;
-	}
-	if (rImage->mt) {
-		radeon_miptree_unreference(rImage->mt);
-		rImage->mt = NULL;
-	}
+
+	radeon_miptree_unreference(&t->mt);
+	radeon_miptree_unreference(&rImage->mt);
+
 	_mesa_init_teximage_fields(radeon->glCtx, target, texImage,
 				   rb->base.Width, rb->base.Height, 1, 0, rb->cpp);
 	texImage->RowStride = rb->pitch / rb->cpp;
@@ -1423,10 +1408,9 @@ void set_re_cntl_d3d( GLcontext *ctx, int unit, GLboolean use_d3d )
  */
 static void setup_hardware_state(r200ContextPtr rmesa, radeonTexObj *t)
 {
-   int firstlevel = t->mt ? t->mt->firstLevel : 0;
-   const struct gl_texture_image *firstImage = t->base.Image[0][firstlevel];
+   const struct gl_texture_image *firstImage = t->base.Image[0][t->minLod];
    GLint log2Width, log2Height, log2Depth, texelBytes;
-   
+
    if ( t->bo ) {
        return;
    }
@@ -1454,9 +1438,9 @@ static void setup_hardware_state(r200ContextPtr rmesa, radeonTexObj *t)
 	 return;
       }
    }
-   
+
    t->pp_txfilter &= ~R200_MAX_MIP_LEVEL_MASK;
-   t->pp_txfilter |= (t->mt->lastLevel - t->mt->firstLevel) << R200_MAX_MIP_LEVEL_SHIFT;
+   t->pp_txfilter |= (t->maxLod - t->minLod) << R200_MAX_MIP_LEVEL_SHIFT;
 	
    t->pp_txformat &= ~(R200_TXFORMAT_WIDTH_MASK |
 		       R200_TXFORMAT_HEIGHT_MASK |
diff --git a/src/mesa/drivers/dri/r200/radeon_bo.c b/src/mesa/drivers/dri/r200/radeon_bo.c
new file mode 120000
index 0000000000..9448ffee54
--- /dev/null
+++ b/src/mesa/drivers/dri/r200/radeon_bo.c
@@ -0,0 +1 @@
+../radeon/radeon_bo.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r200/radeon_bo_int_drm.h b/src/mesa/drivers/dri/r200/radeon_bo_int_drm.h
new file mode 120000
index 0000000000..029450928b
--- /dev/null
+++ b/src/mesa/drivers/dri/r200/radeon_bo_int_drm.h
@@ -0,0 +1 @@
+../radeon/radeon_bo_int_drm.h
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r200/radeon_cs.c b/src/mesa/drivers/dri/r200/radeon_cs.c
new file mode 120000
index 0000000000..66b7ad1eb0
--- /dev/null
+++ b/src/mesa/drivers/dri/r200/radeon_cs.c
@@ -0,0 +1 @@
+../radeon/radeon_cs.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r200/radeon_cs_int_drm.h b/src/mesa/drivers/dri/r200/radeon_cs_int_drm.h
new file mode 120000
index 0000000000..462f5245d0
--- /dev/null
+++ b/src/mesa/drivers/dri/r200/radeon_cs_int_drm.h
@@ -0,0 +1 @@
+../radeon/radeon_cs_int_drm.h
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r300/Makefile b/src/mesa/drivers/dri/r300/Makefile
index cb0f715fa0..be005bd164 100644
--- a/src/mesa/drivers/dri/r300/Makefile
+++ b/src/mesa/drivers/dri/r300/Makefile
@@ -14,7 +14,7 @@ EGL_SOURCES = server/radeon_egl.c
 endif
 
 ifeq ($(RADEON_LDFLAGS),)
-CS_SOURCES = radeon_cs_space_drm.c
+CS_SOURCES = radeon_cs_space_drm.c radeon_bo.c radeon_cs.c
 endif
 
 COMMON_SOURCES = \
@@ -43,13 +43,14 @@ RADEON_COMMON_SOURCES = \
 
 DRIVER_SOURCES = \
 		 radeon_screen.c \
+		 r300_blit.c \
 		 r300_context.c \
 		 r300_draw.c \
-		 r300_ioctl.c \
 		 r300_cmdbuf.c \
 		 r300_state.c \
 		 r300_render.c \
 		 r300_tex.c \
+		 r300_texcopy.c \
 		 r300_texstate.c \
 		 r300_vertprog.c \
 		 r300_fragprog_common.c \
diff --git a/src/mesa/drivers/dri/r300/compiler/memory_pool.c b/src/mesa/drivers/dri/r300/compiler/memory_pool.c
index 37aa2b6579..76c7c60d8f 100644
--- a/src/mesa/drivers/dri/r300/compiler/memory_pool.c
+++ b/src/mesa/drivers/dri/r300/compiler/memory_pool.c
@@ -71,12 +71,14 @@ static void refill_pool(struct memory_pool * pool)
 void * memory_pool_malloc(struct memory_pool * pool, unsigned int bytes)
 {
 	if (bytes < POOL_LARGE_ALLOC) {
+		void * ptr;
+
 		if (pool->head + bytes > pool->end)
 			refill_pool(pool);
 
 		assert(pool->head + bytes <= pool->end);
 
-		void * ptr = pool->head;
+		ptr = pool->head;
 
 		pool->head += bytes;
 		pool->head = (unsigned char*)(((unsigned long)pool->head + POOL_ALIGN - 1) & ~(POOL_ALIGN - 1));
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_code.c b/src/mesa/drivers/dri/r300/compiler/radeon_code.c
index 1a3d8bb641..853b2becd1 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_code.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_code.c
@@ -143,7 +143,8 @@ unsigned rc_constants_add_immediate_scalar(struct rc_constant_list * c, float da
 
 	for(index = 0; index < c->Count; ++index) {
 		if (c->Constants[index].Type == RC_CONSTANT_IMMEDIATE) {
-			for(unsigned comp = 0; comp < c->Constants[index].Size; ++comp) {
+			unsigned comp;
+			for(comp = 0; comp < c->Constants[index].Size; ++comp) {
 				if (c->Constants[index].u.Immediate[comp] == data) {
 					*swizzle = RC_MAKE_SWIZZLE(comp, comp, comp, comp);
 					return index;
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_code.h b/src/mesa/drivers/dri/r300/compiler/radeon_code.h
index 902b7cfa53..6d979bbaec 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_code.h
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_code.h
@@ -59,7 +59,9 @@ enum {
 	RC_STATE_SHADOW_AMBIENT = 0,
 
 	RC_STATE_R300_WINDOW_DIMENSION,
-	RC_STATE_R300_TEXRECT_FACTOR
+	RC_STATE_R300_TEXRECT_FACTOR,
+	RC_STATE_R300_VIEWPORT_SCALE,
+	RC_STATE_R300_VIEWPORT_OFFSET
 };
 
 struct rc_constant {
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_compiler.c b/src/mesa/drivers/dri/r300/compiler/radeon_compiler.c
index c0e7a7f7a0..272f9072d4 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_compiler.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_compiler.c
@@ -229,15 +229,20 @@ void rc_copy_output(struct radeon_compiler * c, unsigned output, unsigned dup_ou
 /**
  * Introduce standard code fragment to deal with fragment.position.
  */
-void rc_transform_fragment_wpos(struct radeon_compiler * c, unsigned wpos, unsigned new_input)
+void rc_transform_fragment_wpos(struct radeon_compiler * c, unsigned wpos, unsigned new_input,
+                                int full_vtransform)
 {
 	unsigned tempregi = rc_find_free_temporary(c);
+	struct rc_instruction * inst_rcp;
+	struct rc_instruction * inst_mul;
+	struct rc_instruction * inst_mad;
+	struct rc_instruction * inst;
 
 	c->Program.InputsRead &= ~(1 << wpos);
 	c->Program.InputsRead |= 1 << new_input;
 
 	/* perspective divide */
-	struct rc_instruction * inst_rcp = rc_insert_new_instruction(c, &c->Program.Instructions);
+	inst_rcp = rc_insert_new_instruction(c, &c->Program.Instructions);
 	inst_rcp->U.I.Opcode = RC_OPCODE_RCP;
 
 	inst_rcp->U.I.DstReg.File = RC_FILE_TEMPORARY;
@@ -248,7 +253,7 @@ void rc_transform_fragment_wpos(struct radeon_compiler * c, unsigned wpos, unsig
 	inst_rcp->U.I.SrcReg[0].Index = new_input;
 	inst_rcp->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_WWWW;
 
-	struct rc_instruction * inst_mul = rc_insert_new_instruction(c, inst_rcp);
+	inst_mul = rc_insert_new_instruction(c, inst_rcp);
 	inst_mul->U.I.Opcode = RC_OPCODE_MUL;
 
 	inst_mul->U.I.DstReg.File = RC_FILE_TEMPORARY;
@@ -263,7 +268,7 @@ void rc_transform_fragment_wpos(struct radeon_compiler * c, unsigned wpos, unsig
 	inst_mul->U.I.SrcReg[1].Swizzle = RC_SWIZZLE_WWWW;
 
 	/* viewport transformation */
-	struct rc_instruction * inst_mad = rc_insert_new_instruction(c, inst_mul);
+	inst_mad = rc_insert_new_instruction(c, inst_mul);
 	inst_mad->U.I.Opcode = RC_OPCODE_MAD;
 
 	inst_mad->U.I.DstReg.File = RC_FILE_TEMPORARY;
@@ -275,14 +280,19 @@ void rc_transform_fragment_wpos(struct radeon_compiler * c, unsigned wpos, unsig
 	inst_mad->U.I.SrcReg[0].Swizzle = RC_MAKE_SWIZZLE(RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_ZERO);
 
 	inst_mad->U.I.SrcReg[1].File = RC_FILE_CONSTANT;
-	inst_mad->U.I.SrcReg[1].Index = rc_constants_add_state(&c->Program.Constants, RC_STATE_R300_WINDOW_DIMENSION, 0);
 	inst_mad->U.I.SrcReg[1].Swizzle = RC_MAKE_SWIZZLE(RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_ZERO);
 
 	inst_mad->U.I.SrcReg[2].File = RC_FILE_CONSTANT;
-	inst_mad->U.I.SrcReg[2].Index = inst_mad->U.I.SrcReg[1].Index;
 	inst_mad->U.I.SrcReg[2].Swizzle = RC_MAKE_SWIZZLE(RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_ZERO);
 
-	struct rc_instruction * inst;
+	if (full_vtransform) {
+		inst_mad->U.I.SrcReg[1].Index = rc_constants_add_state(&c->Program.Constants, RC_STATE_R300_VIEWPORT_SCALE, 0);
+		inst_mad->U.I.SrcReg[2].Index = rc_constants_add_state(&c->Program.Constants, RC_STATE_R300_VIEWPORT_OFFSET, 0);
+	} else {
+		inst_mad->U.I.SrcReg[1].Index =
+		inst_mad->U.I.SrcReg[2].Index = rc_constants_add_state(&c->Program.Constants, RC_STATE_R300_WINDOW_DIMENSION, 0);
+	}
+
 	for (inst = inst_mad->Next; inst != &c->Program.Instructions; inst = inst->Next) {
 		const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
 		unsigned i;
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_compiler.h b/src/mesa/drivers/dri/r300/compiler/radeon_compiler.h
index 87a732cd90..731adc1af2 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_compiler.h
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_compiler.h
@@ -73,7 +73,8 @@ void rc_calculate_inputs_outputs(struct radeon_compiler * c);
 void rc_move_input(struct radeon_compiler * c, unsigned input, struct rc_src_register new_input);
 void rc_move_output(struct radeon_compiler * c, unsigned output, unsigned new_output, unsigned writemask);
 void rc_copy_output(struct radeon_compiler * c, unsigned output, unsigned dup_output);
-void rc_transform_fragment_wpos(struct radeon_compiler * c, unsigned wpos, unsigned new_input);
+void rc_transform_fragment_wpos(struct radeon_compiler * c, unsigned wpos, unsigned new_input,
+                                int full_vtransform);
 
 struct r300_fragment_program_compiler {
 	struct radeon_compiler Base;
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_pair_regalloc.c b/src/mesa/drivers/dri/r300/compiler/radeon_pair_regalloc.c
index 828d0c8e28..b2fe7f76b2 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_pair_regalloc.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_pair_regalloc.c
@@ -49,7 +49,7 @@ struct register_info {
 
 	unsigned int Used:1;
 	unsigned int Allocated:1;
-	rc_register_file File:3;
+	unsigned int File:3;
 	unsigned int Index:RC_REGISTER_INDEX_BITS;
 };
 
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_program.c b/src/mesa/drivers/dri/r300/compiler/radeon_program.c
index 0dbc5380bb..a3c41d7bd4 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_program.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_program.c
@@ -94,10 +94,11 @@ unsigned int rc_find_free_temporary(struct radeon_compiler * c)
 {
 	char used[RC_REGISTER_MAX_INDEX];
 	unsigned int i;
+	struct rc_instruction * rcinst;
 
 	memset(used, 0, sizeof(used));
 
-	for (struct rc_instruction * rcinst = c->Program.Instructions.Next; rcinst != &c->Program.Instructions; rcinst = rcinst->Next) {
+	for (rcinst = c->Program.Instructions.Next; rcinst != &c->Program.Instructions; rcinst = rcinst->Next) {
 		const struct rc_sub_instruction *inst = &rcinst->U.I;
 		const struct rc_opcode_info *opcode = rc_get_opcode_info(inst->Opcode);
 		unsigned int k;
@@ -168,8 +169,9 @@ void rc_remove_instruction(struct rc_instruction * inst)
 unsigned int rc_recompute_ips(struct radeon_compiler * c)
 {
 	unsigned int ip = 0;
+	struct rc_instruction * inst;
 
-	for(struct rc_instruction * inst = c->Program.Instructions.Next;
+	for(inst = c->Program.Instructions.Next;
 	    inst != &c->Program.Instructions;
 	    inst = inst->Next) {
 		inst->IP = ip++;
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_program.h b/src/mesa/drivers/dri/r300/compiler/radeon_program.h
index 33db3ea0ff..e318867696 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_program.h
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_program.h
@@ -39,7 +39,7 @@
 struct radeon_compiler;
 
 struct rc_src_register {
-	rc_register_file File:3;
+	unsigned int File:3;
 
 	/** Negative values may be used for relative addressing. */
 	signed int Index:(RC_REGISTER_INDEX_BITS+1);
@@ -55,7 +55,7 @@ struct rc_src_register {
 };
 
 struct rc_dst_register {
-	rc_register_file File:3;
+	unsigned int File:3;
 
 	/** Negative values may be used for relative addressing. */
 	signed int Index:(RC_REGISTER_INDEX_BITS+1);
@@ -79,20 +79,20 @@ struct rc_sub_instruction {
 	/**
 	 * Opcode of this instruction, according to \ref rc_opcode enums.
 	 */
-	rc_opcode Opcode:8;
+	unsigned int Opcode:8;
 
 	/**
 	 * Saturate each value of the result to the range [0,1] or [-1,1],
 	 * according to \ref rc_saturate_mode enums.
 	 */
-	rc_saturate_mode SaturateMode:2;
+	unsigned int SaturateMode:2;
 
 	/**
 	 * Writing to the special register RC_SPECIAL_ALU_RESULT
 	 */
 	/*@{*/
-	rc_write_aluresult WriteALUResult:2;
-	rc_compare_func ALUResultCompare:3;
+	unsigned int WriteALUResult:2;
+	unsigned int ALUResultCompare:3;
 	/*@}*/
 
 	/**
@@ -103,7 +103,7 @@ struct rc_sub_instruction {
 	unsigned int TexSrcUnit:5;
 
 	/** Source texture target, one of the \ref rc_texture_target enums */
-	rc_texture_target TexSrcTarget:3;
+	unsigned int TexSrcTarget:3;
 
 	/** True if tex instruction should do shadow comparison */
 	unsigned int TexShadow:1;
@@ -191,7 +191,7 @@ struct rc_src_register lmul_swizzle(unsigned int swizzle, struct rc_src_register
 
 static inline void reset_srcreg(struct rc_src_register* reg)
 {
-	memset(reg, 0, sizeof(reg));
+	memset(reg, 0, sizeof(struct rc_src_register));
 	reg->Swizzle = RC_SWIZZLE_XYZW;
 }
 
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_program_alu.c b/src/mesa/drivers/dri/r300/compiler/radeon_program_alu.c
index 0326d25233..b5c08aea49 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_program_alu.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_program_alu.c
@@ -267,9 +267,9 @@ static void transform_LIT(struct radeon_compiler* c,
 	temp = inst->U.I.DstReg.Index;
 	srctemp = srcreg(RC_FILE_TEMPORARY, temp);
 
-	// tmp.x = max(0.0, Src.x);
-	// tmp.y = max(0.0, Src.y);
-	// tmp.w = clamp(Src.z, -128+eps, 128-eps);
+	/* tmp.x = max(0.0, Src.x); */
+	/* tmp.y = max(0.0, Src.y); */
+	/* tmp.w = clamp(Src.z, -128+eps, 128-eps); */
 	emit2(c, inst->Prev, RC_OPCODE_MAX, 0,
 		dstregtmpmask(temp, RC_MASK_XYW),
 		inst->U.I.SrcReg[0],
@@ -280,7 +280,7 @@ static void transform_LIT(struct radeon_compiler* c,
 		swizzle(srctemp, RC_SWIZZLE_W, RC_SWIZZLE_W, RC_SWIZZLE_W, RC_SWIZZLE_W),
 		negate(srcregswz(RC_FILE_CONSTANT, constant, constant_swizzle)));
 
-	// tmp.w = Pow(tmp.y, tmp.w)
+	/* tmp.w = Pow(tmp.y, tmp.w) */
 	emit1(c, inst->Prev, RC_OPCODE_LG2, 0,
 		dstregtmpmask(temp, RC_MASK_W),
 		swizzle(srctemp, RC_SWIZZLE_Y, RC_SWIZZLE_Y, RC_SWIZZLE_Y, RC_SWIZZLE_Y));
@@ -292,14 +292,14 @@ static void transform_LIT(struct radeon_compiler* c,
 		dstregtmpmask(temp, RC_MASK_W),
 		swizzle(srctemp, RC_SWIZZLE_W, RC_SWIZZLE_W, RC_SWIZZLE_W, RC_SWIZZLE_W));
 
-	// tmp.z = (tmp.x > 0) ? tmp.w : 0.0
+	/* tmp.z = (tmp.x > 0) ? tmp.w : 0.0 */
 	emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode,
 		dstregtmpmask(temp, RC_MASK_Z),
 		negate(swizzle(srctemp, RC_SWIZZLE_X, RC_SWIZZLE_X, RC_SWIZZLE_X, RC_SWIZZLE_X)),
 		swizzle(srctemp, RC_SWIZZLE_W, RC_SWIZZLE_W, RC_SWIZZLE_W, RC_SWIZZLE_W),
 		builtin_zero);
 
-	// tmp.x, tmp.y, tmp.w = 1.0, tmp.x, 1.0
+	/* tmp.x, tmp.y, tmp.w = 1.0, tmp.x, 1.0 */
 	emit1(c, inst->Prev, RC_OPCODE_MOV, inst->U.I.SaturateMode,
 		dstregtmpmask(temp, RC_MASK_XYW),
 		swizzle(srctemp, RC_SWIZZLE_ONE, RC_SWIZZLE_X, RC_SWIZZLE_ONE, RC_SWIZZLE_ONE));
@@ -533,16 +533,16 @@ static void sincos_constants(struct radeon_compiler* c, unsigned int *constants)
 {
 	static const float SinCosConsts[2][4] = {
 		{
-			1.273239545,		// 4/PI
-			-0.405284735,		// -4/(PI*PI)
-			3.141592654,		// PI
-			0.2225			// weight
+			1.273239545,		/* 4/PI */
+			-0.405284735,		/* -4/(PI*PI) */
+			3.141592654,		/* PI */
+			0.2225			/* weight */
 		},
 		{
 			0.75,
 			0.5,
-			0.159154943,		// 1/(2*PI)
-			6.283185307		// 2*PI
+			0.159154943,		/* 1/(2*PI) */
+			6.283185307		/* 2*PI */
 		}
 	};
 	int i;
@@ -560,23 +560,23 @@ static void sincos_constants(struct radeon_compiler* c, unsigned int *constants)
  * MAD dest, tmp.y, weight, tmp.x
  */
 static void sin_approx(
-	struct radeon_compiler* c, struct rc_instruction * before,
+	struct radeon_compiler* c, struct rc_instruction * inst,
 	struct rc_dst_register dst, struct rc_src_register src, const unsigned int* constants)
 {
 	unsigned int tempreg = rc_find_free_temporary(c);
 
-	emit2(c, before, RC_OPCODE_MUL, 0, dstregtmpmask(tempreg, RC_MASK_XY),
+	emit2(c, inst->Prev, RC_OPCODE_MUL, 0, dstregtmpmask(tempreg, RC_MASK_XY),
 		swizzle(src, RC_SWIZZLE_X, RC_SWIZZLE_X, RC_SWIZZLE_X, RC_SWIZZLE_X),
 		srcreg(RC_FILE_CONSTANT, constants[0]));
-	emit3(c, before, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_X),
+	emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_X),
 		swizzle(srcreg(RC_FILE_TEMPORARY, tempreg), RC_SWIZZLE_Y, RC_SWIZZLE_Y, RC_SWIZZLE_Y, RC_SWIZZLE_Y),
 		absolute(swizzle(src, RC_SWIZZLE_X, RC_SWIZZLE_X, RC_SWIZZLE_X, RC_SWIZZLE_X)),
 		swizzle(srcreg(RC_FILE_TEMPORARY, tempreg), RC_SWIZZLE_X, RC_SWIZZLE_X, RC_SWIZZLE_X, RC_SWIZZLE_X));
-	emit3(c, before, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_Y),
+	emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_Y),
 		swizzle(srcreg(RC_FILE_TEMPORARY, tempreg), RC_SWIZZLE_X, RC_SWIZZLE_X, RC_SWIZZLE_X, RC_SWIZZLE_X),
 		absolute(swizzle(srcreg(RC_FILE_TEMPORARY, tempreg), RC_SWIZZLE_X, RC_SWIZZLE_X, RC_SWIZZLE_X, RC_SWIZZLE_X)),
 		negate(swizzle(srcreg(RC_FILE_TEMPORARY, tempreg), RC_SWIZZLE_X, RC_SWIZZLE_X, RC_SWIZZLE_X, RC_SWIZZLE_X)));
-	emit3(c, before, RC_OPCODE_MAD, 0, dst,
+	emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dst,
 		swizzle(srcreg(RC_FILE_TEMPORARY, tempreg), RC_SWIZZLE_Y, RC_SWIZZLE_Y, RC_SWIZZLE_Y, RC_SWIZZLE_Y),
 		swizzle(srcreg(RC_FILE_CONSTANT, constants[0]), RC_SWIZZLE_W, RC_SWIZZLE_W, RC_SWIZZLE_W, RC_SWIZZLE_W),
 		swizzle(srcreg(RC_FILE_TEMPORARY, tempreg), RC_SWIZZLE_X, RC_SWIZZLE_X, RC_SWIZZLE_X, RC_SWIZZLE_X));
@@ -602,9 +602,9 @@ int radeonTransformTrigSimple(struct radeon_compiler* c,
 	sincos_constants(c, constants);
 
 	if (inst->U.I.Opcode == RC_OPCODE_COS) {
-		// MAD tmp.x, src, 1/(2*PI), 0.75
-		// FRC tmp.x, tmp.x
-		// MAD tmp.z, tmp.x, 2*PI, -PI
+		/* MAD tmp.x, src, 1/(2*PI), 0.75 */
+		/* FRC tmp.x, tmp.x */
+		/* MAD tmp.z, tmp.x, 2*PI, -PI */
 		emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_W),
 			swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_X, RC_SWIZZLE_X, RC_SWIZZLE_X, RC_SWIZZLE_X),
 			swizzle(srcreg(RC_FILE_CONSTANT, constants[1]), RC_SWIZZLE_Z, RC_SWIZZLE_Z, RC_SWIZZLE_Z, RC_SWIZZLE_Z),
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_program_pair.h b/src/mesa/drivers/dri/r300/compiler/radeon_program_pair.h
index 1600598428..6685ade3ea 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_program_pair.h
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_program_pair.h
@@ -52,12 +52,12 @@ struct r300_fragment_program_compiler;
 
 struct radeon_pair_instruction_source {
 	unsigned int Used:1;
-	rc_register_file File:3;
+	unsigned int File:3;
 	unsigned int Index:RC_REGISTER_INDEX_BITS;
 };
 
 struct radeon_pair_instruction_rgb {
-	rc_opcode Opcode:8;
+	unsigned int Opcode:8;
 	unsigned int DestIndex:RC_REGISTER_INDEX_BITS;
 	unsigned int WriteMask:3;
 	unsigned int OutputWriteMask:3;
@@ -74,7 +74,7 @@ struct radeon_pair_instruction_rgb {
 };
 
 struct radeon_pair_instruction_alpha {
-	rc_opcode Opcode:8;
+	unsigned int Opcode:8;
 	unsigned int DestIndex:RC_REGISTER_INDEX_BITS;
 	unsigned int WriteMask:1;
 	unsigned int OutputWriteMask:1;
@@ -95,8 +95,8 @@ struct rc_pair_instruction {
 	struct radeon_pair_instruction_rgb RGB;
 	struct radeon_pair_instruction_alpha Alpha;
 
-	rc_write_aluresult WriteALUResult:2;
-	rc_compare_func ALUResultCompare:3;
+	unsigned int WriteALUResult:2;
+	unsigned int ALUResultCompare:3;
 };
 
 
diff --git a/src/mesa/drivers/dri/r300/r300_blit.c b/src/mesa/drivers/dri/r300/r300_blit.c
new file mode 100644
index 0000000000..2eec27e900
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/r300_blit.c
@@ -0,0 +1,622 @@
+/*
+ * Copyright (C) 2009 Maciej Cencora <m.cencora@gmail.com>
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "radeon_common.h"
+#include "r300_context.h"
+
+#include "r300_blit.h"
+#include "r300_cmdbuf.h"
+#include "r300_emit.h"
+#include "r300_tex.h"
+#include "compiler/radeon_compiler.h"
+#include "compiler/radeon_opcodes.h"
+
+static void vp_ins_outs(struct r300_vertex_program_compiler *c)
+{   /* Installed as SetHwInputOutput by create_vertex_program(): fixed index map. */
+    c->code->inputs[VERT_ATTRIB_POS] = 0;   /* position input   -> index 0 */
+    c->code->inputs[VERT_ATTRIB_TEX0] = 1;  /* texcoord 0 input -> index 1 */
+    c->code->outputs[VERT_RESULT_HPOS] = 0; /* position output   -> index 0 */
+    c->code->outputs[VERT_RESULT_TEX0] = 1; /* texcoord 0 output -> index 1 */
+}
+
+static void fp_allocate_hw_inputs(
+    struct r300_fragment_program_compiler * c,
+    void (*allocate)(void * data, unsigned input, unsigned hwreg),
+    void * mydata)
+{   /* Installed as AllocateHwInputs by create_fragment_program(); `c` is not used. */
+    allocate(mydata, FRAG_ATTRIB_TEX0, 0); /* the only input read, TEX0, is requested as hwreg 0 */
+}
+
+/* Append "MOV output[dst_output].xyzw, input[src_input].xyzw" at the program's tail. */
+static void blit_vp_append_mov(struct radeon_compiler *rc, unsigned dst_output, unsigned src_input)
+{
+    struct rc_instruction *mov =
+        rc_insert_new_instruction(rc, rc->Program.Instructions.Prev);
+
+    mov->U.I.Opcode = RC_OPCODE_MOV;
+
+    mov->U.I.DstReg.File = RC_FILE_OUTPUT;
+    mov->U.I.DstReg.Index = dst_output;
+    mov->U.I.DstReg.WriteMask = RC_MASK_XYZW;
+    mov->U.I.DstReg.RelAddr = 0;
+
+    mov->U.I.SrcReg[0].File = RC_FILE_INPUT;
+    mov->U.I.SrcReg[0].Index = src_input;
+    mov->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW;
+    mov->U.I.SrcReg[0].Abs = 0;
+    mov->U.I.SrcReg[0].Negate = 0;
+    mov->U.I.SrcReg[0].RelAddr = 0;
+}
+
+/* Build the blit vertex program (position and texcoord 0 copied through) for r300->blit.vp_code. */
+static void create_vertex_program(struct r300_context *r300)
+{
+    struct r300_vertex_program_compiler compiler;
+    const unsigned written = (1 << VERT_RESULT_HPOS) | (1 << VERT_RESULT_TEX0);
+
+    rc_init(&compiler.Base);
+
+    blit_vp_append_mov(&compiler.Base, VERT_RESULT_HPOS, VERT_ATTRIB_POS);
+    blit_vp_append_mov(&compiler.Base, VERT_RESULT_TEX0, VERT_ATTRIB_TEX0);
+
+    compiler.code = &r300->blit.vp_code;
+    compiler.SetHwInputOutput = vp_ins_outs;
+    compiler.RequiredOutputs = written;
+    compiler.Base.Program.OutputsWritten = written;
+    compiler.Base.Program.InputsRead = (1 << VERT_ATTRIB_POS) | (1 << VERT_ATTRIB_TEX0);
+
+    r3xx_compile_vertex_program(&compiler);
+}
+
+/* Build the blit fragment program: one 2D TEX from unit 0 written to the colour output. */
+static void create_fragment_program(struct r300_context *r300)
+{
+    struct r300_fragment_program_compiler compiler;
+    struct rc_sub_instruction *tex;
+
+    rc_init(&compiler.Base);
+    tex = &rc_insert_new_instruction(&compiler.Base, compiler.Base.Program.Instructions.Prev)->U.I;
+    tex->Opcode = RC_OPCODE_TEX;
+    tex->TexSrcUnit = 0;
+    tex->TexSrcTarget = RC_TEXTURE_2D;
+    tex->DstReg.File = RC_FILE_OUTPUT;
+    tex->DstReg.Index = FRAG_RESULT_COLOR;
+    tex->DstReg.WriteMask = RC_MASK_XYZW;
+    tex->SrcReg[0].File = RC_FILE_INPUT;
+    tex->SrcReg[0].Index = FRAG_ATTRIB_TEX0;
+    tex->SrcReg[0].Swizzle = RC_SWIZZLE_XYZW;
+    tex->SrcReg[0].Abs = 0;
+    tex->SrcReg[0].Negate = 0;
+    tex->SrcReg[0].RelAddr = 0;
+
+    compiler.code = &r300->blit.fp_code;
+    compiler.is_r500 = (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515);
+    compiler.AllocateHwInputs = fp_allocate_hw_inputs;
+    compiler.OutputColor = FRAG_RESULT_COLOR;
+    compiler.OutputDepth = FRAG_RESULT_DEPTH;
+    compiler.Base.Program.InputsRead = (1 << FRAG_ATTRIB_TEX0);
+
+    r3xx_compile_fragment_program(&compiler);
+}
+
+void r300_blit_init(struct r300_context *r300)
+{   /* Compile the two blit programs; their compilers are pointed at r300->blit.vp_code / fp_code. */
+    create_vertex_program(r300);
+    create_fragment_program(r300);
+}
+
+static void r300_emit_tx_setup(struct r300_context *r300,
+                               gl_format mesa_format,
+                               struct radeon_bo *bo,
+                               intptr_t offset,
+                               unsigned width,
+                               unsigned height,
+                               unsigned pitch)
+{   /* Emit the texture unit 0 state (the *_0 registers) for sampling the buffer `bo` at `offset`. */
+    BATCH_LOCALS(&r300->radeon);
+
+    assert(width <= 2048);    /* preconditions below are only enforced when asserts are compiled in */
+    assert(height <= 2048);
+    assert(r300TranslateTexFormat(mesa_format) >= 0); /* presumably negative means "no hardware format" -- TODO confirm */
+    assert(offset % 32 == 0); /* offset has to be a multiple of 32 */
+
+    BEGIN_BATCH(17); /* 17 dwords reserved for everything up to END_BATCH */
+    OUT_BATCH_REGVAL(R300_TX_FILTER0_0,
+                     (R300_TX_CLAMP_TO_EDGE  << R300_TX_WRAP_S_SHIFT) | /* clamp-to-edge on S, T and R */
+                     (R300_TX_CLAMP_TO_EDGE  << R300_TX_WRAP_T_SHIFT) |
+                     (R300_TX_CLAMP_TO_EDGE  << R300_TX_WRAP_R_SHIFT) |
+                     R300_TX_MIN_FILTER_MIP_NONE | /* no mip filter */
+                     R300_TX_MIN_FILTER_LINEAR |   /* linear min and mag filters */
+                     R300_TX_MAG_FILTER_LINEAR |
+                     (0 << 28)); /* NOTE(review): field at bit 28 left 0; its meaning is not visible here */
+    OUT_BATCH_REGVAL(R300_TX_FILTER1_0, 0);
+    OUT_BATCH_REGVAL(R300_TX_SIZE_0,
+                     ((width-1) << R300_TX_WIDTHMASK_SHIFT) |   /* size fields take value minus one */
+                     ((height-1) << R300_TX_HEIGHTMASK_SHIFT) |
+                     (0 << R300_TX_DEPTHMASK_SHIFT) |
+                     (0 << R300_TX_MAX_MIP_LEVEL_SHIFT) | /* max mip level 0 */
+                     R300_TX_SIZE_TXPITCH_EN); /* presumably enables the pitch written to TX_FORMAT2 below */
+
+    OUT_BATCH_REGVAL(R300_TX_FORMAT_0, r300TranslateTexFormat(mesa_format));
+    OUT_BATCH_REGVAL(R300_TX_FORMAT2_0, pitch - 1); /* pitch is also written minus one */
+    OUT_BATCH_REGSEQ(R300_TX_OFFSET_0, 1);
+    OUT_BATCH_RELOC(0, bo, offset, RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0); /* relocation against bo; GTT|VRAM presumably the read domains */
+
+    OUT_BATCH_REGSEQ(R300_TX_INVALTAGS, 2); /* two values follow for this register sequence */
+    OUT_BATCH(0);
+    OUT_BATCH(1);
+
+    END_BATCH();
+}
+
+#define EASY_US_FORMAT(FMT, C0, C1, C2, C3, SIGN) \
+    (FMT  | R500_C0_SEL_##C0 | R500_C1_SEL_##C1 | \
+    R500_C2_SEL_##C2 | R500_C3_SEL_##C3 | R500_OUT_SIGN(SIGN)) /* ORs format, four channel selects and the sign field */
+
+static uint32_t mesa_format_to_us_format(gl_format mesa_format)
+{   /* Returns the value written to R500_US_OUT_FMT_0 for a destination of format mesa_format. */
+    switch(mesa_format)
+    {
+        case MESA_FORMAT_RGBA8888: /* x (author's mark; meaning not recorded) */
+            return EASY_US_FORMAT(R500_OUT_FMT_C4_8, A, B, G, R, 0);
+        case MESA_FORMAT_RGB565: /* x */ /* NOTE(review): 16-bit formats share the C4_8 value -- confirm */
+        case MESA_FORMAT_ARGB1555: /* x */
+        case MESA_FORMAT_RGBA8888_REV: /* x */
+            return EASY_US_FORMAT(R500_OUT_FMT_C4_8, R, G, B, A, 0);
+        case MESA_FORMAT_ARGB8888: /* x */
+            return EASY_US_FORMAT(R500_OUT_FMT_C4_8, B, G, R, A, 0);
+        case MESA_FORMAT_ARGB8888_REV:
+            return EASY_US_FORMAT(R500_OUT_FMT_C4_8, A, R, G, B, 0);
+        case MESA_FORMAT_XRGB8888: /* same value as ARGB8888_REV above */
+            return EASY_US_FORMAT(R500_OUT_FMT_C4_8, A, R, G, B, 0);
+
+        case MESA_FORMAT_RGB332:
+            return EASY_US_FORMAT(R500_OUT_FMT_C_3_3_2, A, R, G, B, 0);
+
+        case MESA_FORMAT_RGBA_FLOAT32:
+            return EASY_US_FORMAT(R500_OUT_FMT_C4_32_FP, R, G, B, A, 0);
+        case MESA_FORMAT_RGBA_FLOAT16:
+            return EASY_US_FORMAT(R500_OUT_FMT_C4_16_FP, R, G, B, A, 0);
+        case MESA_FORMAT_ALPHA_FLOAT32:
+            return EASY_US_FORMAT(R500_OUT_FMT_C_32_FP, A, A, A, A, 0);
+        case MESA_FORMAT_ALPHA_FLOAT16:
+            return EASY_US_FORMAT(R500_OUT_FMT_C_16_FP, A, A, A, A, 0);
+
+        case MESA_FORMAT_SIGNED_RGBA8888: /* signed formats pass 0xf to R500_OUT_SIGN */
+            return EASY_US_FORMAT(R500_OUT_FMT_C4_8, R, G, B, A, 0xf);
+        case MESA_FORMAT_SIGNED_RGBA8888_REV:
+            return EASY_US_FORMAT(R500_OUT_FMT_C4_8, A, B, G, R, 0xf);
+        case MESA_FORMAT_SIGNED_RGBA_16:
+            return EASY_US_FORMAT(R500_OUT_FMT_C4_16, R, G, B, A, 0xf);
+
+        default: /* unsupported: report it, trap when asserts are enabled, otherwise return 0 */
+            fprintf(stderr, "Unsupported format %s for US output\n", _mesa_get_format_name(mesa_format));
+            assert(0);
+            return 0;
+    }
+}
+#undef EASY_US_FORMAT
+
+static void r500_emit_fp_setup(struct r300_context *r300,
+                               struct r500_fragment_program_code *fp,
+                               gl_format dst_format)
+{   /* Upload the R500 blit fragment program `fp`, then emit the US registers that describe it. */
+    BATCH_LOCALS(&r300->radeon); /* kept first, as in every sibling emit function: no declaration after a statement */
+    r500_emit_fp(r300, (uint32_t *)fp->inst, (fp->inst_end + 1) * 6, 0, 0, 0); /* inst_end + 1 instructions, presumably 6 dwords each */
+
+    BEGIN_BATCH(10); /* 1 + 3 for the REGSEQ, 2 for each of the three REGVALs */
+    OUT_BATCH_REGSEQ(R500_US_CODE_ADDR, 3);
+    OUT_BATCH(R500_US_CODE_START_ADDR(0) | R500_US_CODE_END_ADDR(fp->inst_end));   /* code spans 0..inst_end */
+    OUT_BATCH(R500_US_CODE_RANGE_ADDR(0) | R500_US_CODE_RANGE_SIZE(fp->inst_end));
+    OUT_BATCH(0);
+    OUT_BATCH_REGVAL(R500_US_CONFIG, 0);
+    OUT_BATCH_REGVAL(R500_US_OUT_FMT_0, mesa_format_to_us_format(dst_format)); /* output packing for the destination format */
+    OUT_BATCH_REGVAL(R500_US_PIXSIZE, fp->max_temp_idx);
+    END_BATCH();
+}
+
+static void r500_emit_rs_setup(struct r300_context *r300)
+{   /* Emit the R500 RS_COUNT / RS_INST_0 / RS_IP_0 values used by the blit. */
+    BATCH_LOCALS(&r300->radeon);
+
+    BEGIN_BATCH(7); /* 1 + 2 for the REGSEQ, 2 for each REGVAL */
+    OUT_BATCH_REGSEQ(R300_RS_COUNT, 2);
+    OUT_BATCH((4 << R300_IT_COUNT_SHIFT) | R300_HIRES_EN); /* IT_COUNT field = 4; presumably the four texcoord components -- TODO confirm */
+    OUT_BATCH(0); /* second register of the sequence is written as 0 */
+    OUT_BATCH_REGVAL(R500_RS_INST_0,
+                     (0 << R500_RS_INST_TEX_ID_SHIFT) |
+                     (0 << R500_RS_INST_TEX_ADDR_SHIFT) |
+                     R500_RS_INST_TEX_CN_WRITE |     /* texture write enabled ... */
+                     R500_RS_INST_COL_CN_NO_WRITE);  /* ... colour write disabled */
+    OUT_BATCH_REGVAL(R500_RS_IP_0,
+                     (0 << R500_RS_IP_TEX_PTR_S_SHIFT) | /* S, T, R, Q pointers = 0, 1, 2, 3 */
+                     (1 << R500_RS_IP_TEX_PTR_T_SHIFT) |
+                     (2 << R500_RS_IP_TEX_PTR_R_SHIFT) |
+                     (3 << R500_RS_IP_TEX_PTR_Q_SHIFT));
+    END_BATCH();
+}
+
+static void r300_emit_fp_setup(struct r300_context *r300,
+                               struct r300_fragment_program_code *code,
+                               gl_format dst_format)
+{   /* Emit the compiled R300-class fragment program `code` followed by its US configuration. */
+    unsigned i;
+    BATCH_LOCALS(&r300->radeon);
+
+    BEGIN_BATCH((code->alu.length + 1) * 4 + code->tex.length + 1 + 11); /* 4 ALU tables + TEX table (1 header + entries each), then 11 fixed dwords */
+
+    OUT_BATCH_REGSEQ(R300_US_ALU_RGB_INST_0, code->alu.length); /* the four ALU tables are emitted one field at a time */
+    for (i = 0; i < code->alu.length; i++) {
+        OUT_BATCH(code->alu.inst[i].rgb_inst);
+    }
+    OUT_BATCH_REGSEQ(R300_US_ALU_RGB_ADDR_0, code->alu.length);
+    for (i = 0; i < code->alu.length; i++) {
+        OUT_BATCH(code->alu.inst[i].rgb_addr);
+    }
+    OUT_BATCH_REGSEQ(R300_US_ALU_ALPHA_INST_0, code->alu.length);
+    for (i = 0; i < code->alu.length; i++) {
+        OUT_BATCH(code->alu.inst[i].alpha_inst);
+    }
+    OUT_BATCH_REGSEQ(R300_US_ALU_ALPHA_ADDR_0, code->alu.length);
+    for (i = 0; i < code->alu.length; i++) {
+        OUT_BATCH(code->alu.inst[i].alpha_addr);
+    }
+
+    OUT_BATCH_REGSEQ(R300_US_TEX_INST_0, code->tex.length);
+    OUT_BATCH_TABLE(code->tex.inst, code->tex.length); /* TEX instructions are written as one table */
+
+    OUT_BATCH_REGSEQ(R300_US_CONFIG, 3); /* 1 + 3 dwords */
+    OUT_BATCH(R300_PFS_CNTL_FIRST_NODE_HAS_TEX);
+    OUT_BATCH(code->pixsize);
+    OUT_BATCH(code->code_offset);
+    OUT_BATCH_REGSEQ(R300_US_CODE_ADDR_0, 4); /* 1 + 4 dwords */
+    OUT_BATCH_TABLE(code->code_addr, 4);
+    OUT_BATCH_REGVAL(R500_US_OUT_FMT_0, mesa_format_to_us_format(dst_format)); /* NOTE(review): R500-named macro on the R300 path -- confirm same register */
+    END_BATCH();
+}
+
+static void r300_emit_rs_setup(struct r300_context *r300)
+{   /* R300-class counterpart of r500_emit_rs_setup(): RS_COUNT, RS_INST_0 and RS_IP_0 for the blit. */
+    BATCH_LOCALS(&r300->radeon);
+
+    BEGIN_BATCH(7); /* 1 + 2 for the REGSEQ, 2 for each REGVAL */
+    OUT_BATCH_REGSEQ(R300_RS_COUNT, 2);
+    OUT_BATCH((4 << R300_IT_COUNT_SHIFT) | R300_HIRES_EN); /* same RS_COUNT value as the R500 path */
+    OUT_BATCH(0); /* second register of the sequence is written as 0 */
+    OUT_BATCH_REGVAL(R300_RS_INST_0,
+                     R300_RS_INST_TEX_ID(0) |
+                     R300_RS_INST_TEX_ADDR(0) |
+                     R300_RS_INST_TEX_CN_WRITE); /* texture write enabled */
+    OUT_BATCH_REGVAL(R300_RS_IP_0,
+                     R300_RS_TEX_PTR(0) |
+                     R300_RS_SEL_S(R300_RS_SEL_C0) | /* S and T select C0 and C1 ... */
+                     R300_RS_SEL_T(R300_RS_SEL_C1) |
+                     R300_RS_SEL_R(R300_RS_SEL_K0) | /* ... R and Q select K0 and K1 (presumably constants 0 and 1 -- TODO confirm) */
+                     R300_RS_SEL_Q(R300_RS_SEL_K1));
+    END_BATCH();
+}
+
+static void emit_pvs_setup(struct r300_context *r300,
+                           uint32_t *vp_code,
+                           unsigned vp_len)
+{   /* Upload vp_code and emit the PVS code control registers; vp_len - 1 is used, so vp_len must be >= 1. */
+    BATCH_LOCALS(&r300->radeon);
+
+    r300_emit_vpu(r300, vp_code, vp_len * 4, R300_PVS_CODE_START); /* length passed as vp_len * 4; presumably 4 dwords per instruction */
+
+    BEGIN_BATCH(4); /* 1 + 3 for the REGSEQ */
+    OUT_BATCH_REGSEQ(R300_VAP_PVS_CODE_CNTL_0, 3);
+    OUT_BATCH((0 << R300_PVS_FIRST_INST_SHIFT) |                    /* first instruction: 0 */
+              ((vp_len - 1)  << R300_PVS_XYZW_VALID_INST_SHIFT) |   /* both other fields: last instruction index */
+              ((vp_len - 1)<< R300_PVS_LAST_INST_SHIFT));
+    OUT_BATCH(0); /* second register of the sequence is written as 0 */
+    OUT_BATCH((vp_len - 1) << R300_PVS_LAST_VTX_SRC_INST_SHIFT);
+    END_BATCH();
+}
+
+static void emit_vap_setup(struct r300_context *r300)
+{
+    BATCH_LOCALS(&r300->radeon);
+    /* Vertex input/output layout for the blit quad: two FLOAT_2 input streams. */
+    BEGIN_BATCH(12); /* 12 = (1 + 2) + 2 + 2 + 2 + (1 + 2) dwords */
+    OUT_BATCH_REGSEQ(R300_SE_VTE_CNTL, 2);
+    OUT_BATCH(R300_VTX_XY_FMT | R300_VTX_Z_FMT);
+    OUT_BATCH(4); /* register following SE_VTE_CNTL; presumably the vertex size in dwords - confirm */
+
+    OUT_BATCH_REGVAL(R300_VAP_PSC_SGN_NORM_CNTL, 0xaaaaaaaa);
+    OUT_BATCH_REGVAL(R300_VAP_PROG_STREAM_CNTL_0, /* low half: stream to vector 0; high half: stream to vector 1, marked last */
+                     ((R300_DATA_TYPE_FLOAT_2 | (0 << R300_DST_VEC_LOC_SHIFT)) << 0) |
+                     (((1 << R300_DST_VEC_LOC_SHIFT) | R300_DATA_TYPE_FLOAT_2 | R300_LAST_VEC) << 16));
+    OUT_BATCH_REGVAL(R300_VAP_PROG_STREAM_CNTL_EXT_0, /* both streams: swizzle (X, Y, 0.0, 1.0), all four write enables set */
+                    ((((R300_SWIZZLE_SELECT_X << R300_SWIZZLE_SELECT_X_SHIFT) |
+                       (R300_SWIZZLE_SELECT_Y << R300_SWIZZLE_SELECT_Y_SHIFT) |
+                       (R300_SWIZZLE_SELECT_FP_ZERO << R300_SWIZZLE_SELECT_Z_SHIFT) |
+                       (R300_SWIZZLE_SELECT_FP_ONE << R300_SWIZZLE_SELECT_W_SHIFT) | 
+                       (0xf << R300_WRITE_ENA_SHIFT) ) << 0) |
+                     (((R300_SWIZZLE_SELECT_X << R300_SWIZZLE_SELECT_X_SHIFT) |
+                       (R300_SWIZZLE_SELECT_Y << R300_SWIZZLE_SELECT_Y_SHIFT) |
+                       (R300_SWIZZLE_SELECT_FP_ZERO << R300_SWIZZLE_SELECT_Z_SHIFT) |
+                       (R300_SWIZZLE_SELECT_FP_ONE << R300_SWIZZLE_SELECT_W_SHIFT) |
+                       (0xf << R300_WRITE_ENA_SHIFT) ) << 16) ) );
+    OUT_BATCH_REGSEQ(R300_VAP_OUTPUT_VTX_FMT_0, 2); /* output format registers 0 and 1 */
+    OUT_BATCH(R300_VAP_OUTPUT_VTX_FMT_0__POS_PRESENT);
+    OUT_BATCH(R300_VAP_OUTPUT_VTX_FMT_1__4_COMPONENTS);
+    END_BATCH();
+}
+
+static GLboolean validate_buffers(struct r300_context *r300,
+                                  struct radeon_bo *src_bo,
+                                  struct radeon_bo *dst_bo)
+{
+    int ret; /* result of the space check below; non-zero is treated as failure */
+    radeon_cs_space_add_persistent_bo(r300->radeon.cmdbuf.cs,
+                                      src_bo, RADEON_GEM_DOMAIN_VRAM, 0);
+    /* NOTE(review): src passes VRAM as the 3rd argument, dst as the 4th - presumably read vs. write domain; confirm */
+    radeon_cs_space_add_persistent_bo(r300->radeon.cmdbuf.cs,
+                                      dst_bo, 0, RADEON_GEM_DOMAIN_VRAM);
+    /* Presumably checks the space accounting with the first reserved DMA bo (GTT) included - confirm */
+    ret = radeon_cs_space_check_with_bo(r300->radeon.cmdbuf.cs,
+                                        first_elem(&r300->radeon.dma.reserved)->bo,
+                                        RADEON_GEM_DOMAIN_GTT, 0);
+    if (ret)
+        return GL_FALSE;
+
+    return GL_TRUE;
+}
+
+/**
+ * Calculate normalized texcoords for the given image region; with flip_y the
+ * y range is mirrored around 1.0. Output values are [minx, maxx, miny, maxy]
+ */
+static void calc_tex_coords(float img_width, float img_height,
+                            float x, float y,
+                            float reg_width, float reg_height,
+                            unsigned flip_y, float *buf)
+{
+    buf[0] = x / img_width; /* no guard against a zero img_width or img_height */
+    buf[1] = buf[0] + reg_width / img_width;
+    buf[2] = y / img_height;
+    buf[3] = buf[2] + reg_height / img_height;
+    if (flip_y)
+    {
+        float tmp = buf[2]; /* keep the old min; buf[2] is overwritten next */
+        buf[2] = 1.0 - buf[3]; /* mirrored: the old max becomes the new min */
+        buf[3] = 1.0 - tmp;
+    }
+}
+
+static void emit_draw_packet(struct r300_context *r300,
+                             unsigned src_width, unsigned src_height,
+                             unsigned src_x_offset, unsigned src_y_offset,
+                             unsigned dst_x_offset, unsigned dst_y_offset,
+                             unsigned reg_width, unsigned reg_height,
+                             unsigned flip_y)
+{
+    float texcoords[4];
+    /* Draw one textured quad: texcoords select the source region, positions the destination region. */
+    calc_tex_coords(src_width, src_height,
+                    src_x_offset, src_y_offset,
+                    reg_width, reg_height,
+                    flip_y, texcoords);
+    /* 4 vertices of (x, y, s, t); dst_y_offset is paired with the max t, dst_y_offset + reg_height with the min t */
+    float verts[] = { dst_x_offset, dst_y_offset,
+                      texcoords[0], texcoords[3],
+                      dst_x_offset, dst_y_offset + reg_height,
+                      texcoords[0], texcoords[2],
+                      dst_x_offset + reg_width, dst_y_offset + reg_height,
+                      texcoords[1], texcoords[2],
+                      dst_x_offset + reg_width, dst_y_offset,
+                      texcoords[1], texcoords[3] };
+
+    BATCH_LOCALS(&r300->radeon);
+
+    BEGIN_BATCH(19); /* NOTE(review): 16 vertex dwords + VF_CNTL = 17; presumably OUT_BATCH_PACKET3 accounts for the other 2 - confirm */
+    OUT_BATCH_PACKET3(R300_PACKET3_3D_DRAW_IMMD_2, 16);
+    OUT_BATCH(R300_VAP_VF_CNTL__PRIM_WALK_VERTEX_EMBEDDED |
+              (4 << 16) | R300_VAP_VF_CNTL__PRIM_QUADS); /* 4 vertices embedded in the packet, drawn as a quad */
+    OUT_BATCH_TABLE(verts, 16);
+    END_BATCH();
+}
+
+static void other_stuff(struct r300_context *r300)
+{
+    BATCH_LOCALS(&r300->radeon);
+    /* Remaining fixed-function state for the blit: fog, alpha func, blend and Z control registers are written as 0. */
+    BEGIN_BATCH(15); /* 15 = 4 * 2 + (1 + 2) + 2 + 2 dwords */
+    OUT_BATCH_REGVAL(R300_GA_POLY_MODE,
+                     R300_GA_POLY_MODE_FRONT_PTYPE_TRI | R300_GA_POLY_MODE_BACK_PTYPE_TRI);
+    OUT_BATCH_REGVAL(R300_SU_CULL_MODE, R300_FRONT_FACE_CCW);
+    OUT_BATCH_REGVAL(R300_FG_FOG_BLEND, 0);
+    OUT_BATCH_REGVAL(R300_FG_ALPHA_FUNC, 0);
+    OUT_BATCH_REGSEQ(R300_RB3D_CBLEND, 2); /* RB3D_CBLEND and the register that follows it */
+    OUT_BATCH(0x0);
+    OUT_BATCH(0x0);
+    OUT_BATCH_REGVAL(R300_VAP_CLIP_CNTL, R300_CLIP_DISABLE);
+    OUT_BATCH_REGVAL(R300_ZB_CNTL, 0);
+    END_BATCH();
+}
+
+static void emit_cb_setup(struct r300_context *r300,
+                          struct radeon_bo *bo,
+                          intptr_t offset,
+                          gl_format mesa_format,
+                          unsigned pitch,
+                          unsigned width,
+                          unsigned height)
+{
+    BATCH_LOCALS(&r300->radeon);
+    /* Set up the destination color buffer and a scissor rectangle covering width x height. */
+    unsigned x1, y1, x2, y2;
+    x1 = 0;
+    y1 = 0;
+    x2 = width - 1; /* inclusive bottom-right corner; wraps if width is 0 */
+    y2 = height - 1; /* wraps if height is 0 */
+    /* Chips older than RV515 get R300_SCISSORS_OFFSET added to every scissor coordinate. */
+    if (r300->radeon.radeonScreen->chip_family < CHIP_FAMILY_RV515) {
+        x1 += R300_SCISSORS_OFFSET;
+        y1 += R300_SCISSORS_OFFSET;
+        x2 += R300_SCISSORS_OFFSET;
+        y2 += R300_SCISSORS_OFFSET;
+    }
+
+    r300_emit_cb_setup(r300, bo, offset, mesa_format, /* cpp and pitch arguments are derived from the format */
+                       _mesa_get_format_bytes(mesa_format),
+                       _mesa_format_row_stride(mesa_format, pitch));
+
+    BEGIN_BATCH_NO_AUTOSTATE(5); /* 5 = (1 + 2) + 2 dwords */
+    OUT_BATCH_REGSEQ(R300_SC_SCISSORS_TL, 2); /* top-left register, then the one following it */
+    OUT_BATCH((x1 << R300_SCISSORS_X_SHIFT)|(y1 << R300_SCISSORS_Y_SHIFT));
+    OUT_BATCH((x2 << R300_SCISSORS_X_SHIFT)|(y2 << R300_SCISSORS_Y_SHIFT));
+    OUT_BATCH_REGVAL(R300_RB3D_CCTL, 0);
+    END_BATCH();
+}
+
+/**
+ * Copy a region of [@a reg_width x @a reg_height] pixels from source buffer
+ * to destination buffer.
+ * @param[in] r300 r300 context
+ * @param[in] src_bo source radeon buffer object
+ * @param[in] src_offset offset of the source image in the @a src_bo
+ * @param[in] src_mesaformat source image format
+ * @param[in] src_pitch aligned source image width
+ * @param[in] src_width source image width
+ * @param[in] src_height source image height
+ * @param[in] src_x_offset x offset in the source image
+ * @param[in] src_y_offset y offset in the source image
+ * @param[in] dst_bo destination radeon buffer object
+ * @param[in] dst_offset offset of the destination image in the @a dst_bo
+ * @param[in] dst_mesaformat destination image format
+ * @param[in] dst_pitch aligned destination image width
+ * @param[in] dst_width destination image width
+ * @param[in] dst_height destination image height
+ * @param[in] dst_x_offset x offset in the destination image
+ * @param[in] dst_y_offset y offset in the destination image
+ * @param[in] reg_width region width (clamped to fit both images)
+ * @param[in] reg_height region height (clamped to fit both images)
+ * @param[in] flip_y set if y coords of the source image need to be flipped
+ * @return GL_TRUE if the blit was emitted, GL_FALSE if the request was rejected
+ */
+GLboolean r300_blit(struct r300_context *r300,
+                    struct radeon_bo *src_bo,
+                    intptr_t src_offset,
+                    gl_format src_mesaformat,
+                    unsigned src_pitch,
+                    unsigned src_width,
+                    unsigned src_height,
+                    unsigned src_x_offset,
+                    unsigned src_y_offset,
+                    struct radeon_bo *dst_bo,
+                    intptr_t dst_offset,
+                    gl_format dst_mesaformat,
+                    unsigned dst_pitch,
+                    unsigned dst_width,
+                    unsigned dst_height,
+                    unsigned dst_x_offset,
+                    unsigned dst_y_offset,
+                    unsigned reg_width,
+                    unsigned reg_height,
+                    unsigned flip_y)
+{
+    if (_mesa_get_format_bits(src_mesaformat, GL_DEPTH_BITS) > 0)
+        return GL_FALSE;
+
+    /* Make sure that colorbuffer has even width - hw limitation */
+    if (dst_pitch % 2 > 0)
+        ++dst_pitch;
+
+    /* Rendering to small buffer doesn't work - looks like a hw limitation */
+    if (dst_pitch < 32)
+        return GL_FALSE;
+
+    /* Offsets outside of their image would wrap the unsigned math below */
+    if (src_x_offset >= src_width || src_y_offset >= src_height ||
+        dst_x_offset >= dst_width || dst_y_offset >= dst_height)
+        return GL_FALSE;
+
+    /* Need to clamp the region size to make sure
+     * we don't read outside of the source buffer
+     * or write outside of the destination buffer.
+     */
+    if (reg_width > src_width - src_x_offset)
+        reg_width = src_width - src_x_offset;
+    if (reg_height > src_height - src_y_offset)
+        reg_height = src_height - src_y_offset;
+    if (reg_width > dst_width - dst_x_offset)
+        reg_width = dst_width - dst_x_offset;
+    if (reg_height > dst_height - dst_y_offset)
+        reg_height = dst_height - dst_y_offset;
+
+    if (src_bo == dst_bo)
+        return GL_FALSE;
+
+    if (0) {
+        fprintf(stderr, "src: size [%u x %u], pitch %u, "
+                "offset [%u x %u], format %s, bo %p\n",
+                src_width, src_height, src_pitch,
+                src_x_offset, src_y_offset,
+                _mesa_get_format_name(src_mesaformat),
+                (void *) src_bo);
+        fprintf(stderr, "dst: pitch %u, offset[%u x %u], format %s, bo %p\n",
+                dst_pitch, dst_x_offset, dst_y_offset,
+                _mesa_get_format_name(dst_mesaformat), (void *) dst_bo);
+        fprintf(stderr, "region: %u x %u\n", reg_width, reg_height);
+    }
+
+    /* Flush is needed to make sure that source buffer has correct data */
+    radeonFlush(r300->radeon.glCtx);
+
+    if (!validate_buffers(r300, src_bo, dst_bo))
+        return GL_FALSE;
+
+    rcommonEnsureCmdBufSpace(&r300->radeon, 200, __FUNCTION__);
+    other_stuff(r300);
+    r300_emit_tx_setup(r300, src_mesaformat, src_bo, src_offset, src_width, src_height, src_pitch);
+
+    if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515) {
+        r500_emit_fp_setup(r300, &r300->blit.fp_code.code.r500, dst_mesaformat);
+        r500_emit_rs_setup(r300);
+    } else {
+        r300_emit_fp_setup(r300, &r300->blit.fp_code.code.r300, dst_mesaformat);
+        r300_emit_rs_setup(r300);
+    }
+
+    emit_pvs_setup(r300, r300->blit.vp_code.body.d, 2);
+    emit_vap_setup(r300);
+
+    emit_cb_setup(r300, dst_bo, dst_offset, dst_mesaformat, dst_pitch, dst_width, dst_height);
+
+    emit_draw_packet(r300, src_width, src_height,
+                     src_x_offset, src_y_offset,
+                     dst_x_offset, dst_y_offset,
+                     reg_width, reg_height,
+                     flip_y);
+
+    r300EmitCacheFlush(r300);
+    radeonFlush(r300->radeon.glCtx);
+
+    return GL_TRUE;
+}
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r300/r300_blit.h b/src/mesa/drivers/dri/r300/r300_blit.h
new file mode 100644
index 0000000000..dc21e88098
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/r300_blit.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (C) 2009 Maciej Cencora <m.cencora@gmail.com>
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef R300_BLIT_H
+#define R300_BLIT_H
+
+void r300_blit_init(struct r300_context *r300);
+/* Parameter names match the definition in r300_blit.c, which documents them. */
+GLboolean r300_blit(struct r300_context *r300,
+                    struct radeon_bo *src_bo,
+                    intptr_t src_offset,
+                    gl_format src_mesaformat,
+                    unsigned src_pitch,
+                    unsigned src_width,
+                    unsigned src_height,
+                    unsigned src_x_offset,
+                    unsigned src_y_offset,
+                    struct radeon_bo *dst_bo,
+                    intptr_t dst_offset,
+                    gl_format dst_mesaformat,
+                    unsigned dst_pitch,
+                    unsigned dst_width,
+                    unsigned dst_height,
+                    unsigned dst_x_offset,
+                    unsigned dst_y_offset,
+                    unsigned reg_width,
+                    unsigned reg_height,
+                    unsigned flip_y);
+
+#endif /* R300_BLIT_H */
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r300/r300_cmdbuf.c b/src/mesa/drivers/dri/r300/r300_cmdbuf.c
index 1e2a54f634..e1c33bbb2c 100644
--- a/src/mesa/drivers/dri/r300/r300_cmdbuf.c
+++ b/src/mesa/drivers/dri/r300/r300_cmdbuf.c
@@ -45,15 +45,12 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "radeon_drm.h"
 
 #include "r300_context.h"
-#include "r300_ioctl.h"
-#include "radeon_reg.h"
 #include "r300_reg.h"
 #include "r300_cmdbuf.h"
 #include "r300_emit.h"
 #include "radeon_bocs_wrapper.h"
 #include "radeon_mipmap_tree.h"
 #include "r300_state.h"
-#include "radeon_reg.h"
 #include "radeon_queryobj.h"
 
 /** # of dwords reserved for additional instructions that may need to be written
@@ -74,7 +71,7 @@ static unsigned packet0_count(r300ContextPtr r300, uint32_t *pkt)
 #define vpu_count(ptr) (((drm_r300_cmd_header_t*)(ptr))->vpu.count)
 #define r500fp_count(ptr) (((drm_r300_cmd_header_t*)(ptr))->r500fp.count)
 
-int check_vpu(GLcontext *ctx, struct radeon_state_atom *atom)
+static int check_vpu(GLcontext *ctx, struct radeon_state_atom *atom)
 {
 	r300ContextPtr r300 = R300_CONTEXT(ctx);
 	int cnt;
@@ -88,54 +85,73 @@ int check_vpu(GLcontext *ctx, struct radeon_state_atom *atom)
 	return cnt ? (cnt * 4) + extra : 0;
 }
 
+void r300_emit_vpu(struct r300_context *r300,
+                   uint32_t *data,
+                   unsigned len,
+                   uint32_t addr)
+{
+    BATCH_LOCALS(&r300->radeon);
+    /* Write len dwords from data to the PVS upload register after setting the vector index to addr. */
+    BEGIN_BATCH_NO_AUTOSTATE(5 + len); /* 2 + 2 + 1 dwords of register writes/header, then the payload */
+    OUT_BATCH_REGVAL(R300_VAP_PVS_STATE_FLUSH_REG, 0);
+    OUT_BATCH_REGVAL(R300_VAP_PVS_VECTOR_INDX_REG, addr);
+    OUT_BATCH(CP_PACKET0(R300_VAP_PVS_UPLOAD_DATA, len-1) | RADEON_ONE_REG_WR); /* len must be non-zero: len-1 wraps otherwise */
+    OUT_BATCH_TABLE(data, len);
+    END_BATCH();
+}
 
-void emit_vpu(GLcontext *ctx, struct radeon_state_atom * atom)
+static void emit_vpu_state(GLcontext *ctx, struct radeon_state_atom * atom)
 {
-	r300ContextPtr r300 = R300_CONTEXT(ctx);
-	BATCH_LOCALS(&r300->radeon);
-	drm_r300_cmd_header_t cmd;
-	uint32_t addr, ndw;
+    r300ContextPtr r300 = R300_CONTEXT(ctx);
+    drm_r300_cmd_header_t cmd;
+    uint32_t addr, ndw;
 
-	cmd.u = atom->cmd[0];
-	addr = (cmd.vpu.adrhi << 8) | cmd.vpu.adrlo;
-	ndw = atom->check(ctx, atom);
+    cmd.u = atom->cmd[0];
+    addr = (cmd.vpu.adrhi << 8) | cmd.vpu.adrlo;
+    ndw = atom->check(ctx, atom);
 
-	BEGIN_BATCH_NO_AUTOSTATE(ndw);
+    r300_emit_vpu(r300, &atom->cmd[1], vpu_count(atom->cmd) * 4, addr);
+}
 
-	ndw -= 5;
-	OUT_BATCH_REGVAL(R300_VAP_PVS_VECTOR_INDX_REG, addr);
-	OUT_BATCH(CP_PACKET0(R300_VAP_PVS_UPLOAD_DATA, ndw-1) | RADEON_ONE_REG_WR);
-	OUT_BATCH_TABLE(&atom->cmd[1], ndw);
-	OUT_BATCH_REGVAL(R300_VAP_PVS_STATE_FLUSH_REG, 0);
-	END_BATCH();
+void r500_emit_fp(struct r300_context *r300,
+                  uint32_t *data,
+                  unsigned len,
+                  uint32_t addr,
+                  unsigned type,
+                  unsigned clamp)
+{
+    BATCH_LOCALS(&r300->radeon);
+    /* Write len dwords from data through the US vector index/data register pair. */
+    addr |= (type << 16); /* type goes into bit 16 of the index word */
+    addr |= (clamp << 17); /* clamp goes into bit 17 of the index word */
+    /* callers are expected to pass 0 or 1 for type and clamp; larger values would spill into higher bits */
+    BEGIN_BATCH_NO_AUTOSTATE(len + 3); /* 2 dwords for the index write + 1 data header, then the payload */
+    OUT_BATCH(CP_PACKET0(R500_GA_US_VECTOR_INDEX, 0));
+    OUT_BATCH(addr);
+    OUT_BATCH(CP_PACKET0(R500_GA_US_VECTOR_DATA, len-1) | RADEON_ONE_REG_WR); /* len must be non-zero: len-1 wraps otherwise */
+    OUT_BATCH_TABLE(data, len);
+    END_BATCH();
 }
 
-void emit_r500fp(GLcontext *ctx, struct radeon_state_atom * atom)
+static void emit_r500fp_atom(GLcontext *ctx, struct radeon_state_atom * atom)
 {
-	r300ContextPtr r300 = R300_CONTEXT(ctx);
-	BATCH_LOCALS(&r300->radeon);
-	drm_r300_cmd_header_t cmd;
-	uint32_t addr, ndw, sz;
-	int type, clamp;
-
-	ndw = atom->check(ctx, atom);
-
-	cmd.u = atom->cmd[0];
-	sz = cmd.r500fp.count;
-	addr = ((cmd.r500fp.adrhi_flags & 1) << 8) | cmd.r500fp.adrlo;
-	type = !!(cmd.r500fp.adrhi_flags & R500FP_CONSTANT_TYPE);
-	clamp = !!(cmd.r500fp.adrhi_flags & R500FP_CONSTANT_CLAMP);
-
-	addr |= (type << 16);
-	addr |= (clamp << 17);
-
-	BEGIN_BATCH_NO_AUTOSTATE(ndw);
-	OUT_BATCH(CP_PACKET0(R500_GA_US_VECTOR_INDEX, 0));
-	OUT_BATCH(addr);
-	ndw-=3;
-	OUT_BATCH(CP_PACKET0(R500_GA_US_VECTOR_DATA, ndw-1) | RADEON_ONE_REG_WR);
-	OUT_BATCH_TABLE(&atom->cmd[1], ndw);
-	END_BATCH();
+    r300ContextPtr r300 = R300_CONTEXT(ctx);
+    drm_r300_cmd_header_t cmd;
+    uint32_t addr, count;
+    int type, clamp;
+
+    cmd.u = atom->cmd[0];
+    addr = ((cmd.r500fp.adrhi_flags & 1) << 8) | cmd.r500fp.adrlo;
+    type = !!(cmd.r500fp.adrhi_flags & R500FP_CONSTANT_TYPE);
+    clamp = !!(cmd.r500fp.adrhi_flags & R500FP_CONSTANT_CLAMP);
+
+    if (type) {
+        count = r500fp_count(atom->cmd) * 4;
+    } else {
+        count = r500fp_count(atom->cmd) * 6;
+    }
+
+    r500_emit_fp(r300, &atom->cmd[1], count, addr, type, clamp);
 }
 
 static int check_tex_offsets(GLcontext *ctx, struct radeon_state_atom * atom)
@@ -171,7 +187,7 @@ static void emit_tex_offsets(GLcontext *ctx, struct radeon_state_atom * atom)
 		if (t && !t->image_override) {
 			BEGIN_BATCH_NO_AUTOSTATE(4);
 			OUT_BATCH_REGSEQ(R300_TX_OFFSET_0 + (i * 4), 1);
-			OUT_BATCH_RELOC(t->tile_bits, t->mt->bo, 0,
+			OUT_BATCH_RELOC(t->tile_bits, t->mt->bo, get_base_teximage_offset(t),
 					RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0);
 			END_BATCH();
 		} else if (!t) {
@@ -258,95 +274,136 @@ static int check_cb_offset(GLcontext *ctx, struct radeon_state_atom * atom)
 	return dw;
 }
 
-static void emit_cb_offset(GLcontext *ctx, struct radeon_state_atom * atom)
+static void emit_scissor(struct r300_context *r300,
+                         unsigned width,
+                         unsigned height)
 {
-	r300ContextPtr r300 = R300_CONTEXT(ctx);
-	BATCH_LOCALS(&r300->radeon);
-	struct radeon_renderbuffer *rrb;
-	uint32_t cbpitch;
-	uint32_t offset = r300->radeon.state.color.draw_offset;
-	uint32_t dw = 6;
-	int i;
+    int i;
+    BATCH_LOCALS(&r300->radeon);
+    if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515) {
+        BEGIN_BATCH_NO_AUTOSTATE(3);
+        OUT_BATCH_REGSEQ(R300_SC_SCISSORS_TL, 2);
+        OUT_BATCH(0);
+        OUT_BATCH(((width - 1) << R300_SCISSORS_X_SHIFT) |
+                ((height - 1) << R300_SCISSORS_Y_SHIFT));
+        END_BATCH();
+        BEGIN_BATCH_NO_AUTOSTATE(16);
+        for (i = 0; i < 4; i++) {
+            OUT_BATCH_REGSEQ(R300_SC_CLIPRECT_TL_0 + (i * 8), 2);
+            OUT_BATCH((0 << R300_CLIPRECT_X_SHIFT) | (0 << R300_CLIPRECT_Y_SHIFT));
+            OUT_BATCH(((width - 1) << R300_CLIPRECT_X_SHIFT) | ((height - 1) << R300_CLIPRECT_Y_SHIFT));
+        }
+        OUT_BATCH_REGSEQ(R300_SC_CLIP_RULE, 1);
+        OUT_BATCH(0xAAAA);
+        OUT_BATCH_REGSEQ(R300_SC_SCREENDOOR, 1);
+        OUT_BATCH(0xffffff);
+        END_BATCH();
+    } else {
+        BEGIN_BATCH_NO_AUTOSTATE(3);
+        OUT_BATCH_REGSEQ(R300_SC_SCISSORS_TL, 2);
+        OUT_BATCH((R300_SCISSORS_OFFSET << R300_SCISSORS_X_SHIFT) |
+                (R300_SCISSORS_OFFSET << R300_SCISSORS_Y_SHIFT));
+        OUT_BATCH(((width + R300_SCISSORS_OFFSET - 1) << R300_SCISSORS_X_SHIFT) |
+                ((height + R300_SCISSORS_OFFSET - 1) << R300_SCISSORS_Y_SHIFT));
+        END_BATCH();
+        BEGIN_BATCH_NO_AUTOSTATE(16);
+        for (i = 0; i < 4; i++) {
+            OUT_BATCH_REGSEQ(R300_SC_CLIPRECT_TL_0 + (i * 8), 2);
+            OUT_BATCH((R300_SCISSORS_OFFSET << R300_CLIPRECT_X_SHIFT) | (R300_SCISSORS_OFFSET << R300_CLIPRECT_Y_SHIFT));
+            OUT_BATCH(((R300_SCISSORS_OFFSET + width - 1) << R300_CLIPRECT_X_SHIFT) |
+                        ((R300_SCISSORS_OFFSET + height - 1) << R300_CLIPRECT_Y_SHIFT));
+        }
+        OUT_BATCH_REGSEQ(R300_SC_CLIP_RULE, 1);
+        OUT_BATCH(0xAAAA);
+        OUT_BATCH_REGSEQ(R300_SC_SCREENDOOR, 1);
+        OUT_BATCH(0xffffff);
+        END_BATCH();
+    }
+}
 
-	rrb = radeon_get_colorbuffer(&r300->radeon);
-	if (!rrb || !rrb->bo) {
-		fprintf(stderr, "no rrb\n");
-		return;
-	}
+void r300_emit_cb_setup(struct r300_context *r300,
+                        struct radeon_bo *bo,
+                        uint32_t offset,
+                        GLuint format,
+                        unsigned cpp,
+                        unsigned pitch)
+{
+    BATCH_LOCALS(&r300->radeon);
+    uint32_t cbpitch = pitch / cpp; /* pitch divided by bytes per pixel; format and tiling bits are OR'ed in below */
+    uint32_t dw = 6; /* grows by 2 when kernel_mm is set (see below) */
 
-        if (RADEON_DEBUG & RADEON_STATE)
-           fprintf(stderr,"rrb is %p %d %dx%d\n", rrb, offset, rrb->base.Width, rrb->base.Height);
-	cbpitch = (rrb->pitch / rrb->cpp);
-	if (rrb->cpp == 4)
-		cbpitch |= R300_COLOR_FORMAT_ARGB8888;
-	else switch (rrb->base.Format) {
+    assert(offset % 32 == 0);
+    /* The plain 16-bit formats are expected on little-endian hosts only, the _REV ones on big-endian hosts only. */
+    switch (format) {
         case MESA_FORMAT_RGB565:
-		cbpitch |= R300_COLOR_FORMAT_RGB565;
-		break;
+            assert(_mesa_little_endian());
+            cbpitch |= R300_COLOR_FORMAT_RGB565;
+            break;
+        case MESA_FORMAT_RGB565_REV:
+            assert(!_mesa_little_endian());
+            cbpitch |= R300_COLOR_FORMAT_RGB565;
+            break;
         case MESA_FORMAT_ARGB4444:
-		cbpitch |= R300_COLOR_FORMAT_ARGB4444;
-		break;
-	case MESA_FORMAT_ARGB1555:
-		cbpitch |= R300_COLOR_FORMAT_ARGB1555;
-		break;
-	default:
-		_mesa_problem(ctx, "unexpected format in emit_cb_offset()");
-	}
+            assert(_mesa_little_endian());
+            cbpitch |= R300_COLOR_FORMAT_ARGB4444;
+            break;
+        case MESA_FORMAT_ARGB4444_REV:
+            assert(!_mesa_little_endian());
+            cbpitch |= R300_COLOR_FORMAT_ARGB4444;
+            break;
+        case MESA_FORMAT_ARGB1555:
+            assert(_mesa_little_endian());
+            cbpitch |= R300_COLOR_FORMAT_ARGB1555;
+            break;
+        case MESA_FORMAT_ARGB1555_REV:
+            assert(!_mesa_little_endian());
+            cbpitch |= R300_COLOR_FORMAT_ARGB1555;
+            break;
+        default:
+            if (cpp == 4) { /* every other 4-byte format is programmed as ARGB8888 */
+                cbpitch |= R300_COLOR_FORMAT_ARGB8888;
+            } else {
+                _mesa_problem(r300->radeon.glCtx, "unexpected format in r300_emit_cb_setup()");
+            }
+            break;
+    }
 
-	if (rrb->bo->flags & RADEON_BO_FLAGS_MACRO_TILE)
-		cbpitch |= R300_COLOR_TILE_ENABLE;
+    if (bo->flags & RADEON_BO_FLAGS_MACRO_TILE)
+        cbpitch |= R300_COLOR_TILE_ENABLE;
+    /* With kernel_mm the pitch word is emitted as a relocation as well, hence the 2 extra dwords. */
+    if (r300->radeon.radeonScreen->kernel_mm)
+        dw += 2;
+    /* Program color buffer 0: offset (as a relocation against bo), then pitch/format. */
+    BEGIN_BATCH_NO_AUTOSTATE(dw);
+    OUT_BATCH_REGSEQ(R300_RB3D_COLOROFFSET0, 1);
+    OUT_BATCH_RELOC(offset, bo, offset, 0, RADEON_GEM_DOMAIN_VRAM, 0);
+    OUT_BATCH_REGSEQ(R300_RB3D_COLORPITCH0, 1);
+    if (!r300->radeon.radeonScreen->kernel_mm)
+        OUT_BATCH(cbpitch);
+    else
+        OUT_BATCH_RELOC(cbpitch, bo, cbpitch, 0, RADEON_GEM_DOMAIN_VRAM, 0);
+    END_BATCH();
+}
+
+static void emit_cb_offset_atom(GLcontext *ctx, struct radeon_state_atom * atom)
+{
+    r300ContextPtr r300 = R300_CONTEXT(ctx);
+    struct radeon_renderbuffer *rrb;
+    uint32_t offset = r300->radeon.state.color.draw_offset;
+
+    rrb = radeon_get_colorbuffer(&r300->radeon);
+    if (!rrb || !rrb->bo) {
+        fprintf(stderr, "no rrb\n");
+        return;
+    }
+
+    if (RADEON_DEBUG & RADEON_STATE)
+        fprintf(stderr,"rrb is %p %d %dx%d\n", rrb, offset, rrb->base.Width, rrb->base.Height);
+
+    r300_emit_cb_setup(r300, rrb->bo, offset, rrb->base.Format, rrb->cpp, rrb->pitch);
 
-    	if (r300->radeon.radeonScreen->kernel_mm)
-		dw += 2;
-	BEGIN_BATCH_NO_AUTOSTATE(dw);
-	OUT_BATCH_REGSEQ(R300_RB3D_COLOROFFSET0, 1);
-	OUT_BATCH_RELOC(offset, rrb->bo, offset, 0, RADEON_GEM_DOMAIN_VRAM, 0);
-	OUT_BATCH_REGSEQ(R300_RB3D_COLORPITCH0, 1);
-    	if (!r300->radeon.radeonScreen->kernel_mm)
-		OUT_BATCH(cbpitch);
-	else
-		OUT_BATCH_RELOC(cbpitch, rrb->bo, cbpitch, 0, RADEON_GEM_DOMAIN_VRAM, 0);
-	END_BATCH();
     if (r300->radeon.radeonScreen->driScreen->dri2.enabled) {
-        if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515) {
-            BEGIN_BATCH_NO_AUTOSTATE(3);
-            OUT_BATCH_REGSEQ(R300_SC_SCISSORS_TL, 2);
-            OUT_BATCH(0);
-            OUT_BATCH(((rrb->base.Width - 1) << R300_SCISSORS_X_SHIFT) |
-                    ((rrb->base.Height - 1) << R300_SCISSORS_Y_SHIFT));
-            END_BATCH();
-            BEGIN_BATCH_NO_AUTOSTATE(16);
-            for (i = 0; i < 4; i++) {
-                OUT_BATCH_REGSEQ(R300_SC_CLIPRECT_TL_0 + (i * 8), 2);
-                OUT_BATCH((0 << R300_CLIPRECT_X_SHIFT) | (0 << R300_CLIPRECT_Y_SHIFT));
-                OUT_BATCH(((rrb->base.Width - 1) << R300_CLIPRECT_X_SHIFT) | ((rrb->base.Height - 1) << R300_CLIPRECT_Y_SHIFT));
-            }
-            OUT_BATCH_REGSEQ(R300_SC_CLIP_RULE, 1);
-            OUT_BATCH(0xAAAA);
-            OUT_BATCH_REGSEQ(R300_SC_SCREENDOOR, 1);
-            OUT_BATCH(0xffffff);
-            END_BATCH();
-        } else {
-            BEGIN_BATCH_NO_AUTOSTATE(3);
-            OUT_BATCH_REGSEQ(R300_SC_SCISSORS_TL, 2);
-            OUT_BATCH((R300_SCISSORS_OFFSET << R300_SCISSORS_X_SHIFT) |
-                    (R300_SCISSORS_OFFSET << R300_SCISSORS_Y_SHIFT));
-            OUT_BATCH(((rrb->base.Width + R300_SCISSORS_OFFSET - 1) << R300_SCISSORS_X_SHIFT) |
-                    ((rrb->base.Height + R300_SCISSORS_OFFSET - 1) << R300_SCISSORS_Y_SHIFT));
-            END_BATCH();
-            BEGIN_BATCH_NO_AUTOSTATE(16);
-            for (i = 0; i < 4; i++) {
-                OUT_BATCH_REGSEQ(R300_SC_CLIPRECT_TL_0 + (i * 8), 2);
-                OUT_BATCH((R300_SCISSORS_OFFSET << R300_CLIPRECT_X_SHIFT) | (R300_SCISSORS_OFFSET << R300_CLIPRECT_Y_SHIFT));
-                OUT_BATCH(((R300_SCISSORS_OFFSET + rrb->base.Width - 1) << R300_CLIPRECT_X_SHIFT) |
-                          ((R300_SCISSORS_OFFSET + rrb->base.Height - 1) << R300_CLIPRECT_Y_SHIFT));
-            }
-            OUT_BATCH_REGSEQ(R300_SC_CLIP_RULE, 1);
-            OUT_BATCH(0xAAAA);
-            OUT_BATCH_REGSEQ(R300_SC_SCREENDOOR, 1);
-            OUT_BATCH(0xffffff);
-            END_BATCH();
-        }
+        emit_scissor(r300, rrb->base.Width, rrb->base.Height);
     }
 }
 
@@ -442,7 +499,7 @@ static int check_variable(GLcontext *ctx, struct radeon_state_atom *atom)
 	return cnt ? cnt + 1 : 0;
 }
 
-int check_r500fp(GLcontext *ctx, struct radeon_state_atom *atom)
+static int check_r500fp(GLcontext *ctx, struct radeon_state_atom *atom)
 {
 	int cnt;
 	r300ContextPtr r300 = R300_CONTEXT(ctx);
@@ -454,7 +511,7 @@ int check_r500fp(GLcontext *ctx, struct radeon_state_atom *atom)
 	return cnt ? (cnt * 6) + extra : 0;
 }
 
-int check_r500fp_const(GLcontext *ctx, struct radeon_state_atom *atom)
+static int check_r500fp_const(GLcontext *ctx, struct radeon_state_atom *atom)
 {
 	int cnt;
 	r300ContextPtr r300 = R300_CONTEXT(ctx);
@@ -631,13 +688,13 @@ void r300InitCmdBuf(r300ContextPtr r300)
 		r300->hw.r500fp.cmd[R300_FPI_CMD_0] =
 			cmdr500fp(r300->radeon.radeonScreen, 0, 0, 0, 0);
 		if (r300->radeon.radeonScreen->kernel_mm)
-			r300->hw.r500fp.emit = emit_r500fp;
+			r300->hw.r500fp.emit = emit_r500fp_atom;
 
 		ALLOC_STATE(r500fp_const, r500fp_const, R500_FPP_CMDSIZE, 0);
 		r300->hw.r500fp_const.cmd[R300_FPI_CMD_0] =
 			cmdr500fp(r300->radeon.radeonScreen, 0, 0, 1, 0);
 		if (r300->radeon.radeonScreen->kernel_mm)
-			r300->hw.r500fp_const.emit = emit_r500fp;
+			r300->hw.r500fp_const.emit = emit_r500fp_atom;
 	} else {
 		ALLOC_STATE(fp, always, R300_FP_CMDSIZE, 0);
 		r300->hw.fp.cmd[R300_FP_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_US_CONFIG, 3);
@@ -681,7 +738,7 @@ void r300InitCmdBuf(r300ContextPtr r300)
 	ALLOC_STATE(rop, always, 2, 0);
 	r300->hw.rop.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_RB3D_ROPCNTL, 1);
 	ALLOC_STATE(cb, cb_offset, R300_CB_CMDSIZE, 0);
-	r300->hw.cb.emit = &emit_cb_offset;
+	r300->hw.cb.emit = &emit_cb_offset_atom;
 	ALLOC_STATE(rb3d_dither_ctl, always, 10, 0);
 	r300->hw.rb3d_dither_ctl.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_RB3D_DITHER_CTL, 9);
 	ALLOC_STATE(rb3d_aaresolve_ctl, always, 2, 0);
@@ -745,20 +802,20 @@ void r300InitCmdBuf(r300ContextPtr r300)
 		r300->hw.vpi.cmd[0] =
 			cmdvpu(r300->radeon.radeonScreen, R300_PVS_CODE_START, 0);
 		if (r300->radeon.radeonScreen->kernel_mm)
-			r300->hw.vpi.emit = emit_vpu;
+			r300->hw.vpi.emit = emit_vpu_state;
 
 		if (is_r500) {
 			ALLOC_STATE(vpp, vpu, R300_VPP_CMDSIZE, 0);
 			r300->hw.vpp.cmd[0] =
 				cmdvpu(r300->radeon.radeonScreen, R500_PVS_CONST_START, 0);
 			if (r300->radeon.radeonScreen->kernel_mm)
-				r300->hw.vpp.emit = emit_vpu;
+				r300->hw.vpp.emit = emit_vpu_state;
 
 			ALLOC_STATE(vps, vpu, R300_VPS_CMDSIZE, 0);
 			r300->hw.vps.cmd[0] =
 				cmdvpu(r300->radeon.radeonScreen, R500_POINT_VPORT_SCALE_OFFSET, 1);
 			if (r300->radeon.radeonScreen->kernel_mm)
-				r300->hw.vps.emit = emit_vpu;
+				r300->hw.vps.emit = emit_vpu_state;
 
 			for (i = 0; i < 6; i++) {
 				ALLOC_STATE(vpucp[i], vpu, R300_VPUCP_CMDSIZE, 0);
@@ -766,20 +823,20 @@ void r300InitCmdBuf(r300ContextPtr r300)
 					cmdvpu(r300->radeon.radeonScreen,
 							R500_PVS_UCP_START + i, 1);
 				if (r300->radeon.radeonScreen->kernel_mm)
-					r300->hw.vpucp[i].emit = emit_vpu;
+					r300->hw.vpucp[i].emit = emit_vpu_state;
 			}
 		} else {
 			ALLOC_STATE(vpp, vpu, R300_VPP_CMDSIZE, 0);
 			r300->hw.vpp.cmd[0] =
 				cmdvpu(r300->radeon.radeonScreen, R300_PVS_CONST_START, 0);
 			if (r300->radeon.radeonScreen->kernel_mm)
-				r300->hw.vpp.emit = emit_vpu;
+				r300->hw.vpp.emit = emit_vpu_state;
 
 			ALLOC_STATE(vps, vpu, R300_VPS_CMDSIZE, 0);
 			r300->hw.vps.cmd[0] =
 				cmdvpu(r300->radeon.radeonScreen, R300_POINT_VPORT_SCALE_OFFSET, 1);
 			if (r300->radeon.radeonScreen->kernel_mm)
-				r300->hw.vps.emit = emit_vpu;
+				r300->hw.vps.emit = emit_vpu_state;
 
 			for (i = 0; i < 6; i++) {
 				ALLOC_STATE(vpucp[i], vpu, R300_VPUCP_CMDSIZE, 0);
@@ -787,7 +844,7 @@ void r300InitCmdBuf(r300ContextPtr r300)
 					cmdvpu(r300->radeon.radeonScreen,
 							R300_PVS_UCP_START + i, 1);
 				if (r300->radeon.radeonScreen->kernel_mm)
-					r300->hw.vpucp[i].emit = emit_vpu;
+					r300->hw.vpucp[i].emit = emit_vpu_state;
 			}
 		}
 	}
diff --git a/src/mesa/drivers/dri/r300/r300_cmdbuf.h b/src/mesa/drivers/dri/r300/r300_cmdbuf.h
index 1b703e518a..0e68da928e 100644
--- a/src/mesa/drivers/dri/r300/r300_cmdbuf.h
+++ b/src/mesa/drivers/dri/r300/r300_cmdbuf.h
@@ -44,14 +44,26 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define FIREAOS_BUFSZ          (3)
 #define SCISSORS_BUFSZ         (3)
 
-extern void r300InitCmdBuf(r300ContextPtr r300);
+void r300InitCmdBuf(r300ContextPtr r300);
 void r300_emit_scissor(GLcontext *ctx);
 
-void emit_vpu(GLcontext *ctx, struct radeon_state_atom * atom);
-int check_vpu(GLcontext *ctx, struct radeon_state_atom *atom);
+void r300_emit_vpu(struct r300_context *ctx,
+                   uint32_t *data,
+                   unsigned len,
+                   uint32_t addr);
 
-void emit_r500fp(GLcontext *ctx, struct radeon_state_atom * atom);
-int check_r500fp(GLcontext *ctx, struct radeon_state_atom *atom);
-int check_r500fp_const(GLcontext *ctx, struct radeon_state_atom *atom);
+void r500_emit_fp(struct r300_context *r300,
+                  uint32_t *data,
+                  unsigned len,
+                  uint32_t addr,
+                  unsigned type,
+                  unsigned clamp);
 
-#endif				/* __R300_CMDBUF_H__ */
+void r300_emit_cb_setup(struct r300_context *r300,
+                        struct radeon_bo *bo,
+                        uint32_t offset,
+                        GLuint format,
+                        unsigned cpp,
+                        unsigned pitch);
+
+#endif /* __R300_CMDBUF_H__ */
diff --git a/src/mesa/drivers/dri/r300/r300_context.c b/src/mesa/drivers/dri/r300/r300_context.c
index 6f66e970e4..1f6ccf6ddc 100644
--- a/src/mesa/drivers/dri/r300/r300_context.c
+++ b/src/mesa/drivers/dri/r300/r300_context.c
@@ -55,13 +55,14 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "tnl/t_vp_build.h"
 
 #include "drivers/common/driverfuncs.h"
+#include "drivers/common/meta.h"
 
 #include "r300_context.h"
 #include "radeon_context.h"
 #include "radeon_span.h"
+#include "r300_blit.h"
 #include "r300_cmdbuf.h"
 #include "r300_state.h"
-#include "r300_ioctl.h"
 #include "r300_tex.h"
 #include "r300_emit.h"
 #include "r300_swtcl.h"
@@ -92,6 +93,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 #include "main/remap_helper.h"
 
+void r300_init_texcopy_functions(struct dd_function_table *table);
 
 static const struct dri_extension card_extensions[] = {
   /* *INDENT-OFF* */
@@ -439,11 +441,11 @@ static void r300InitGLExtensions(GLcontext *ctx)
 	if (r300->options.stencil_two_side_disabled)
 		_mesa_disable_extension(ctx, "GL_EXT_stencil_two_side");
 
-	if (r300->options.s3tc_force_enabled) {
+	if (r300->options.s3tc_force_disabled) {
+		_mesa_disable_extension(ctx, "GL_EXT_texture_compression_s3tc");
+	} else if (ctx->Mesa_DXTn || r300->options.s3tc_force_enabled) {
 		_mesa_enable_extension(ctx, "GL_EXT_texture_compression_s3tc");
 		_mesa_enable_extension(ctx, "GL_S3_s3tc");
-	} else if (r300->options.s3tc_force_disabled) {
-		_mesa_disable_extension(ctx, "GL_EXT_texture_compression_s3tc");
 	}
 
 	if (!r300->radeon.radeonScreen->drmSupportsOcclusionQueries) {
@@ -451,13 +453,20 @@ static void r300InitGLExtensions(GLcontext *ctx)
 	}
 }
 
+static void r300InitIoctlFuncs(struct dd_function_table *functions)
+{
+	functions->Clear = _mesa_meta_Clear;
+	functions->Finish = radeonFinish;
+	functions->Flush = radeonFlush;
+}
+
 /* Create the device specific rendering context.
  */
 GLboolean r300CreateContext(const __GLcontextModes * glVisual,
-			    __DRIcontextPrivate * driContextPriv,
+			    __DRIcontext * driContextPriv,
 			    void *sharedContextPrivate)
 {
-	__DRIscreenPrivate *sPriv = driContextPriv->driScreenPriv;
+	__DRIscreen *sPriv = driContextPriv->driScreenPriv;
 	radeonScreenPtr screen = (radeonScreenPtr) (sPriv->private);
 	struct dd_function_table functions;
 	r300ContextPtr r300;
@@ -484,6 +493,10 @@ GLboolean r300CreateContext(const __GLcontextModes * glVisual,
 	radeonInitQueryObjFunctions(&functions);
 	radeonInitBufferObjectFuncs(&functions);
 
+	if (r300->radeon.radeonScreen->kernel_mm) {
+		r300_init_texcopy_functions(&functions);
+	}
+
 	if (!radeonInitContext(&r300->radeon, &functions,
 			       glVisual, driContextPriv,
 			       sharedContextPrivate)) {
@@ -530,6 +543,7 @@ GLboolean r300CreateContext(const __GLcontextModes * glVisual,
 		r300InitSwtcl(ctx);
 	}
 
+	r300_blit_init(r300);
 	radeon_fbo_init(&r300->radeon);
 	radeonInitSpanFuncs( ctx );
 	r300InitCmdBuf(r300);
diff --git a/src/mesa/drivers/dri/r300/r300_context.h b/src/mesa/drivers/dri/r300/r300_context.h
index 518d5cdbf4..546cd8ddde 100644
--- a/src/mesa/drivers/dri/r300/r300_context.h
+++ b/src/mesa/drivers/dri/r300/r300_context.h
@@ -533,14 +533,19 @@ struct r300_context {
 
 	uint32_t fallback;
 
+	struct {
+		struct r300_vertex_program_code vp_code;
+		struct rX00_fragment_program_code fp_code;
+	} blit;
+
 	DECLARE_RENDERINPUTS(render_inputs_bitset);
 };
 
 #define R300_CONTEXT(ctx)		((r300ContextPtr)(ctx->DriverCtx))
 
-extern void r300DestroyContext(__DRIcontextPrivate * driContextPriv);
+extern void r300DestroyContext(__DRIcontext * driContextPriv);
 extern GLboolean r300CreateContext(const __GLcontextModes * glVisual,
-				   __DRIcontextPrivate * driContextPriv,
+				   __DRIcontext * driContextPriv,
 				   void *sharedContextPrivate);
 
 extern void r300InitShaderFuncs(struct dd_function_table *functions);
@@ -549,6 +554,8 @@ extern void r300InitShaderFunctions(r300ContextPtr r300);
 
 extern void r300InitDraw(GLcontext *ctx);
 
+extern void r300_init_texcopy_functions(struct dd_function_table *table);
+
 #define r300PackFloat32 radeonPackFloat32
 #define r300PackFloat24 radeonPackFloat24
 
diff --git a/src/mesa/drivers/dri/r300/r300_draw.c b/src/mesa/drivers/dri/r300/r300_draw.c
index e9968f9ffe..3dcd986e22 100644
--- a/src/mesa/drivers/dri/r300/r300_draw.c
+++ b/src/mesa/drivers/dri/r300/r300_draw.c
@@ -100,7 +100,7 @@ static void r300FixupIndexBuffer(GLcontext *ctx, const struct _mesa_index_buffer
 		GLubyte *in = (GLubyte *)src_ptr;
 
 		radeonAllocDmaRegion(&r300->radeon, &r300->ind_buf.bo, &r300->ind_buf.bo_offset, size, 4);
-
+		radeon_bo_map(r300->ind_buf.bo, 1);
 		assert(r300->ind_buf.bo->ptr != NULL);
 		out = (GLuint *)ADD_POINTERS(r300->ind_buf.bo->ptr, r300->ind_buf.bo_offset);
 
@@ -111,7 +111,7 @@ static void r300FixupIndexBuffer(GLcontext *ctx, const struct _mesa_index_buffer
 		if (i < mesa_ind_buf->count) {
 			*out++ = in[i];
 		}
-
+		radeon_bo_unmap(r300->ind_buf.bo);
 #if MESA_BIG_ENDIAN
 	} else { /* if (mesa_ind_buf->type == GL_UNSIGNED_SHORT) */
 		GLushort *in = (GLushort *)src_ptr;
@@ -120,6 +120,7 @@ static void r300FixupIndexBuffer(GLcontext *ctx, const struct _mesa_index_buffer
 		radeonAllocDmaRegion(&r300->radeon, &r300->ind_buf.bo,
 				     &r300->ind_buf.bo_offset, size, 4);
 
+		radeon_bo_map(r300->ind_buf.bo, 1);
 		assert(r300->ind_buf.bo->ptr != NULL);
 		out = (GLuint *)ADD_POINTERS(r300->ind_buf.bo->ptr, r300->ind_buf.bo_offset);
 
@@ -130,6 +131,7 @@ static void r300FixupIndexBuffer(GLcontext *ctx, const struct _mesa_index_buffer
 		if (i < mesa_ind_buf->count) {
 			*out++ = in[i];
 		}
+		radeon_bo_unmap(r300->ind_buf.bo);
 #endif
 	}
 
@@ -173,10 +175,12 @@ static void r300SetupIndexBuffer(GLcontext *ctx, const struct _mesa_index_buffer
 
 		radeonAllocDmaRegion(&r300->radeon, &r300->ind_buf.bo, &r300->ind_buf.bo_offset, size, 4);
 
+		radeon_bo_map(r300->ind_buf.bo, 1);
 		assert(r300->ind_buf.bo->ptr != NULL);
 		dst_ptr = ADD_POINTERS(r300->ind_buf.bo->ptr, r300->ind_buf.bo_offset);
 		_mesa_memcpy(dst_ptr, src_ptr, size);
 
+		radeon_bo_unmap(r300->ind_buf.bo);
 		r300->ind_buf.is_32bit = (mesa_ind_buf->type == GL_UNSIGNED_INT);
 		r300->ind_buf.count = mesa_ind_buf->count;
 
@@ -242,6 +246,7 @@ static void r300ConvertAttrib(GLcontext *ctx, int count, const struct gl_client_
 	}
 
 	radeonAllocDmaRegion(&r300->radeon, &attr->bo, &attr->bo_offset, sizeof(GLfloat) * input->Size * count, 32);
+	radeon_bo_map(attr->bo, 1);
 	dst_ptr = (GLfloat *)ADD_POINTERS(attr->bo->ptr, attr->bo_offset);
 
 	radeon_print(RADEON_FALLBACKS, RADEON_IMPORTANT,
@@ -280,6 +285,7 @@ static void r300ConvertAttrib(GLcontext *ctx, int count, const struct gl_client_
 			break;
 	}
 
+	radeon_bo_unmap(attr->bo);
 	if (mapped_named_bo) {
 		ctx->Driver.UnmapBuffer(ctx, GL_ARRAY_BUFFER, input->BufferObj);
 	}
@@ -294,6 +300,8 @@ static void r300AlignDataToDword(GLcontext *ctx, const struct gl_client_array *i
 
 	radeonAllocDmaRegion(&r300->radeon, &attr->bo, &attr->bo_offset, size, 32);
 
+	radeon_bo_map(attr->bo, 1);
+
 	if (!input->BufferObj->Pointer) {
 		ctx->Driver.MapBuffer(ctx, GL_ARRAY_BUFFER, GL_READ_ONLY_ARB, input->BufferObj);
 		mapped_named_bo = GL_TRUE;
@@ -317,6 +325,7 @@ static void r300AlignDataToDword(GLcontext *ctx, const struct gl_client_array *i
 		ctx->Driver.UnmapBuffer(ctx, GL_ARRAY_BUFFER, input->BufferObj);
 	}
 
+	radeon_bo_unmap(attr->bo);
 	attr->stride = dst_stride;
 }
 
@@ -527,6 +536,7 @@ static void r300AllocDmaRegions(GLcontext *ctx, const struct gl_client_array *in
 				}
 
 				radeonAllocDmaRegion(&r300->radeon, &vbuf->attribs[index].bo, &vbuf->attribs[index].bo_offset, size, 32);
+				radeon_bo_map(vbuf->attribs[index].bo, 1);
 				assert(vbuf->attribs[index].bo->ptr != NULL);
 				dst = (uint32_t *)ADD_POINTERS(vbuf->attribs[index].bo->ptr, vbuf->attribs[index].bo_offset);
 				switch (vbuf->attribs[index].dwords) {
@@ -536,6 +546,7 @@ static void r300AllocDmaRegions(GLcontext *ctx, const struct gl_client_array *in
 					case 4: radeonEmitVec16(dst, input[i]->Ptr, input[i]->StrideB, local_count); break;
 					default: assert(0); break;
 				}
+				radeon_bo_unmap(vbuf->attribs[index].bo);
 
 			}
 		}
diff --git a/src/mesa/drivers/dri/r300/r300_emit.c b/src/mesa/drivers/dri/r300/r300_emit.c
index 07e6223087..15aeaf0514 100644
--- a/src/mesa/drivers/dri/r300/r300_emit.c
+++ b/src/mesa/drivers/dri/r300/r300_emit.c
@@ -49,7 +49,6 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "r300_context.h"
 #include "r300_state.h"
 #include "r300_emit.h"
-#include "r300_ioctl.h"
 #include "r300_render.h"
 #include "r300_swtcl.h"
 
@@ -118,7 +117,7 @@ GLuint r300VAPOutputCntl1(GLcontext * ctx, GLuint vp_writes)
 
 	if (first_free_texcoord > 8) {
 		fprintf(stderr, "\tout of free texcoords\n");
-		_mesa_exit(-1);
+		exit(-1);
 	}
 
 	return ret;
diff --git a/src/mesa/drivers/dri/r300/r300_emit.h b/src/mesa/drivers/dri/r300/r300_emit.h
index 8e57e354d1..a456d8867c 100644
--- a/src/mesa/drivers/dri/r300/r300_emit.h
+++ b/src/mesa/drivers/dri/r300/r300_emit.h
@@ -42,7 +42,6 @@
 #include "main/glheader.h"
 #include "r300_context.h"
 #include "r300_cmdbuf.h"
-#include "radeon_reg.h"
 
 static INLINE uint32_t cmdpacket0(struct radeon_screen *rscrn,
                                   int reg, int count)
diff --git a/src/mesa/drivers/dri/r300/r300_fragprog_common.c b/src/mesa/drivers/dri/r300/r300_fragprog_common.c
index 267ee81a7a..2933d31136 100644
--- a/src/mesa/drivers/dri/r300/r300_fragprog_common.c
+++ b/src/mesa/drivers/dri/r300/r300_fragprog_common.c
@@ -120,7 +120,7 @@ static void insert_WPOS_trailer(struct r300_fragment_program_compiler *compiler,
 		return;
 	}
 
-	rc_transform_fragment_wpos(&compiler->Base, FRAG_ATTRIB_WPOS, fp->wpos_attr);
+	rc_transform_fragment_wpos(&compiler->Base, FRAG_ATTRIB_WPOS, fp->wpos_attr, GL_FALSE);
 }
 
 /**
diff --git a/src/mesa/drivers/dri/r300/r300_ioctl.c b/src/mesa/drivers/dri/r300/r300_ioctl.c
deleted file mode 100644
index 5cb04e2bb6..0000000000
--- a/src/mesa/drivers/dri/r300/r300_ioctl.c
+++ /dev/null
@@ -1,782 +0,0 @@
-/*
-Copyright (C) The Weather Channel, Inc.  2002.
-Copyright (C) 2004 Nicolai Haehnle.
-All Rights Reserved.
-
-The Weather Channel (TM) funded Tungsten Graphics to develop the
-initial release of the Radeon 8500 driver under the XFree86 license.
-This notice must be preserved.
-
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of this software and associated documentation files (the
-"Software"), to deal in the Software without restriction, including
-without limitation the rights to use, copy, modify, merge, publish,
-distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to
-the following conditions:
-
-The above copyright notice and this permission notice (including the
-next paragraph) shall be included in all copies or substantial
-portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
-LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
-OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-**************************************************************************/
-
-/**
- * \file
- *
- * \author Keith Whitwell <keith@tungstengraphics.com>
- *
- * \author Nicolai Haehnle <prefect_@gmx.net>
- */
-
-#include <sched.h>
-#include <errno.h>
-
-#include "main/glheader.h"
-#include "main/imports.h"
-#include "main/macros.h"
-#include "main/context.h"
-#include "main/simple_list.h"
-#include "swrast/swrast.h"
-
-#include "radeon_common.h"
-#include "radeon_lock.h"
-#include "r300_context.h"
-#include "r300_ioctl.h"
-#include "r300_cmdbuf.h"
-#include "r300_state.h"
-#include "r300_vertprog.h"
-#include "radeon_reg.h"
-#include "r300_emit.h"
-#include "r300_context.h"
-
-#include "vblank.h"
-
-#define R200_3D_DRAW_IMMD_2      0xC0003500
-
-#define CLEARBUFFER_COLOR	0x1
-#define CLEARBUFFER_DEPTH	0x2
-#define CLEARBUFFER_STENCIL	0x4
-
-#if 1
-
-/**
- * Fragment program helper macros
- */
-
-/* Produce unshifted source selectors */
-#define FP_TMP(idx) (idx)
-#define FP_CONST(idx) ((idx) | (1 << 5))
-
-/* Produce source/dest selector dword */
-#define FP_SELC_MASK_NO		0
-#define FP_SELC_MASK_X		1
-#define FP_SELC_MASK_Y		2
-#define FP_SELC_MASK_XY		3
-#define FP_SELC_MASK_Z		4
-#define FP_SELC_MASK_XZ		5
-#define FP_SELC_MASK_YZ		6
-#define FP_SELC_MASK_XYZ	7
-
-#define FP_SELC(destidx,regmask,outmask,src0,src1,src2) \
-	(((destidx) << R300_ALU_DSTC_SHIFT) |		\
-	 (FP_SELC_MASK_##regmask << 23) |		\
-	 (FP_SELC_MASK_##outmask << 26) |		\
-	 ((src0) << R300_ALU_SRC0C_SHIFT) |		\
-	 ((src1) << R300_ALU_SRC1C_SHIFT) |		\
-	 ((src2) << R300_ALU_SRC2C_SHIFT))
-
-#define FP_SELA_MASK_NO		0
-#define FP_SELA_MASK_W		1
-
-#define FP_SELA(destidx,regmask,outmask,src0,src1,src2) \
-	(((destidx) << R300_ALU_DSTA_SHIFT) |		\
-	 (FP_SELA_MASK_##regmask << 23) |		\
-	 (FP_SELA_MASK_##outmask << 24) |		\
-	 ((src0) << R300_ALU_SRC0A_SHIFT) |		\
-	 ((src1) << R300_ALU_SRC1A_SHIFT) |		\
-	 ((src2) << R300_ALU_SRC2A_SHIFT))
-
-/* Produce unshifted argument selectors */
-#define FP_ARGC(source)	R300_ALU_ARGC_##source
-#define FP_ARGA(source) R300_ALU_ARGA_##source
-#define FP_ABS(arg) ((arg) | (1 << 6))
-#define FP_NEG(arg) ((arg) ^ (1 << 5))
-
-/* Produce instruction dword */
-#define FP_INSTRC(opcode,arg0,arg1,arg2) \
-	(R300_ALU_OUTC_##opcode | 		\
-	((arg0) << R300_ALU_ARG0C_SHIFT) |	\
-	((arg1) << R300_ALU_ARG1C_SHIFT) |	\
-	((arg2) << R300_ALU_ARG2C_SHIFT))
-
-#define FP_INSTRA(opcode,arg0,arg1,arg2) \
-	(R300_ALU_OUTA_##opcode | 		\
-	((arg0) << R300_ALU_ARG0A_SHIFT) |	\
-	((arg1) << R300_ALU_ARG1A_SHIFT) |	\
-	((arg2) << R300_ALU_ARG2A_SHIFT))
-
-#endif
-
-static void r300EmitClearState(GLcontext * ctx);
-
-static void r300ClearBuffer(r300ContextPtr r300, int flags,
-			    struct radeon_renderbuffer *rrb,
-			    struct radeon_renderbuffer *rrbd)
-{
-	BATCH_LOCALS(&r300->radeon);
-	GLcontext *ctx = r300->radeon.glCtx;
-	__DRIdrawablePrivate *dPriv = radeon_get_drawable(&r300->radeon);
-	GLuint cbpitch = 0;
-	r300ContextPtr rmesa = r300;
-
-	if (RADEON_DEBUG & RADEON_IOCTL)
-		fprintf(stderr, "%s: buffer %p (%i,%i %ix%i)\n",
-			__FUNCTION__, rrb, dPriv->x, dPriv->y,
-			dPriv->w, dPriv->h);
-
-	if (rrb) {
-		cbpitch = (rrb->pitch / rrb->cpp);
-		if (rrb->cpp == 4)
-			cbpitch |= R300_COLOR_FORMAT_ARGB8888;
-		else
-			cbpitch |= R300_COLOR_FORMAT_RGB565;
-
-		if (rrb->bo->flags & RADEON_BO_FLAGS_MACRO_TILE){
-			cbpitch |= R300_COLOR_TILE_ENABLE;
-        }
-	}
-
-	/* TODO in bufmgr */
-	cp_wait(&r300->radeon, R300_WAIT_3D | R300_WAIT_3D_CLEAN);
-	end_3d(&rmesa->radeon);
-
-	if (flags & CLEARBUFFER_COLOR) {
-		assert(rrb != 0);
-		BEGIN_BATCH_NO_AUTOSTATE(6);
-		OUT_BATCH_REGSEQ(R300_RB3D_COLOROFFSET0, 1);
-		OUT_BATCH_RELOC(0, rrb->bo, 0, 0, RADEON_GEM_DOMAIN_VRAM, 0);
-		OUT_BATCH_REGVAL(R300_RB3D_COLORPITCH0, cbpitch);
-		END_BATCH();
-	}
-#if 1
-	if (flags & (CLEARBUFFER_DEPTH | CLEARBUFFER_STENCIL)) {
-		uint32_t zbpitch = (rrbd->pitch / rrbd->cpp);
-		if (rrbd->bo->flags & RADEON_BO_FLAGS_MACRO_TILE){
-			zbpitch |= R300_DEPTHMACROTILE_ENABLE;
-        }
-		if (rrbd->bo->flags & RADEON_BO_FLAGS_MICRO_TILE){
-            zbpitch |= R300_DEPTHMICROTILE_TILED;
-        }
-		BEGIN_BATCH_NO_AUTOSTATE(6);
-		OUT_BATCH_REGSEQ(R300_ZB_DEPTHOFFSET, 1);
-		OUT_BATCH_RELOC(0, rrbd->bo, 0, 0, RADEON_GEM_DOMAIN_VRAM, 0);
-		OUT_BATCH_REGSEQ(R300_ZB_DEPTHPITCH, 1);
-		if (!r300->radeon.radeonScreen->kernel_mm)
-			OUT_BATCH(zbpitch);
-		else
-			OUT_BATCH_RELOC(zbpitch, rrbd->bo, zbpitch, 0, RADEON_GEM_DOMAIN_VRAM, 0);
-		END_BATCH();
-	}
-#endif
-	BEGIN_BATCH_NO_AUTOSTATE(6);
-	OUT_BATCH_REGSEQ(RB3D_COLOR_CHANNEL_MASK, 1);
-	if (flags & CLEARBUFFER_COLOR) {
-		OUT_BATCH((ctx->Color.ColorMask[BCOMP] ? RB3D_COLOR_CHANNEL_MASK_BLUE_MASK0 : 0) |
-			  (ctx->Color.ColorMask[GCOMP] ? RB3D_COLOR_CHANNEL_MASK_GREEN_MASK0 : 0) |
-			  (ctx->Color.ColorMask[RCOMP] ? RB3D_COLOR_CHANNEL_MASK_RED_MASK0 : 0) |
-			  (ctx->Color.ColorMask[ACOMP] ? RB3D_COLOR_CHANNEL_MASK_ALPHA_MASK0 : 0));
-	} else {
-		OUT_BATCH(0);
-	}
-
-
-	{
-		uint32_t t1, t2;
-
-		t1 = 0x0;
-		t2 = 0x0;
-
-		if (flags & CLEARBUFFER_DEPTH) {
-			t1 |= R300_Z_ENABLE | R300_Z_WRITE_ENABLE;
-			t2 |=
-			    (R300_ZS_ALWAYS << R300_Z_FUNC_SHIFT);
-		}
-
-		if (flags & CLEARBUFFER_STENCIL) {
-			t1 |= R300_STENCIL_ENABLE;
-			t2 |=
-			    (R300_ZS_ALWAYS <<
-			     R300_S_FRONT_FUNC_SHIFT) |
-			    (R300_ZS_REPLACE <<
-			     R300_S_FRONT_SFAIL_OP_SHIFT) |
-			    (R300_ZS_REPLACE <<
-			     R300_S_FRONT_ZPASS_OP_SHIFT) |
-			    (R300_ZS_REPLACE <<
-			     R300_S_FRONT_ZFAIL_OP_SHIFT);
-		}
-
-		OUT_BATCH_REGSEQ(R300_ZB_CNTL, 3);
-		OUT_BATCH(t1);
-		OUT_BATCH(t2);
-		OUT_BATCH(((ctx->Stencil.WriteMask[0] & R300_STENCILREF_MASK) <<
-                   R300_STENCILWRITEMASK_SHIFT) |
-			  (ctx->Stencil.Clear & R300_STENCILREF_MASK));
-		END_BATCH();
-	}
-
-	if (!rmesa->radeon.radeonScreen->kernel_mm) {
-		BEGIN_BATCH_NO_AUTOSTATE(9);
-		OUT_BATCH(cmdpacket3(r300->radeon.radeonScreen, R300_CMD_PACKET3_CLEAR));
-		OUT_BATCH_FLOAT32(dPriv->w / 2.0);
-		OUT_BATCH_FLOAT32(dPriv->h / 2.0);
-		OUT_BATCH_FLOAT32(ctx->Depth.Clear);
-		OUT_BATCH_FLOAT32(1.0);
-		OUT_BATCH_FLOAT32(ctx->Color.ClearColor[0]);
-		OUT_BATCH_FLOAT32(ctx->Color.ClearColor[1]);
-		OUT_BATCH_FLOAT32(ctx->Color.ClearColor[2]);
-		OUT_BATCH_FLOAT32(ctx->Color.ClearColor[3]);
-		END_BATCH();
-	} else {
-		OUT_BATCH(CP_PACKET3(R200_3D_DRAW_IMMD_2, 8));
-		OUT_BATCH(R300_PRIM_TYPE_POINT | R300_PRIM_WALK_RING |
-			  (1 << R300_PRIM_NUM_VERTICES_SHIFT));
-		OUT_BATCH_FLOAT32(dPriv->w / 2.0);
-		OUT_BATCH_FLOAT32(dPriv->h / 2.0);
-		OUT_BATCH_FLOAT32(ctx->Depth.Clear);
-		OUT_BATCH_FLOAT32(1.0);
-		OUT_BATCH_FLOAT32(ctx->Color.ClearColor[0]);
-		OUT_BATCH_FLOAT32(ctx->Color.ClearColor[1]);
-		OUT_BATCH_FLOAT32(ctx->Color.ClearColor[2]);
-		OUT_BATCH_FLOAT32(ctx->Color.ClearColor[3]);
-	}
-
-	r300EmitCacheFlush(rmesa);
-	cp_wait(&r300->radeon, R300_WAIT_3D | R300_WAIT_3D_CLEAN);
-
-	R300_STATECHANGE(r300, cb);
-	R300_STATECHANGE(r300, cmk);
-	R300_STATECHANGE(r300, zs);
-}
-
-static void r300EmitClearState(GLcontext * ctx)
-{
-	r300ContextPtr r300 = R300_CONTEXT(ctx);
-	BATCH_LOCALS(&r300->radeon);
-	__DRIdrawablePrivate *dPriv = radeon_get_drawable(&r300->radeon);
-	int i;
-	int has_tcl;
-	int is_r500 = 0;
-	GLuint vap_cntl;
-
-	has_tcl = r300->options.hw_tcl_enabled;
-
-	if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515)
-		is_r500 = 1;
-
-	/* State atom dirty tracking is a little subtle here.
-	 *
-	 * On the one hand, we need to make sure base state is emitted
-	 * here if we start with an empty batch buffer, otherwise clear
-	 * works incorrectly with multiple processes. Therefore, the first
-	 * BEGIN_BATCH cannot be a BEGIN_BATCH_NO_AUTOSTATE.
-	 *
-	 * On the other hand, implicit state emission clears the state atom
-	 * dirty bits, so we have to call R300_STATECHANGE later than the
-	 * first BEGIN_BATCH.
-	 *
-	 * The final trickiness is that, because we change state, we need
-	 * to ensure that any stored swtcl primitives are flushed properly
-	 * before we start changing state. See the R300_NEWPRIM in r300Clear
-	 * for this.
-	 */
-	BEGIN_BATCH(31);
-	OUT_BATCH_REGSEQ(R300_VAP_PROG_STREAM_CNTL_0, 1);
-	if (!has_tcl)
-		OUT_BATCH(((((0 << R300_DST_VEC_LOC_SHIFT) | R300_DATA_TYPE_FLOAT_4) << R300_DATA_TYPE_0_SHIFT) |
-		 ((R300_LAST_VEC | (2 << R300_DST_VEC_LOC_SHIFT) | R300_DATA_TYPE_FLOAT_4) << R300_DATA_TYPE_1_SHIFT)));
-	else
-		OUT_BATCH(((((0 << R300_DST_VEC_LOC_SHIFT) | R300_DATA_TYPE_FLOAT_4) << R300_DATA_TYPE_0_SHIFT) |
-		 ((R300_LAST_VEC | (1 << R300_DST_VEC_LOC_SHIFT) | R300_DATA_TYPE_FLOAT_4) << R300_DATA_TYPE_1_SHIFT)));
-
-	OUT_BATCH_REGVAL(R300_FG_FOG_BLEND, 0);
-	OUT_BATCH_REGVAL(R300_VAP_PROG_STREAM_CNTL_EXT_0,
-	   ((((R300_SWIZZLE_SELECT_X << R300_SWIZZLE_SELECT_X_SHIFT) |
-	       (R300_SWIZZLE_SELECT_Y << R300_SWIZZLE_SELECT_Y_SHIFT) |
-	       (R300_SWIZZLE_SELECT_Z << R300_SWIZZLE_SELECT_Z_SHIFT) |
-	       (R300_SWIZZLE_SELECT_W << R300_SWIZZLE_SELECT_W_SHIFT) |
-	       ((R300_WRITE_ENA_X | R300_WRITE_ENA_Y | R300_WRITE_ENA_Z | R300_WRITE_ENA_W) << R300_WRITE_ENA_SHIFT))
-	      << R300_SWIZZLE0_SHIFT) |
-	     (((R300_SWIZZLE_SELECT_X << R300_SWIZZLE_SELECT_X_SHIFT) |
-	       (R300_SWIZZLE_SELECT_Y << R300_SWIZZLE_SELECT_Y_SHIFT) |
-	       (R300_SWIZZLE_SELECT_Z << R300_SWIZZLE_SELECT_Z_SHIFT) |
-	       (R300_SWIZZLE_SELECT_W << R300_SWIZZLE_SELECT_W_SHIFT) |
-	       ((R300_WRITE_ENA_X | R300_WRITE_ENA_Y | R300_WRITE_ENA_Z | R300_WRITE_ENA_W) << R300_WRITE_ENA_SHIFT))
-	      << R300_SWIZZLE1_SHIFT)));
-
-	/* R300_VAP_INPUT_CNTL_0, R300_VAP_INPUT_CNTL_1 */
-	OUT_BATCH_REGSEQ(R300_VAP_VTX_STATE_CNTL, 2);
-	OUT_BATCH((R300_SEL_USER_COLOR_0 << R300_COLOR_0_ASSEMBLY_SHIFT));
-	OUT_BATCH(R300_INPUT_CNTL_POS | R300_INPUT_CNTL_COLOR | R300_INPUT_CNTL_TC0);
-
-	/* comes from fglrx startup of clear */
-	OUT_BATCH_REGSEQ(R300_SE_VTE_CNTL, 2);
-	OUT_BATCH(R300_VTX_W0_FMT | R300_VPORT_X_SCALE_ENA |
-		  R300_VPORT_X_OFFSET_ENA | R300_VPORT_Y_SCALE_ENA |
-		  R300_VPORT_Y_OFFSET_ENA | R300_VPORT_Z_SCALE_ENA |
-		  R300_VPORT_Z_OFFSET_ENA);
-	OUT_BATCH(0x8);
-
-	OUT_BATCH_REGVAL(R300_VAP_PSC_SGN_NORM_CNTL, 0xaaaaaaaa);
-
-	OUT_BATCH_REGSEQ(R300_VAP_OUTPUT_VTX_FMT_0, 2);
-	OUT_BATCH(R300_VAP_OUTPUT_VTX_FMT_0__POS_PRESENT |
-		  R300_VAP_OUTPUT_VTX_FMT_0__COLOR_0_PRESENT);
-	OUT_BATCH(0); /* no textures */
-
-	OUT_BATCH_REGVAL(R300_TX_ENABLE, 0);
-
-	OUT_BATCH_REGSEQ(R300_SE_VPORT_XSCALE, 6);
-	OUT_BATCH_FLOAT32(1.0);
-	OUT_BATCH_FLOAT32(dPriv->x);
-	OUT_BATCH_FLOAT32(1.0);
-	OUT_BATCH_FLOAT32(dPriv->y);
-	OUT_BATCH_FLOAT32(1.0);
-	OUT_BATCH_FLOAT32(0.0);
-
-	OUT_BATCH_REGVAL(R300_FG_ALPHA_FUNC, 0);
-
-	OUT_BATCH_REGSEQ(R300_RB3D_CBLEND, 2);
-	OUT_BATCH(0x0);
-	OUT_BATCH(0x0);
-	END_BATCH();
-
-	R300_STATECHANGE(r300, vir[0]);
-	R300_STATECHANGE(r300, fogs);
-	R300_STATECHANGE(r300, vir[1]);
-	R300_STATECHANGE(r300, vic);
-	R300_STATECHANGE(r300, vte);
-	R300_STATECHANGE(r300, vof);
-	R300_STATECHANGE(r300, txe);
-	R300_STATECHANGE(r300, vpt);
-	R300_STATECHANGE(r300, at);
-	R300_STATECHANGE(r300, bld);
-	R300_STATECHANGE(r300, ps);
-
-	if (has_tcl) {
-		R300_STATECHANGE(r300, vap_clip_cntl);
-
-		BEGIN_BATCH_NO_AUTOSTATE(2);
-		OUT_BATCH_REGVAL(R300_VAP_CLIP_CNTL, R300_PS_UCP_MODE_CLIP_AS_TRIFAN | R300_CLIP_DISABLE);
-		END_BATCH();
-        }
-
-	BEGIN_BATCH_NO_AUTOSTATE(2);
-	OUT_BATCH_REGVAL(R300_GA_POINT_SIZE,
-		((dPriv->w * 6) << R300_POINTSIZE_X_SHIFT) |
-		((dPriv->h * 6) << R300_POINTSIZE_Y_SHIFT));
-	END_BATCH();
-
-	if (!is_r500) {
-		R300_STATECHANGE(r300, ri);
-		R300_STATECHANGE(r300, rc);
-		R300_STATECHANGE(r300, rr);
-
-		BEGIN_BATCH(14);
-		OUT_BATCH_REGSEQ(R300_RS_IP_0, 8);
-		for (i = 0; i < 8; ++i)
-			OUT_BATCH(R300_RS_SEL_T(1) | R300_RS_SEL_R(2) | R300_RS_SEL_Q(3));
-
-		OUT_BATCH_REGSEQ(R300_RS_COUNT, 2);
-		OUT_BATCH((1 << R300_IC_COUNT_SHIFT) | R300_HIRES_EN);
-		OUT_BATCH(0x0);
-
-		OUT_BATCH_REGVAL(R300_RS_INST_0, R300_RS_INST_COL_CN_WRITE);
-		END_BATCH();
-	} else {
-		R300_STATECHANGE(r300, ri);
-		R300_STATECHANGE(r300, rc);
-		R300_STATECHANGE(r300, rr);
-
-		BEGIN_BATCH(14);
-		OUT_BATCH_REGSEQ(R500_RS_IP_0, 8);
-		for (i = 0; i < 8; ++i) {
-			OUT_BATCH((R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_S_SHIFT) |
-				  (R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_T_SHIFT) |
-				  (R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_R_SHIFT) |
-				  (R500_RS_IP_PTR_K1 << R500_RS_IP_TEX_PTR_Q_SHIFT));
-		}
-
-		OUT_BATCH_REGSEQ(R300_RS_COUNT, 2);
-		OUT_BATCH((1 << R300_IC_COUNT_SHIFT) | R300_HIRES_EN);
-		OUT_BATCH(0x0);
-
-		OUT_BATCH_REGVAL(R500_RS_INST_0, R500_RS_INST_COL_CN_WRITE);
-		END_BATCH();
-	}
-
-	if (!is_r500) {
-		R300_STATECHANGE(r300, fp);
-		R300_STATECHANGE(r300, fpi[0]);
-		R300_STATECHANGE(r300, fpi[1]);
-		R300_STATECHANGE(r300, fpi[2]);
-		R300_STATECHANGE(r300, fpi[3]);
-
-		BEGIN_BATCH(17);
-		OUT_BATCH_REGSEQ(R300_US_CONFIG, 3);
-		OUT_BATCH(0x0);
-		OUT_BATCH(0x0);
-		OUT_BATCH(0x0);
-		OUT_BATCH_REGSEQ(R300_US_CODE_ADDR_0, 4);
-		OUT_BATCH(0x0);
-		OUT_BATCH(0x0);
-		OUT_BATCH(0x0);
-		OUT_BATCH(R300_RGBA_OUT);
-
-		OUT_BATCH_REGVAL(R300_US_ALU_RGB_INST_0,
-			FP_INSTRC(MAD, FP_ARGC(SRC0C_XYZ), FP_ARGC(ONE), FP_ARGC(ZERO)));
-		OUT_BATCH_REGVAL(R300_US_ALU_RGB_ADDR_0,
-			FP_SELC(0, NO, XYZ, FP_TMP(0), 0, 0));
-		OUT_BATCH_REGVAL(R300_US_ALU_ALPHA_INST_0,
-			FP_INSTRA(MAD, FP_ARGA(SRC0A), FP_ARGA(ONE), FP_ARGA(ZERO)));
-		OUT_BATCH_REGVAL(R300_US_ALU_ALPHA_ADDR_0,
-			FP_SELA(0, NO, W, FP_TMP(0), 0, 0));
-		END_BATCH();
-	} else {
-		struct radeon_state_atom r500fp;
-		uint32_t _cmd[10];
-
-		R300_STATECHANGE(r300, fp);
-		R300_STATECHANGE(r300, r500fp);
-
-		BEGIN_BATCH(7);
-		OUT_BATCH_REGSEQ(R500_US_CONFIG, 2);
-		OUT_BATCH(R500_ZERO_TIMES_ANYTHING_EQUALS_ZERO);
-		OUT_BATCH(0x0);
-		OUT_BATCH_REGSEQ(R500_US_CODE_ADDR, 3);
-		OUT_BATCH(R500_US_CODE_START_ADDR(0) | R500_US_CODE_END_ADDR(1));
-		OUT_BATCH(R500_US_CODE_RANGE_ADDR(0) | R500_US_CODE_RANGE_SIZE(1));
-		OUT_BATCH(R500_US_CODE_OFFSET_ADDR(0));
-		END_BATCH();
-
-		r500fp.check = check_r500fp;
-		r500fp.cmd = _cmd;
-		r500fp.cmd[0] = cmdr500fp(r300->radeon.radeonScreen, 0, 1, 0, 0);
-		r500fp.cmd[1] = R500_INST_TYPE_OUT |
-			R500_INST_TEX_SEM_WAIT |
-			R500_INST_LAST |
-			R500_INST_RGB_OMASK_R |
-			R500_INST_RGB_OMASK_G |
-			R500_INST_RGB_OMASK_B |
-			R500_INST_ALPHA_OMASK |
-			R500_INST_RGB_CLAMP |
-			R500_INST_ALPHA_CLAMP;
-		r500fp.cmd[2] = R500_RGB_ADDR0(0) |
-			R500_RGB_ADDR1(0) |
-			R500_RGB_ADDR1_CONST |
-			R500_RGB_ADDR2(0) |
-			R500_RGB_ADDR2_CONST;
-		r500fp.cmd[3] = R500_ALPHA_ADDR0(0) |
-			R500_ALPHA_ADDR1(0) |
-			R500_ALPHA_ADDR1_CONST |
-			R500_ALPHA_ADDR2(0) |
-			R500_ALPHA_ADDR2_CONST;
-		r500fp.cmd[4] = R500_ALU_RGB_SEL_A_SRC0 |
-			R500_ALU_RGB_R_SWIZ_A_R |
-			R500_ALU_RGB_G_SWIZ_A_G |
-			R500_ALU_RGB_B_SWIZ_A_B |
-			R500_ALU_RGB_SEL_B_SRC0 |
-			R500_ALU_RGB_R_SWIZ_B_R |
-			R500_ALU_RGB_B_SWIZ_B_G |
-			R500_ALU_RGB_G_SWIZ_B_B;
-		r500fp.cmd[5] = R500_ALPHA_OP_CMP |
-			R500_ALPHA_SWIZ_A_A |
-			R500_ALPHA_SWIZ_B_A;
-		r500fp.cmd[6] = R500_ALU_RGBA_OP_CMP |
-			R500_ALU_RGBA_R_SWIZ_0 |
-			R500_ALU_RGBA_G_SWIZ_0 |
-			R500_ALU_RGBA_B_SWIZ_0 |
-			R500_ALU_RGBA_A_SWIZ_0;
-
-		r500fp.cmd[7] = 0;
-		if (r300->radeon.radeonScreen->kernel_mm) {
-			emit_r500fp(ctx, &r500fp);
-		} else {
-			int dwords = r500fp.check(ctx,&r500fp);
-			BEGIN_BATCH_NO_AUTOSTATE(dwords);
-			OUT_BATCH_TABLE(r500fp.cmd, dwords);
-			END_BATCH();
-		}
-
-	}
-
-	BEGIN_BATCH(2);
-	OUT_BATCH_REGVAL(R300_VAP_PVS_STATE_FLUSH_REG, 0);
-	END_BATCH();
-
-	if (has_tcl) {
-		vap_cntl = ((10 << R300_PVS_NUM_SLOTS_SHIFT) |
-			(5 << R300_PVS_NUM_CNTLRS_SHIFT) |
-			(12 << R300_VF_MAX_VTX_NUM_SHIFT));
-		if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515)
-			vap_cntl |= R500_TCL_STATE_OPTIMIZATION;
-	} else {
-		vap_cntl = ((10 << R300_PVS_NUM_SLOTS_SHIFT) |
-			(5 << R300_PVS_NUM_CNTLRS_SHIFT) |
-			(5 << R300_VF_MAX_VTX_NUM_SHIFT));
-	}
-
-	if (r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV515)
-		vap_cntl |= (2 << R300_PVS_NUM_FPUS_SHIFT);
-	else if ((r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV530) ||
-		 (r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV560) ||
-		 (r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV570))
-		vap_cntl |= (5 << R300_PVS_NUM_FPUS_SHIFT);
-	else if ((r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV410) ||
-		 (r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_R420))
-		vap_cntl |= (6 << R300_PVS_NUM_FPUS_SHIFT);
-	else if ((r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_R520) ||
-		 (r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_R580))
-		vap_cntl |= (8 << R300_PVS_NUM_FPUS_SHIFT);
-	else
-		vap_cntl |= (4 << R300_PVS_NUM_FPUS_SHIFT);
-
-	R300_STATECHANGE(r300, vap_cntl);
-
-	BEGIN_BATCH(2);
-	OUT_BATCH_REGVAL(R300_VAP_CNTL, vap_cntl);
-	END_BATCH();
-
-	if (has_tcl) {
-        struct radeon_state_atom vpu;
-        uint32_t _cmd[10];
-		R300_STATECHANGE(r300, pvs);
-		R300_STATECHANGE(r300, vap_flush);
-		R300_STATECHANGE(r300, vpi);
-
-		BEGIN_BATCH(4);
-		OUT_BATCH_REGSEQ(R300_VAP_PVS_CODE_CNTL_0, 3);
-		OUT_BATCH((0 << R300_PVS_FIRST_INST_SHIFT) |
-			  (0 << R300_PVS_XYZW_VALID_INST_SHIFT) |
-			  (1 << R300_PVS_LAST_INST_SHIFT));
-		OUT_BATCH((0 << R300_PVS_CONST_BASE_OFFSET_SHIFT) |
-			  (0 << R300_PVS_MAX_CONST_ADDR_SHIFT));
-		OUT_BATCH(1 << R300_PVS_LAST_VTX_SRC_INST_SHIFT);
-		END_BATCH();
-
-		vpu.check = check_vpu;
-		vpu.cmd = _cmd;
-		vpu.cmd[0] = cmdvpu(r300->radeon.radeonScreen, 0, 2);
-
-		vpu.cmd[1] = PVS_OP_DST_OPERAND(VE_ADD, GL_FALSE, GL_FALSE,
-                                         0, 0xf, PVS_DST_REG_OUT);
-		vpu.cmd[2] = PVS_SRC_OPERAND(0, PVS_SRC_SELECT_X, PVS_SRC_SELECT_Y,
-                                      PVS_SRC_SELECT_Z, PVS_SRC_SELECT_W,
-                                      PVS_SRC_REG_INPUT, NEGATE_NONE);
-		vpu.cmd[3] = PVS_SRC_OPERAND(0, PVS_SRC_SELECT_FORCE_0,
-                                      PVS_SRC_SELECT_FORCE_0,
-                                      PVS_SRC_SELECT_FORCE_0,
-                                      PVS_SRC_SELECT_FORCE_0,
-                                      PVS_SRC_REG_INPUT, NEGATE_NONE);
-		vpu.cmd[4] = 0x0;
-
-		vpu.cmd[5] = PVS_OP_DST_OPERAND(VE_ADD, GL_FALSE, GL_FALSE, 1, 0xf,
-                                         PVS_DST_REG_OUT);
-		vpu.cmd[6] = PVS_SRC_OPERAND(1, PVS_SRC_SELECT_X,
-                                      PVS_SRC_SELECT_Y, PVS_SRC_SELECT_Z,
-                                      PVS_SRC_SELECT_W, PVS_SRC_REG_INPUT,
-                                      NEGATE_NONE);
-		vpu.cmd[7] = PVS_SRC_OPERAND(1, PVS_SRC_SELECT_FORCE_0,
-                                      PVS_SRC_SELECT_FORCE_0,
-                                      PVS_SRC_SELECT_FORCE_0,
-                                      PVS_SRC_SELECT_FORCE_0,
-                                      PVS_SRC_REG_INPUT, NEGATE_NONE);
-		vpu.cmd[8] = 0x0;
-
-		if (r300->radeon.radeonScreen->kernel_mm) {
-			int dwords = r300->hw.vap_flush.check(ctx,&r300->hw.vap_flush);
-			BEGIN_BATCH_NO_AUTOSTATE(dwords);
-			OUT_BATCH_TABLE(r300->hw.vap_flush.cmd, dwords);
-			END_BATCH();
-			emit_vpu(ctx, &vpu);
-		} else {
-			int dwords = vpu.check(ctx,&vpu);
-			BEGIN_BATCH_NO_AUTOSTATE(dwords);
-			OUT_BATCH_TABLE(vpu.cmd, dwords);
-			END_BATCH();
-		}
-
-	}
-}
-
-static int r300KernelClear(GLcontext *ctx, GLuint flags)
-{
-	r300ContextPtr r300 = R300_CONTEXT(ctx);
-	__DRIdrawablePrivate *dPriv = radeon_get_drawable(&r300->radeon);
-	struct radeon_framebuffer *rfb = dPriv->driverPrivate;
-	struct radeon_renderbuffer *rrb;
-	struct radeon_renderbuffer *rrbd;
-	int bits = 0, ret;
-
-	/* Make sure it fits there. */
-	radeon_cs_space_reset_bos(r300->radeon.cmdbuf.cs);
-
-	if (flags & BUFFER_BIT_COLOR0) {
-		rrb = radeon_get_renderbuffer(&rfb->base, BUFFER_COLOR0);
-		radeon_cs_space_add_persistent_bo(r300->radeon.cmdbuf.cs,
-						  rrb->bo, 0, RADEON_GEM_DOMAIN_VRAM);
-	}
-
-	if (flags & BUFFER_BIT_FRONT_LEFT) {
-		rrb = radeon_get_renderbuffer(&rfb->base, BUFFER_FRONT_LEFT);
-		radeon_cs_space_add_persistent_bo(r300->radeon.cmdbuf.cs,
-						  rrb->bo, 0, RADEON_GEM_DOMAIN_VRAM);
-	}
-
-	if (flags & BUFFER_BIT_BACK_LEFT) {
-		rrb = radeon_get_renderbuffer(&rfb->base, BUFFER_BACK_LEFT);
-		radeon_cs_space_add_persistent_bo(r300->radeon.cmdbuf.cs,
-						  rrb->bo, 0, RADEON_GEM_DOMAIN_VRAM);
-	}
-
-	rrbd = radeon_get_renderbuffer(&rfb->base, BUFFER_DEPTH);
-	if (rrbd) {
-		radeon_cs_space_add_persistent_bo(r300->radeon.cmdbuf.cs,
-						  rrbd->bo, 0, RADEON_GEM_DOMAIN_VRAM);
-	}
-
-	ret = radeon_cs_space_check(r300->radeon.cmdbuf.cs);
-	if (ret)
-	  return -1;
-
-	rcommonEnsureCmdBufSpace(&r300->radeon, 421 * 3, __FUNCTION__);
-	if (flags || bits)
-		r300EmitClearState(ctx);
-
-	rrbd = radeon_get_renderbuffer(&rfb->base, BUFFER_DEPTH);
-	if (rrbd && (flags & BUFFER_BIT_DEPTH))
-		bits |= CLEARBUFFER_DEPTH;
-
-	if (rrbd && (flags & BUFFER_BIT_STENCIL))
-		bits |= CLEARBUFFER_STENCIL;
-
-	if (flags & BUFFER_BIT_COLOR0) {
-		rrb = radeon_get_renderbuffer(&rfb->base, BUFFER_COLOR0);
-		r300ClearBuffer(r300, CLEARBUFFER_COLOR, rrb, NULL);
-		bits = 0;
-	}
-
-	if (flags & BUFFER_BIT_FRONT_LEFT) {
-		rrb = radeon_get_renderbuffer(&rfb->base, BUFFER_FRONT_LEFT);
-		r300ClearBuffer(r300, bits | CLEARBUFFER_COLOR, rrb, rrbd);
-		bits = 0;
-	}
-
-	if (flags & BUFFER_BIT_BACK_LEFT) {
-		rrb = radeon_get_renderbuffer(&rfb->base, BUFFER_BACK_LEFT);
-		r300ClearBuffer(r300, bits | CLEARBUFFER_COLOR, rrb, rrbd);
-		bits = 0;
-	}
-
-	if (bits)
-		r300ClearBuffer(r300, bits, NULL, rrbd);
-
-	COMMIT_BATCH();
-	return 0;
-}
-
-/**
- * Buffer clear
- */
-static void r300Clear(GLcontext * ctx, GLbitfield mask)
-{
-	r300ContextPtr r300 = R300_CONTEXT(ctx);
-	__DRIdrawablePrivate *dPriv = radeon_get_drawable(&r300->radeon);
-	const GLuint colorMask = *((GLuint *) & ctx->Color.ColorMask);
-	GLbitfield swrast_mask = 0, tri_mask = 0;
-	int i, ret;
-	struct gl_framebuffer *fb = ctx->DrawBuffer;
-
-	if (RADEON_DEBUG & RADEON_IOCTL)
-		fprintf(stderr, "r300Clear\n");
-
-	if (!r300->radeon.radeonScreen->driScreen->dri2.enabled) {
-		LOCK_HARDWARE(&r300->radeon);
-		UNLOCK_HARDWARE(&r300->radeon);
-		if (dPriv->numClipRects == 0)
-			return;
-	}
-
-	/* Flush swtcl vertices if necessary, because we will change hardware
-	 * state during clear. See also the state-related comment in
-	 * r300EmitClearState.
-	 */
-	R300_NEWPRIM(r300);
-
-	if (colorMask == ~0)
-	  tri_mask |= (mask & BUFFER_BITS_COLOR);
-	else
-	  tri_mask |= (mask & (BUFFER_BIT_FRONT_LEFT | BUFFER_BIT_BACK_LEFT));
-
-
-	/* HW stencil */
-	if (mask & BUFFER_BIT_STENCIL) {
-		tri_mask |= BUFFER_BIT_STENCIL;
-	}
-
-	/* HW depth */
-	if (mask & BUFFER_BIT_DEPTH) {
-    	        tri_mask |= BUFFER_BIT_DEPTH;
-	}
-
-	/* If we're doing a tri pass for depth/stencil, include a likely color
-	 * buffer with it.
-	 */
-
-	for (i = 0; i < BUFFER_COUNT; i++) {
-	  GLuint bufBit = 1 << i;
-	  if ((tri_mask) & bufBit) {
-	    if (!fb->Attachment[i].Renderbuffer->ClassID) {
-	      tri_mask &= ~bufBit;
-	      swrast_mask |= bufBit;
-	    }
-	  }
-	}
-
-	/* SW fallback clearing */
-	swrast_mask = mask & ~tri_mask;
-
-	ret = 0;
-	if (tri_mask) {
-		if (r300->radeon.radeonScreen->kernel_mm)
-			radeonUserClear(ctx, tri_mask);
-		else {
-			/* if kernel clear fails due to size restraints fallback */
-			ret = r300KernelClear(ctx, tri_mask);
-			if (ret < 0)
-				swrast_mask |= tri_mask;
-		}
-	}
-
-	if (swrast_mask) {
-		if (RADEON_DEBUG & RADEON_FALLBACKS)
-			fprintf(stderr, "%s: swrast clear, mask: %x\n",
-				__FUNCTION__, swrast_mask);
-		_swrast_Clear(ctx, swrast_mask);
-	}
-}
-
-void r300InitIoctlFuncs(struct dd_function_table *functions)
-{
-	functions->Clear = r300Clear;
-	functions->Finish = radeonFinish;
-	functions->Flush = radeonFlush;
-}
diff --git a/src/mesa/drivers/dri/r300/r300_ioctl.h b/src/mesa/drivers/dri/r300/r300_ioctl.h
deleted file mode 100644
index 3abfa71a6e..0000000000
--- a/src/mesa/drivers/dri/r300/r300_ioctl.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
-Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
-
-The Weather Channel (TM) funded Tungsten Graphics to develop the
-initial release of the Radeon 8500 driver under the XFree86 license.
-This notice must be preserved.
-
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of this software and associated documentation files (the
-"Software"), to deal in the Software without restriction, including
-without limitation the rights to use, copy, modify, merge, publish,
-distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to
-the following conditions:
-
-The above copyright notice and this permission notice (including the
-next paragraph) shall be included in all copies or substantial
-portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
-LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
-OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-**************************************************************************/
-
-/*
- * Authors:
- *   Keith Whitwell <keith@tungstengraphics.com>
- *   Nicolai Haehnle <prefect_@gmx.net>
- */
-
-#ifndef __R300_IOCTL_H__
-#define __R300_IOCTL_H__
-
-#include "r300_context.h"
-#include "radeon_drm.h"
-
-extern void r300InitIoctlFuncs(struct dd_function_table *functions);
-
-#endif				/* __R300_IOCTL_H__ */
diff --git a/src/mesa/drivers/dri/r300/r300_reg.h b/src/mesa/drivers/dri/r300/r300_reg.h
index 623da60333..ea684e7df1 100644
--- a/src/mesa/drivers/dri/r300/r300_reg.h
+++ b/src/mesa/drivers/dri/r300/r300_reg.h
@@ -1789,6 +1789,7 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #       define R300_ALU_DSTC_OUTPUT_X           (1 << 26)
 #       define R300_ALU_DSTC_OUTPUT_Y           (1 << 27)
 #       define R300_ALU_DSTC_OUTPUT_Z           (1 << 28)
+#       define R300_RGB_TARGET(x)               ((x) << 29)
 
 #define R300_US_ALU_ALPHA_ADDR_0                 0x47C0
 #       define R300_ALU_SRC0A_SHIFT             0
@@ -1806,6 +1807,7 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #       define R300_ALU_DSTA_REG                (1 << 23)
 #       define R300_ALU_DSTA_OUTPUT             (1 << 24)
 #		define R300_ALU_DSTA_DEPTH              (1 << 27)
+#		define R300_ALPHA_TARGET(x)             ((x) << 25)
 
 #define R300_US_ALU_RGB_INST_0                   0x48C0
 #       define R300_ALU_ARGC_SRC0C_XYZ          0
diff --git a/src/mesa/drivers/dri/r300/r300_render.c b/src/mesa/drivers/dri/r300/r300_render.c
index 3cd38753b8..02c94250a8 100644
--- a/src/mesa/drivers/dri/r300/r300_render.c
+++ b/src/mesa/drivers/dri/r300/r300_render.c
@@ -67,10 +67,7 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "vbo/vbo_split.h"
 #include "tnl/tnl.h"
 #include "tnl/t_vp_build.h"
-#include "radeon_reg.h"
-#include "radeon_macros.h"
 #include "r300_context.h"
-#include "r300_ioctl.h"
 #include "r300_state.h"
 #include "r300_reg.h"
 #include "r300_tex.h"
diff --git a/src/mesa/drivers/dri/r300/r300_state.c b/src/mesa/drivers/dri/r300/r300_state.c
index ac20c08e20..c51285aad9 100644
--- a/src/mesa/drivers/dri/r300/r300_state.c
+++ b/src/mesa/drivers/dri/r300/r300_state.c
@@ -55,7 +55,6 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "tnl/t_vp_build.h"
 
 #include "r300_context.h"
-#include "r300_ioctl.h"
 #include "r300_state.h"
 #include "r300_reg.h"
 #include "r300_emit.h"
@@ -998,7 +997,7 @@ static void r300StencilOpSeparate(GLcontext * ctx, GLenum face,
 static void r300UpdateWindow(GLcontext * ctx)
 {
 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	__DRIdrawablePrivate *dPriv = radeon_get_drawable(&rmesa->radeon);
+	__DRIdrawable *dPriv = radeon_get_drawable(&rmesa->radeon);
 	GLfloat xoffset = dPriv ? (GLfloat) dPriv->x : 0;
 	GLfloat yoffset = dPriv ? (GLfloat) dPriv->y + dPriv->h : 0;
 	const GLfloat *v = ctx->Viewport._WindowMap.m;
@@ -1051,7 +1050,7 @@ static void r300DepthRange(GLcontext * ctx, GLclampd nearval, GLclampd farval)
 void r300UpdateViewportOffset(GLcontext * ctx)
 {
 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	__DRIdrawablePrivate *dPriv = radeon_get_drawable(&rmesa->radeon);
+	__DRIdrawable *dPriv = radeon_get_drawable(&rmesa->radeon);
 	GLfloat xoffset = (GLfloat) dPriv->x;
 	GLfloat yoffset = (GLfloat) dPriv->y + dPriv->h;
 	const GLfloat *v = ctx->Viewport._WindowMap.m;
@@ -1312,7 +1311,7 @@ static void r300SetupTextures(GLcontext * ctx)
 		fprintf(stderr,
 			"Aiiee ! mtu=%d is greater than R300_MAX_TEXTURE_UNITS=%d\n",
 			mtu, R300_MAX_TEXTURE_UNITS);
-		_mesa_exit(-1);
+		exit(-1);
 	}
 
 	/* We cannot let disabled tmu offsets pass DRM */
@@ -1769,9 +1768,10 @@ static void r300ResetHwState(r300ContextPtr r300)
 	radeon_firevertices(&r300->radeon);
 
 	r300ColorMask(ctx,
-		      ctx->Color.ColorMask[RCOMP],
-		      ctx->Color.ColorMask[GCOMP],
-		      ctx->Color.ColorMask[BCOMP], ctx->Color.ColorMask[ACOMP]);
+		      ctx->Color.ColorMask[0][RCOMP],
+		      ctx->Color.ColorMask[0][GCOMP],
+		      ctx->Color.ColorMask[0][BCOMP],
+                      ctx->Color.ColorMask[0][ACOMP]);
 
 	r300Enable(ctx, GL_DEPTH_TEST, ctx->Depth.Test);
 	r300DepthMask(ctx, ctx->Depth.Mask);
@@ -2040,7 +2040,7 @@ static const GLfloat *get_fragmentprogram_constant(GLcontext *ctx, GLuint index,
 		}
 
 		case RC_STATE_R300_WINDOW_DIMENSION: {
-			__DRIdrawablePrivate * drawable = radeon_get_drawable(&rmesa->radeon);
+			__DRIdrawable * drawable = radeon_get_drawable(&rmesa->radeon);
 			buffer[0] = drawable->w * 0.5f;	/* width*0.5 */
 			buffer[1] = drawable->h * 0.5f;	/* height*0.5 */
 			buffer[2] = 0.5F;	/* for moving range [-1 1] -> [0 1] */
diff --git a/src/mesa/drivers/dri/r300/r300_swtcl.c b/src/mesa/drivers/dri/r300/r300_swtcl.c
index ee2c71e1a7..93983cee20 100644
--- a/src/mesa/drivers/dri/r300/r300_swtcl.c
+++ b/src/mesa/drivers/dri/r300/r300_swtcl.c
@@ -124,7 +124,7 @@ void r300ChooseSwtclVertexFormat(GLcontext *ctx, GLuint *_InputsRead,  GLuint *_
 	}
 
 	if (ctx->Light.Enabled && ctx->Light.Model.TwoSide) {
-		VB->AttribPtr[VERT_ATTRIB_GENERIC0] = VB->ColorPtr[1];
+		VB->AttribPtr[VERT_ATTRIB_GENERIC0] = VB->BackfaceColorPtr;
 		OutputsWritten |= 1 << VERT_RESULT_BFC0;
 #if MESA_LITTLE_ENDIAN
 		EMIT_ATTR( _TNL_ATTRIB_GENERIC0, EMIT_4UB_4F_RGBA );
@@ -134,7 +134,7 @@ void r300ChooseSwtclVertexFormat(GLcontext *ctx, GLuint *_InputsRead,  GLuint *_
 		ADD_ATTR(VERT_ATTRIB_GENERIC0, R300_DATA_TYPE_BYTE, SWTCL_OVM_COLOR2, SWIZZLE_XYZW, MASK_XYZW, 1);
 #endif
 		if (fp_reads & FRAG_BIT_COL1) {
-			VB->AttribPtr[VERT_ATTRIB_GENERIC1] = VB->SecondaryColorPtr[1];
+			VB->AttribPtr[VERT_ATTRIB_GENERIC1] = VB->BackfaceSecondaryColorPtr;
 			GLuint swiz = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ONE);
 			OutputsWritten |= 1 << VERT_RESULT_BFC1;
 #if MESA_LITTLE_ENDIAN
@@ -159,7 +159,7 @@ void r300ChooseSwtclVertexFormat(GLcontext *ctx, GLuint *_InputsRead,  GLuint *_
 		int tex_id = rmesa->selected_fp->wpos_attr - FRAG_ATTRIB_TEX0;
 
 		VB->AttribPtr[VERT_ATTRIB_TEX0 + tex_id] = VB->AttribPtr[VERT_ATTRIB_POS];
-		VB->TexCoordPtr[tex_id] = VB->AttribPtr[VERT_ATTRIB_POS];
+		VB->AttribPtr[_TNL_ATTRIB_TEX0 + tex_id] = VB->AttribPtr[VERT_ATTRIB_POS];
 		RENDERINPUTS_SET(tnl->render_inputs_bitset, _TNL_ATTRIB_TEX0 + tex_id);
 	}
 
@@ -167,7 +167,7 @@ void r300ChooseSwtclVertexFormat(GLcontext *ctx, GLuint *_InputsRead,  GLuint *_
 		int tex_id = rmesa->selected_fp->fog_attr - FRAG_ATTRIB_TEX0;
 
 		VB->AttribPtr[VERT_ATTRIB_TEX0 + tex_id] = VB->AttribPtr[VERT_ATTRIB_FOG];
-		VB->TexCoordPtr[tex_id] = VB->AttribPtr[VERT_ATTRIB_FOG];
+		VB->AttribPtr[_TNL_ATTRIB_TEX0 + tex_id] = VB->AttribPtr[VERT_ATTRIB_FOG];
 		RENDERINPUTS_SET(tnl->render_inputs_bitset, _TNL_ATTRIB_TEX0 + tex_id);
 	}
 
@@ -180,7 +180,7 @@ void r300ChooseSwtclVertexFormat(GLcontext *ctx, GLuint *_InputsRead,  GLuint *_
 		GLuint swiz, format, hw_format;
 		for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
 			if (fp_reads & FRAG_BIT_TEX(i)) {
-				switch (VB->TexCoordPtr[i]->size) {
+				switch (VB->AttribPtr[_TNL_ATTRIB_TEX0 + i]->size) {
 					case 1:
 						format = EMIT_1F;
 						hw_format = R300_DATA_TYPE_FLOAT_1;
@@ -215,7 +215,7 @@ void r300ChooseSwtclVertexFormat(GLcontext *ctx, GLuint *_InputsRead,  GLuint *_
 
 	if (first_free_tex >= ctx->Const.MaxTextureUnits) {
 		fprintf(stderr, "\tout of free texcoords to write fog coordinate\n");
-		_mesa_exit(-1);
+		exit(-1);
 	}
 
 	R300_NEWPRIM(rmesa);
@@ -665,11 +665,11 @@ void r300_swtcl_flush(GLcontext *ctx, uint32_t current_offset)
 	r300EmitCacheFlush(rmesa);
 
 	radeonEmitState(&rmesa->radeon);
-    r300_emit_scissor(ctx);
+	r300_emit_scissor(ctx);
 	r300EmitVertexAOS(rmesa,
-			rmesa->radeon.swtcl.vertex_size,
-			first_elem(&rmesa->radeon.dma.reserved)->bo,
-			current_offset);
+			  rmesa->radeon.swtcl.vertex_size,
+			  rmesa->radeon.swtcl.bo,
+			  current_offset);
 
 	r300EmitVbufPrim(rmesa,
 		   rmesa->radeon.swtcl.hw_primitive,
diff --git a/src/mesa/drivers/dri/r300/r300_tex.c b/src/mesa/drivers/dri/r300/r300_tex.c
index 27b78a912f..963f648cb1 100644
--- a/src/mesa/drivers/dri/r300/r300_tex.c
+++ b/src/mesa/drivers/dri/r300/r300_tex.c
@@ -48,7 +48,6 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 #include "r300_context.h"
 #include "r300_state.h"
-#include "r300_ioctl.h"
 #include "radeon_mipmap_tree.h"
 #include "r300_tex.h"
 
@@ -216,23 +215,14 @@ static void r300TexParameter(GLcontext * ctx, GLenum target,
 		break;
 
 	case GL_TEXTURE_BORDER_COLOR:
-		r300SetTexBorderColor(t, texObj->BorderColor);
+		r300SetTexBorderColor(t, texObj->BorderColor.f);
 		break;
 
 	case GL_TEXTURE_BASE_LEVEL:
 	case GL_TEXTURE_MAX_LEVEL:
 	case GL_TEXTURE_MIN_LOD:
 	case GL_TEXTURE_MAX_LOD:
-		/* This isn't the most efficient solution but there doesn't appear to
-		 * be a nice alternative.  Since there's no LOD clamping,
-		 * we just have to rely on loading the right subset of mipmap levels
-		 * to simulate a clamped LOD.
-		 */
-		if (t->mt) {
-			radeon_miptree_unreference(t->mt);
-			t->mt = 0;
-			t->validated = GL_FALSE;
-		}
+		t->validated = GL_FALSE;
 		break;
 
 	case GL_DEPTH_TEXTURE_MODE:
@@ -270,7 +260,11 @@ static void r300DeleteTexture(GLcontext * ctx, struct gl_texture_object *texObj)
 
 	if (rmesa) {
 		int i;
-		radeon_firevertices(&rmesa->radeon);
+		struct radeon_bo *bo;
+		bo = !t->mt ? t->bo : t->mt->bo;
+		if (bo && radeon_bo_is_referenced_by_cs(bo, rmesa->radeon.cmdbuf.cs)) {
+			radeon_firevertices(&rmesa->radeon);
+		}
 
 		for(i = 0; i < R300_MAX_TEXTURE_UNITS; ++i)
 			if (rmesa->hw.textures[i] == t)
@@ -282,10 +276,8 @@ static void r300DeleteTexture(GLcontext * ctx, struct gl_texture_object *texObj)
 		t->bo = NULL;
 	}
 
-	if (t->mt) {
-		radeon_miptree_unreference(t->mt);
-		t->mt = 0;
-	}
+	radeon_miptree_unreference(&t->mt);
+
 	_mesa_delete_texture_object(ctx, texObj);
 }
 
@@ -315,7 +307,7 @@ static struct gl_texture_object *r300NewTextureObject(GLcontext * ctx,
 	/* Initialize hardware state */
 	r300UpdateTexWrap(t);
 	r300SetTexFilter(t, t->base.MinFilter, t->base.MagFilter, t->base.MaxAnisotropy);
-	r300SetTexBorderColor(t, t->base.BorderColor);
+	r300SetTexBorderColor(t, t->base.BorderColor.f);
 
 	return &t->base;
 }
diff --git a/src/mesa/drivers/dri/r300/r300_tex.h b/src/mesa/drivers/dri/r300/r300_tex.h
index 8a653ea2d1..6ede0fe25c 100644
--- a/src/mesa/drivers/dri/r300/r300_tex.h
+++ b/src/mesa/drivers/dri/r300/r300_tex.h
@@ -51,4 +51,6 @@ extern GLboolean r300ValidateBuffers(GLcontext * ctx);
 
 extern void r300InitTextureFuncs(struct dd_function_table *functions);
 
+int32_t r300TranslateTexFormat(gl_format mesaFormat);
+
 #endif				/* __r300_TEX_H__ */
diff --git a/src/mesa/drivers/dri/r300/r300_texcopy.c b/src/mesa/drivers/dri/r300/r300_texcopy.c
new file mode 100644
index 0000000000..ebc9c05b8a
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/r300_texcopy.c
@@ -0,0 +1,168 @@
+/*
+ * Copyright (C) 2009 Maciej Cencora <m.cencora@gmail.com>
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "radeon_common.h"
+#include "r300_context.h"
+
+#include "main/image.h"
+#include "main/teximage.h"
+#include "main/texstate.h"
+#include "drivers/common/meta.h"
+
+#include "radeon_mipmap_tree.h"
+#include "r300_blit.h"
+#include <main/debug.h>
+
+/* Blit a width x height rectangle at (x, y) of the depth/color renderbuffer
+ * to (dstx, dsty) in the face/level image of the texture's miptree.  Returns
+ * r300_blit()'s result, or GL_FALSE when the blit cannot be set up.
+ * TODO: need to pass correct pitch for small dst textures! */
+static GLboolean
+do_copy_texsubimage(GLcontext *ctx,
+                    GLenum target, GLint level,
+                    struct radeon_tex_obj *tobj,
+                    radeon_texture_image *timg,
+                    GLint dstx, GLint dsty, GLint x, GLint y,
+                    GLsizei width, GLsizei height)
+{
+    struct r300_context *r300 = R300_CONTEXT(ctx);
+    struct radeon_renderbuffer *rrb;
+
+    if (_mesa_get_format_bits(timg->base.TexFormat, GL_DEPTH_BITS) > 0) {
+        rrb = radeon_get_depthbuffer(&r300->radeon);
+    } else {
+        rrb = radeon_get_colorbuffer(&r300->radeon);
+    }
+
+    if (!timg->mt)
+        radeon_validate_texture_miptree(ctx, &tobj->base);
+
+    /* asserts vanish under NDEBUG: fall back rather than dereference NULL */
+    if (!rrb || !rrb->bo || !timg->mt || !timg->mt->bo)
+        return GL_FALSE;
+    assert(timg->base.Width >= dstx + width);
+    assert(timg->base.Height >= dsty + height);
+
+    intptr_t src_offset = rrb->draw_offset;
+    intptr_t dst_offset = radeon_miptree_image_offset(timg->mt, _mesa_tex_target_to_face(target), level);
+
+    if (src_offset % 32 || dst_offset % 32) { /* offsets must be multiples of 32 */
+        return GL_FALSE;
+    }
+
+    if (0) {
+        fprintf(stderr, "%s: copying to face %d, level %d\n",
+                __FUNCTION__, _mesa_tex_target_to_face(target), level);
+        fprintf(stderr, "to: x %d, y %d, offset %d\n", dstx, dsty, (uint32_t) dst_offset);
+        fprintf(stderr, "from (%dx%d) width %d, height %d, offset %d, pitch %d\n",
+                x, y, rrb->base.Width, rrb->base.Height, (uint32_t) src_offset, rrb->pitch/rrb->cpp);
+        fprintf(stderr, "src size %d, dst size %d\n", rrb->bo->size, timg->mt->bo->size);
+    }
+
+    /* blit from src buffer to texture */
+    return r300_blit(r300, rrb->bo, src_offset, rrb->base.Format, rrb->pitch/rrb->cpp,
+                     rrb->base.Width, rrb->base.Height, x, y,
+                     timg->mt->bo, dst_offset, timg->base.TexFormat,
+                     timg->base.Width, timg->base.Width, timg->base.Height,
+                     dstx, dsty, width, height, 1);
+}
+
+/* glCopyTexImage2D hook: (re)define the destination image, then blit the
+ * requested framebuffer rectangle, after clipping, into it.  Bordered images
+ * and failed blits are handed to _mesa_meta_CopyTexImage2D unmodified. */
+static void
+r300CopyTexImage2D(GLcontext *ctx, GLenum target, GLint level,
+                   GLenum internalFormat,
+                   GLint x, GLint y, GLsizei width, GLsizei height,
+                   GLint border)
+{
+    struct gl_texture_unit *texUnit = _mesa_get_current_tex_unit(ctx);
+    struct gl_texture_object *texObj = _mesa_select_tex_object(ctx, texUnit, target);
+    struct gl_texture_image *texImage =
+        _mesa_select_tex_image(ctx, texObj, target, level);
+    int srcx = x, srcy = y, dstx = 0, dsty = 0;
+    /* Clip copies of the size so the fallback still sees the caller's values. */
+    GLsizei clipw = width, cliph = height;
+
+    if (border)
+        goto fail;
+
+    /* Setup or redefine the texture object, mipmap tree and texture
+     * image.  Don't populate yet.
+     */
+    ctx->Driver.TexImage2D(ctx, target, level, internalFormat,
+                           width, height, border,
+                           GL_RGBA, GL_UNSIGNED_BYTE, NULL,
+                           &ctx->DefaultPacking, texObj, texImage);
+
+    /* A false return from the clip helper means there is nothing to copy. */
+    if (!_mesa_clip_copytexsubimage(ctx, &dstx, &dsty, &srcx, &srcy,
+                                    &clipw, &cliph)) {
+        return;
+    }
+
+    /* Blit the clipped rectangle, not the raw request. */
+    if (!do_copy_texsubimage(ctx, target, level,
+                             radeon_tex_obj(texObj), (radeon_texture_image *)texImage,
+                             dstx, dsty, srcx, srcy, clipw, cliph)) {
+        goto fail;
+    }
+
+    return;
+
+fail:
+    _mesa_meta_CopyTexImage2D(ctx, target, level, internalFormat, x, y,
+                              width, height, border);
+}
+
+/* glCopyTexSubImage2D hook: try do_copy_texsubimage(), else use the meta path. */
+static void
+r300CopyTexSubImage2D(GLcontext *ctx, GLenum target, GLint level,
+                      GLint xoffset, GLint yoffset,
+                      GLint x, GLint y,
+                      GLsizei width, GLsizei height)
+{
+    struct gl_texture_unit *unit = _mesa_get_current_tex_unit(ctx);
+    struct gl_texture_object *obj = _mesa_select_tex_object(ctx, unit, target);
+    struct gl_texture_image *image = _mesa_select_tex_image(ctx, obj, target, level);
+
+    if (do_copy_texsubimage(ctx, target, level,
+                            radeon_tex_obj(obj), (radeon_texture_image *)image,
+                            xoffset, yoffset, x, y, width, height)) {
+        return;
+    }
+    /* The blit helper reported failure: let the generic meta code copy. */
+    _mesa_meta_CopyTexSubImage2D(ctx, target, level,
+                                 xoffset, yoffset, x, y, width, height);
+}
+
+/* Install this file's CopyTexImage2D / CopyTexSubImage2D driver hooks in table. */
+void r300_init_texcopy_functions(struct dd_function_table *table)
+{
+    table->CopyTexImage2D = r300CopyTexImage2D;
+    table->CopyTexSubImage2D = r300CopyTexSubImage2D;
+}
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r300/r300_texstate.c b/src/mesa/drivers/dri/r300/r300_texstate.c
index 44ca24daf8..78ff54574f 100644
--- a/src/mesa/drivers/dri/r300/r300_texstate.c
+++ b/src/mesa/drivers/dri/r300/r300_texstate.c
@@ -46,19 +46,10 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 #include "r300_context.h"
 #include "r300_state.h"
-#include "r300_ioctl.h"
 #include "radeon_mipmap_tree.h"
 #include "r300_tex.h"
 #include "r300_reg.h"
 
-#define VALID_FORMAT(f) ( ((f) <= MESA_FORMAT_RGBA_DXT5			\
-			   || ((f) >= MESA_FORMAT_RGBA_FLOAT32 &&	\
-			       (f) <= MESA_FORMAT_INTENSITY_FLOAT16))	\
-			  && tx_table[f].flag )
-
-#define _ASSIGN(entry, format)				\
-	[ MESA_FORMAT_ ## entry ] = { format, 0, 1}
-
 /*
  * Note that the _REV formats are the same as the non-REV formats.  This is
  * because the REV and non-REV formats are identical as a byte string, but
@@ -68,66 +59,119 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  * identically.  -- paulus
  */
 
-static const struct tx_table {
-	GLuint format, filter, flag;
-} tx_table[] = {
-	/* *INDENT-OFF* */
+int32_t r300TranslateTexFormat(gl_format mesaFormat)
+{
+	switch (mesaFormat)
+	{
 #ifdef MESA_LITTLE_ENDIAN
-	_ASSIGN(RGBA8888, R300_EASY_TX_FORMAT(Y, Z, W, X, W8Z8Y8X8)),
-	_ASSIGN(RGBA8888_REV, R300_EASY_TX_FORMAT(Z, Y, X, W, W8Z8Y8X8)),
-	_ASSIGN(ARGB8888, R300_EASY_TX_FORMAT(X, Y, Z, W, W8Z8Y8X8)),
-	_ASSIGN(ARGB8888_REV, R300_EASY_TX_FORMAT(W, Z, Y, X, W8Z8Y8X8)),
+		case MESA_FORMAT_RGBA8888:
+			return R300_EASY_TX_FORMAT(Y, Z, W, X, W8Z8Y8X8);
+		case MESA_FORMAT_RGBA8888_REV:
+			return R300_EASY_TX_FORMAT(Z, Y, X, W, W8Z8Y8X8);
+		case MESA_FORMAT_ARGB8888:
+			return R300_EASY_TX_FORMAT(X, Y, Z, W, W8Z8Y8X8);
+		case MESA_FORMAT_ARGB8888_REV:
+			return R300_EASY_TX_FORMAT(W, Z, Y, X, W8Z8Y8X8);
 #else
-	_ASSIGN(RGBA8888, R300_EASY_TX_FORMAT(Z, Y, X, W, W8Z8Y8X8)),
-	_ASSIGN(RGBA8888_REV, R300_EASY_TX_FORMAT(Y, Z, W, X, W8Z8Y8X8)),
-	_ASSIGN(ARGB8888, R300_EASY_TX_FORMAT(W, Z, Y, X, W8Z8Y8X8)),
-	_ASSIGN(ARGB8888_REV, R300_EASY_TX_FORMAT(X, Y, Z, W, W8Z8Y8X8)),
+		case MESA_FORMAT_RGBA8888:
+			return R300_EASY_TX_FORMAT(Z, Y, X, W, W8Z8Y8X8);
+		case MESA_FORMAT_RGBA8888_REV:
+			return R300_EASY_TX_FORMAT(Y, Z, W, X, W8Z8Y8X8);
+		case MESA_FORMAT_ARGB8888:
+			return R300_EASY_TX_FORMAT(W, Z, Y, X, W8Z8Y8X8);
+		case MESA_FORMAT_ARGB8888_REV:
+			return R300_EASY_TX_FORMAT(X, Y, Z, W, W8Z8Y8X8);
 #endif
-	_ASSIGN(RGB888, R300_EASY_TX_FORMAT(X, Y, Z, ONE, W8Z8Y8X8)),
-	_ASSIGN(RGB565, R300_EASY_TX_FORMAT(X, Y, Z, ONE, Z5Y6X5)),
-	_ASSIGN(RGB565_REV, R300_EASY_TX_FORMAT(X, Y, Z, ONE, Z5Y6X5)),
-	_ASSIGN(ARGB4444, R300_EASY_TX_FORMAT(X, Y, Z, W, W4Z4Y4X4)),
-	_ASSIGN(ARGB4444_REV, R300_EASY_TX_FORMAT(X, Y, Z, W, W4Z4Y4X4)),
-	_ASSIGN(ARGB1555, R300_EASY_TX_FORMAT(X, Y, Z, W, W1Z5Y5X5)),
-	_ASSIGN(ARGB1555_REV, R300_EASY_TX_FORMAT(X, Y, Z, W, W1Z5Y5X5)),
-	_ASSIGN(AL88, R300_EASY_TX_FORMAT(X, X, X, Y, Y8X8)),
-	_ASSIGN(AL88_REV, R300_EASY_TX_FORMAT(X, X, X, Y, Y8X8)),
-	_ASSIGN(RGB332, R300_EASY_TX_FORMAT(X, Y, Z, ONE, Z3Y3X2)),
-	_ASSIGN(A8, R300_EASY_TX_FORMAT(ZERO, ZERO, ZERO, X, X8)),
-	_ASSIGN(L8, R300_EASY_TX_FORMAT(X, X, X, ONE, X8)),
-	_ASSIGN(I8, R300_EASY_TX_FORMAT(X, X, X, X, X8)),
-	_ASSIGN(CI8, R300_EASY_TX_FORMAT(X, X, X, X, X8)),
-	_ASSIGN(YCBCR, R300_EASY_TX_FORMAT(X, Y, Z, ONE, G8R8_G8B8) | R300_TX_FORMAT_YUV_MODE),
-	_ASSIGN(YCBCR_REV, R300_EASY_TX_FORMAT(X, Y, Z, ONE, G8R8_G8B8) | R300_TX_FORMAT_YUV_MODE),
-	_ASSIGN(RGB_DXT1, R300_EASY_TX_FORMAT(X, Y, Z, ONE, DXT1)),
-	_ASSIGN(RGBA_DXT1, R300_EASY_TX_FORMAT(X, Y, Z, W, DXT1)),
-	_ASSIGN(RGBA_DXT3, R300_EASY_TX_FORMAT(X, Y, Z, W, DXT3)),
-	_ASSIGN(RGBA_DXT5, R300_EASY_TX_FORMAT(Y, Z, W, X, DXT5)),
-	_ASSIGN(RGBA_FLOAT32, R300_EASY_TX_FORMAT(Z, Y, X, W, FL_R32G32B32A32)),
-	_ASSIGN(RGBA_FLOAT16, R300_EASY_TX_FORMAT(Z, Y, X, W, FL_R16G16B16A16)),
-	_ASSIGN(RGB_FLOAT32, 0xffffffff),
-	_ASSIGN(RGB_FLOAT16, 0xffffffff),
-	_ASSIGN(ALPHA_FLOAT32, R300_EASY_TX_FORMAT(ZERO, ZERO, ZERO, X, FL_I32)),
-	_ASSIGN(ALPHA_FLOAT16, R300_EASY_TX_FORMAT(ZERO, ZERO, ZERO, X, FL_I16)),
-	_ASSIGN(LUMINANCE_FLOAT32, R300_EASY_TX_FORMAT(X, X, X, ONE, FL_I32)),
-	_ASSIGN(LUMINANCE_FLOAT16, R300_EASY_TX_FORMAT(X, X, X, ONE, FL_I16)),
-	_ASSIGN(LUMINANCE_ALPHA_FLOAT32, R300_EASY_TX_FORMAT(X, X, X, Y, FL_I32A32)),
-	_ASSIGN(LUMINANCE_ALPHA_FLOAT16, R300_EASY_TX_FORMAT(X, X, X, Y, FL_I16A16)),
-	_ASSIGN(INTENSITY_FLOAT32, R300_EASY_TX_FORMAT(X, X, X, X, FL_I32)),
-	_ASSIGN(INTENSITY_FLOAT16, R300_EASY_TX_FORMAT(X, X, X, X, FL_I16)),
-	_ASSIGN(Z16, R300_EASY_TX_FORMAT(X, X, X, X, X16)),
-	_ASSIGN(Z24_S8, R300_EASY_TX_FORMAT(X, X, X, X, X24_Y8)),
-	_ASSIGN(S8_Z24, R300_EASY_TX_FORMAT(Y, Y, Y, Y, X24_Y8)),
-	_ASSIGN(Z32, R300_EASY_TX_FORMAT(X, X, X, X, X32)),
-	/* EXT_texture_sRGB */
-	_ASSIGN(SRGBA8, R300_EASY_TX_FORMAT(Y, Z, W, X, W8Z8Y8X8) | R300_TX_FORMAT_GAMMA),
-	_ASSIGN(SLA8, R300_EASY_TX_FORMAT(X, X, X, Y, Y8X8) | R300_TX_FORMAT_GAMMA),
-	_ASSIGN(SL8, R300_EASY_TX_FORMAT(X, X, X, ONE, X8) | R300_TX_FORMAT_GAMMA),
-	/* *INDENT-ON* */
+		case MESA_FORMAT_XRGB8888:
+			return R300_EASY_TX_FORMAT(X, Y, Z, ONE, W8Z8Y8X8);
+		case MESA_FORMAT_RGB888:
+			return R300_EASY_TX_FORMAT(X, Y, Z, ONE, W8Z8Y8X8);
+		case MESA_FORMAT_RGB565:
+			return R300_EASY_TX_FORMAT(X, Y, Z, ONE, Z5Y6X5);
+		case MESA_FORMAT_RGB565_REV:
+			return R300_EASY_TX_FORMAT(X, Y, Z, ONE, Z5Y6X5);
+		case MESA_FORMAT_ARGB4444:
+			return R300_EASY_TX_FORMAT(X, Y, Z, W, W4Z4Y4X4);
+		case MESA_FORMAT_ARGB4444_REV:
+			return R300_EASY_TX_FORMAT(X, Y, Z, W, W4Z4Y4X4);
+		case MESA_FORMAT_ARGB1555:
+			return R300_EASY_TX_FORMAT(X, Y, Z, W, W1Z5Y5X5);
+		case MESA_FORMAT_ARGB1555_REV:
+			return R300_EASY_TX_FORMAT(X, Y, Z, W, W1Z5Y5X5);
+		case MESA_FORMAT_AL88:
+			return R300_EASY_TX_FORMAT(X, X, X, Y, Y8X8);
+		case MESA_FORMAT_AL88_REV:
+			return R300_EASY_TX_FORMAT(X, X, X, Y, Y8X8);
+		case MESA_FORMAT_RGB332:
+			return R300_EASY_TX_FORMAT(X, Y, Z, ONE, Z3Y3X2);
+		case MESA_FORMAT_A8:
+			return R300_EASY_TX_FORMAT(ZERO, ZERO, ZERO, X, X8);
+		case MESA_FORMAT_L8:
+			return R300_EASY_TX_FORMAT(X, X, X, ONE, X8);
+		case MESA_FORMAT_I8:
+			return R300_EASY_TX_FORMAT(X, X, X, X, X8);
+		case MESA_FORMAT_CI8:
+			return R300_EASY_TX_FORMAT(X, X, X, X, X8);
+		case MESA_FORMAT_YCBCR:
+			return R300_EASY_TX_FORMAT(X, Y, Z, ONE, G8R8_G8B8) | R300_TX_FORMAT_YUV_MODE;
+		case MESA_FORMAT_YCBCR_REV:
+			return R300_EASY_TX_FORMAT(X, Y, Z, ONE, G8R8_G8B8) | R300_TX_FORMAT_YUV_MODE;
+		case MESA_FORMAT_RGB_DXT1:
+			return R300_EASY_TX_FORMAT(X, Y, Z, ONE, DXT1);
+		case MESA_FORMAT_RGBA_DXT1:
+			return R300_EASY_TX_FORMAT(X, Y, Z, W, DXT1);
+		case MESA_FORMAT_RGBA_DXT3:
+			return R300_EASY_TX_FORMAT(X, Y, Z, W, DXT3);
+		case MESA_FORMAT_RGBA_DXT5:
+			return R300_EASY_TX_FORMAT(Y, Z, W, X, DXT5);
+		case MESA_FORMAT_RGBA_FLOAT32:
+			return R300_EASY_TX_FORMAT(Z, Y, X, W, FL_R32G32B32A32);
+		case MESA_FORMAT_RGBA_FLOAT16:
+			return R300_EASY_TX_FORMAT(Z, Y, X, W, FL_R16G16B16A16);
+		case MESA_FORMAT_ALPHA_FLOAT32:
+			return R300_EASY_TX_FORMAT(ZERO, ZERO, ZERO, X, FL_I32);
+		case MESA_FORMAT_ALPHA_FLOAT16:
+			return R300_EASY_TX_FORMAT(ZERO, ZERO, ZERO, X, FL_I16);
+		case MESA_FORMAT_LUMINANCE_FLOAT32:
+			return R300_EASY_TX_FORMAT(X, X, X, ONE, FL_I32);
+		case MESA_FORMAT_LUMINANCE_FLOAT16:
+			return R300_EASY_TX_FORMAT(X, X, X, ONE, FL_I16);
+		case MESA_FORMAT_LUMINANCE_ALPHA_FLOAT32:
+			return R300_EASY_TX_FORMAT(X, X, X, Y, FL_I32A32);
+		case MESA_FORMAT_LUMINANCE_ALPHA_FLOAT16:
+			return R300_EASY_TX_FORMAT(X, X, X, Y, FL_I16A16);
+		case MESA_FORMAT_INTENSITY_FLOAT32:
+			return R300_EASY_TX_FORMAT(X, X, X, X, FL_I32);
+		case MESA_FORMAT_INTENSITY_FLOAT16:
+			return R300_EASY_TX_FORMAT(X, X, X, X, FL_I16);
+		case MESA_FORMAT_Z16:
+			return R300_EASY_TX_FORMAT(X, X, X, X, X16);
+		case MESA_FORMAT_Z24_S8:
+			return R300_EASY_TX_FORMAT(X, X, X, X, X24_Y8);
+		case MESA_FORMAT_S8_Z24:
+			return R300_EASY_TX_FORMAT(Y, Y, Y, Y, X24_Y8);
+		case MESA_FORMAT_Z32:
+			return R300_EASY_TX_FORMAT(X, X, X, X, X32);
+		/* EXT_texture_sRGB */
+		case MESA_FORMAT_SRGBA8:
+			return R300_EASY_TX_FORMAT(Y, Z, W, X, W8Z8Y8X8) | R300_TX_FORMAT_GAMMA;
+		case MESA_FORMAT_SLA8:
+			return R300_EASY_TX_FORMAT(X, X, X, Y, Y8X8) | R300_TX_FORMAT_GAMMA;
+		case MESA_FORMAT_SL8:
+			return R300_EASY_TX_FORMAT(X, X, X, ONE, X8) | R300_TX_FORMAT_GAMMA;
+		case MESA_FORMAT_SRGB_DXT1:
+			return R300_EASY_TX_FORMAT(X, Y, Z, ONE, DXT1) | R300_TX_FORMAT_GAMMA;
+		case MESA_FORMAT_SRGBA_DXT1:
+			return R300_EASY_TX_FORMAT(X, Y, Z, W, DXT1) | R300_TX_FORMAT_GAMMA;
+		case MESA_FORMAT_SRGBA_DXT3:
+			return R300_EASY_TX_FORMAT(X, Y, Z, W, DXT3) | R300_TX_FORMAT_GAMMA;
+		case MESA_FORMAT_SRGBA_DXT5:
+			return R300_EASY_TX_FORMAT(Y, Z, W, X, DXT5) | R300_TX_FORMAT_GAMMA;
+		default:
+			return -1;
+	}
 };
 
-#undef _ASSIGN
-
 void r300SetDepthTexMode(struct gl_texture_object *tObj)
 {
 	static const GLuint formats[3][3] = {
@@ -137,9 +181,9 @@ void r300SetDepthTexMode(struct gl_texture_object *tObj)
 			R300_EASY_TX_FORMAT(ZERO, ZERO, ZERO, X, X16),
 		},
 		{
-			R300_EASY_TX_FORMAT(X, X, X, ONE, X24_Y8),
-			R300_EASY_TX_FORMAT(X, X, X, X, X24_Y8),
-			R300_EASY_TX_FORMAT(ZERO, ZERO, ZERO, X, X24_Y8),
+			R300_EASY_TX_FORMAT(Y, Y, Y, ONE, X24_Y8),
+			R300_EASY_TX_FORMAT(Y, Y, Y, Y, X24_Y8),
+			R300_EASY_TX_FORMAT(ZERO, ZERO, ZERO, Y, X24_Y8),
 		},
 		{
 			R300_EASY_TX_FORMAT(X, X, X, ONE, X32),
@@ -202,23 +246,20 @@ void r300SetDepthTexMode(struct gl_texture_object *tObj)
 static void setup_hardware_state(r300ContextPtr rmesa, radeonTexObj *t)
 {
 	const struct gl_texture_image *firstImage;
-	int firstlevel = t->mt ? t->mt->firstLevel : 0;
-	    
-	firstImage = t->base.Image[0][firstlevel];
+	firstImage = t->base.Image[0][t->minLod];
 
-	if (!t->image_override
-	    && VALID_FORMAT(firstImage->TexFormat)) {
+	if (!t->image_override) {
 		if (firstImage->_BaseFormat == GL_DEPTH_COMPONENT) {
 			r300SetDepthTexMode(&t->base);
 		} else {
-			t->pp_txformat = tx_table[firstImage->TexFormat].format;
+			int32_t txformat = r300TranslateTexFormat(firstImage->TexFormat);
+			if (txformat < 0) {
+				_mesa_problem(rmesa->radeon.glCtx, "%s: Invalid format %s",
+							  __FUNCTION__, _mesa_get_format_name(firstImage->TexFormat));
+				exit(1);
+			}
+			t->pp_txformat = (uint32_t) txformat;
 		}
-
-		t->pp_txfilter |= tx_table[firstImage->TexFormat].filter;
-	} else if (!t->image_override) {
-		_mesa_problem(NULL, "unexpected texture format in %s",
-			      __FUNCTION__);
-		return;
 	}
 
 	if (t->image_override && t->bo)
@@ -227,7 +268,7 @@ static void setup_hardware_state(r300ContextPtr rmesa, radeonTexObj *t)
 	t->pp_txsize = (((R300_TX_WIDTHMASK_MASK & ((firstImage->Width - 1) << R300_TX_WIDTHMASK_SHIFT)))
 			| ((R300_TX_HEIGHTMASK_MASK & ((firstImage->Height - 1) << R300_TX_HEIGHTMASK_SHIFT)))
 			| ((R300_TX_DEPTHMASK_MASK & ((firstImage->DepthLog2) << R300_TX_DEPTHMASK_SHIFT)))
-			| ((R300_TX_MAX_MIP_LEVEL_MASK & ((t->mt->lastLevel - t->mt->firstLevel) << R300_TX_MAX_MIP_LEVEL_SHIFT))));
+			| ((R300_TX_MAX_MIP_LEVEL_MASK & ((t->maxLod - t->minLod) << R300_TX_MAX_MIP_LEVEL_SHIFT))));
 
 	t->tile_bits = 0;
 
@@ -238,7 +279,7 @@ static void setup_hardware_state(r300ContextPtr rmesa, radeonTexObj *t)
 
 
 	if (t->base.Target == GL_TEXTURE_RECTANGLE_NV) {
-		unsigned int align = (64 / t->mt->bpp) - 1;
+		unsigned int align = (64 / _mesa_get_format_bytes(firstImage->TexFormat)) - 1;
 		t->pp_txsize |= R300_TX_SIZE_TXPITCH_EN;
 		if (!t->image_override)
 			t->pp_txpitch = ((firstImage->Width + align) & ~align) - 1;
@@ -358,18 +399,15 @@ void r300SetTexOffset(__DRIcontext * pDRICtx, GLint texname,
 	switch (depth) {
 	case 32:
 		t->pp_txformat = R300_EASY_TX_FORMAT(X, Y, Z, W, W8Z8Y8X8);
-		t->pp_txfilter |= tx_table[2].filter;
 		pitch_val /= 4;
 		break;
 	case 24:
 	default:
 		t->pp_txformat = R300_EASY_TX_FORMAT(X, Y, Z, ONE, W8Z8Y8X8);
-		t->pp_txfilter |= tx_table[4].filter;
 		pitch_val /= 4;
 		break;
 	case 16:
 		t->pp_txformat = R300_EASY_TX_FORMAT(X, Y, Z, ONE, Z5Y6X5);
-		t->pp_txfilter |= tx_table[5].filter;
 		pitch_val /= 2;
 		break;
 	}
@@ -410,18 +448,7 @@ void r300SetTexBuffer2(__DRIcontext *pDRICtx, GLint target, GLint glx_texture_fo
     	    return;
     	}
 
-	radeon_update_renderbuffers(pDRICtx, dPriv);
-	/* back & depth buffer are useless free them right away */
-	rb = (void*)rfb->base.Attachment[BUFFER_DEPTH].Renderbuffer;
-	if (rb && rb->bo) {
-		radeon_bo_unref(rb->bo);
-        rb->bo = NULL;
-	}
-	rb = (void*)rfb->base.Attachment[BUFFER_BACK_LEFT].Renderbuffer;
-	if (rb && rb->bo) {
-		radeon_bo_unref(rb->bo);
-		rb->bo = NULL;
-	}
+	radeon_update_renderbuffers(pDRICtx, dPriv, GL_TRUE);
 	rb = rfb->color_rb[0];
 	if (rb->bo == NULL) {
 		/* Failed to BO for the buffer */
@@ -437,14 +464,10 @@ void r300SetTexBuffer2(__DRIcontext *pDRICtx, GLint target, GLint glx_texture_fo
 		radeon_bo_unref(rImage->bo);
 		rImage->bo = NULL;
 	}
-	if (t->mt) {
-		radeon_miptree_unreference(t->mt);
-		t->mt = NULL;
-	}
-	if (rImage->mt) {
-		radeon_miptree_unreference(rImage->mt);
-		rImage->mt = NULL;
-	}
+
+	radeon_miptree_unreference(&t->mt);
+	radeon_miptree_unreference(&rImage->mt);
+
 	_mesa_init_teximage_fields(radeon->glCtx, target, texImage,
 				   rb->base.Width, rb->base.Height, 1, 0, rb->cpp);
 	texImage->RowStride = rb->pitch / rb->cpp;
@@ -463,18 +486,15 @@ void r300SetTexBuffer2(__DRIcontext *pDRICtx, GLint target, GLint glx_texture_fo
 			t->pp_txformat = R300_EASY_TX_FORMAT(X, Y, Z, ONE, W8Z8Y8X8);
 		else
 			t->pp_txformat = R300_EASY_TX_FORMAT(X, Y, Z, W, W8Z8Y8X8);
-		t->pp_txfilter |= tx_table[2].filter;
 		pitch_val /= 4;
 		break;
 	case 3:
 	default:
 		t->pp_txformat = R300_EASY_TX_FORMAT(X, Y, Z, ONE, W8Z8Y8X8);
-		t->pp_txfilter |= tx_table[4].filter;
 		pitch_val /= 4;
 		break;
 	case 2:
 		t->pp_txformat = R300_EASY_TX_FORMAT(X, Y, Z, ONE, Z5Y6X5);
-		t->pp_txfilter |= tx_table[5].filter;
 		pitch_val /= 2;
 		break;
 	}
diff --git a/src/mesa/drivers/dri/r300/r300_vertprog.c b/src/mesa/drivers/dri/r300/r300_vertprog.c
index c2f96af2c1..aa98a049aa 100644
--- a/src/mesa/drivers/dri/r300/r300_vertprog.c
+++ b/src/mesa/drivers/dri/r300/r300_vertprog.c
@@ -365,7 +365,7 @@ static void r300EmitVertexProgram(r300ContextPtr r300, int dest, struct r300_ver
 			break;
 		default:
 			fprintf(stderr, "%s:%s don't know how to handle dest %04x\n", __FILE__, __FUNCTION__, dest);
-			_mesa_exit(-1);
+			exit(-1);
 	}
 }
 
diff --git a/src/mesa/drivers/dri/r300/radeon_bo.c b/src/mesa/drivers/dri/r300/radeon_bo.c
new file mode 120000
index 0000000000..9448ffee54
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/radeon_bo.c
@@ -0,0 +1 @@
+../radeon/radeon_bo.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r300/radeon_bo_int_drm.h b/src/mesa/drivers/dri/r300/radeon_bo_int_drm.h
new file mode 120000
index 0000000000..029450928b
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/radeon_bo_int_drm.h
@@ -0,0 +1 @@
+../radeon/radeon_bo_int_drm.h
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r300/radeon_cs.c b/src/mesa/drivers/dri/r300/radeon_cs.c
new file mode 120000
index 0000000000..66b7ad1eb0
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/radeon_cs.c
@@ -0,0 +1 @@
+../radeon/radeon_cs.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r300/radeon_cs_int_drm.h b/src/mesa/drivers/dri/r300/radeon_cs_int_drm.h
new file mode 120000
index 0000000000..462f5245d0
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/radeon_cs_int_drm.h
@@ -0,0 +1 @@
+../radeon/radeon_cs_int_drm.h
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r600/Makefile b/src/mesa/drivers/dri/r600/Makefile
index 9b7c42042e..26f47b7268 100644
--- a/src/mesa/drivers/dri/r600/Makefile
+++ b/src/mesa/drivers/dri/r600/Makefile
@@ -14,7 +14,7 @@ EGL_SOURCES = server/radeon_egl.c
 endif
 
 ifeq ($(RADEON_LDFLAGS),)
-CS_SOURCES = radeon_cs_space_drm.c
+CS_SOURCES = radeon_cs_space_drm.c radeon_bo.c radeon_cs.c
 endif
 
 COMMON_SOURCES = \
diff --git a/src/mesa/drivers/dri/r600/r600_cmdbuf.c b/src/mesa/drivers/dri/r600/r600_cmdbuf.c
index d27a3245a3..370bb04f93 100644
--- a/src/mesa/drivers/dri/r600/r600_cmdbuf.c
+++ b/src/mesa/drivers/dri/r600/r600_cmdbuf.c
@@ -52,29 +52,49 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "radeon_mipmap_tree.h"
 #include "radeon_reg.h"
 
+#ifdef HAVE_LIBDRM_RADEON
+#include "radeon_cs_int.h"
+#else
+#include "radeon_cs_int_drm.h"
+#endif
 
+struct r600_cs_manager_legacy
+{
+    struct radeon_cs_manager    base;
+    struct radeon_context       *ctx;
+    /* hack for scratch stuff */
+    uint32_t                    pending_age;
+    uint32_t                    pending_count;
+};
+
+struct r600_cs_reloc_legacy {
+    struct radeon_cs_reloc  base;
+    uint32_t                cindices;
+    uint32_t                *indices;
+    uint32_t                *reloc_indices;
+};
 
-static struct radeon_cs * r600_cs_create(struct radeon_cs_manager *csm,
-                                   uint32_t ndw)
+static struct radeon_cs_int *r600_cs_create(struct radeon_cs_manager *csm,
+					    uint32_t ndw)
 {
-    struct radeon_cs *cs;
+    struct radeon_cs_int *csi;
 
-    cs = (struct radeon_cs*)calloc(1, sizeof(struct radeon_cs));
-    if (cs == NULL) {
+    csi = (struct radeon_cs_int*)calloc(1, sizeof(struct radeon_cs_int));
+    if (csi == NULL) {
         return NULL;
     }
-    cs->csm = csm;
-    cs->ndw = (ndw + 0x3FF) & (~0x3FF);
-    cs->packets = (uint32_t*)malloc(4*cs->ndw);
-    if (cs->packets == NULL) {
-        free(cs);
+    csi->csm = csm;
+    csi->ndw = (ndw + 0x3FF) & (~0x3FF);
+    csi->packets = (uint32_t*)malloc(4*csi->ndw);
+    if (csi->packets == NULL) {
+        free(csi);
         return NULL;
     }
-    cs->relocs_total_size = 0;
-    return cs;
+    csi->relocs_total_size = 0;
+    return csi;
 }
 
-static int r600_cs_write_reloc(struct radeon_cs *cs,
+static int r600_cs_write_reloc(struct radeon_cs_int *csi,
 			       struct radeon_bo *bo,
 			       uint32_t read_domain,
 			       uint32_t write_domain,
@@ -83,7 +103,7 @@ static int r600_cs_write_reloc(struct radeon_cs *cs,
     struct r600_cs_reloc_legacy *relocs;
     int i;
 
-    relocs = (struct r600_cs_reloc_legacy *)cs->relocs;
+    relocs = (struct r600_cs_reloc_legacy *)csi->relocs;
     /* check domains */
     if ((read_domain && write_domain) || (!read_domain && !write_domain)) {
         /* in one CS a bo can only be in read or write domain but not
@@ -98,7 +118,7 @@ static int r600_cs_write_reloc(struct radeon_cs *cs,
         return -EINVAL;
     }
     /* check if bo is already referenced */
-    for(i = 0; i < cs->crelocs; i++) {
+    for(i = 0; i < csi->crelocs; i++) {
         uint32_t *indices;
         uint32_t *reloc_indices;
 
@@ -129,109 +149,108 @@ static int r600_cs_write_reloc(struct radeon_cs *cs,
             }
             relocs[i].indices = indices;
             relocs[i].reloc_indices = reloc_indices;
-            relocs[i].indices[relocs[i].cindices - 1] = cs->cdw;
-            relocs[i].reloc_indices[relocs[i].cindices - 1] = cs->cdw;
-            cs->section_cdw += 2;
-	    cs->cdw += 2;
+            relocs[i].indices[relocs[i].cindices - 1] = csi->cdw;
+            relocs[i].reloc_indices[relocs[i].cindices - 1] = csi->cdw;
+            csi->section_cdw += 2;
+	    csi->cdw += 2;
 
             return 0;
         }
     }
     /* add bo to reloc */
     relocs = (struct r600_cs_reloc_legacy*)
-             realloc(cs->relocs,
-                     sizeof(struct r600_cs_reloc_legacy) * (cs->crelocs + 1));
+             realloc(csi->relocs,
+                     sizeof(struct r600_cs_reloc_legacy) * (csi->crelocs + 1));
     if (relocs == NULL) {
         return -ENOMEM;
     }
-    cs->relocs = relocs;
-    relocs[cs->crelocs].base.bo = bo;
-    relocs[cs->crelocs].base.read_domain = read_domain;
-    relocs[cs->crelocs].base.write_domain = write_domain;
-    relocs[cs->crelocs].base.flags = flags;
-    relocs[cs->crelocs].indices = (uint32_t*)malloc(4);
-    relocs[cs->crelocs].reloc_indices = (uint32_t*)malloc(4);
-    if ( (relocs[cs->crelocs].indices == NULL) || (relocs[cs->crelocs].reloc_indices == NULL) )
+    csi->relocs = relocs;
+    relocs[csi->crelocs].base.bo = bo;
+    relocs[csi->crelocs].base.read_domain = read_domain;
+    relocs[csi->crelocs].base.write_domain = write_domain;
+    relocs[csi->crelocs].base.flags = flags;
+    relocs[csi->crelocs].indices = (uint32_t*)malloc(4);
+    relocs[csi->crelocs].reloc_indices = (uint32_t*)malloc(4);
+    if ( (relocs[csi->crelocs].indices == NULL) || (relocs[csi->crelocs].reloc_indices == NULL) )
     {
         return -ENOMEM;
     }
 
-    relocs[cs->crelocs].indices[0] = cs->cdw;
-    relocs[cs->crelocs].reloc_indices[0] = cs->cdw;
-    cs->section_cdw += 2;
-    cs->cdw += 2;
-    relocs[cs->crelocs].cindices = 1;
-    cs->relocs_total_size += radeon_bo_legacy_relocs_size(bo);
-    cs->crelocs++;
+    relocs[csi->crelocs].indices[0] = csi->cdw;
+    relocs[csi->crelocs].reloc_indices[0] = csi->cdw;
+    csi->section_cdw += 2;
+    csi->cdw += 2;
+    relocs[csi->crelocs].cindices = 1;
+    csi->relocs_total_size += radeon_bo_legacy_relocs_size(bo);
+    csi->crelocs++;
 
     radeon_bo_ref(bo);
 
     return 0;
 }
 
-static int r600_cs_begin(struct radeon_cs *cs,
+static int r600_cs_begin(struct radeon_cs_int *csi,
                     uint32_t ndw,
                     const char *file,
                     const char *func,
                     int line)
 {
-    if (cs->section) {
+    if (csi->section_ndw) {
         fprintf(stderr, "CS already in a section(%s,%s,%d)\n",
-                cs->section_file, cs->section_func, cs->section_line);
+                csi->section_file, csi->section_func, csi->section_line);
         fprintf(stderr, "CS can't start section(%s,%s,%d)\n",
                 file, func, line);
         return -EPIPE;
     }
 
-    cs->section = 1;
-    cs->section_ndw = ndw;
-    cs->section_cdw = 0;
-    cs->section_file = file;
-    cs->section_func = func;
-    cs->section_line = line;
+    csi->section_ndw = ndw;
+    csi->section_cdw = 0;
+    csi->section_file = file;
+    csi->section_func = func;
+    csi->section_line = line;
 
-    if (cs->cdw + ndw > cs->ndw) {
+    if (csi->cdw + ndw > csi->ndw) {
         uint32_t tmp, *ptr;
 	int num = (ndw > 0x400) ? ndw : 0x400;
 
-        tmp = (cs->cdw + num + 0x3FF) & (~0x3FF);
-        ptr = (uint32_t*)realloc(cs->packets, 4 * tmp);
+        tmp = (csi->cdw + num + 0x3FF) & (~0x3FF);
+        ptr = (uint32_t*)realloc(csi->packets, 4 * tmp);
         if (ptr == NULL) {
             return -ENOMEM;
         }
-        cs->packets = ptr;
-        cs->ndw = tmp;
+        csi->packets = ptr;
+        csi->ndw = tmp;
     }
 
     return 0;
 }
 
-static int r600_cs_end(struct radeon_cs *cs,
+static int r600_cs_end(struct radeon_cs_int *csi,
                   const char *file,
                   const char *func,
                   int line)
 
 {
-    if (!cs->section) {
+    if (!csi->section_ndw) {
         fprintf(stderr, "CS no section to end at (%s,%s,%d)\n",
                 file, func, line);
         return -EPIPE;
     }
-    cs->section = 0;
 
-    if ( cs->section_ndw != cs->section_cdw ) {
+    if ( csi->section_ndw != csi->section_cdw ) {
         fprintf(stderr, "CS section size missmatch start at (%s,%s,%d) %d vs %d\n",
-                cs->section_file, cs->section_func, cs->section_line, cs->section_ndw, cs->section_cdw);
-        fprintf(stderr, "cs->section_ndw = %d, cs->cdw = %d, cs->section_cdw = %d \n",
-                cs->section_ndw, cs->cdw, cs->section_cdw);
+                csi->section_file, csi->section_func, csi->section_line, csi->section_ndw, csi->section_cdw);
+        fprintf(stderr, "csi->section_ndw = %d, csi->cdw = %d, csi->section_cdw = %d \n",
+                csi->section_ndw, csi->cdw, csi->section_cdw);
         fprintf(stderr, "CS section end at (%s,%s,%d)\n",
                 file, func, line);
         return -EPIPE;
     }
+    csi->section_ndw = 0; /* NOTE(review): the -EPIPE size-mismatch return above skips this reset (the old code cleared cs->section before that check), so the section stays open and r600_cs_begin keeps failing -- confirm intended */
 
-    if (cs->cdw > cs->ndw) {
+    if (csi->cdw > csi->ndw) {
 	    fprintf(stderr, "CS section overflow at (%s,%s,%d) cdw %d ndw %d\n",
-		    cs->section_file, cs->section_func, cs->section_line,cs->cdw,cs->ndw);
+		    csi->section_file, csi->section_func, csi->section_line,csi->cdw,csi->ndw);
 	    fprintf(stderr, "CS section end at (%s,%s,%d)\n",
 		    file, func, line);
 	    assert(0);
@@ -240,20 +259,20 @@ static int r600_cs_end(struct radeon_cs *cs,
     return 0;
 }
 
-static int r600_cs_process_relocs(struct radeon_cs *cs, 
+static int r600_cs_process_relocs(struct radeon_cs_int *csi, 
                                   uint32_t * reloc_chunk,
                                   uint32_t * length_dw_reloc_chunk) 
 {
-    struct r600_cs_manager_legacy *csm = (struct r600_cs_manager_legacy*)cs->csm;
+    struct r600_cs_manager_legacy *csm = (struct r600_cs_manager_legacy*)csi->csm;
     struct r600_cs_reloc_legacy *relocs;
     int i, j, r;
 
     uint32_t offset_dw = 0;
 
-    csm = (struct r600_cs_manager_legacy*)cs->csm;
-    relocs = (struct r600_cs_reloc_legacy *)cs->relocs;
+    csm = (struct r600_cs_manager_legacy*)csi->csm;
+    relocs = (struct r600_cs_reloc_legacy *)csi->relocs;
 restart:
-    for (i = 0; i < cs->crelocs; i++) {
+    for (i = 0; i < csi->crelocs; i++) {
             uint32_t soffset, eoffset;
 
             r = radeon_bo_legacy_validate(relocs[i].base.bo,
@@ -269,9 +288,9 @@ restart:
 
 	    for (j = 0; j < relocs[i].cindices; j++) {
 		    /* pkt3 nop header in ib chunk */
-		    cs->packets[relocs[i].reloc_indices[j]] = 0xC0001000;
+		    csi->packets[relocs[i].reloc_indices[j]] = 0xC0001000;
 		    /* reloc index in ib chunk */
-		    cs->packets[relocs[i].reloc_indices[j] + 1] = offset_dw;
+		    csi->packets[relocs[i].reloc_indices[j] + 1] = offset_dw;
 	    }
 
 	    /* asic offset in reloc chunk */ /* see alex drm r600_nomm_relocate */
@@ -286,14 +305,14 @@ restart:
     return 0;
 }
 
-static int r600_cs_set_age(struct radeon_cs *cs) /* -------------- */
+static int r600_cs_set_age(struct radeon_cs_int *csi) /* -------------- */
 {
-    struct r600_cs_manager_legacy *csm = (struct r600_cs_manager_legacy*)cs->csm;
+    struct r600_cs_manager_legacy *csm = (struct r600_cs_manager_legacy*)csi->csm;
     struct r600_cs_reloc_legacy *relocs;
     int i;
 
-    relocs = (struct r600_cs_reloc_legacy *)cs->relocs;
-    for (i = 0; i < cs->crelocs; i++) {
+    relocs = (struct r600_cs_reloc_legacy *)csi->relocs;
+    for (i = 0; i < csi->crelocs; i++) {
         radeon_bo_legacy_pending(relocs[i].base.bo, csm->pending_age);
         radeon_bo_unref(relocs[i].base.bo);
     }
@@ -301,21 +320,21 @@ static int r600_cs_set_age(struct radeon_cs *cs) /* -------------- */
 }
 
 #if 0
-static void dump_cmdbuf(struct radeon_cs *cs)
+static void dump_cmdbuf(struct radeon_cs_int *csi)
 {
 	int i;
 	fprintf(stderr,"--start--\n");
-	for (i = 0; i < cs->cdw; i++){
-		fprintf(stderr,"0x%08x\n", cs->packets[i]);
+	for (i = 0; i < csi->cdw; i++){
+		fprintf(stderr,"0x%08x\n", csi->packets[i]);
 	}
 	fprintf(stderr,"--end--\n");
 
 }
 #endif
 
-static int r600_cs_emit(struct radeon_cs *cs)
+static int r600_cs_emit(struct radeon_cs_int *csi)
 {
-    struct r600_cs_manager_legacy *csm = (struct r600_cs_manager_legacy*)cs->csm;
+    struct r600_cs_manager_legacy *csm = (struct r600_cs_manager_legacy*)csi->csm;
     struct drm_radeon_cs       cs_cmd;
     struct drm_radeon_cs_chunk cs_chunk[2];
     uint32_t length_dw_reloc_chunk;
@@ -329,9 +348,9 @@ static int r600_cs_emit(struct radeon_cs *cs)
 
     csm->pending_count = 1;
 
-    reloc_chunk = (uint32_t*)calloc(1, cs->crelocs * 4 * 4);
+    reloc_chunk = (uint32_t*)calloc(1, csi->crelocs * 4 * 4);
 
-    r = r600_cs_process_relocs(cs, reloc_chunk, &length_dw_reloc_chunk);
+    r = r600_cs_process_relocs(csi, reloc_chunk, &length_dw_reloc_chunk);
     if (r) {
 	free(reloc_chunk);
         return 0;
@@ -339,8 +358,8 @@ static int r600_cs_emit(struct radeon_cs *cs)
 
     /* raw ib chunk */
     cs_chunk[0].chunk_id   = RADEON_CHUNK_ID_IB;
-    cs_chunk[0].length_dw  = cs->cdw;
-    cs_chunk[0].chunk_data = (unsigned long)(cs->packets);
+    cs_chunk[0].length_dw  = csi->cdw;
+    cs_chunk[0].chunk_data = (unsigned long)(csi->packets);
 
     /* reloc chaunk */
     cs_chunk[1].chunk_id   = RADEON_CHUNK_ID_RELOCS;
@@ -358,7 +377,7 @@ static int r600_cs_emit(struct radeon_cs *cs)
 
     do 
     {
-        r = drmCommandWriteRead(cs->csm->fd, DRM_RADEON_CS, &cs_cmd, sizeof(cs_cmd));
+        r = drmCommandWriteRead(csi->csm->fd, DRM_RADEON_CS, &cs_cmd, sizeof(cs_cmd));
         retry++;
     } while (r == -EAGAIN && retry < 1000);
 
@@ -369,11 +388,11 @@ static int r600_cs_emit(struct radeon_cs *cs)
 
     csm->pending_age = cs_cmd.cs_id;
 
-    r600_cs_set_age(cs);
+    r600_cs_set_age(csi);
 
-    cs->csm->read_used = 0;
-    cs->csm->vram_write_used = 0;
-    cs->csm->gart_write_used = 0;
+    csi->csm->read_used = 0;
+    csi->csm->vram_write_used = 0;
+    csi->csm->gart_write_used = 0;
 
     free(reloc_chunk);
 
@@ -393,35 +412,37 @@ static void inline r600_cs_free_reloc(void *relocs_p, int crelocs)
     }
 }
 
-static int r600_cs_destroy(struct radeon_cs *cs)
+static int r600_cs_destroy(struct radeon_cs_int *csi)
 {
-    r600_cs_free_reloc(cs->relocs, cs->crelocs);
-    free(cs->relocs);
-    free(cs->packets);
-    free(cs);
+    r600_cs_free_reloc(csi->relocs, csi->crelocs);
+    free(csi->relocs);
+    free(csi->packets);
+    free(csi);
     return 0;
 }
 
-static int r600_cs_erase(struct radeon_cs *cs)
+static int r600_cs_erase(struct radeon_cs_int *csi)
 {
-    r600_cs_free_reloc(cs->relocs, cs->crelocs);
-    free(cs->relocs);
-    cs->relocs_total_size = 0;
-    cs->relocs = NULL;
-    cs->crelocs = 0;
-    cs->cdw = 0;
-    cs->section = 0;
+    r600_cs_free_reloc(csi->relocs, csi->crelocs);
+    free(csi->relocs);
+    csi->relocs_total_size = 0;
+    csi->relocs = NULL;
+    csi->crelocs = 0;
+    csi->cdw = 0;
+    /* section_ndw doubles as the "in section" flag (see r600_cs_begin), so
+     * clear it here just as the old code cleared cs->section. */
+    csi->section_ndw = 0;
     return 0;
 }
 
-static int r600_cs_need_flush(struct radeon_cs *cs)
+static int r600_cs_need_flush(struct radeon_cs_int *csi)
 {
     /* this function used to flush when the BO usage got to
      * a certain size, now the higher levels handle this better */
     return 0;
 }
 
-static void r600_cs_print(struct radeon_cs *cs, FILE *file)
+static void r600_cs_print(struct radeon_cs_int *csi, FILE *file)
 {
 }
 
diff --git a/src/mesa/drivers/dri/r600/r600_cmdbuf.h b/src/mesa/drivers/dri/r600/r600_cmdbuf.h
index eba43d37b6..dff0009699 100644
--- a/src/mesa/drivers/dri/r600/r600_cmdbuf.h
+++ b/src/mesa/drivers/dri/r600/r600_cmdbuf.h
@@ -118,22 +118,6 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define R600_IT_SET_CTL_CONST                     0x00006F00
 #define R600_IT_SURFACE_BASE_UPDATE               0x00007300
 
-struct r600_cs_manager_legacy
-{
-    struct radeon_cs_manager    base;
-    struct radeon_context       *ctx;
-    /* hack for scratch stuff */
-    uint32_t                    pending_age;
-    uint32_t                    pending_count;
-};
-
-struct r600_cs_reloc_legacy {
-    struct radeon_cs_reloc  base;
-    uint32_t                cindices;
-    uint32_t                *indices;
-    uint32_t                *reloc_indices;
-};
-
 struct radeon_cs_manager * r600_radeon_cs_manager_legacy_ctor(struct radeon_context *ctx);
 
 /**
diff --git a/src/mesa/drivers/dri/r600/r600_context.c b/src/mesa/drivers/dri/r600/r600_context.c
index dbd233729c..cb549497f5 100644
--- a/src/mesa/drivers/dri/r600/r600_context.c
+++ b/src/mesa/drivers/dri/r600/r600_context.c
@@ -74,6 +74,8 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "utils.h"
 #include "xmlpool.h"		/* for symbolic values of enum-type options */
 
+#define R600_ENABLE_GLSL_TEST 1
+
 #define need_GL_VERSION_2_0
 #define need_GL_ARB_occlusion_query
 #define need_GL_ARB_point_parameters
@@ -97,6 +99,7 @@ static const struct dri_extension card_extensions[] = {
   {"GL_ARB_depth_clamp",                NULL},
   {"GL_ARB_depth_texture",		NULL},
   {"GL_ARB_fragment_program",		NULL},
+  {"GL_ARB_fragment_program_shadow",	NULL},
   {"GL_ARB_occlusion_query",            GL_ARB_occlusion_query_functions},
   {"GL_ARB_multitexture",		NULL},
   {"GL_ARB_point_parameters",		GL_ARB_point_parameters_functions},
@@ -109,6 +112,7 @@ static const struct dri_extension card_extensions[] = {
   {"GL_ARB_texture_env_crossbar",	NULL},
   {"GL_ARB_texture_env_dot3",		NULL},
   {"GL_ARB_texture_mirrored_repeat",	NULL},
+  {"GL_ARB_texture_non_power_of_two",   NULL},
   {"GL_ARB_vertex_program",		GL_ARB_vertex_program_functions},
   {"GL_EXT_blend_equation_separate",	GL_EXT_blend_equation_separate_functions},
   {"GL_EXT_blend_func_separate",	GL_EXT_blend_func_separate_functions},
@@ -155,7 +159,12 @@ static const struct dri_extension mm_extensions[] = {
  * functions added by GL_ATI_separate_stencil.
  */
 static const struct dri_extension gl_20_extension[] = {
+#ifdef R600_ENABLE_GLSL_TEST
+    {"GL_ARB_shading_language_100",			GL_VERSION_2_0_functions },
+#else
   {"GL_VERSION_2_0",			GL_VERSION_2_0_functions },
+#endif /* R600_ENABLE_GLSL_TEST */
+  {NULL, NULL}
 };
 
 static const struct tnl_pipeline_stage *r600_pipeline[] = {
@@ -308,6 +317,14 @@ static void r600InitGLExtensions(GLcontext *ctx)
 	if (r600->radeon.radeonScreen->kernel_mm)
 	  driInitExtensions(ctx, mm_extensions, GL_FALSE);
 
+#ifdef R600_ENABLE_GLSL_TEST
+    driInitExtensions(ctx, gl_20_extension, GL_TRUE);
+    _mesa_enable_2_0_extensions(ctx);
+    
+    /* glsl compiler has problem if this is not GL_TRUE */
+    ctx->Shader.EmitCondCodes = GL_TRUE;
+#endif /* R600_ENABLE_GLSL_TEST */
+
 	if (driQueryOptionb
 	    (&r600->radeon.optionCache, "disable_stencil_two_side"))
 		_mesa_disable_extension(ctx, "GL_EXT_stencil_two_side");
@@ -330,10 +347,10 @@ static void r600InitGLExtensions(GLcontext *ctx)
 /* Create the device specific rendering context.
  */
 GLboolean r600CreateContext(const __GLcontextModes * glVisual,
-			    __DRIcontextPrivate * driContextPriv,
+			    __DRIcontext * driContextPriv,
 			    void *sharedContextPrivate)
 {
-	__DRIscreenPrivate *sPriv = driContextPriv->driScreenPriv;
+	__DRIscreen *sPriv = driContextPriv->driScreenPriv;
 	radeonScreenPtr screen = (radeonScreenPtr) (sPriv->private);
 	struct dd_function_table functions;
 	context_t *r600;
diff --git a/src/mesa/drivers/dri/r600/r600_context.h b/src/mesa/drivers/dri/r600/r600_context.h
index 394fd757d4..a1b4af715e 100644
--- a/src/mesa/drivers/dri/r600/r600_context.h
+++ b/src/mesa/drivers/dri/r600/r600_context.h
@@ -108,6 +108,7 @@ typedef struct StreamDesc
 	GLint   size;   //number of data element
 	GLenum  type;  //data element type
 	GLsizei stride;
+	GLenum  format; // GL_RGBA or GL_BGRA
 
 	struct radeon_bo *bo;
 	GLint  bo_offset;
@@ -153,7 +154,7 @@ struct r600_context {
 #define GL_CONTEXT(context)     ((GLcontext *)(context->radeon.glCtx))
 
 extern GLboolean r600CreateContext(const __GLcontextModes * glVisual,
-				   __DRIcontextPrivate * driContextPriv,
+				   __DRIcontext * driContextPriv,
 				   void *sharedContextPrivate);
 
 #define R700_CONTEXT_STATES(context) ((R700_CHIP_CONTEXT *)(&context->hw))
diff --git a/src/mesa/drivers/dri/r600/r600_reg_r6xx.h b/src/mesa/drivers/dri/r600/r600_reg_r6xx.h
index f7702c46de..74af7b4fed 100644
--- a/src/mesa/drivers/dri/r600/r600_reg_r6xx.h
+++ b/src/mesa/drivers/dri/r600/r600_reg_r6xx.h
@@ -415,11 +415,11 @@ enum {
 	ALPHA_TO_MASK_ENABLE                              = 1 << 0,
 	ALPHA_TO_MASK_OFFSET0_mask                        = 0x03 << 8,
 	ALPHA_TO_MASK_OFFSET0_shift                       = 8,
-	ALPHA_TO_MASK_OFFSET1_mask                        = 0x03 << 8,
+	ALPHA_TO_MASK_OFFSET1_mask                        = 0x03 << 10,
 	ALPHA_TO_MASK_OFFSET1_shift                       = 10,
-	ALPHA_TO_MASK_OFFSET2_mask                        = 0x03 << 8,
+	ALPHA_TO_MASK_OFFSET2_mask                        = 0x03 << 12,
 	ALPHA_TO_MASK_OFFSET2_shift                       = 12,
-	ALPHA_TO_MASK_OFFSET3_mask                        = 0x03 << 8,
+	ALPHA_TO_MASK_OFFSET3_mask                        = 0x03 << 14,
 	ALPHA_TO_MASK_OFFSET3_shift                       = 14,
 
 //  SQ_VTX_CONSTANT_WORD2_0                               = 0x00038008,
diff --git a/src/mesa/drivers/dri/r600/r600_reg_r7xx.h b/src/mesa/drivers/dri/r600/r600_reg_r7xx.h
index e5c01c861a..eb169bd885 100644
--- a/src/mesa/drivers/dri/r600/r600_reg_r7xx.h
+++ b/src/mesa/drivers/dri/r600/r600_reg_r7xx.h
@@ -143,6 +143,8 @@ enum {
 //  SQ_TEX_SAMPLER_MISC_0                                 = 0x0003d03c,
 	R7xx_TRUNCATE_COORD_bit                           = 1 << 9,
 	R7xx_DISABLE_CUBE_WRAP_bit                        = 1 << 10,
+//  DB_RENDER_CONTROL                                     = 0x00028d0c,
+	PERFECT_ZPASS_COUNTS_bit                          = 1 << 15,
 
 } ;
 
diff --git a/src/mesa/drivers/dri/r600/r600_tex.c b/src/mesa/drivers/dri/r600/r600_tex.c
index 20965bb3c8..f745fe3e8a 100644
--- a/src/mesa/drivers/dri/r600/r600_tex.c
+++ b/src/mesa/drivers/dri/r600/r600_tex.c
@@ -305,23 +305,14 @@ static void r600TexParameter(GLcontext * ctx, GLenum target,
 		break;
 
 	case GL_TEXTURE_BORDER_COLOR:
-		r600SetTexBorderColor(t, texObj->BorderColor);
+		r600SetTexBorderColor(t, texObj->BorderColor.f);
 		break;
 
 	case GL_TEXTURE_BASE_LEVEL:
 	case GL_TEXTURE_MAX_LEVEL:
 	case GL_TEXTURE_MIN_LOD:
 	case GL_TEXTURE_MAX_LOD:
-		/* This isn't the most efficient solution but there doesn't appear to
-		 * be a nice alternative.  Since there's no LOD clamping,
-		 * we just have to rely on loading the right subset of mipmap levels
-		 * to simulate a clamped LOD.
-		 */
-		if (t->mt) {
-			radeon_miptree_unreference(t->mt);
-			t->mt = 0;
-			t->validated = GL_FALSE;
-		}
+		t->validated = GL_FALSE;
 		break;
 
 	case GL_DEPTH_TEXTURE_MODE:
@@ -369,10 +360,8 @@ static void r600DeleteTexture(GLcontext * ctx, struct gl_texture_object *texObj)
 		t->bo = NULL;
 	}
 
-	if (t->mt) {
-		radeon_miptree_unreference(t->mt);
-		t->mt = 0;
-	}
+	radeon_miptree_unreference(&t->mt);
+
 	_mesa_delete_texture_object(ctx, texObj);
 }
 
@@ -402,7 +391,7 @@ static struct gl_texture_object *r600NewTextureObject(GLcontext * ctx,
 	r600SetTexDefaultState(t);
 	r600UpdateTexWrap(t);
 	r600SetTexFilter(t, t->base.MinFilter, t->base.MagFilter, t->base.MaxAnisotropy);
-	r600SetTexBorderColor(t, t->base.BorderColor);
+	r600SetTexBorderColor(t, t->base.BorderColor.f);
 
 	return &t->base;
 }
diff --git a/src/mesa/drivers/dri/r600/r600_texstate.c b/src/mesa/drivers/dri/r600/r600_texstate.c
index 27c8354923..b8466bdd75 100644
--- a/src/mesa/drivers/dri/r600/r600_texstate.c
+++ b/src/mesa/drivers/dri/r600/r600_texstate.c
@@ -91,7 +91,7 @@ static GLboolean r600GetTexFormat(struct gl_texture_object *tObj, gl_format mesa
 	SETfield(t->SQ_TEX_RESOURCE4, SQ_FORMAT_COMP_UNSIGNED,
 		 FORMAT_COMP_Y_shift, FORMAT_COMP_Y_mask);
 	SETfield(t->SQ_TEX_RESOURCE4, SQ_FORMAT_COMP_UNSIGNED,
-		 FORMAT_COMP_X_shift, FORMAT_COMP_Z_mask);
+		 FORMAT_COMP_Z_shift, FORMAT_COMP_Z_mask);
 	SETfield(t->SQ_TEX_RESOURCE4, SQ_FORMAT_COMP_UNSIGNED,
 		 FORMAT_COMP_W_shift, FORMAT_COMP_W_mask);
 
@@ -357,37 +357,37 @@ static GLboolean r600GetTexFormat(struct gl_texture_object *tObj, gl_format mesa
 		SETfield(t->SQ_TEX_RESOURCE1, FMT_32_32_32_32_FLOAT,
 			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
 
-		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_W,
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
 			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
-		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Z,
-			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
 		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Y,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Z,
 			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
-		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_W,
 			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
 		break;
 	case MESA_FORMAT_RGBA_FLOAT16:
 		SETfield(t->SQ_TEX_RESOURCE1, FMT_16_16_16_16_FLOAT,
 			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
 
-		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_W,
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
 			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
-		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Z,
-			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
 		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Y,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Z,
 			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
-		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_W,
 			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
 		break;
 	case MESA_FORMAT_RGB_FLOAT32: /* X, Y, Z, ONE */
 		SETfield(t->SQ_TEX_RESOURCE1, FMT_32_32_32_FLOAT,
 			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
 
-		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Z,
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
 			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
 		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Y,
 			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
-		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Z,
 			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
 		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_1,
 			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
@@ -396,11 +396,11 @@ static GLboolean r600GetTexFormat(struct gl_texture_object *tObj, gl_format mesa
 		SETfield(t->SQ_TEX_RESOURCE1, FMT_16_16_16_FLOAT,
 			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
 
-		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Z,
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
 			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
 		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Y,
 			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
-		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Z,
 			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
 		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_1,
 			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
@@ -461,26 +461,26 @@ static GLboolean r600GetTexFormat(struct gl_texture_object *tObj, gl_format mesa
 		SETfield(t->SQ_TEX_RESOURCE1, FMT_32_32_FLOAT,
 			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
 
-		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Y,
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
 			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
-		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Y,
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
 			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
-		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Y,
-			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
 		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Y,
 			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
 		break;
 	case MESA_FORMAT_LUMINANCE_ALPHA_FLOAT16:
 		SETfield(t->SQ_TEX_RESOURCE1, FMT_16_16_FLOAT,
 			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
 
-		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Y,
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
 			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
-		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Y,
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
 			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
-		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Y,
-			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
 		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Y,
 			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
 		break;
 	case MESA_FORMAT_INTENSITY_FLOAT32: /* X, X, X, X */
@@ -626,6 +626,31 @@ static GLboolean r600GetTexFormat(struct gl_texture_object *tObj, gl_format mesa
 	return GL_TRUE;
 }
 
+/**
+ * Translate a GL depth-comparison enum (texObj->CompareFunc) into the value
+ * stored in the DEPTH_COMPARE_FUNCTION field of SQ_TEX_SAMPLER0.
+ *
+ * \param func  comparison enum, GL_NEVER ... GL_ALWAYS.
+ * \return the matching SQ_TEX_DEPTH_COMPARE_* constant, or 0 after a
+ *         WARN_ONCE diagnostic when \p func is not one of the eight above.
+ */
+static GLuint r600_translate_shadow_func(GLenum func)
+{
+   switch (func) {
+   case GL_NEVER:    return SQ_TEX_DEPTH_COMPARE_NEVER;
+   case GL_LESS:     return SQ_TEX_DEPTH_COMPARE_LESS;
+   case GL_LEQUAL:   return SQ_TEX_DEPTH_COMPARE_LESSEQUAL;
+   case GL_GREATER:  return SQ_TEX_DEPTH_COMPARE_GREATER;
+   case GL_GEQUAL:   return SQ_TEX_DEPTH_COMPARE_GREATEREQUAL;
+   case GL_NOTEQUAL: return SQ_TEX_DEPTH_COMPARE_NOTEQUAL;
+   case GL_EQUAL:    return SQ_TEX_DEPTH_COMPARE_EQUAL;
+   case GL_ALWAYS:   return SQ_TEX_DEPTH_COMPARE_ALWAYS;
+   default:
+      WARN_ONCE("Unknown shadow compare function! 0x%x\n", func);
+      return 0;
+   }
+}
+
 void r600SetDepthTexMode(struct gl_texture_object *tObj)
 {
 	radeonTexObjPtr t;
@@ -649,7 +674,6 @@ static void setup_hardware_state(context_t *rmesa, struct gl_texture_object *tex
 {
 	radeonTexObj *t = radeon_tex_obj(texObj);
 	const struct gl_texture_image *firstImage;
-	int firstlevel = t->mt ? t->mt->firstLevel : 0;
 	GLuint uTexelPitch, row_align;
 
 	if (rmesa->radeon.radeonScreen->driScreen->dri2.enabled &&
@@ -657,7 +681,7 @@ static void setup_hardware_state(context_t *rmesa, struct gl_texture_object *tex
 	    t->bo)
 		return;
 
-	firstImage = t->base.Image[0][firstlevel];
+	firstImage = t->base.Image[0][t->minLod];
 
 	if (!t->image_override) {
 		if (!r600GetTexFormat(texObj, firstImage->TexFormat)) {
@@ -692,7 +716,8 @@ static void setup_hardware_state(context_t *rmesa, struct gl_texture_object *tex
 	}
 
 	row_align = rmesa->radeon.texture_row_align - 1;
-	uTexelPitch = ((firstImage->Width * t->mt->bpp + row_align) & ~row_align) / t->mt->bpp;
+	uTexelPitch = (_mesa_format_row_stride(firstImage->TexFormat, firstImage->Width) + row_align) & ~row_align;
+	uTexelPitch = uTexelPitch / _mesa_get_format_bytes(firstImage->TexFormat);
 	uTexelPitch = (uTexelPitch + R700_TEXEL_PITCH_ALIGNMENT_MASK)
 		& ~R700_TEXEL_PITCH_ALIGNMENT_MASK;
 
@@ -706,11 +731,22 @@ static void setup_hardware_state(context_t *rmesa, struct gl_texture_object *tex
 	SETfield(t->SQ_TEX_RESOURCE1, firstImage->Height - 1,
 		 TEX_HEIGHT_shift, TEX_HEIGHT_mask);
 
-	if ((t->mt->lastLevel - t->mt->firstLevel) > 0) {
-		t->SQ_TEX_RESOURCE3 = t->mt->levels[0].size / 256;
-		SETfield(t->SQ_TEX_RESOURCE4, t->mt->firstLevel, BASE_LEVEL_shift, BASE_LEVEL_mask);
-		SETfield(t->SQ_TEX_RESOURCE5, t->mt->lastLevel, LAST_LEVEL_shift, LAST_LEVEL_mask);
+	t->SQ_TEX_RESOURCE2 = get_base_teximage_offset(t) / 256;
+
+	if ((t->maxLod - t->minLod) > 0) {
+		t->SQ_TEX_RESOURCE3 = radeon_miptree_image_offset(t->mt, 0, t->minLod + 1) / 256;
+		SETfield(t->SQ_TEX_RESOURCE4, 0, BASE_LEVEL_shift, BASE_LEVEL_mask);
+		SETfield(t->SQ_TEX_RESOURCE5, t->maxLod - t->minLod, LAST_LEVEL_shift, LAST_LEVEL_mask);
+	}
+	if(texObj->CompareMode == GL_COMPARE_R_TO_TEXTURE_ARB)
+	{
+		SETfield(t->SQ_TEX_SAMPLER0, r600_translate_shadow_func(texObj->CompareFunc), DEPTH_COMPARE_FUNCTION_shift, DEPTH_COMPARE_FUNCTION_mask);
 	}
+	else
+	{
+		CLEARfield(t->SQ_TEX_SAMPLER0, DEPTH_COMPARE_FUNCTION_mask);
+	}
+
 }
 
 /**
@@ -808,9 +844,8 @@ void r600SetTexOffset(__DRIcontext * pDRICtx, GLint texname,
 	struct gl_texture_object *tObj =
 	    _mesa_lookup_texture(rmesa->radeon.glCtx, texname);
 	radeonTexObjPtr t = radeon_tex_obj(tObj);
-	int firstlevel = t->mt ? t->mt->firstLevel : 0;
 	const struct gl_texture_image *firstImage;
-	uint32_t pitch_val, size, row_align, bpp;
+	uint32_t pitch_val, size, row_align;
 
 	if (!tObj)
 		return;
@@ -820,13 +855,9 @@ void r600SetTexOffset(__DRIcontext * pDRICtx, GLint texname,
 	if (!offset)
 		return;
 
-	bpp = depth / 8;
-	if (bpp == 3) 
-		bpp = 4;
-
-	firstImage = t->base.Image[0][firstlevel];
+	firstImage = t->base.Image[0][t->minLod];
 	row_align = rmesa->radeon.texture_row_align - 1;
-	size = ((firstImage->Width * bpp + row_align) & ~row_align) * firstImage->Height;
+	size = ((_mesa_format_row_stride(firstImage->TexFormat, firstImage->Width) + row_align) & ~row_align) * firstImage->Height;
 	if (t->bo) {
 		radeon_bo_unref(t->bo);
 		t->bo = NULL;
@@ -922,18 +953,7 @@ void r600SetTexBuffer2(__DRIcontext *pDRICtx, GLint target, GLint glx_texture_fo
     	    return;
     	}
 
-	radeon_update_renderbuffers(pDRICtx, dPriv);
-	/* back & depth buffer are useless free them right away */
-	rb = (void*)rfb->base.Attachment[BUFFER_DEPTH].Renderbuffer;
-	if (rb && rb->bo) {
-		radeon_bo_unref(rb->bo);
-        rb->bo = NULL;
-	}
-	rb = (void*)rfb->base.Attachment[BUFFER_BACK_LEFT].Renderbuffer;
-	if (rb && rb->bo) {
-		radeon_bo_unref(rb->bo);
-		rb->bo = NULL;
-	}
+	radeon_update_renderbuffers(pDRICtx, dPriv, GL_TRUE);
 	rb = rfb->color_rb[0];
 	if (rb->bo == NULL) {
 		/* Failed to BO for the buffer */
@@ -949,14 +969,10 @@ void r600SetTexBuffer2(__DRIcontext *pDRICtx, GLint target, GLint glx_texture_fo
 		radeon_bo_unref(rImage->bo);
 		rImage->bo = NULL;
 	}
-	if (t->mt) {
-		radeon_miptree_unreference(t->mt);
-		t->mt = NULL;
-	}
-	if (rImage->mt) {
-		radeon_miptree_unreference(rImage->mt);
-		rImage->mt = NULL;
-	}
+
+	radeon_miptree_unreference(&t->mt);
+	radeon_miptree_unreference(&rImage->mt);
+
 	_mesa_init_teximage_fields(radeon->glCtx, target, texImage,
 				   rb->base.Width, rb->base.Height, 1, 0, rb->cpp);
 	texImage->RowStride = rb->pitch / rb->cpp;
diff --git a/src/mesa/drivers/dri/r600/r700_assembler.c b/src/mesa/drivers/dri/r600/r700_assembler.c
index e0d7d4fa6b..0ff16b4ddd 100644
--- a/src/mesa/drivers/dri/r600/r700_assembler.c
+++ b/src/mesa/drivers/dri/r600/r700_assembler.c
@@ -32,12 +32,49 @@
 
 #include "main/mtypes.h"
 #include "main/imports.h"
+#include "shader/prog_parameter.h"
 
 #include "radeon_debug.h"
 #include "r600_context.h"
 
 #include "r700_assembler.h"
 
+#define USE_CF_FOR_CONTINUE_BREAK 1
+#define USE_CF_FOR_POP_AFTER      1
+
+struct prog_instruction noise1_insts[12] = { /* canned IR subroutine (BGNSUB..ENDSUB) referenced by noise1_presub below */
+    {OPCODE_BGNSUB , {{13, 0, 1672, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}}, {13, 0, 15, 0, 8, 1672, 0}, 0, 0, 0, 1, 0, 0, 0, -1, 0, 0, 0}, 
+    {OPCODE_MOV , {{0, 0, 0, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}}, {0, 0, 2, 0, 8, 1672, 0}, 0, 0, 0, 1, 0, 0, 0, -1, 0, 0, 0}, 
+    {OPCODE_MOV , {{8, 0, 0, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}}, {0, 0, 4, 0, 8, 1672, 0}, 0, 0, 0, 1, 0, 0, 0, -1, 0, 0, 0}, 
+    {OPCODE_MOV , {{8, 0, 585, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}}, {0, 0, 8, 0, 8, 1672, 0}, 0, 0, 0, 1, 0, 0, 0, -1, 0, 0, 0}, 
+    {OPCODE_SGT , {{0, 0, 585, 0, 0, 0}, {8, 0, 1170, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}}, {0, 1, 1, 0, 8, 1672, 0}, 1, 0, 0, 1, 0, 0, 0, -1, 0, 0, 0}, 
+    {OPCODE_IF , {{13, 0, 1672, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}}, {13, 0, 15, 0, 7, 0, 0}, 0, 0, 0, 1, 0, 0, 0, 15, 0, 0, 0}, 
+    {OPCODE_MOV , {{0, 0, 1755, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}}, {0, 0, 1, 0, 8, 1672, 0}, 0, 0, 0, 1, 0, 0, 0, -1, 0, 0, 0}, 
+    {OPCODE_RET , {{13, 0, 1672, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}}, {13, 0, 15, 0, 8, 1672, 0}, 0, 0, 0, 1, 0, 0, 0, -1, 0, 0, 0}, 
+    {OPCODE_ENDIF , {{13, 0, 1672, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}}, {13, 0, 15, 0, 8, 1672, 0}, 0, 0, 0, 1, 0, 0, 0, -1, 0, 0, 0}, 
+    {OPCODE_MOV , {{0, 0, 1170, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}}, {0, 0, 1, 0, 8, 1672, 0}, 0, 0, 0, 1, 0, 0, 0, -1, 0, 0, 0}, 
+    {OPCODE_RET , {{13, 0, 1672, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}}, {13, 0, 15, 0, 8, 1672, 0}, 0, 0, 0, 1, 0, 0, 0, -1, 0, 0, 0}, 
+    {OPCODE_ENDSUB , {{13, 0, 1672, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}}, {13, 0, 15, 0, 8, 1672, 0}, 0, 0, 0, 1, 0, 0, 0, -1, 0, 0, 0}
+};
+float noise1_const[2][4] = { /* two vec4 rows declared; only row 0 is listed, row 1 is implicitly zero */
+    {0.300000f, 0.900000f, 0.500000f, 0.300000f}
+};
+
+COMPILED_SUB noise1_presub = { /* NOTE(review): field meanings are defined by COMPILED_SUB, declared outside this view */
+    &(noise1_insts[0]),
+    12, /* equals the element count of noise1_insts */
+    2, 
+    1, 
+    0, 
+    &(noise1_const[0]), /* first row of the constant table above */
+    SWIZZLE_X, 
+    SWIZZLE_X, 
+    SWIZZLE_X, 
+    SWIZZLE_X,
+    {0,0,0},
+    0 
+};
+
 BITS addrmode_PVSDST(PVSDST * pPVSDST)
 {
 	return pPVSDST->addrmode0 | ((BITS)pPVSDST->addrmode1 << 1);
@@ -327,22 +364,27 @@ GLuint GetSurfaceFormat(GLenum eType, GLuint nChannels, GLuint * pClient_size)
     return(format);
 }
 
-unsigned int r700GetNumOperands(r700_AssemblerBase* pAsm)
+unsigned int r700GetNumOperands(GLuint opcode, GLuint nIsOp3) 
 {
-    if(pAsm->D.dst.op3)
+    if(nIsOp3 > 0)
     {
         return 3;
     }
 
-    switch (pAsm->D.dst.opcode)
+    switch (opcode)
     {
     case SQ_OP2_INST_ADD:
+    case SQ_OP2_INST_KILLE:
     case SQ_OP2_INST_KILLGT:
+    case SQ_OP2_INST_KILLGE:
+    case SQ_OP2_INST_KILLNE:
     case SQ_OP2_INST_MUL: 
     case SQ_OP2_INST_MAX:
     case SQ_OP2_INST_MIN:
     //case SQ_OP2_INST_MAX_DX10:
     //case SQ_OP2_INST_MIN_DX10:
+    case SQ_OP2_INST_SETE: 
+    case SQ_OP2_INST_SETNE:
     case SQ_OP2_INST_SETGT:
     case SQ_OP2_INST_SETGE:
     case SQ_OP2_INST_PRED_SETE:
@@ -358,6 +400,7 @@ unsigned int r700GetNumOperands(r700_AssemblerBase* pAsm)
     case SQ_OP2_INST_MOVA_FLOOR:
     case SQ_OP2_INST_FRACT:
     case SQ_OP2_INST_FLOOR:
+    case SQ_OP2_INST_TRUNC:
     case SQ_OP2_INST_EXP_IEEE:
     case SQ_OP2_INST_LOG_CLAMPED:
     case SQ_OP2_INST_LOG_IEEE:
@@ -369,7 +412,7 @@ unsigned int r700GetNumOperands(r700_AssemblerBase* pAsm)
         return 1;
         
     default: radeon_error(
-		    "Need instruction operand number for %x.\n", pAsm->D.dst.opcode);
+		    "Need instruction operand number for %x.\n", opcode); 
     };
 
     return 3;
@@ -383,103 +426,128 @@ int Init_r700_AssemblerBase(SHADER_PIPE_TYPE spt, r700_AssemblerBase* pAsm, R700
     pAsm->pR700Shader = pShader;
     pAsm->currentShaderType = spt;
 
-	pAsm->cf_last_export_ptr   = NULL;
+    pAsm->cf_last_export_ptr   = NULL;
+
+    pAsm->cf_current_export_clause_ptr = NULL;
+    pAsm->cf_current_alu_clause_ptr    = NULL;
+    pAsm->cf_current_tex_clause_ptr    = NULL;
+    pAsm->cf_current_vtx_clause_ptr    = NULL;
+    pAsm->cf_current_cf_clause_ptr     = NULL;
 
-	pAsm->cf_current_export_clause_ptr = NULL;
-	pAsm->cf_current_alu_clause_ptr    = NULL;
-	pAsm->cf_current_tex_clause_ptr    = NULL;
-	pAsm->cf_current_vtx_clause_ptr    = NULL;
-	pAsm->cf_current_cf_clause_ptr     = NULL;
+    // No clause has been created yet
+    pAsm->cf_current_clause_type = CF_EMPTY_CLAUSE;
 
-	// No clause has been created yet
-	pAsm->cf_current_clause_type = CF_EMPTY_CLAUSE;
+    pAsm->number_of_colorandz_exports = 0;
+    pAsm->number_of_exports           = 0;
+    pAsm->number_of_export_opcodes    = 0;
 
-	pAsm->number_of_colorandz_exports = 0;
-	pAsm->number_of_exports           = 0;
-	pAsm->number_of_export_opcodes    = 0;
+    pAsm->alu_x_opcode = 0;
 
+    pAsm->D2.bits = 0;
 
-	pAsm->D.bits = 0;
-	pAsm->S[0].bits = 0;
-	pAsm->S[1].bits = 0;
-	pAsm->S[2].bits = 0;
+    pAsm->D.bits = 0;
+    pAsm->S[0].bits = 0;
+    pAsm->S[1].bits = 0;
+    pAsm->S[2].bits = 0;
 
-	pAsm->uLastPosUpdate = 0; 
+    pAsm->uLastPosUpdate = 0; 
 	
-	*(BITS *) &pAsm->fp_stOutFmt0 = 0;
+    *(BITS *) &pAsm->fp_stOutFmt0 = 0;
 
-	pAsm->uIIns = 0;
-	pAsm->uOIns = 0;
-	pAsm->number_used_registers = 0;
-	pAsm->uUsedConsts = 256; 
+    pAsm->uIIns = 0;
+    pAsm->uOIns = 0;
+    pAsm->number_used_registers = 0;
+    pAsm->uUsedConsts = 256; 
 
 
-	// Fragment programs
-	pAsm->uBoolConsts = 0;
-	pAsm->uIntConsts = 0;
-	pAsm->uInsts = 0;
-	pAsm->uConsts = 0;
+    // Fragment programs
+    pAsm->uBoolConsts = 0;
+    pAsm->uIntConsts = 0;
+    pAsm->uInsts = 0;
+    pAsm->uConsts = 0;
 
-	pAsm->FCSP = 0;
-	pAsm->fc_stack[0].type = FC_NONE;
+    pAsm->FCSP = 0;
+    pAsm->fc_stack[0].type = FC_NONE;
 
-	pAsm->branch_depth     = 0;
-	pAsm->max_branch_depth = 0;
+    pAsm->aArgSubst[0] =
+    pAsm->aArgSubst[1] =
+    pAsm->aArgSubst[2] =
+    pAsm->aArgSubst[3] = (-1);
 
-	pAsm->aArgSubst[0] =
-	pAsm->aArgSubst[1] =
-	pAsm->aArgSubst[2] =
-	pAsm->aArgSubst[3] = (-1);
+    pAsm->uOutputs = 0;
 
-	pAsm->uOutputs = 0;
+    for (i=0; i<NUMBER_OF_OUTPUT_COLORS; i++) 
+    {
+        pAsm->color_export_register_number[i] = (-1);
+    }
 
-	for (i=0; i<NUMBER_OF_OUTPUT_COLORS; i++) 
-	{
-		pAsm->color_export_register_number[i] = (-1);
-	}
 
+    pAsm->depth_export_register_number = (-1);
+    pAsm->stencil_export_register_number = (-1);
+    pAsm->coverage_to_mask_export_register_number = (-1);
+    pAsm->mask_export_register_number = (-1);
+
+    pAsm->starting_export_register_number = 0;
+    pAsm->starting_vfetch_register_number = 0;
+    pAsm->starting_temp_register_number   = 0;
+    pAsm->uFirstHelpReg = 0;
+
+    pAsm->input_position_is_used = GL_FALSE;
+    pAsm->input_normal_is_used   = GL_FALSE;
+
+    for (i=0; i<NUMBER_OF_INPUT_COLORS; i++) 
+    {
+        pAsm->input_color_is_used[ i ] = GL_FALSE;
+    }
+
+    for (i=0; i<NUMBER_OF_TEXTURE_UNITS; i++) 
+    {
+        pAsm->input_texture_unit_is_used[ i ] = GL_FALSE;
+    }
 
-	pAsm->depth_export_register_number = (-1);
-	pAsm->stencil_export_register_number = (-1);
-	pAsm->coverage_to_mask_export_register_number = (-1);
-	pAsm->mask_export_register_number = (-1);
+    for (i=0; i<VERT_ATTRIB_MAX; i++) 
+    {
+        pAsm->vfetch_instruction_ptr_array[ i ] = NULL;
+    }
 
-	pAsm->starting_export_register_number = 0;
-	pAsm->starting_vfetch_register_number = 0;
-	pAsm->starting_temp_register_number   = 0;
-	pAsm->uFirstHelpReg = 0;
+    pAsm->number_of_inputs = 0;
 
+    pAsm->is_tex = GL_FALSE;
+    pAsm->need_tex_barrier = GL_FALSE;
 
-	pAsm->input_position_is_used = GL_FALSE;
-	pAsm->input_normal_is_used   = GL_FALSE;
+    pAsm->subs              = NULL;
+    pAsm->unSubArraySize    = 0;
+    pAsm->unSubArrayPointer = 0;
+    pAsm->callers              = NULL;
+    pAsm->unCallerArraySize    = 0;
+    pAsm->unCallerArrayPointer = 0;
 
+    pAsm->CALLSP = 0;
+    pAsm->CALLSTACK[0].FCSP_BeforeEntry = 0;
+    pAsm->CALLSTACK[0].plstCFInstructions_local
+          = &(pAsm->pR700Shader->lstCFInstructions);
 
-	for (i=0; i<NUMBER_OF_INPUT_COLORS; i++) 
-	{
-		pAsm->input_color_is_used[ i ] = GL_FALSE;
-	}
+    pAsm->CALLSTACK[0].max = 0;
+    pAsm->CALLSTACK[0].current = 0;
 
-	for (i=0; i<NUMBER_OF_TEXTURE_UNITS; i++) 
-	{
-		pAsm->input_texture_unit_is_used[ i ] = GL_FALSE;
-	}
+    SetActiveCFlist(pAsm->pR700Shader, pAsm->CALLSTACK[0].plstCFInstructions_local);
 
-	for (i=0; i<VERT_ATTRIB_MAX; i++) 
-	{
-		pAsm->vfetch_instruction_ptr_array[ i ] = NULL;
-	}
+    pAsm->unCFflags = 0;
 
-	pAsm->number_of_inputs = 0;
+    pAsm->presubs           = NULL;
+    pAsm->unPresubArraySize = 0;
+    pAsm->unNumPresub       = 0;
+    pAsm->unCurNumILInsts   = 0;
 
-	pAsm->is_tex = GL_FALSE;
-	pAsm->need_tex_barrier = GL_FALSE;
+    pAsm->unVetTexBits      = 0;
 
-	return 0;
+    return 0;
 }
 
 GLboolean IsTex(gl_inst_opcode Opcode)
 {
-    if( (OPCODE_TEX==Opcode) || (OPCODE_TXP==Opcode) || (OPCODE_TXB==Opcode) )
+    if( (OPCODE_TEX==Opcode) || (OPCODE_TXP==Opcode) || (OPCODE_TXB==Opcode) ||
+        (OPCODE_DDX==Opcode) || (OPCODE_DDY==Opcode) )
     {
         return GL_TRUE;
     }
@@ -592,6 +660,31 @@ int check_current_clause(r700_AssemblerBase* pAsm,
     return GL_TRUE;
 }
 
+/**
+ * Open a new generic control-flow clause, make it pAsm's current CF clause
+ * and append it to the shader; GL_FALSE on clause-check or allocation failure.
+ */
+GLboolean add_cf_instruction(r700_AssemblerBase* pAsm)
+{
+    if(GL_FALSE == check_current_clause(pAsm, CF_OTHER_CLAUSE))
+    {
+        return GL_FALSE;
+    }
+
+    pAsm->cf_current_cf_clause_ptr =
+      (R700ControlFlowGenericClause*) CALLOC_STRUCT(R700ControlFlowGenericClause);
+    if (pAsm->cf_current_cf_clause_ptr == NULL)
+    {
+        radeon_error("Could not allocate a new generic CF instruction.\n");
+        return GL_FALSE;
+    }
+
+    Init_R700ControlFlowGenericClause(pAsm->cf_current_cf_clause_ptr);
+    AddCFInstruction( pAsm->pR700Shader,
+                      (R700ControlFlowInstruction *)pAsm->cf_current_cf_clause_ptr );
+    return GL_TRUE;
+}
+
 GLboolean add_vfetch_instruction(r700_AssemblerBase*     pAsm,
 								 R700VertexInstruction*  vertex_instruction_ptr)
 {
@@ -798,6 +891,7 @@ GLboolean assemble_vfetch_instruction2(r700_AssemblerBase* pAsm,
                                        GLubyte             element,
                                        GLuint              _signed,
                                        GLboolean           normalize,
+                                       GLenum              format,
                                        VTX_FETCH_METHOD  * pFetchMethod)
 {
     GLuint client_size_inbyte;
@@ -846,10 +940,21 @@ GLboolean assemble_vfetch_instruction2(r700_AssemblerBase* pAsm,
 	vfetch_instruction_ptr->m_Word0.f.src_sel_x        = SQ_SEL_X;
 	vfetch_instruction_ptr->m_Word0.f.mega_fetch_count = mega_fetch_count;
 
-	vfetch_instruction_ptr->m_Word1.f.dst_sel_x        = (size < 1) ? SQ_SEL_0 : SQ_SEL_X;
-	vfetch_instruction_ptr->m_Word1.f.dst_sel_y        = (size < 2) ? SQ_SEL_0 : SQ_SEL_Y;
-	vfetch_instruction_ptr->m_Word1.f.dst_sel_z        = (size < 3) ? SQ_SEL_0 : SQ_SEL_Z;
-	vfetch_instruction_ptr->m_Word1.f.dst_sel_w        = (size < 4) ? SQ_SEL_1 : SQ_SEL_W;
+	if(format == GL_BGRA)
+	{
+		vfetch_instruction_ptr->m_Word1.f.dst_sel_x        = (size < 1) ? SQ_SEL_0 : SQ_SEL_Z;
+		vfetch_instruction_ptr->m_Word1.f.dst_sel_y        = (size < 2) ? SQ_SEL_0 : SQ_SEL_Y;
+		vfetch_instruction_ptr->m_Word1.f.dst_sel_z        = (size < 3) ? SQ_SEL_0 : SQ_SEL_X;
+		vfetch_instruction_ptr->m_Word1.f.dst_sel_w        = (size < 4) ? SQ_SEL_1 : SQ_SEL_W;
+	}
+	else
+	{
+		vfetch_instruction_ptr->m_Word1.f.dst_sel_x        = (size < 1) ? SQ_SEL_0 : SQ_SEL_X;
+		vfetch_instruction_ptr->m_Word1.f.dst_sel_y        = (size < 2) ? SQ_SEL_0 : SQ_SEL_Y;
+		vfetch_instruction_ptr->m_Word1.f.dst_sel_z        = (size < 3) ? SQ_SEL_0 : SQ_SEL_Z;
+		vfetch_instruction_ptr->m_Word1.f.dst_sel_w        = (size < 4) ? SQ_SEL_1 : SQ_SEL_W;
+
+	}
 
 	vfetch_instruction_ptr->m_Word1.f.use_const_fields = 1;
     vfetch_instruction_ptr->m_Word1.f.data_format      = data_format;
@@ -987,7 +1092,8 @@ GLboolean checkop2(r700_AssemblerBase* pAsm)
 
     checkop_init(pAsm);
 
-    if( (pILInst->SrcReg[0].File == PROGRAM_CONSTANT)    ||
+    if( (pILInst->SrcReg[0].File == PROGRAM_UNIFORM)     || 
+        (pILInst->SrcReg[0].File == PROGRAM_CONSTANT)    ||
         (pILInst->SrcReg[0].File == PROGRAM_LOCAL_PARAM) ||
         (pILInst->SrcReg[0].File == PROGRAM_ENV_PARAM)   ||
         (pILInst->SrcReg[0].File == PROGRAM_STATE_VAR) )
@@ -998,7 +1104,8 @@ GLboolean checkop2(r700_AssemblerBase* pAsm)
     {
         bSrcConst[0] = GL_FALSE;
     }
-    if( (pILInst->SrcReg[1].File == PROGRAM_CONSTANT)    ||
+    if( (pILInst->SrcReg[1].File == PROGRAM_UNIFORM)     || 
+        (pILInst->SrcReg[1].File == PROGRAM_CONSTANT)    ||
         (pILInst->SrcReg[1].File == PROGRAM_LOCAL_PARAM) ||
         (pILInst->SrcReg[1].File == PROGRAM_ENV_PARAM)   ||
         (pILInst->SrcReg[1].File == PROGRAM_STATE_VAR) )
@@ -1031,7 +1138,8 @@ GLboolean checkop3(r700_AssemblerBase* pAsm)
 
     checkop_init(pAsm);
 
-    if( (pILInst->SrcReg[0].File == PROGRAM_CONSTANT)    ||
+    if( (pILInst->SrcReg[0].File == PROGRAM_UNIFORM)     || 
+        (pILInst->SrcReg[0].File == PROGRAM_CONSTANT)    ||
         (pILInst->SrcReg[0].File == PROGRAM_LOCAL_PARAM) ||
         (pILInst->SrcReg[0].File == PROGRAM_ENV_PARAM)   ||
         (pILInst->SrcReg[0].File == PROGRAM_STATE_VAR) )
@@ -1042,7 +1150,8 @@ GLboolean checkop3(r700_AssemblerBase* pAsm)
     {
         bSrcConst[0] = GL_FALSE;
     }
-    if( (pILInst->SrcReg[1].File == PROGRAM_CONSTANT)    ||
+    if( (pILInst->SrcReg[1].File == PROGRAM_UNIFORM)     || 
+        (pILInst->SrcReg[1].File == PROGRAM_CONSTANT)    ||
         (pILInst->SrcReg[1].File == PROGRAM_LOCAL_PARAM) ||
         (pILInst->SrcReg[1].File == PROGRAM_ENV_PARAM)   ||
         (pILInst->SrcReg[1].File == PROGRAM_STATE_VAR) )
@@ -1053,7 +1162,8 @@ GLboolean checkop3(r700_AssemblerBase* pAsm)
     {
         bSrcConst[1] = GL_FALSE;
     }
-    if( (pILInst->SrcReg[2].File == PROGRAM_CONSTANT)    ||
+    if( (pILInst->SrcReg[2].File == PROGRAM_UNIFORM)     || 
+        (pILInst->SrcReg[2].File == PROGRAM_CONSTANT)    ||
         (pILInst->SrcReg[2].File == PROGRAM_LOCAL_PARAM) ||
         (pILInst->SrcReg[2].File == PROGRAM_ENV_PARAM)   ||
         (pILInst->SrcReg[2].File == PROGRAM_STATE_VAR) )
@@ -1153,6 +1263,7 @@ GLboolean assemble_src(r700_AssemblerBase *pAsm,
         case PROGRAM_LOCAL_PARAM:
         case PROGRAM_ENV_PARAM:
         case PROGRAM_STATE_VAR:
+        case PROGRAM_UNIFORM:
             if (1 == pILInst->SrcReg[src].RelAddr)
             {
                 setaddrmode_PVSSRC(&(pAsm->S[fld].src), ADDR_RELATIVE_A0);
@@ -1163,10 +1274,18 @@ GLboolean assemble_src(r700_AssemblerBase *pAsm,
             }
 
             pAsm->S[fld].src.rtype = SRC_REG_CONSTANT;
-            pAsm->S[fld].src.reg   = pILInst->SrcReg[src].Index;
+            if(pILInst->SrcReg[src].Index < 0)
+            {
+                WARN_ONCE("Negative register offsets not supported yet!\n");
+                pAsm->S[fld].src.reg  = 0;
+            } 
+            else
+            {
+                pAsm->S[fld].src.reg = pILInst->SrcReg[src].Index;
+            }
             break;      
         case PROGRAM_INPUT:
-            setaddrmode_PVSSRC(&(pAsm->S[fld].src), ADDR_ABSOLUTE);
+            setaddrmode_PVSSRC(&(pAsm->S[fld].src), ADDR_ABSOLUTE); 
             pAsm->S[fld].src.rtype = SRC_REG_INPUT;
             switch (pAsm->currentShaderType)
             {
@@ -1179,7 +1298,7 @@ GLboolean assemble_src(r700_AssemblerBase *pAsm,
             }
             break;      
         default:
-            radeon_error("Invalid source argument type\n");
+            radeon_error("Invalid source argument type : %d \n", pILInst->SrcReg[src].File);
             return GL_FALSE;
         }
     } 
@@ -1235,6 +1354,15 @@ GLboolean assemble_dst(r700_AssemblerBase *pAsm)
     pAsm->D.dst.writez = (pILInst->DstReg.WriteMask >> 2) & 0x1;
     pAsm->D.dst.writew = (pILInst->DstReg.WriteMask >> 3) & 0x1;
   
+    if(pILInst->SaturateMode == SATURATE_ZERO_ONE)
+    {
+        pAsm->D2.dst2.SaturateMode = 1;
+    }
+    else
+    {
+        pAsm->D2.dst2.SaturateMode = 0;
+    }
+
     return GL_TRUE;
 }
 
@@ -1294,6 +1422,7 @@ GLboolean tex_src(r700_AssemblerBase *pAsm)
     else
     {
     switch (pILInst->SrcReg[0].File) {
+        case PROGRAM_UNIFORM: 
         case PROGRAM_CONSTANT:
         case PROGRAM_LOCAL_PARAM:
         case PROGRAM_ENV_PARAM:
@@ -1306,36 +1435,65 @@ GLboolean tex_src(r700_AssemblerBase *pAsm)
             pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
             break;
         case PROGRAM_INPUT:
-            switch (pILInst->SrcReg[0].Index)
+            if(SPT_VP == pAsm->currentShaderType)
+            {
+                switch (pILInst->SrcReg[0].Index)
+                {
+                    case VERT_ATTRIB_TEX0:
+                    case VERT_ATTRIB_TEX1:
+                    case VERT_ATTRIB_TEX2:
+                    case VERT_ATTRIB_TEX3:
+                    case VERT_ATTRIB_TEX4:
+                    case VERT_ATTRIB_TEX5:
+                    case VERT_ATTRIB_TEX6:
+                    case VERT_ATTRIB_TEX7:
+                        bValidTexCoord = GL_TRUE;
+                        pAsm->S[0].src.reg   =
+                            pAsm->ucVP_AttributeMap[pILInst->SrcReg[0].Index];
+                        pAsm->S[0].src.rtype = SRC_REG_INPUT;
+                        break;
+                }
+            }
+            else
             {
-                case FRAG_ATTRIB_WPOS:
-                case FRAG_ATTRIB_COL0:
-                case FRAG_ATTRIB_COL1:
-                case FRAG_ATTRIB_FOGC:
-                case FRAG_ATTRIB_TEX0:
-                case FRAG_ATTRIB_TEX1:
-                case FRAG_ATTRIB_TEX2:
-	        case FRAG_ATTRIB_TEX3:
-                case FRAG_ATTRIB_TEX4:
-                case FRAG_ATTRIB_TEX5:
-                case FRAG_ATTRIB_TEX6:
-                case FRAG_ATTRIB_TEX7:
-                    bValidTexCoord = GL_TRUE;
+                switch (pILInst->SrcReg[0].Index)
+                {
+                    case FRAG_ATTRIB_WPOS:
+                    case FRAG_ATTRIB_COL0:
+                    case FRAG_ATTRIB_COL1:
+                    case FRAG_ATTRIB_FOGC:
+                    case FRAG_ATTRIB_TEX0:
+                    case FRAG_ATTRIB_TEX1:
+                    case FRAG_ATTRIB_TEX2:
+                    case FRAG_ATTRIB_TEX3:
+                    case FRAG_ATTRIB_TEX4:
+                    case FRAG_ATTRIB_TEX5:
+                    case FRAG_ATTRIB_TEX6:
+                    case FRAG_ATTRIB_TEX7:
+                        bValidTexCoord = GL_TRUE;
+                        pAsm->S[0].src.reg   =
+                            pAsm->uiFP_AttributeMap[pILInst->SrcReg[0].Index];
+                        pAsm->S[0].src.rtype = SRC_REG_INPUT;
+                        break;
+                    case FRAG_ATTRIB_FACE:
+                        fprintf(stderr, "FRAG_ATTRIB_FACE unsupported\n");
+                        break;
+                    case FRAG_ATTRIB_PNTC:
+                        fprintf(stderr, "FRAG_ATTRIB_PNTC unsupported\n");
+                        break;
+                }
+
+                if( (pILInst->SrcReg[0].Index >= FRAG_ATTRIB_VAR0) &&
+                    (pILInst->SrcReg[0].Index < FRAG_ATTRIB_MAX) )
+                {   /* generic varying: FRAG_ATTRIB_VAR0 <= Index < FRAG_ATTRIB_MAX */
+                    bValidTexCoord = GL_TRUE;
                     pAsm->S[0].src.reg   =
                         pAsm->uiFP_AttributeMap[pILInst->SrcReg[0].Index];
                     pAsm->S[0].src.rtype = SRC_REG_INPUT;
-                    break;
-                case FRAG_ATTRIB_FACE:
-                    fprintf(stderr, "FRAG_ATTRIB_FACE unsupported\n");
-                    break;
-                case FRAG_ATTRIB_PNTC:
-                    fprintf(stderr, "FRAG_ATTRIB_PNTC unsupported\n");
-                    break;
-                case FRAG_ATTRIB_VAR0:
-                    fprintf(stderr, "FRAG_ATTRIB_VAR0 unsupported\n");
-                    break;
+                }
             }
-        break;
+
+            break;
         }
     }
 
@@ -1380,8 +1538,17 @@ GLboolean assemble_tex_instruction(r700_AssemblerBase *pAsm, GLboolean normalize
     tex_instruction_ptr->m_Word0.f.tex_inst         = pAsm->D.dst.opcode;
     tex_instruction_ptr->m_Word0.f.bc_frac_mode     = 0x0;
     tex_instruction_ptr->m_Word0.f.fetch_whole_quad = 0x0;
+    tex_instruction_ptr->m_Word0.f.alt_const        = 0;
 
-    tex_instruction_ptr->m_Word0.f.resource_id      = texture_unit_source->reg;
+    if(SPT_VP == pAsm->currentShaderType)
+    {
+        tex_instruction_ptr->m_Word0.f.resource_id      = texture_unit_source->reg + VERT_ATTRIB_MAX;
+        pAsm->unVetTexBits |= 1 << texture_unit_source->reg;
+    }
+    else
+    {
+        tex_instruction_ptr->m_Word0.f.resource_id      = texture_unit_source->reg;
+    }
 
     tex_instruction_ptr->m_Word1.f.lod_bias     = 0x0;
     if (normalized) {
@@ -1400,7 +1567,6 @@ GLboolean assemble_tex_instruction(r700_AssemblerBase *pAsm, GLboolean normalize
     tex_instruction_ptr->m_Word2.f.offset_x   = 0x0;
     tex_instruction_ptr->m_Word2.f.offset_y   = 0x0;
     tex_instruction_ptr->m_Word2.f.offset_z   = 0x0;
-
     tex_instruction_ptr->m_Word2.f.sampler_id = texture_unit_source->reg;
 
     // dst
@@ -1517,6 +1683,10 @@ GLboolean assemble_alu_src(R700ALUInstruction*  alu_instruction_ptr,
         {
             src_sel = pSource->reg + CFILE_REGISTER_OFFSET;            
         }
+        else if (pSource->rtype == SRC_REC_LITERAL)
+        {
+            src_sel = SQ_ALU_SRC_LITERAL;            
+        }
         else
         {
             radeon_error("Source (%d) register type (%d) not one of TEMP, INPUT, or CONSTANT.\n",
@@ -1606,7 +1776,8 @@ GLboolean add_alu_instruction(r700_AssemblerBase* pAsm,
         return GL_FALSE;
     }
 
-    if ( pAsm->cf_current_alu_clause_ptr == NULL ||
+    if ( pAsm->alu_x_opcode != 0 ||
+         pAsm->cf_current_alu_clause_ptr == NULL ||
          ( (pAsm->cf_current_alu_clause_ptr != NULL) && 
            (pAsm->cf_current_alu_clause_ptr->m_Word1.f.count >= (GetCFMaxInstructions(pAsm->cf_current_alu_clause_ptr->m_ShaderInstType)-contiguous_slots_needed-1) )
          ) ) 
@@ -1636,9 +1807,17 @@ GLboolean add_alu_instruction(r700_AssemblerBase* pAsm,
         pAsm->cf_current_alu_clause_ptr->m_Word1.f.kcache_addr0 = 0x0;
         pAsm->cf_current_alu_clause_ptr->m_Word1.f.kcache_addr1 = 0x0;
 
-        //cf_current_alu_clause_ptr->m_Word1.f.count           = number_of_scalar_operations - 1;
         pAsm->cf_current_alu_clause_ptr->m_Word1.f.count           = 0x0;
-        pAsm->cf_current_alu_clause_ptr->m_Word1.f.cf_inst         = SQ_CF_INST_ALU;
+
+        if(pAsm->alu_x_opcode != 0)
+        {
+            pAsm->cf_current_alu_clause_ptr->m_Word1.f.cf_inst = pAsm->alu_x_opcode;
+            pAsm->alu_x_opcode = 0;
+        }
+        else
+        {
+            pAsm->cf_current_alu_clause_ptr->m_Word1.f.cf_inst = SQ_CF_INST_ALU;
+        }
 
         pAsm->cf_current_alu_clause_ptr->m_Word1.f.whole_quad_mode = 0x0;
 
@@ -1646,7 +1825,7 @@ GLboolean add_alu_instruction(r700_AssemblerBase* pAsm,
     }
     else 
     {
-        pAsm->cf_current_alu_clause_ptr->m_Word1.f.count++;
+        pAsm->cf_current_alu_clause_ptr->m_Word1.f.count += (GetInstructionSize(alu_instruction_ptr->m_ShaderInstType) / 2);
     }
 
     // If this clause constains any instruction that is forward dependent on a TEX instruction, 
@@ -1923,7 +2102,7 @@ GLboolean check_scalar(r700_AssemblerBase* pAsm,
 
     GLuint swizzle_key;
 
-    GLuint number_of_operands = r700GetNumOperands(pAsm);
+    GLuint number_of_operands = r700GetNumOperands(pAsm->D.dst.opcode, pAsm->D.dst.op3);
 
     for (src=0; src<number_of_operands; src++) 
     {
@@ -2012,7 +2191,7 @@ GLboolean check_vector(r700_AssemblerBase* pAsm,
 
     GLuint swizzle_key;
 
-    GLuint number_of_operands = r700GetNumOperands(pAsm);
+    GLuint number_of_operands = r700GetNumOperands(pAsm->D.dst.opcode, pAsm->D.dst.op3);
 
     for (src=0; src<number_of_operands; src++) 
     {
@@ -2045,7 +2224,7 @@ GLboolean check_vector(r700_AssemblerBase* pAsm,
         if( is_gpr(sel) ) 
         {
             if( GL_FALSE == cycle_for_vector_bank_swizzle(bank_swizzle, src, &cycle) )
-            {
+            {             
                 return GL_FALSE;
             }
 
@@ -2057,7 +2236,7 @@ GLboolean check_vector(r700_AssemblerBase* pAsm,
             else 
             {
                 if( GL_FALSE == reserve_gpr(pAsm, sel, chan, cycle) )
-                {
+                {                    
                     return GL_FALSE;
                 }
             }
@@ -2069,7 +2248,7 @@ GLboolean check_vector(r700_AssemblerBase* pAsm,
             if( is_cfile(sel) ) 
             {        
                 if( GL_FALSE == reserve_cfile(pAsm, sel, chan) )
-                {
+                {                    
                     return GL_FALSE;
                 }
             }
@@ -2081,6 +2260,10 @@ GLboolean check_vector(r700_AssemblerBase* pAsm,
 
 GLboolean assemble_alu_instruction(r700_AssemblerBase *pAsm)
 {
+    R700ALUInstruction            * alu_instruction_ptr;
+    R700ALUInstructionHalfLiteral * alu_instruction_ptr_hl;
+    R700ALUInstructionFullLiteral * alu_instruction_ptr_fl;
+
     GLuint    number_of_scalar_operations;
     GLboolean is_single_scalar_operation;
     GLuint    scalar_channel_index;
@@ -2089,7 +2272,7 @@ GLboolean assemble_alu_instruction(r700_AssemblerBase *pAsm)
     int    current_source_index;
     GLuint contiguous_slots_needed;
 
-    GLuint    uNumSrc = r700GetNumOperands(pAsm);
+    GLuint    uNumSrc = r700GetNumOperands(pAsm->D.dst.opcode, pAsm->D.dst.op3);
     //GLuint    channel_swizzle, j;
     //GLuint    chan_counter[4] = {0, 0, 0, 0};
     //PVSSRC *  pSource[3];
@@ -2146,23 +2329,44 @@ GLboolean assemble_alu_instruction(r700_AssemblerBase *pAsm)
 
     contiguous_slots_needed = 0;
 
-    if(GL_TRUE == is_reduction_opcode(&(pAsm->D)) ) 
+    if(!is_single_scalar_operation) 
     {
         contiguous_slots_needed = 4;
     }
 
+    contiguous_slots_needed += pAsm->D2.dst2.literal_slots;
+
     initialize(pAsm);    
 
     for (scalar_channel_index=0;
             scalar_channel_index < number_of_scalar_operations; 
                 scalar_channel_index++) 
     {
-        R700ALUInstruction* alu_instruction_ptr = (R700ALUInstruction*) CALLOC_STRUCT(R700ALUInstruction);
-        if (alu_instruction_ptr == NULL) 
-		{
-			return GL_FALSE;
-		}
-        Init_R700ALUInstruction(alu_instruction_ptr);
+        if(scalar_channel_index == (number_of_scalar_operations-1))
+        {
+            switch(pAsm->D2.dst2.literal_slots)
+            {
+            case 0:
+                alu_instruction_ptr = (R700ALUInstruction*) CALLOC_STRUCT(R700ALUInstruction);
+                Init_R700ALUInstruction(alu_instruction_ptr);
+                break;
+            case 1:
+                alu_instruction_ptr_hl = (R700ALUInstructionHalfLiteral*) CALLOC_STRUCT(R700ALUInstructionHalfLiteral);
+                Init_R700ALUInstructionHalfLiteral(alu_instruction_ptr_hl, pAsm->C[0].f, pAsm->C[1].f);
+                alu_instruction_ptr = (R700ALUInstruction*)alu_instruction_ptr_hl;
+                break;
+            case 2:
+                alu_instruction_ptr_fl = (R700ALUInstructionFullLiteral*) CALLOC_STRUCT(R700ALUInstructionFullLiteral);
+                Init_R700ALUInstructionFullLiteral(alu_instruction_ptr_fl,pAsm->C[0].f, pAsm->C[1].f, pAsm->C[2].f, pAsm->C[3].f);
+                alu_instruction_ptr = (R700ALUInstruction*)alu_instruction_ptr_fl;
+            break;
+            };
+        }
+        else
+        {
+            alu_instruction_ptr = (R700ALUInstruction*) CALLOC_STRUCT(R700ALUInstruction);
+            Init_R700ALUInstruction(alu_instruction_ptr);
+        }
         
         //src 0
         current_source_index = 0;
@@ -2172,7 +2376,7 @@ GLboolean assemble_alu_instruction(r700_AssemblerBase *pAsm)
                                          current_source_index,
                                          pcurrent_source, 
                                          scalar_channel_index) )     
-        {
+        {            
             return GL_FALSE;
         }
    
@@ -2186,13 +2390,13 @@ GLboolean assemble_alu_instruction(r700_AssemblerBase *pAsm)
                                              current_source_index,
                                              pcurrent_source, 
                                              scalar_channel_index) ) 
-            {
+            {                
                 return GL_FALSE;
             }
         }
 
         //other bits
-        alu_instruction_ptr->m_Word0.f.index_mode = SQ_INDEX_AR_X;
+        alu_instruction_ptr->m_Word0.f.index_mode = pAsm->D2.dst2.index_mode;
 
         if(   (is_single_scalar_operation == GL_TRUE) 
            || (GL_TRUE == bSplitInst) )
@@ -2204,9 +2408,17 @@ GLboolean assemble_alu_instruction(r700_AssemblerBase *pAsm)
             alu_instruction_ptr->m_Word0.f.last = (scalar_channel_index == 3) ?  1 : 0;
         }
 
-        alu_instruction_ptr->m_Word0.f.pred_sel                = 0x0;
-        alu_instruction_ptr->m_Word1_OP2.f.update_pred         = 0x0;  
-        alu_instruction_ptr->m_Word1_OP2.f.update_execute_mask = 0x0;
+        alu_instruction_ptr->m_Word0.f.pred_sel = (pAsm->D.dst.pred_inv > 0) ? 1 : 0;
+        if(1 == pAsm->D.dst.predicated)
+        {
+            alu_instruction_ptr->m_Word1_OP2.f.update_pred         = 0x1;
+            alu_instruction_ptr->m_Word1_OP2.f.update_execute_mask = 0x1;
+        }
+        else
+        {
+            alu_instruction_ptr->m_Word1_OP2.f.update_pred         = 0x0;
+            alu_instruction_ptr->m_Word1_OP2.f.update_execute_mask = 0x0;
+        }
 
         // dst
         if( (pAsm->D.dst.rtype == DST_REG_TEMPORARY) || 
@@ -2215,7 +2427,7 @@ GLboolean assemble_alu_instruction(r700_AssemblerBase *pAsm)
             alu_instruction_ptr->m_Word1.f.dst_gpr  = pAsm->D.dst.reg;
         }
         else 
-        {
+        {            
             radeon_error("Only temp destination registers supported for ALU dest regs.\n");
             return GL_FALSE;
         }
@@ -2245,7 +2457,7 @@ GLboolean assemble_alu_instruction(r700_AssemblerBase *pAsm)
 
         alu_instruction_ptr->m_Word1.f.dst_chan = scalar_channel_index;
 
-        alu_instruction_ptr->m_Word1.f.clamp    = pAsm->pILInst[pAsm->uiCurInst].SaturateMode;
+        alu_instruction_ptr->m_Word1.f.clamp    = pAsm->D2.dst2.SaturateMode;
 
         if (pAsm->D.dst.op3) 
         {            
@@ -2272,8 +2484,8 @@ GLboolean assemble_alu_instruction(r700_AssemblerBase *pAsm)
             {
                 alu_instruction_ptr->m_Word1_OP2.f6.alu_inst           = pAsm->D.dst.opcode;
 
-                alu_instruction_ptr->m_Word1_OP2.f6.src0_abs           = 0x0;
-                alu_instruction_ptr->m_Word1_OP2.f6.src1_abs           = 0x0;
+                alu_instruction_ptr->m_Word1_OP2.f6.src0_abs           = pAsm->S[0].src.abs;
+                alu_instruction_ptr->m_Word1_OP2.f6.src1_abs           = pAsm->S[1].src.abs;
 
                 //alu_instruction_ptr->m_Word1_OP2.f6.update_execute_mask = 0x0;
                 //alu_instruction_ptr->m_Word1_OP2.f6.update_pred         = 0x0;
@@ -2301,8 +2513,8 @@ GLboolean assemble_alu_instruction(r700_AssemblerBase *pAsm)
             {
                 alu_instruction_ptr->m_Word1_OP2.f.alu_inst           = pAsm->D.dst.opcode;
 
-                alu_instruction_ptr->m_Word1_OP2.f.src0_abs           = 0x0;
-                alu_instruction_ptr->m_Word1_OP2.f.src1_abs           = 0x0;
+                alu_instruction_ptr->m_Word1_OP2.f.src0_abs           = pAsm->S[0].src.abs;
+                alu_instruction_ptr->m_Word1_OP2.f.src1_abs           = pAsm->S[1].src.abs;
 
                 //alu_instruction_ptr->m_Word1_OP2.f.update_execute_mask = 0x0;
                 //alu_instruction_ptr->m_Word1_OP2.f.update_pred         = 0x0;
@@ -2329,7 +2541,7 @@ GLboolean assemble_alu_instruction(r700_AssemblerBase *pAsm)
         }
 
         if(GL_FALSE == add_alu_instruction(pAsm, alu_instruction_ptr, contiguous_slots_needed) )
-        {
+        {            
             return GL_FALSE;
         }
 
@@ -2340,19 +2552,19 @@ GLboolean assemble_alu_instruction(r700_AssemblerBase *pAsm)
         if (is_single_scalar_operation) 
         {
             if(GL_FALSE == check_scalar(pAsm, alu_instruction_ptr) )
-            {
+            {                
                 return GL_FALSE;
             }
         }
         else 
         {
             if(GL_FALSE == check_vector(pAsm, alu_instruction_ptr) )
-            {
-                return 1;
+            {                
+                return GL_FALSE; 
             }
         }
 
-        contiguous_slots_needed = 0;
+        contiguous_slots_needed -= 1;
     }
 
     return GL_TRUE;
@@ -2403,11 +2615,14 @@ GLboolean next_ins(r700_AssemblerBase *pAsm)
     
     //reset for next inst.
     pAsm->D.bits    = 0;
+    pAsm->D2.bits   = 0;
     pAsm->S[0].bits = 0;
     pAsm->S[1].bits = 0;
     pAsm->S[2].bits = 0;
     pAsm->is_tex = GL_FALSE;
     pAsm->need_tex_barrier = GL_FALSE;
+    pAsm->D2.bits = 0;
+    pAsm->C[0].bits = pAsm->C[1].bits = pAsm->C[2].bits = pAsm->C[3].bits = 0;
     return GL_TRUE;
 }
 
@@ -2640,9 +2855,44 @@ GLboolean assemble_CMP(r700_AssemblerBase *pAsm)
     return GL_TRUE;
 }
 
-GLboolean assemble_COS(r700_AssemblerBase *pAsm)
+/* Emits two ALU instructions: tmp.x = src0 * 1/(2*PI), with the constant passed as
+ * a literal slot, followed by dst = opcode(tmp.x).  Returns GL_FALSE as soon as
+ * assemble_src(), assemble_dst() or next_ins() reports a failure. */
+GLboolean assemble_TRIG(r700_AssemblerBase *pAsm, BITS opcode)
 {
-    return assemble_math_function(pAsm, SQ_OP2_INST_COS);
+    int tmp;
+    checkop1(pAsm);
+    tmp = gethelpr(pAsm);
+
+    /* MUL tmp.x, src0, literal 1/(2*PI) */
+    pAsm->D.dst.opcode = SQ_OP2_INST_MUL;
+    setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
+    pAsm->D.dst.rtype  = DST_REG_TEMPORARY;
+    pAsm->D.dst.reg    = tmp;
+    pAsm->D.dst.writex = 1;
+
+    if( GL_FALSE == assemble_src(pAsm, 0, -1) ) { return GL_FALSE; }
+
+    pAsm->S[1].src.rtype = SRC_REC_LITERAL;
+    setswizzle_PVSSRC(&(pAsm->S[1].src), SQ_SEL_X);
+    pAsm->D2.dst2.literal_slots = 1;
+    pAsm->C[0].f = 1/(3.1415926535 * 2);
+    pAsm->C[1].f = 0.0F;
+    if( GL_FALSE == next_ins(pAsm) ) { return GL_FALSE; }
+
+    /* opcode dst, tmp.x */
+    pAsm->D.dst.opcode = opcode;
+    pAsm->D.dst.math = 1;
+    if( GL_FALSE == assemble_dst(pAsm) ) { return GL_FALSE; }
+
+    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
+    pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
+    pAsm->S[0].src.reg   = tmp;
+    setswizzle_PVSSRC(&(pAsm->S[0].src), SQ_SEL_X);
+    noneg_PVSSRC(&(pAsm->S[0].src));
+
+    //TODO - replicate if more channels set in WriteMask
+    return next_ins(pAsm);
 }
  
 GLboolean assemble_DOT(r700_AssemblerBase *pAsm)
@@ -2910,13 +3160,15 @@ GLboolean assemble_FRC(r700_AssemblerBase *pAsm)
     return GL_TRUE;
 }
  
-GLboolean assemble_KIL(r700_AssemblerBase *pAsm)
-{
-    /* TODO: doc says KILL has to be last(end) ALU clause */
-    
-    checkop1(pAsm);
+GLboolean assemble_KIL(r700_AssemblerBase *pAsm, GLuint opcode)
+{  
+    struct prog_instruction *pILInst = &(pAsm->pILInst[pAsm->uiCurInst]);
+
+    if(pILInst->Opcode == OPCODE_KIL)
+        checkop1(pAsm);
 
-    pAsm->D.dst.opcode = SQ_OP2_INST_KILLGT;  
+    pAsm->D.dst.opcode = opcode;  
+    //pAsm->D.dst.math = 1;
 
     setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
     pAsm->D.dst.rtype = DST_REG_TEMPORARY;
@@ -2929,21 +3181,34 @@ GLboolean assemble_KIL(r700_AssemblerBase *pAsm)
     setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
     pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
     pAsm->S[0].src.reg = 0;
-
     setswizzle_PVSSRC(&(pAsm->S[0].src), SQ_SEL_0);
     noneg_PVSSRC(&(pAsm->S[0].src));
 
-    if ( GL_FALSE == assemble_src(pAsm, 0, 1) )
+    if(pILInst->Opcode == OPCODE_KIL_NV)
     {
-        return GL_FALSE;
+        setaddrmode_PVSSRC(&(pAsm->S[1].src), ADDR_ABSOLUTE);
+        pAsm->S[1].src.rtype = SRC_REG_TEMPORARY;
+        pAsm->S[1].src.reg = 0;
+        setswizzle_PVSSRC(&(pAsm->S[1].src), SQ_SEL_1);
+        neg_PVSSRC(&(pAsm->S[1].src));
     }
-  
+    else
+    {
+        if( GL_FALSE == assemble_src(pAsm, 0, 1) )
+        {
+            return GL_FALSE;
+        }
+
+    }
+
     if ( GL_FALSE == next_ins(pAsm) )
     {
         return GL_FALSE;
     }
 
+    /* Doc says KILL has to be last(end) ALU clause */
     pAsm->pR700Shader->killIsUsed = GL_TRUE;
+    pAsm->alu_x_opcode = SQ_CF_INST_ALU;
     
     return GL_TRUE;
 }
@@ -3007,6 +3272,7 @@ GLboolean assemble_LRP(r700_AssemblerBase *pAsm)
     {
         return GL_FALSE;
     }
+
     if( GL_FALSE == assemble_src(pAsm, 2, -1) ) 
     {
         return GL_FALSE;
@@ -3742,77 +4008,137 @@ GLboolean assemble_RSQ(r700_AssemblerBase *pAsm)
     return assemble_math_function(pAsm, SQ_OP2_INST_RECIPSQRT_IEEE);
 }
  
-GLboolean assemble_SIN(r700_AssemblerBase *pAsm) 
-{
-    return assemble_math_function(pAsm, SQ_OP2_INST_SIN);
-}
- 
 GLboolean assemble_SCS(r700_AssemblerBase *pAsm) 
 {
     BITS tmp;
 
-	checkop1(pAsm);
+    checkop1(pAsm);
 
-	tmp = gethelpr(pAsm);
+    tmp = gethelpr(pAsm);
+    /* tmp.x = src / (2*PI) */
+    pAsm->D.dst.opcode = SQ_OP2_INST_MUL;
+    setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
+    pAsm->D.dst.rtype  = DST_REG_TEMPORARY;
+    pAsm->D.dst.reg    = tmp;
+    pAsm->D.dst.writex = 1;
 
-	// COS tmp.x,    a.x
-	pAsm->D.dst.opcode = SQ_OP2_INST_COS;
-	pAsm->D.dst.math = 1;
+    assemble_src(pAsm, 0, -1);
 
-	setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
-	pAsm->D.dst.rtype = DST_REG_TEMPORARY;
-	pAsm->D.dst.reg = tmp;
-	pAsm->D.dst.writex = 1;
+    pAsm->S[1].src.rtype = SRC_REC_LITERAL;
+    setswizzle_PVSSRC(&(pAsm->S[1].src), SQ_SEL_X);
+    pAsm->D2.dst2.literal_slots = 1;
+    pAsm->C[0].f = 1/(3.1415926535 * 2);
+    pAsm->C[1].f = 0.0F;
 
-	if( GL_FALSE == assemble_src(pAsm, 0, -1) )
-	{
-		return GL_FALSE;
-	}
+    next_ins(pAsm);
 
-	if ( GL_FALSE == next_ins(pAsm) )
-	{
-		return GL_FALSE;
-	}
+    // COS dst.x,    a.x
+    pAsm->D.dst.opcode = SQ_OP2_INST_COS;
+    pAsm->D.dst.math = 1;
 
-	// SIN tmp.y,    a.x
-	pAsm->D.dst.opcode = SQ_OP2_INST_SIN;
-	pAsm->D.dst.math = 1;
+    assemble_dst(pAsm);
+    /* mask y */
+    pAsm->D.dst.writey = 0;
 
-	setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
-	pAsm->D.dst.rtype = DST_REG_TEMPORARY;
-	pAsm->D.dst.reg = tmp;
-	pAsm->D.dst.writey = 1;
+    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
+    pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
+    pAsm->S[0].src.reg   = tmp;
+    setswizzle_PVSSRC(&(pAsm->S[0].src), SQ_SEL_X);
+    noneg_PVSSRC(&(pAsm->S[0].src));
 
-	if( GL_FALSE == assemble_src(pAsm, 0, -1) )
-	{
-		return GL_FALSE;
-	}
+    if ( GL_FALSE == next_ins(pAsm) )
+    {
+        return GL_FALSE;
+    }
 
-	if( GL_FALSE == next_ins(pAsm) )
-	{
-		return GL_FALSE;
-	}
+    // SIN dst.y,    a.x
+    pAsm->D.dst.opcode = SQ_OP2_INST_SIN;
+    pAsm->D.dst.math = 1;
 
-	// MOV dst.mask,     tmp
-	pAsm->D.dst.opcode = SQ_OP2_INST_MOV;
+    assemble_dst(pAsm);
+    /* mask x */
+    pAsm->D.dst.writex = 0;
 
-	if( GL_FALSE == assemble_dst(pAsm) )
-	{
-		return GL_FALSE;
-	}
+    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
+    pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
+    pAsm->S[0].src.reg   = tmp;
+    setswizzle_PVSSRC(&(pAsm->S[0].src), SQ_SEL_X);
+    noneg_PVSSRC(&(pAsm->S[0].src));
+
+    if( GL_FALSE == next_ins(pAsm) )
+    {
+        return GL_FALSE;
+    }
 
-	setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
-	pAsm->S[0].src.rtype = DST_REG_TEMPORARY;
-	pAsm->S[0].src.reg = tmp;
+    return GL_TRUE;
+}
 
-	noswizzle_PVSSRC(&(pAsm->S[0].src));
-	pAsm->S[0].src.swizzlez = SQ_SEL_0;
-	pAsm->S[0].src.swizzlew = SQ_SEL_0;
+/*
+ * Generic two-source ALU op: runs checkop2(), programs `opcode` as the
+ * destination opcode, assembles dst, src0 and src1 (in that order) and then
+ * hands the instruction to next_ins().
+ *
+ * Returns GL_TRUE on success, GL_FALSE as soon as any step reports failure.
+ */
+GLboolean assemble_LOGIC(r700_AssemblerBase *pAsm, BITS opcode)
+{
+    GLboolean bOk;
+
+    /* Bail out before the opcode is written. */
+    if( checkop2(pAsm) == GL_FALSE )
+    {
+        return GL_FALSE;
+    }
+
+    pAsm->D.dst.opcode = opcode;
+
+    /* Destination first, then both sources; && stops at the first failure. */
+    bOk =    (assemble_dst(pAsm)        != GL_FALSE)
+          && (assemble_src(pAsm, 0, -1) != GL_FALSE)
+          && (assemble_src(pAsm, 1, -1) != GL_FALSE);
+    if( !bOk )
+    {
+        return GL_FALSE;
+    }
+
+    /* Everything is described: pass the instruction on to next_ins(). */
+    if( next_ins(pAsm) == GL_FALSE )
+    {
+        return GL_FALSE;
+    }
+
+    return GL_TRUE;
+}
+
+/* Sets up ALU op `opcode` with the dst `predicated` flag set and hands it to
+ * next_ins(); returns GL_FALSE if next_ins() fails, GL_TRUE otherwise. */
+GLboolean assemble_LOGIC_PRED(r700_AssemblerBase *pAsm, BITS opcode) 
+{
+    struct prog_instruction *pILInst = &(pAsm->pILInst[pAsm->uiCurInst]);
+    pAsm->D.dst.opcode = opcode;
+    pAsm->D.dst.math   = 1;
+    pAsm->D.dst.predicated = 1;
+    /* dst: uHelpReg, only the x channel is written */
+    setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
+    pAsm->D.dst.rtype = DST_REG_TEMPORARY;
+    pAsm->D.dst.reg = pAsm->uHelpReg;
+    pAsm->D.dst.writex = 1;
+    pAsm->D.dst.writey = pAsm->D.dst.writez = pAsm->D.dst.writew = 0;
+    /* src0: last condition register, x swizzle taken from DstReg.CondSwizzle */
+    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
+    pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
+    pAsm->S[0].src.reg = pAsm->last_cond_register + pAsm->starting_temp_register_number;
+    pAsm->S[0].src.swizzlex = pILInst->DstReg.CondSwizzle & 0x7;
+    noneg_PVSSRC(&(pAsm->S[0].src));
+    /* src1: uHelpReg with every channel swizzled to SQ_SEL_0 */
+    pAsm->S[1].src.rtype = SRC_REG_TEMPORARY;
+    pAsm->S[1].src.reg   = pAsm->uHelpReg;
+    setaddrmode_PVSSRC(&(pAsm->S[1].src), ADDR_ABSOLUTE);
+    noneg_PVSSRC(&(pAsm->S[1].src));
+    pAsm->S[1].src.swizzlex = SQ_SEL_0;
+    pAsm->S[1].src.swizzley = SQ_SEL_0;
+    pAsm->S[1].src.swizzlez = SQ_SEL_0;
+    pAsm->S[1].src.swizzlew = SQ_SEL_0;
+    if( GL_FALSE == next_ins(pAsm) ) 
+    {
+	    return GL_FALSE;
+    }
 
     return GL_TRUE;
 }
@@ -3895,6 +4221,7 @@ GLboolean assemble_TEX(r700_AssemblerBase *pAsm)
     
     switch (pAsm->pILInst[pAsm->uiCurInst].SrcReg[0].File)
     {
+    case PROGRAM_UNIFORM: 
     case PROGRAM_CONSTANT:
     case PROGRAM_LOCAL_PARAM:
     case PROGRAM_ENV_PARAM:
@@ -3915,22 +4242,6 @@ GLboolean assemble_TEX(r700_AssemblerBase *pAsm)
 	    need_barrier = GL_TRUE;
     }
 
-    switch (pAsm->pILInst[pAsm->uiCurInst].Opcode)
-    {
-        case OPCODE_TEX:
-            break;
-        case OPCODE_TXB:
-            radeon_error("do not support TXB yet\n");
-            return GL_FALSE;
-            break;
-        case OPCODE_TXP:
-            break;
-        default:
-            radeon_error("Internal error: bad texture op (not TEX)\n");
-            return GL_FALSE;
-            break;
-    }
-
     if (pAsm->pILInst[pAsm->uiCurInst].Opcode == OPCODE_TXP)
     {
         GLuint tmp = gethelpr(pAsm);
@@ -4008,24 +4319,6 @@ GLboolean assemble_TEX(r700_AssemblerBase *pAsm)
             return GL_FALSE;
         }
  
-        /* tmp1.z = ABS(tmp1.z) dont have abs support in assembler currently
-         * have to do explicit instruction
-         */
-        pAsm->D.dst.opcode = SQ_OP2_INST_MAX;
-        setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
-        pAsm->D.dst.rtype = DST_REG_TEMPORARY;
-        pAsm->D.dst.reg   = tmp1;
-        pAsm->D.dst.writez = 1;
-
-        setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
-        pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
-        pAsm->S[0].src.reg = tmp1;
-	noswizzle_PVSSRC(&(pAsm->S[0].src));
-        pAsm->S[1].bits = pAsm->S[0].bits;
-        flipneg_PVSSRC(&(pAsm->S[1].src));
-        
-        next_ins(pAsm);
-
         /* tmp1.z = RCP_e(|tmp1.z|) */
         pAsm->D.dst.opcode = SQ_OP2_INST_RECIP_IEEE;
         pAsm->D.dst.math = 1;
@@ -4038,13 +4331,13 @@ GLboolean assemble_TEX(r700_AssemblerBase *pAsm)
         pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
         pAsm->S[0].src.reg = tmp1;
         pAsm->S[0].src.swizzlex = SQ_SEL_Z;
+        pAsm->S[0].src.abs = 1;
 
         next_ins(pAsm);
 
         /* MULADD R0.x,  R0.x,  PS1,  (0x3FC00000, 1.5f).x
          * MULADD R0.y,  R0.y,  PS1,  (0x3FC00000, 1.5f).x
          * muladd has no writemask, have to use another temp 
-         * also no support for imm constants, so add 1 here
          */
         pAsm->D.dst.opcode = SQ_OP3_INST_MULADD;
         pAsm->D.dst.op3    = 1;
@@ -4061,30 +4354,12 @@ GLboolean assemble_TEX(r700_AssemblerBase *pAsm)
         pAsm->S[1].src.reg   = tmp1;
         setswizzle_PVSSRC(&(pAsm->S[1].src), SQ_SEL_Z);
         setaddrmode_PVSSRC(&(pAsm->S[2].src), ADDR_ABSOLUTE);
-        pAsm->S[2].src.rtype = SRC_REG_TEMPORARY;
+        /* immediate c 1.5 */
+        pAsm->D2.dst2.literal_slots = 1;
+        pAsm->C[0].f = 1.5F;
+        pAsm->S[2].src.rtype = SRC_REC_LITERAL;
         pAsm->S[2].src.reg   = tmp1;
-        setswizzle_PVSSRC(&(pAsm->S[2].src), SQ_SEL_1);
-
-        next_ins(pAsm);
-
-        /* ADD the remaining .5 */
-        pAsm->D.dst.opcode = SQ_OP2_INST_ADD;
-        setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
-        pAsm->D.dst.rtype = DST_REG_TEMPORARY;
-        pAsm->D.dst.reg   = tmp2;
-        pAsm->D.dst.writex = 1;
-        pAsm->D.dst.writey = 1;
-        pAsm->D.dst.writez = 0;
-        pAsm->D.dst.writew = 0;
-
-        setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
-        pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
-        pAsm->S[0].src.reg   = tmp2;
-        noswizzle_PVSSRC(&(pAsm->S[0].src));
-        setaddrmode_PVSSRC(&(pAsm->S[1].src), ADDR_ABSOLUTE);
-        pAsm->S[1].src.rtype = SRC_REG_TEMPORARY;
-        pAsm->S[1].src.reg   = 252; // SQ_ALU_SRC_0_5 
-        noswizzle_PVSSRC(&(pAsm->S[1].src));
+        setswizzle_PVSSRC(&(pAsm->S[2].src), SQ_SEL_X);
 
         next_ins(pAsm);
 
@@ -4109,14 +4384,35 @@ GLboolean assemble_TEX(r700_AssemblerBase *pAsm)
 
     }
 
-    pAsm->D.dst.opcode = SQ_TEX_INST_SAMPLE;
+    switch(pAsm->pILInst[pAsm->uiCurInst].Opcode)
+    {
+        case OPCODE_DDX:
+            /* will these need WQM(1) on CF inst ? */
+            pAsm->D.dst.opcode = SQ_TEX_INST_GET_GRADIENTS_H;
+            break;
+        case OPCODE_DDY:
+            pAsm->D.dst.opcode = SQ_TEX_INST_GET_GRADIENTS_V;
+            break;
+        case OPCODE_TXB:
+            pAsm->D.dst.opcode = SQ_TEX_INST_SAMPLE_L;
+            break;
+        default:
+            if(pAsm->pILInst[pAsm->uiCurInst].TexShadow == 1)
+                pAsm->D.dst.opcode = SQ_TEX_INST_SAMPLE_C;
+            else
+                pAsm->D.dst.opcode = SQ_TEX_INST_SAMPLE;
+    }
+
+    pAsm->is_tex = GL_TRUE;
+    if ( GL_TRUE == need_barrier )
+
     pAsm->is_tex = GL_TRUE;
     if ( GL_TRUE == need_barrier )
     {
         pAsm->need_tex_barrier = GL_TRUE;
     }
     // Set src1 to tex unit id
-    pAsm->S[1].src.reg   = pAsm->pILInst[pAsm->uiCurInst].TexSrcUnit;
+    pAsm->S[1].src.reg   = pAsm->SamplerUnits[pAsm->pILInst[pAsm->uiCurInst].TexSrcUnit];
     pAsm->S[1].src.rtype = SRC_REG_TEMPORARY;
 
     //No sw info from mesa compiler, so hard code here.
@@ -4150,11 +4446,46 @@ GLboolean assemble_TEX(r700_AssemblerBase *pAsm)
         pAsm->S[0].src.swizzlew = SQ_SEL_Y;
     }
  
+    if(pAsm->pILInst[pAsm->uiCurInst].TexShadow == 1)
+    {
+        /* compare value goes to w chan ? */
+        pAsm->S[0].src.swizzlew = SQ_SEL_Z;
+    }
+
     if ( GL_FALSE == next_ins(pAsm) )
         {
             return GL_FALSE;
         }
 
+    /* add ARB shadow ambient but clamp to 0..1 */
+    if(pAsm->pILInst[pAsm->uiCurInst].TexShadow == 1)
+    {
+	/* ADD_SAT dst,  dst,  ambient[texunit] */
+	pAsm->D.dst.opcode = SQ_OP2_INST_ADD;
+
+	if( GL_FALSE == assemble_dst(pAsm) )
+	{
+	    return GL_FALSE;
+	}
+	pAsm->D2.dst2.SaturateMode = 1;
+
+	pAsm->S[0].src.rtype = pAsm->D.dst.rtype;
+	pAsm->S[0].src.reg = pAsm->D.dst.reg;
+	noswizzle_PVSSRC(&(pAsm->S[0].src));
+	noneg_PVSSRC(&(pAsm->S[0].src));
+
+	pAsm->S[1].src.rtype = SRC_REG_CONSTANT;
+	pAsm->S[1].src.reg = pAsm->shadow_regs[pAsm->pILInst[pAsm->uiCurInst].TexSrcUnit];
+	noswizzle_PVSSRC(&(pAsm->S[1].src));
+	noneg_PVSSRC(&(pAsm->S[1].src));
+
+	if( GL_FALSE == next_ins(pAsm) )
+	{
+	    return GL_FALSE;
+	}
+
+    }
+
     return GL_TRUE;
 }
 
@@ -4273,27 +4604,909 @@ GLboolean assemble_EXPORT(r700_AssemblerBase *pAsm)
     return GL_TRUE;
 }
 
-GLboolean assemble_IF(r700_AssemblerBase *pAsm)
+/* Undo the `current` bump that checkStackDepth() applies for uReason. */
+static inline void decreaseCurrent(r700_AssemblerBase *pAsm, GLuint uReason)
+{
+    int nEntries;
+    switch (uReason)
+    {
+    case FC_PUSH_VPM:
+    case FC_REP: /* TODO : for 16 vp asic, should be 2 */
+        nEntries = 1;
+        break;
+    case FC_PUSH_WQM:
+    case FC_LOOP:
+        nEntries = 4;
+        break;
+    default: /* unknown reason: leave the stack untouched */
+        return;
+    }
+    pAsm->CALLSTACK[pAsm->CALLSP].current -= nEntries;
+}
+
+/*
+ * Stack-depth bookkeeping for the call-stack entry at CALLSP.
+ *
+ * bCheckMaxOnly == GL_TRUE: only FC_PUSH_VPM (+1) and FC_PUSH_WQM (+4) are
+ *   considered; `max` becomes current + cost if that is larger, `current` stays.
+ * Otherwise: `current` grows by the cost of uReason (FC_PUSH_VPM 1,
+ *   FC_PUSH_WQM 4, FC_LOOP 4, FC_REP 1) and `max` is raised if exceeded.
+ */
+static inline void checkStackDepth(r700_AssemblerBase *pAsm, GLuint uReason, GLboolean bCheckMaxOnly)
+{
+    int nCost = 0;
+
+    switch (uReason)
+    {
+    case FC_PUSH_VPM:
+        nCost = 1;
+        break;
+    case FC_PUSH_WQM:
+        nCost = 4;
+        break;
+    case FC_LOOP:
+        nCost = (GL_TRUE == bCheckMaxOnly) ? 0 : 4;
+        break;
+    case FC_REP:
+        /* TODO : for 16 vp asic, should += 2; */
+        nCost = (GL_TRUE == bCheckMaxOnly) ? 0 : 1;
+        break;
+    }
+
+    if(GL_TRUE == bCheckMaxOnly)
+    {
+        /* Peek only: would such a push exceed the recorded maximum? */
+        if( (nCost != 0) &&
+            ((pAsm->CALLSTACK[pAsm->CALLSP].current + nCost)
+                 > pAsm->CALLSTACK[pAsm->CALLSP].max) )
+        {
+            pAsm->CALLSTACK[pAsm->CALLSP].max =
+                pAsm->CALLSTACK[pAsm->CALLSP].current + nCost;
+        }
+        return;
+    }
+
+    pAsm->CALLSTACK[pAsm->CALLSP].current += nCost;
+    if(pAsm->CALLSTACK[pAsm->CALLSP].current
+         > pAsm->CALLSTACK[pAsm->CALLSP].max)
+    {
+        pAsm->CALLSTACK[pAsm->CALLSP].max =
+            pAsm->CALLSTACK[pAsm->CALLSP].current;
+    }
+}
+
+/* Appends a SQ_CF_INST_JUMP CF instruction with pop_count `pops`; its address is
+ * the new clause's m_uIndex plus `offset`.  GL_FALSE if it cannot be added. */
+GLboolean jumpToOffest(r700_AssemblerBase *pAsm, GLuint pops, GLint offset)
+{
+    if(GL_FALSE == add_cf_instruction(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.pop_count        = pops;
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.cf_const         = 0x0;
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.cond             = SQ_CF_COND_ACTIVE;
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.end_of_program   = 0x0;
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.valid_pixel_mode = 0x0;
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.cf_inst          = SQ_CF_INST_JUMP;
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.whole_quad_mode  = 0x0;
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.barrier          = 0x1;
+    /* the target is expressed relative to this clause's own index */
+    pAsm->cf_current_cf_clause_ptr->m_Word0.f.addr = pAsm->cf_current_cf_clause_ptr->m_uIndex + offset;
+
+    return GL_TRUE;
+}
+
+/* Appends a SQ_CF_INST_POP CF instruction with pop_count `pops`; its address field
+ * is set to the clause's m_uIndex + 1.  GL_FALSE if it cannot be added. */
+GLboolean pops(r700_AssemblerBase *pAsm, GLuint pops)
 {
+    if(GL_FALSE == add_cf_instruction(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.pop_count        = pops;
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.cf_const         = 0x0; 
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.cond             = SQ_CF_COND_ACTIVE;
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.end_of_program   = 0x0;
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.valid_pixel_mode = 0x0; 
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.cf_inst          = SQ_CF_INST_POP;
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.whole_quad_mode  = 0x0;
+
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.barrier          = 0x1;
+    pAsm->cf_current_cf_clause_ptr->m_Word0.f.addr             = pAsm->cf_current_cf_clause_ptr->m_uIndex + 1;
+
+    return GL_TRUE;
+}
+
+/* Opens an IF: emits the predicate op in an ALU_PUSH_BEFORE clause, then a CF JUMP
+ * whose address is filled in later by assemble_ELSE()/assemble_ENDIF(), and pushes
+ * an FC_IF entry on fc_stack.  bHasElse says whether an ELSE follows.
+ * Returns GL_FALSE if the predicate op or the CF instruction cannot be added. */
+GLboolean assemble_IF(r700_AssemblerBase *pAsm, GLboolean bHasElse)
+{
+    pAsm->alu_x_opcode = SQ_CF_INST_ALU_PUSH_BEFORE;
+
+    /* Do not emit the JUMP when the predicate op could not be assembled. */
+    if(GL_FALSE == assemble_LOGIC_PRED(pAsm, SQ_OP2_INST_PRED_SETNE) )
+    {
+        return GL_FALSE;
+    }
+
+    if(GL_FALSE == add_cf_instruction(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    /* pop_count is 1 only when no ELSE follows. */
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.pop_count        = (GL_TRUE != bHasElse) ? 1 : 0;
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.cf_const         = 0x0;
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.cond             = SQ_CF_COND_ACTIVE;
+
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.end_of_program   = 0x0;
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.valid_pixel_mode = 0x0;
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.cf_inst          = SQ_CF_INST_JUMP;
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.whole_quad_mode  = 0x0;
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.barrier          = 0x1;
+
+    pAsm->FCSP++;
+    pAsm->fc_stack[pAsm->FCSP].type  = FC_IF;
+    pAsm->fc_stack[pAsm->FCSP].mid   = NULL;
+    pAsm->fc_stack[pAsm->FCSP].midLen= 0;
+    pAsm->fc_stack[pAsm->FCSP].first = pAsm->cf_current_cf_clause_ptr;
+
+#ifndef USE_CF_FOR_POP_AFTER
+    if(GL_TRUE != bHasElse)
+    {
+        pAsm->alu_x_opcode = SQ_CF_INST_ALU_POP_AFTER;
+    }
+#endif /* USE_CF_FOR_POP_AFTER */
+
+    checkStackDepth(pAsm, FC_PUSH_VPM, GL_FALSE);
+
+    return GL_TRUE;
+}
+
+/* Emits the CF ELSE for the innermost fc_stack entry, stores it as that entry's
+ * mid[0] and sets the entry's `first` clause address to uNumOfNode - 1. */
+GLboolean assemble_ELSE(r700_AssemblerBase *pAsm)
+{
+    if(GL_FALSE == add_cf_instruction(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.pop_count        = 1;
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.cf_const         = 0x0;
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.cond             = SQ_CF_COND_ACTIVE;
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.end_of_program   = 0x0;
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.valid_pixel_mode = 0x0;
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.cf_inst          = SQ_CF_INST_ELSE;
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.whole_quad_mode  = 0x0;
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.barrier          = 0x1;
+
+    pAsm->fc_stack[pAsm->FCSP].mid = (R700ControlFlowGenericClause **)_mesa_realloc(
+        (void *)pAsm->fc_stack[pAsm->FCSP].mid, 0, sizeof(R700ControlFlowGenericClause *) );
+    if(NULL == pAsm->fc_stack[pAsm->FCSP].mid)
+    {
+        /* allocation failed: fail the shader instead of writing through NULL */
+        return GL_FALSE;
+    }
+    pAsm->fc_stack[pAsm->FCSP].mid[0] = pAsm->cf_current_cf_clause_ptr;
+#ifndef USE_CF_FOR_POP_AFTER
+    pAsm->alu_x_opcode = SQ_CF_INST_ALU_POP_AFTER;
+#endif /* USE_CF_FOR_POP_AFTER */
+    pAsm->fc_stack[pAsm->FCSP].first->m_Word0.f.addr = pAsm->pR700Shader->plstCFInstructions_active->uNumOfNode - 1;
     return GL_TRUE;
 }
 
 GLboolean assemble_ENDIF(r700_AssemblerBase *pAsm)
 {
+    /* Closes the innermost IF: sets the pending JUMP (or ELSE) address to the
+     * current CF instruction count and pops the fc_stack entry.  The pairing is
+     * validated first so a mismatched entry is never patched or freed. */
+    if(pAsm->fc_stack[pAsm->FCSP].type != FC_IF)
+    {
+        radeon_error("if/endif in shader code are not paired. \n");
+        return GL_FALSE;
+    }
+
+#ifdef USE_CF_FOR_POP_AFTER
+    if(GL_FALSE == pops(pAsm, 1) ) { return GL_FALSE; }
+#endif /* USE_CF_FOR_POP_AFTER */
+
+    pAsm->alu_x_opcode = SQ_CF_INST_ALU;
+
+    if(NULL == pAsm->fc_stack[pAsm->FCSP].mid)
+    {
+        /* no else in between */
+        pAsm->fc_stack[pAsm->FCSP].first->m_Word0.f.addr = pAsm->pR700Shader->plstCFInstructions_active->uNumOfNode;
+    }
+    else
+    {
+        pAsm->fc_stack[pAsm->FCSP].mid[0]->m_Word0.f.addr = pAsm->pR700Shader->plstCFInstructions_active->uNumOfNode;
+        /* release the ELSE record and clear the pointer so it cannot dangle */
+        FREE(pAsm->fc_stack[pAsm->FCSP].mid);
+        pAsm->fc_stack[pAsm->FCSP].mid = NULL;
+    }
+
+    pAsm->FCSP--;
+
+    decreaseCurrent(pAsm, FC_PUSH_VPM);
+
+    return GL_TRUE;
+}
+
+/* Open a loop: emit a LOOP_START_NO_AL clause and push an FC_LOOP entry that
+ * remembers it in 'first'. GL_FALSE if the clause cannot be added. */
+GLboolean assemble_BGNLOOP(r700_AssemblerBase *pAsm)
+{
+    R700ControlFlowGenericClause *pLoopStart;
+
+    if(add_cf_instruction(pAsm) == GL_FALSE)
+    {
+        return GL_FALSE;
+    }
+    pLoopStart = pAsm->cf_current_cf_clause_ptr;
+    pLoopStart->m_Word1.f.cf_inst          = SQ_CF_INST_LOOP_START_NO_AL;
+    pLoopStart->m_Word1.f.cond             = SQ_CF_COND_ACTIVE;
+    pLoopStart->m_Word1.f.barrier          = 0x1;
+    pLoopStart->m_Word1.f.pop_count        = 0;
+    pLoopStart->m_Word1.f.cf_const         = 0x0;
+    pLoopStart->m_Word1.f.end_of_program   = 0x0;
+    pLoopStart->m_Word1.f.valid_pixel_mode = 0x0;
+    pLoopStart->m_Word1.f.whole_quad_mode  = 0x0;
+
+    /* new flow-control entry, no break/continue clauses recorded yet */
+    pAsm->FCSP++;
+    pAsm->fc_stack[pAsm->FCSP].first    = pLoopStart;
+    pAsm->fc_stack[pAsm->FCSP].type     = FC_LOOP;
+    pAsm->fc_stack[pAsm->FCSP].mid      = NULL;
+    pAsm->fc_stack[pAsm->FCSP].unNumMid = 0;
+    pAsm->fc_stack[pAsm->FCSP].midLen   = 0;
+    checkStackDepth(pAsm, FC_LOOP, GL_FALSE);
+    return GL_TRUE;
+}
+
+/* Emit a break out of the innermost enclosing loop: a predicate test, a
+ * LOOP_BREAK clause appended to that loop's mid[] (assemble_ENDLOOP fills in
+ * its address) and a POP. Without USE_CF_FOR_CONTINUE_BREAK nothing is
+ * emitted and GL_TRUE is returned. */
+GLboolean assemble_BRK(r700_AssemblerBase *pAsm)
+{
+#ifdef USE_CF_FOR_CONTINUE_BREAK
+    unsigned int unFCSP;
+
+    pAsm->alu_x_opcode = SQ_CF_INST_ALU_PUSH_BEFORE;
+    assemble_LOGIC_PRED(pAsm, SQ_OP2_INST_PRED_SETNE);
+
+    for(unFCSP=pAsm->FCSP; unFCSP>0; unFCSP--)
+    {
+        if(FC_LOOP == pAsm->fc_stack[unFCSP].type)
+        {
+            break;
+        }
+    }
+    /* the search index runs down to 0 when no enclosing loop was found */
+    if(0 == unFCSP)
+    {
+        radeon_error("Break is not inside loop/endloop pair.\n");
+        return GL_FALSE;
+    }
+
+    if(GL_FALSE == add_cf_instruction(pAsm) )
+    {
+        return GL_FALSE;
+    }
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.pop_count        = 1;
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.cf_const         = 0x0; 
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.cond             = SQ_CF_COND_ACTIVE;
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.end_of_program   = 0x0;
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.valid_pixel_mode = 0x0; 
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.cf_inst          = SQ_CF_INST_LOOP_BREAK;
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.whole_quad_mode  = 0x0;
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.barrier          = 0x1;
+
+    /* append the break clause to the loop's list of clauses to patch */
+    pAsm->fc_stack[unFCSP].mid = (R700ControlFlowGenericClause **)_mesa_realloc( 
+                                              (void *)pAsm->fc_stack[unFCSP].mid,
+                                              sizeof(R700ControlFlowGenericClause *) * pAsm->fc_stack[unFCSP].unNumMid,
+                                              sizeof(R700ControlFlowGenericClause *) * (pAsm->fc_stack[unFCSP].unNumMid + 1) );
+    if(NULL == pAsm->fc_stack[unFCSP].mid)
+    {
+        return GL_FALSE;
+    }
+    pAsm->fc_stack[unFCSP].mid[pAsm->fc_stack[unFCSP].unNumMid] = pAsm->cf_current_cf_clause_ptr;
+    pAsm->fc_stack[unFCSP].unNumMid++;
+
+    /* POP clause following the break; its addr is the next CF index */
+    if(GL_FALSE == add_cf_instruction(pAsm) )
+    {
+        return GL_FALSE;
+    }
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.pop_count        = 1;
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.cf_const         = 0x0; 
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.cond             = SQ_CF_COND_ACTIVE;
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.end_of_program   = 0x0;
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.valid_pixel_mode = 0x0; 
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.cf_inst          = SQ_CF_INST_POP;
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.whole_quad_mode  = 0x0;
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.barrier          = 0x1;
+    pAsm->cf_current_cf_clause_ptr->m_Word0.f.addr             = pAsm->cf_current_cf_clause_ptr->m_uIndex + 1;
+
+    checkStackDepth(pAsm, FC_PUSH_VPM, GL_TRUE);
+#endif //USE_CF_FOR_CONTINUE_BREAK
+    return GL_TRUE;
+}
+
+/* Emit a continue for the innermost enclosing loop: a predicate test, a
+ * LOOP_CONTINUE clause appended to that loop's mid[] (assemble_ENDLOOP fills
+ * in its address) and a POP. Without USE_CF_FOR_CONTINUE_BREAK nothing is
+ * emitted and GL_TRUE is returned. */
+GLboolean assemble_CONT(r700_AssemblerBase *pAsm)
+{
+#ifdef USE_CF_FOR_CONTINUE_BREAK
+    unsigned int unFCSP;
+
+    pAsm->alu_x_opcode = SQ_CF_INST_ALU_PUSH_BEFORE;
+    assemble_LOGIC_PRED(pAsm, SQ_OP2_INST_PRED_SETNE);
+
+    for(unFCSP=pAsm->FCSP; unFCSP>0; unFCSP--)
+    {
+        if(FC_LOOP == pAsm->fc_stack[unFCSP].type)
+        {
+            break;
+        }
+    }
+    /* the search index runs down to 0 when no enclosing loop was found */
+    if(0 == unFCSP)
+    {
+        radeon_error("Continue is not inside loop/endloop pair.\n");
+        return GL_FALSE;
+    }
+
+    if(GL_FALSE == add_cf_instruction(pAsm) )
+    {
+        return GL_FALSE;
+    }
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.pop_count        = 1;
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.cf_const         = 0x0; 
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.cond             = SQ_CF_COND_ACTIVE;
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.end_of_program   = 0x0;
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.valid_pixel_mode = 0x0; 
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.cf_inst          = SQ_CF_INST_LOOP_CONTINUE;
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.whole_quad_mode  = 0x0;
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.barrier          = 0x1;
+
+    /* append the continue clause to the loop's list of clauses to patch */
+    pAsm->fc_stack[unFCSP].mid = (R700ControlFlowGenericClause **)_mesa_realloc( 
+                                              (void *)pAsm->fc_stack[unFCSP].mid,
+                                              sizeof(R700ControlFlowGenericClause *) * pAsm->fc_stack[unFCSP].unNumMid,
+                                              sizeof(R700ControlFlowGenericClause *) * (pAsm->fc_stack[unFCSP].unNumMid + 1) );
+    if(NULL == pAsm->fc_stack[unFCSP].mid)
+    {
+        return GL_FALSE;
+    }
+    pAsm->fc_stack[unFCSP].mid[pAsm->fc_stack[unFCSP].unNumMid] = pAsm->cf_current_cf_clause_ptr;
+    pAsm->fc_stack[unFCSP].unNumMid++;
+
+    /* POP clause following the continue; its addr is the next CF index */
+    if(GL_FALSE == add_cf_instruction(pAsm) )
+    {
+        return GL_FALSE;
+    }
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.pop_count        = 1;
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.cf_const         = 0x0; 
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.cond             = SQ_CF_COND_ACTIVE;
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.end_of_program   = 0x0;
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.valid_pixel_mode = 0x0; 
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.cf_inst          = SQ_CF_INST_POP;
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.whole_quad_mode  = 0x0;
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.barrier          = 0x1;
+    pAsm->cf_current_cf_clause_ptr->m_Word0.f.addr             = pAsm->cf_current_cf_clause_ptr->m_uIndex + 1;
+
+    checkStackDepth(pAsm, FC_PUSH_VPM, GL_TRUE);
+#endif /* USE_CF_FOR_CONTINUE_BREAK */
+    return GL_TRUE;
+}
+
+/* Close the innermost loop: emit LOOP_END, cross-link it with the loop's first
+ * clause, patch/free the clauses in mid[] and pop the entry; with
+ * HAS_CURRENT_LOOPRET set, also emit the flag test for the enclosing scope.
+ * NOTE(review): mid[] is only patched when USE_CF_FOR_CONTINUE_BREAK is defined. */
+GLboolean assemble_ENDLOOP(r700_AssemblerBase *pAsm)
+{
+    GLuint unFCSP;
+    GLuint unIF = 0;
+#ifdef USE_CF_FOR_CONTINUE_BREAK
+    GLuint i;
+#endif
+
+    /* check pairing first: nothing is emitted, patched or freed on a mismatch */
+    if(pAsm->fc_stack[pAsm->FCSP].type != FC_LOOP)
+    {
+        radeon_error("loop/endloop in shader code are not paired. \n");
+        return GL_FALSE;
+    }
+
+    if(GL_FALSE == add_cf_instruction(pAsm) )
+    {
+        return GL_FALSE;
+    }
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.pop_count        = 0;
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.cf_const         = 0x0; 
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.cond             = SQ_CF_COND_ACTIVE;
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.end_of_program   = 0x0;
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.valid_pixel_mode = 0x0; 
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.cf_inst          = SQ_CF_INST_LOOP_END;
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.whole_quad_mode  = 0x0;
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.barrier          = 0x1;
+
+    pAsm->cf_current_cf_clause_ptr->m_Word0.f.addr   = pAsm->fc_stack[pAsm->FCSP].first->m_uIndex + 1;
+    pAsm->fc_stack[pAsm->FCSP].first->m_Word0.f.addr = pAsm->cf_current_cf_clause_ptr->m_uIndex + 1;
+#ifdef USE_CF_FOR_CONTINUE_BREAK
+    for(i=0; i<pAsm->fc_stack[pAsm->FCSP].unNumMid; i++)
+    {
+        pAsm->fc_stack[pAsm->FCSP].mid[i]->m_Word0.f.addr = pAsm->cf_current_cf_clause_ptr->m_uIndex;
+    }
+    if(NULL != pAsm->fc_stack[pAsm->FCSP].mid)
+    {
+        FREE(pAsm->fc_stack[pAsm->FCSP].mid);
+        pAsm->fc_stack[pAsm->FCSP].mid = NULL; /* no dangling pointer in the slot */
+    }
+#endif
+
+    if((pAsm->unCFflags & HAS_CURRENT_LOOPRET) > 0)
+    {        
+        for(unFCSP=(pAsm->FCSP-1); unFCSP>pAsm->CALLSTACK[pAsm->CALLSP].FCSP_BeforeEntry; unFCSP--)
+        {
+            if(FC_LOOP == pAsm->fc_stack[unFCSP].type)
+            {
+                breakLoopOnFlag(pAsm, unFCSP);
+                break;
+            }
+            else if(FC_IF == pAsm->fc_stack[unFCSP].type)
+            {
+                unIF++;
+            }
+        }
+        if(unFCSP <= pAsm->CALLSTACK[pAsm->CALLSP].FCSP_BeforeEntry)
+        {            
+#ifdef USE_CF_FOR_POP_AFTER
+            returnOnFlag(pAsm, unIF); 
+#else
+            returnOnFlag(pAsm, 0);
+#endif /* USE_CF_FOR_POP_AFTER */
+            pAsm->unCFflags &= ~HAS_CURRENT_LOOPRET;
+        }
+    }
+    pAsm->FCSP--;
+    decreaseCurrent(pAsm, FC_LOOP);
+    return GL_TRUE;
+}
+
+/* Emit a CF RETURN clause with pop_count 0. The function has no return value,
+ * so a failed add_cf_instruction() is not reported to the caller; in that
+ * case no clause fields are written. */
+void add_return_inst(r700_AssemblerBase *pAsm)
+{
+    if(GL_FALSE == add_cf_instruction(pAsm) )
+    {
+        return; /* void function: returning GL_FALSE here was invalid C */
+    }
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.pop_count        = 0;
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.cf_const         = 0x0; 
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.cond             = SQ_CF_COND_ACTIVE;
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.end_of_program   = 0x0;
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.valid_pixel_mode = 0x0; 
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.cf_inst          = SQ_CF_INST_RETURN;
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.whole_quad_mode  = 0x0;
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.barrier          = 0x1;
+}
+
+/* Start a subroutine: register it in pAsm->subs (grown 10 entries at a time),
+ * push a CALLSTACK frame using the sub's own CF list, and open an FC_REP entry. */
+GLboolean assemble_BGNSUB(r700_AssemblerBase *pAsm, GLint nILindex, GLuint uiIL_Shift)
+{
+    GLuint      unSub = pAsm->unSubArrayPointer;
+    SUB_OFFSET *pSub;
+
+    if( pAsm->unSubArraySize < (unSub + 1) )
+    {   /* descriptor table is full: enlarge it */
+        pAsm->subs = (SUB_OFFSET*)_mesa_realloc( (void *)pAsm->subs,
+                                  sizeof(SUB_OFFSET) * pAsm->unSubArraySize,
+                                  sizeof(SUB_OFFSET) * (pAsm->unSubArraySize + 10) );
+        if(NULL == pAsm->subs)
+        {
+            return GL_FALSE;
+        }
+        pAsm->unSubArraySize += 10;
+    }
+
+    pSub = &(pAsm->subs[unSub]);
+    pSub->subIL_Offset = nILindex + uiIL_Shift;
+    pSub->lstCFInstructions_local.pHead      = NULL;
+    pSub->lstCFInstructions_local.pTail      = NULL;
+    pSub->lstCFInstructions_local.uNumOfNode = 0;
+
+    pAsm->CALLSP++;
+    pAsm->CALLSTACK[pAsm->CALLSP].subDescIndex     = unSub;
+    pAsm->CALLSTACK[pAsm->CALLSP].FCSP_BeforeEntry = pAsm->FCSP;
+    pAsm->CALLSTACK[pAsm->CALLSP].plstCFInstructions_local = &(pSub->lstCFInstructions_local);
+    pAsm->CALLSTACK[pAsm->CALLSP].max     = 0;
+    pAsm->CALLSTACK[pAsm->CALLSP].current = 0;
+    SetActiveCFlist(pAsm->pR700Shader, pAsm->CALLSTACK[pAsm->CALLSP].plstCFInstructions_local);
+    pAsm->unSubArrayPointer = unSub + 1;
+
+    /* start sub */
+    pAsm->alu_x_opcode = SQ_CF_INST_ALU;
+    pAsm->FCSP++;
+    pAsm->fc_stack[pAsm->FCSP].type  = FC_REP;
+    checkStackDepth(pAsm, FC_REP, GL_FALSE);
+
     return GL_TRUE;
 }
 
-GLboolean AssembleInstr(GLuint uiNumberInsts,
+/* Finish the current subroutine: copy the frame's max into the sub's
+ * unStackDepthMax, pop the CALLSTACK frame (re-activating the previous frame's
+ * CF list) and pop the FC_REP entry. GL_FALSE if the top entry is not FC_REP. */
+GLboolean assemble_ENDSUB(r700_AssemblerBase *pAsm)
+{
+    if(FC_REP != pAsm->fc_stack[pAsm->FCSP].type)
+    {
+        radeon_error("BGNSUB/ENDSUB in shader code are not paired. \n");
+        return GL_FALSE;
+    }
+
+    /* record this frame's maximum before the frame goes away */
+    pAsm->subs[pAsm->CALLSTACK[pAsm->CALLSP].subDescIndex].unStackDepthMax = pAsm->CALLSTACK[pAsm->CALLSP].max;
+    decreaseCurrent(pAsm, FC_REP);
+
+    /* back to the previous frame and its instruction list */
+    pAsm->CALLSP -= 1;
+    SetActiveCFlist(pAsm->pR700Shader, pAsm->CALLSTACK[pAsm->CALLSP].plstCFInstructions_local);
+
+    pAsm->alu_x_opcode = SQ_CF_INST_ALU;
+    pAsm->FCSP -= 1;
+
+    return GL_TRUE;
+}
+
+/* RET: inside a loop of a subroutine, set the return flag and break that loop;
+ * otherwise emit the CF RETURN. GL_FALSE if the flag/break code cannot be emitted. */
+GLboolean assemble_RET(r700_AssemblerBase *pAsm)
+{
+    GLuint unIF = 0;
+    if(pAsm->CALLSP > 0)
+    {   /* in sub */
+        GLuint unFCSP;        
+        for(unFCSP=pAsm->FCSP; unFCSP>pAsm->CALLSTACK[pAsm->CALLSP].FCSP_BeforeEntry; unFCSP--)
+        {
+            if(FC_LOOP == pAsm->fc_stack[unFCSP].type)
+            {
+                if(GL_FALSE == setRetInLoopFlag(pAsm, SQ_SEL_1) || GL_FALSE == breakLoopOnFlag(pAsm, unFCSP))
+                {
+                    return GL_FALSE; /* propagate instead of silently continuing */
+                }
+                pAsm->unCFflags |= LOOPRET_FLAGS;
+                return GL_TRUE;
+            }
+            else if(FC_IF == pAsm->fc_stack[unFCSP].type)
+            {
+                unIF++;
+            }
+        }
+    }
+
+#ifdef USE_CF_FOR_POP_AFTER    
+    if(unIF > 0)
+    {
+        pops(pAsm, unIF);
+    }
+#endif /* USE_CF_FOR_POP_AFTER */
+    add_return_inst(pAsm);
+    return GL_TRUE;
+}
+
+/* Emit a CF CALL clause and record it in pAsm->callers together with the IL
+ * offset (nILindex + uiIL_Shift) of the called subroutine. If pAsm->subs already
+ * holds a sub with that offset, only the stack-depth maximum is updated;
+ * otherwise the sub is assembled now by re-entering AssembleInstr at nILindex. */
+GLboolean assemble_CAL(r700_AssemblerBase *pAsm, 
+                       GLint nILindex,
+                       GLuint uiIL_Shift,
+                       GLuint uiNumberInsts,
+                       struct prog_instruction *pILInst,
+                       PRESUB_DESC * pPresubDesc)
+{
+    GLint uiIL_Offset;
+
+    pAsm->alu_x_opcode = SQ_CF_INST_ALU;
+
+    if(GL_FALSE == add_cf_instruction(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.call_count       = 1;
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.pop_count        = 0;
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.cf_const         = 0x0; 
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.cond             = SQ_CF_COND_ACTIVE;
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.end_of_program   = 0x0;
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.valid_pixel_mode = 0x0; 
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.cf_inst          = SQ_CF_INST_CALL;
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.whole_quad_mode  = 0x0;
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.barrier          = 0x1;
+
+    /* Put in caller */
+    if( (pAsm->unCallerArrayPointer + 1) > pAsm->unCallerArraySize )
+    {
+        pAsm->callers = (CALLER_POINTER*)_mesa_realloc( (void *)pAsm->callers, 
+                       sizeof(CALLER_POINTER) * pAsm->unCallerArraySize, 
+                       sizeof(CALLER_POINTER) * (pAsm->unCallerArraySize + 10) );
+        if(NULL == pAsm->callers)
+        {
+            return GL_FALSE;
+        }
+        pAsm->unCallerArraySize += 10;
+    }
+    /* NOTE(review): the CALL's addr is not set here; presumably resolved later via cf_ptr */
+    uiIL_Offset = nILindex + uiIL_Shift;
+    pAsm->callers[pAsm->unCallerArrayPointer].subIL_Offset = uiIL_Offset; 
+    pAsm->callers[pAsm->unCallerArrayPointer].cf_ptr       = pAsm->cf_current_cf_clause_ptr;
+    pAsm->callers[pAsm->unCallerArrayPointer].finale_cf_ptr  = NULL; 
+    pAsm->callers[pAsm->unCallerArrayPointer].prelude_cf_ptr = NULL; 
+    pAsm->unCallerArrayPointer++;
+    /* from here on the caller just recorded is callers[unCallerArrayPointer - 1] */
+    int j;
+    GLuint max;
+    GLuint unSubID;
+    GLboolean bRet;
+    for(j=0; j<pAsm->unSubArrayPointer; j++)
+    {
+        if(uiIL_Offset == pAsm->subs[j].subIL_Offset)
+        {   /* compiled before */
+            /* callee depth stacks on top of the caller's current depth */
+            max = pAsm->subs[j].unStackDepthMax 
+                + pAsm->CALLSTACK[pAsm->CALLSP].current;
+            if(max > pAsm->CALLSTACK[pAsm->CALLSP].max)
+            {
+                pAsm->CALLSTACK[pAsm->CALLSP].max = max;
+            }
+            
+            pAsm->callers[pAsm->unCallerArrayPointer - 1].subDescIndex = j; 
+            return GL_TRUE;
+        }
+    }
+    /* not assembled yet: the next free subs[] slot is taken as this sub's index */
+    pAsm->callers[pAsm->unCallerArrayPointer - 1].subDescIndex = pAsm->unSubArrayPointer;
+    unSubID = pAsm->unSubArrayPointer;
+
+    bRet = AssembleInstr(nILindex, uiIL_Shift, uiNumberInsts, pILInst, pAsm);
+
+    if(GL_TRUE == bRet)
+    {
+        max = pAsm->subs[unSubID].unStackDepthMax 
+            + pAsm->CALLSTACK[pAsm->CALLSP].current;
+        if(max > pAsm->CALLSTACK[pAsm->CALLSP].max)
+        {
+            pAsm->CALLSTACK[pAsm->CALLSP].max = max;
+        }
+
+        pAsm->subs[unSubID].pPresubDesc = pPresubDesc;
+    }
+
+    return bRet;
+}
+
+/* Emit a one-channel MOV into .x of the flag register (flag_reg_index); every
+ * source swizzle is set to flagValue. GL_FALSE if next_ins() fails. */
+GLboolean setRetInLoopFlag(r700_AssemblerBase *pAsm, GLuint flagValue)
+{
+    pAsm->D.dst.opcode   = SQ_OP2_INST_MOV;
+    pAsm->D.dst.op3      = 0;
+    pAsm->D.dst.rtype    = DST_REG_TEMPORARY;
+    pAsm->D.dst.reg      = pAsm->flag_reg_index;
+    pAsm->D.dst.writex   = 1;
+    pAsm->D.dst.writey   = 0;
+    pAsm->D.dst.writez   = 0;
+    pAsm->D.dst.writew   = 0;
+    pAsm->D2.dst2.literal_slots      = 1;
+    pAsm->D2.dst2.SaturateMode = SATURATE_OFF;
+    pAsm->D.dst.predicated     = 0;
+    /* in reloc where dislink flag init inst, only one slot alu inst is handled. */
+    pAsm->D.dst.math           = 1; /* TODO : not math really, but one channel op, more generic alu assembler needed */
+    pAsm->D2.dst2.index_mode = SQ_INDEX_LOOP; /* Check this ! */
+#if 0
+    GLfloat fLiteral[2] = {0.1, 0.0}; /* only used by this disabled literal path */
+    pAsm->S[0].src.rtype = SRC_REC_LITERAL;
+    //pAsm->S[0].src.reg   = 0;
+    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
+    noneg_PVSSRC(&(pAsm->S[0].src));
+    pAsm->S[0].src.swizzlex = SQ_SEL_X;
+    pAsm->S[0].src.swizzley = SQ_SEL_Y;
+    pAsm->S[0].src.swizzlez = SQ_SEL_Z;
+    pAsm->S[0].src.swizzlew = SQ_SEL_W;
+    if( GL_FALSE == next_ins_literal(pAsm, &(fLiteral[0])) )
+    {
+        return GL_FALSE;
+    }
+#else
+    pAsm->S[0].src.rtype = DST_REG_TEMPORARY;
+    pAsm->S[0].src.reg   = 0;
+    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
+    noneg_PVSSRC(&(pAsm->S[0].src));
+    pAsm->S[0].src.swizzlex = flagValue;
+    pAsm->S[0].src.swizzley = flagValue;
+    pAsm->S[0].src.swizzlez = flagValue;
+    pAsm->S[0].src.swizzlew = flagValue;
+
+    if( GL_FALSE == next_ins(pAsm) )
+    {
+        return GL_FALSE;
+    }
+#endif
+
+    return GL_TRUE;
+}
+
+/* Emit a PRED_SETE (ALU_PUSH_BEFORE) that compares the flag register with a
+ * source whose swizzles are all SQ_SEL_1, writing .x of a helper temporary.
+ * GL_FALSE if next_ins() fails. */
+GLboolean testFlag(r700_AssemblerBase *pAsm)
+{
+    GLuint tmp = gethelpr(pAsm);
+    pAsm->alu_x_opcode = SQ_CF_INST_ALU_PUSH_BEFORE;
+
+    pAsm->D.dst.opcode   = SQ_OP2_INST_PRED_SETE;
+    pAsm->D.dst.math     = 1;
+    pAsm->D.dst.rtype    = DST_REG_TEMPORARY;
+    pAsm->D.dst.reg      = tmp;
+    pAsm->D.dst.writex   = 1;
+    pAsm->D.dst.writey   = 0;
+    pAsm->D.dst.writez   = 0;
+    pAsm->D.dst.writew   = 0;
+    pAsm->D2.dst2.literal_slots      = 1;
+    pAsm->D2.dst2.SaturateMode = SATURATE_OFF;
+    pAsm->D.dst.predicated     = 1;
+    pAsm->D2.dst2.index_mode = SQ_INDEX_LOOP; /* Check this ! */
+
+    pAsm->S[0].src.rtype = DST_REG_TEMPORARY;
+    pAsm->S[0].src.reg   = pAsm->flag_reg_index;
+    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
+    noneg_PVSSRC(&(pAsm->S[0].src));
+    pAsm->S[0].src.swizzlex = SQ_SEL_X;
+    pAsm->S[0].src.swizzley = SQ_SEL_Y;
+    pAsm->S[0].src.swizzlez = SQ_SEL_Z;
+    pAsm->S[0].src.swizzlew = SQ_SEL_W;
+#if 0
+    GLfloat fLiteral[2] = {0.1, 0.0}; /* only used by this disabled literal path */
+    pAsm->S[1].src.rtype = SRC_REC_LITERAL;
+    //pAsm->S[1].src.reg   = 0;
+    setaddrmode_PVSSRC(&(pAsm->S[1].src), ADDR_ABSOLUTE);
+    noneg_PVSSRC(&(pAsm->S[1].src));
+    pAsm->S[1].src.swizzlex = SQ_SEL_X;
+    pAsm->S[1].src.swizzley = SQ_SEL_Y;
+    pAsm->S[1].src.swizzlez = SQ_SEL_Z;
+    pAsm->S[1].src.swizzlew = SQ_SEL_W;
+    if( GL_FALSE == next_ins_literal(pAsm, &(fLiteral[0])) )
+    {
+        return GL_FALSE;
+    }
+#else
+    pAsm->S[1].src.rtype = DST_REG_TEMPORARY;
+    pAsm->S[1].src.reg   = 0;
+    setaddrmode_PVSSRC(&(pAsm->S[1].src), ADDR_ABSOLUTE);
+    noneg_PVSSRC(&(pAsm->S[1].src));
+    pAsm->S[1].src.swizzlex = SQ_SEL_1;
+    pAsm->S[1].src.swizzley = SQ_SEL_1;
+    pAsm->S[1].src.swizzlez = SQ_SEL_1;
+    pAsm->S[1].src.swizzlew = SQ_SEL_1;
+
+    if( GL_FALSE == next_ins(pAsm) )
+    {
+        return GL_FALSE;
+    }
+#endif
+
+    checkStackDepth(pAsm, FC_PUSH_VPM, GL_TRUE);
+
+    return GL_TRUE;
+}
+
+/* Test the flag, jump, clear the flag, pop unIF + 1 levels and emit RETURN; GL_FALSE if the test or the clear fails. */
+GLboolean returnOnFlag(r700_AssemblerBase *pAsm, GLuint unIF)
+{
+    if(GL_FALSE == testFlag(pAsm)) { return GL_FALSE; }
+    jumpToOffest(pAsm, 1, 4);
+    if(GL_FALSE == setRetInLoopFlag(pAsm, SQ_SEL_0)) { return GL_FALSE; }
+    pops(pAsm, unIF + 1);
+    add_return_inst(pAsm);
+    return GL_TRUE;
+}
+
+/* Test the flag, then emit a LOOP_BREAK recorded in fc_stack[unFCSP].mid[] and
+ * a pop. GL_FALSE if the test, the CF instruction or the mid[] growth fails. */
+GLboolean breakLoopOnFlag(r700_AssemblerBase *pAsm, GLuint unFCSP)
+{
+    if(GL_FALSE == testFlag(pAsm) || GL_FALSE == add_cf_instruction(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.pop_count        = 1;
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.cf_const         = 0x0; 
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.cond             = SQ_CF_COND_ACTIVE;
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.end_of_program   = 0x0;
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.valid_pixel_mode = 0x0; 
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.cf_inst          = SQ_CF_INST_LOOP_BREAK;
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.whole_quad_mode  = 0x0;
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.barrier          = 0x1;
+
+    pAsm->fc_stack[unFCSP].mid = (R700ControlFlowGenericClause **)_mesa_realloc( 
+                                              (void *)pAsm->fc_stack[unFCSP].mid,
+                                              sizeof(R700ControlFlowGenericClause *) * pAsm->fc_stack[unFCSP].unNumMid,
+                                              sizeof(R700ControlFlowGenericClause *) * (pAsm->fc_stack[unFCSP].unNumMid + 1) );
+    if(NULL == pAsm->fc_stack[unFCSP].mid)
+    {
+        return GL_FALSE; /* never write through a failed allocation */
+    }
+    pAsm->fc_stack[unFCSP].mid[pAsm->fc_stack[unFCSP].unNumMid] = pAsm->cf_current_cf_clause_ptr;
+    pAsm->fc_stack[unFCSP].unNumMid++;
+
+    pops(pAsm, 1);
+    return GL_TRUE;
+}
+
+GLboolean AssembleInstr(GLuint uiFirstInst,
+                        GLuint uiIL_Shift,
+                        GLuint uiNumberInsts,
                         struct prog_instruction *pILInst, 
 						r700_AssemblerBase *pR700AsmCode)
 {
     GLuint i;
 
     pR700AsmCode->pILInst = pILInst;
-	for(i=0; i<uiNumberInsts; i++)
+	for(i=uiFirstInst; i<uiNumberInsts; i++)
     {
         pR700AsmCode->uiCurInst = i;
 
+#ifndef USE_CF_FOR_CONTINUE_BREAK
+        if(OPCODE_BRK == pILInst[i+1].Opcode)
+        {
+            switch(pILInst[i].Opcode)            
+            {
+            case OPCODE_SLE:
+                pILInst[i].Opcode = OPCODE_SGT;
+                break;
+            case OPCODE_SLT:
+                pILInst[i].Opcode = OPCODE_SGE;
+                break;
+            case OPCODE_SGE:
+                pILInst[i].Opcode = OPCODE_SLT;
+                break;
+            case OPCODE_SGT:
+                pILInst[i].Opcode = OPCODE_SLE;
+                break;
+            case OPCODE_SEQ:
+                pILInst[i].Opcode = OPCODE_SNE;
+                break;
+            case OPCODE_SNE:
+                pILInst[i].Opcode = OPCODE_SEQ;
+                break;
+            default:
+                break;
+            }
+        }
+#endif
+        if(pILInst[i].CondUpdate == 1)
+        {
+            /* remember dest register used for cond evaluation */
+            /* XXX also handle PROGRAM_OUTPUT registers here? */
+            pR700AsmCode->last_cond_register = pILInst[i].DstReg.Index; 
+        }
+
         switch (pILInst[i].Opcode)
         {
         case OPCODE_ABS: 
@@ -4321,7 +5534,7 @@ GLboolean AssembleInstr(GLuint uiNumberInsts,
                 return GL_FALSE;
             break;  
         case OPCODE_COS: 
-            if ( GL_FALSE == assemble_COS(pR700AsmCode) ) 
+            if ( GL_FALSE == assemble_TRIG(pR700AsmCode, SQ_OP2_INST_COS) ) 
                 return GL_FALSE;
             break;  
 
@@ -4350,7 +5563,8 @@ GLboolean AssembleInstr(GLuint uiNumberInsts,
             if ( GL_FALSE == assemble_FLR(pR700AsmCode) ) 
                 return GL_FALSE;
             break;  
-        //case OP_FLR_INT: 
+        //case OP_FLR_INT: ;
+
         //    if ( GL_FALSE == assemble_FLR_INT() ) 
         //        return GL_FALSE;
         //    break;  
@@ -4361,7 +5575,8 @@ GLboolean AssembleInstr(GLuint uiNumberInsts,
             break;  
 
         case OPCODE_KIL: 
-            if ( GL_FALSE == assemble_KIL(pR700AsmCode) ) 
+        case OPCODE_KIL_NV: 
+            if ( GL_FALSE == assemble_KIL(pR700AsmCode, SQ_OP2_INST_KILLGT) ) 
                 return GL_FALSE;
             break;
         case OPCODE_LG2: 
@@ -4401,6 +5616,26 @@ GLboolean AssembleInstr(GLuint uiNumberInsts,
         case OPCODE_MUL: 
             if ( GL_FALSE == assemble_MUL(pR700AsmCode) ) 
                 return GL_FALSE;
+            break;
+            
+        case OPCODE_NOISE1:
+            {                                               
+                callPreSub(pR700AsmCode, 
+                           GLSL_NOISE1,                         
+                           &noise1_presub,                                                  
+                           pILInst->DstReg.Index + pR700AsmCode->starting_temp_register_number, 
+                           1); 
+                radeon_error("noise1: not yet supported shader instruction\n");
+            };
+            break; 
+        case OPCODE_NOISE2: 
+            radeon_error("noise2: not yet supported shader instruction\n");
+            break; 
+        case OPCODE_NOISE3: 
+            radeon_error("noise3: not yet supported shader instruction\n");
+            break; 
+        case OPCODE_NOISE4: 
+            radeon_error("noise4: not yet supported shader instruction\n");
             break; 
 
         case OPCODE_POW: 
@@ -4416,22 +5651,78 @@ GLboolean AssembleInstr(GLuint uiNumberInsts,
                 return GL_FALSE;
             break;  
         case OPCODE_SIN: 
-            if ( GL_FALSE == assemble_SIN(pR700AsmCode) ) 
+            if ( GL_FALSE == assemble_TRIG(pR700AsmCode, SQ_OP2_INST_SIN) ) 
                 return GL_FALSE;
             break;  
         case OPCODE_SCS: 
             if ( GL_FALSE == assemble_SCS(pR700AsmCode) ) 
                 return GL_FALSE;
-            break;  
+            break; 
+            
+        case OPCODE_SEQ:
+            if ( GL_FALSE == assemble_LOGIC(pR700AsmCode, SQ_OP2_INST_SETE) ) 
+            {
+                return GL_FALSE;
+            }
+            break;
+
+        case OPCODE_SGT: 
+            if ( GL_FALSE == assemble_LOGIC(pR700AsmCode, SQ_OP2_INST_SETGT) ) 
+            {
+                return GL_FALSE;
+            }
+            break;
 
         case OPCODE_SGE: 
             if ( GL_FALSE == assemble_SGE(pR700AsmCode) ) 
+            { 
                 return GL_FALSE;
-            break; 
+            }
+            break;
+        
+        /* NO LT, LE, TODO : use GE => LE, GT => LT : reverse 2 src order would be simpliest. Or use SQ_CF_COND_FALSE for SQ_CF_COND_ACTIVE.*/
         case OPCODE_SLT: 
-            if ( GL_FALSE == assemble_SLT(pR700AsmCode) ) 
+            {
+                struct prog_src_register SrcRegSave[2];
+                SrcRegSave[0] = pILInst[i].SrcReg[0];
+                SrcRegSave[1] = pILInst[i].SrcReg[1];
+                pILInst[i].SrcReg[0] = SrcRegSave[1];
+                pILInst[i].SrcReg[1] = SrcRegSave[0];
+                if ( GL_FALSE == assemble_LOGIC(pR700AsmCode, SQ_OP2_INST_SETGT) ) 
+                {
+                    pILInst[i].SrcReg[0] = SrcRegSave[0];
+                    pILInst[i].SrcReg[1] = SrcRegSave[1];
+                    return GL_FALSE;
+                }
+                pILInst[i].SrcReg[0] = SrcRegSave[0];
+                pILInst[i].SrcReg[1] = SrcRegSave[1];
+            }
+            break;
+
+        case OPCODE_SLE: 
+            {
+                struct prog_src_register SrcRegSave[2];
+                SrcRegSave[0] = pILInst[i].SrcReg[0];
+                SrcRegSave[1] = pILInst[i].SrcReg[1];
+                pILInst[i].SrcReg[0] = SrcRegSave[1];
+                pILInst[i].SrcReg[1] = SrcRegSave[0];
+                if ( GL_FALSE == assemble_LOGIC(pR700AsmCode, SQ_OP2_INST_SETGE) ) 
+                {
+                    pILInst[i].SrcReg[0] = SrcRegSave[0];
+                    pILInst[i].SrcReg[1] = SrcRegSave[1];
+                    return GL_FALSE;
+                }
+                pILInst[i].SrcReg[0] = SrcRegSave[0];
+                pILInst[i].SrcReg[1] = SrcRegSave[1];
+            }
+            break;
+
+        case OPCODE_SNE: 
+            if ( GL_FALSE == assemble_LOGIC(pR700AsmCode, SQ_OP2_INST_SETNE) ) 
+            {
                 return GL_FALSE;
-            break; 
+            }
+            break;
 
         //case OP_STP: 
         //    if ( GL_FALSE == assemble_STP(pR700AsmCode) ) 
@@ -4457,7 +5748,8 @@ GLboolean AssembleInstr(GLuint uiNumberInsts,
                 }
             }
             break;
-
+        case OPCODE_DDX:
+        case OPCODE_DDY:
         case OPCODE_TEX: 
         case OPCODE_TXB:  
         case OPCODE_TXP: 
@@ -4465,30 +5757,104 @@ GLboolean AssembleInstr(GLuint uiNumberInsts,
                 return GL_FALSE;
             break;
 
+        case OPCODE_TRUNC:
+            if ( GL_FALSE == assemble_math_function(pR700AsmCode, SQ_OP2_INST_TRUNC) )
+                return GL_FALSE;
+            break;
+
         case OPCODE_XPD: 
             if ( GL_FALSE == assemble_XPD(pR700AsmCode) ) 
                 return GL_FALSE;
             break;  
 
-        case OPCODE_IF   : 
-            if ( GL_FALSE == assemble_IF(pR700AsmCode) ) 
-                return GL_FALSE;
+        case OPCODE_IF:
+            {                
+                GLboolean bHasElse = GL_FALSE;
+
+                if(pILInst[pILInst[i].BranchTarget].Opcode == OPCODE_ELSE)
+                {
+                    bHasElse = GL_TRUE;
+                }
+
+                if ( GL_FALSE == assemble_IF(pR700AsmCode, bHasElse) ) 
+                {
+                    return GL_FALSE;
+                }
+            }
             break;
+
         case OPCODE_ELSE : 
-            radeon_error("Not yet implemented instruction OPCODE_ELSE \n");
-            //if ( GL_FALSE == assemble_BAD("ELSE") ) 
+            if ( GL_FALSE == assemble_ELSE(pR700AsmCode) ) 
                 return GL_FALSE;
             break;
+
         case OPCODE_ENDIF: 
             if ( GL_FALSE == assemble_ENDIF(pR700AsmCode) ) 
                 return GL_FALSE;
             break;
 
+        case OPCODE_BGNLOOP:
+            if( GL_FALSE == assemble_BGNLOOP(pR700AsmCode) )
+            {
+                return GL_FALSE;
+            }
+            break;
+
+        case OPCODE_BRK:
+            if( GL_FALSE == assemble_BRK(pR700AsmCode) )
+            {
+                return GL_FALSE;
+            }
+            break;
+
+        case OPCODE_CONT:
+            if( GL_FALSE == assemble_CONT(pR700AsmCode) )
+            {
+                return GL_FALSE;
+            }
+            break;
+
+        case OPCODE_ENDLOOP:
+            if( GL_FALSE == assemble_ENDLOOP(pR700AsmCode) )
+            {
+                return GL_FALSE;
+            }
+            break;
+
+        case OPCODE_BGNSUB:
+            if( GL_FALSE == assemble_BGNSUB(pR700AsmCode, i, uiIL_Shift) )
+            {
+                return GL_FALSE;
+            }
+            break;
+        
+        case OPCODE_RET:
+            if( GL_FALSE == assemble_RET(pR700AsmCode) )
+            {
+                return GL_FALSE;
+            }
+            break;
+        
+        case OPCODE_CAL:
+            if( GL_FALSE == assemble_CAL(pR700AsmCode, 
+                                         pILInst[i].BranchTarget,
+                                         uiIL_Shift,
+                                         uiNumberInsts,
+                                         pILInst,
+                                         NULL) )
+            {
+                return GL_FALSE;
+            }
+            break;
+
         //case OPCODE_EXPORT: 
         //    if ( GL_FALSE == assemble_EXPORT() ) 
         //        return GL_FALSE;
         //    break;
 
+        case OPCODE_ENDSUB:
+            return assemble_ENDSUB(pR700AsmCode);
+
         case OPCODE_END: 
 			//pR700AsmCode->uiCurInst = i;
 			//This is to remaind that if in later exoort there is depth/stencil
@@ -4505,6 +5871,417 @@ GLboolean AssembleInstr(GLuint uiNumberInsts,
     return GL_TRUE;
 }
 
+GLboolean InitShaderProgram(r700_AssemblerBase * pAsm)
+{   /* Sets the return-in-loop flag to SQ_SEL_0 and resets the ALU clause opcode; GL_FALSE if the flag setup fails. */
+    const GLboolean bFlagInitOk = setRetInLoopFlag(pAsm, SQ_SEL_0);
+    pAsm->alu_x_opcode = SQ_CF_INST_ALU; /* reset regardless of the flag result, as before */
+    return bFlagInitOk;                  /* previously the helper's result was dropped and GL_TRUE returned */
+}
+
+GLboolean RelocProgram(r700_AssemblerBase * pAsm, struct gl_program * pILProg)
+{   /* Splices every subroutine's CF list onto the main list and relocates CF addresses, GPRs and constants. */
+    GLuint i;
+    GLuint unCFoffset;
+    TypedShaderList * plstCFmain;
+    TypedShaderList * plstCFsub;
+    /* pAsm supplies CALLSTACK[0], subs[] and callers[]; of pILProg only Parameters is read. Always returns GL_TRUE. */
+    R700ShaderInstruction *        pInst;
+    R700ControlFlowGenericClause * pCFInst;
+
+    R700ControlFlowALUClause * pCF_ALU;
+    R700ALUInstruction       * pALU;
+    GLuint                     unConstOffset = 0; /* number of constants owned by the main program */
+    GLuint                     unRegOffset;
+    GLuint                     unMinRegIndex;
+
+    plstCFmain = pAsm->CALLSTACK[0].plstCFInstructions_local;
+
+    /* remove flags init if they are not used */
+    if((pAsm->unCFflags & HAS_LOOPRET) == 0)
+    {
+        /* only the first ALU clause of the main list is touched (see the break below) */
+        pInst = plstCFmain->pHead;
+        while(pInst)
+        {
+            if(SIT_CF_ALU == pInst->m_ShaderInstType)
+            {
+                pCF_ALU = (R700ControlFlowALUClause *)pInst;
+                if(0 == pCF_ALU->m_Word1.f.count)
+                {   /* count is walked with <= below, so 0 means one instruction: turn the clause into a NOP */
+                    pCF_ALU->m_Word1.f.cf_inst = SQ_CF_INST_NOP;
+                }
+                else
+                {   /* unlink the first ALU instruction and make the clause start at the second one */
+                    pALU = pCF_ALU->m_pLinkedALUInstruction;
+
+                    pALU->m_pLinkedALUClause = NULL;
+                    pALU = (R700ALUInstruction *)(pALU->pNextInst);
+                    pALU->m_pLinkedALUClause = pCF_ALU;
+                    pCF_ALU->m_pLinkedALUInstruction = pALU;
+
+                    pCF_ALU->m_Word1.f.count--;
+                }
+                break;
+            }
+            pInst = pInst->pNextInst;
+        };
+    }
+
+    if(pAsm->CALLSTACK[0].max > 0)
+    {
+        pAsm->pR700Shader->uStackSize = ((pAsm->CALLSTACK[0].max + 3)>>2) + 2;
+    }
+
+    if(0 == pAsm->unSubArrayPointer)
+    {   /* no subroutines recorded: nothing to relocate */
+        return GL_TRUE;
+    }
+
+    unCFoffset = plstCFmain->uNumOfNode; /* the first sub is appended right after the main program */
+
+    if(NULL != pILProg->Parameters)
+    {
+        unConstOffset = pILProg->Parameters->NumParameters;
+    }
+
+    /* Reloc subs */
+    for(i=0; i<pAsm->unSubArrayPointer; i++)
+    {
+        pAsm->subs[i].unCFoffset = unCFoffset;
+        plstCFsub = &(pAsm->subs[i].lstCFInstructions_local);
+
+        pInst = plstCFsub->pHead;
+
+        /* reloc instructions */
+        while(pInst)
+        {
+            if(SIT_CF_GENERIC == pInst->m_ShaderInstType)
+            {
+                pCFInst = (R700ControlFlowGenericClause *)pInst;
+
+                switch (pCFInst->m_Word1.f.cf_inst)
+                {
+                case SQ_CF_INST_POP:
+                case SQ_CF_INST_JUMP:
+                case SQ_CF_INST_ELSE:
+                case SQ_CF_INST_LOOP_END:
+                case SQ_CF_INST_LOOP_START:
+                case SQ_CF_INST_LOOP_START_NO_AL:
+                case SQ_CF_INST_LOOP_CONTINUE:
+                case SQ_CF_INST_LOOP_BREAK:
+                    pCFInst->m_Word0.f.addr += unCFoffset; /* sub-local target -> position in the merged list */
+                    break;
+                default:
+                    break;
+                }
+            }
+
+            pInst->m_uIndex += unCFoffset;
+
+            pInst = pInst->pNextInst;
+        };
+
+        if(NULL != pAsm->subs[i].pPresubDesc)
+        {
+            GLuint                     uNumSrc;
+            GLuint                     unSubConstOffset; /* constant base of this sub only */
+            unMinRegIndex  = pAsm->subs[i].pPresubDesc->pCompiledSub->MinRegIndex;
+            unRegOffset    = pAsm->subs[i].pPresubDesc->maxStartReg;
+            unSubConstOffset = unConstOffset + pAsm->subs[i].pPresubDesc->unConstantsStart; /* unConstantsStart is already cumulative (callPreSub); do not accumulate it again */
+
+            pInst = plstCFsub->pHead;
+            while(pInst)
+            {
+                if(SIT_CF_ALU == pInst->m_ShaderInstType)
+                {
+                    pCF_ALU = (R700ControlFlowALUClause *)pInst;
+
+                    pALU = pCF_ALU->m_pLinkedALUInstruction;
+                    for(int j=0; j<=pCF_ALU->m_Word1.f.count; j++)
+                    {
+                        pALU->m_Word1.f.dst_gpr = pALU->m_Word1.f.dst_gpr + unRegOffset - unMinRegIndex;
+
+                        if(pALU->m_Word0.f.src0_sel < SQ_ALU_SRC_GPR_SIZE)
+                        {   /* GPR operand: rebase onto the caller's register window */
+                            pALU->m_Word0.f.src0_sel = pALU->m_Word0.f.src0_sel + unRegOffset - unMinRegIndex;
+                        }
+                        else if(pALU->m_Word0.f.src0_sel >= SQ_ALU_SRC_CFILE_BASE)
+                        {   /* constant-file operand: shift by this sub's constant base */
+                            pALU->m_Word0.f.src0_sel += unSubConstOffset;
+                        }
+
+                        if( ((pALU->m_Word1.val >> SQ_ALU_WORD1_OP3_ALU_INST_SHIFT) & 0x0000001F)
+                            >= SQ_OP3_INST_MUL_LIT )
+                        {   /* op3 : 3 srcs */
+                            if(pALU->m_Word1_OP3.f.src2_sel < SQ_ALU_SRC_GPR_SIZE)
+                            {
+                                pALU->m_Word1_OP3.f.src2_sel = pALU->m_Word1_OP3.f.src2_sel + unRegOffset - unMinRegIndex;
+                            }
+                            else if(pALU->m_Word1_OP3.f.src2_sel >= SQ_ALU_SRC_CFILE_BASE)
+                            {
+                                pALU->m_Word1_OP3.f.src2_sel += unSubConstOffset;
+                            }
+                            if(pALU->m_Word0.f.src1_sel < SQ_ALU_SRC_GPR_SIZE)
+                            {
+                                pALU->m_Word0.f.src1_sel = pALU->m_Word0.f.src1_sel + unRegOffset - unMinRegIndex;
+                            }
+                            else if(pALU->m_Word0.f.src1_sel >= SQ_ALU_SRC_CFILE_BASE)
+                            {
+                                pALU->m_Word0.f.src1_sel += unSubConstOffset;
+                            }
+                        }
+                        else
+                        {   /* op2: the opcode field layout differs between R6xx and later chips */
+                            if(pAsm->bR6xx)
+                            {
+                                uNumSrc = r700GetNumOperands(pALU->m_Word1_OP2.f6.alu_inst, 0);
+                            }
+                            else
+                            {
+                                uNumSrc = r700GetNumOperands(pALU->m_Word1_OP2.f.alu_inst, 0);
+                            }
+                            if(2 == uNumSrc)
+                            {   /* 2 srcs */
+                                if(pALU->m_Word0.f.src1_sel < SQ_ALU_SRC_GPR_SIZE)
+                                {
+                                    pALU->m_Word0.f.src1_sel = pALU->m_Word0.f.src1_sel + unRegOffset - unMinRegIndex;
+                                }
+                                else if(pALU->m_Word0.f.src1_sel >= SQ_ALU_SRC_CFILE_BASE)
+                                {
+                                    pALU->m_Word0.f.src1_sel += unSubConstOffset;
+                                }
+                            }
+                        }
+                        pALU = (R700ALUInstruction*)(pALU->pNextInst);
+                    }
+                }
+                pInst = pInst->pNextInst;
+            };
+        }
+
+        /* Put sub into main */
+        plstCFmain->pTail->pNextInst = plstCFsub->pHead;
+        plstCFmain->pTail            = plstCFsub->pTail;
+        plstCFmain->uNumOfNode      += plstCFsub->uNumOfNode;
+
+        unCFoffset += plstCFsub->uNumOfNode;
+    }
+
+    /* reloc callers */
+    for(i=0; i<pAsm->unCallerArrayPointer; i++)
+    {
+        pAsm->callers[i].cf_ptr->m_Word0.f.addr
+            = pAsm->subs[pAsm->callers[i].subDescIndex].unCFoffset;
+
+        if(NULL != pAsm->subs[pAsm->callers[i].subDescIndex].pPresubDesc)
+        {   /* presub call: rebase the argument and result copies recorded by callPreSub */
+            unMinRegIndex = pAsm->subs[pAsm->callers[i].subDescIndex].pPresubDesc->pCompiledSub->MinRegIndex;
+            unRegOffset = pAsm->subs[pAsm->callers[i].subDescIndex].pPresubDesc->maxStartReg;
+
+            if(NULL != pAsm->callers[i].prelude_cf_ptr)
+            {   /* prelude writes the sub's inputs: only the destinations move */
+                pCF_ALU = (R700ControlFlowALUClause * )(pAsm->callers[i].prelude_cf_ptr);
+                pALU = pCF_ALU->m_pLinkedALUInstruction;
+                for(int j=0; j<=pCF_ALU->m_Word1.f.count; j++)
+                {
+                    pALU->m_Word1.f.dst_gpr = pALU->m_Word1.f.dst_gpr + unRegOffset - unMinRegIndex;
+                    pALU = (R700ALUInstruction*)(pALU->pNextInst);
+                }
+            }
+            if(NULL != pAsm->callers[i].finale_cf_ptr)
+            {   /* finale reads the sub's output: only source 0 moves */
+                pCF_ALU = (R700ControlFlowALUClause * )(pAsm->callers[i].finale_cf_ptr);
+                pALU = pCF_ALU->m_pLinkedALUInstruction;
+                for(int j=0; j<=pCF_ALU->m_Word1.f.count; j++)
+                {
+                    pALU->m_Word0.f.src0_sel = pALU->m_Word0.f.src0_sel + unRegOffset - unMinRegIndex;
+                    pALU = (R700ALUInstruction*)(pALU->pNextInst);
+                }
+            }
+        }
+    }
+
+    return GL_TRUE;
+}
+
+GLboolean callPreSub(r700_AssemblerBase* pAsm, 
+                         LOADABLE_SCRIPT_SIGNITURE scriptSigniture, /* key used to find an already loaded presub */
+                         COMPILED_SUB * pCompiledSub,               /* instructions and register layout of the presub */
+                         GLshort uOutReg,                           /* not referenced in this function */
+                         GLshort uNumValidSrc)                      /* how many source operands are copied in */
+{   /* Copies the sources into the presub's input registers, calls the presub (registering it on first use) and copies its result to the real destination. Returns assemble_CAL's result, or GL_FALSE if a copy or the array growth fails. */
+    /* locals holding the caller's assemble context while a new presub is assembled */
+    GLuint starting_temp_register_number_save;
+    GLuint number_used_registers_save;
+    GLuint uFirstHelpReg_save;
+    GLuint uHelpReg_save;
+    GLuint uiCurInst_save;
+    struct prog_instruction *pILInst_save;
+    PRESUB_DESC * pPresubDesc;
+    GLboolean     bRet;
+    int i;
+
+    R700ControlFlowGenericClause* prelude_cf_ptr = NULL; /* stays NULL when no source is copied */
+
+    /* copy srcs to presub inputs: one full-width MOV per source into srcRegIndex[i] */
+    pAsm->alu_x_opcode = SQ_CF_INST_ALU;
+    for(i=0; i<uNumValidSrc; i++)
+    {
+        pAsm->D.dst.opcode = SQ_OP2_INST_MOV;
+        setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
+        pAsm->D.dst.rtype = DST_REG_TEMPORARY;
+        pAsm->D.dst.reg   = pCompiledSub->srcRegIndex[i];
+        pAsm->D.dst.writex = 1;
+        pAsm->D.dst.writey = 1;
+        pAsm->D.dst.writez = 1;
+        pAsm->D.dst.writew = 1;
+
+        if( GL_FALSE == assemble_src(pAsm, i, 0) ) /* presumably loads IL source i as operand 0 -- confirm in assemble_src */
+        {
+            return GL_FALSE;
+        }
+
+        next_ins(pAsm); /* result is not checked here */
+    }
+    if(uNumValidSrc > 0)
+    {   /* remember the clause holding the copies so RelocProgram can rebase their destinations */
+        prelude_cf_ptr     = pAsm->cf_current_alu_clause_ptr;
+        pAsm->alu_x_opcode = SQ_CF_INST_ALU;
+    }
+
+    /* look through the presubs loaded so far; i == unNumPresub afterwards means "not found" */
+    for(i=0; i<pAsm->unNumPresub; i++)
+    {
+        if(pAsm->presubs[i].sptSigniture == scriptSigniture)
+        {
+            break;
+        }
+    }
+
+    if(i == pAsm->unNumPresub)
+    {   /* not loaded yet */
+        /* save assemble context */
+        number_used_registers_save         = pAsm->number_used_registers;
+        uFirstHelpReg_save                 = pAsm->uFirstHelpReg;
+        uHelpReg_save                      = pAsm->uHelpReg;
+        starting_temp_register_number_save = pAsm->starting_temp_register_number;
+        pILInst_save                       = pAsm->pILInst;
+        uiCurInst_save                     = pAsm->uiCurInst;
+
+        /* alloc in presub: the descriptor array grows four entries at a time */
+        if( (pAsm->unNumPresub + 1) > pAsm->unPresubArraySize )
+        {
+            pAsm->presubs = (PRESUB_DESC*)_mesa_realloc( (void *)pAsm->presubs,
+                                      sizeof(PRESUB_DESC) * pAsm->unPresubArraySize,
+                                      sizeof(PRESUB_DESC) * (pAsm->unPresubArraySize + 4) );
+            if(NULL == pAsm->presubs)
+            {
+                radeon_error("No memeory to allocate built in shader function description structures. \n");
+                return GL_FALSE;
+            }
+            pAsm->unPresubArraySize += 4;
+        }
+        
+        pPresubDesc = &(pAsm->presubs[i]); /* i == unNumPresub: first unused slot */
+        pPresubDesc->sptSigniture = scriptSigniture;
+
+        /* constants offsets need to be final resolved at reloc. */
+        if(0 == pAsm->unNumPresub)
+        {
+            pPresubDesc->unConstantsStart = 0; 
+        }
+        else
+        {   /* running sum: the previous presub's start plus its parameter count */
+            pPresubDesc->unConstantsStart =  pAsm->presubs[i-1].unConstantsStart
+                                           + pAsm->presubs[i-1].pCompiledSub->NumParameters;
+        }
+
+        pPresubDesc->pCompiledSub = pCompiledSub;
+
+        pPresubDesc->subIL_Shift = pAsm->unCurNumILInsts;
+        pPresubDesc->maxStartReg  = uFirstHelpReg_save; /* the first call site's uFirstHelpReg; raised at the end if a later one is higher */
+        pAsm->unCurNumILInsts    += pCompiledSub->NumInstructions;
+
+        pAsm->unNumPresub++;
+
+        /* setup new assemble context */
+        pAsm->starting_temp_register_number = 0;
+        pAsm->number_used_registers = pCompiledSub->NumTemporaries;
+        pAsm->uFirstHelpReg         = pAsm->number_used_registers;
+        pAsm->uHelpReg              = pAsm->uFirstHelpReg;
+
+        bRet = assemble_CAL(pAsm, 
+                            0, 
+                            pPresubDesc->subIL_Shift, 
+                            pCompiledSub->NumInstructions,
+                            pCompiledSub->Instructions,
+                            pPresubDesc);
+
+        /* record the register count reached inside the presub before the caller's context comes back */
+        pPresubDesc->number_used_registers = pAsm->number_used_registers;        
+
+        /* restore assemble context */
+        pAsm->number_used_registers         = number_used_registers_save; 
+        pAsm->uFirstHelpReg                 = uFirstHelpReg_save;
+        pAsm->uHelpReg                      = uHelpReg_save;
+        pAsm->starting_temp_register_number = starting_temp_register_number_save;
+        pAsm->pILInst                       = pILInst_save; 
+        pAsm->uiCurInst                     = uiCurInst_save;
+    }
+    else
+    {   /* was loaded */
+        pPresubDesc = &(pAsm->presubs[i]);  
+        /* same call as above, issued against the existing descriptor; no context switch on this path */
+        bRet = assemble_CAL(pAsm, 
+                            0, 
+                            pPresubDesc->subIL_Shift, 
+                            pCompiledSub->NumInstructions,
+                            pCompiledSub->Instructions,
+                            pPresubDesc);
+    }
+
+    if(GL_FALSE == bRet)
+    {   /* logged only; the register bookkeeping below still runs and bRet is returned */
+        radeon_error("Shader presub assemble failed. \n");
+    }
+    else
+    {
+        /* copy presub output to real dst */ 
+        pAsm->alu_x_opcode = SQ_CF_INST_ALU;
+        pAsm->D.dst.opcode = SQ_OP2_INST_MOV;
+
+        if( GL_FALSE == assemble_dst(pAsm) )
+        {
+            return GL_FALSE;
+        }
+
+        setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
+        pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
+        pAsm->S[0].src.reg   = pCompiledSub->dstRegIndex;
+        pAsm->S[0].src.swizzlex = pCompiledSub->outputSwizzleX;
+        pAsm->S[0].src.swizzley = pCompiledSub->outputSwizzleY;
+        pAsm->S[0].src.swizzlez = pCompiledSub->outputSwizzleZ;
+        pAsm->S[0].src.swizzlew = pCompiledSub->outputSwizzleW;
+
+        next_ins(pAsm);        
+        /* assumes assemble_CAL appended this call as the last callers[] entry -- confirm in assemble_CAL */
+        pAsm->callers[pAsm->unCallerArrayPointer - 1].finale_cf_ptr  = pAsm->cf_current_alu_clause_ptr;
+        pAsm->callers[pAsm->unCallerArrayPointer - 1].prelude_cf_ptr = prelude_cf_ptr;
+        pAsm->alu_x_opcode = SQ_CF_INST_ALU;
+    }
+    /* make room for the presub's registers above this call site's first helper register */
+    if( (pPresubDesc->number_used_registers + pAsm->uFirstHelpReg) > pAsm->number_used_registers )
+    {
+        pAsm->number_used_registers = pPresubDesc->number_used_registers + pAsm->uFirstHelpReg;
+    }
+    if(pAsm->uFirstHelpReg > pPresubDesc->maxStartReg)
+    {   /* keep the highest start register over all call sites; RelocProgram uses it as the GPR base */
+        pPresubDesc->maxStartReg = pAsm->uFirstHelpReg;
+    }
+
+    return bRet;
+}
+
 GLboolean Process_Export(r700_AssemblerBase* pAsm,
                          GLuint type,
                          GLuint export_starting_index,
@@ -4799,6 +6576,25 @@ GLboolean Process_Vertex_Exports(r700_AssemblerBase *pR700AsmCode,
             export_starting_index++;
 		}
 	}
+    
+    for(i=VERT_RESULT_VAR0; i<VERT_RESULT_MAX; i++)
+	{
+        unBit = 1 << i;
+        if(OutputsWritten & unBit)
+		{
+            if( GL_FALSE == Process_Export(pR700AsmCode,
+                                          SQ_EXPORT_PARAM, 
+                                          export_starting_index, 
+                                          1, 
+                                          pR700AsmCode->ucVP_OutputMap[i],
+                                          GL_FALSE) )
+            {                
+                return GL_FALSE;
+            }
+
+            export_starting_index++;
+		}
+    }
 
     // At least one param should be exported
     if (export_count) 
@@ -4833,6 +6629,21 @@ GLboolean Clean_Up_Assembler(r700_AssemblerBase *pR700AsmCode)
 {
     FREE(pR700AsmCode->pucOutMask);
     FREE(pR700AsmCode->pInstDeps);
+
+    if(NULL != pR700AsmCode->subs)
+    {
+        FREE(pR700AsmCode->subs);
+    }
+    if(NULL != pR700AsmCode->callers)
+    {
+        FREE(pR700AsmCode->callers);
+    }
+
+    if(NULL != pR700AsmCode->presubs)
+    {
+        FREE(pR700AsmCode->presubs);
+    }
+
     return GL_TRUE;
 }
 
diff --git a/src/mesa/drivers/dri/r600/r700_assembler.h b/src/mesa/drivers/dri/r600/r700_assembler.h
index c66db502a1..56baf5b0d9 100644
--- a/src/mesa/drivers/dri/r600/r700_assembler.h
+++ b/src/mesa/drivers/dri/r600/r700_assembler.h
@@ -34,6 +34,45 @@
 #include "r700_shaderinst.h"
 #include "r700_shader.h"
 
+typedef enum LOADABLE_SCRIPT_SIGNITURE /* key identifying a loadable built-in subroutine; callPreSub matches it against PRESUB_DESC.sptSigniture */
+{
+    GLSL_NOISE1 = 0x10000001, /* presumably the GLSL noise1..noise4 built-ins -- TODO confirm against the callers */
+    GLSL_NOISE2 = 0x10000002,
+    GLSL_NOISE3 = 0x10000003,
+    GLSL_NOISE4 = 0x10000004
+}LOADABLE_SCRIPT_SIGNITURE;
+
+typedef struct COMPILED_SUB /* a pre-compiled subroutine plus the register/constant layout used by callPreSub and RelocProgram */
+{
+    struct  prog_instruction *Instructions; /* IL instructions handed to assemble_CAL */
+    GLuint  NumInstructions;                /* passed with Instructions; also advances unCurNumILInsts on first load */
+    GLuint  NumTemporaries;                 /* becomes number_used_registers while the sub is assembled */
+    GLuint  NumParameters;                  /* added to unConstantsStart to place the next presub's constants */
+    GLuint  MinRegIndex;                    /* subtracted from GPR indices when RelocProgram rebases them */
+    GLfloat (*ParameterValues)[4];          /* presumably the constant vectors of the sub -- TODO confirm, not read in the assembler code shown */
+    GLbyte  outputSwizzleX;                 /* swizzle applied when the result is copied out of dstRegIndex */
+    GLbyte  outputSwizzleY;
+    GLbyte  outputSwizzleZ;
+    GLbyte  outputSwizzleW;
+    GLshort srcRegIndex[3];                 /* temporaries that receive the caller's source operands */
+    GLushort dstRegIndex;                   /* temporary holding the sub's result */
+}COMPILED_SUB;
+
+typedef struct PRESUB_DESCtag /* bookkeeping for one loaded built-in subroutine; entries live in r700_AssemblerBase.presubs */
+{
+    LOADABLE_SCRIPT_SIGNITURE sptSigniture; /* lookup key compared in callPreSub */
+    GLint  subIL_Shift;                     /* unCurNumILInsts at the time the presub was first loaded */
+    struct prog_src_register InReg[3];      /* not touched by callPreSub or RelocProgram -- purpose TODO confirm */
+    struct prog_dst_register OutReg;        /* not touched by callPreSub or RelocProgram -- purpose TODO confirm */
+
+    GLushort maxStartReg;                   /* highest uFirstHelpReg over all call sites; GPR base at relocation */
+    GLushort number_used_registers;         /* register count reached while assembling the sub in its own context */
+
+    GLuint   unConstantsStart;              /* running sum of NumParameters of the presubs loaded before this one */
+
+    COMPILED_SUB * pCompiledSub;            /* the compiled body this descriptor was created for */
+} PRESUB_DESC;
+
 typedef enum SHADER_PIPE_TYPE 
 {
     SPT_VP = 0,
@@ -72,7 +111,8 @@ typedef enum SrcRegisterType
     SRC_REG_INPUT          = 1,
     SRC_REG_CONSTANT       = 2,
     SRC_REG_ALT_TEMPORARY  = 3,
-    NUMBER_OF_SRC_REG_TYPE = 4
+    SRC_REC_LITERAL        = 4, 
+    NUMBER_OF_SRC_REG_TYPE = 5
 } SrcRegisterType;
 
 typedef enum DstRegisterType 
@@ -111,16 +151,24 @@ typedef struct PVSDSTtag
 	BITS addrmode1:1; //32
 } PVSDST;
 
+typedef struct PVSINSTtag /* 7-bit instruction-level view; reachable as the dst2 member of PVSDWORD */
+{
+    BITS literal_slots      :2; /* presumably the number of literal slots used -- TODO confirm */
+    BITS SaturateMode :2;       /* NOTE(review): looks like the IL SaturateMode value -- confirm */
+    BITS index_mode   :3;       /* NOTE(review): meaning not visible here -- confirm in the assembler */
+} PVSINST;
+
 typedef struct PVSSRCtag 
 {
-	BITS rtype:4;            
+	BITS rtype:3;            
 	BITS addrmode0:1;        
-	BITS reg:10;      //15     (8)
+	BITS reg:10;      //14     (8)
 	BITS swizzlex:3;
 	BITS swizzley:3;
 	BITS swizzlez:3;
-	BITS swizzlew:3;  //27        
+	BITS swizzlew:3;  //26        
 
+	BITS abs:1;
 	BITS negx:1;
 	BITS negy:1;
 	BITS negz:1;
@@ -148,6 +196,7 @@ typedef union PVSDWORDtag
 {
 	BITS    bits;
 	PVSDST  dst;
+    PVSINST dst2;
 	PVSSRC  src;
 	PVSMATH math;
 	float   f;
@@ -251,6 +300,8 @@ enum
     FC_IF = 1,
     FC_LOOP = 2,
     FC_REP = 3,
+    FC_PUSH_VPM = 4,
+    FC_PUSH_WQM = 5,
 
     COND_NONE = 0,
     COND_BOOL = 1,
@@ -263,22 +314,56 @@ enum
 
 typedef struct FC_LEVEL 
 {
-	unsigned int           first; ///< first fc instruction on level (if, rep, loop)
-	unsigned int*          mid; ///< middle instructions - else or all breaks on this level
-	unsigned int           midLen;
-	unsigned int           type;
-	unsigned int           cond;
-	unsigned int           inv;
-	unsigned int           bpush; ///< 1 if first instruction does branch stack push
-			 int           id; ///< id of bool or int variable
+    R700ControlFlowGenericClause *  first;
+    R700ControlFlowGenericClause ** mid;
+    unsigned int unNumMid;
+    unsigned int midLen;
+    unsigned int type;
+    unsigned int cond;
+    unsigned int inv;
+    int id; ///< id of bool or int variable
 } FC_LEVEL;
 
 typedef struct VTX_FETCH_METHOD 
 {
-	GLboolean bEnableMini;
-	GLuint mega_fetch_remainder;
+    GLboolean bEnableMini;
+    GLuint mega_fetch_remainder;
 } VTX_FETCH_METHOD;
 
+typedef struct SUB_OFFSET /* one assembled subroutine, kept in r700_AssemblerBase.subs until RelocProgram merges it */
+{
+    GLint  subIL_Offset;                     /* presumably the sub's position in the IL stream -- TODO confirm */
+    GLuint unCFoffset;                       /* CF index of the sub's first instruction once appended to the main list */
+    GLuint unStackDepthMax;                  /* not read by RelocProgram -- purpose TODO confirm */
+    PRESUB_DESC *   pPresubDesc;             /* non-NULL makes RelocProgram rebase the sub's GPRs and constants */
+    TypedShaderList lstCFInstructions_local; /* the sub's own CF instruction list */
+} SUB_OFFSET;
+
+typedef struct CALLER_POINTER /* one call site, kept in r700_AssemblerBase.callers and patched by RelocProgram */
+{
+    GLint  subIL_Offset;                          /* not read by RelocProgram -- purpose TODO confirm */
+    GLint  subDescIndex;                          /* index of the callee in subs[] */
+    R700ControlFlowGenericClause* cf_ptr;         /* CF instruction whose addr receives the callee's unCFoffset */
+
+    R700ControlFlowGenericClause* prelude_cf_ptr; /* set by callPreSub: clause copying arguments in (may be NULL) */
+    R700ControlFlowGenericClause* finale_cf_ptr;  /* set by callPreSub: clause copying the result out */
+} CALLER_POINTER;
+
+#define SQ_MAX_CALL_DEPTH 0x00000020 /* number of CALLSTACK entries in r700_AssemblerBase */
+
+typedef struct CALL_LEVEL /* one CALLSTACK entry; entry 0 is the one RelocProgram reads for the main program */
+{
+    unsigned int      FCSP_BeforeEntry;         /* presumably the FCSP value saved when the level was entered -- TODO confirm */
+    GLint             subDescIndex;             /* presumably an index into subs[] -- TODO confirm */
+    GLushort          current;                  /* NOTE(review): looks like the current stack depth -- confirm */
+    GLushort          max;                      /* for entry 0, feeds the shader's uStackSize in RelocProgram */
+    TypedShaderList * plstCFInstructions_local; /* CF list of this level; for entry 0 the main list */
+} CALL_LEVEL;
+
+#define HAS_CURRENT_LOOPRET 0x1L /* bit of r700_AssemblerBase.unCFflags */
+#define HAS_LOOPRET         0x2L /* bit of unCFflags; when clear, RelocProgram drops the flag-init instruction */
+#define LOOPRET_FLAGS       (HAS_LOOPRET | HAS_CURRENT_LOOPRET) /* parenthesized so ~ and & apply to the whole mask */
+
 typedef struct r700_AssemblerBase 
 {
 	R700ControlFlowSXClause*      cf_last_export_ptr;
@@ -294,14 +379,19 @@ typedef struct r700_AssemblerBase
 	// No clause has been created yet
 	CF_CLAUSE_TYPE cf_current_clause_type;
 
+    BITS alu_x_opcode;
+
 	GLuint number_of_exports;
 	GLuint number_of_colorandz_exports;
 	GLuint number_of_export_opcodes;
 
 	PVSDWORD D;
+    PVSDWORD D2;
 	PVSDWORD S[3];
+        PVSDWORD C[4];
 
 	unsigned int uLastPosUpdate;
+	unsigned int last_cond_register;
 
 	OUT_FRAGMENT_FMT_0     fp_stOutFmt0;
 
@@ -310,6 +400,8 @@ typedef struct r700_AssemblerBase
 	unsigned int number_used_registers;
 	unsigned int uUsedConsts; 
 
+    unsigned int flag_reg_index;
+
 	// Fragment programs
 	unsigned int uiFP_AttributeMap[FRAG_ATTRIB_MAX];
 	unsigned int uiFP_OutputMap[FRAG_RESULT_MAX];
@@ -330,9 +422,6 @@ typedef struct r700_AssemblerBase
 	unsigned int FCSP;
 	FC_LEVEL fc_stack[32];
 
-	unsigned int branch_depth;
-	unsigned int max_branch_depth;
-
 	//-----------------------------------------------------------------------------------
 	// ArgSubst used in Assemble_Source() function
 	//-----------------------------------------------------------------------------------
@@ -373,11 +462,33 @@ typedef struct r700_AssemblerBase
     SHADER_PIPE_TYPE currentShaderType;
     struct prog_instruction * pILInst;
     GLuint             uiCurInst;
+    GLubyte SamplerUnits[MAX_SAMPLERS];
     GLboolean   bR6xx;
     /* helper to decide which type of instruction to assemble */
     GLboolean is_tex;
     /* we inserted helper intructions and need barrier on next TEX ins */ 
     GLboolean need_tex_barrier; 
+
+    SUB_OFFSET     * subs;
+    GLuint           unSubArraySize;
+    GLuint           unSubArrayPointer;
+    CALLER_POINTER * callers;
+    GLuint           unCallerArraySize;
+    GLuint           unCallerArrayPointer;
+    unsigned int     CALLSP;
+    CALL_LEVEL       CALLSTACK[SQ_MAX_CALL_DEPTH];
+
+    GLuint unCFflags;
+
+    PRESUB_DESC * presubs;
+    GLuint        unPresubArraySize;
+    GLuint        unNumPresub;
+    GLuint        unCurNumILInsts;
+
+    GLuint    unVetTexBits;
+
+    GLuint    shadow_regs[R700_MAX_TEXTURE_UNITS];
+
 } r700_AssemblerBase;
 
 //Internal use
@@ -399,7 +510,7 @@ BITS is_depth_component_exported(OUT_FRAGMENT_FMT_0* pFPOutFmt) ;
 GLboolean is_reduction_opcode(PVSDWORD * dest);
 GLuint GetSurfaceFormat(GLenum eType, GLuint nChannels, GLuint * pClient_size);
 
-unsigned int r700GetNumOperands(r700_AssemblerBase* pAsm);
+unsigned int r700GetNumOperands(GLuint opcode, GLuint nIsOp3);
 
 GLboolean IsTex(gl_inst_opcode Opcode);
 GLboolean IsAlu(gl_inst_opcode Opcode);
@@ -422,6 +533,7 @@ GLboolean assemble_vfetch_instruction2(r700_AssemblerBase* pAsm,
                                        GLubyte             element,
                                        GLuint              _signed,
                                        GLboolean           normalize,
+                                       GLenum              format,
                                        VTX_FETCH_METHOD  * pFetchMethod);
 GLboolean cleanup_vfetch_instructions(r700_AssemblerBase* pAsm);
 GLuint gethelpr(r700_AssemblerBase* pAsm);
@@ -446,6 +558,10 @@ GLboolean assemble_alu_src(R700ALUInstruction*  alu_instruction_ptr,
 GLboolean add_alu_instruction(r700_AssemblerBase* pAsm,
                               R700ALUInstruction* alu_instruction_ptr,
                               GLuint              contiguous_slots_needed);
+
+GLboolean add_cf_instruction(r700_AssemblerBase* pAsm);
+void add_return_inst(r700_AssemblerBase *pAsm);
+
 void get_src_properties(R700ALUInstruction*  alu_instruction_ptr,
                         int                  source_index,
                         BITS*                psrc_sel,
@@ -467,13 +583,20 @@ GLboolean check_vector(r700_AssemblerBase* pAsm,
                        R700ALUInstruction* alu_instruction_ptr);
 GLboolean assemble_alu_instruction(r700_AssemblerBase *pAsm);
 GLboolean next_ins(r700_AssemblerBase *pAsm);
+
+GLboolean pops(r700_AssemblerBase *pAsm, GLuint pops);
+GLboolean jumpToOffest(r700_AssemblerBase *pAsm, GLuint pops, GLint offset);
+GLboolean setRetInLoopFlag(r700_AssemblerBase *pAsm, GLuint flagValue);
+GLboolean testFlag(r700_AssemblerBase *pAsm);
+GLboolean breakLoopOnFlag(r700_AssemblerBase *pAsm, GLuint unFCSP);
+GLboolean returnOnFlag(r700_AssemblerBase *pAsm, GLuint unIF);
+
 GLboolean assemble_math_function(r700_AssemblerBase* pAsm, BITS opcode);
 GLboolean assemble_ABS(r700_AssemblerBase *pAsm);
 GLboolean assemble_ADD(r700_AssemblerBase *pAsm);
 GLboolean assemble_ARL(r700_AssemblerBase *pAsm);
 GLboolean assemble_BAD(char *opcode_str);
 GLboolean assemble_CMP(r700_AssemblerBase *pAsm);
-GLboolean assemble_COS(r700_AssemblerBase *pAsm);
 GLboolean assemble_DOT(r700_AssemblerBase *pAsm);
 GLboolean assemble_DST(r700_AssemblerBase *pAsm);
 GLboolean assemble_EX2(r700_AssemblerBase *pAsm);
@@ -481,7 +604,7 @@ GLboolean assemble_EXP(r700_AssemblerBase *pAsm);
 GLboolean assemble_FLR(r700_AssemblerBase *pAsm);
 GLboolean assemble_FLR_INT(r700_AssemblerBase *pAsm);
 GLboolean assemble_FRC(r700_AssemblerBase *pAsm);
-GLboolean assemble_KIL(r700_AssemblerBase *pAsm);
+GLboolean assemble_KIL(r700_AssemblerBase *pAsm, GLuint opcode);
 GLboolean assemble_LG2(r700_AssemblerBase *pAsm);
 GLboolean assemble_LRP(r700_AssemblerBase *pAsm);
 GLboolean assemble_LOG(r700_AssemblerBase *pAsm);
@@ -494,17 +617,37 @@ GLboolean assemble_MUL(r700_AssemblerBase *pAsm);
 GLboolean assemble_POW(r700_AssemblerBase *pAsm);
 GLboolean assemble_RCP(r700_AssemblerBase *pAsm);
 GLboolean assemble_RSQ(r700_AssemblerBase *pAsm);
-GLboolean assemble_SIN(r700_AssemblerBase *pAsm);
 GLboolean assemble_SCS(r700_AssemblerBase *pAsm);
 GLboolean assemble_SGE(r700_AssemblerBase *pAsm);
+
+GLboolean assemble_LOGIC(r700_AssemblerBase *pAsm, BITS opcode);
+GLboolean assemble_LOGIC_PRED(r700_AssemblerBase *pAsm, BITS opcode); 
+GLboolean assemble_TRIG(r700_AssemblerBase *pAsm, BITS opcode);
+
 GLboolean assemble_SLT(r700_AssemblerBase *pAsm);
 GLboolean assemble_STP(r700_AssemblerBase *pAsm);
 GLboolean assemble_TEX(r700_AssemblerBase *pAsm);
 GLboolean assemble_XPD(r700_AssemblerBase *pAsm);
 GLboolean assemble_EXPORT(r700_AssemblerBase *pAsm);
-GLboolean assemble_IF(r700_AssemblerBase *pAsm);
+GLboolean assemble_IF(r700_AssemblerBase *pAsm, GLboolean bHasElse);
+GLboolean assemble_ELSE(r700_AssemblerBase *pAsm);
 GLboolean assemble_ENDIF(r700_AssemblerBase *pAsm);
 
+GLboolean assemble_BGNLOOP(r700_AssemblerBase *pAsm);
+GLboolean assemble_BRK(r700_AssemblerBase *pAsm);
+GLboolean assemble_COND(r700_AssemblerBase *pAsm);
+GLboolean assemble_ENDLOOP(r700_AssemblerBase *pAsm);
+
+GLboolean assemble_BGNSUB(r700_AssemblerBase *pAsm, GLint nILindex, GLuint uiIL_Shift);
+GLboolean assemble_ENDSUB(r700_AssemblerBase *pAsm);
+GLboolean assemble_RET(r700_AssemblerBase *pAsm);
+GLboolean assemble_CAL(r700_AssemblerBase *pAsm, 
+                       GLint nILindex,
+                       GLuint uiIL_Offest,
+                       GLuint uiNumberInsts,
+                       struct prog_instruction *pILInst,
+                       PRESUB_DESC * pPresubDesc);
+
 GLboolean Process_Export(r700_AssemblerBase* pAsm,
                          GLuint type, 
                          GLuint export_starting_index,
@@ -514,14 +657,25 @@ GLboolean Process_Export(r700_AssemblerBase* pAsm,
 GLboolean Move_Depth_Exports_To_Correct_Channels(r700_AssemblerBase *pAsm, 
                                                  BITS depth_channel_select);
 
+GLboolean callPreSub(r700_AssemblerBase* pAsm, 
+                     LOADABLE_SCRIPT_SIGNITURE scriptSigniture,
+                     /* struct prog_instruction ** pILInstParent, */
+                     COMPILED_SUB * pCompiledSub,                                            
+                     GLshort uOutReg,
+                     GLshort uNumValidSrc);
 
 //Interface
-GLboolean AssembleInstr(GLuint uiNumberInsts,
+GLboolean AssembleInstr(GLuint uiFirstInst,
+                        GLuint uiIL_Shift,
+                        GLuint uiNumberInsts,
                         struct prog_instruction *pILInst, 
 						r700_AssemblerBase *pR700AsmCode);
 GLboolean Process_Fragment_Exports(r700_AssemblerBase *pR700AsmCode, GLbitfield OutputsWritten);  
 GLboolean Process_Vertex_Exports(r700_AssemblerBase *pR700AsmCode, GLbitfield OutputsWritten);
 
+GLboolean RelocProgram(r700_AssemblerBase * pAsm, struct gl_program * pILProg);
+GLboolean InitShaderProgram(r700_AssemblerBase * pAsm);
+
 int       Init_r700_AssemblerBase(SHADER_PIPE_TYPE spt, r700_AssemblerBase* pAsm, R700_Shader* pShader);
 GLboolean Clean_Up_Assembler(r700_AssemblerBase *pR700AsmCode);
 
diff --git a/src/mesa/drivers/dri/r600/r700_chip.c b/src/mesa/drivers/dri/r600/r700_chip.c
index 47b38d2e36..3bc2d2ba02 100644
--- a/src/mesa/drivers/dri/r600/r700_chip.c
+++ b/src/mesa/drivers/dri/r600/r700_chip.c
@@ -45,6 +45,9 @@ static void r700SendTexState(GLcontext *ctx, struct radeon_state_atom *atom)
 {
 	context_t         *context = R700_CONTEXT(ctx);
 	R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&context->hw);
+
+    struct r700_vertex_program *vp = context->selected_vp;
+
 	struct radeon_bo *bo = NULL;
 	unsigned int i;
 	BATCH_LOCALS(&context->radeon);
@@ -52,13 +55,14 @@ static void r700SendTexState(GLcontext *ctx, struct radeon_state_atom *atom)
 	radeon_print(RADEON_STATE, RADEON_VERBOSE, "%s\n", __func__);
 
 	for (i = 0; i < R700_TEXTURE_NUMBERUNITS; i++) {
-		if (ctx->Texture.Unit[i]._ReallyEnabled) {
+		if (ctx->Texture.Unit[i]._ReallyEnabled) {            
 			radeonTexObj *t = r700->textures[i];
 			if (t) {
-				if (!t->image_override)
+				if (!t->image_override) {
 					bo = t->mt->bo;
-				else
+				} else {
 					bo = t->bo;
+				}
 				if (bo) {
 
 					r700SyncSurf(context, bo,
@@ -67,7 +71,16 @@ static void r700SendTexState(GLcontext *ctx, struct radeon_state_atom *atom)
 
 					BEGIN_BATCH_NO_AUTOSTATE(9 + 4);
 					R600_OUT_BATCH(CP_PACKET3(R600_IT_SET_RESOURCE, 7));
-					R600_OUT_BATCH(i * 7);
+
+                    if( (1<<i) & vp->r700AsmCode.unVetTexBits )                    
+                    {   /* vs texture */                                     
+                        R600_OUT_BATCH((i + VERT_ATTRIB_MAX + SQ_FETCH_RESOURCE_VS_OFFSET) * FETCH_RESOURCE_STRIDE);
+                    }
+                    else
+                    {
+					    R600_OUT_BATCH(i * 7);
+                    }
+
 					R600_OUT_BATCH(r700->textures[i]->SQ_TEX_RESOURCE0);
 					R600_OUT_BATCH(r700->textures[i]->SQ_TEX_RESOURCE1);
 					R600_OUT_BATCH(r700->textures[i]->SQ_TEX_RESOURCE2);
@@ -77,7 +90,7 @@ static void r700SendTexState(GLcontext *ctx, struct radeon_state_atom *atom)
 					R600_OUT_BATCH(r700->textures[i]->SQ_TEX_RESOURCE6);
 					R600_OUT_BATCH_RELOC(r700->textures[i]->SQ_TEX_RESOURCE2,
 							     bo,
-							     0,
+							     r700->textures[i]->SQ_TEX_RESOURCE2,
 							     RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0);
 					R600_OUT_BATCH_RELOC(r700->textures[i]->SQ_TEX_RESOURCE3,
 							     bo,
@@ -91,21 +104,35 @@ static void r700SendTexState(GLcontext *ctx, struct radeon_state_atom *atom)
 	}
 }
 
+#define SAMPLER_STRIDE                 3
+
 static void r700SendTexSamplerState(GLcontext *ctx, struct radeon_state_atom *atom)
 {
 	context_t         *context = R700_CONTEXT(ctx);
 	R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&context->hw);
 	unsigned int i;
+
+    struct r700_vertex_program *vp = context->selected_vp;
+
 	BATCH_LOCALS(&context->radeon);
 	radeon_print(RADEON_STATE, RADEON_VERBOSE, "%s\n", __func__);
 
 	for (i = 0; i < R700_TEXTURE_NUMBERUNITS; i++) {
-		if (ctx->Texture.Unit[i]._ReallyEnabled) {
+		if (ctx->Texture.Unit[i]._ReallyEnabled) {            
 			radeonTexObj *t = r700->textures[i];
 			if (t) {
 				BEGIN_BATCH_NO_AUTOSTATE(5);
 				R600_OUT_BATCH(CP_PACKET3(R600_IT_SET_SAMPLER, 3));
-				R600_OUT_BATCH(i * 3);
+
+                if( (1<<i) & vp->r700AsmCode.unVetTexBits )                    
+                {   /* vs texture */
+                    R600_OUT_BATCH((i+SQ_TEX_SAMPLER_VS_OFFSET) * SAMPLER_STRIDE); //work 1
+                }
+                else
+                {
+				    R600_OUT_BATCH(i * 3);
+                }
+
 				R600_OUT_BATCH(r700->textures[i]->SQ_TEX_SAMPLER0);
 				R600_OUT_BATCH(r700->textures[i]->SQ_TEX_SAMPLER1);
 				R600_OUT_BATCH(r700->textures[i]->SQ_TEX_SAMPLER2);
@@ -442,68 +469,77 @@ static void r700SendRenderTargetState(GLcontext *ctx, struct radeon_state_atom *
 
 static void r700SendPSState(GLcontext *ctx, struct radeon_state_atom *atom)
 {
-	context_t *context = R700_CONTEXT(ctx);
-	R700_CHIP_CONTEXT *r700 = R700_CONTEXT_STATES(context);
-	struct radeon_bo * pbo;
-	BATCH_LOCALS(&context->radeon);
-	radeon_print(RADEON_STATE, RADEON_VERBOSE, "%s\n", __func__);
+    context_t *context = R700_CONTEXT(ctx);
+    R700_CHIP_CONTEXT *r700 = R700_CONTEXT_STATES(context);
+    struct radeon_bo * pbo;
+    BATCH_LOCALS(&context->radeon);
+    radeon_print(RADEON_STATE, RADEON_VERBOSE, "%s\n", __func__);
 
-	pbo = (struct radeon_bo *)r700GetActiveFpShaderBo(GL_CONTEXT(context));
+    pbo = (struct radeon_bo *)r700GetActiveFpShaderBo(GL_CONTEXT(context));
 
-	if (!pbo)
-		return;
+    if (!pbo)
+	    return;
 
-	r700SyncSurf(context, pbo, RADEON_GEM_DOMAIN_GTT, 0, SH_ACTION_ENA_bit);
+    r700SyncSurf(context, pbo, RADEON_GEM_DOMAIN_GTT, 0, SH_ACTION_ENA_bit);
 
-        BEGIN_BATCH_NO_AUTOSTATE(3 + 2);
-	R600_OUT_BATCH_REGSEQ(SQ_PGM_START_PS, 1);
-	R600_OUT_BATCH(r700->ps.SQ_PGM_START_PS.u32All);
-	R600_OUT_BATCH_RELOC(r700->ps.SQ_PGM_START_PS.u32All,
-			     pbo,
-			     r700->ps.SQ_PGM_START_PS.u32All,
-			     RADEON_GEM_DOMAIN_GTT, 0, 0);
-	END_BATCH();
+    BEGIN_BATCH_NO_AUTOSTATE(3 + 2);
+    R600_OUT_BATCH_REGSEQ(SQ_PGM_START_PS, 1);
+    R600_OUT_BATCH(r700->ps.SQ_PGM_START_PS.u32All);
+    R600_OUT_BATCH_RELOC(r700->ps.SQ_PGM_START_PS.u32All,
+		         pbo,
+		         r700->ps.SQ_PGM_START_PS.u32All,
+		         RADEON_GEM_DOMAIN_GTT, 0, 0);
+    END_BATCH();
 
-        BEGIN_BATCH_NO_AUTOSTATE(9);
-	R600_OUT_BATCH_REGVAL(SQ_PGM_RESOURCES_PS, r700->ps.SQ_PGM_RESOURCES_PS.u32All);
-	R600_OUT_BATCH_REGVAL(SQ_PGM_EXPORTS_PS, r700->ps.SQ_PGM_EXPORTS_PS.u32All);
-	R600_OUT_BATCH_REGVAL(SQ_PGM_CF_OFFSET_PS, r700->ps.SQ_PGM_CF_OFFSET_PS.u32All);
-        END_BATCH();
+    BEGIN_BATCH_NO_AUTOSTATE(9);
+    R600_OUT_BATCH_REGVAL(SQ_PGM_RESOURCES_PS, r700->ps.SQ_PGM_RESOURCES_PS.u32All);
+    R600_OUT_BATCH_REGVAL(SQ_PGM_EXPORTS_PS, r700->ps.SQ_PGM_EXPORTS_PS.u32All);
+    R600_OUT_BATCH_REGVAL(SQ_PGM_CF_OFFSET_PS, r700->ps.SQ_PGM_CF_OFFSET_PS.u32All);
+    END_BATCH();
 
-	COMMIT_BATCH();
+    BEGIN_BATCH_NO_AUTOSTATE(3);
+    R600_OUT_BATCH_REGVAL(SQ_LOOP_CONST_0, 0x01000FFF);
+    END_BATCH();
+
+    COMMIT_BATCH();
 
 }
 
 static void r700SendVSState(GLcontext *ctx, struct radeon_state_atom *atom)
 {
-	context_t *context = R700_CONTEXT(ctx);
-	R700_CHIP_CONTEXT *r700 = R700_CONTEXT_STATES(context);
-	struct radeon_bo * pbo;
-	BATCH_LOCALS(&context->radeon);
-	radeon_print(RADEON_STATE, RADEON_VERBOSE, "%s\n", __func__);
+    context_t *context = R700_CONTEXT(ctx);
+    R700_CHIP_CONTEXT *r700 = R700_CONTEXT_STATES(context);
+    struct radeon_bo * pbo;
+    BATCH_LOCALS(&context->radeon);
+    radeon_print(RADEON_STATE, RADEON_VERBOSE, "%s\n", __func__);
 
-	pbo = (struct radeon_bo *)r700GetActiveVpShaderBo(GL_CONTEXT(context));
+    pbo = (struct radeon_bo *)r700GetActiveVpShaderBo(GL_CONTEXT(context));
 
-	if (!pbo)
-		return;
+    if (!pbo)
+	    return;
 
-	r700SyncSurf(context, pbo, RADEON_GEM_DOMAIN_GTT, 0, SH_ACTION_ENA_bit);
+    r700SyncSurf(context, pbo, RADEON_GEM_DOMAIN_GTT, 0, SH_ACTION_ENA_bit);
 
-        BEGIN_BATCH_NO_AUTOSTATE(3 + 2);
-	R600_OUT_BATCH_REGSEQ(SQ_PGM_START_VS, 1);
-	R600_OUT_BATCH(r700->vs.SQ_PGM_START_VS.u32All);
-	R600_OUT_BATCH_RELOC(r700->vs.SQ_PGM_START_VS.u32All,
-			     pbo,
-			     r700->vs.SQ_PGM_START_VS.u32All,
-			     RADEON_GEM_DOMAIN_GTT, 0, 0);
-	END_BATCH();
+    BEGIN_BATCH_NO_AUTOSTATE(3 + 2);
+    R600_OUT_BATCH_REGSEQ(SQ_PGM_START_VS, 1);
+    R600_OUT_BATCH(r700->vs.SQ_PGM_START_VS.u32All);
+    R600_OUT_BATCH_RELOC(r700->vs.SQ_PGM_START_VS.u32All,
+		         pbo,
+		         r700->vs.SQ_PGM_START_VS.u32All,
+		         RADEON_GEM_DOMAIN_GTT, 0, 0);
+    END_BATCH();
 
-        BEGIN_BATCH_NO_AUTOSTATE(6);
-	R600_OUT_BATCH_REGVAL(SQ_PGM_RESOURCES_VS, r700->vs.SQ_PGM_RESOURCES_VS.u32All);
-	R600_OUT_BATCH_REGVAL(SQ_PGM_CF_OFFSET_VS, r700->vs.SQ_PGM_CF_OFFSET_VS.u32All);
-        END_BATCH();
+    BEGIN_BATCH_NO_AUTOSTATE(6);
+    R600_OUT_BATCH_REGVAL(SQ_PGM_RESOURCES_VS, r700->vs.SQ_PGM_RESOURCES_VS.u32All);
+    R600_OUT_BATCH_REGVAL(SQ_PGM_CF_OFFSET_VS, r700->vs.SQ_PGM_CF_OFFSET_VS.u32All);
+    END_BATCH();
 
-	COMMIT_BATCH();
+    BEGIN_BATCH_NO_AUTOSTATE(3);
+    R600_OUT_BATCH_REGVAL((SQ_LOOP_CONST_0 + 32*4), 0x0100000F);
+    //R600_OUT_BATCH_REGVAL((SQ_LOOP_CONST_0 + (SQ_LOOP_CONST_vs<<2)), 0x0100000F);
+    END_BATCH();
+
+    COMMIT_BATCH();
 }
 
 static void r700SendFSState(GLcontext *ctx, struct radeon_state_atom *atom)
@@ -784,8 +820,7 @@ static void r700SendDBState(GLcontext *ctx, struct radeon_state_atom *atom)
 	BATCH_LOCALS(&context->radeon);
 	radeon_print(RADEON_STATE, RADEON_VERBOSE, "%s\n", __func__);
 
-        BEGIN_BATCH_NO_AUTOSTATE(23);
-	R600_OUT_BATCH_REGVAL(DB_HTILE_DATA_BASE, r700->DB_HTILE_DATA_BASE.u32All);
+	BEGIN_BATCH_NO_AUTOSTATE(17);
 
 	R600_OUT_BATCH_REGSEQ(DB_STENCIL_CLEAR, 2);
 	R600_OUT_BATCH(r700->DB_STENCIL_CLEAR.u32All);
@@ -798,7 +833,6 @@ static void r700SendDBState(GLcontext *ctx, struct radeon_state_atom *atom)
 	R600_OUT_BATCH(r700->DB_RENDER_CONTROL.u32All);
 	R600_OUT_BATCH(r700->DB_RENDER_OVERRIDE.u32All);
 
-	R600_OUT_BATCH_REGVAL(DB_HTILE_SURFACE, r700->DB_HTILE_SURFACE.u32All);
 	R600_OUT_BATCH_REGVAL(DB_ALPHA_TO_MASK, r700->DB_ALPHA_TO_MASK.u32All);
 
 	END_BATCH();
@@ -1152,7 +1186,11 @@ static int check_blnd(GLcontext *ctx, struct radeon_state_atom *atom)
 		count += 3;
 
 	if (context->radeon.radeonScreen->chip_family > CHIP_FAMILY_R600) {
-		for (ui = 0; ui < R700_MAX_RENDER_TARGETS; ui++) {
+		/* Targets are enabled in r700SetRenderTarget, but the state
+		   size is calculated before that. Until MRTs are implemented,
+		   hardcode target 0 as enabled. */
+		count += 3;
+		for (ui = 1; ui < R700_MAX_RENDER_TARGETS; ui++) {
                         if (r700->render_target[ui].enabled)
 				count += 3;
 		}
@@ -1282,7 +1320,7 @@ void r600InitAtoms(context_t *context)
 	context->radeon.hw.atomlist.name = "atom-list";
 
 	ALLOC_STATE(sq, always, 34, r700SendSQConfig);
-	ALLOC_STATE(db, always, 23, r700SendDBState);
+	ALLOC_STATE(db, always, 17, r700SendDBState);
 	ALLOC_STATE(stencil, always, 4, r700SendStencilState);
 	ALLOC_STATE(db_target, always, 12, r700SendDepthTargetState);
 	ALLOC_STATE(sc, always, 15, r700SendSCState);
@@ -1295,16 +1333,16 @@ void r600InitAtoms(context_t *context)
 	ALLOC_STATE(poly, always, 10, r700SendPolyState);
 	ALLOC_STATE(cb, cb, 18, r700SendCBState);
 	ALLOC_STATE(clrcmp, always, 6, r700SendCBCLRCMPState);
+	ALLOC_STATE(cb_target, always, 25, r700SendRenderTargetState);
 	ALLOC_STATE(blnd, blnd, (6 + (R700_MAX_RENDER_TARGETS * 3)), r700SendCBBlendState);
 	ALLOC_STATE(blnd_clr, always, 6, r700SendCBBlendColorState);
-	ALLOC_STATE(cb_target, always, 25, r700SendRenderTargetState);
 	ALLOC_STATE(sx, always, 9, r700SendSXState);
 	ALLOC_STATE(vgt, always, 41, r700SendVGTState);
 	ALLOC_STATE(spi, always, (59 + R700_MAX_SHADER_EXPORTS), r700SendSPIState);
 	ALLOC_STATE(vpt, always, 16, r700SendViewportState);
 	ALLOC_STATE(fs, always, 18, r700SendFSState);
-	ALLOC_STATE(vs, always, 18, r700SendVSState);
-	ALLOC_STATE(ps, always, 21, r700SendPSState);
+	ALLOC_STATE(vs, always, 21, r700SendVSState);
+	ALLOC_STATE(ps, always, 24, r700SendPSState);
 	ALLOC_STATE(vs_consts, vs_consts, (2 + (R700_MAX_DX9_CONSTS * 4)), r700SendVSConsts);
 	ALLOC_STATE(ps_consts, ps_consts, (2 + (R700_MAX_DX9_CONSTS * 4)), r700SendPSConsts);
 	ALLOC_STATE(vtx, vtx, (6 + (VERT_ATTRIB_MAX * 18)), r700SendVTXState);
diff --git a/src/mesa/drivers/dri/r600/r700_clear.c b/src/mesa/drivers/dri/r600/r700_clear.c
index c6546ab00c..98bfdd0937 100644
--- a/src/mesa/drivers/dri/r600/r700_clear.c
+++ b/src/mesa/drivers/dri/r600/r700_clear.c
@@ -49,14 +49,18 @@ static GLboolean r700ClearFast(context_t *context, GLbitfield mask)
 void r700Clear(GLcontext * ctx, GLbitfield mask)
 {
     context_t *context = R700_CONTEXT(ctx);
-    __DRIdrawablePrivate *dPriv = radeon_get_drawable(&context->radeon);
-    const GLuint colorMask = *((GLuint *) & ctx->Color.ColorMask);
+    __DRIdrawable *dPriv = radeon_get_drawable(&context->radeon);
+    const GLuint colorMask = *((GLuint *) & ctx->Color.ColorMask[0]);
     GLbitfield swrast_mask = 0, tri_mask = 0;
     int i;
     struct gl_framebuffer *fb = ctx->DrawBuffer;
 
     radeon_print(RADEON_RENDER, RADEON_VERBOSE, "%s %x\n", __func__, mask);
 
+    if (mask & (BUFFER_BIT_FRONT_LEFT | BUFFER_BIT_FRONT_RIGHT)) {
+        context->radeon.front_buffer_dirty = GL_TRUE;
+    }
+
     if( GL_TRUE == r700ClearFast(context, mask) )
     {
         return;
diff --git a/src/mesa/drivers/dri/r600/r700_fragprog.c b/src/mesa/drivers/dri/r600/r700_fragprog.c
index 0f549ead9c..84d51e6606 100644
--- a/src/mesa/drivers/dri/r600/r700_fragprog.c
+++ b/src/mesa/drivers/dri/r600/r700_fragprog.c
@@ -34,6 +34,7 @@
 #include "main/imports.h"
 #include "shader/prog_parameter.h"
 #include "shader/prog_statevars.h"
+#include "shader/program.h"
 
 #include "r600_context.h"
 #include "r600_cmdbuf.h"
@@ -42,14 +43,68 @@
 
 #include "r700_debug.h"
 
+void insert_wpos_code(GLcontext *ctx, struct gl_fragment_program *fprog)
+{
+    static const gl_state_index winstate[STATE_LENGTH]
+         = { STATE_INTERNAL, STATE_FB_SIZE, 0, 0, 0};
+    struct prog_instruction *newInst, *inst;
+    GLint  win_size;  /* state reference */
+    GLuint wpos_temp; /* temp register */
+    int i, j;
+
+    /* PARAM win_size = STATE_FB_SIZE */
+    win_size = _mesa_add_state_reference(fprog->Base.Parameters, winstate);
+
+    wpos_temp = fprog->Base.NumTemporaries++;
+
+    /* scan the program for source operands that read WPOS and redirect them to wpos_temp */
+    inst = fprog->Base.Instructions;
+    for (i = 0; i < fprog->Base.NumInstructions; i++) {
+        for (j=0; j < 3; j++) {
+            if(inst->SrcReg[j].File == PROGRAM_INPUT && 
+               inst->SrcReg[j].Index == FRAG_ATTRIB_WPOS) {
+                inst->SrcReg[j].File = PROGRAM_TEMPORARY;
+                inst->SrcReg[j].Index = wpos_temp;
+            }
+        }
+        inst++;
+    }
+
+    _mesa_insert_instructions(&(fprog->Base), 0, 1);
+
+    newInst = fprog->Base.Instructions;
+    /* invert wpos.y relative to the framebuffer size:
+     * wpos_temp = (wpos.x, -wpos.y, wpos.z, wpos.w) + (0, winsize.y, 0, 0) */
+    newInst[0].Opcode = OPCODE_ADD;
+    newInst[0].DstReg.File = PROGRAM_TEMPORARY;
+    newInst[0].DstReg.Index = wpos_temp;
+    newInst[0].DstReg.WriteMask = WRITEMASK_XYZW;
+
+    newInst[0].SrcReg[0].File = PROGRAM_INPUT;
+    newInst[0].SrcReg[0].Index = FRAG_ATTRIB_WPOS;
+    newInst[0].SrcReg[0].Swizzle = SWIZZLE_XYZW;
+    newInst[0].SrcReg[0].Negate = NEGATE_Y;
+
+    newInst[0].SrcReg[1].File = PROGRAM_STATE_VAR;
+    newInst[0].SrcReg[1].Index = win_size;
+    newInst[0].SrcReg[1].Swizzle = MAKE_SWIZZLE4(SWIZZLE_ZERO, SWIZZLE_Y, SWIZZLE_ZERO, SWIZZLE_ZERO);
+
+}
+
 //TODO : Validate FP input with VP output.
 void Map_Fragment_Program(r700_AssemblerBase         *pAsm,
-						  struct gl_fragment_program *mesa_fp)
+						  struct gl_fragment_program *mesa_fp,
+                          GLcontext *ctx) 
 {
 	unsigned int unBit;
     unsigned int i;
     GLuint       ui;
 
+    /* match fp inputs with vp exports. */
+    struct r700_vertex_program_cont *vpc =
+		       (struct r700_vertex_program_cont *)ctx->VertexProgram._Current;
+    GLbitfield OutputsWritten = vpc->mesa_program.Base.OutputsWritten;
+    
 	pAsm->number_used_registers = 0;
 
 //Input mapping : mesa_fp->Base.InputsRead set the flag, set in 
@@ -61,32 +116,99 @@ void Map_Fragment_Program(r700_AssemblerBase         *pAsm,
 		pAsm->uiFP_AttributeMap[FRAG_ATTRIB_WPOS] = pAsm->number_used_registers++;
 	}
 
-	unBit = 1 << FRAG_ATTRIB_COL0;
-	if(mesa_fp->Base.InputsRead & unBit)
+    unBit = 1 << VERT_RESULT_COL0;
+	if(OutputsWritten & unBit)
 	{
 		pAsm->uiFP_AttributeMap[FRAG_ATTRIB_COL0] = pAsm->number_used_registers++;
 	}
 
-	unBit = 1 << FRAG_ATTRIB_COL1;
-	if(mesa_fp->Base.InputsRead & unBit)
+	unBit = 1 << VERT_RESULT_COL1;
+	if(OutputsWritten & unBit)
 	{
 		pAsm->uiFP_AttributeMap[FRAG_ATTRIB_COL1] = pAsm->number_used_registers++;
 	}
 
-        unBit = 1 << FRAG_ATTRIB_FOGC;
-        if(mesa_fp->Base.InputsRead & unBit)
-        {
-                pAsm->uiFP_AttributeMap[FRAG_ATTRIB_FOGC] = pAsm->number_used_registers++;
-        }
+    unBit = 1 << VERT_RESULT_FOGC;
+    if(OutputsWritten & unBit)
+    {
+        pAsm->uiFP_AttributeMap[FRAG_ATTRIB_FOGC] = pAsm->number_used_registers++;
+    }
 
 	for(i=0; i<8; i++)
 	{
-		unBit = 1 << (FRAG_ATTRIB_TEX0 + i);
-		if(mesa_fp->Base.InputsRead & unBit)
+		unBit = 1 << (VERT_RESULT_TEX0 + i);
+		if(OutputsWritten & unBit)
 		{
 			pAsm->uiFP_AttributeMap[FRAG_ATTRIB_TEX0 + i] = pAsm->number_used_registers++;
 		}
 	}
+ 
+/* order has been taken care of */ 
+#if 1
+    for(i=VERT_RESULT_VAR0; i<VERT_RESULT_MAX; i++)
+	{
+        unBit = 1 << i;
+        if(OutputsWritten & unBit)
+		{
+            pAsm->uiFP_AttributeMap[i-VERT_RESULT_VAR0+FRAG_ATTRIB_VAR0] = pAsm->number_used_registers++;
+        }
+    }
+#else
+    if( (mesa_fp->Base.InputsRead >> FRAG_ATTRIB_VAR0) > 0 )
+    {
+	    struct r700_vertex_program_cont *vpc =
+		       (struct r700_vertex_program_cont *)ctx->VertexProgram._Current;
+        struct gl_program_parameter_list * VsVarying = vpc->mesa_program.Base.Varying;
+        struct gl_program_parameter_list * PsVarying = mesa_fp->Base.Varying;
+        struct gl_program_parameter      * pVsParam;
+        struct gl_program_parameter      * pPsParam;
+        GLuint j, k;
+        GLuint unMaxVarying = 0;
+
+        for(i=0; i<VsVarying->NumParameters; i++)
+        {
+            pAsm->uiFP_AttributeMap[i + FRAG_ATTRIB_VAR0] = 0;
+        }
+
+        for(i=FRAG_ATTRIB_VAR0; i<FRAG_ATTRIB_MAX; i++)
+	    {
+            unBit = 1 << i;
+            if(mesa_fp->Base.InputsRead & unBit)
+		    {
+                j = i - FRAG_ATTRIB_VAR0;
+                pPsParam = PsVarying->Parameters + j;
+
+                for(k=0; k<VsVarying->NumParameters; k++)
+                {					
+                    pVsParam = VsVarying->Parameters + k;
+
+			        if( strcmp(pPsParam->Name, pVsParam->Name) == 0)
+                    {
+                        pAsm->uiFP_AttributeMap[i] = pAsm->number_used_registers + k;                  
+                        if(k > unMaxVarying)
+                        {
+                            unMaxVarying = k;
+                        }
+                        break;
+                    }
+                }
+		    }
+        }
+
+        pAsm->number_used_registers += unMaxVarying + 1;
+    }
+#endif
+    unBit = 1 << FRAG_ATTRIB_FACE;
+    if(mesa_fp->Base.InputsRead & unBit)
+    {
+        pAsm->uiFP_AttributeMap[FRAG_ATTRIB_FACE] = pAsm->number_used_registers++;
+    }
+
+    unBit = 1 << FRAG_ATTRIB_PNTC;
+    if(mesa_fp->Base.InputsRead & unBit)
+    {
+        pAsm->uiFP_AttributeMap[FRAG_ATTRIB_PNTC] = pAsm->number_used_registers++;
+    }
 
 /* Map temporary registers (GPRs) */
     pAsm->starting_temp_register_number = pAsm->number_used_registers;
@@ -127,6 +249,8 @@ void Map_Fragment_Program(r700_AssemblerBase         *pAsm,
         pAsm->pucOutMask[ui] = 0x0;
     }
 
+    pAsm->flag_reg_index = pAsm->number_used_registers++;
+
     pAsm->uFirstHelpReg = pAsm->number_used_registers;
 }
 
@@ -233,22 +357,61 @@ GLboolean Find_Instruction_Dependencies_fp(struct r700_fragment_program *fp,
 }
 
 GLboolean r700TranslateFragmentShader(struct r700_fragment_program *fp,
-							     struct gl_fragment_program   *mesa_fp)
+							     struct gl_fragment_program   *mesa_fp,
+                                 GLcontext *ctx) 
 {
 	GLuint    number_of_colors_exported;
 	GLboolean z_enabled = GL_FALSE;
-	GLuint    unBit;
+	GLuint    unBit, shadow_unit;
+	int i;
+	struct prog_instruction *inst;
+	gl_state_index shadow_ambient[STATE_LENGTH]
+	    = { STATE_INTERNAL, STATE_SHADOW_AMBIENT, 0, 0, 0};
 
     //Init_Program
 	Init_r700_AssemblerBase( SPT_FP, &(fp->r700AsmCode), &(fp->r700Shader) );
-	Map_Fragment_Program(&(fp->r700AsmCode), mesa_fp);
+
+    if(mesa_fp->Base.InputsRead & FRAG_BIT_WPOS)
+    {
+        insert_wpos_code(ctx, mesa_fp);
+    }
+
+    /* add/map constants for ARB_shadow_ambient */
+    if(mesa_fp->Base.ShadowSamplers)
+    {
+        inst = mesa_fp->Base.Instructions;
+        for (i = 0; i < mesa_fp->Base.NumInstructions; i++)
+        {
+            if(inst->TexShadow == 1)
+            {
+                shadow_unit = inst->TexSrcUnit;
+                shadow_ambient[2] = shadow_unit;
+                fp->r700AsmCode.shadow_regs[shadow_unit] = 
+                    _mesa_add_state_reference(mesa_fp->Base.Parameters, shadow_ambient);
+            }
+            inst++;
+        }
+    }
+
+    Map_Fragment_Program(&(fp->r700AsmCode), mesa_fp, ctx); 
 
     if( GL_FALSE == Find_Instruction_Dependencies_fp(fp, mesa_fp) )
 	{
 		return GL_FALSE;
     }
+
+    InitShaderProgram(&(fp->r700AsmCode));
 	
-	if( GL_FALSE == AssembleInstr(mesa_fp->Base.NumInstructions,
+    for(i=0; i < MAX_SAMPLERS; i++)
+    {
+         fp->r700AsmCode.SamplerUnits[i] = fp->mesa_program.Base.SamplerUnits[i];
+    }
+
+    fp->r700AsmCode.unCurNumILInsts = mesa_fp->Base.NumInstructions;
+
+	if( GL_FALSE == AssembleInstr(0,
+                                  0,
+                                  mesa_fp->Base.NumInstructions,
                                   &(mesa_fp->Base.Instructions[0]), 
                                   &(fp->r700AsmCode)) )
 	{
@@ -260,6 +423,11 @@ GLboolean r700TranslateFragmentShader(struct r700_fragment_program *fp,
         return GL_FALSE;
     }
 
+    if( GL_FALSE == RelocProgram(&(fp->r700AsmCode), &(mesa_fp->Base)) )
+    {
+        return GL_FALSE;
+    }
+
     fp->r700Shader.nRegs = (fp->r700AsmCode.number_used_registers == 0) ? 0 
                          : (fp->r700AsmCode.number_used_registers - 1);
 
@@ -300,7 +468,7 @@ void r700SelectFragmentShader(GLcontext *ctx)
     }
 
     if (GL_FALSE == fp->translated)
-	    r700TranslateFragmentShader(fp, &(fp->mesa_program));
+	    r700TranslateFragmentShader(fp, &(fp->mesa_program), ctx); 
 }
 
 void * r700GetActiveFpShaderBo(GLcontext * ctx)
@@ -325,6 +493,7 @@ GLboolean r700SetupFragmentProgram(GLcontext * ctx)
     unsigned int unNumOfReg;
     unsigned int unBit;
     GLuint exportCount;
+    GLboolean point_sprite = GL_FALSE;
 
     if(GL_FALSE == fp->loaded)
     {
@@ -378,6 +547,50 @@ GLboolean r700SetupFragmentProgram(GLcontext * ctx)
         CLEARbit(r700->SPI_INPUT_Z.u32All, PROVIDE_Z_TO_SPI_bit);
     }
 
+    if (mesa_fp->Base.InputsRead & (1 << FRAG_ATTRIB_FACE))
+    {
+        ui += 1;
+        SETfield(r700->SPI_PS_IN_CONTROL_0.u32All, ui, NUM_INTERP_shift, NUM_INTERP_mask);
+        SETbit(r700->SPI_PS_IN_CONTROL_1.u32All, FRONT_FACE_ENA_bit);
+        SETbit(r700->SPI_PS_IN_CONTROL_1.u32All, FRONT_FACE_ALL_BITS_bit);
+        SETfield(r700->SPI_PS_IN_CONTROL_1.u32All, pAsm->uiFP_AttributeMap[FRAG_ATTRIB_FACE], FRONT_FACE_ADDR_shift, FRONT_FACE_ADDR_mask);
+    }
+    else
+    {
+        CLEARbit(r700->SPI_PS_IN_CONTROL_1.u32All, FRONT_FACE_ENA_bit);
+    }
+
+    /* see if we need any point_sprite replacements */
+    for (i = VERT_RESULT_TEX0; i<= VERT_RESULT_TEX7; i++)
+    {
+        if(ctx->Point.CoordReplace[i - VERT_RESULT_TEX0] == GL_TRUE)
+            point_sprite = GL_TRUE;
+    }
+
+    if ((mesa_fp->Base.InputsRead & (1 << FRAG_ATTRIB_PNTC)) || point_sprite)
+    {
+        /* for FRAG_ATTRIB_PNTC we need to increase num_interp */
+        if(mesa_fp->Base.InputsRead & (1 << FRAG_ATTRIB_PNTC))
+        {
+            ui++;
+            SETfield(r700->SPI_PS_IN_CONTROL_0.u32All, ui, NUM_INTERP_shift, NUM_INTERP_mask);
+        }
+        SETbit(r700->SPI_INTERP_CONTROL_0.u32All, PNT_SPRITE_ENA_bit);
+        SETfield(r700->SPI_INTERP_CONTROL_0.u32All, SPI_PNT_SPRITE_SEL_S, PNT_SPRITE_OVRD_X_shift, PNT_SPRITE_OVRD_X_mask);
+        SETfield(r700->SPI_INTERP_CONTROL_0.u32All, SPI_PNT_SPRITE_SEL_T, PNT_SPRITE_OVRD_Y_shift, PNT_SPRITE_OVRD_Y_mask);
+        SETfield(r700->SPI_INTERP_CONTROL_0.u32All, SPI_PNT_SPRITE_SEL_0, PNT_SPRITE_OVRD_Z_shift, PNT_SPRITE_OVRD_Z_mask);
+        SETfield(r700->SPI_INTERP_CONTROL_0.u32All, SPI_PNT_SPRITE_SEL_1, PNT_SPRITE_OVRD_W_shift, PNT_SPRITE_OVRD_W_mask);
+        if(ctx->Point.SpriteOrigin == GL_LOWER_LEFT)
+            SETbit(r700->SPI_INTERP_CONTROL_0.u32All, PNT_SPRITE_TOP_1_bit);
+        else
+            CLEARbit(r700->SPI_INTERP_CONTROL_0.u32All, PNT_SPRITE_TOP_1_bit);
+    }
+    else
+    {
+        CLEARbit(r700->SPI_INTERP_CONTROL_0.u32All, PNT_SPRITE_ENA_bit);
+    }
+
+
     ui = (unNumOfReg < ui) ? ui : unNumOfReg;
 
     SETfield(r700->ps.SQ_PGM_RESOURCES_PS.u32All, ui, NUM_GPRS_shift, NUM_GPRS_mask);
@@ -393,27 +606,14 @@ GLboolean r700SetupFragmentProgram(GLcontext * ctx)
     SETfield(r700->ps.SQ_PGM_EXPORTS_PS.u32All, fp->r700Shader.exportMode,
              EXPORT_MODE_shift, EXPORT_MODE_mask);
 
-    R600_STATECHANGE(context, db);
-
-    if(fp->r700Shader.killIsUsed)
-    {
-	    SETbit(r700->DB_SHADER_CONTROL.u32All, KILL_ENABLE_bit);
-    }
-    else
-    {
-        CLEARbit(r700->DB_SHADER_CONTROL.u32All, KILL_ENABLE_bit);
-    }
-
-    if(fp->r700Shader.depthIsExported)
-    {
-	    SETbit(r700->DB_SHADER_CONTROL.u32All, Z_EXPORT_ENABLE_bit);
-    }
-    else
-    {
-        CLEARbit(r700->DB_SHADER_CONTROL.u32All, Z_EXPORT_ENABLE_bit);
-    }
-
     // emit ps input map
+    struct r700_vertex_program_cont *vpc =
+		       (struct r700_vertex_program_cont *)ctx->VertexProgram._Current;
+    GLbitfield OutputsWritten = vpc->mesa_program.Base.OutputsWritten;
+    
+    for(ui = 0; ui < R700_MAX_SHADER_EXPORTS; ui++)
+        r700->SPI_PS_INPUT_CNTL[ui].u32All = 0;
+
     unBit = 1 << FRAG_ATTRIB_WPOS;
     if(mesa_fp->Base.InputsRead & unBit)
     {
@@ -427,8 +627,8 @@ GLboolean r700SetupFragmentProgram(GLcontext * ctx)
                     CLEARbit(r700->SPI_PS_INPUT_CNTL[ui].u32All, FLAT_SHADE_bit);
     }
 
-    unBit = 1 << FRAG_ATTRIB_COL0;
-    if(mesa_fp->Base.InputsRead & unBit)
+    unBit = 1 << VERT_RESULT_COL0;
+    if(OutputsWritten & unBit)
     {
 	    ui = pAsm->uiFP_AttributeMap[FRAG_ATTRIB_COL0];
 	    SETbit(r700->SPI_PS_INPUT_CNTL[ui].u32All, SEL_CENTROID_bit);
@@ -440,8 +640,8 @@ GLboolean r700SetupFragmentProgram(GLcontext * ctx)
 		    CLEARbit(r700->SPI_PS_INPUT_CNTL[ui].u32All, FLAT_SHADE_bit);
     }
 
-    unBit = 1 << FRAG_ATTRIB_COL1;
-    if(mesa_fp->Base.InputsRead & unBit)
+    unBit = 1 << VERT_RESULT_COL1;
+    if(OutputsWritten & unBit)
     {
 	    ui = pAsm->uiFP_AttributeMap[FRAG_ATTRIB_COL1];
 	    SETbit(r700->SPI_PS_INPUT_CNTL[ui].u32All, SEL_CENTROID_bit);
@@ -453,8 +653,8 @@ GLboolean r700SetupFragmentProgram(GLcontext * ctx)
 		    CLEARbit(r700->SPI_PS_INPUT_CNTL[ui].u32All, FLAT_SHADE_bit);
     }
 
-    unBit = 1 << FRAG_ATTRIB_FOGC;
-    if(mesa_fp->Base.InputsRead & unBit)
+    unBit = 1 << VERT_RESULT_FOGC;
+    if(OutputsWritten & unBit)
     {
             ui = pAsm->uiFP_AttributeMap[FRAG_ATTRIB_FOGC];
             SETbit(r700->SPI_PS_INPUT_CNTL[ui].u32All, SEL_CENTROID_bit);
@@ -468,25 +668,79 @@ GLboolean r700SetupFragmentProgram(GLcontext * ctx)
 
     for(i=0; i<8; i++)
     {
-	    unBit = 1 << (FRAG_ATTRIB_TEX0 + i);
-	    if(mesa_fp->Base.InputsRead & unBit)
+	    unBit = 1 << (VERT_RESULT_TEX0 + i);
+	    if(OutputsWritten & unBit)
 	    {
 		    ui = pAsm->uiFP_AttributeMap[FRAG_ATTRIB_TEX0 + i];
 		    SETbit(r700->SPI_PS_INPUT_CNTL[ui].u32All, SEL_CENTROID_bit);
 		    SETfield(r700->SPI_PS_INPUT_CNTL[ui].u32All, ui,
 			     SEMANTIC_shift, SEMANTIC_mask);
 		    CLEARbit(r700->SPI_PS_INPUT_CNTL[ui].u32All, FLAT_SHADE_bit);
+		    /* ARB_point_sprite */
+		    if(ctx->Point.CoordReplace[i] == GL_TRUE)
+		    {
+			     SETbit(r700->SPI_PS_INPUT_CNTL[ui].u32All, PT_SPRITE_TEX_bit);
+		    }
 	    }
     }
 
-    R600_STATECHANGE(context, cb);
+    unBit = 1 << FRAG_ATTRIB_FACE;
+    if(mesa_fp->Base.InputsRead & unBit)
+    {
+            ui = pAsm->uiFP_AttributeMap[FRAG_ATTRIB_FACE];
+            SETbit(r700->SPI_PS_INPUT_CNTL[ui].u32All, SEL_CENTROID_bit);
+            SETfield(r700->SPI_PS_INPUT_CNTL[ui].u32All, ui,
+                     SEMANTIC_shift, SEMANTIC_mask);
+            if (r700->SPI_INTERP_CONTROL_0.u32All & FLAT_SHADE_ENA_bit)
+                    SETbit(r700->SPI_PS_INPUT_CNTL[ui].u32All, FLAT_SHADE_bit);
+            else
+                    CLEARbit(r700->SPI_PS_INPUT_CNTL[ui].u32All, FLAT_SHADE_bit);
+    }
+    unBit = 1 << FRAG_ATTRIB_PNTC;
+    if(mesa_fp->Base.InputsRead & unBit)
+    {
+            ui = pAsm->uiFP_AttributeMap[FRAG_ATTRIB_PNTC];
+            SETbit(r700->SPI_PS_INPUT_CNTL[ui].u32All, SEL_CENTROID_bit);
+            SETfield(r700->SPI_PS_INPUT_CNTL[ui].u32All, ui,
+                     SEMANTIC_shift, SEMANTIC_mask);
+            if (r700->SPI_INTERP_CONTROL_0.u32All & FLAT_SHADE_ENA_bit)
+                    SETbit(r700->SPI_PS_INPUT_CNTL[ui].u32All, FLAT_SHADE_bit);
+            else
+                    CLEARbit(r700->SPI_PS_INPUT_CNTL[ui].u32All, FLAT_SHADE_bit);
+            SETbit(r700->SPI_PS_INPUT_CNTL[ui].u32All, PT_SPRITE_TEX_bit);
+    }
+
+
+
+
+    for(i=VERT_RESULT_VAR0; i<VERT_RESULT_MAX; i++)
+	{
+        unBit = 1 << i;
+        if(OutputsWritten & unBit)
+		{
+            ui = pAsm->uiFP_AttributeMap[i-VERT_RESULT_VAR0+FRAG_ATTRIB_VAR0];
+            SETbit(r700->SPI_PS_INPUT_CNTL[ui].u32All, SEL_CENTROID_bit);
+            SETfield(r700->SPI_PS_INPUT_CNTL[ui].u32All, ui,
+		             SEMANTIC_shift, SEMANTIC_mask);
+            if (r700->SPI_INTERP_CONTROL_0.u32All & FLAT_SHADE_ENA_bit)
+		        SETbit(r700->SPI_PS_INPUT_CNTL[ui].u32All, FLAT_SHADE_bit);
+            else
+		        CLEARbit(r700->SPI_PS_INPUT_CNTL[ui].u32All, FLAT_SHADE_bit);
+        }
+    }
+
     exportCount = (r700->ps.SQ_PGM_EXPORTS_PS.u32All & EXPORT_MODE_mask) / (1 << EXPORT_MODE_shift);
-    r700->CB_SHADER_CONTROL.u32All = (1 << exportCount) - 1;
+    if (r700->CB_SHADER_CONTROL.u32All != ((1 << exportCount) - 1))
+    {
+	    R600_STATECHANGE(context, cb);
+	    r700->CB_SHADER_CONTROL.u32All = (1 << exportCount) - 1;
+    }
 
     /* sent out shader constants. */
     paramList = fp->mesa_program.Base.Parameters;
 
-    if(NULL != paramList) {
+    if(NULL != paramList) 
+    {
 	    _mesa_load_state_parameters(ctx, paramList);
 
 	    if (paramList->NumParameters > R700_MAX_DX9_CONSTS)
@@ -499,14 +753,33 @@ GLboolean r700SetupFragmentProgram(GLcontext * ctx)
 	    unNumParamData = paramList->NumParameters;
 
 	    for(ui=0; ui<unNumParamData; ui++) {
-		    r700->ps.consts[ui][0].f32All = paramList->ParameterValues[ui][0];
-		    r700->ps.consts[ui][1].f32All = paramList->ParameterValues[ui][1];
-		    r700->ps.consts[ui][2].f32All = paramList->ParameterValues[ui][2];
-		    r700->ps.consts[ui][3].f32All = paramList->ParameterValues[ui][3];
+		        r700->ps.consts[ui][0].f32All = paramList->ParameterValues[ui][0];
+		        r700->ps.consts[ui][1].f32All = paramList->ParameterValues[ui][1];
+		        r700->ps.consts[ui][2].f32All = paramList->ParameterValues[ui][2];
+		        r700->ps.consts[ui][3].f32All = paramList->ParameterValues[ui][3];
 	    }
     } else
 	    r700->ps.num_consts = 0;
 
+    COMPILED_SUB * pCompiledSub;
+    GLuint uj;
+    GLuint unConstOffset = r700->ps.num_consts;
+    for(ui=0; ui<pAsm->unNumPresub; ui++)
+    {
+        pCompiledSub = pAsm->presubs[ui].pCompiledSub;
+
+        r700->ps.num_consts += pCompiledSub->NumParameters;
+
+        for(uj=0; uj<pCompiledSub->NumParameters; uj++)
+        {
+            r700->ps.consts[uj + unConstOffset][0].f32All = pCompiledSub->ParameterValues[uj][0];
+		    r700->ps.consts[uj + unConstOffset][1].f32All = pCompiledSub->ParameterValues[uj][1];
+		    r700->ps.consts[uj + unConstOffset][2].f32All = pCompiledSub->ParameterValues[uj][2];
+		    r700->ps.consts[uj + unConstOffset][3].f32All = pCompiledSub->ParameterValues[uj][3];
+        }
+        unConstOffset += pCompiledSub->NumParameters;
+    }
+
     return GL_TRUE;
 }
 
diff --git a/src/mesa/drivers/dri/r600/r700_fragprog.h b/src/mesa/drivers/dri/r600/r700_fragprog.h
index cbb108d212..39c59c9201 100644
--- a/src/mesa/drivers/dri/r600/r700_fragprog.h
+++ b/src/mesa/drivers/dri/r600/r700_fragprog.h
@@ -48,13 +48,17 @@ struct r700_fragment_program
 };
 
 /* Internal */
+void insert_wpos_code(GLcontext *ctx, struct gl_fragment_program *fprog);
+
 void Map_Fragment_Program(r700_AssemblerBase         *pAsm,
-			  struct gl_fragment_program *mesa_fp);
+			  struct gl_fragment_program *mesa_fp,
+                          GLcontext *ctx); 
 GLboolean Find_Instruction_Dependencies_fp(struct r700_fragment_program *fp,
 					   struct gl_fragment_program   *mesa_fp);
 
 GLboolean r700TranslateFragmentShader(struct r700_fragment_program *fp,
-				      struct gl_fragment_program   *mesa_vp);
+				      struct gl_fragment_program   *mesa_vp,
+                                      GLcontext *ctx); 
 
 /* Interface */
 extern void r700SelectFragmentShader(GLcontext *ctx);
diff --git a/src/mesa/drivers/dri/r600/r700_render.c b/src/mesa/drivers/dri/r600/r700_render.c
index c345b9d8ac..eab27cbd84 100644
--- a/src/mesa/drivers/dri/r600/r700_render.c
+++ b/src/mesa/drivers/dri/r600/r700_render.c
@@ -59,9 +59,7 @@
 
 void r700WaitForIdle(context_t *context);
 void r700WaitForIdleClean(context_t *context);
-GLboolean r700SendTextureState(context_t *context);
 static unsigned int r700PrimitiveType(int prim);
-void r600UpdateTextureState(GLcontext * ctx);
 GLboolean r700SyncSurf(context_t *context,
 		       struct radeon_bo *pbo,
 		       uint32_t read_domain,
@@ -528,6 +526,9 @@ static void r700ConvertAttrib(GLcontext *ctx, int count,
 
     radeonAllocDmaRegion(&context->radeon, &attr->bo, &attr->bo_offset, 
                          sizeof(GLfloat) * input->Size * count, 32);
+
+    radeon_bo_map(attr->bo, 1);
+
     dst_ptr = (GLfloat *)ADD_POINTERS(attr->bo->ptr, attr->bo_offset);
 
     assert(src_ptr != NULL);
@@ -561,6 +562,8 @@ static void r700ConvertAttrib(GLcontext *ctx, int count,
             break;
     }
 
+    radeon_bo_unmap(attr->bo);
+
     if (mapped_named_bo) 
     {
         ctx->Driver.UnmapBuffer(ctx, GL_ARRAY_BUFFER, input->BufferObj);
@@ -579,6 +582,8 @@ static void r700AlignDataToDword(GLcontext *ctx,
 
     radeonAllocDmaRegion(&context->radeon, &attr->bo, &attr->bo_offset, size, 32);
 
+    radeon_bo_map(attr->bo, 1);
+
     if (!input->BufferObj->Pointer) 
     {
         ctx->Driver.MapBuffer(ctx, GL_ARRAY_BUFFER, GL_READ_ONLY_ARB, input->BufferObj);
@@ -598,6 +603,7 @@ static void r700AlignDataToDword(GLcontext *ctx,
         }
     }
 
+    radeon_bo_unmap(attr->bo);
     if (mapped_named_bo) 
     {
         ctx->Driver.UnmapBuffer(ctx, GL_ARRAY_BUFFER, input->BufferObj);
@@ -666,14 +672,18 @@ static void r700SetupStreams(GLcontext *ctx, const struct gl_client_array *input
 
                 radeonAllocDmaRegion(&context->radeon, &context->stream_desc[index].bo, 
                                      &context->stream_desc[index].bo_offset, size, 32);
+
+                radeon_bo_map(context->stream_desc[index].bo, 1);
                 assert(context->stream_desc[index].bo->ptr != NULL);
+
+
                 dst = (uint32_t *)ADD_POINTERS(context->stream_desc[index].bo->ptr, 
                                                context->stream_desc[index].bo_offset);
 
                 switch (context->stream_desc[index].dwords) 
                 {
                 case 1:                     
-                    radeonEmitVec4(dst, input[i]->Ptr, input[i]->StrideB, local_count);                         
+                    radeonEmitVec4(dst, input[i]->Ptr, input[i]->StrideB, local_count);
                     break;
                 case 2: 
                     radeonEmitVec8(dst, input[i]->Ptr, input[i]->StrideB, local_count); 
@@ -688,6 +698,7 @@ static void r700SetupStreams(GLcontext *ctx, const struct gl_client_array *input
                     assert(0); 
                     break;
                 }
+		radeon_bo_unmap(context->stream_desc[index].bo);
             }
         }
 
@@ -759,6 +770,7 @@ static void r700FixupIndexBuffer(GLcontext *ctx, const struct _mesa_index_buffer
 	radeonAllocDmaRegion(&context->radeon, &context->ind_buf.bo,
 			     &context->ind_buf.bo_offset, size, 4);
 
+	radeon_bo_map(context->ind_buf.bo, 1);
 	assert(context->ind_buf.bo->ptr != NULL);
 	out = (GLuint *)ADD_POINTERS(context->ind_buf.bo->ptr, context->ind_buf.bo_offset);
 
@@ -772,6 +784,7 @@ static void r700FixupIndexBuffer(GLcontext *ctx, const struct _mesa_index_buffer
             *out++ = in[i];
         }
 
+	radeon_bo_unmap(context->ind_buf.bo);
 #if MESA_BIG_ENDIAN
     }
     else
@@ -782,6 +795,7 @@ static void r700FixupIndexBuffer(GLcontext *ctx, const struct _mesa_index_buffer
 	radeonAllocDmaRegion(&context->radeon, &context->ind_buf.bo,
 			     &context->ind_buf.bo_offset, size, 4);
 
+	radeon_bo_map(context->ind_buf.bo, 1);
 	assert(context->ind_buf.bo->ptr != NULL);
 	out = (GLuint *)ADD_POINTERS(context->ind_buf.bo->ptr, context->ind_buf.bo_offset);
 
@@ -794,6 +808,7 @@ static void r700FixupIndexBuffer(GLcontext *ctx, const struct _mesa_index_buffer
         {
             *out++ = in[i];
         }
+	radeon_bo_unmap(context->ind_buf.bo);
 #endif
     }
 
@@ -839,11 +854,13 @@ static void r700SetupIndexBuffer(GLcontext *ctx, const struct _mesa_index_buffer
 
 	radeonAllocDmaRegion(&context->radeon, &context->ind_buf.bo,
 			     &context->ind_buf.bo_offset, size, 4);
+	radeon_bo_map(context->ind_buf.bo, 1);
 	assert(context->ind_buf.bo->ptr != NULL);
 	dst_ptr = ADD_POINTERS(context->ind_buf.bo->ptr, context->ind_buf.bo_offset);
 
         _mesa_memcpy(dst_ptr, src_ptr, size);
 
+	radeon_bo_unmap(context->ind_buf.bo);
         context->ind_buf.is_32bit = (mesa_ind_buf->type == GL_UNSIGNED_INT);
         context->ind_buf.count = mesa_ind_buf->count;
 
@@ -891,7 +908,7 @@ static GLboolean r700TryDrawPrims(GLcontext *ctx,
     r700SetScissor(context);
     r700SetupVertexProgram(ctx);
     r700SetupFragmentProgram(ctx);
-    r600UpdateTextureState(ctx);
+    r700UpdateShaderStates(ctx);
 
     GLuint emit_end = r700PredictRenderSize(ctx, prim, ib, nr_prims)
                     + context->radeon.cmdbuf.cs->cdw;
diff --git a/src/mesa/drivers/dri/r600/r700_shader.c b/src/mesa/drivers/dri/r600/r700_shader.c
index 955ea4e4e1..2eed1acc2f 100644
--- a/src/mesa/drivers/dri/r600/r700_shader.c
+++ b/src/mesa/drivers/dri/r600/r700_shader.c
@@ -159,13 +159,18 @@ void Init_R700_Shader(R700_Shader * pShader)
 	pShader->lstVTXInstructions.uNumOfNode=0;
 }
 
+void SetActiveCFlist(R700_Shader *pShader, TypedShaderList * plstCF)
+{
+    pShader->plstCFInstructions_active = plstCF; /* plstCF becomes the list that AddCFInstruction() indexes into and appends to */
+}
+
 void AddCFInstruction(R700_Shader *pShader, R700ControlFlowInstruction *pCFInst)
 {
     R700ControlFlowSXClause*  pSXClause; 
     R700ControlFlowSMXClause* pSMXClause;
 
-    pCFInst->m_uIndex = pShader->lstCFInstructions.uNumOfNode;
-    AddInstToList(&(pShader->lstCFInstructions), 
+    pCFInst->m_uIndex = pShader->plstCFInstructions_active->uNumOfNode;
+    AddInstToList(pShader->plstCFInstructions_active, 
                   (R700ShaderInstruction*)pCFInst);
     pShader->uShaderBinaryDWORDSize += GetInstructionSize(pCFInst->m_ShaderInstType);
 
diff --git a/src/mesa/drivers/dri/r600/r700_shader.h b/src/mesa/drivers/dri/r600/r700_shader.h
index c6a058617e..0599ffd901 100644
--- a/src/mesa/drivers/dri/r600/r700_shader.h
+++ b/src/mesa/drivers/dri/r600/r700_shader.h
@@ -109,6 +109,7 @@ typedef struct R700_Shader
     GLuint  uStackSize;
     GLuint  uMaxCallDepth;
 
+    TypedShaderList * plstCFInstructions_active;
 	TypedShaderList lstCFInstructions;
 	TypedShaderList lstALUInstructions;
 	TypedShaderList lstTEXInstructions;
@@ -132,13 +133,13 @@ void TakeInstOutFromList(TypedShaderList * plstCFInstructions, R700ShaderInstruc
 void ResolveLinks(R700_Shader *pShader);
 void Assemble(R700_Shader *pShader);
 
-
 //Interface
 void Init_R700_Shader(R700_Shader * pShader);
 void AddCFInstruction(R700_Shader *pShader, R700ControlFlowInstruction *pCFInst);
 void AddVTXInstruction(R700_Shader *pShader, R700VertexInstruction *pVTXInst);
 void AddTEXInstruction(R700_Shader *pShader, R700TextureInstruction *pTEXInst);
 void AddALUInstruction(R700_Shader *pShader, R700ALUInstruction *pALUInst);
+void SetActiveCFlist(R700_Shader *pShader, TypedShaderList * plstCF);
 
 void LoadProgram(R700_Shader *pShader);
 void UpdateShaderRegisters(R700_Shader *pShader);
diff --git a/src/mesa/drivers/dri/r600/r700_shaderinst.h b/src/mesa/drivers/dri/r600/r700_shaderinst.h
index 2829cca0a3..cdb9a570f7 100644
--- a/src/mesa/drivers/dri/r600/r700_shaderinst.h
+++ b/src/mesa/drivers/dri/r600/r700_shaderinst.h
@@ -42,6 +42,13 @@
 #define SQ_FETCH_RESOURCE_VS_OFFSET    0x000000a0
 #define SQ_FETCH_RESOURCE_VS_COUNT     0x000000b0
 
+//richard dec.10 glsl
+#define SQ_TEX_SAMPLER_PS_OFFSET       0x00000000
+#define SQ_TEX_SAMPLER_PS_COUNT        0x00000012
+#define SQ_TEX_SAMPLER_VS_OFFSET       0x00000012
+#define SQ_TEX_SAMPLER_VS_COUNT        0x00000012
+//-------------------
+
 #define SHADERINST_TYPEMASK_CF  0x10
 #define SHADERINST_TYPEMASK_ALU 0x20
 #define SHADERINST_TYPEMASK_TEX 0x40
diff --git a/src/mesa/drivers/dri/r600/r700_state.c b/src/mesa/drivers/dri/r600/r700_state.c
index b278887266..3c8cb579f9 100644
--- a/src/mesa/drivers/dri/r600/r700_state.c
+++ b/src/mesa/drivers/dri/r600/r700_state.c
@@ -54,7 +54,7 @@
 #include "r700_fragprog.h"
 #include "r700_vertprog.h"
 
-
+void r600UpdateTextureState(GLcontext * ctx);
 static void r700SetClipPlaneState(GLcontext * ctx, GLenum cap, GLboolean state);
 static void r700UpdatePolygonMode(GLcontext * ctx);
 static void r700SetPolygonOffsetState(GLcontext * ctx, GLboolean state);
@@ -85,7 +85,7 @@ void r700UpdateViewportOffset(GLcontext * ctx) //------------------
 {
 	context_t *context = R700_CONTEXT(ctx);
 	R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&context->hw);
-	__DRIdrawablePrivate *dPriv = radeon_get_drawable(&context->radeon);
+	__DRIdrawable *dPriv = radeon_get_drawable(&context->radeon);
 	GLfloat xoffset = (GLfloat) dPriv->x;
 	GLfloat yoffset = (GLfloat) dPriv->y + dPriv->h;
 	const GLfloat *v = ctx->Viewport._WindowMap.m;
@@ -191,6 +191,67 @@ static void r700InvalidateState(GLcontext * ctx, GLuint new_state) //-----------
     context->radeon.NewGLState |= new_state;
 }
 
+static void r700SetDBRenderState(GLcontext * ctx)
+{	/* Updates DB_SHADER_CONTROL, DB_RENDER_OVERRIDE and DB_RENDER_CONTROL from the current query and fragment-program state. */
+	context_t *context = R700_CONTEXT(ctx);
+	R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&context->hw);
+	struct r700_fragment_program *fp = (struct r700_fragment_program *)
+		(ctx->FragmentProgram._Current);
+
+	R600_STATECHANGE(context, db);
+
+	SETbit(r700->DB_SHADER_CONTROL.u32All, DUAL_EXPORT_ENABLE_bit); /* set unconditionally on every call */
+	SETfield(r700->DB_SHADER_CONTROL.u32All, EARLY_Z_THEN_LATE_Z, Z_ORDER_shift, Z_ORDER_mask);
+	/* XXX need to enable htile for hiz/s */
+	SETfield(r700->DB_RENDER_OVERRIDE.u32All, FORCE_DISABLE, FORCE_HIZ_ENABLE_shift, FORCE_HIZ_ENABLE_mask);
+	SETfield(r700->DB_RENDER_OVERRIDE.u32All, FORCE_DISABLE, FORCE_HIS_ENABLE0_shift, FORCE_HIS_ENABLE0_mask);
+	SETfield(r700->DB_RENDER_OVERRIDE.u32All, FORCE_DISABLE, FORCE_HIS_ENABLE1_shift, FORCE_HIS_ENABLE1_mask);
+
+	if (context->radeon.query.current) /* a query object is current (presumably an occlusion query -- TODO confirm) */
+	{
+		SETbit(r700->DB_RENDER_OVERRIDE.u32All, NOOP_CULL_DISABLE_bit);
+		if (context->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV770) /* PERFECT_ZPASS_COUNTS is only touched on RV770 and later */
+		{
+			SETbit(r700->DB_RENDER_CONTROL.u32All, PERFECT_ZPASS_COUNTS_bit);
+		}
+	}
+	else
+	{
+		CLEARbit(r700->DB_RENDER_OVERRIDE.u32All, NOOP_CULL_DISABLE_bit);
+		if (context->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV770)
+		{
+			CLEARbit(r700->DB_RENDER_CONTROL.u32All, PERFECT_ZPASS_COUNTS_bit);
+		}
+	}
+
+	if (fp) /* with no current fragment program, KILL_ENABLE and Z_EXPORT_ENABLE keep their previous values */
+	{
+		if (fp->r700Shader.killIsUsed)
+		{
+			SETbit(r700->DB_SHADER_CONTROL.u32All, KILL_ENABLE_bit);
+		}
+		else
+		{
+			CLEARbit(r700->DB_SHADER_CONTROL.u32All, KILL_ENABLE_bit);
+		}
+
+		if (fp->r700Shader.depthIsExported)
+		{
+			SETbit(r700->DB_SHADER_CONTROL.u32All, Z_EXPORT_ENABLE_bit);
+		}
+		else
+		{
+			CLEARbit(r700->DB_SHADER_CONTROL.u32All, Z_EXPORT_ENABLE_bit);
+		}
+	}
+}
+
+void r700UpdateShaderStates(GLcontext * ctx)
+{
+	r700SetDBRenderState(ctx); /* DB_* register state first ... */
+	r600UpdateTextureState(ctx); /* ... then the texture state update */
+}
+
 static void r700SetDepthState(GLcontext * ctx)
 {
 	context_t *context = R700_CONTEXT(ctx);
@@ -644,6 +705,10 @@ static void r700UpdateCulling(GLcontext * ctx)
             CLEARbit(r700->PA_SU_SC_MODE_CNTL.u32All, FACE_bit); /* default: ccw */
             break;
     }
+
+    /* Winding is inverted when rendering to FBO */
+    if (ctx->DrawBuffer && ctx->DrawBuffer->Name)
+	    r700->PA_SU_SC_MODE_CNTL.u32All ^= FACE_bit;
 }
 
 static void r700UpdateLineStipple(GLcontext * ctx)
@@ -1006,7 +1071,7 @@ static void r700UpdateWindow(GLcontext * ctx, int id) //--------------------
 {
 	context_t *context = R700_CONTEXT(ctx);
 	R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&context->hw);
-	__DRIdrawablePrivate *dPriv = radeon_get_drawable(&context->radeon);
+	__DRIdrawable *dPriv = radeon_get_drawable(&context->radeon);
 	GLfloat xoffset = dPriv ? (GLfloat) dPriv->x : 0;
 	GLfloat yoffset = dPriv ? (GLfloat) dPriv->y + dPriv->h : 0;
 	const GLfloat *v = ctx->Viewport._WindowMap.m;
@@ -1166,13 +1231,8 @@ static void r700UpdatePolygonMode(GLcontext * ctx)
 		/* Handle GL_CW (clock wise and GL_CCW (counter clock wise)
 		 * correctly by selecting the correct front and back face
 		 */
-		if (ctx->Polygon.FrontFace == GL_CCW) {
-			f = ctx->Polygon.FrontMode;
-			b = ctx->Polygon.BackMode;
-		} else {
-			f = ctx->Polygon.BackMode;
-			b = ctx->Polygon.FrontMode;
-		}
+		f = ctx->Polygon.FrontMode;
+		b = ctx->Polygon.BackMode;
 
 		/* Enable polygon mode */
 		SETfield(r700->PA_SU_SC_MODE_CNTL.u32All, X_DUAL_MODE, POLY_MODE_shift, POLY_MODE_mask);
@@ -1664,28 +1724,18 @@ void r700InitState(GLcontext * ctx) //-------------------
     r700InitSQConfig(ctx);
 
     r700ColorMask(ctx,
-		  ctx->Color.ColorMask[RCOMP],
-		  ctx->Color.ColorMask[GCOMP],
-		  ctx->Color.ColorMask[BCOMP],
-		  ctx->Color.ColorMask[ACOMP]);
+		  ctx->Color.ColorMask[0][RCOMP],
+		  ctx->Color.ColorMask[0][GCOMP],
+		  ctx->Color.ColorMask[0][BCOMP],
+		  ctx->Color.ColorMask[0][ACOMP]);
 
     r700Enable(ctx, GL_DEPTH_TEST, ctx->Depth.Test);
     r700DepthMask(ctx, ctx->Depth.Mask);
     r700DepthFunc(ctx, ctx->Depth.Func);
-    SETbit(r700->DB_SHADER_CONTROL.u32All, DUAL_EXPORT_ENABLE_bit);
-
     r700->DB_DEPTH_CLEAR.u32All     = 0x3F800000;
-
-    r700->DB_RENDER_CONTROL.u32All  = 0;
     SETbit(r700->DB_RENDER_CONTROL.u32All, STENCIL_COMPRESS_DISABLE_bit);
     SETbit(r700->DB_RENDER_CONTROL.u32All, DEPTH_COMPRESS_DISABLE_bit);
-    r700->DB_RENDER_OVERRIDE.u32All = 0;
-    if (context->radeon.radeonScreen->chip_family < CHIP_FAMILY_RV770)
-	    SETbit(r700->DB_RENDER_OVERRIDE.u32All, FORCE_SHADER_Z_ORDER_bit);
-    SETfield(r700->DB_RENDER_OVERRIDE.u32All, FORCE_DISABLE, FORCE_HIZ_ENABLE_shift, FORCE_HIZ_ENABLE_mask);
-    SETfield(r700->DB_RENDER_OVERRIDE.u32All, FORCE_DISABLE, FORCE_HIS_ENABLE0_shift, FORCE_HIS_ENABLE0_mask);
-    SETfield(r700->DB_RENDER_OVERRIDE.u32All, FORCE_DISABLE, FORCE_HIS_ENABLE1_shift, FORCE_HIS_ENABLE1_mask);
-    SETbit(r700->DB_RENDER_OVERRIDE.u32All, NOOP_CULL_DISABLE_bit);
+    r700SetDBRenderState(ctx);
 
     r700->DB_ALPHA_TO_MASK.u32All = 0;
     SETfield(r700->DB_ALPHA_TO_MASK.u32All, 2, ALPHA_TO_MASK_OFFSET0_shift, ALPHA_TO_MASK_OFFSET0_mask);
diff --git a/src/mesa/drivers/dri/r600/r700_state.h b/src/mesa/drivers/dri/r600/r700_state.h
index 209189d8d7..60c6a7f23c 100644
--- a/src/mesa/drivers/dri/r600/r700_state.h
+++ b/src/mesa/drivers/dri/r600/r700_state.h
@@ -35,7 +35,7 @@
 
 extern void r700UpdateStateParameters(GLcontext * ctx, GLuint new_state);
 extern void r700UpdateShaders (GLcontext * ctx);
-extern void r700UpdateShaders2(GLcontext * ctx);
+extern void r700UpdateShaderStates(GLcontext * ctx);
 
 extern void r700UpdateViewportOffset(GLcontext * ctx);
 
diff --git a/src/mesa/drivers/dri/r600/r700_vertprog.c b/src/mesa/drivers/dri/r600/r700_vertprog.c
index ffc6068bd8..782f151f5a 100644
--- a/src/mesa/drivers/dri/r600/r700_vertprog.c
+++ b/src/mesa/drivers/dri/r600/r700_vertprog.c
@@ -111,6 +111,15 @@ unsigned int Map_Vertex_Output(r700_AssemblerBase       *pAsm,
 		}
 	}
 
+    for(i=VERT_RESULT_VAR0; i<VERT_RESULT_MAX; i++)
+	{
+		unBit = 1 << i;
+		if(mesa_vp->Base.OutputsWritten & unBit)
+		{
+			pAsm->ucVP_OutputMap[i] = unTotal++;
+		}
+	}
+
 	return (unTotal - unStart);
 }
 
@@ -179,7 +188,8 @@ GLboolean Process_Vertex_Program_Vfetch_Instructions2(
                                       context->stream_desc[i].size,
                                       context->stream_desc[i].element,
                                       context->stream_desc[i]._signed,
-                                      context->stream_desc[i].normalize,						            
+                                      context->stream_desc[i].normalize,
+                                      context->stream_desc[i].format,
                                      &vtxFetchMethod);
     }
 
@@ -235,6 +245,8 @@ void Map_Vertex_Program(GLcontext *ctx,
         pAsm->number_used_registers += mesa_vp->Base.NumTemporaries;
     }
 
+    pAsm->flag_reg_index = pAsm->number_used_registers++;
+
     pAsm->uFirstHelpReg = pAsm->number_used_registers;
 }
 
@@ -308,6 +320,7 @@ struct r700_vertex_program* r700TranslateVertexShader(GLcontext *ctx,
 		vp->aos_desc[i].size   = context->stream_desc[i].size;
 		vp->aos_desc[i].stride = context->stream_desc[i].stride;
 		vp->aos_desc[i].type   = context->stream_desc[i].type;
+		vp->aos_desc[i].format = context->stream_desc[i].format;
 	}
 
 	if (context->radeon.radeonScreen->chip_family < CHIP_FAMILY_RV770)
@@ -324,7 +337,18 @@ struct r700_vertex_program* r700TranslateVertexShader(GLcontext *ctx,
 		return NULL;
 	}
 
-	if(GL_FALSE == AssembleInstr(vp->mesa_program->Base.NumInstructions,
+    InitShaderProgram(&(vp->r700AsmCode));
+
+    for(i=0; i < MAX_SAMPLERS; i++)
+    {
+        vp->r700AsmCode.SamplerUnits[i] = vp->mesa_program->Base.SamplerUnits[i];
+    }
+
+    vp->r700AsmCode.unCurNumILInsts = vp->mesa_program->Base.NumInstructions;
+
+	if(GL_FALSE == AssembleInstr(0,
+                                 0,
+                                 vp->mesa_program->Base.NumInstructions,
                                  &(vp->mesa_program->Base.Instructions[0]),
                                  &(vp->r700AsmCode)) )
 	{
@@ -336,6 +360,11 @@ struct r700_vertex_program* r700TranslateVertexShader(GLcontext *ctx,
         return NULL;
     }
 
+    if( GL_FALSE == RelocProgram(&(vp->r700AsmCode), &(vp->mesa_program->Base)) )
+    {
+        return NULL; /* this function returns a pointer: report failure as NULL like its other error paths */
+    }
+
     vp->r700Shader.nRegs = (vp->r700AsmCode.number_used_registers == 0) ? 0 
                          : (vp->r700AsmCode.number_used_registers - 1);
 
@@ -368,7 +397,8 @@ void r700SelectVertexShader(GLcontext *ctx)
 	match = GL_TRUE;
 	for(i=0; i<context->nNumActiveAos; i++)
 	{
-		if (vp->aos_desc[i].size != context->stream_desc[i].size)
+		if (vp->aos_desc[i].size != context->stream_desc[i].size ||
+		    vp->aos_desc[i].format != context->stream_desc[i].format)
 		{
 			match = GL_FALSE;
 			break;
@@ -471,6 +501,7 @@ static void r700TranslateAttrib(GLcontext *ctx, GLuint unLoc, int count, const s
 	pStreamDesc->size = input->Size;
 	pStreamDesc->dst_loc = context->nNumActiveAos;
 	pStreamDesc->element = unLoc;
+	pStreamDesc->format = input->Format;
 
 	switch (pStreamDesc->type) 
 	{ //GetSurfaceFormat
@@ -612,6 +643,12 @@ GLboolean r700SetupVertexProgram(GLcontext * ctx)
     paramList = vp->mesa_program->Base.Parameters;
 
     if(NULL != paramList) {
+        /* vp->mesa_program was cloned, not updated by glsl shader api. */
+        /* _mesa_reference_program has already checked glsl shProg is ok and set ctx->VertexProgram._Current */
+        /* so, read uniform values from ctx->VertexProgram._Current */
+        struct gl_program_parameter_list *paramListOrginal =
+                         ctx->VertexProgram._Current->Base.Parameters;
+         
 	    _mesa_load_state_parameters(ctx, paramList);
 
 	    if (paramList->NumParameters > R700_MAX_DX9_CONSTS)
@@ -624,13 +661,42 @@ GLboolean r700SetupVertexProgram(GLcontext * ctx)
 	    unNumParamData = paramList->NumParameters;
 
 	    for(ui=0; ui<unNumParamData; ui++) {
-		    r700->vs.consts[ui][0].f32All = paramList->ParameterValues[ui][0];
-		    r700->vs.consts[ui][1].f32All = paramList->ParameterValues[ui][1];
-		    r700->vs.consts[ui][2].f32All = paramList->ParameterValues[ui][2];
-		    r700->vs.consts[ui][3].f32All = paramList->ParameterValues[ui][3];
+            if(paramList->Parameters[ui].Type == PROGRAM_UNIFORM) 
+            {
+                r700->vs.consts[ui][0].f32All = paramListOrginal->ParameterValues[ui][0];
+		        r700->vs.consts[ui][1].f32All = paramListOrginal->ParameterValues[ui][1];
+		        r700->vs.consts[ui][2].f32All = paramListOrginal->ParameterValues[ui][2];
+		        r700->vs.consts[ui][3].f32All = paramListOrginal->ParameterValues[ui][3];
+            }
+            else
+            {
+		        r700->vs.consts[ui][0].f32All = paramList->ParameterValues[ui][0];
+		        r700->vs.consts[ui][1].f32All = paramList->ParameterValues[ui][1];
+		        r700->vs.consts[ui][2].f32All = paramList->ParameterValues[ui][2];
+		        r700->vs.consts[ui][3].f32All = paramList->ParameterValues[ui][3];
+            }
 	    }
     } else
 	    r700->vs.num_consts = 0;
 
+    COMPILED_SUB * pCompiledSub;
+    GLuint uj;
+    GLuint unConstOffset = r700->vs.num_consts; /* presub constants are appended after the constants counted so far */
+    for(ui=0; ui<vp->r700AsmCode.unNumPresub; ui++)
+    {
+        pCompiledSub = vp->r700AsmCode.presubs[ui].pCompiledSub;
+
+        r700->vs.num_consts += pCompiledSub->NumParameters; /* NOTE(review): no visible check that the running total fits vs.consts -- confirm capacity */
+
+        for(uj=0; uj<pCompiledSub->NumParameters; uj++)
+        {
+            r700->vs.consts[uj + unConstOffset][0].f32All = pCompiledSub->ParameterValues[uj][0];
+		    r700->vs.consts[uj + unConstOffset][1].f32All = pCompiledSub->ParameterValues[uj][1];
+		    r700->vs.consts[uj + unConstOffset][2].f32All = pCompiledSub->ParameterValues[uj][2];
+		    r700->vs.consts[uj + unConstOffset][3].f32All = pCompiledSub->ParameterValues[uj][3];
+        }
+        unConstOffset += pCompiledSub->NumParameters; /* next subroutine's constants start where this one's end */
+    }
+
     return GL_TRUE;
 }
diff --git a/src/mesa/drivers/dri/r600/r700_vertprog.h b/src/mesa/drivers/dri/r600/r700_vertprog.h
index 00824c29d3..645c9ac84a 100644
--- a/src/mesa/drivers/dri/r600/r700_vertprog.h
+++ b/src/mesa/drivers/dri/r600/r700_vertprog.h
@@ -39,6 +39,7 @@ typedef struct ArrayDesc //TEMP
 	GLint size;   //number of data element
 	GLenum type;  //data element type
 	GLsizei stride;
+	GLenum format; //GL_RGBA or GL_BGRA
 } ArrayDesc;
 
 struct r700_vertex_program 
diff --git a/src/mesa/drivers/dri/r600/radeon_bo.c b/src/mesa/drivers/dri/r600/radeon_bo.c
new file mode 120000
index 0000000000..9448ffee54
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/radeon_bo.c
@@ -0,0 +1 @@
+../radeon/radeon_bo.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r600/radeon_bo_int_drm.h b/src/mesa/drivers/dri/r600/radeon_bo_int_drm.h
new file mode 120000
index 0000000000..029450928b
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/radeon_bo_int_drm.h
@@ -0,0 +1 @@
+../radeon/radeon_bo_int_drm.h
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r600/radeon_cs.c b/src/mesa/drivers/dri/r600/radeon_cs.c
new file mode 120000
index 0000000000..66b7ad1eb0
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/radeon_cs.c
@@ -0,0 +1 @@
+../radeon/radeon_cs.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r600/radeon_cs_int_drm.h b/src/mesa/drivers/dri/r600/radeon_cs_int_drm.h
new file mode 120000
index 0000000000..462f5245d0
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/radeon_cs_int_drm.h
@@ -0,0 +1 @@
+../radeon/radeon_cs_int_drm.h
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/radeon/Makefile b/src/mesa/drivers/dri/radeon/Makefile
index ae2e695bfc..2b2f2c4aa7 100644
--- a/src/mesa/drivers/dri/radeon/Makefile
+++ b/src/mesa/drivers/dri/radeon/Makefile
@@ -11,7 +11,7 @@ LIBNAME = radeon_dri.so
 MINIGLX_SOURCES = server/radeon_dri.c 
 
 ifeq ($(RADEON_LDFLAGS),)
-CS_SOURCES = radeon_cs_space_drm.c
+CS_SOURCES = radeon_cs_space_drm.c radeon_bo.c radeon_cs.c
 endif
 
 RADEON_COMMON_SOURCES = \
diff --git a/src/mesa/drivers/dri/radeon/radeon_bo.c b/src/mesa/drivers/dri/radeon/radeon_bo.c
new file mode 100644
index 0000000000..393d156cde
--- /dev/null
+++ b/src/mesa/drivers/dri/radeon/radeon_bo.c
@@ -0,0 +1,110 @@
+#include <radeon_bocs_wrapper.h>
+#include <radeon_bo_int_drm.h>
+
+void radeon_bo_debug(struct radeon_bo *bo,
+		     const char *op)
+{
+    struct radeon_bo_int *boi = (struct radeon_bo_int *)bo;
+    /* Prints to stderr: op, the bo's address, handle, size and reference count. */
+    fprintf(stderr, "%s %p 0x%08X 0x%08X 0x%08X\n",
+            op, (void *)bo, bo->handle, boi->size, boi->cref); /* %p takes a void * argument */
+}
+
+struct radeon_bo *radeon_bo_open(struct radeon_bo_manager *bom,
+				 uint32_t handle,
+				 uint32_t size,
+				 uint32_t alignment,
+				 uint32_t domains,
+				 uint32_t flags)
+{
+    /* Hand the request straight to the manager's bo_open hook and
+     * return whatever it produces. */
+    return bom->funcs->bo_open(bom, handle, size, alignment, domains, flags);
+}
+
+void radeon_bo_ref(struct radeon_bo *bo)
+{
+    struct radeon_bo_int *priv = (struct radeon_bo_int *)bo;
+    priv->cref += 1; /* bump the count, then let the backend's bo_ref hook run */
+    priv->bom->funcs->bo_ref(priv);
+}
+
+struct radeon_bo *radeon_bo_unref(struct radeon_bo *bo)
+{
+    struct radeon_bo_int *priv = (struct radeon_bo_int *)bo;
+    priv->cref -= 1; /* drop the count first; the backend's bo_unref hook supplies the return value */
+    return priv->bom->funcs->bo_unref(priv);
+}
+
+int radeon_bo_map(struct radeon_bo *bo, int write)
+{
+    struct radeon_bo_int *priv = (struct radeon_bo_int *)bo;
+    return priv->bom->funcs->bo_map(priv, write); /* result is whatever the backend hook returns */
+}
+
+int radeon_bo_unmap(struct radeon_bo *bo)
+{
+    struct radeon_bo_int *priv = (struct radeon_bo_int *)bo;
+    return priv->bom->funcs->bo_unmap(priv); /* result is whatever the backend hook returns */
+}
+
+int radeon_bo_wait(struct radeon_bo *bo)
+{
+    struct radeon_bo_int *priv = (struct radeon_bo_int *)bo;
+    /* bo_wait is an optional hook: return 0 when the backend has none. */
+    return priv->bom->funcs->bo_wait ?
+        priv->bom->funcs->bo_wait(priv) : 0;
+}
+
+int radeon_bo_is_busy(struct radeon_bo *bo,
+		      uint32_t *domain)
+{
+    struct radeon_bo_int *priv = (struct radeon_bo_int *)bo;
+    return priv->bom->funcs->bo_is_busy(priv, domain); /* domain is handed to the hook unchanged */
+}
+
+int radeon_bo_set_tiling(struct radeon_bo *bo,
+			 uint32_t tiling_flags, uint32_t pitch)
+{
+    struct radeon_bo_int *priv = (struct radeon_bo_int *)bo;
+    return priv->bom->funcs->bo_set_tiling(priv, tiling_flags, pitch); /* both values go to the hook unchanged */
+}
+
+int radeon_bo_get_tiling(struct radeon_bo *bo,
+			  uint32_t *tiling_flags, uint32_t *pitch)
+{
+    struct radeon_bo_int *priv = (struct radeon_bo_int *)bo;
+    return priv->bom->funcs->bo_get_tiling(priv, tiling_flags, pitch); /* both pointers go to the hook unchanged */
+}
+
+int radeon_bo_is_static(struct radeon_bo *bo)
+{
+    struct radeon_bo_int *priv = (struct radeon_bo_int *)bo;
+    /* Backends that do not provide bo_is_static get the answer 0. */
+    return priv->bom->funcs->bo_is_static ?
+        priv->bom->funcs->bo_is_static(priv) : 0;
+}
+
+int radeon_bo_is_referenced_by_cs(struct radeon_bo *bo,
+				  struct radeon_cs *cs)
+{
+    struct radeon_bo_int *boi = (struct radeon_bo_int *)bo;
+    return boi->cref > 1; /* cs is not consulted: returns 1 whenever the reference count exceeds one */
+}
+
+uint32_t radeon_bo_get_handle(struct radeon_bo *bo)
+{
+    return bo->handle; /* read directly from the public struct; no backend hook involved */
+}
+
+uint32_t radeon_bo_get_src_domain(struct radeon_bo *bo)
+{
+    const struct radeon_bo_int *priv = (const struct radeon_bo_int *)bo;
+    const uint32_t low_half = priv->space_accounted & 0xffff;
+
+    /* Prefer the low 16 bits of space_accounted; when they are all
+     * zero, fall back to the high 16 bits. */
+    if (low_half != 0)
+	return low_half;
+    return priv->space_accounted >> 16;
+}
diff --git a/src/mesa/drivers/dri/radeon/radeon_bo_drm.h b/src/mesa/drivers/dri/radeon/radeon_bo_drm.h
index 7141371633..beb2369880 100644
--- a/src/mesa/drivers/dri/radeon/radeon_bo_drm.h
+++ b/src/mesa/drivers/dri/radeon/radeon_bo_drm.h
@@ -32,188 +32,44 @@
 
 #include <stdio.h>
 #include <stdint.h>
-//#include "radeon_track.h"
 
 /* bo object */
 #define RADEON_BO_FLAGS_MACRO_TILE  1
 #define RADEON_BO_FLAGS_MICRO_TILE  2
 
 struct radeon_bo_manager;
+struct radeon_cs;
 
 struct radeon_bo {
-    uint32_t                    alignment;
+    void                        *ptr;
+    uint32_t                    flags;
     uint32_t                    handle;
     uint32_t                    size;
-    uint32_t                    domains;
-    uint32_t                    flags;
-    unsigned                    cref;
-#ifdef RADEON_BO_TRACK
-    struct radeon_track         *track;
-#endif
-    void                        *ptr;
-    struct radeon_bo_manager    *bom;
-    uint32_t                    space_accounted;
-};
-
-/* bo functions */
-struct radeon_bo_funcs {
-    struct radeon_bo *(*bo_open)(struct radeon_bo_manager *bom,
-                                 uint32_t handle,
-                                 uint32_t size,
-                                 uint32_t alignment,
-                                 uint32_t domains,
-                                 uint32_t flags);
-    void (*bo_ref)(struct radeon_bo *bo);
-    struct radeon_bo *(*bo_unref)(struct radeon_bo *bo);
-    int (*bo_map)(struct radeon_bo *bo, int write);
-    int (*bo_unmap)(struct radeon_bo *bo);
-    int (*bo_wait)(struct radeon_bo *bo);
-    int (*bo_is_static)(struct radeon_bo *bo);
-    int (*bo_set_tiling)(struct radeon_bo *bo, uint32_t tiling_flags,
-			  uint32_t pitch);
-    int (*bo_get_tiling)(struct radeon_bo *bo, uint32_t *tiling_flags,
-			  uint32_t *pitch);
-    int (*bo_is_busy)(struct radeon_bo *bo, uint32_t *domain);
 };
 
-struct radeon_bo_manager {
-    struct radeon_bo_funcs  *funcs;
-    int                     fd;
-
-#ifdef RADEON_BO_TRACK
-    struct radeon_tracker   tracker;
-#endif
-};
-    
-static inline void _radeon_bo_debug(struct radeon_bo *bo,
-                                    const char *op,
-                                    const char *file,
-                                    const char *func,
-                                    int line)
-{
-    fprintf(stderr, "%s %p 0x%08X 0x%08X 0x%08X [%s %s %d]\n",
-            op, bo, bo->handle, bo->size, bo->cref, file, func, line);
-}
-
-static inline struct radeon_bo *_radeon_bo_open(struct radeon_bo_manager *bom,
-                                                uint32_t handle,
-                                                uint32_t size,
-                                                uint32_t alignment,
-                                                uint32_t domains,
-                                                uint32_t flags,
-                                                const char *file,
-                                                const char *func,
-                                                int line)
-{
-    struct radeon_bo *bo;
-
-    bo = bom->funcs->bo_open(bom, handle, size, alignment, domains, flags);
-
-#ifdef RADEON_BO_TRACK
-    if (bo) {
-        bo->track = radeon_tracker_add_track(&bom->tracker, bo->handle);
-        radeon_track_add_event(bo->track, file, func, "open", line);
-    }
-#endif
-    return bo;
-}
-
-static inline void _radeon_bo_ref(struct radeon_bo *bo,
-                                  const char *file,
-                                  const char *func,
-                                  int line)
-{
-    bo->cref++;
-#ifdef RADEON_BO_TRACK
-    radeon_track_add_event(bo->track, file, func, "ref", line); 
-#endif
-    bo->bom->funcs->bo_ref(bo);
-}
-
-static inline struct radeon_bo *_radeon_bo_unref(struct radeon_bo *bo,
-                                                 const char *file,
-                                                 const char *func,
-                                                 int line)
-{
-    bo->cref--;
-#ifdef RADEON_BO_TRACK
-    radeon_track_add_event(bo->track, file, func, "unref", line);
-    if (bo->cref <= 0) {
-        radeon_tracker_remove_track(&bo->bom->tracker, bo->track);
-        bo->track = NULL;
-    }
-#endif
-    return bo->bom->funcs->bo_unref(bo);
-}
-
-static inline int _radeon_bo_map(struct radeon_bo *bo,
-                                 int write,
-                                 const char *file,
-                                 const char *func,
-                                 int line)
-{
-    return bo->bom->funcs->bo_map(bo, write);
-}
-
-static inline int _radeon_bo_unmap(struct radeon_bo *bo,
-                                   const char *file,
-                                   const char *func,
-                                   int line)
-{
-    return bo->bom->funcs->bo_unmap(bo);
-}
-
-static inline int _radeon_bo_wait(struct radeon_bo *bo,
-                                  const char *file,
-                                  const char *func,
-                                  int line)
-{
-    return bo->bom->funcs->bo_wait(bo);
-}
-
-static inline int _radeon_bo_is_busy(struct radeon_bo *bo,
-				     uint32_t *domain,
-                                     const char *file,
-                                     const char *func,
-                                     int line)
-{
-    return bo->bom->funcs->bo_is_busy(bo, domain);
-}
-
-static inline int radeon_bo_set_tiling(struct radeon_bo *bo,
-				       uint32_t tiling_flags, uint32_t pitch)
-{
-    return bo->bom->funcs->bo_set_tiling(bo, tiling_flags, pitch);
-}
-
-static inline int radeon_bo_get_tiling(struct radeon_bo *bo,
-				       uint32_t *tiling_flags, uint32_t *pitch)
-{
-    return bo->bom->funcs->bo_get_tiling(bo, tiling_flags, pitch);
-}
-
-static inline int radeon_bo_is_static(struct radeon_bo *bo)
-{
-	if (bo->bom->funcs->bo_is_static)
-		return bo->bom->funcs->bo_is_static(bo);
-	return 0;
-}
-
-#define radeon_bo_open(bom, h, s, a, d, f)\
-    _radeon_bo_open(bom, h, s, a, d, f, __FILE__, __FUNCTION__, __LINE__)
-#define radeon_bo_ref(bo)\
-    _radeon_bo_ref(bo, __FILE__, __FUNCTION__, __LINE__)
-#define radeon_bo_unref(bo)\
-    _radeon_bo_unref(bo, __FILE__, __FUNCTION__, __LINE__)
-#define radeon_bo_map(bo, w)\
-    _radeon_bo_map(bo, w, __FILE__, __FUNCTION__, __LINE__)
-#define radeon_bo_unmap(bo)\
-    _radeon_bo_unmap(bo, __FILE__, __FUNCTION__, __LINE__)
-#define radeon_bo_debug(bo, opcode)\
-    _radeon_bo_debug(bo, opcode, __FILE__, __FUNCTION__, __LINE__)
-#define radeon_bo_wait(bo) \
-    _radeon_bo_wait(bo, __FILE__, __func__, __LINE__)
-#define radeon_bo_is_busy(bo, domain) \
-    _radeon_bo_is_busy(bo, domain, __FILE__, __func__, __LINE__)
+struct radeon_bo_manager;
 
+void radeon_bo_debug(struct radeon_bo *bo,
+		     const char *op);
+
+struct radeon_bo *radeon_bo_open(struct radeon_bo_manager *bom,
+				  uint32_t handle,
+				  uint32_t size,
+				  uint32_t alignment,
+				  uint32_t domains,
+				  uint32_t flags);
+
+void radeon_bo_ref(struct radeon_bo *bo);
+struct radeon_bo *radeon_bo_unref(struct radeon_bo *bo);
+int radeon_bo_map(struct radeon_bo *bo, int write);
+int radeon_bo_unmap(struct radeon_bo *bo);
+int radeon_bo_wait(struct radeon_bo *bo);
+int radeon_bo_is_busy(struct radeon_bo *bo, uint32_t *domain);
+int radeon_bo_set_tiling(struct radeon_bo *bo, uint32_t tiling_flags, uint32_t pitch);
+int radeon_bo_get_tiling(struct radeon_bo *bo, uint32_t *tiling_flags, uint32_t *pitch);
+int radeon_bo_is_static(struct radeon_bo *bo);
+int radeon_bo_is_referenced_by_cs(struct radeon_bo *bo,
+				  struct radeon_cs *cs);
+uint32_t radeon_bo_get_handle(struct radeon_bo *bo);
+uint32_t radeon_bo_get_src_domain(struct radeon_bo *bo);
 #endif
diff --git a/src/mesa/drivers/dri/radeon/radeon_bo_int_drm.h b/src/mesa/drivers/dri/radeon/radeon_bo_int_drm.h
new file mode 100644
index 0000000000..190c332475
--- /dev/null
+++ b/src/mesa/drivers/dri/radeon/radeon_bo_int_drm.h
@@ -0,0 +1,45 @@
+#ifndef RADEON_BO_INT
+#define RADEON_BO_INT
+
+struct radeon_bo_manager {
+    struct radeon_bo_funcs  *funcs;
+    int                     fd;
+};
+
+struct radeon_bo_int {
+    void                        *ptr;
+    uint32_t                    flags;
+    uint32_t                    handle;
+    uint32_t                    size;
+    /* private members */
+    uint32_t                    alignment;
+    uint32_t                    domains;
+    unsigned                    cref;
+    struct radeon_bo_manager    *bom;
+    uint32_t                    space_accounted;
+    uint32_t                    referenced_in_cs;
+};
+
+/* bo functions */
+struct radeon_bo_funcs {
+    struct radeon_bo *(*bo_open)(struct radeon_bo_manager *bom,
+                                 uint32_t handle,
+                                 uint32_t size,
+                                 uint32_t alignment,
+                                 uint32_t domains,
+                                 uint32_t flags);
+    void (*bo_ref)(struct radeon_bo_int *bo);
+    struct radeon_bo *(*bo_unref)(struct radeon_bo_int *bo);
+    int (*bo_map)(struct radeon_bo_int *bo, int write);
+    int (*bo_unmap)(struct radeon_bo_int *bo);
+    int (*bo_wait)(struct radeon_bo_int *bo);
+    int (*bo_is_static)(struct radeon_bo_int *bo);
+    int (*bo_set_tiling)(struct radeon_bo_int *bo, uint32_t tiling_flags,
+			  uint32_t pitch);
+    int (*bo_get_tiling)(struct radeon_bo_int *bo, uint32_t *tiling_flags,
+			  uint32_t *pitch);
+    int (*bo_is_busy)(struct radeon_bo_int *bo, uint32_t *domain);
+    int (*bo_is_referenced_by_cs)(struct radeon_bo_int *bo, struct radeon_cs *cs);
+};
+
+#endif
diff --git a/src/mesa/drivers/dri/radeon/radeon_bo_legacy.c b/src/mesa/drivers/dri/radeon/radeon_bo_legacy.c
index ce60a2f7ea..cf12664bac 100644
--- a/src/mesa/drivers/dri/radeon/radeon_bo_legacy.c
+++ b/src/mesa/drivers/dri/radeon/radeon_bo_legacy.c
@@ -50,6 +50,12 @@
 #include "radeon_bocs_wrapper.h"
 #include "radeon_macros.h"
 
+#ifdef HAVE_LIBDRM_RADEON
+#include "radeon_bo_int.h"
+#else
+#include "radeon_bo_int_drm.h"
+#endif
+
 /* no seriously texmem.c is this screwed up */
 struct bo_legacy_texture_object {
     driTextureObject    base;
@@ -57,7 +63,7 @@ struct bo_legacy_texture_object {
 };
 
 struct bo_legacy {
-    struct radeon_bo    base;
+    struct radeon_bo_int    base;
     int                 map_count;
     uint32_t            pending;
     int                 is_pending;
@@ -187,10 +193,10 @@ static void legacy_get_current_age(struct bo_manager_legacy *boml)
     }
 }
 
-static int legacy_is_pending(struct radeon_bo *bo)
+static int legacy_is_pending(struct radeon_bo_int *boi)
 {
-    struct bo_manager_legacy *boml = (struct bo_manager_legacy *)bo->bom;
-    struct bo_legacy *bo_legacy = (struct bo_legacy*)bo;
+    struct bo_manager_legacy *boml = (struct bo_manager_legacy *)boi->bom;
+    struct bo_legacy *bo_legacy = (struct bo_legacy*)boi;
 
     if (bo_legacy->is_pending <= 0) {
         bo_legacy->is_pending = 0;
@@ -204,13 +210,13 @@ static int legacy_is_pending(struct radeon_bo *bo)
         if (bo_legacy->pnext) {
             bo_legacy->pnext->pprev = bo_legacy->pprev;
         }
-	assert(bo_legacy->is_pending <= bo->cref);
+	assert(bo_legacy->is_pending <= boi->cref);
         while (bo_legacy->is_pending--) {
-	    bo = radeon_bo_unref(bo);
-	    if (!bo)
+	    boi = (struct radeon_bo_int *)radeon_bo_unref((struct radeon_bo *)boi);
+	    if (!boi)
 	      break;
         }
-	if (bo)
+	if (boi)
 	  bo_legacy->is_pending = 0;
         boml->cpendings--;
         return 0;
@@ -218,7 +224,7 @@ static int legacy_is_pending(struct radeon_bo *bo)
     return 1;
 }
 
-static int legacy_wait_pending(struct radeon_bo *bo)
+static int legacy_wait_pending(struct radeon_bo_int *bo)
 {
     struct bo_manager_legacy *boml = (struct bo_manager_legacy *)bo->bom;
     struct bo_legacy *bo_legacy = (struct bo_legacy*)bo;
@@ -323,7 +329,7 @@ static struct bo_legacy *bo_allocate(struct bo_manager_legacy *boml,
     return bo_legacy;
 }
 
-static int bo_dma_alloc(struct radeon_bo *bo)
+static int bo_dma_alloc(struct radeon_bo_int *bo)
 {
     struct bo_manager_legacy *boml = (struct bo_manager_legacy *)bo->bom;
     struct bo_legacy *bo_legacy = (struct bo_legacy*)bo;
@@ -333,7 +339,7 @@ static int bo_dma_alloc(struct radeon_bo *bo)
     int r;
 
     /* align size on 4Kb */
-    size = (((4 * 1024) - 1) + bo->size) & ~((4 * 1024) - 1);
+    size = (((4 * 1024) - 1) + bo_legacy->base.size) & ~((4 * 1024) - 1);
     alloc.region = RADEON_MEM_REGION_GART;
     alloc.alignment = bo_legacy->base.alignment;
     alloc.size = size;
@@ -355,7 +361,7 @@ static int bo_dma_alloc(struct radeon_bo *bo)
     return 0;
 }
 
-static int bo_dma_free(struct radeon_bo *bo)
+static int bo_dma_free(struct radeon_bo_int *bo)
 {
     struct bo_manager_legacy *boml = (struct bo_manager_legacy *)bo->bom;
     struct bo_legacy *bo_legacy = (struct bo_legacy*)bo;
@@ -428,7 +434,7 @@ static struct radeon_bo *bo_open(struct radeon_bo_manager *bom,
         bo_legacy = boml->bos.next;
         while (bo_legacy) {
             if (bo_legacy->base.handle == handle) {
-                radeon_bo_ref(&(bo_legacy->base));
+                radeon_bo_ref((struct radeon_bo *)&(bo_legacy->base));
                 return (struct radeon_bo*)bo_legacy;
             }
             bo_legacy = bo_legacy->next;
@@ -468,20 +474,20 @@ retry:
             return NULL;
         }
     }
-    radeon_bo_ref(&(bo_legacy->base));
+    radeon_bo_ref((struct radeon_bo *)&(bo_legacy->base));
 
     return (struct radeon_bo*)bo_legacy;
 }
 
-static void bo_ref(struct radeon_bo *bo)
+static void bo_ref(struct radeon_bo_int *bo)
 {
 }
 
-static struct radeon_bo *bo_unref(struct radeon_bo *bo)
+static struct radeon_bo *bo_unref(struct radeon_bo_int *boi)
 {
-    struct bo_legacy *bo_legacy = (struct bo_legacy*)bo;
+    struct bo_legacy *bo_legacy = (struct bo_legacy*)boi;
 
-    if (bo->cref <= 0) {
+    if (boi->cref <= 0) {
         bo_legacy->prev->next = bo_legacy->next;
         if (bo_legacy->next) {
             bo_legacy->next->prev = bo_legacy->prev;
@@ -491,10 +497,10 @@ static struct radeon_bo *bo_unref(struct radeon_bo *bo)
         }
         return NULL;
     }
-    return bo;
+    return (struct radeon_bo *)boi;
 }
 
-static int bo_map(struct radeon_bo *bo, int write)
+static int bo_map(struct radeon_bo_int *bo, int write)
 {
     struct bo_manager_legacy *boml = (struct bo_manager_legacy *)bo->bom;
     struct bo_legacy *bo_legacy = (struct bo_legacy*)bo;
@@ -528,7 +534,7 @@ static int bo_map(struct radeon_bo *bo, int write)
     return 0;
 }
 
-static int bo_unmap(struct radeon_bo *bo)
+static int bo_unmap(struct radeon_bo_int *bo)
 {
     struct bo_legacy *bo_legacy = (struct bo_legacy*)bo;
 
@@ -542,7 +548,7 @@ static int bo_unmap(struct radeon_bo *bo)
     return 0;
 }
 
-static int bo_is_busy(struct radeon_bo *bo, uint32_t *domain)
+static int bo_is_busy(struct radeon_bo_int *bo, uint32_t *domain)
 {
     *domain = 0;
     if (bo->domains & RADEON_GEM_DOMAIN_GTT)
@@ -555,7 +561,7 @@ static int bo_is_busy(struct radeon_bo *bo, uint32_t *domain)
         return 0;
 }
 
-static int bo_is_static(struct radeon_bo *bo)
+static int bo_is_static(struct radeon_bo_int *bo)
 {
     struct bo_legacy *bo_legacy = (struct bo_legacy*)bo;
     return bo_legacy->static_bo;
@@ -574,7 +580,7 @@ static struct radeon_bo_funcs bo_legacy_funcs = {
     bo_is_busy
 };
 
-static int bo_vram_validate(struct radeon_bo *bo,
+static int bo_vram_validate(struct radeon_bo_int *bo,
                             uint32_t *soffset,
                             uint32_t *eoffset)
 {
@@ -700,29 +706,30 @@ int radeon_bo_legacy_validate(struct radeon_bo *bo,
                               uint32_t *soffset,
                               uint32_t *eoffset)
 {
-    struct bo_manager_legacy *boml = (struct bo_manager_legacy *)bo->bom;
+    struct radeon_bo_int *boi = (struct radeon_bo_int *)bo;
+    struct bo_manager_legacy *boml = (struct bo_manager_legacy *)boi->bom;
     struct bo_legacy *bo_legacy = (struct bo_legacy*)bo;
     int r;
     int retries = 0;
 
     if (bo_legacy->map_count) {
         fprintf(stderr, "bo(%p, %d) is mapped (%d) can't valide it.\n",
-                bo, bo->size, bo_legacy->map_count);
+                bo, boi->size, bo_legacy->map_count);
         return -EINVAL;
     }
-    if(bo->size == 0) {
+    if(boi->size == 0) {
         fprintf(stderr, "bo(%p) has size 0.\n", bo);
         return -EINVAL;
     }
     if (bo_legacy->static_bo || bo_legacy->validated) {
         *soffset = bo_legacy->offset;
-        *eoffset = bo_legacy->offset + bo->size;
+        *eoffset = bo_legacy->offset + boi->size;
 
         return 0;
     }
-    if (!(bo->domains & RADEON_GEM_DOMAIN_GTT)) {
+    if (!(boi->domains & RADEON_GEM_DOMAIN_GTT)) {
 
-        r = bo_vram_validate(bo, soffset, eoffset);
+        r = bo_vram_validate(boi, soffset, eoffset);
         if (r) {
 	    legacy_track_pending(&boml->base, 0);
 	    legacy_kick_all_buffers(boml);
@@ -736,7 +743,7 @@ int radeon_bo_legacy_validate(struct radeon_bo *bo,
         }
     }
     *soffset = bo_legacy->offset;
-    *eoffset = bo_legacy->offset + bo->size;
+    *eoffset = bo_legacy->offset + boi->size;
     bo_legacy->validated = 1;
 
     return 0;
@@ -744,7 +751,8 @@ int radeon_bo_legacy_validate(struct radeon_bo *bo,
 
 void radeon_bo_legacy_pending(struct radeon_bo *bo, uint32_t pending)
 {
-    struct bo_manager_legacy *boml = (struct bo_manager_legacy *)bo->bom;
+    struct radeon_bo_int *boi = (struct radeon_bo_int *)bo;
+    struct bo_manager_legacy *boml = (struct bo_manager_legacy *)boi->bom;
     struct bo_legacy *bo_legacy = (struct bo_legacy*)bo;
 
     bo_legacy->pending = pending;
@@ -799,7 +807,7 @@ static struct bo_legacy *radeon_legacy_bo_alloc_static(struct bo_manager_legacy
     if (bo->base.handle > bom->nhandle) {
         bom->nhandle = bo->base.handle + 1;
     }
-    radeon_bo_ref(&(bo->base));
+    radeon_bo_ref((struct radeon_bo *)&(bo->base));
     return bo;
 }
 
@@ -894,12 +902,13 @@ void radeon_bo_legacy_texture_age(struct radeon_bo_manager *bom)
 
 unsigned radeon_bo_legacy_relocs_size(struct radeon_bo *bo)
 {
+    struct radeon_bo_int *boi = (struct radeon_bo_int *)bo;
     struct bo_legacy *bo_legacy = (struct bo_legacy*)bo;
 
-    if (bo_legacy->static_bo || (bo->domains & RADEON_GEM_DOMAIN_GTT)) {
+    if (bo_legacy->static_bo || (boi->domains & RADEON_GEM_DOMAIN_GTT)) {
         return 0;
     }
-    return bo->size;
+    return boi->size;
 }
 
 /*
@@ -924,7 +933,7 @@ struct radeon_bo *radeon_legacy_bo_alloc_fake(struct radeon_bo_manager *bom,
     if (bo->base.handle > boml->nhandle) {
         boml->nhandle = bo->base.handle + 1;
     }
-    radeon_bo_ref(&(bo->base));
-    return &(bo->base);
+    radeon_bo_ref((struct radeon_bo *)&(bo->base));
+    return (struct radeon_bo *)&(bo->base);
 }
 
diff --git a/src/mesa/drivers/dri/radeon/radeon_bocs_wrapper.h b/src/mesa/drivers/dri/radeon/radeon_bocs_wrapper.h
index 4520a7d7d4..6c2648b6bd 100644
--- a/src/mesa/drivers/dri/radeon/radeon_bocs_wrapper.h
+++ b/src/mesa/drivers/dri/radeon/radeon_bocs_wrapper.h
@@ -18,8 +18,11 @@
 #define RADEON_TILING_MACRO 0x1
 #define RADEON_TILING_MICRO 0x2
 #define RADEON_TILING_SWAP 0x4
+
+#ifndef RADEON_TILING_SURFACE
 #define RADEON_TILING_SURFACE 0x8 /* this object requires a surface
 				   * when mapped - i.e. front buffer */
+#endif
 
 /* to be used to build locally in mesa with no libdrm bits */
 #include "../radeon/radeon_bo_drm.h"
diff --git a/src/mesa/drivers/dri/radeon/radeon_buffer_objects.c b/src/mesa/drivers/dri/radeon/radeon_buffer_objects.c
index 8fac5c6c51..99d3ec7005 100644
--- a/src/mesa/drivers/dri/radeon/radeon_buffer_objects.c
+++ b/src/mesa/drivers/dri/radeon/radeon_buffer_objects.c
@@ -136,8 +136,13 @@ radeonBufferSubData(GLcontext * ctx,
                     const GLvoid * data,
                     struct gl_buffer_object *obj)
 {
+    radeonContextPtr radeon = RADEON_CONTEXT(ctx);
     struct radeon_buffer_object *radeon_obj = get_radeon_buffer_object(obj);
 
+    if (radeon_bo_is_referenced_by_cs(radeon_obj->bo, radeon->cmdbuf.cs)) {
+        radeon_firevertices(radeon);
+    }
+
     radeon_bo_map(radeon_obj->bo, GL_TRUE);
 
     _mesa_memcpy(radeon_obj->bo->ptr + offset, data, size);
diff --git a/src/mesa/drivers/dri/radeon/radeon_common.c b/src/mesa/drivers/dri/radeon/radeon_common.c
index 097ab7cf61..e0b853bc97 100644
--- a/src/mesa/drivers/dri/radeon/radeon_common.c
+++ b/src/mesa/drivers/dri/radeon/radeon_common.c
@@ -137,7 +137,7 @@ void radeon_get_cliprects(radeonContextPtr radeon,
 			  unsigned int *num_cliprects,
 			  int *x_off, int *y_off)
 {
-	__DRIdrawablePrivate *dPriv = radeon_get_drawable(radeon);
+	__DRIdrawable *dPriv = radeon_get_drawable(radeon);
 	struct radeon_framebuffer *rfb = dPriv->driverPrivate;
 
 	if (radeon->constant_cliprect) {
@@ -169,8 +169,8 @@ void radeon_get_cliprects(radeonContextPtr radeon,
  */
 void radeonSetCliprects(radeonContextPtr radeon)
 {
-	__DRIdrawablePrivate *const drawable = radeon_get_drawable(radeon);
-	__DRIdrawablePrivate *const readable = radeon_get_readable(radeon);
+	__DRIdrawable *const drawable = radeon_get_drawable(radeon);
+	__DRIdrawable *const readable = radeon_get_readable(radeon);
 	struct radeon_framebuffer *const draw_rfb = drawable->driverPrivate;
 	struct radeon_framebuffer *const read_rfb = readable->driverPrivate;
 	int x_off, y_off;
@@ -229,7 +229,7 @@ void radeonUpdateScissor( GLcontext *ctx )
 	}
 	if (!rmesa->radeonScreen->kernel_mm) {
 	   /* Fix scissors for dri 1 */
-	   __DRIdrawablePrivate *dPriv = radeon_get_drawable(rmesa);
+	   __DRIdrawable *dPriv = radeon_get_drawable(rmesa);
 	   x1 += dPriv->x;
 	   x2 += dPriv->x + 1;
 	   min_x += dPriv->x;
@@ -262,29 +262,6 @@ void radeonScissor(GLcontext* ctx, GLint x, GLint y, GLsizei w, GLsizei h)
 	}
 }
 
-void radeonPolygonStipplePreKMS( GLcontext *ctx, const GLubyte *mask )
-{
-   radeonContextPtr radeon = RADEON_CONTEXT(ctx);
-   GLuint i;
-   drm_radeon_stipple_t stipple;
-
-   /* Must flip pattern upside down.
-   */
-   for ( i = 0 ; i < 32 ; i++ ) {
-      stipple.mask[31 - i] = ((GLuint *) mask)[i];
-   }
-
-   /* TODO: push this into cmd mechanism
-   */
-   radeon_firevertices(radeon);
-   LOCK_HARDWARE( radeon );
-
-   drmCommandWrite( radeon->dri.fd, DRM_RADEON_STIPPLE,
-	 &stipple, sizeof(stipple) );
-   UNLOCK_HARDWARE( radeon );
-}
-
-
 /* ================================================================
  * SwapBuffers with client-side throttling
  */
@@ -451,7 +428,7 @@ static void radeon_flip_renderbuffers(struct radeon_framebuffer *rfb)
 
 /* Copy the back color buffer to the front color buffer.
  */
-void radeonCopyBuffer( __DRIdrawablePrivate *dPriv,
+void radeonCopyBuffer( __DRIdrawable *dPriv,
 		       const drm_clip_rect_t	  *rect)
 {
 	radeonContextPtr rmesa;
@@ -519,7 +496,7 @@ void radeonCopyBuffer( __DRIdrawablePrivate *dPriv,
 	UNLOCK_HARDWARE( rmesa );
 }
 
-static int radeonScheduleSwap(__DRIdrawablePrivate *dPriv, GLboolean *missed_target)
+static int radeonScheduleSwap(__DRIdrawable *dPriv, GLboolean *missed_target)
 {
 	radeonContextPtr rmesa;
 
@@ -542,11 +519,11 @@ static int radeonScheduleSwap(__DRIdrawablePrivate *dPriv, GLboolean *missed_tar
 	return 0;
 }
 
-static GLboolean radeonPageFlip( __DRIdrawablePrivate *dPriv )
+static GLboolean radeonPageFlip( __DRIdrawable *dPriv )
 {
 	radeonContextPtr radeon;
 	GLint ret;
-	__DRIscreenPrivate *psp;
+	__DRIscreen *psp;
 	struct radeon_renderbuffer *rrb;
 	struct radeon_framebuffer *rfb;
 
@@ -594,10 +571,10 @@ static GLboolean radeonPageFlip( __DRIdrawablePrivate *dPriv )
 /**
  * Swap front and back buffer.
  */
-void radeonSwapBuffers(__DRIdrawablePrivate * dPriv)
+void radeonSwapBuffers(__DRIdrawable * dPriv)
 {
 	int64_t ust;
-	__DRIscreenPrivate *psp;
+	__DRIscreen *psp;
 
 	if (dPriv->driContextPriv && dPriv->driContextPriv->driverPrivate) {
 		radeonContextPtr radeon;
@@ -638,7 +615,7 @@ void radeonSwapBuffers(__DRIdrawablePrivate * dPriv)
 	}
 }
 
-void radeonCopySubBuffer(__DRIdrawablePrivate * dPriv,
+void radeonCopySubBuffer(__DRIdrawable * dPriv,
 			 int x, int y, int w, int h )
 {
 	if (dPriv->driContextPriv && dPriv->driContextPriv->driverPrivate) {
@@ -664,6 +641,27 @@ void radeonCopySubBuffer(__DRIdrawablePrivate * dPriv,
 	}
 }
 
+/**
+ * Check if we're about to draw into the front color buffer.
+ * If so, set the radeon->front_buffer_dirty field to GL_TRUE.
+ */
+void
+radeon_check_front_buffer_rendering(GLcontext *ctx)
+{
+	radeonContextPtr radeon = RADEON_CONTEXT(ctx);
+	const struct gl_framebuffer *fb = ctx->DrawBuffer;
+
+	if (fb->Name == 0) {
+		/* drawing to window system buffer */
+		if (fb->_NumColorDrawBuffers > 0) {
+			if (fb->_ColorDrawBufferIndexes[0] == BUFFER_FRONT_LEFT) {
+				radeon->front_buffer_dirty = GL_TRUE;
+			}
+		}
+	}
+}
+
+
 void radeon_draw_buffer(GLcontext *ctx, struct gl_framebuffer *fb)
 {
 	radeonContextPtr radeon = RADEON_CONTEXT(ctx);
@@ -840,7 +838,7 @@ void radeonDrawBuffer( GLcontext *ctx, GLenum mode )
        */
 		if (!was_front_buffer_rendering && radeon->is_front_buffer_rendering) {
 			radeon_update_renderbuffers(radeon->dri.context,
-				radeon->dri.context->driDrawablePriv);
+				radeon->dri.context->driDrawablePriv, GL_FALSE);
       }
 	}
 
@@ -857,7 +855,7 @@ void radeonReadBuffer( GLcontext *ctx, GLenum mode )
 
 		if (!was_front_buffer_reading && rmesa->is_front_buffer_reading) {
 			radeon_update_renderbuffers(rmesa->dri.context,
-						    rmesa->dri.context->driReadablePriv);
+						    rmesa->dri.context->driReadablePriv, GL_FALSE);
 	 	}
 	}
 	/* nothing, until we implement h/w glRead/CopyPixels or CopyTexImage */
@@ -908,9 +906,9 @@ void radeon_viewport(GLcontext *ctx, GLint x, GLint y, GLsizei width, GLsizei he
 		if (radeon->is_front_buffer_rendering) {
 			ctx->Driver.Flush(ctx);
 		}
-		radeon_update_renderbuffers(driContext, driContext->driDrawablePriv);
+		radeon_update_renderbuffers(driContext, driContext->driDrawablePriv, GL_FALSE);
 		if (driContext->driDrawablePriv != driContext->driReadablePriv)
-			radeon_update_renderbuffers(driContext, driContext->driReadablePriv);
+			radeon_update_renderbuffers(driContext, driContext->driReadablePriv, GL_FALSE);
 	}
 
 	old_viewport = ctx->Driver.Viewport;
@@ -1118,22 +1116,21 @@ void radeonFlush(GLcontext *ctx)
 	   then no point flushing anything at all.
 	*/
 	if (!radeon->dma.flush && !radeon->cmdbuf.cs->cdw && is_empty_list(&radeon->dma.reserved))
-		return;
+		goto flush_front;
 
 	if (radeon->dma.flush)
 		radeon->dma.flush( ctx );
 
-	radeonEmitState(radeon);
-
 	if (radeon->cmdbuf.cs->cdw)
 		rcommonFlushCmdBuf(radeon, __FUNCTION__);
 
+flush_front:
 	if ((ctx->DrawBuffer->Name == 0) && radeon->front_buffer_dirty) {
 		__DRIscreen *const screen = radeon->radeonScreen->driScreen;
 
 		if (screen->dri2.loader && (screen->dri2.loader->base.version >= 2)
 			&& (screen->dri2.loader->flushFrontBuffer != NULL)) {
-			__DRIdrawablePrivate * drawable = radeon_get_drawable(radeon);
+			__DRIdrawable * drawable = radeon_get_drawable(radeon);
 			(*screen->dri2.loader->flushFrontBuffer)(drawable, drawable->loaderPrivate);
 
 			/* Only clear the dirty bit if front-buffer rendering is no longer
@@ -1147,9 +1144,6 @@ void radeonFlush(GLcontext *ctx)
 			}
 		}
 	}
-
-	make_empty_list(&radeon->query.not_flushed_head);
-
 }
 
 /* Make sure all commands have been sent to the hardware and have
@@ -1236,7 +1230,7 @@ int rcommonFlushCmdBuf(radeonContextPtr rmesa, const char *caller)
 		fprintf(stderr, "drmRadeonCmdBuffer: %d. Kernel failed to "
 				"parse or rejected command stream. See dmesg "
 				"for more info.\n", ret);
-		_mesa_exit(ret);
+		exit(ret);
 	}
 
 	return ret;
diff --git a/src/mesa/drivers/dri/radeon/radeon_common.h b/src/mesa/drivers/dri/radeon/radeon_common.h
index def0cc17a9..f31f08edf3 100644
--- a/src/mesa/drivers/dri/radeon/radeon_common.h
+++ b/src/mesa/drivers/dri/radeon/radeon_common.h
@@ -10,14 +10,13 @@ void radeonRecalcScissorRects(radeonContextPtr radeon);
 void radeonSetCliprects(radeonContextPtr radeon);
 void radeonUpdateScissor( GLcontext *ctx );
 void radeonScissor(GLcontext* ctx, GLint x, GLint y, GLsizei w, GLsizei h);
-void radeonPolygonStipplePreKMS( GLcontext *ctx, const GLubyte *mask );
 
 void radeonWaitForIdleLocked(radeonContextPtr radeon);
 extern uint32_t radeonGetAge(radeonContextPtr radeon);
-void radeonCopyBuffer( __DRIdrawablePrivate *dPriv,
+void radeonCopyBuffer( __DRIdrawable *dPriv,
 		       const drm_clip_rect_t	  *rect);
-void radeonSwapBuffers(__DRIdrawablePrivate * dPriv);
-void radeonCopySubBuffer(__DRIdrawablePrivate * dPriv,
+void radeonSwapBuffers(__DRIdrawable * dPriv);
+void radeonCopySubBuffer(__DRIdrawable * dPriv,
 			 int x, int y, int w, int h );
 
 void radeonUpdatePageFlipping(radeonContextPtr rmesa);
@@ -43,7 +42,9 @@ void
 radeon_renderbuffer_set_bo(struct radeon_renderbuffer *rb,
 			   struct radeon_bo *bo);
 struct radeon_renderbuffer *
-radeon_create_renderbuffer(gl_format format, __DRIdrawablePrivate *driDrawPriv);
+radeon_create_renderbuffer(gl_format format, __DRIdrawable *driDrawPriv);
+
+void radeon_check_front_buffer_rendering(GLcontext *ctx);
 static inline struct radeon_renderbuffer *radeon_renderbuffer(struct gl_renderbuffer *rb)
 {
 	struct radeon_renderbuffer *rrb = (struct radeon_renderbuffer *)rb;
diff --git a/src/mesa/drivers/dri/radeon/radeon_common_context.c b/src/mesa/drivers/dri/radeon/radeon_common_context.c
index fe99644907..b9c29b937e 100644
--- a/src/mesa/drivers/dri/radeon/radeon_common_context.c
+++ b/src/mesa/drivers/dri/radeon/radeon_common_context.c
@@ -181,10 +181,10 @@ static void radeonInitDriverFuncs(struct dd_function_table *functions)
 GLboolean radeonInitContext(radeonContextPtr radeon,
 			    struct dd_function_table* functions,
 			    const __GLcontextModes * glVisual,
-			    __DRIcontextPrivate * driContextPriv,
+			    __DRIcontext * driContextPriv,
 			    void *sharedContextPrivate)
 {
-	__DRIscreenPrivate *sPriv = driContextPriv->driScreenPriv;
+	__DRIscreen *sPriv = driContextPriv->driScreenPriv;
 	radeonScreenPtr screen = (radeonScreenPtr) (sPriv->private);
 	GLcontext* ctx;
 	GLcontext* shareCtx;
@@ -262,10 +262,9 @@ GLboolean radeonInitContext(radeonContextPtr radeon,
 		else
 			radeon->texture_row_align = 32;
 		radeon->texture_rect_row_align = 64;
-		radeon->texture_compressed_row_align = 64;
+		radeon->texture_compressed_row_align = 32;
 	}
 
-	make_empty_list(&radeon->query.not_flushed_head);
 	radeon_init_dma(radeon);
 
 	return GL_TRUE;
@@ -292,7 +291,7 @@ static void radeon_destroy_atom_list(radeonContextPtr radeon)
  * Cleanup common context fields.
  * Called by r200DestroyContext/r300DestroyContext
  */
-void radeonDestroyContext(__DRIcontextPrivate *driContextPriv )
+void radeonDestroyContext(__DRIcontext *driContextPriv )
 {
 #ifdef RADEON_BO_TRACK
 	FILE *track;
@@ -356,7 +355,7 @@ void radeonDestroyContext(__DRIcontextPrivate *driContextPriv )
 
 /* Force the context `c' to be unbound from its buffer.
  */
-GLboolean radeonUnbindContext(__DRIcontextPrivate * driContextPriv)
+GLboolean radeonUnbindContext(__DRIcontext * driContextPriv)
 {
 	radeonContextPtr radeon = (radeonContextPtr) driContextPriv->driverPrivate;
 
@@ -500,7 +499,8 @@ radeon_bits_per_pixel(const struct radeon_renderbuffer *rb)
 }
 
 void
-radeon_update_renderbuffers(__DRIcontext *context, __DRIdrawable *drawable)
+radeon_update_renderbuffers(__DRIcontext *context, __DRIdrawable *drawable,
+			    GLboolean front_only)
 {
 	unsigned int attachments[10];
 	__DRIbuffer *buffers = NULL;
@@ -526,7 +526,7 @@ radeon_update_renderbuffers(__DRIcontext *context, __DRIdrawable *drawable)
 		struct radeon_renderbuffer *stencil_rb;
 
 		i = 0;
-		if ((radeon->is_front_buffer_rendering ||
+		if ((front_only || radeon->is_front_buffer_rendering ||
 		     radeon->is_front_buffer_reading ||
 		     !draw->color_rb[1])
 		    && draw->color_rb[0]) {
@@ -534,23 +534,25 @@ radeon_update_renderbuffers(__DRIcontext *context, __DRIdrawable *drawable)
 			attachments[i++] = radeon_bits_per_pixel(draw->color_rb[0]);
 		}
 
-		if (draw->color_rb[1]) {
-			attachments[i++] = __DRI_BUFFER_BACK_LEFT;
-			attachments[i++] = radeon_bits_per_pixel(draw->color_rb[1]);
-		}
+		if (!front_only) {
+			if (draw->color_rb[1]) {
+				attachments[i++] = __DRI_BUFFER_BACK_LEFT;
+				attachments[i++] = radeon_bits_per_pixel(draw->color_rb[1]);
+			}
 
-		depth_rb = radeon_get_renderbuffer(&draw->base, BUFFER_DEPTH);
-		stencil_rb = radeon_get_renderbuffer(&draw->base, BUFFER_STENCIL);
-
-		if ((depth_rb != NULL) && (stencil_rb != NULL)) {
-			attachments[i++] = __DRI_BUFFER_DEPTH_STENCIL;
-			attachments[i++] = radeon_bits_per_pixel(depth_rb);
-		} else if (depth_rb != NULL) {
-			attachments[i++] = __DRI_BUFFER_DEPTH;
-			attachments[i++] = radeon_bits_per_pixel(depth_rb);
-		} else if (stencil_rb != NULL) {
-			attachments[i++] = __DRI_BUFFER_STENCIL;
-			attachments[i++] = radeon_bits_per_pixel(stencil_rb);
+			depth_rb = radeon_get_renderbuffer(&draw->base, BUFFER_DEPTH);
+			stencil_rb = radeon_get_renderbuffer(&draw->base, BUFFER_STENCIL);
+
+			if ((depth_rb != NULL) && (stencil_rb != NULL)) {
+				attachments[i++] = __DRI_BUFFER_DEPTH_STENCIL;
+				attachments[i++] = radeon_bits_per_pixel(depth_rb);
+			} else if (depth_rb != NULL) {
+				attachments[i++] = __DRI_BUFFER_DEPTH;
+				attachments[i++] = radeon_bits_per_pixel(depth_rb);
+			} else if (stencil_rb != NULL) {
+				attachments[i++] = __DRI_BUFFER_STENCIL;
+				attachments[i++] = radeon_bits_per_pixel(stencil_rb);
+			}
 		}
 
 		buffers = (*screen->dri2.loader->getBuffersWithFormat)(drawable,
@@ -563,12 +565,14 @@ radeon_update_renderbuffers(__DRIcontext *context, __DRIdrawable *drawable)
 		i = 0;
 		if (draw->color_rb[0])
 			attachments[i++] = __DRI_BUFFER_FRONT_LEFT;
-		if (draw->color_rb[1])
-			attachments[i++] = __DRI_BUFFER_BACK_LEFT;
-		if (radeon_get_renderbuffer(&draw->base, BUFFER_DEPTH))
-			attachments[i++] = __DRI_BUFFER_DEPTH;
-		if (radeon_get_renderbuffer(&draw->base, BUFFER_STENCIL))
-			attachments[i++] = __DRI_BUFFER_STENCIL;
+		if (!front_only) {
+			if (draw->color_rb[1])
+				attachments[i++] = __DRI_BUFFER_BACK_LEFT;
+			if (radeon_get_renderbuffer(&draw->base, BUFFER_DEPTH))
+				attachments[i++] = __DRI_BUFFER_DEPTH;
+			if (radeon_get_renderbuffer(&draw->base, BUFFER_STENCIL))
+				attachments[i++] = __DRI_BUFFER_STENCIL;
+		}
 
 		buffers = (*screen->dri2.loader->getBuffers)(drawable,
 								 &drawable->w,
@@ -716,9 +720,9 @@ radeon_update_renderbuffers(__DRIcontext *context, __DRIdrawable *drawable)
 /* Force the context `c' to be the current context and associate with it
  * buffer `b'.
  */
-GLboolean radeonMakeCurrent(__DRIcontextPrivate * driContextPriv,
-			    __DRIdrawablePrivate * driDrawPriv,
-			    __DRIdrawablePrivate * driReadPriv)
+GLboolean radeonMakeCurrent(__DRIcontext * driContextPriv,
+			    __DRIdrawable * driDrawPriv,
+			    __DRIdrawable * driReadPriv)
 {
 	radeonContextPtr radeon;
 	struct radeon_framebuffer *drfb;
@@ -736,9 +740,9 @@ GLboolean radeonMakeCurrent(__DRIcontextPrivate * driContextPriv,
 	readfb = driReadPriv->driverPrivate;
 
 	if (driContextPriv->driScreenPriv->dri2.enabled) {
-		radeon_update_renderbuffers(driContextPriv, driDrawPriv);
+		radeon_update_renderbuffers(driContextPriv, driDrawPriv, GL_FALSE);
 		if (driDrawPriv != driReadPriv)
-			radeon_update_renderbuffers(driContextPriv, driReadPriv);
+			radeon_update_renderbuffers(driContextPriv, driReadPriv, GL_FALSE);
 		_mesa_reference_renderbuffer(&radeon->state.color.rb,
 			&(radeon_get_renderbuffer(&drfb->base, BUFFER_BACK_LEFT)->base));
 		_mesa_reference_renderbuffer(&radeon->state.depth.rb,
diff --git a/src/mesa/drivers/dri/radeon/radeon_common_context.h b/src/mesa/drivers/dri/radeon/radeon_common_context.h
index 0309345393..ab79d2dc0f 100644
--- a/src/mesa/drivers/dri/radeon/radeon_common_context.h
+++ b/src/mesa/drivers/dri/radeon/radeon_common_context.h
@@ -92,7 +92,7 @@ struct radeon_renderbuffer
 
 	GLuint pf_pending;  /**< sequence number of pending flip */
 	GLuint vbl_pending;   /**< vblank sequence number of pending flip */
-	__DRIdrawablePrivate *dPriv;
+	__DRIdrawable *dPriv;
 };
 
 struct radeon_framebuffer
@@ -208,6 +208,10 @@ struct radeon_tex_obj {
 	 * and so on.
 	 */
 	GLboolean validated;
+	/* Minimum LOD to be used during rendering */
+	unsigned minLod;
+	/* Maximum LOD to be used during rendering */
+	unsigned maxLod;
 
 	GLuint override_offset;
 	GLboolean image_override; /* Image overridden by GLX_EXT_tfp */
@@ -324,6 +328,7 @@ struct radeon_swtcl_info {
 	GLuint vertex_attr_count;
 
 	GLuint emit_prediction;
+        struct radeon_bo *bo;
 };
 
 #define RADEON_MAX_AOS_ARRAYS		16
@@ -376,8 +381,8 @@ struct radeon_store {
 };
 
 struct radeon_dri_mirror {
-	__DRIcontextPrivate *context;	/* DRI context */
-	__DRIscreenPrivate *screen;	/* DRI screen */
+	__DRIcontext *context;	/* DRI context */
+	__DRIscreen *screen;	/* DRI screen */
 
 	drm_context_t hwContext;
 	drm_hw_lock_t *hwLock;
@@ -401,9 +406,6 @@ struct radeon_state {
 	struct radeon_depthbuffer_state depth;
 	struct radeon_scissor_state scissor;
 	struct radeon_stencilbuffer_state stencil;
-
-	struct radeon_cs_space_check bos[RADEON_MAX_BOS];
-	int validated_bo_count;
 };
 
 /**
@@ -502,7 +504,6 @@ struct radeon_context {
 
    struct {
 	struct radeon_query_object *current;
-	struct radeon_query_object not_flushed_head;
 	struct radeon_state_atom queryobj;
    } query;
 
@@ -522,12 +523,12 @@ struct radeon_context {
 
 #define RADEON_CONTEXT(glctx) ((radeonContextPtr)(ctx->DriverCtx))
 
-static inline __DRIdrawablePrivate* radeon_get_drawable(radeonContextPtr radeon)
+static inline __DRIdrawable* radeon_get_drawable(radeonContextPtr radeon)
 {
 	return radeon->dri.context->driDrawablePriv;
 }
 
-static inline __DRIdrawablePrivate* radeon_get_readable(radeonContextPtr radeon)
+static inline __DRIdrawable* radeon_get_readable(radeonContextPtr radeon)
 {
 	return radeon->dri.context->driReadablePriv;
 }
@@ -580,15 +581,16 @@ static INLINE uint32_t radeonPackFloat24(float f)
 GLboolean radeonInitContext(radeonContextPtr radeon,
 			    struct dd_function_table* functions,
 			    const __GLcontextModes * glVisual,
-			    __DRIcontextPrivate * driContextPriv,
+			    __DRIcontext * driContextPriv,
 			    void *sharedContextPrivate);
 
 void radeonCleanupContext(radeonContextPtr radeon);
-GLboolean radeonUnbindContext(__DRIcontextPrivate * driContextPriv);
-void radeon_update_renderbuffers(__DRIcontext *context, __DRIdrawable *drawable);
-GLboolean radeonMakeCurrent(__DRIcontextPrivate * driContextPriv,
-			    __DRIdrawablePrivate * driDrawPriv,
-			    __DRIdrawablePrivate * driReadPriv);
-extern void radeonDestroyContext(__DRIcontextPrivate * driContextPriv);
+GLboolean radeonUnbindContext(__DRIcontext * driContextPriv);
+void radeon_update_renderbuffers(__DRIcontext *context, __DRIdrawable *drawable,
+				 GLboolean front_only);
+GLboolean radeonMakeCurrent(__DRIcontext * driContextPriv,
+			    __DRIdrawable * driDrawPriv,
+			    __DRIdrawable * driReadPriv);
+extern void radeonDestroyContext(__DRIcontext * driContextPriv);
 
 #endif
diff --git a/src/mesa/drivers/dri/radeon/radeon_context.c b/src/mesa/drivers/dri/radeon/radeon_context.c
index 5e700be4a5..3cd305b0a2 100644
--- a/src/mesa/drivers/dri/radeon/radeon_context.c
+++ b/src/mesa/drivers/dri/radeon/radeon_context.c
@@ -208,10 +208,10 @@ static void r100_init_vtbl(radeonContextPtr radeon)
  */
 GLboolean
 r100CreateContext( const __GLcontextModes *glVisual,
-                     __DRIcontextPrivate *driContextPriv,
+                     __DRIcontext *driContextPriv,
                      void *sharedContextPrivate)
 {
-   __DRIscreenPrivate *sPriv = driContextPriv->driScreenPriv;
+   __DRIscreen *sPriv = driContextPriv->driScreenPriv;
    radeonScreenPtr screen = (radeonScreenPtr)(sPriv->private);
    struct dd_function_table functions;
    r100ContextPtr rmesa;
diff --git a/src/mesa/drivers/dri/radeon/radeon_context.h b/src/mesa/drivers/dri/radeon/radeon_context.h
index 4e2c52c835..dfedc38bfd 100644
--- a/src/mesa/drivers/dri/radeon/radeon_context.h
+++ b/src/mesa/drivers/dri/radeon/radeon_context.h
@@ -331,8 +331,12 @@ struct r100_hw_state {
 	struct radeon_state_atom stp;
 };
 
+struct radeon_stipple_state {
+	GLuint mask[32];
+};
 
 struct r100_state {
+	struct radeon_stipple_state stipple;
 	struct radeon_texture_state texture;
 };
 
@@ -447,7 +451,7 @@ struct r100_context {
 #define RADEON_OLD_PACKETS 1
 
 extern GLboolean r100CreateContext( const __GLcontextModes *glVisual,
-				    __DRIcontextPrivate *driContextPriv,
+				    __DRIcontext *driContextPriv,
 				    void *sharedContextPrivate);
   
 
diff --git a/src/mesa/drivers/dri/radeon/radeon_cs.c b/src/mesa/drivers/dri/radeon/radeon_cs.c
new file mode 100644
index 0000000000..17e7433369
--- /dev/null
+++ b/src/mesa/drivers/dri/radeon/radeon_cs.c
@@ -0,0 +1,95 @@
+/* radeon_cs_* API: cast the public handle to struct radeon_cs_int, then dispatch via csi->csm->funcs. */
+#include <stdio.h>
+#include <stdint.h>
+#include "drm.h"
+#include "radeon_drm.h"
+#include "radeon_bocs_wrapper.h"
+#include "radeon_cs_int_drm.h"
+
+struct radeon_cs *radeon_cs_create(struct radeon_cs_manager *csm,
+			    uint32_t ndw)
+{
+    struct radeon_cs_int *csi = csm->funcs->cs_create(csm, ndw);
+    return (struct radeon_cs *)csi; /* callers see the radeon_cs_int through its public head */
+}
+
+int radeon_cs_write_reloc(struct radeon_cs *cs,
+			  struct radeon_bo *bo,
+			  uint32_t read_domain,
+			  uint32_t write_domain,
+			  uint32_t flags)
+{
+    struct radeon_cs_int *csi = (struct radeon_cs_int *)cs;
+
+    return csi->csm->funcs->cs_write_reloc(csi,
+					   bo,
+					   read_domain,
+					   write_domain,
+					   flags);
+}
+
+int radeon_cs_begin(struct radeon_cs *cs,
+		    uint32_t ndw,
+		    const char *file,
+		    const char *func,
+		    int line)
+{
+    struct radeon_cs_int *csi = (struct radeon_cs_int *)cs;
+    return csi->csm->funcs->cs_begin(csi, ndw, file, func, line);
+}
+
+int radeon_cs_end(struct radeon_cs *cs,
+		  const char *file,
+		  const char *func,
+		  int line)
+{
+    struct radeon_cs_int *csi = (struct radeon_cs_int *)cs;
+    return csi->csm->funcs->cs_end(csi, file, func, line);
+}
+
+int radeon_cs_emit(struct radeon_cs *cs)
+{
+    struct radeon_cs_int *csi = (struct radeon_cs_int *)cs;
+    return csi->csm->funcs->cs_emit(csi);
+}
+
+int radeon_cs_destroy(struct radeon_cs *cs)
+{
+    struct radeon_cs_int *csi = (struct radeon_cs_int *)cs;
+    return csi->csm->funcs->cs_destroy(csi);
+}
+
+int radeon_cs_erase(struct radeon_cs *cs)
+{
+    struct radeon_cs_int *csi = (struct radeon_cs_int *)cs;
+    return csi->csm->funcs->cs_erase(csi);
+}
+
+int radeon_cs_need_flush(struct radeon_cs *cs)
+{
+    struct radeon_cs_int *csi = (struct radeon_cs_int *)cs;
+    return csi->csm->funcs->cs_need_flush(csi);
+}
+
+void radeon_cs_print(struct radeon_cs *cs, FILE *file)
+{
+    struct radeon_cs_int *csi = (struct radeon_cs_int *)cs;
+    csi->csm->funcs->cs_print(csi, file);
+}
+
+void radeon_cs_set_limit(struct radeon_cs *cs, uint32_t domain, uint32_t limit)
+{
+    struct radeon_cs_int *csi = (struct radeon_cs_int *)cs;
+    if (domain == RADEON_GEM_DOMAIN_VRAM)
+	csi->csm->vram_limit = limit; /* the limit is stored on the manager, not on this cs */
+    else /* any other domain value sets the GART limit */
+	csi->csm->gart_limit = limit;
+}
+
+void radeon_cs_space_set_flush(struct radeon_cs *cs, void (*fn)(void *), void *data)
+{
+    struct radeon_cs_int *csi = (struct radeon_cs_int *)cs;
+    csi->space_flush_fn = fn; /* only stored here; nothing in this function calls it */
+    csi->space_flush_data = data;
+}
+
diff --git a/src/mesa/drivers/dri/radeon/radeon_cs_drm.h b/src/mesa/drivers/dri/radeon/radeon_cs_drm.h
index ab4eca31a3..a3f1750c6e 100644
--- a/src/mesa/drivers/dri/radeon/radeon_cs_drm.h
+++ b/src/mesa/drivers/dri/radeon/radeon_cs_drm.h
@@ -36,6 +36,7 @@
 #include <string.h>
 #include "drm.h"
 #include "radeon_drm.h"
+#include "radeon_bo_drm.h"
 
 struct radeon_cs_reloc {
     struct radeon_bo    *bo;
@@ -49,173 +50,41 @@ struct radeon_cs_reloc {
 #define RADEON_CS_SPACE_OP_TO_BIG 1
 #define RADEON_CS_SPACE_FLUSH 2
 
-struct radeon_cs_space_check {
-    struct radeon_bo *bo;
-    uint32_t read_domains;
-    uint32_t write_domain;
-    uint32_t new_accounted;
-};
-
-#define MAX_SPACE_BOS (32)
-
-struct radeon_cs_manager;
-
 struct radeon_cs {
-    struct radeon_cs_manager    *csm;
-    void                        *relocs;
-    uint32_t                    *packets;
-    unsigned                    crelocs;
-    unsigned                    relocs_total_size;
-    unsigned                    cdw;
-    unsigned                    ndw;
-    int                         section;
+    uint32_t *packets;
+    unsigned cdw;
+    unsigned ndw;
     unsigned                    section_ndw;
     unsigned                    section_cdw;
-    const char                  *section_file;
-    const char                  *section_func;
-    int                         section_line;
-    struct radeon_cs_space_check bos[MAX_SPACE_BOS];
-    int                         bo_count;
-    void                        (*space_flush_fn)(void *);
-    void                        *space_flush_data;
-};
-
-/* cs functions */
-struct radeon_cs_funcs {
-    struct radeon_cs *(*cs_create)(struct radeon_cs_manager *csm,
-                                   uint32_t ndw);
-    int (*cs_write_reloc)(struct radeon_cs *cs,
-                          struct radeon_bo *bo,
-                          uint32_t read_domain,
-                          uint32_t write_domain,
-                          uint32_t flags);
-    int (*cs_begin)(struct radeon_cs *cs,
-                    uint32_t ndw,
-                    const char *file,
-                    const char *func,
-                    int line);
-    int (*cs_end)(struct radeon_cs *cs,
-                  const char *file,
-                  const char *func,
-                  int line);
-    int (*cs_emit)(struct radeon_cs *cs);
-    int (*cs_destroy)(struct radeon_cs *cs);
-    int (*cs_erase)(struct radeon_cs *cs);
-    int (*cs_need_flush)(struct radeon_cs *cs);
-    void (*cs_print)(struct radeon_cs *cs, FILE *file);
-};
-
-struct radeon_cs_manager {
-    struct radeon_cs_funcs  *funcs;
-    int                     fd;
-    int32_t vram_limit, gart_limit;
-    int32_t vram_write_used, gart_write_used;
-    int32_t read_used;
 };
 
-static inline struct radeon_cs *radeon_cs_create(struct radeon_cs_manager *csm,
-                                                 uint32_t ndw)
-{
-    return csm->funcs->cs_create(csm, ndw);
-}
-
-static inline int radeon_cs_write_reloc(struct radeon_cs *cs,
-                                        struct radeon_bo *bo,
-                                        uint32_t read_domain,
-                                        uint32_t write_domain,
-                                        uint32_t flags)
-{
-    return cs->csm->funcs->cs_write_reloc(cs,
-                                          bo,
-                                          read_domain,
-                                          write_domain,
-                                          flags);
-}
-
-static inline int radeon_cs_begin(struct radeon_cs *cs,
-                                  uint32_t ndw,
-                                  const char *file,
-                                  const char *func,
-                                  int line)
-{
-    return cs->csm->funcs->cs_begin(cs, ndw, file, func, line);
-}
-
-static inline int radeon_cs_end(struct radeon_cs *cs,
-                                const char *file,
-                                const char *func,
-                                int line)
-{
-    return cs->csm->funcs->cs_end(cs, file, func, line);
-}
-
-static inline int radeon_cs_emit(struct radeon_cs *cs)
-{
-    return cs->csm->funcs->cs_emit(cs);
-}
-
-static inline int radeon_cs_destroy(struct radeon_cs *cs)
-{
-    return cs->csm->funcs->cs_destroy(cs);
-}
-
-static inline int radeon_cs_erase(struct radeon_cs *cs)
-{
-    return cs->csm->funcs->cs_erase(cs);
-}
-
-static inline int radeon_cs_need_flush(struct radeon_cs *cs)
-{
-    return cs->csm->funcs->cs_need_flush(cs);
-}
-
-static inline void radeon_cs_print(struct radeon_cs *cs, FILE *file)
-{
-    cs->csm->funcs->cs_print(cs, file);
-}
-
-static inline void radeon_cs_set_limit(struct radeon_cs *cs, uint32_t domain, uint32_t limit)
-{
-    
-    if (domain == RADEON_GEM_DOMAIN_VRAM)
-	cs->csm->vram_limit = limit;
-    else
-	cs->csm->gart_limit = limit;
-}
-
-static inline void radeon_cs_write_dword(struct radeon_cs *cs, uint32_t dword)
-{
-    cs->packets[cs->cdw++] = dword;
-    if (cs->section) {
-        cs->section_cdw++;
-    }
-}
-
-static inline void radeon_cs_write_qword(struct radeon_cs *cs, uint64_t qword)
-{
-
-    memcpy(cs->packets + cs->cdw, &qword, sizeof(qword));
-    cs->cdw+=2;
-    if (cs->section) {
-        cs->section_cdw+=2;
-    }
-}
-
-static inline void radeon_cs_write_table(struct radeon_cs *cs, void *data, uint32_t size)
-{
-    memcpy(cs->packets + cs->cdw, data, size * 4);
-    cs->cdw += size;
-    if (cs->section) {
-	    cs->section_cdw += size;
-    }
-}
+#define MAX_SPACE_BOS (32)
 
-static inline void radeon_cs_space_set_flush(struct radeon_cs *cs, void (*fn)(void *), void *data)
-{
-    cs->space_flush_fn = fn;
-    cs->space_flush_data = data;
-}
+struct radeon_cs_manager;
 
+extern struct radeon_cs *radeon_cs_create(struct radeon_cs_manager *csm,
+					  uint32_t ndw);
+
+extern int radeon_cs_begin(struct radeon_cs *cs,
+			   uint32_t ndw,
+			   const char *file,
+			   const char *func, int line);
+extern int radeon_cs_end(struct radeon_cs *cs,
+			 const char *file,
+			 const char *func,
+			 int line);
+extern int radeon_cs_emit(struct radeon_cs *cs);
+extern int radeon_cs_destroy(struct radeon_cs *cs);
+extern int radeon_cs_erase(struct radeon_cs *cs);
+extern int radeon_cs_need_flush(struct radeon_cs *cs);
+extern void radeon_cs_print(struct radeon_cs *cs, FILE *file);
+extern void radeon_cs_set_limit(struct radeon_cs *cs, uint32_t domain, uint32_t limit);
+extern void radeon_cs_space_set_flush(struct radeon_cs *cs, void (*fn)(void *), void *data);
+extern int radeon_cs_write_reloc(struct radeon_cs *cs,
+				 struct radeon_bo *bo,
+				 uint32_t read_domain,
+				 uint32_t write_domain,
+				 uint32_t flags);
 
 /*
  * add a persistent BO to the list
@@ -243,4 +112,30 @@ int radeon_cs_space_check_with_bo(struct radeon_cs *cs,
 				  uint32_t read_domains,
 				  uint32_t write_domain);
 
+static inline void radeon_cs_write_dword(struct radeon_cs *cs, uint32_t dword)
+{
+    cs->packets[cs->cdw++] = dword; /* appends one dword; no check against cs->ndw is made here */
+    if (cs->section_ndw) { /* non-zero section_ndw is the "inside a section" test */
+        cs->section_cdw++;
+    }
+}
+
+static inline void radeon_cs_write_qword(struct radeon_cs *cs, uint64_t qword)
+{
+    memcpy(cs->packets + cs->cdw, &qword, sizeof(uint64_t)); /* raw copy of the 64-bit value; no check against cs->ndw */
+    cs->cdw += 2; /* a qword occupies two dword slots */
+    if (cs->section_ndw) { /* non-zero section_ndw is the "inside a section" test */
+        cs->section_cdw += 2;
+    }
+}
+
+static inline void radeon_cs_write_table(struct radeon_cs *cs,
+					 void *data, uint32_t size)
+{
+    memcpy(cs->packets + cs->cdw, data, size * 4); /* size is a count of dwords, hence the * 4 bytes */
+    cs->cdw += size;
+    if (cs->section_ndw) { /* non-zero section_ndw is the "inside a section" test */
+	cs->section_cdw += size;
+    }
+}
 #endif
diff --git a/src/mesa/drivers/dri/radeon/radeon_cs_int_drm.h b/src/mesa/drivers/dri/radeon/radeon_cs_int_drm.h
new file mode 100644
index 0000000000..8ba76bf951
--- /dev/null
+++ b/src/mesa/drivers/dri/radeon/radeon_cs_int_drm.h
@@ -0,0 +1,66 @@
+
+#ifndef _RADEON_CS_INT_H_
+#define _RADEON_CS_INT_H_
+
+struct radeon_cs_space_check { /* one BO plus the domains it is used in, for CS space accounting */
+    struct radeon_bo_int *bo;
+    uint32_t read_domains;
+    uint32_t write_domain;
+    uint32_t new_accounted; /* written by radeon_cs_setup_bo(); for static BOs: (read_domains << 16) | write_domain */
+};
+
+struct radeon_cs_int {
+    /* public head: these five fields must match struct radeon_cs, which this struct is cast to and from */
+    uint32_t                    *packets;    
+    unsigned                    cdw; /* index of the next dword to be written in packets */
+    unsigned                    ndw;
+    unsigned                    section_ndw; /* tested as the "inside a section" flag by the write helpers */
+    unsigned                    section_cdw; /* dwords written since the section was begun */
+    /* private members */
+    struct radeon_cs_manager    *csm;
+    void                        *relocs;
+    unsigned                    crelocs;
+    unsigned                    relocs_total_size;
+    const char                  *section_file;
+    const char                  *section_func;
+    int                         section_line;
+    struct radeon_cs_space_check bos[MAX_SPACE_BOS]; /* filled by radeon_cs_space_add_persistent_bo() */
+    int                         bo_count; /* number of used entries in bos */
+    void                        (*space_flush_fn)(void *); /* set by radeon_cs_space_set_flush() */
+    void                        *space_flush_data; /* set together with space_flush_fn */
+};
+
+/* cs functions: backend hooks reached through radeon_cs_manager.funcs by the radeon_cs_* wrappers */
+struct radeon_cs_funcs {
+    struct radeon_cs_int *(*cs_create)(struct radeon_cs_manager *csm,
+                                   uint32_t ndw);
+    int (*cs_write_reloc)(struct radeon_cs_int *cs,
+                          struct radeon_bo *bo,
+                          uint32_t read_domain,
+                          uint32_t write_domain,
+                          uint32_t flags);
+    int (*cs_begin)(struct radeon_cs_int *cs,
+                    uint32_t ndw,
+		    const char *file,
+		    const char *func,
+		    int line);
+    int (*cs_end)(struct radeon_cs_int *cs,
+		  const char *file, const char *func,
+		  int line);
+
+
+    int (*cs_emit)(struct radeon_cs_int *cs);
+    int (*cs_destroy)(struct radeon_cs_int *cs);
+    int (*cs_erase)(struct radeon_cs_int *cs);
+    int (*cs_need_flush)(struct radeon_cs_int *cs);
+    void (*cs_print)(struct radeon_cs_int *cs, FILE *file);
+};
+
+struct radeon_cs_manager {
+    struct radeon_cs_funcs  *funcs; /* backend hook table the radeon_cs_* wrappers dispatch through */
+    int                     fd;
+    int32_t vram_limit, gart_limit; /* written by radeon_cs_set_limit() */
+    int32_t vram_write_used, gart_write_used;
+    int32_t read_used;
+};
+#endif
diff --git a/src/mesa/drivers/dri/radeon/radeon_cs_legacy.c b/src/mesa/drivers/dri/radeon/radeon_cs_legacy.c
index f1addb299e..bf46eb8aab 100644
--- a/src/mesa/drivers/dri/radeon/radeon_cs_legacy.c
+++ b/src/mesa/drivers/dri/radeon/radeon_cs_legacy.c
@@ -30,10 +30,18 @@
  *      Jérôme Glisse <glisse@freedesktop.org>
  */
 #include <errno.h>
+#include <unistd.h>
+#include <stdint.h>
+#include "drm.h"
+#include "radeon_drm.h"
 
 #include "radeon_bocs_wrapper.h"
 #include "radeon_common.h"
-
+#ifdef HAVE_LIBDRM_RADEON
+#include "radeon_cs_int.h"
+#else
+#include "radeon_cs_int_drm.h"
+#endif
 struct cs_manager_legacy {
     struct radeon_cs_manager    base;
     struct radeon_context       *ctx;
@@ -51,27 +59,27 @@ struct cs_reloc_legacy {
 };
 
 
-static struct radeon_cs *cs_create(struct radeon_cs_manager *csm,
-                                   uint32_t ndw)
+static struct radeon_cs_int *cs_create(struct radeon_cs_manager *csm,
+				       uint32_t ndw)
 {
-    struct radeon_cs *cs;
+    struct radeon_cs_int *csi;
 
-    cs = (struct radeon_cs*)calloc(1, sizeof(struct radeon_cs));
-    if (cs == NULL) {
+    csi = (struct radeon_cs_int*)calloc(1, sizeof(struct radeon_cs_int));
+    if (csi == NULL) {
         return NULL;
     }
-    cs->csm = csm;
-    cs->ndw = (ndw + 0x3FF) & (~0x3FF);
-    cs->packets = (uint32_t*)malloc(4*cs->ndw);
-    if (cs->packets == NULL) {
-        free(cs);
+    csi->csm = csm;
+    csi->ndw = (ndw + 0x3FF) & (~0x3FF);
+    csi->packets = (uint32_t*)malloc(4*csi->ndw);
+    if (csi->packets == NULL) {
+        free(csi);
         return NULL;
     }
-    cs->relocs_total_size = 0;
-    return cs;
+    csi->relocs_total_size = 0;
+    return csi;
 }
 
-static int cs_write_reloc(struct radeon_cs *cs,
+static int cs_write_reloc(struct radeon_cs_int *cs,
                           struct radeon_bo *bo,
                           uint32_t read_domain,
                           uint32_t write_domain,
@@ -150,20 +158,19 @@ static int cs_write_reloc(struct radeon_cs *cs,
     return 0;
 }
 
-static int cs_begin(struct radeon_cs *cs,
+static int cs_begin(struct radeon_cs_int *cs,
                     uint32_t ndw,
                     const char *file,
                     const char *func,
                     int line)
 {
-    if (cs->section) {
+    if (cs->section_ndw) {
         fprintf(stderr, "CS already in a section(%s,%s,%d)\n",
                 cs->section_file, cs->section_func, cs->section_line);
         fprintf(stderr, "CS can't start section(%s,%s,%d)\n",
                 file, func, line);
         return -EPIPE;
     }
-    cs->section = 1;
     cs->section_ndw = ndw;
     cs->section_cdw = 0;
     cs->section_file = file;
@@ -175,7 +182,7 @@ static int cs_begin(struct radeon_cs *cs,
         uint32_t tmp, *ptr;
 	int num = (ndw > 0x3FF) ? ndw : 0x3FF;
 
-        tmp = (cs->cdw + 1 + num) & (~num);
+        tmp = (cs->cdw + ndw + 0x3ff) & (~0x3ff);
         ptr = (uint32_t*)realloc(cs->packets, 4 * tmp);
         if (ptr == NULL) {
             return -ENOMEM;
@@ -187,18 +194,17 @@ static int cs_begin(struct radeon_cs *cs,
     return 0;
 }
 
-static int cs_end(struct radeon_cs *cs,
+static int cs_end(struct radeon_cs_int *cs,
                   const char *file,
                   const char *func,
                   int line)
 
 {
-    if (!cs->section) {
+    if (!cs->section_ndw) {
         fprintf(stderr, "CS no section to end at (%s,%s,%d)\n",
                 file, func, line);
         return -EPIPE;
     }
-    cs->section = 0;
     if (cs->section_ndw != cs->section_cdw) {
         fprintf(stderr, "CS section size missmatch start at (%s,%s,%d) %d vs %d\n",
                 cs->section_file, cs->section_func, cs->section_line, cs->section_ndw, cs->section_cdw);
@@ -206,10 +212,12 @@ static int cs_end(struct radeon_cs *cs,
                 file, func, line);
         return -EPIPE;
     }
+    cs->section_ndw = 0;
+
     return 0;
 }
 
-static int cs_process_relocs(struct radeon_cs *cs)
+static int cs_process_relocs(struct radeon_cs_int *cs)
 {
     struct cs_manager_legacy *csm = (struct cs_manager_legacy*)cs->csm;
     struct cs_reloc_legacy *relocs;
@@ -254,7 +262,7 @@ restart:
     return 0;
 }
 
-static int cs_set_age(struct radeon_cs *cs)
+static int cs_set_age(struct radeon_cs_int *cs)
 {
     struct cs_manager_legacy *csm = (struct cs_manager_legacy*)cs->csm;
     struct cs_reloc_legacy *relocs;
@@ -268,7 +276,7 @@ static int cs_set_age(struct radeon_cs *cs)
     return 0;
 }
 
-static int cs_emit(struct radeon_cs *cs)
+static int cs_emit(struct radeon_cs_int *cs)
 {
     struct cs_manager_legacy *csm = (struct cs_manager_legacy*)cs->csm;
     drm_radeon_cmd_buffer_t cmd;
@@ -276,7 +284,7 @@ static int cs_emit(struct radeon_cs *cs)
     uint64_t ull;
     int r;
 
-    csm->ctx->vtbl.emit_cs_header(cs, csm->ctx);
+    csm->ctx->vtbl.emit_cs_header((struct radeon_cs *)cs, csm->ctx);
 
     /* append buffer age */
     if ( IS_R300_CLASS(csm->ctx->radeonScreen) )
@@ -289,9 +297,9 @@ static int cs_emit(struct radeon_cs *cs)
       age.scratch.reg = 2;
       age.scratch.n_bufs = 1;
       age.scratch.flags = 0;
-      radeon_cs_write_dword(cs, age.u);
-      radeon_cs_write_qword(cs, ull);
-      radeon_cs_write_dword(cs, 0);
+      radeon_cs_write_dword((struct radeon_cs *)cs, age.u);
+      radeon_cs_write_qword((struct radeon_cs *)cs, ull);
+      radeon_cs_write_dword((struct radeon_cs *)cs, 0);
     }
 
     r = cs_process_relocs(cs);
@@ -342,7 +350,7 @@ static void inline cs_free_reloc(void *relocs_p, int crelocs)
       free(relocs[i].indices);
 }
 
-static int cs_destroy(struct radeon_cs *cs)
+static int cs_destroy(struct radeon_cs_int *cs)
 {
     cs_free_reloc(cs->relocs, cs->crelocs);
     free(cs->relocs);
@@ -351,7 +359,7 @@ static int cs_destroy(struct radeon_cs *cs)
     return 0;
 }
 
-static int cs_erase(struct radeon_cs *cs)
+static int cs_erase(struct radeon_cs_int *cs)
 {
     cs_free_reloc(cs->relocs, cs->crelocs);
     free(cs->relocs);
@@ -359,18 +367,18 @@ static int cs_erase(struct radeon_cs *cs)
     cs->relocs = NULL;
     cs->crelocs = 0;
     cs->cdw = 0;
-    cs->section = 0;
+    cs->section_ndw = 0;
     return 0;
 }
 
-static int cs_need_flush(struct radeon_cs *cs)
+static int cs_need_flush(struct radeon_cs_int *cs)
 {
     /* this function used to flush when the BO usage got to
      * a certain size, now the higher levels handle this better */
     return 0;
 }
 
-static void cs_print(struct radeon_cs *cs, FILE *file)
+static void cs_print(struct radeon_cs_int *cs, FILE *file)
 {
 }
 
diff --git a/src/mesa/drivers/dri/radeon/radeon_cs_space_drm.c b/src/mesa/drivers/dri/radeon/radeon_cs_space_drm.c
index 89cbbb5a6b..e22b437d56 100644
--- a/src/mesa/drivers/dri/radeon/radeon_cs_space_drm.c
+++ b/src/mesa/drivers/dri/radeon/radeon_cs_space_drm.c
@@ -29,6 +29,8 @@
 #include <errno.h>
 #include <stdlib.h>
 #include "radeon_bocs_wrapper.h"
+#include "radeon_bo_int_drm.h"
+#include "radeon_cs_int_drm.h"
 
 struct rad_sizes {
     int32_t op_read;
@@ -39,7 +41,7 @@ struct rad_sizes {
 static inline int radeon_cs_setup_bo(struct radeon_cs_space_check *sc, struct rad_sizes *sizes)
 {
     uint32_t read_domains, write_domain;
-    struct radeon_bo *bo;
+    struct radeon_bo_int *bo;
 
     bo = sc->bo;
     sc->new_accounted = 0;
@@ -47,7 +49,7 @@ static inline int radeon_cs_setup_bo(struct radeon_cs_space_check *sc, struct ra
     write_domain = sc->write_domain;
 
     /* legacy needs a static check */
-    if (radeon_bo_is_static(bo)) {
+    if (radeon_bo_is_static((struct radeon_bo *)sc->bo)) {
 	bo->space_accounted = sc->new_accounted = (read_domains << 16) | write_domain;
 	return 0;
     }
@@ -100,11 +102,11 @@ static inline int radeon_cs_setup_bo(struct radeon_cs_space_check *sc, struct ra
     return 0;
 }
 
-static int radeon_cs_do_space_check(struct radeon_cs *cs, struct radeon_cs_space_check *new_tmp)
+static int radeon_cs_do_space_check(struct radeon_cs_int *cs, struct radeon_cs_space_check *new_tmp)
 {
     struct radeon_cs_manager *csm = cs->csm;
     int i;
-    struct radeon_bo *bo;
+    struct radeon_bo_int *bo;
     struct rad_sizes sizes;
     int ret;
 
@@ -158,25 +160,28 @@ static int radeon_cs_do_space_check(struct radeon_cs *cs, struct radeon_cs_space
 
 void radeon_cs_space_add_persistent_bo(struct radeon_cs *cs, struct radeon_bo *bo, uint32_t read_domains, uint32_t write_domain)
 {
+    struct radeon_cs_int *csi = (struct radeon_cs_int *)cs;
+    struct radeon_bo_int *boi = (struct radeon_bo_int *)bo;
     int i;
-    for (i = 0; i < cs->bo_count; i++) {
-	if (cs->bos[i].bo == bo &&
-	    cs->bos[i].read_domains == read_domains &&
-	    cs->bos[i].write_domain == write_domain)
+    for (i = 0; i < csi->bo_count; i++) {
+	if (csi->bos[i].bo == boi &&
+	    csi->bos[i].read_domains == read_domains &&
+	    csi->bos[i].write_domain == write_domain)
 	    return;
     }
     radeon_bo_ref(bo);
-    i = cs->bo_count;
-    cs->bos[i].bo = bo;
-    cs->bos[i].read_domains = read_domains;
-    cs->bos[i].write_domain = write_domain;
-    cs->bos[i].new_accounted = 0;
-    cs->bo_count++;
-
-    assert(cs->bo_count < MAX_SPACE_BOS);
+    i = csi->bo_count;
+    csi->bos[i].bo = boi;
+    csi->bos[i].read_domains = read_domains;
+    csi->bos[i].write_domain = write_domain;
+    csi->bos[i].new_accounted = 0;
+    csi->bo_count++;
+
+    assert(csi->bo_count < MAX_SPACE_BOS);
 }
 
-static int radeon_cs_check_space_internal(struct radeon_cs *cs, struct radeon_cs_space_check *tmp_bo)
+static int radeon_cs_check_space_internal(struct radeon_cs_int *cs,
+					  struct radeon_cs_space_check *tmp_bo)
 {
     int ret;
     int flushed = 0;
@@ -198,37 +203,42 @@ again:
 int radeon_cs_space_check_with_bo(struct radeon_cs *cs,
 				  struct radeon_bo *bo,
 				  uint32_t read_domains, uint32_t write_domain)
-{									
+{
+    struct radeon_cs_int *csi = (struct radeon_cs_int *)cs;
+    struct radeon_bo_int *boi = (struct radeon_bo_int *)bo;
     struct radeon_cs_space_check temp_bo;
+    
     int ret = 0;
 
     if (bo) {
-	temp_bo.bo = bo;
+	temp_bo.bo = boi;
 	temp_bo.read_domains = read_domains;
 	temp_bo.write_domain = write_domain;
 	temp_bo.new_accounted = 0;
     }
 
-    ret = radeon_cs_check_space_internal(cs, bo ? &temp_bo : NULL);
+    ret = radeon_cs_check_space_internal(csi, bo ? &temp_bo : NULL);
     return ret;
 }
 
 int radeon_cs_space_check(struct radeon_cs *cs)
 {
-    return radeon_cs_check_space_internal(cs, NULL);
+    struct radeon_cs_int *csi = (struct radeon_cs_int *)cs;
+    return radeon_cs_check_space_internal(csi, NULL);
 }
 
 void radeon_cs_space_reset_bos(struct radeon_cs *cs)
 {
+    struct radeon_cs_int *csi = (struct radeon_cs_int *)cs;
     int i;
-    for (i = 0; i < cs->bo_count; i++) {
-	radeon_bo_unref(cs->bos[i].bo);
-	cs->bos[i].bo = NULL;
-	cs->bos[i].read_domains = 0;
-	cs->bos[i].write_domain = 0;
-	cs->bos[i].new_accounted = 0;
+    for (i = 0; i < csi->bo_count; i++) {
+	radeon_bo_unref((struct radeon_bo *)csi->bos[i].bo);
+	csi->bos[i].bo = NULL;
+	csi->bos[i].read_domains = 0;
+	csi->bos[i].write_domain = 0;
+	csi->bos[i].new_accounted = 0;
     }
-    cs->bo_count = 0;
+    csi->bo_count = 0;
 }
 
 
diff --git a/src/mesa/drivers/dri/radeon/radeon_dma.c b/src/mesa/drivers/dri/radeon/radeon_dma.c
index c6edbae9a1..d31e4e47dd 100644
--- a/src/mesa/drivers/dri/radeon/radeon_dma.c
+++ b/src/mesa/drivers/dri/radeon/radeon_dma.c
@@ -151,6 +151,7 @@ void rcommon_emit_vector(GLcontext * ctx, struct radeon_aos *aos,
 	aos->components = size;
 	aos->count = count;
 
+	radeon_bo_map(aos->bo, 1);
 	out = (uint32_t*)((char*)aos->bo->ptr + aos->offset);
 	switch (size) {
 	case 1: radeonEmitVec4(out, data, stride, count); break;
@@ -161,6 +162,7 @@ void rcommon_emit_vector(GLcontext * ctx, struct radeon_aos *aos,
 		assert(0);
 		break;
 	}
+	radeon_bo_unmap(aos->bo);
 }
 
 void radeon_init_dma(radeonContextPtr rmesa)
@@ -183,10 +185,6 @@ void radeonRefillCurrentDmaRegion(radeonContextPtr rmesa, int size)
 			__FUNCTION__, size, rmesa->dma.minimum_size);
 
 
-	/* unmap old reserved bo */
-	if (!is_empty_list(&rmesa->dma.reserved))
-		radeon_bo_unmap(first_elem(&rmesa->dma.reserved)->bo);
-
 	if (is_empty_list(&rmesa->dma.free)
 	      || last_elem(&rmesa->dma.free)->bo->size < size) {
 		dma_bo = CALLOC_STRUCT(radeon_dma_bo);
@@ -223,8 +221,6 @@ again_alloc:
         /* Cmd buff have been flushed in radeon_revalidate_bos */
 		goto again_alloc;
 	}
-
-	radeon_bo_map(first_elem(&rmesa->dma.reserved)->bo, 1);
 }
 
 /* Allocates a region from rmesa->dma.current.  If there isn't enough
@@ -281,7 +277,6 @@ void radeonFreeDmaRegions(radeonContextPtr rmesa)
 
 	foreach_s(dma_bo, temp, &rmesa->dma.reserved) {
 		remove_from_list(dma_bo);
-		radeon_bo_unmap(dma_bo->bo);
 	        radeon_bo_unref(dma_bo->bo);
 		FREE(dma_bo);
 	}
@@ -306,10 +301,6 @@ static int radeon_bo_is_idle(struct radeon_bo* bo)
 		WARN_ONCE("Your libdrm or kernel doesn't have support for busy query.\n"
 			"This may cause small performance drop for you.\n");
 	}
-	/* Protect against bug in legacy bo handling that causes bos stay
-	 * referenced even after they should be freed */
-	if (bo->cref != 1)
-		return 0;
 	return ret != -EBUSY;
 }
 
@@ -346,9 +337,7 @@ void radeonReleaseDmaRegions(radeonContextPtr rmesa)
 	foreach_s(dma_bo, temp, &rmesa->dma.wait) {
 		if (dma_bo->expire_counter == time) {
 			WARN_ONCE("Leaking dma buffer object!\n");
-			/* force free of buffer so we don't realy start
-			 * leaking stuff now*/
-			while ((dma_bo->bo = radeon_bo_unref(dma_bo->bo))) {}
+			radeon_bo_unref(dma_bo->bo);
 			remove_from_list(dma_bo);
 			FREE(dma_bo);
 			continue;
@@ -367,9 +356,6 @@ void radeonReleaseDmaRegions(radeonContextPtr rmesa)
 		insert_at_tail(&rmesa->dma.free, dma_bo);
 	}
 
-	/* unmap the last dma region */
-	if (!is_empty_list(&rmesa->dma.reserved))
-		radeon_bo_unmap(first_elem(&rmesa->dma.reserved)->bo);
 	/* move reserved to wait list */
 	foreach_s(dma_bo, temp, &rmesa->dma.reserved) {
 		/* free objects that are too small to be used because of large request */
@@ -403,11 +389,12 @@ void rcommon_flush_last_swtcl_prim( GLcontext *ctx  )
 	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
 	struct radeon_dma *dma = &rmesa->dma;
 		
-
 	if (RADEON_DEBUG & RADEON_IOCTL)
 		fprintf(stderr, "%s\n", __FUNCTION__);
 	dma->flush = NULL;
 
+	radeon_bo_unmap(rmesa->swtcl.bo);
+
 	if (!is_empty_list(&dma->reserved)) {
 	    GLuint current_offset = dma->current_used;
 
@@ -422,6 +409,8 @@ void rcommon_flush_last_swtcl_prim( GLcontext *ctx  )
 	    }
 	    rmesa->swtcl.numverts = 0;
 	}
+	radeon_bo_unref(rmesa->swtcl.bo);
+	rmesa->swtcl.bo = NULL;
 }
 /* Alloc space in the current dma region.
  */
@@ -432,6 +421,7 @@ rcommonAllocDmaLowVerts( radeonContextPtr rmesa, int nverts, int vsize )
 	void *head;
 	if (RADEON_DEBUG & RADEON_IOCTL)
 		fprintf(stderr, "%s\n", __FUNCTION__);
+
 	if(is_empty_list(&rmesa->dma.reserved)
 	      ||rmesa->dma.current_vertexptr + bytes > first_elem(&rmesa->dma.reserved)->bo->size) {
 		if (rmesa->dma.flush) {
@@ -455,7 +445,13 @@ rcommonAllocDmaLowVerts( radeonContextPtr rmesa, int nverts, int vsize )
                 rmesa->swtcl.numverts * rmesa->swtcl.vertex_size * 4 ==
                 rmesa->dma.current_vertexptr );
 
-	head = (first_elem(&rmesa->dma.reserved)->bo->ptr + rmesa->dma.current_vertexptr);
+	if (!rmesa->swtcl.bo) {
+		rmesa->swtcl.bo = first_elem(&rmesa->dma.reserved)->bo;
+		radeon_bo_ref(rmesa->swtcl.bo);
+		radeon_bo_map(rmesa->swtcl.bo, 1);
+	}
+
+	head = (rmesa->swtcl.bo->ptr + rmesa->dma.current_vertexptr);
 	rmesa->dma.current_vertexptr += bytes;
 	rmesa->swtcl.numverts += nverts;
 	return head;
diff --git a/src/mesa/drivers/dri/radeon/radeon_fbo.c b/src/mesa/drivers/dri/radeon/radeon_fbo.c
index bf69cd9337..7b1f84a715 100644
--- a/src/mesa/drivers/dri/radeon/radeon_fbo.c
+++ b/src/mesa/drivers/dri/radeon/radeon_fbo.c
@@ -90,7 +90,7 @@ radeon_alloc_renderbuffer_storage(GLcontext * ctx, struct gl_renderbuffer *rb,
    case GL_R3_G3_B2:
    case GL_RGB4:
    case GL_RGB5:
-      rb->Format = MESA_FORMAT_RGB565;
+      rb->Format = _dri_texformat_rgb565;
       rb->DataType = GL_UNSIGNED_BYTE;
       cpp = 2;
       break;
@@ -99,7 +99,7 @@ radeon_alloc_renderbuffer_storage(GLcontext * ctx, struct gl_renderbuffer *rb,
    case GL_RGB10:
    case GL_RGB12:
    case GL_RGB16:
-      rb->Format = MESA_FORMAT_ARGB8888;
+      rb->Format = _dri_texformat_argb8888;
       rb->DataType = GL_UNSIGNED_BYTE;
       cpp = 4;
       break;
@@ -111,7 +111,7 @@ radeon_alloc_renderbuffer_storage(GLcontext * ctx, struct gl_renderbuffer *rb,
    case GL_RGB10_A2:
    case GL_RGBA12:
    case GL_RGBA16:
-      rb->Format = MESA_FORMAT_ARGB8888;
+      rb->Format = _dri_texformat_argb8888;
       rb->DataType = GL_UNSIGNED_BYTE;
       cpp = 4;
       break;
@@ -166,8 +166,9 @@ radeon_alloc_renderbuffer_storage(GLcontext * ctx, struct gl_renderbuffer *rb,
      uint32_t size;
      uint32_t pitch = ((cpp * width + 63) & ~63) / cpp;
 
-     fprintf(stderr,"Allocating %d x %d radeon RBO (pitch %d)\n", width,
-	  height, pitch);
+     if (RADEON_DEBUG & RADEON_MEMORY)
+	     fprintf(stderr,"Allocating %d x %d radeon RBO (pitch %d)\n", width,
+		     height, pitch);
 
      size = pitch * height * cpp;
      rrb->pitch = pitch * cpp;
@@ -246,7 +247,7 @@ radeon_nop_alloc_storage(GLcontext * ctx, struct gl_renderbuffer *rb,
  * Not used for user-created renderbuffers.
  */
 struct radeon_renderbuffer *
-radeon_create_renderbuffer(gl_format format, __DRIdrawablePrivate *driDrawPriv)
+radeon_create_renderbuffer(gl_format format, __DRIdrawable *driDrawPriv)
 {
     struct radeon_renderbuffer *rrb;
 
@@ -261,14 +262,32 @@ radeon_create_renderbuffer(gl_format format, __DRIdrawablePrivate *driDrawPriv)
 
     switch (format) {
         case MESA_FORMAT_RGB565:
+	    assert(_mesa_little_endian());
+	    rrb->base.DataType = GL_UNSIGNED_BYTE;
+            rrb->base._BaseFormat = GL_RGB;
+	    break;
+        case MESA_FORMAT_RGB565_REV:
+	    assert(!_mesa_little_endian());
 	    rrb->base.DataType = GL_UNSIGNED_BYTE;
             rrb->base._BaseFormat = GL_RGB;
 	    break;
         case MESA_FORMAT_XRGB8888:
+	    assert(_mesa_little_endian());
+	    rrb->base.DataType = GL_UNSIGNED_BYTE;
+            rrb->base._BaseFormat = GL_RGB;
+	    break;
+        case MESA_FORMAT_XRGB8888_REV:
+	    assert(!_mesa_little_endian());
 	    rrb->base.DataType = GL_UNSIGNED_BYTE;
             rrb->base._BaseFormat = GL_RGB;
 	    break;
 	case MESA_FORMAT_ARGB8888:
+	    assert(_mesa_little_endian());
+	    rrb->base.DataType = GL_UNSIGNED_BYTE;
+            rrb->base._BaseFormat = GL_RGBA;
+	    break;
+	case MESA_FORMAT_ARGB8888_REV:
+	    assert(!_mesa_little_endian());
 	    rrb->base.DataType = GL_UNSIGNED_BYTE;
             rrb->base._BaseFormat = GL_RGBA;
 	    break;
@@ -351,6 +370,12 @@ radeon_framebuffer_renderbuffer(GLcontext * ctx,
 }
 
 
+/* TODO: According to the EXT_fbo spec, the internal format of a texture
+ * image, once set by a glTexImage call, should be preserved when
+ * attaching the image to a renderbuffer. When the HW doesn't support
+ * rendering to the format of the attached image, set framebuffer
+ * completeness accordingly in radeon_validate_framebuffer (issue #79).
+ */
 static GLboolean
 radeon_update_wrapper(GLcontext *ctx, struct radeon_renderbuffer *rrb, 
 		     struct gl_texture_image *texImage)
@@ -359,21 +384,21 @@ radeon_update_wrapper(GLcontext *ctx, struct radeon_renderbuffer *rrb,
 	gl_format texFormat;
 
 restart:
-	if (texImage->TexFormat == MESA_FORMAT_ARGB8888) {
+	if (texImage->TexFormat == _dri_texformat_argb8888) {
 		rrb->base.DataType = GL_UNSIGNED_BYTE;
 		DBG("Render to RGBA8 texture OK\n");
 	}
-	else if (texImage->TexFormat == MESA_FORMAT_RGB565) {
+	else if (texImage->TexFormat == _dri_texformat_rgb565) {
 		rrb->base.DataType = GL_UNSIGNED_BYTE;
 		DBG("Render to RGB5 texture OK\n");
 	}
-	else if (texImage->TexFormat == MESA_FORMAT_ARGB1555) {
+	else if (texImage->TexFormat == _dri_texformat_argb1555) {
 		rrb->base.DataType = GL_UNSIGNED_BYTE;
 		DBG("Render to ARGB1555 texture OK\n");
 	}
-	else if (texImage->TexFormat == MESA_FORMAT_ARGB4444) {
+	else if (texImage->TexFormat == _dri_texformat_argb4444) {
 		rrb->base.DataType = GL_UNSIGNED_BYTE;
-		DBG("Render to ARGB1555 texture OK\n");
+		DBG("Render to ARGB4444 texture OK\n");
 	}
 	else if (texImage->TexFormat == MESA_FORMAT_Z16) {
 		rrb->base.DataType = GL_UNSIGNED_SHORT;
diff --git a/src/mesa/drivers/dri/radeon/radeon_ioctl.c b/src/mesa/drivers/dri/radeon/radeon_ioctl.c
index a0106d00fa..a9d50c5d07 100644
--- a/src/mesa/drivers/dri/radeon/radeon_ioctl.c
+++ b/src/mesa/drivers/dri/radeon/radeon_ioctl.c
@@ -449,7 +449,7 @@ void radeonEmitAOS( r100ContextPtr rmesa,
 static void radeonKernelClear(GLcontext *ctx, GLuint flags)
 {
      r100ContextPtr rmesa = R100_CONTEXT(ctx);
-   __DRIdrawablePrivate *dPriv = radeon_get_drawable(&rmesa->radeon);
+   __DRIdrawable *dPriv = radeon_get_drawable(&rmesa->radeon);
    drm_radeon_sarea_t *sarea = rmesa->radeon.sarea;
    uint32_t clear;
    GLint ret, i;
@@ -570,11 +570,15 @@ static void radeonKernelClear(GLcontext *ctx, GLuint flags)
 static void radeonClear( GLcontext *ctx, GLbitfield mask )
 {
    r100ContextPtr rmesa = R100_CONTEXT(ctx);
-   __DRIdrawablePrivate *dPriv = radeon_get_drawable(&rmesa->radeon);
+   __DRIdrawable *dPriv = radeon_get_drawable(&rmesa->radeon);
    GLuint flags = 0;
    GLuint color_mask = 0;
    GLuint orig_mask = mask;
 
+   if (mask & (BUFFER_BIT_FRONT_LEFT | BUFFER_BIT_FRONT_RIGHT)) {
+      rmesa->radeon.front_buffer_dirty = GL_TRUE;
+   }
+
    if ( RADEON_DEBUG & RADEON_IOCTL ) {
       fprintf( stderr, "radeonClear\n");
    }
diff --git a/src/mesa/drivers/dri/radeon/radeon_lock.c b/src/mesa/drivers/dri/radeon/radeon_lock.c
index 02de8e5fd1..9dee691938 100644
--- a/src/mesa/drivers/dri/radeon/radeon_lock.c
+++ b/src/mesa/drivers/dri/radeon/radeon_lock.c
@@ -58,11 +58,9 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 void radeonGetLock(radeonContextPtr rmesa, GLuint flags)
 {
-	__DRIdrawablePrivate *const drawable = radeon_get_drawable(rmesa);
-	__DRIdrawablePrivate *const readable = radeon_get_readable(rmesa);
-	__DRIscreenPrivate *sPriv = rmesa->dri.screen;
-
-	assert(drawable != NULL);
+	__DRIdrawable *const drawable = radeon_get_drawable(rmesa);
+	__DRIdrawable *const readable = radeon_get_readable(rmesa);
+	__DRIscreen *sPriv = rmesa->dri.screen;
 
 	drmGetLock(rmesa->dri.fd, rmesa->dri.hwContext, flags);
 
@@ -74,12 +72,13 @@ void radeonGetLock(radeonContextPtr rmesa, GLuint flags)
 	 * Since the hardware state depends on having the latest drawable
 	 * clip rects, all state checking must be done _after_ this call.
 	 */
-	DRI_VALIDATE_DRAWABLE_INFO(sPriv, drawable);
-	if (drawable != readable) {
+	if (drawable)
+		DRI_VALIDATE_DRAWABLE_INFO(sPriv, drawable);
+	if (readable && drawable != readable) {
 		DRI_VALIDATE_DRAWABLE_INFO(sPriv, readable);
 	}
 
-	if (rmesa->lastStamp != drawable->lastStamp) {
+	if (drawable && (rmesa->lastStamp != drawable->lastStamp)) {
 		radeon_window_moved(rmesa);
 		rmesa->lastStamp = drawable->lastStamp;
 	}
diff --git a/src/mesa/drivers/dri/radeon/radeon_maos_arrays.c b/src/mesa/drivers/dri/radeon/radeon_maos_arrays.c
index 08e1c5d00d..d810e6080e 100644
--- a/src/mesa/drivers/dri/radeon/radeon_maos_arrays.c
+++ b/src/mesa/drivers/dri/radeon/radeon_maos_arrays.c
@@ -76,12 +76,14 @@ static void emit_vecfog(GLcontext *ctx, struct radeon_aos *aos,
 
    /* Emit the data
     */
+   radeon_bo_map(aos->bo, 1);
    out = (uint32_t*)((char*)aos->bo->ptr + aos->offset);
    for (i = 0; i < count; i++) {
       out[0] = radeonComputeFogBlendFactor( ctx, *(GLfloat *)data );
       out++;
       data += stride;
    }
+   radeon_bo_unmap(aos->bo);
 }
 
 static void emit_s0_vec(uint32_t *out, GLvoid *data, int stride, int count)
@@ -151,6 +153,7 @@ static void emit_tex_vector(GLcontext *ctx, struct radeon_aos *aos,
 
    /* Emit the data
     */
+   radeon_bo_map(aos->bo, 1);
    out = (uint32_t*)((char*)aos->bo->ptr + aos->offset);
    switch (size) {
    case 1:
@@ -170,6 +173,7 @@ static void emit_tex_vector(GLcontext *ctx, struct radeon_aos *aos,
       exit(1);
       break;
    }
+   radeon_bo_unmap(aos->bo);
 }
 
 
@@ -196,12 +200,12 @@ void radeonEmitArrays( GLcontext *ctx, GLuint inputs )
       if (!rmesa->tcl.obj.buf) 
 	rcommon_emit_vector( ctx, 
 			     &(rmesa->tcl.aos[nr]),
-			     (char *)VB->ObjPtr->data,
-			     VB->ObjPtr->size,
-			     VB->ObjPtr->stride,
+			     (char *)VB->AttribPtr[_TNL_ATTRIB_POS]->data,
+			     VB->AttribPtr[_TNL_ATTRIB_POS]->size,
+			     VB->AttribPtr[_TNL_ATTRIB_POS]->stride,
 			     count);
 
-      switch( VB->ObjPtr->size ) {
+      switch( VB->AttribPtr[_TNL_ATTRIB_POS]->size ) {
       case 4: vfmt |= RADEON_CP_VC_FRMT_W0;
       case 3: vfmt |= RADEON_CP_VC_FRMT_Z;
       case 2: vfmt |= RADEON_CP_VC_FRMT_XY;
@@ -216,9 +220,9 @@ void radeonEmitArrays( GLcontext *ctx, GLuint inputs )
       if (!rmesa->tcl.norm.buf)
 	 rcommon_emit_vector( ctx, 
 			      &(rmesa->tcl.aos[nr]),
-			      (char *)VB->NormalPtr->data,
+			      (char *)VB->AttribPtr[_TNL_ATTRIB_NORMAL]->data,
 			      3,
-			      VB->NormalPtr->stride,
+			      VB->AttribPtr[_TNL_ATTRIB_NORMAL]->stride,
 			      count);
 
       vfmt |= RADEON_CP_VC_FRMT_N0;
@@ -227,9 +231,9 @@ void radeonEmitArrays( GLcontext *ctx, GLuint inputs )
 
    if (inputs & VERT_BIT_COLOR0) {
       int emitsize;
-      if (VB->ColorPtr[0]->size == 4 &&
-	  (VB->ColorPtr[0]->stride != 0 ||
-	   VB->ColorPtr[0]->data[0][3] != 1.0)) {
+      if (VB->AttribPtr[_TNL_ATTRIB_COLOR0]->size == 4 &&
+	  (VB->AttribPtr[_TNL_ATTRIB_COLOR0]->stride != 0 ||
+	   VB->AttribPtr[_TNL_ATTRIB_COLOR0]->data[0][3] != 1.0)) {
 	 vfmt |= RADEON_CP_VC_FRMT_FPCOLOR | RADEON_CP_VC_FRMT_FPALPHA;
 	 emitsize = 4;
       }
@@ -242,9 +246,9 @@ void radeonEmitArrays( GLcontext *ctx, GLuint inputs )
       if (!rmesa->tcl.rgba.buf)
 	rcommon_emit_vector( ctx,
 			     &(rmesa->tcl.aos[nr]),
-			     (char *)VB->ColorPtr[0]->data,
+			     (char *)VB->AttribPtr[_TNL_ATTRIB_COLOR0]->data,
 			     emitsize,
-			     VB->ColorPtr[0]->stride,
+			     VB->AttribPtr[_TNL_ATTRIB_COLOR0]->stride,
 			     count);
 
       nr++;
@@ -256,9 +260,9 @@ void radeonEmitArrays( GLcontext *ctx, GLuint inputs )
 
 	rcommon_emit_vector( ctx,
 			     &(rmesa->tcl.aos[nr]),
-			     (char *)VB->SecondaryColorPtr[0]->data,
+			     (char *)VB->AttribPtr[_TNL_ATTRIB_COLOR1]->data,
 			     3,
-			     VB->SecondaryColorPtr[0]->stride,
+			     VB->AttribPtr[_TNL_ATTRIB_COLOR1]->stride,
 			     count);
       }
 
@@ -273,8 +277,8 @@ void radeonEmitArrays( GLcontext *ctx, GLuint inputs )
       if (!rmesa->tcl.fog.buf)
 	 emit_vecfog( ctx,
 		      &(rmesa->tcl.aos[nr]),
-		      (char *)VB->FogCoordPtr->data,
-		      VB->FogCoordPtr->stride,
+		      (char *)VB->AttribPtr[_TNL_ATTRIB_FOG]->data,
+		      VB->AttribPtr[_TNL_ATTRIB_FOG]->stride,
 		      count);
 
       vfmt |= RADEON_CP_VC_FRMT_FPFOG;
@@ -290,24 +294,24 @@ void radeonEmitArrays( GLcontext *ctx, GLuint inputs )
 	 if (!rmesa->tcl.tex[unit].buf)
 	    emit_tex_vector( ctx,
 			     &(rmesa->tcl.aos[nr]),
-			     (char *)VB->TexCoordPtr[unit]->data,
-			     VB->TexCoordPtr[unit]->size,
-			     VB->TexCoordPtr[unit]->stride,
+			     (char *)VB->AttribPtr[_TNL_ATTRIB_TEX0 + unit]->data,
+			     VB->AttribPtr[_TNL_ATTRIB_TEX0 + unit]->size,
+			     VB->AttribPtr[_TNL_ATTRIB_TEX0 + unit]->stride,
 			     count );
 	 nr++;
 
 	 vfmt |= RADEON_ST_BIT(unit);
          /* assume we need the 3rd coord if texgen is active for r/q OR at least
 	    3 coords are submitted. This may not be 100% correct */
-         if (VB->TexCoordPtr[unit]->size >= 3) {
+         if (VB->AttribPtr[_TNL_ATTRIB_TEX0 + unit]->size >= 3) {
 	    vtx |= RADEON_Q_BIT(unit);
 	    vfmt |= RADEON_Q_BIT(unit);
 	 }
 	 if ( (ctx->Texture.Unit[unit].TexGenEnabled & (R_BIT | Q_BIT)) )
 	    vtx |= RADEON_Q_BIT(unit);
-	 else if ((VB->TexCoordPtr[unit]->size >= 3) &&
+	 else if ((VB->AttribPtr[_TNL_ATTRIB_TEX0 + unit]->size >= 3) &&
 	          ((ctx->Texture.Unit[unit]._ReallyEnabled & (TEXTURE_CUBE_BIT)) == 0)) {
-	    GLuint swaptexmatcol = (VB->TexCoordPtr[unit]->size - 3);
+	    GLuint swaptexmatcol = (VB->AttribPtr[_TNL_ATTRIB_TEX0 + unit]->size - 3);
 	    if (((rmesa->NeedTexMatrix >> unit) & 1) &&
 		 (swaptexmatcol != ((rmesa->TexMatColSwap >> unit) & 1)))
 	       radeonUploadTexMatrix( rmesa, unit, swaptexmatcol ) ;
diff --git a/src/mesa/drivers/dri/radeon/radeon_maos_vbtmp.h b/src/mesa/drivers/dri/radeon/radeon_maos_vbtmp.h
index 515783135d..d764ccb982 100644
--- a/src/mesa/drivers/dri/radeon/radeon_maos_vbtmp.h
+++ b/src/mesa/drivers/dri/radeon/radeon_maos_vbtmp.h
@@ -56,18 +56,18 @@ static void TAG(emit)( GLcontext *ctx,
 
    radeon_print(RADEON_SWRENDER, RADEON_VERBOSE, "%s\n", __FUNCTION__);
 
-   coord = (GLuint (*)[4])VB->ObjPtr->data;
-   coord_stride = VB->ObjPtr->stride;
+   coord = (GLuint (*)[4])VB->AttribPtr[_TNL_ATTRIB_POS]->data;
+   coord_stride = VB->AttribPtr[_TNL_ATTRIB_POS]->stride;
 
    if (DO_TEX2) {
-      if (VB->TexCoordPtr[2]) {
+      if (VB->AttribPtr[_TNL_ATTRIB_TEX2]) {
 	 const GLuint t2 = GET_TEXSOURCE(2);
-	 tc2 = (GLuint (*)[4])VB->TexCoordPtr[t2]->data;
-	 tc2_stride = VB->TexCoordPtr[t2]->stride;
-	 if (DO_PTEX && VB->TexCoordPtr[t2]->size < 3) {
+	 tc2 = (GLuint (*)[4])VB->AttribPtr[_TNL_ATTRIB_TEX0 + t2]->data;
+	 tc2_stride = VB->AttribPtr[_TNL_ATTRIB_TEX0 + t2]->stride;
+	 if (DO_PTEX && VB->AttribPtr[_TNL_ATTRIB_TEX0 + t2]->size < 3) {
 	    fill_tex |= (1<<2);
 	 }
-	 else if (DO_PTEX && VB->TexCoordPtr[t2]->size < 4) {
+	 else if (DO_PTEX && VB->AttribPtr[_TNL_ATTRIB_TEX0 + t2]->size < 4) {
 	    rqcoordsnoswap |= (1<<2);
 	 }
       } else {
@@ -77,14 +77,14 @@ static void TAG(emit)( GLcontext *ctx,
    }
 
    if (DO_TEX1) {
-      if (VB->TexCoordPtr[1]) {
+      if (VB->AttribPtr[_TNL_ATTRIB_TEX1]) {
 	 const GLuint t1 = GET_TEXSOURCE(1);
-	 tc1 = (GLuint (*)[4])VB->TexCoordPtr[t1]->data;
-	 tc1_stride = VB->TexCoordPtr[t1]->stride;
-	 if (DO_PTEX && VB->TexCoordPtr[t1]->size < 3) {
+	 tc1 = (GLuint (*)[4])VB->AttribPtr[_TNL_ATTRIB_TEX0 + t1]->data;
+	 tc1_stride = VB->AttribPtr[_TNL_ATTRIB_TEX0 + t1]->stride;
+	 if (DO_PTEX && VB->AttribPtr[_TNL_ATTRIB_TEX0 + t1]->size < 3) {
 	    fill_tex |= (1<<1);
 	 }
-	 else if (DO_PTEX && VB->TexCoordPtr[t1]->size < 4) {
+	 else if (DO_PTEX && VB->AttribPtr[_TNL_ATTRIB_TEX0 + t1]->size < 4) {
 	    rqcoordsnoswap |= (1<<1);
 	 }
       } else {
@@ -94,14 +94,14 @@ static void TAG(emit)( GLcontext *ctx,
    }
 
    if (DO_TEX0) {
-      if (VB->TexCoordPtr[0]) {
+      if (VB->AttribPtr[_TNL_ATTRIB_TEX0]) {
 	 const GLuint t0 = GET_TEXSOURCE(0);
-	 tc0_stride = VB->TexCoordPtr[t0]->stride;
-	 tc0 = (GLuint (*)[4])VB->TexCoordPtr[t0]->data;
-	 if (DO_PTEX && VB->TexCoordPtr[t0]->size < 3) {
+	 tc0_stride = VB->AttribPtr[_TNL_ATTRIB_TEX0 + t0]->stride;
+	 tc0 = (GLuint (*)[4])VB->AttribPtr[_TNL_ATTRIB_TEX0 + t0]->data;
+	 if (DO_PTEX && VB->AttribPtr[_TNL_ATTRIB_TEX0 + t0]->size < 3) {
 	    fill_tex |= (1<<0);
 	 }
-	 else if (DO_PTEX && VB->TexCoordPtr[t0]->size < 4) {
+	 else if (DO_PTEX && VB->AttribPtr[_TNL_ATTRIB_TEX0 + t0]->size < 4) {
 	    rqcoordsnoswap |= (1<<0);
 	 }
       } else {
@@ -112,9 +112,9 @@ static void TAG(emit)( GLcontext *ctx,
    }
 
    if (DO_NORM) {
-      if (VB->NormalPtr) {
-	 norm_stride = VB->NormalPtr->stride;
-	 norm = (GLuint (*)[4])VB->NormalPtr->data;
+      if (VB->AttribPtr[_TNL_ATTRIB_NORMAL]) {
+	 norm_stride = VB->AttribPtr[_TNL_ATTRIB_NORMAL]->stride;
+	 norm = (GLuint (*)[4])VB->AttribPtr[_TNL_ATTRIB_NORMAL]->data;
       } else {
 	 norm_stride = 0;
 	 norm = (GLuint (*)[4])&ctx->Current.Attrib[VERT_ATTRIB_NORMAL];
@@ -122,9 +122,9 @@ static void TAG(emit)( GLcontext *ctx,
    }
 
    if (DO_RGBA) {
-      if (VB->ColorPtr[0]) {
-	 col = VB->ColorPtr[0]->data;
-	 col_stride = VB->ColorPtr[0]->stride;
+      if (VB->AttribPtr[_TNL_ATTRIB_COLOR0]) {
+	 col = VB->AttribPtr[_TNL_ATTRIB_COLOR0]->data;
+	 col_stride = VB->AttribPtr[_TNL_ATTRIB_COLOR0]->stride;
       } else {
 	 col = (GLfloat (*)[4])ctx->Current.Attrib[VERT_ATTRIB_COLOR0];
 	 col_stride = 0;
@@ -132,9 +132,9 @@ static void TAG(emit)( GLcontext *ctx,
    }
 
    if (DO_SPEC_OR_FOG) {
-      if (VB->SecondaryColorPtr[0]) {
-	 spec = VB->SecondaryColorPtr[0]->data;
-	 spec_stride = VB->SecondaryColorPtr[0]->stride;
+      if (VB->AttribPtr[_TNL_ATTRIB_COLOR1]) {
+	 spec = VB->AttribPtr[_TNL_ATTRIB_COLOR1]->data;
+	 spec_stride = VB->AttribPtr[_TNL_ATTRIB_COLOR1]->stride;
       } else {
 	 spec = (GLfloat (*)[4])ctx->Current.Attrib[VERT_ATTRIB_COLOR1];
 	 spec_stride = 0;
@@ -142,9 +142,9 @@ static void TAG(emit)( GLcontext *ctx,
    }
 
    if (DO_SPEC_OR_FOG) {
-      if (VB->FogCoordPtr) {
-	 fog = VB->FogCoordPtr->data;
-	 fog_stride = VB->FogCoordPtr->stride;
+      if (VB->AttribPtr[_TNL_ATTRIB_FOG]) {
+	 fog = VB->AttribPtr[_TNL_ATTRIB_FOG]->data;
+	 fog_stride = VB->AttribPtr[_TNL_ATTRIB_FOG]->stride;
       } else {
 	 fog = (GLfloat (*)[4])ctx->Current.Attrib[VERT_ATTRIB_FOG];
 	 fog_stride = 0;
diff --git a/src/mesa/drivers/dri/radeon/radeon_maos_verts.c b/src/mesa/drivers/dri/radeon/radeon_maos_verts.c
index 78ec119302..98f96ff2a7 100644
--- a/src/mesa/drivers/dri/radeon/radeon_maos_verts.c
+++ b/src/mesa/drivers/dri/radeon/radeon_maos_verts.c
@@ -326,7 +326,7 @@ void radeonEmitArrays( GLcontext *ctx, GLuint inputs )
 
    if (1) {
       req |= RADEON_CP_VC_FRMT_Z;
-      if (VB->ObjPtr->size == 4) {
+      if (VB->AttribPtr[_TNL_ATTRIB_POS]->size == 4) {
 	 req |= RADEON_CP_VC_FRMT_W0;
       }
    }
@@ -348,15 +348,15 @@ void radeonEmitArrays( GLcontext *ctx, GLuint inputs )
 	 req |= RADEON_ST_BIT(unit);
 	 /* assume we need the 3rd coord if texgen is active for r/q OR at least
 	    3 coords are submitted. This may not be 100% correct */
-	 if (VB->TexCoordPtr[unit]->size >= 3) {
+	 if (VB->AttribPtr[_TNL_ATTRIB_TEX0 + unit]->size >= 3) {
 	    req |= RADEON_Q_BIT(unit);
 	    vtx |= RADEON_Q_BIT(unit);
 	 }
 	 if ( (ctx->Texture.Unit[unit].TexGenEnabled & (R_BIT | Q_BIT)) )
 	    vtx |= RADEON_Q_BIT(unit);
-	 else if ((VB->TexCoordPtr[unit]->size >= 3) &&
+	 else if ((VB->AttribPtr[_TNL_ATTRIB_TEX0 + unit]->size >= 3) &&
 	          ((ctx->Texture.Unit[unit]._ReallyEnabled & (TEXTURE_CUBE_BIT)) == 0)) {
-	    GLuint swaptexmatcol = (VB->TexCoordPtr[unit]->size - 3);
+	    GLuint swaptexmatcol = (VB->AttribPtr[_TNL_ATTRIB_TEX0 + unit]->size - 3);
 	    if (((rmesa->NeedTexMatrix >> unit) & 1) &&
 		 (swaptexmatcol != ((rmesa->TexMatColSwap >> unit) & 1)))
 	       radeonUploadTexMatrix( rmesa, unit, swaptexmatcol ) ;
@@ -390,19 +390,19 @@ void radeonEmitArrays( GLcontext *ctx, GLuint inputs )
     * this, add more vertex code (for obj-2, obj-3) or preferably move
     * to maos.  
     */
-   if (VB->ObjPtr->size < 3 || 
-       (VB->ObjPtr->size == 3 && 
+   if (VB->AttribPtr[_TNL_ATTRIB_POS]->size < 3 ||
+       (VB->AttribPtr[_TNL_ATTRIB_POS]->size == 3 &&
 	(setup_tab[i].vertex_format & RADEON_CP_VC_FRMT_W0))) {
 
       _math_trans_4f( rmesa->tcl.ObjClean.data,
-		      VB->ObjPtr->data,
-		      VB->ObjPtr->stride,
+		      VB->AttribPtr[_TNL_ATTRIB_POS]->data,
+		      VB->AttribPtr[_TNL_ATTRIB_POS]->stride,
 		      GL_FLOAT,
-		      VB->ObjPtr->size,
+		      VB->AttribPtr[_TNL_ATTRIB_POS]->size,
 		      0,
 		      VB->Count );
 
-      switch (VB->ObjPtr->size) {
+      switch (VB->AttribPtr[_TNL_ATTRIB_POS]->size) {
       case 1:
 	    _mesa_vector4f_clean_elem(&rmesa->tcl.ObjClean, VB->Count, 1);
       case 2:
@@ -416,14 +416,14 @@ void radeonEmitArrays( GLcontext *ctx, GLuint inputs )
 	 break;
       }
 
-      VB->ObjPtr = &rmesa->tcl.ObjClean;
+      VB->AttribPtr[_TNL_ATTRIB_POS] = &rmesa->tcl.ObjClean;
    }
 
 
-
+   radeon_bo_map(rmesa->radeon.tcl.aos[0].bo, 1);
    setup_tab[i].emit( ctx, 0, VB->Count, 
 		      rmesa->radeon.tcl.aos[0].bo->ptr + rmesa->radeon.tcl.aos[0].offset);
-
+   radeon_bo_unmap(rmesa->radeon.tcl.aos[0].bo);
    //   rmesa->radeon.tcl.aos[0].size = setup_tab[i].vertex_size;
    rmesa->radeon.tcl.aos[0].stride = setup_tab[i].vertex_size;
    rmesa->tcl.vertex_format = setup_tab[i].vertex_format;
diff --git a/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.c b/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.c
index dadc72f4c1..033f26db2a 100644
--- a/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.c
+++ b/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.c
@@ -1,4 +1,5 @@
 /*
+ * Copyright (C) 2009 Maciej Cencora.
  * Copyright (C) 2008 Nicolai Haehnle.
  *
  * All Rights Reserved.
@@ -32,50 +33,52 @@
 
 #include "main/simple_list.h"
 #include "main/texcompress.h"
-
-static GLuint radeon_compressed_texture_size(GLcontext *ctx,
-		GLsizei width, GLsizei height, GLsizei depth,
-		GLuint mesaFormat)
+#include "main/teximage.h"
+#include "main/texobj.h"
+#include "radeon_texture.h"
+
+static unsigned get_aligned_compressed_row_stride(
+		gl_format format,
+		unsigned width,
+		unsigned minStride)
 {
-	GLuint size = _mesa_format_image_size(mesaFormat, width, height, depth);
-
-	if (mesaFormat == MESA_FORMAT_RGB_DXT1 ||
-	    mesaFormat == MESA_FORMAT_RGBA_DXT1) {
-		if (width + 3 < 8)	/* width one block */
-			size = size * 4;
-		else if (width + 3 < 16)
-			size = size * 2;
-	} else {
-		/* DXT3/5, 16 bytes per block */
-	  //		WARN_ONCE("DXT 3/5 suffers from multitexturing problems!\n");
-		if (width + 3 < 8)
-			size = size * 2;
+	const unsigned blockSize = _mesa_get_format_bytes(format);
+	unsigned blockWidth, blockHeight, numXBlocks;
+
+	_mesa_get_format_block_size(format, &blockWidth, &blockHeight);
+	numXBlocks = (width + blockWidth - 1) / blockWidth;
+
+	while (numXBlocks * blockSize < minStride)
+	{
+		++numXBlocks;
 	}
 
-	return size;
+	return numXBlocks * blockSize;
 }
 
+static unsigned get_compressed_image_size(
+		gl_format format,
+		unsigned rowStride,
+		unsigned height)
+{
+	unsigned blockWidth, blockHeight;
+
+	_mesa_get_format_block_size(format, &blockWidth, &blockHeight);
+
+	return rowStride * ((height + blockHeight - 1) / blockHeight);
+}
 
-static int radeon_compressed_num_bytes(GLuint mesaFormat)
+static int find_next_power_of_two(GLuint value)
 {
-   int bytes = 0;
-   switch(mesaFormat) {
-     
-   case MESA_FORMAT_RGB_FXT1:
-   case MESA_FORMAT_RGBA_FXT1:
-   case MESA_FORMAT_RGB_DXT1:
-   case MESA_FORMAT_RGBA_DXT1:
-     bytes = 2;
-     break;
-     
-   case MESA_FORMAT_RGBA_DXT3:
-   case MESA_FORMAT_RGBA_DXT5:
-     bytes = 4;
-   default:
-     break;
-   }
-   
-   return bytes;
+	GLuint pot = 1;
+
+	/* Double an unsigned counter so value == 0 or value > 2^31 cannot
+	 * loop forever; 0 and 1 both give 1, results saturate at 2^31. */
+	while (pot < value && pot < 0x80000000u) {
+		pot <<= 1;
+	}
+
+	return pot;
 }
 
 /**
@@ -90,28 +93,28 @@ static void compute_tex_image_offset(radeonContextPtr rmesa, radeon_mipmap_tree
 {
 	radeon_mipmap_level *lvl = &mt->levels[level];
 	uint32_t row_align;
+	GLuint height;
+
+	height = find_next_power_of_two(lvl->height);
 
 	/* Find image size in bytes */
-	if (mt->compressed) {
-		/* TODO: Is this correct? Need test cases for compressed textures! */
-		row_align = rmesa->texture_compressed_row_align - 1;
-		lvl->rowstride = (lvl->width * mt->bpp + row_align) & ~row_align;
-		lvl->size = radeon_compressed_texture_size(mt->radeon->glCtx,
-							   lvl->width, lvl->height, lvl->depth, mt->compressed);
+	if (_mesa_is_format_compressed(mt->mesaFormat)) {
+		lvl->rowstride = get_aligned_compressed_row_stride(mt->mesaFormat, lvl->width, rmesa->texture_compressed_row_align);
+		lvl->size = get_compressed_image_size(mt->mesaFormat, lvl->rowstride, height);
 	} else if (mt->target == GL_TEXTURE_RECTANGLE_NV) {
 		row_align = rmesa->texture_rect_row_align - 1;
-		lvl->rowstride = (lvl->width * mt->bpp + row_align) & ~row_align;
-		lvl->size = lvl->rowstride * lvl->height;
+		lvl->rowstride = (_mesa_format_row_stride(mt->mesaFormat, lvl->width) + row_align) & ~row_align;
+		lvl->size = lvl->rowstride * height;
 	} else if (mt->tilebits & RADEON_TXO_MICRO_TILE) {
 		/* tile pattern is 16 bytes x2. mipmaps stay 32 byte aligned,
 		 * though the actual offset may be different (if texture is less than
 		 * 32 bytes width) to the untiled case */
-		lvl->rowstride = (lvl->width * mt->bpp * 2 + 31) & ~31;
-		lvl->size = lvl->rowstride * ((lvl->height + 1) / 2) * lvl->depth;
+		lvl->rowstride = (_mesa_format_row_stride(mt->mesaFormat, lvl->width) * 2 + 31) & ~31;
+		lvl->size = lvl->rowstride * ((height + 1) / 2) * lvl->depth;
 	} else {
 		row_align = rmesa->texture_row_align - 1;
-		lvl->rowstride = (lvl->width * mt->bpp + row_align) & ~row_align;
-		lvl->size = lvl->rowstride * lvl->height * lvl->depth;
+		lvl->rowstride = (_mesa_format_row_stride(mt->mesaFormat, lvl->width) + row_align) & ~row_align;
+		lvl->size = lvl->rowstride * height * lvl->depth;
 	}
 	assert(lvl->size > 0);
 
@@ -123,7 +126,7 @@ static void compute_tex_image_offset(radeonContextPtr rmesa, radeon_mipmap_tree
 	if (RADEON_DEBUG & RADEON_TEXTURE)
 	  fprintf(stderr,
 		  "level %d, face %d: rs:%d %dx%d at %d\n",
-		  level, face, lvl->rowstride, lvl->width, lvl->height, lvl->faces[face].offset);
+		  level, face, lvl->rowstride, lvl->width, height, lvl->faces[face].offset);
 }
 
 static GLuint minify(GLuint size, GLuint levels)
@@ -137,22 +140,19 @@ static GLuint minify(GLuint size, GLuint levels)
 
 static void calculate_miptree_layout_r100(radeonContextPtr rmesa, radeon_mipmap_tree *mt)
 {
-	GLuint curOffset;
-	GLuint numLevels;
-	GLuint i;
-	GLuint face;
+	GLuint curOffset, i, face, level;
 
-	numLevels = mt->lastLevel - mt->firstLevel + 1;
-	assert(numLevels <= rmesa->glCtx->Const.MaxTextureLevels);
+	assert(mt->numLevels <= rmesa->glCtx->Const.MaxTextureLevels);
 
 	curOffset = 0;
 	for(face = 0; face < mt->faces; face++) {
 
-		for(i = 0; i < numLevels; i++) {
-			mt->levels[i].width = minify(mt->width0, i);
-			mt->levels[i].height = minify(mt->height0, i);
-			mt->levels[i].depth = minify(mt->depth0, i);
-			compute_tex_image_offset(rmesa, mt, face, i, &curOffset);
+		for(i = 0, level = mt->baseLevel; i < mt->numLevels; i++, level++) {
+			mt->levels[level].valid = 1;
+			mt->levels[level].width = minify(mt->width0, i);
+			mt->levels[level].height = minify(mt->height0, i);
+			mt->levels[level].depth = minify(mt->depth0, i);
+			compute_tex_image_offset(rmesa, mt, face, level, &curOffset);
 		}
 	}
 
@@ -162,23 +162,21 @@ static void calculate_miptree_layout_r100(radeonContextPtr rmesa, radeon_mipmap_
 
 static void calculate_miptree_layout_r300(radeonContextPtr rmesa, radeon_mipmap_tree *mt)
 {
-	GLuint curOffset;
-	GLuint numLevels;
-	GLuint i;
+	GLuint curOffset, i, level;
 
-	numLevels = mt->lastLevel - mt->firstLevel + 1;
-	assert(numLevels <= rmesa->glCtx->Const.MaxTextureLevels);
+	assert(mt->numLevels <= rmesa->glCtx->Const.MaxTextureLevels);
 
 	curOffset = 0;
-	for(i = 0; i < numLevels; i++) {
+	for(i = 0, level = mt->baseLevel; i < mt->numLevels; i++, level++) {
 		GLuint face;
 
-		mt->levels[i].width = minify(mt->width0, i);
-		mt->levels[i].height = minify(mt->height0, i);
-		mt->levels[i].depth = minify(mt->depth0, i);
+		mt->levels[level].valid = 1;
+		mt->levels[level].width = minify(mt->width0, i);
+		mt->levels[level].height = minify(mt->height0, i);
+		mt->levels[level].depth = minify(mt->depth0, i);
 
 		for(face = 0; face < mt->faces; face++)
-			compute_tex_image_offset(rmesa, mt, face, i, &curOffset);
+			compute_tex_image_offset(rmesa, mt, face, level, &curOffset);
 	}
 
 	/* Note the required size in memory */
@@ -188,27 +186,22 @@ static void calculate_miptree_layout_r300(radeonContextPtr rmesa, radeon_mipmap_
 /**
  * Create a new mipmap tree, calculate its layout and allocate memory.
  */
-radeon_mipmap_tree* radeon_miptree_create(radeonContextPtr rmesa, radeonTexObj *t,
-		GLenum target, GLenum internal_format, GLuint firstLevel, GLuint lastLevel,
-		GLuint width0, GLuint height0, GLuint depth0,
-		GLuint bpp, GLuint tilebits, GLuint compressed)
+static radeon_mipmap_tree* radeon_miptree_create(radeonContextPtr rmesa,
+		GLenum target, gl_format mesaFormat, GLuint baseLevel, GLuint numLevels,
+		GLuint width0, GLuint height0, GLuint depth0, GLuint tilebits)
 {
 	radeon_mipmap_tree *mt = CALLOC_STRUCT(_radeon_mipmap_tree);
 
-	mt->radeon = rmesa;
-	mt->internal_format = internal_format;
+	mt->mesaFormat = mesaFormat;
 	mt->refcount = 1;
-	mt->t = t;
 	mt->target = target;
 	mt->faces = (target == GL_TEXTURE_CUBE_MAP) ? 6 : 1;
-	mt->firstLevel = firstLevel;
-	mt->lastLevel = lastLevel;
+	mt->baseLevel = baseLevel;
+	mt->numLevels = numLevels;
 	mt->width0 = width0;
 	mt->height0 = height0;
 	mt->depth0 = depth0;
-	mt->bpp = compressed ? radeon_compressed_num_bytes(compressed) : bpp;
 	mt->tilebits = tilebits;
-	mt->compressed = compressed;
 
 	if (rmesa->radeonScreen->chip_family >= CHIP_FAMILY_R300)
 		calculate_miptree_layout_r300(rmesa, mt);
@@ -223,53 +216,43 @@ radeon_mipmap_tree* radeon_miptree_create(radeonContextPtr rmesa, radeonTexObj *
 	return mt;
 }
 
-void radeon_miptree_reference(radeon_mipmap_tree *mt)
+void radeon_miptree_reference(radeon_mipmap_tree *mt, radeon_mipmap_tree **ptr)
 {
+	assert(!*ptr);
+
 	mt->refcount++;
 	assert(mt->refcount > 0);
+
+	*ptr = mt;
 }
 
-void radeon_miptree_unreference(radeon_mipmap_tree *mt)
+void radeon_miptree_unreference(radeon_mipmap_tree **ptr)
 {
+	radeon_mipmap_tree *mt = *ptr;
 	if (!mt)
 		return;
 
 	assert(mt->refcount > 0);
+
 	mt->refcount--;
 	if (!mt->refcount) {
 		radeon_bo_unref(mt->bo);
 		free(mt);
 	}
-}
 
+	*ptr = 0;
+}
 
 /**
- * Calculate first and last mip levels for the given texture object,
- * where the dimensions are taken from the given texture image at
- * the given level.
- *
- * Note: level is the OpenGL level number, which is not necessarily the same
- * as the first level that is actually present.
- *
- * The base level image of the given texture face must be non-null,
- * or this will fail.
+ * Calculate min and max LOD for the given texture object.
+ * @param[in] tObj texture object whose LOD values to calculate
+ * @param[out] pminLod minimal LOD (not written for unhandled targets)
+ * @param[out] pmaxLod maximal LOD (not written for unhandled targets)
  */
-static void calculate_first_last_level(struct gl_texture_object *tObj,
-				       GLuint *pfirstLevel, GLuint *plastLevel,
-				       GLuint face, GLuint level)
+static void calculate_min_max_lod(struct gl_texture_object *tObj,
+				       unsigned *pminLod, unsigned *pmaxLod)
 {
-	const struct gl_texture_image * const baseImage =
-		tObj->Image[face][level];
-
-	assert(baseImage);
-	
-	/* These must be signed values.  MinLod and MaxLod can be negative numbers,
-	* and having firstLevel and lastLevel as signed prevents the need for
-	* extra sign checks.
-	*/
-	int   firstLevel;
-	int   lastLevel;
-
+	int minLod, maxLod;
 	/* Yes, this looks overly complicated, but it's all needed.
 	*/
 	switch (tObj->Target) {
@@ -280,32 +263,30 @@ static void calculate_first_last_level(struct gl_texture_object *tObj,
 		if (tObj->MinFilter == GL_NEAREST || tObj->MinFilter == GL_LINEAR) {
 			/* GL_NEAREST and GL_LINEAR only care about GL_TEXTURE_BASE_LEVEL.
 			*/
-			firstLevel = lastLevel = tObj->BaseLevel;
+			minLod = maxLod = tObj->BaseLevel;
 		} else {
-			firstLevel = tObj->BaseLevel + (GLint)(tObj->MinLod + 0.5);
-			firstLevel = MAX2(firstLevel, tObj->BaseLevel);
-			firstLevel = MIN2(firstLevel, level + baseImage->MaxLog2);
-			lastLevel = tObj->BaseLevel + (GLint)(tObj->MaxLod + 0.5);
-			lastLevel = MAX2(lastLevel, tObj->BaseLevel);
-			lastLevel = MIN2(lastLevel, level + baseImage->MaxLog2);
-			lastLevel = MIN2(lastLevel, tObj->MaxLevel);
-			lastLevel = MAX2(firstLevel, lastLevel); /* need at least one level */
+			minLod = tObj->BaseLevel + (GLint)(tObj->MinLod);
+			minLod = MAX2(minLod, tObj->BaseLevel);
+			minLod = MIN2(minLod, tObj->MaxLevel);
+			maxLod = tObj->BaseLevel + (GLint)(tObj->MaxLod + 0.5);
+			maxLod = MIN2(maxLod, tObj->MaxLevel);
+			maxLod = MIN2(maxLod, tObj->Image[0][minLod]->MaxLog2 + minLod);
+			maxLod = MAX2(maxLod, minLod); /* need at least one level */
 		}
 		break;
 	case GL_TEXTURE_RECTANGLE_NV:
 	case GL_TEXTURE_4D_SGIS:
-		firstLevel = lastLevel = 0;
+		minLod = maxLod = 0;
 		break;
 	default:
 		return;
 	}
 
 	/* save these values */
-	*pfirstLevel = firstLevel;
-	*plastLevel = lastLevel;
+	*pminLod = minLod;
+	*pmaxLod = maxLod;
 }
 
-
 /**
  * Checks whether the given miptree can hold the given texture image at the
  * given face and level.
@@ -313,23 +294,17 @@ static void calculate_first_last_level(struct gl_texture_object *tObj,
 GLboolean radeon_miptree_matches_image(radeon_mipmap_tree *mt,
 		struct gl_texture_image *texImage, GLuint face, GLuint level)
 {
-	GLboolean isCompressed = _mesa_is_format_compressed(texImage->TexFormat);
 	radeon_mipmap_level *lvl;
 
-	if (face >= mt->faces || level < mt->firstLevel || level > mt->lastLevel)
+	if (face >= mt->faces)
 		return GL_FALSE;
 
-	if (texImage->InternalFormat != mt->internal_format ||
-	    isCompressed != mt->compressed)
+	if (texImage->TexFormat != mt->mesaFormat)
 		return GL_FALSE;
 
-	if (!isCompressed &&
-	    !mt->compressed &&
-	    _mesa_get_format_bytes(texImage->TexFormat) != mt->bpp)
-		return GL_FALSE;
-
-	lvl = &mt->levels[level - mt->firstLevel];
-	if (lvl->width != texImage->Width ||
+	lvl = &mt->levels[level];
+	if (!lvl->valid ||
+	    lvl->width != texImage->Width ||
 	    lvl->height != texImage->Height ||
 	    lvl->depth != texImage->Depth)
 		return GL_FALSE;
@@ -337,64 +312,72 @@ GLboolean radeon_miptree_matches_image(radeon_mipmap_tree *mt,
 	return GL_TRUE;
 }
 
-
 /**
  * Checks whether the given miptree has the right format to store the given texture object.
  */
-GLboolean radeon_miptree_matches_texture(radeon_mipmap_tree *mt, struct gl_texture_object *texObj)
+static GLboolean radeon_miptree_matches_texture(radeon_mipmap_tree *mt, struct gl_texture_object *texObj)
 {
 	struct gl_texture_image *firstImage;
-	GLuint compressed;
-	GLuint numfaces = 1;
-	GLuint firstLevel, lastLevel;
-	GLuint texelBytes;
-
-	calculate_first_last_level(texObj, &firstLevel, &lastLevel, 0, texObj->BaseLevel);
-	if (texObj->Target == GL_TEXTURE_CUBE_MAP)
-		numfaces = 6;
-
-	firstImage = texObj->Image[0][firstLevel];
-	compressed = _mesa_is_format_compressed(firstImage->TexFormat) ? firstImage->TexFormat : 0;
-	texelBytes = _mesa_get_format_bytes(firstImage->TexFormat);
-
-	return (mt->firstLevel == firstLevel &&
-	        mt->lastLevel == lastLevel &&
-	        mt->width0 == firstImage->Width &&
-	        mt->height0 == firstImage->Height &&
-	        mt->depth0 == firstImage->Depth &&
-	        mt->compressed == compressed &&
-	        (!mt->compressed ? (mt->bpp == texelBytes) : 1));
-}
+	unsigned numLevels;
+	radeon_mipmap_level *mtBaseLevel;
+
+	if (texObj->BaseLevel < mt->baseLevel)
+		return GL_FALSE;
 
+	mtBaseLevel = &mt->levels[texObj->BaseLevel - mt->baseLevel];
+	firstImage = texObj->Image[0][texObj->BaseLevel];
+	numLevels = MIN2(texObj->MaxLevel - texObj->BaseLevel + 1, firstImage->MaxLog2 + 1);
+
+	if (RADEON_DEBUG & RADEON_TEXTURE) {
+		fprintf(stderr, "Checking if miptree %p matches texObj %p\n", mt, texObj);
+		fprintf(stderr, "target %d vs %d\n", mt->target, texObj->Target);
+		fprintf(stderr, "format %d vs %d\n", mt->mesaFormat, firstImage->TexFormat);
+		fprintf(stderr, "numLevels %d vs %d\n", mt->numLevels, numLevels);
+		fprintf(stderr, "width0 %d vs %d\n", mtBaseLevel->width, firstImage->Width);
+		fprintf(stderr, "height0 %d vs %d\n", mtBaseLevel->height, firstImage->Height);
+		fprintf(stderr, "depth0 %d vs %d\n", mtBaseLevel->depth, firstImage->Depth);
+		if (mt->target == texObj->Target &&
+	        mt->mesaFormat == firstImage->TexFormat &&
+	        mt->numLevels >= numLevels &&
+	        mtBaseLevel->width == firstImage->Width &&
+	        mtBaseLevel->height == firstImage->Height &&
+	        mtBaseLevel->depth == firstImage->Depth) {
+			fprintf(stderr, "MATCHED\n");
+		} else {
+			fprintf(stderr, "NOT MATCHED\n");
+		}
+	}
+
+	return (mt->target == texObj->Target &&
+	        mt->mesaFormat == firstImage->TexFormat &&
+	        mt->numLevels >= numLevels &&
+	        mtBaseLevel->width == firstImage->Width &&
+	        mtBaseLevel->height == firstImage->Height &&
+	        mtBaseLevel->depth == firstImage->Depth);
+}
 
 /**
- * Try to allocate a mipmap tree for the given texture that will fit the
- * given image in the given position.
+ * Try to allocate a mipmap tree for the given texture object.
+ * @param[in] rmesa radeon context
+ * @param[in] t radeon texture object
  */
-void radeon_try_alloc_miptree(radeonContextPtr rmesa, radeonTexObj *t,
-		radeon_texture_image *image, GLuint face, GLuint level)
+void radeon_try_alloc_miptree(radeonContextPtr rmesa, radeonTexObj *t)
 {
-	GLuint compressed = _mesa_is_format_compressed(image->base.TexFormat) ? image->base.TexFormat : 0;
-	GLuint numfaces = 1;
-	GLuint firstLevel, lastLevel;
-	GLuint texelBytes;
+	struct gl_texture_object *texObj = &t->base;
+	struct gl_texture_image *texImg = texObj->Image[0][texObj->BaseLevel];
+	GLuint numLevels;
 
 	assert(!t->mt);
 
-	calculate_first_last_level(&t->base, &firstLevel, &lastLevel, face, level);
-	if (t->base.Target == GL_TEXTURE_CUBE_MAP)
-		numfaces = 6;
-
-	if (level != firstLevel || face >= numfaces)
+	if (!texImg)
 		return;
 
-	texelBytes = _mesa_get_format_bytes(image->base.TexFormat);
+	numLevels = MIN2(texObj->MaxLevel - texObj->BaseLevel + 1, texImg->MaxLog2 + 1);
 
-	t->mt = radeon_miptree_create(rmesa, t, t->base.Target,
-		image->base.InternalFormat,
-		firstLevel, lastLevel,
-		image->base.Width, image->base.Height, image->base.Depth,
-		texelBytes, t->tile_bits, compressed);
+	t->mt = radeon_miptree_create(rmesa, t->base.Target,
+		texImg->TexFormat, texObj->BaseLevel,
+		numLevels, texImg->Width, texImg->Height,
+		texImg->Depth, t->tile_bits);
 }
 
 /* Although we use the image_offset[] array to store relative offsets
@@ -406,21 +389,234 @@ void radeon_try_alloc_miptree(radeonContextPtr rmesa, radeonTexObj *t,
 void
 radeon_miptree_depth_offsets(radeon_mipmap_tree *mt, GLuint level, GLuint *offsets)
 {
-     if (mt->target != GL_TEXTURE_3D || mt->faces == 1)
-        offsets[0] = 0;
-     else {
-	int i;
-	for (i = 0; i < 6; i++)
-		offsets[i] = mt->levels[level].faces[i].offset;
-     }
+	if (mt->target != GL_TEXTURE_3D || mt->faces == 1) {
+		offsets[0] = 0;
+	} else {
+		int i;
+		for (i = 0; i < 6; i++) {
+			offsets[i] = mt->levels[level].faces[i].offset;
+		}
+	}
 }
 
 GLuint
 radeon_miptree_image_offset(radeon_mipmap_tree *mt,
 			    GLuint face, GLuint level)
 {
-   if (mt->target == GL_TEXTURE_CUBE_MAP_ARB)
-      return (mt->levels[level].faces[face].offset);
-   else
-      return mt->levels[level].faces[0].offset;
+	if (mt->target == GL_TEXTURE_CUBE_MAP_ARB)
+		return (mt->levels[level].faces[face].offset);
+	else
+		return mt->levels[level].faces[0].offset;
 }
+
+/**
+ * Ensure that the given image is stored in the given miptree from now on.
+ */
+static void migrate_image_to_miptree(radeon_mipmap_tree *mt,
+									 radeon_texture_image *image,
+									 int face, int level)
+{
+	radeon_mipmap_level *dstlvl = &mt->levels[level];
+	unsigned char *dest;
+
+	assert(image->mt != mt);
+	assert(dstlvl->valid);
+	assert(dstlvl->width == image->base.Width);
+	assert(dstlvl->height == image->base.Height);
+	assert(dstlvl->depth == image->base.Depth);
+
+	radeon_bo_map(mt->bo, GL_TRUE);
+	dest = mt->bo->ptr + dstlvl->faces[face].offset;
+
+	if (image->mt) {
+		/* Format etc. should match, so we really just need a memcpy().
+		 * In fact, that memcpy() could be done by the hardware in many
+		 * cases, provided that we have a proper memory manager.
+		 */
+		assert(mt->mesaFormat == image->base.TexFormat);
+
+		radeon_mipmap_level *srclvl = &image->mt->levels[image->mtlevel];
+
+		/* TODO: bring back these assertions once the FBOs are fixed */
+#if 0
+		assert(image->mtlevel == level);
+		assert(srclvl->size == dstlvl->size);
+		assert(srclvl->rowstride == dstlvl->rowstride);
+#endif
+
+		radeon_bo_map(image->mt->bo, GL_FALSE);
+
+		memcpy(dest,
+			image->mt->bo->ptr + srclvl->faces[face].offset,
+			dstlvl->size);
+		radeon_bo_unmap(image->mt->bo);
+
+		radeon_miptree_unreference(&image->mt);
+	} else if (image->base.Data) {
+		/* This condition should be removed, it's here to workaround
+		 * a segfault when mapping textures during software fallbacks.
+		 */
+		const uint32_t srcrowstride = _mesa_format_row_stride(image->base.TexFormat, image->base.Width);
+		uint32_t rows = image->base.Height * image->base.Depth;
+
+		if (_mesa_is_format_compressed(image->base.TexFormat)) {
+			uint32_t blockWidth, blockHeight;
+			_mesa_get_format_block_size(image->base.TexFormat, &blockWidth, &blockHeight);
+			rows = (rows + blockHeight - 1) / blockHeight;
+		}
+
+		copy_rows(dest, dstlvl->rowstride, image->base.Data, srcrowstride,
+				  rows, srcrowstride);
+
+		_mesa_free_texmemory(image->base.Data);
+		image->base.Data = 0;
+	}
+
+	radeon_bo_unmap(mt->bo);
+
+	radeon_miptree_reference(mt, &image->mt);
+	image->mtface = face;
+	image->mtlevel = level;
+}
+
+/**
+ * Filter matching miptrees and select the one that holds the most data.
+ * @param[in] texObj radeon texture object
+ * @param[in] firstLevel first texture level to check
+ * @param[in] lastLevel last texture level to check
+ */
+static radeon_mipmap_tree * get_biggest_matching_miptree(radeonTexObj *texObj,
+		unsigned firstLevel,
+		unsigned lastLevel)
+{
+	const unsigned numLevels = lastLevel - firstLevel + 1;
+	unsigned *mtSizes = calloc(numLevels, sizeof(unsigned));
+	radeon_mipmap_tree **mts = calloc(numLevels, sizeof(radeon_mipmap_tree *));
+	unsigned mtCount = 0;
+	unsigned maxMtIndex = 0;
+	radeon_mipmap_tree *tmp = NULL;
+
+	/* Without the scratch arrays nothing can be tallied: report "no match". */
+	if (!mtSizes || !mts)
+		goto out;
+
+	for (unsigned level = firstLevel; level <= lastLevel; ++level) {
+		radeon_texture_image *img = get_radeon_texture_image(texObj->base.Image[0][level]);
+		unsigned found = 0;
+		// TODO: why this hack??
+		if (!img)
+			break;
+		if (!img->mt)
+			continue;
+
+		/* Already-accepted miptree: just add this level's bytes to its tally. */
+		for (unsigned i = 0; i < mtCount; ++i) {
+			if (mts[i] == img->mt) {
+				found = 1;
+				mtSizes[i] += img->mt->levels[img->mtlevel].size;
+				break;
+			}
+		}
+
+		if (!found && radeon_miptree_matches_texture(img->mt, &texObj->base)) {
+			mtSizes[mtCount] = img->mt->levels[img->mtlevel].size;
+			mts[mtCount] = img->mt;
+			mtCount++;
+		}
+	}
+
+	for (unsigned i = 1; i < mtCount; ++i) {
+		if (mtSizes[i] > mtSizes[maxMtIndex])
+			maxMtIndex = i;
+	}
+	if (mtCount > 0)
+		tmp = mts[maxMtIndex];
+out:
+	/* Single exit: the scratch arrays are released on every path,
+	 * including the "no candidate" case that used to return early. */
+	free(mtSizes);
+	free(mts);
+	return tmp;
+}
+
+/**
+ * Validate texture mipmap tree.
+ * If individual images are stored in different mipmap trees
+ * use the mipmap tree that already holds most of the correct data.
+ */
+int radeon_validate_texture_miptree(GLcontext * ctx, struct gl_texture_object *texObj)
+{
+	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+	radeonTexObj *t = radeon_tex_obj(texObj);
+
+	if (t->validated || t->image_override) {
+		return GL_TRUE;
+	}
+
+	/* A missing base image can never validate; check it before reading Border. */
+	const struct gl_texture_image *baseImage = texObj->Image[0][texObj->BaseLevel];
+	if (!baseImage || baseImage->Border > 0)
+		return GL_FALSE;
+
+	_mesa_test_texobj_completeness(rmesa->glCtx, texObj);
+	if (!texObj->_Complete) {
+		return GL_FALSE;
+	}
+
+	calculate_min_max_lod(&t->base, &t->minLod, &t->maxLod);
+
+	if (RADEON_DEBUG & RADEON_TEXTURE)
+		fprintf(stderr, "%s: Validating texture %p now, minLod = %d, maxLod = %d\n",
+				__FUNCTION__, texObj ,t->minLod, t->maxLod);
+
+	radeon_mipmap_tree *dst_miptree = get_biggest_matching_miptree(t, t->minLod, t->maxLod);
+
+	if (!dst_miptree) {
+		radeon_miptree_unreference(&t->mt);
+		radeon_try_alloc_miptree(rmesa, t);
+		dst_miptree = t->mt;
+		if (RADEON_DEBUG & RADEON_TEXTURE)
+			fprintf(stderr, "%s: No matching miptree found, allocated new one %p\n", __FUNCTION__, t->mt);
+		/* t->mt may still be NULL if allocation failed; migrating into it would crash. */
+		if (!dst_miptree)
+			return GL_FALSE;
+	} else if (RADEON_DEBUG & RADEON_TEXTURE) {
+		/* NOTE(review): t->mt is not switched to dst_miptree on this path - confirm intended. */
+		fprintf(stderr, "%s: Using miptree %p\n", __FUNCTION__, dst_miptree);
+	}
+
+	const unsigned faces = texObj->Target == GL_TEXTURE_CUBE_MAP ? 6 : 1;
+	unsigned face, level;
+	radeon_texture_image *img;
+	/* Validate only the levels that will actually be used during rendering */
+	for (face = 0; face < faces; ++face) {
+		for (level = t->minLod; level <= t->maxLod; ++level) {
+			img = get_radeon_texture_image(texObj->Image[face][level]);
+			if (RADEON_DEBUG & RADEON_TEXTURE)
+				fprintf(stderr, "Checking image level %d, face %d, mt %p ... ", level, face, img->mt);
+
+			if (img->mt != dst_miptree) {
+				if (RADEON_DEBUG & RADEON_TEXTURE)
+					fprintf(stderr, "MIGRATING\n");
+				struct radeon_bo *src_bo = (img->mt) ? img->mt->bo : img->bo;
+				if (src_bo && radeon_bo_is_referenced_by_cs(src_bo, rmesa->cmdbuf.cs)) {
+					radeon_firevertices(rmesa);
+				}
+				migrate_image_to_miptree(dst_miptree, img, face, level);
+			} else if (RADEON_DEBUG & RADEON_TEXTURE) {
+				fprintf(stderr, "OK\n");
+			}
+		}
+	}
+
+	t->validated = GL_TRUE;
+	return GL_TRUE;
+}
+
+uint32_t get_base_teximage_offset(radeonTexObj *texObj)
+{
+	/* No miptree attached: report offset 0. */
+	if (!texObj->mt)
+		return 0;
+	/* Otherwise return the offset of face 0 at the texture's minLod level. */
+	return radeon_miptree_image_offset(texObj->mt, 0, texObj->minLod);
+}
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.h b/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.h
index db28252da3..a10649b5ae 100644
--- a/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.h
+++ b/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.h
@@ -44,6 +44,7 @@ struct _radeon_mipmap_level {
 	GLuint depth;
 	GLuint size; /** Size of each image, in bytes */
 	GLuint rowstride; /** in bytes */
+	GLuint valid;
 	radeon_mipmap_image faces[6];
 };
 
@@ -59,43 +60,35 @@ struct _radeon_mipmap_level {
  * changed.
  */
 struct _radeon_mipmap_tree {
-	radeonContextPtr radeon;
-	radeonTexObj *t;
 	struct radeon_bo *bo;
 	GLuint refcount;
 
 	GLuint totalsize; /** total size of the miptree, in bytes */
 
 	GLenum target; /** GL_TEXTURE_xxx */
-	GLenum internal_format;
+	GLenum mesaFormat; /** MESA_FORMAT_xxx */
 	GLuint faces; /** # of faces: 6 for cubemaps, 1 otherwise */
-	GLuint firstLevel; /** First mip level stored in this mipmap tree */
-	GLuint lastLevel; /** Last mip level stored in this mipmap tree */
+	GLuint baseLevel; /** gl_texture_object->baseLevel it was created for */
+	GLuint numLevels; /** Number of mip levels stored in this mipmap tree */
 
-	GLuint width0; /** Width of firstLevel image */
-	GLuint height0; /** Height of firstLevel image */
-	GLuint depth0; /** Depth of firstLevel image */
+	GLuint width0; /** Width of baseLevel image */
+	GLuint height0; /** Height of baseLevel image */
+	GLuint depth0; /** Depth of baseLevel image */
 
-	GLuint bpp; /** Bytes per texel */
 	GLuint tilebits; /** RADEON_TXO_xxx_TILE */
-	GLuint compressed; /** MESA_FORMAT_xxx indicating a compressed format, or 0 if uncompressed */
 
 	radeon_mipmap_level levels[RADEON_MIPTREE_MAX_TEXTURE_LEVELS];
 };
 
-radeon_mipmap_tree* radeon_miptree_create(radeonContextPtr rmesa, radeonTexObj *t,
-		GLenum target, GLenum internal_format, GLuint firstLevel, GLuint lastLevel,
-		GLuint width0, GLuint height0, GLuint depth0,
-		GLuint bpp, GLuint tilebits, GLuint compressed);
-void radeon_miptree_reference(radeon_mipmap_tree *mt);
-void radeon_miptree_unreference(radeon_mipmap_tree *mt);
+void radeon_miptree_reference(radeon_mipmap_tree *mt, radeon_mipmap_tree **ptr);
+void radeon_miptree_unreference(radeon_mipmap_tree **ptr);
 
 GLboolean radeon_miptree_matches_image(radeon_mipmap_tree *mt,
 		struct gl_texture_image *texImage, GLuint face, GLuint level);
-GLboolean radeon_miptree_matches_texture(radeon_mipmap_tree *mt, struct gl_texture_object *texObj);
-void radeon_try_alloc_miptree(radeonContextPtr rmesa, radeonTexObj *t,
-			      radeon_texture_image *texImage, GLuint face, GLuint level);
+void radeon_try_alloc_miptree(radeonContextPtr rmesa, radeonTexObj *t);
 GLuint radeon_miptree_image_offset(radeon_mipmap_tree *mt,
 				   GLuint face, GLuint level);
 void radeon_miptree_depth_offsets(radeon_mipmap_tree *mt, GLuint level, GLuint *offsets);
+
+uint32_t get_base_teximage_offset(radeonTexObj *texObj);
 #endif /* __RADEON_MIPMAP_TREE_H_ */
diff --git a/src/mesa/drivers/dri/radeon/radeon_queryobj.c b/src/mesa/drivers/dri/radeon/radeon_queryobj.c
index 6539c36268..98117cdfc1 100644
--- a/src/mesa/drivers/dri/radeon/radeon_queryobj.c
+++ b/src/mesa/drivers/dri/radeon/radeon_queryobj.c
@@ -31,24 +31,11 @@
 #include "main/imports.h"
 #include "main/simple_list.h"
 
-static int radeonQueryIsFlushed(GLcontext *ctx, struct gl_query_object *q)
-{
-	radeonContextPtr radeon = RADEON_CONTEXT(ctx);
-	struct radeon_query_object *tmp, *query = (struct radeon_query_object *)q;
-
-	foreach(tmp, &radeon->query.not_flushed_head) {
-		if (tmp == query) {
-			return 0;
-		}
-	}
-
-	return 1;
-}
-
 static void radeonQueryGetResult(GLcontext *ctx, struct gl_query_object *q)
 {
 	radeonContextPtr radeon = RADEON_CONTEXT(ctx);
 	struct radeon_query_object *query = (struct radeon_query_object *)q;
+        uint32_t *result;
 	int i;
 
 	radeon_print(RADEON_STATE, RADEON_VERBOSE,
@@ -56,6 +43,7 @@ static void radeonQueryGetResult(GLcontext *ctx, struct gl_query_object *q)
 			__FUNCTION__, query->Base.Id, (int) query->Base.Result);
 
 	radeon_bo_map(query->bo, GL_FALSE);
+        result = query->bo->ptr;
 
 	query->Base.Result = 0;
 	if (IS_R600_CLASS(radeon->radeonScreen)) {
@@ -66,10 +54,11 @@ static void radeonQueryGetResult(GLcontext *ctx, struct gl_query_object *q)
 		 * hw writes zpass end counts to qwords 1, 3, 5, 7.
 		 * then we substract. MSB is the valid bit.
 		 */
-		uint64_t *result = query->bo->ptr;
-		for (i = 0; i < 8; i += 2) {
-			uint64_t start = result[i];
-			uint64_t end = result[i + 1];
+		for (i = 0; i < 16; i += 4) {
+			uint64_t start = (uint64_t)LE32_TO_CPU(result[i]) |
+					 (uint64_t)LE32_TO_CPU(result[i + 1]) << 32;
+			uint64_t end = (uint64_t)LE32_TO_CPU(result[i + 2]) |
+				       (uint64_t)LE32_TO_CPU(result[i + 3]) << 32;
 			if ((start & 0x8000000000000000) && (end & 0x8000000000000000)) {
 				uint64_t query_count = end - start;
 				query->Base.Result += query_count;
@@ -79,10 +68,9 @@ static void radeonQueryGetResult(GLcontext *ctx, struct gl_query_object *q)
 				     "%d start: %lx, end: %lx %ld\n", i, start, end, end - start);
 		}
 	} else {
-		uint32_t *result = query->bo->ptr;
 		for (i = 0; i < query->curr_offset/sizeof(uint32_t); ++i) {
-			query->Base.Result += result[i];
-			radeon_print(RADEON_STATE, RADEON_TRACE, "result[%d] = %d\n", i, result[i]);
+			query->Base.Result += LE32_TO_CPU(result[i]);
+			radeon_print(RADEON_STATE, RADEON_TRACE, "result[%d] = %d\n", i, LE32_TO_CPU(result[i]));
 		}
 	}
 
@@ -120,10 +108,11 @@ static void radeonDeleteQuery(GLcontext *ctx, struct gl_query_object *q)
 
 static void radeonWaitQuery(GLcontext *ctx, struct gl_query_object *q)
 {
+	radeonContextPtr radeon = RADEON_CONTEXT(ctx);
 	struct radeon_query_object *query = (struct radeon_query_object *)q;
 
 	/* If the cmdbuf with packets for this query hasn't been flushed yet, do it now */
-	if (!radeonQueryIsFlushed(ctx, q))
+	if (radeon_bo_is_referenced_by_cs(query->bo, radeon->cmdbuf.cs))
 		ctx->Driver.Flush(ctx);
 
 	radeon_print(RADEON_STATE, RADEON_VERBOSE, "%s: query id %d, bo %p, offset %d\n", __FUNCTION__, q->Id, query->bo, query->curr_offset);
@@ -155,8 +144,6 @@ static void radeonBeginQuery(GLcontext *ctx, struct gl_query_object *q)
 
 	radeon->query.queryobj.dirty = GL_TRUE;
 	radeon->hw.is_dirty = GL_TRUE;
-	insert_at_tail(&radeon->query.not_flushed_head, query);
-
 }
 
 void radeonEmitQueryEnd(GLcontext *ctx)
@@ -204,7 +191,7 @@ static void radeonCheckQuery(GLcontext *ctx, struct gl_query_object *q)
 		uint32_t domain;
 
 		/* Need to perform a flush, as per ARB_occlusion_query spec */
-		if (!radeonQueryIsFlushed(ctx, q)) {
+		if (radeon_bo_is_referenced_by_cs(query->bo, radeon->cmdbuf.cs)) {
 			ctx->Driver.Flush(ctx);
 		}
 
diff --git a/src/mesa/drivers/dri/radeon/radeon_screen.c b/src/mesa/drivers/dri/radeon/radeon_screen.c
index 7a124a8be6..3080a0fcd0 100644
--- a/src/mesa/drivers/dri/radeon/radeon_screen.c
+++ b/src/mesa/drivers/dri/radeon/radeon_screen.c
@@ -214,10 +214,10 @@ static const GLuint __driNConfigOptions = 17;
 
 #endif
 
-static int getSwapInfo( __DRIdrawablePrivate *dPriv, __DRIswapInfo * sInfo );
+static int getSwapInfo( __DRIdrawable *dPriv, __DRIswapInfo * sInfo );
 
 static int
-radeonGetParam(__DRIscreenPrivate *sPriv, int param, void *value)
+radeonGetParam(__DRIscreen *sPriv, int param, void *value)
 {
   int ret;
   drm_radeon_getparam_t gp = { 0 };
@@ -249,7 +249,7 @@ radeonGetParam(__DRIscreenPrivate *sPriv, int param, void *value)
 }
 
 static const __DRIconfig **
-radeonFillInModes( __DRIscreenPrivate *psp,
+radeonFillInModes( __DRIscreen *psp,
 		   unsigned pixel_bits, unsigned depth_bits,
 		   unsigned stencil_bits, GLboolean have_back_buffer )
 {
@@ -390,12 +390,14 @@ static int radeon_set_screen_flags(radeonScreenPtr screen, int device_id)
    screen->device_id = device_id;
    screen->chip_flags = 0;
    switch ( device_id ) {
+   case PCI_CHIP_RN50_515E:
+   case PCI_CHIP_RN50_5969:
+	return -1;
+
    case PCI_CHIP_RADEON_LY:
    case PCI_CHIP_RADEON_LZ:
    case PCI_CHIP_RADEON_QY:
    case PCI_CHIP_RADEON_QZ:
-   case PCI_CHIP_RN50_515E:
-   case PCI_CHIP_RN50_5969:
       screen->chip_family = CHIP_FAMILY_RV100;
       break;
 
@@ -909,7 +911,7 @@ static int radeon_set_screen_flags(radeonScreenPtr screen, int device_id)
 /* Create the device specific screen private data struct.
  */
 static radeonScreenPtr
-radeonCreateScreen( __DRIscreenPrivate *sPriv )
+radeonCreateScreen( __DRIscreen *sPriv )
 {
    radeonScreenPtr screen;
    RADEONDRIPtr dri_priv = (RADEONDRIPtr)sPriv->pDevPriv;
@@ -1248,7 +1250,7 @@ radeonCreateScreen( __DRIscreenPrivate *sPriv )
 }
 
 static radeonScreenPtr
-radeonCreateScreen2(__DRIscreenPrivate *sPriv)
+radeonCreateScreen2(__DRIscreen *sPriv)
 {
    radeonScreenPtr screen;
    int i;
@@ -1399,7 +1401,7 @@ radeonCreateScreen2(__DRIscreenPrivate *sPriv)
 /* Destroy the device specific screen private data struct.
  */
 static void
-radeonDestroyScreen( __DRIscreenPrivate *sPriv )
+radeonDestroyScreen( __DRIscreen *sPriv )
 {
     radeonScreenPtr screen = (radeonScreenPtr)sPriv->private;
 
@@ -1433,7 +1435,7 @@ radeonDestroyScreen( __DRIscreenPrivate *sPriv )
 /* Initialize the driver specific screen private data.
  */
 static GLboolean
-radeonInitDriver( __DRIscreenPrivate *sPriv )
+radeonInitDriver( __DRIscreen *sPriv )
 {
     if (sPriv->dri2.enabled) {
         sPriv->private = (void *) radeonCreateScreen2( sPriv );
@@ -1457,8 +1459,8 @@ radeonInitDriver( __DRIscreenPrivate *sPriv )
  * pbuffers.
  */
 static GLboolean
-radeonCreateBuffer( __DRIscreenPrivate *driScrnPriv,
-                    __DRIdrawablePrivate *driDrawPriv,
+radeonCreateBuffer( __DRIscreen *driScrnPriv,
+                    __DRIdrawable *driDrawPriv,
                     const __GLcontextModes *mesaVis,
                     GLboolean isPixmap )
 {
@@ -1482,11 +1484,11 @@ radeonCreateBuffer( __DRIscreenPrivate *driScrnPriv,
     _mesa_initialize_framebuffer(&rfb->base, mesaVis);
 
     if (mesaVis->redBits == 5)
-        rgbFormat = MESA_FORMAT_RGB565;
+        rgbFormat = _mesa_little_endian() ? MESA_FORMAT_RGB565 : MESA_FORMAT_RGB565_REV;
     else if (mesaVis->alphaBits == 0)
-        rgbFormat = MESA_FORMAT_XRGB8888;
+        rgbFormat = _mesa_little_endian() ? MESA_FORMAT_XRGB8888 : MESA_FORMAT_XRGB8888_REV;
     else
-        rgbFormat = MESA_FORMAT_ARGB8888;
+        rgbFormat = _mesa_little_endian() ? MESA_FORMAT_ARGB8888 : MESA_FORMAT_ARGB8888_REV;
 
     /* front color renderbuffer */
     rfb->color_rb[0] = radeon_create_renderbuffer(rgbFormat, driDrawPriv);
@@ -1557,7 +1559,7 @@ static void radeon_cleanup_renderbuffers(struct radeon_framebuffer *rfb)
 }
 
 void
-radeonDestroyBuffer(__DRIdrawablePrivate *driDrawPriv)
+radeonDestroyBuffer(__DRIdrawable *driDrawPriv)
 {
     struct radeon_framebuffer *rfb;
     if (!driDrawPriv)
@@ -1579,7 +1581,7 @@ radeonDestroyBuffer(__DRIdrawablePrivate *driDrawPriv)
  * \return the __GLcontextModes supported by this driver
  */
 static const __DRIconfig **
-radeonInitScreen(__DRIscreenPrivate *psp)
+radeonInitScreen(__DRIscreen *psp)
 {
 #if defined(RADEON_R100)
    static const char *driver_name = "Radeon";
@@ -1629,7 +1631,7 @@ radeonInitScreen(__DRIscreenPrivate *psp)
  * \return the __GLcontextModes supported by this driver
  */
 static const
-__DRIconfig **radeonInitScreen2(__DRIscreenPrivate *psp)
+__DRIconfig **radeonInitScreen2(__DRIscreen *psp)
 {
    GLenum fb_format[3];
    GLenum fb_type[3];
@@ -1696,7 +1698,7 @@ __DRIconfig **radeonInitScreen2(__DRIscreenPrivate *psp)
  * Get information about previous buffer swaps.
  */
 static int
-getSwapInfo( __DRIdrawablePrivate *dPriv, __DRIswapInfo * sInfo )
+getSwapInfo( __DRIdrawable *dPriv, __DRIswapInfo * sInfo )
 {
     struct radeon_framebuffer *rfb;
 
@@ -1749,3 +1751,10 @@ const struct __DriverAPIRec driDriverAPI = {
    .InitScreen2     = radeonInitScreen2,
 };
 
+/* This is the table of extensions that the loader will dlsym() for. */
+PUBLIC const __DRIextension *__driDriverExtensions[] = {
+    &driCoreExtension.base,
+    &driLegacyExtension.base,
+    &driDRI2Extension.base,
+    NULL
+};
diff --git a/src/mesa/drivers/dri/radeon/radeon_screen.h b/src/mesa/drivers/dri/radeon/radeon_screen.h
index 15744e8828..5e6d432e11 100644
--- a/src/mesa/drivers/dri/radeon/radeon_screen.h
+++ b/src/mesa/drivers/dri/radeon/radeon_screen.h
@@ -86,7 +86,7 @@ typedef struct radeon_screen {
 
    __volatile__ uint32_t *scratch;
 
-   __DRIscreenPrivate *driScreen;
+   __DRIscreen *driScreen;
    unsigned int sarea_priv_offset;
    unsigned int gart_buffer_offset;	/* offset in card memory space */
    unsigned int gart_texture_offset;	/* offset in card memory space */
@@ -123,5 +123,5 @@ typedef struct radeon_screen {
 #define IS_R600_CLASS(screen) \
 	((screen->chip_flags & RADEON_CLASS_MASK) == RADEON_CLASS_R600)
 
-extern void radeonDestroyBuffer(__DRIdrawablePrivate *driDrawPriv);
+extern void radeonDestroyBuffer(__DRIdrawable *driDrawPriv);
 #endif /* __RADEON_SCREEN_H__ */
diff --git a/src/mesa/drivers/dri/radeon/radeon_span.c b/src/mesa/drivers/dri/radeon/radeon_span.c
index 2bc7d31254..8db3d2b143 100644
--- a/src/mesa/drivers/dri/radeon/radeon_span.c
+++ b/src/mesa/drivers/dri/radeon/radeon_span.c
@@ -41,6 +41,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 
 #include "main/glheader.h"
+#include "main/texformat.h"
 #include "swrast/swrast.h"
 
 #include "radeon_common.h"
@@ -400,6 +401,18 @@ static GLubyte *radeon_ptr_2byte_8x2(const struct radeon_renderbuffer * rrb,
 #endif
 #include "spantmp2.h"
 
+#define SPANTMP_PIXEL_FMT GL_RGB
+#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_SHORT_5_6_5_REV
+
+#define TAG(x)    radeon##x##_RGB565_REV
+#define TAG2(x,y) radeon##x##_RGB565_REV##y
+#if defined(RADEON_R600)
+#define GET_PTR(X,Y) r600_ptr_color(rrb, (X) + x_off, (Y) + y_off)
+#else
+#define GET_PTR(X,Y) radeon_ptr_2byte_8x2(rrb, (X) + x_off, (Y) + y_off)
+#endif
+#include "spantmp2.h"
+
 /* 16 bit, ARGB1555 color spanline and pixel functions
  */
 #define SPANTMP_PIXEL_FMT GL_BGRA
@@ -414,6 +427,18 @@ static GLubyte *radeon_ptr_2byte_8x2(const struct radeon_renderbuffer * rrb,
 #endif
 #include "spantmp2.h"
 
+#define SPANTMP_PIXEL_FMT GL_BGRA
+#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_SHORT_1_5_5_5
+
+#define TAG(x)    radeon##x##_ARGB1555_REV
+#define TAG2(x,y) radeon##x##_ARGB1555_REV##y
+#if defined(RADEON_R600)
+#define GET_PTR(X,Y) r600_ptr_color(rrb, (X) + x_off, (Y) + y_off)
+#else
+#define GET_PTR(X,Y) radeon_ptr_2byte_8x2(rrb, (X) + x_off, (Y) + y_off)
+#endif
+#include "spantmp2.h"
+
 /* 16 bit, RGBA4 color spanline and pixel functions
  */
 #define SPANTMP_PIXEL_FMT GL_BGRA
@@ -428,6 +453,18 @@ static GLubyte *radeon_ptr_2byte_8x2(const struct radeon_renderbuffer * rrb,
 #endif
 #include "spantmp2.h"
 
+#define SPANTMP_PIXEL_FMT GL_BGRA
+#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_SHORT_4_4_4_4
+
+#define TAG(x)    radeon##x##_ARGB4444_REV
+#define TAG2(x,y) radeon##x##_ARGB4444_REV##y
+#if defined(RADEON_R600)
+#define GET_PTR(X,Y) r600_ptr_color(rrb, (X) + x_off, (Y) + y_off)
+#else
+#define GET_PTR(X,Y) radeon_ptr_2byte_8x2(rrb, (X) + x_off, (Y) + y_off)
+#endif
+#include "spantmp2.h"
+
 /* 32 bit, xRGB8888 color spanline and pixel functions
  */
 #define SPANTMP_PIXEL_FMT GL_BGRA
@@ -472,6 +509,42 @@ static GLubyte *radeon_ptr_2byte_8x2(const struct radeon_renderbuffer * rrb,
 #endif
 #include "spantmp2.h"
 
+/* 32 bit, BGRx8888 color spanline and pixel functions.
+ * PUT_VALUE is wrapped in do { } while (0) so it expands to one statement. */
+#define SPANTMP_PIXEL_FMT GL_BGRA
+#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_INT_8_8_8_8
+
+#define TAG(x)    radeon##x##_BGRx8888
+#define TAG2(x,y) radeon##x##_BGRx8888##y
+#if defined(RADEON_R600)
+#define GET_VALUE(_x, _y) ((*(GLuint*)(r600_ptr_color(rrb, _x + x_off, _y + y_off)) | 0x000000ff))
+#define PUT_VALUE(_x, _y, d) do { \
+   GLuint *_ptr = (GLuint*)r600_ptr_color( rrb, _x + x_off, _y + y_off );		\
+   *_ptr = (d);								\
+} while (0)
+#else
+#define GET_VALUE(_x, _y) ((*(GLuint*)(radeon_ptr_4byte(rrb, _x + x_off, _y + y_off)) | 0x000000ff))
+#define PUT_VALUE(_x, _y, d) do { \
+   GLuint *_ptr = (GLuint*)radeon_ptr_4byte( rrb, _x + x_off, _y + y_off );		\
+   *_ptr = (d);								\
+} while (0)
+#endif
+#include "spantmp2.h"
+
+/* 32 bit, BGRA8888 color spanline and pixel functions
+ */
+#define SPANTMP_PIXEL_FMT GL_BGRA
+#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_INT_8_8_8_8
+
+#define TAG(x)    radeon##x##_BGRA8888
+#define TAG2(x,y) radeon##x##_BGRA8888##y
+#if defined(RADEON_R600)
+#define GET_PTR(X,Y) r600_ptr_color(rrb, (X) + x_off, (Y) + y_off)
+#else
+#define GET_PTR(X,Y) radeon_ptr_4byte(rrb, (X) + x_off, (Y) + y_off)
+#endif
+#include "spantmp2.h"
+
 /* ================================================================
  * Depth buffer
  */
@@ -526,10 +599,10 @@ static GLubyte *radeon_ptr_2byte_8x2(const struct radeon_renderbuffer * rrb,
 #define WRITE_DEPTH( _x, _y, d )					\
 do {									\
    GLuint *_ptr = (GLuint*)radeon_ptr_4byte( rrb, _x + x_off, _y + y_off );		\
-   GLuint tmp = *_ptr;				\
+   GLuint tmp = LE32_TO_CPU(*_ptr);                                     \
    tmp &= 0x000000ff;							\
    tmp |= ((d << 8) & 0xffffff00);					\
-   *_ptr = tmp;					\
+   *_ptr = CPU_TO_LE32(tmp);                                            \
 } while (0)
 #elif defined(RADEON_R600)
 #define WRITE_DEPTH( _x, _y, d )					\
@@ -544,26 +617,26 @@ do {									\
 #define WRITE_DEPTH( _x, _y, d )					\
 do {									\
    GLuint *_ptr = (GLuint*)r200_depth_4byte( rrb, _x + x_off, _y + y_off );		\
-   GLuint tmp = *_ptr;				\
+   GLuint tmp = LE32_TO_CPU(*_ptr);                                     \
    tmp &= 0xff000000;							\
    tmp |= ((d) & 0x00ffffff);						\
-   *_ptr = tmp;					\
+   *_ptr = CPU_TO_LE32(tmp);                                            \
 } while (0)
 #else
 #define WRITE_DEPTH( _x, _y, d )					\
 do {									\
    GLuint *_ptr = (GLuint*)radeon_ptr_4byte( rrb, _x + x_off, _y + y_off );	\
-   GLuint tmp = *_ptr;							\
+   GLuint tmp = LE32_TO_CPU(*_ptr);                                     \
    tmp &= 0xff000000;							\
    tmp |= ((d) & 0x00ffffff);						\
-   *_ptr = tmp;					\
+   *_ptr = CPU_TO_LE32(tmp);                                            \
 } while (0)
 #endif
 
 #if defined(RADEON_R300)
 #define READ_DEPTH( d, _x, _y )						\
   do {									\
-    d = (*(GLuint*)(radeon_ptr_4byte(rrb, _x + x_off, _y + y_off)) & 0xffffff00) >> 8; \
+    d = (LE32_TO_CPU(*(GLuint*)(radeon_ptr_4byte(rrb, _x + x_off, _y + y_off))) & 0xffffff00) >> 8; \
   }while(0)
 #elif defined(RADEON_R600)
 #define READ_DEPTH( d, _x, _y )						\
@@ -573,11 +646,11 @@ do {									\
 #elif defined(RADEON_R200)
 #define READ_DEPTH( d, _x, _y )						\
   do {									\
-    d = *(GLuint*)(r200_depth_4byte(rrb, _x + x_off, _y + y_off)) & 0x00ffffff; \
+    d = LE32_TO_CPU(*(GLuint*)(r200_depth_4byte(rrb, _x + x_off, _y + y_off))) & 0x00ffffff; \
   }while(0)
 #else
 #define READ_DEPTH( d, _x, _y )	\
-  d = *(GLuint*)(radeon_ptr_4byte(rrb, _x + x_off,	_y + y_off)) & 0x00ffffff;
+  d = LE32_TO_CPU(*(GLuint*)(radeon_ptr_4byte(rrb, _x + x_off,	_y + y_off))) & 0x00ffffff;
 #endif
 
 #define TAG(x) radeon##x##_z24
@@ -595,7 +668,7 @@ do {									\
 #define WRITE_DEPTH( _x, _y, d )					\
 do {									\
    GLuint *_ptr = (GLuint*)radeon_ptr_4byte( rrb, _x + x_off, _y + y_off );		\
-   *_ptr = d;								\
+   *_ptr = CPU_TO_LE32((((d) & 0xff000000) >> 24) | (((d) & 0x00ffffff) << 8));   \
 } while (0)
 #elif defined(RADEON_R600)
 #define WRITE_DEPTH( _x, _y, d )					\
@@ -615,20 +688,21 @@ do {									\
 #define WRITE_DEPTH( _x, _y, d )					\
 do {									\
    GLuint *_ptr = (GLuint*)r200_depth_4byte( rrb, _x + x_off, _y + y_off );		\
-   *_ptr = d;								\
+   *_ptr = CPU_TO_LE32(d);						\
 } while (0)
 #else
 #define WRITE_DEPTH( _x, _y, d )					\
 do {									\
    GLuint *_ptr = (GLuint*)radeon_ptr_4byte( rrb, _x + x_off, _y + y_off );	\
-   *_ptr = d;					\
+   *_ptr = CPU_TO_LE32(d);						\
 } while (0)
 #endif
 
 #if defined(RADEON_R300)
 #define READ_DEPTH( d, _x, _y )						\
   do { \
-    d = (*(GLuint*)(radeon_ptr_4byte(rrb, _x + x_off, _y + y_off)));	\
+    GLuint tmp = LE32_TO_CPU(*(GLuint*)(radeon_ptr_4byte(rrb, _x + x_off, _y + y_off)));	\
+    d = ((tmp & 0x000000ff) << 24) | ((tmp & 0xffffff00) >> 8); /* swap to CPU order first, then unshuffle */	\
   }while(0)
 #elif defined(RADEON_R600)
 #define READ_DEPTH( d, _x, _y )						\
@@ -639,11 +713,11 @@ do {									\
 #elif defined(RADEON_R200)
 #define READ_DEPTH( d, _x, _y )						\
   do { \
-    d = *(GLuint*)(r200_depth_4byte(rrb, _x + x_off, _y + y_off));	\
+    d = LE32_TO_CPU(*(GLuint*)(r200_depth_4byte(rrb, _x + x_off, _y + y_off))); \
   }while(0)
 #else
 #define READ_DEPTH( d, _x, _y )	do {					\
-    d = *(GLuint*)(radeon_ptr_4byte(rrb, _x + x_off,	_y + y_off )); \
+    d = LE32_TO_CPU(*(GLuint*)(radeon_ptr_4byte(rrb, _x + x_off, _y + y_off))); \
   } while (0)
 #endif
 
@@ -660,10 +734,10 @@ do {									\
 #define WRITE_STENCIL( _x, _y, d )					\
 do {									\
    GLuint *_ptr = (GLuint*)radeon_ptr_4byte(rrb, _x + x_off, _y + y_off);		\
-   GLuint tmp = *_ptr;				\
+   GLuint tmp = LE32_TO_CPU(*_ptr);                                     \
    tmp &= 0xffffff00;							\
    tmp |= (d) & 0xff;							\
-   *_ptr = tmp;					\
+   *_ptr = CPU_TO_LE32(tmp);                                            \
 } while (0)
 #elif defined(RADEON_R600)
 #define WRITE_STENCIL( _x, _y, d )					\
@@ -678,19 +752,19 @@ do {									\
 #define WRITE_STENCIL( _x, _y, d )					\
 do {									\
    GLuint *_ptr = (GLuint*)r200_depth_4byte(rrb, _x + x_off, _y + y_off);		\
-   GLuint tmp = *_ptr;				\
+   GLuint tmp = LE32_TO_CPU(*_ptr);                                     \
    tmp &= 0x00ffffff;							\
    tmp |= (((d) & 0xff) << 24);						\
-   *_ptr = tmp;					\
+   *_ptr = CPU_TO_LE32(tmp);                                            \
 } while (0)
 #else
 #define WRITE_STENCIL( _x, _y, d )					\
 do {									\
    GLuint *_ptr = (GLuint*)radeon_ptr_4byte(rrb, _x + x_off, _y + y_off);		\
-   GLuint tmp = *_ptr;				\
+   GLuint tmp = LE32_TO_CPU(*_ptr);                                     \
    tmp &= 0x00ffffff;							\
    tmp |= (((d) & 0xff) << 24);						\
-   *_ptr = tmp;					\
+   *_ptr = CPU_TO_LE32(tmp);                                            \
 } while (0)
 #endif
 
@@ -698,7 +772,7 @@ do {									\
 #define READ_STENCIL( d, _x, _y )					\
 do {									\
    GLuint *_ptr = (GLuint*)radeon_ptr_4byte( rrb, _x + x_off, _y + y_off );		\
-   GLuint tmp = *_ptr;				\
+   GLuint tmp = LE32_TO_CPU(*_ptr);                                     \
    d = tmp & 0x000000ff;						\
 } while (0)
 #elif defined(RADEON_R600)
@@ -712,14 +786,14 @@ do {									\
 #define READ_STENCIL( d, _x, _y )					\
 do {									\
    GLuint *_ptr = (GLuint*)r200_depth_4byte( rrb, _x + x_off, _y + y_off );		\
-   GLuint tmp = *_ptr;				\
+   GLuint tmp = LE32_TO_CPU(*_ptr);                                     \
    d = (tmp & 0xff000000) >> 24;					\
 } while (0)
 #else
 #define READ_STENCIL( d, _x, _y )					\
 do {									\
    GLuint *_ptr = (GLuint*)radeon_ptr_4byte( rrb, _x + x_off, _y + y_off );		\
-   GLuint tmp = *_ptr;				\
+   GLuint tmp = LE32_TO_CPU(*_ptr);                                     \
    d = (tmp & 0xff000000) >> 24;					\
 } while (0)
 #endif
@@ -737,8 +811,7 @@ static void map_unmap_rb(struct gl_renderbuffer *rb, int flag)
 		return;
 
 	if (flag) {
-		if (rrb->bo->bom->funcs->bo_wait)
-			radeon_bo_wait(rrb->bo);
+	        radeon_bo_wait(rrb->bo);
 		r = radeon_bo_map(rrb->bo, 1);
 		if (r) {
 			fprintf(stderr, "(%s) error(%d) mapping buffer.\n",
@@ -754,18 +827,21 @@ static void map_unmap_rb(struct gl_renderbuffer *rb, int flag)
 }
 
 static void
-radeon_map_unmap_buffers(GLcontext *ctx, GLboolean map)
+radeon_map_unmap_framebuffer(GLcontext *ctx, struct gl_framebuffer *fb,
+			     GLboolean map)
 {
 	GLuint i, j;
 
 	/* color draw buffers */
 	for (j = 0; j < ctx->DrawBuffer->_NumColorDrawBuffers; j++)
-		map_unmap_rb(ctx->DrawBuffer->_ColorDrawBuffers[j], map);
+		map_unmap_rb(fb->_ColorDrawBuffers[j], map);
+
+	map_unmap_rb(fb->_ColorReadBuffer, map);
 
 	/* check for render to textures */
 	for (i = 0; i < BUFFER_COUNT; i++) {
 		struct gl_renderbuffer_attachment *att =
-			ctx->DrawBuffer->Attachment + i;
+			fb->Attachment + i;
 		struct gl_texture_object *tex = att->Texture;
 		if (tex) {
 			/* Render to texture. Note that a mipmapped texture need not
@@ -781,15 +857,15 @@ radeon_map_unmap_buffers(GLcontext *ctx, GLboolean map)
 				radeon_teximage_unmap(image);
 		}
 	}
-
-	map_unmap_rb(ctx->ReadBuffer->_ColorReadBuffer, map);
-
+	
 	/* depth buffer (Note wrapper!) */
-	if (ctx->DrawBuffer->_DepthBuffer)
-		map_unmap_rb(ctx->DrawBuffer->_DepthBuffer->Wrapped, map);
+	if (fb->_DepthBuffer)
+		map_unmap_rb(fb->_DepthBuffer->Wrapped, map);
 
-	if (ctx->DrawBuffer->_StencilBuffer)
-		map_unmap_rb(ctx->DrawBuffer->_StencilBuffer->Wrapped, map);
+	if (fb->_StencilBuffer)
+		map_unmap_rb(fb->_StencilBuffer->Wrapped, map);
+
+	radeon_check_front_buffer_rendering(ctx);
 }
 
 static void radeonSpanRenderStart(GLcontext * ctx)
@@ -814,23 +890,30 @@ static void radeonSpanRenderStart(GLcontext * ctx)
 			ctx->Driver.MapTexture(ctx, ctx->Texture.Unit[i]._Current);
 	}
 
-	radeon_map_unmap_buffers(ctx, 1);
+	radeon_map_unmap_framebuffer(ctx, ctx->DrawBuffer, GL_TRUE);
+	if (ctx->ReadBuffer != ctx->DrawBuffer)
+		radeon_map_unmap_framebuffer(ctx, ctx->ReadBuffer, GL_TRUE);
 }
 
 static void radeonSpanRenderFinish(GLcontext * ctx)
 {
 	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
 	int i;
+
 	_swrast_flush(ctx);
-	if (!rmesa->radeonScreen->driScreen->dri2.enabled) {
-		UNLOCK_HARDWARE(rmesa);
-	}
+
 	for (i = 0; i < ctx->Const.MaxTextureImageUnits; i++) {
 		if (ctx->Texture.Unit[i]._ReallyEnabled)
 			ctx->Driver.UnmapTexture(ctx, ctx->Texture.Unit[i]._Current);
 	}
 
-	radeon_map_unmap_buffers(ctx, 0);
+	radeon_map_unmap_framebuffer(ctx, ctx->DrawBuffer, GL_FALSE);
+	if (ctx->ReadBuffer != ctx->DrawBuffer)
+		radeon_map_unmap_framebuffer(ctx, ctx->ReadBuffer, GL_FALSE);
+
+	if (!rmesa->radeonScreen->driScreen->dri2.enabled) {
+		UNLOCK_HARDWARE(rmesa);
+	}
 }
 
 void radeonInitSpanFuncs(GLcontext * ctx)
@@ -848,14 +931,24 @@ static void radeonSetSpanFunctions(struct radeon_renderbuffer *rrb)
 {
 	if (rrb->base.Format == MESA_FORMAT_RGB565) {
 		radeonInitPointers_RGB565(&rrb->base);
+	} else if (rrb->base.Format == MESA_FORMAT_RGB565_REV) {
+		radeonInitPointers_RGB565_REV(&rrb->base);
 	} else if (rrb->base.Format == MESA_FORMAT_XRGB8888) {
 		radeonInitPointers_xRGB8888(&rrb->base);
+        } else if (rrb->base.Format == MESA_FORMAT_XRGB8888_REV) {
+		radeonInitPointers_BGRx8888(&rrb->base);
 	} else if (rrb->base.Format == MESA_FORMAT_ARGB8888) {
 		radeonInitPointers_ARGB8888(&rrb->base);
+        } else if (rrb->base.Format == MESA_FORMAT_ARGB8888_REV) {
+		radeonInitPointers_BGRA8888(&rrb->base);
 	} else if (rrb->base.Format == MESA_FORMAT_ARGB4444) {
 		radeonInitPointers_ARGB4444(&rrb->base);
+	} else if (rrb->base.Format == MESA_FORMAT_ARGB4444_REV) {
+		radeonInitPointers_ARGB4444_REV(&rrb->base);
 	} else if (rrb->base.Format == MESA_FORMAT_ARGB1555) {
 		radeonInitPointers_ARGB1555(&rrb->base);
+	} else if (rrb->base.Format == MESA_FORMAT_ARGB1555_REV) {
+		radeonInitPointers_ARGB1555_REV(&rrb->base);
 	} else if (rrb->base.Format == MESA_FORMAT_Z16) {
 		radeonInitDepthPointers_z16(&rrb->base);
 	} else if (rrb->base.Format == MESA_FORMAT_X8_Z24) {
diff --git a/src/mesa/drivers/dri/radeon/radeon_state.c b/src/mesa/drivers/dri/radeon/radeon_state.c
index 4d0d35ee0c..1c9ec36dae 100644
--- a/src/mesa/drivers/dri/radeon/radeon_state.c
+++ b/src/mesa/drivers/dri/radeon/radeon_state.c
@@ -521,10 +521,10 @@ static void radeonColorMask( GLcontext *ctx,
      return;
 
    mask = radeonPackColor( rrb->cpp,
-			   ctx->Color.ColorMask[RCOMP],
-			   ctx->Color.ColorMask[GCOMP],
-			   ctx->Color.ColorMask[BCOMP],
-			   ctx->Color.ColorMask[ACOMP] );
+			   ctx->Color.ColorMask[0][RCOMP],
+			   ctx->Color.ColorMask[0][GCOMP],
+			   ctx->Color.ColorMask[0][BCOMP],
+			   ctx->Color.ColorMask[0][ACOMP] );
 
    if ( rmesa->hw.msk.cmd[MSK_RB3D_PLANEMASK] != mask ) {
       RADEON_STATECHANGE( rmesa, msk );
@@ -550,6 +550,31 @@ static void radeonPolygonOffset( GLcontext *ctx,
    rmesa->hw.zbs.cmd[ZBS_SE_ZBIAS_CONSTANT] = constant.ui32;
 }
 
+static void radeonPolygonStipplePreKMS( GLcontext *ctx, const GLubyte *mask )
+{
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+   GLuint i;
+   drm_radeon_stipple_t stipple;
+
+   /* Must flip pattern upside down.
+    */
+   for ( i = 0 ; i < 32 ; i++ ) {
+      rmesa->state.stipple.mask[31 - i] = ((GLuint *) mask)[i];
+   }
+
+   /* TODO: push this into cmd mechanism
+    */
+   radeon_firevertices(&rmesa->radeon);
+   LOCK_HARDWARE( &rmesa->radeon );
+
+   /* FIXME: Use window x,y offsets into stipple RAM.
+    */
+   stipple.mask = rmesa->state.stipple.mask;
+   drmCommandWrite( rmesa->radeon.dri.fd, DRM_RADEON_STIPPLE,
+		    &stipple, sizeof(drm_radeon_stipple_t) );
+   UNLOCK_HARDWARE( &rmesa->radeon );
+}
+
 static void radeonPolygonMode( GLcontext *ctx, GLenum face, GLenum mode )
 {
    r100ContextPtr rmesa = R100_CONTEXT(ctx);
@@ -1375,7 +1400,7 @@ static void radeonClearStencil( GLcontext *ctx, GLint s )
 void radeonUpdateWindow( GLcontext *ctx )
 {
    r100ContextPtr rmesa = R100_CONTEXT(ctx);
-   __DRIdrawablePrivate *dPriv = radeon_get_drawable(&rmesa->radeon);
+   __DRIdrawable *dPriv = radeon_get_drawable(&rmesa->radeon);
    GLfloat xoffset = dPriv ? (GLfloat) dPriv->x : 0;
    GLfloat yoffset = dPriv ? (GLfloat) dPriv->y + dPriv->h : 0;
    const GLfloat *v = ctx->Viewport._WindowMap.m;
@@ -1430,7 +1455,7 @@ static void radeonDepthRange( GLcontext *ctx, GLclampd nearval,
 void radeonUpdateViewportOffset( GLcontext *ctx )
 {
    r100ContextPtr rmesa = R100_CONTEXT(ctx);
-   __DRIdrawablePrivate *dPriv = radeon_get_drawable(&rmesa->radeon);
+   __DRIdrawable *dPriv = radeon_get_drawable(&rmesa->radeon);
    GLfloat xoffset = (GLfloat)dPriv->x;
    GLfloat yoffset = (GLfloat)dPriv->y + dPriv->h;
    const GLfloat *v = ctx->Viewport._WindowMap.m;
diff --git a/src/mesa/drivers/dri/radeon/radeon_state_init.c b/src/mesa/drivers/dri/radeon/radeon_state_init.c
index 2d19220d8a..dd82888254 100644
--- a/src/mesa/drivers/dri/radeon/radeon_state_init.c
+++ b/src/mesa/drivers/dri/radeon/radeon_state_init.c
@@ -645,11 +645,11 @@ static void tex_emit_cs(GLcontext *ctx, struct radeon_state_atom *atom)
      OUT_BATCH(CP_PACKET0(RADEON_PP_TXOFFSET_0 + (24 * i), 0));
      if (t->mt && !t->image_override) {
         if ((ctx->Texture.Unit[i]._ReallyEnabled & TEXTURE_CUBE_BIT)) {
-            lvl = &t->mt->levels[0];
+            lvl = &t->mt->levels[t->minLod];
 	    OUT_BATCH_RELOC(lvl->faces[5].offset, t->mt->bo, lvl->faces[5].offset,
 			RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0);
         } else {
-           OUT_BATCH_RELOC(t->tile_bits, t->mt->bo, 0,
+           OUT_BATCH_RELOC(t->tile_bits, t->mt->bo, get_base_teximage_offset(t),
 		     RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0);
         }
       } else {
diff --git a/src/mesa/drivers/dri/radeon/radeon_swtcl.c b/src/mesa/drivers/dri/radeon/radeon_swtcl.c
index e61f59eaea..8bf1bfbc57 100644
--- a/src/mesa/drivers/dri/radeon/radeon_swtcl.c
+++ b/src/mesa/drivers/dri/radeon/radeon_swtcl.c
@@ -179,7 +179,7 @@ static void radeonSetVertexFormat( GLcontext *ctx )
 
       for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
 	 if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_TEX(i) )) {
-	    GLuint sz = VB->TexCoordPtr[i]->size;
+	    GLuint sz = VB->AttribPtr[_TNL_ATTRIB_TEX0 + i]->size;
 
 	    switch (sz) {
 	    case 1:
@@ -309,7 +309,7 @@ void r100_swtcl_flush(GLcontext *ctx, uint32_t current_offset)
    radeonEmitState(&rmesa->radeon);
    radeonEmitVertexAOS( rmesa,
 			rmesa->radeon.swtcl.vertex_size,
-			first_elem(&rmesa->radeon.dma.reserved)->bo,
+			rmesa->radeon.swtcl.bo,
 			current_offset);
 
 		      
diff --git a/src/mesa/drivers/dri/radeon/radeon_tcl.c b/src/mesa/drivers/dri/radeon/radeon_tcl.c
index b334ea05e5..cd02bfbcf5 100644
--- a/src/mesa/drivers/dri/radeon/radeon_tcl.c
+++ b/src/mesa/drivers/dri/radeon/radeon_tcl.c
@@ -412,6 +412,7 @@ static GLuint radeonEnsureEmitSize( GLcontext * ctx , GLuint inputs )
 	space_required += vbuf;
       else
 	space_required += index + elts;
+      space_required += VB->Primitive[i].count * 3;
       space_required += AOS_BUFSZ(nr_aos);
     }
     space_required += SCISSOR_BUFSZ;
diff --git a/src/mesa/drivers/dri/radeon/radeon_tex.c b/src/mesa/drivers/dri/radeon/radeon_tex.c
index 60981aada2..14163f13af 100644
--- a/src/mesa/drivers/dri/radeon/radeon_tex.c
+++ b/src/mesa/drivers/dri/radeon/radeon_tex.c
@@ -341,24 +341,14 @@ static void radeonTexParameter( GLcontext *ctx, GLenum target,
       break;
 
    case GL_TEXTURE_BORDER_COLOR:
-      radeonSetTexBorderColor( t, texObj->BorderColor );
+      radeonSetTexBorderColor( t, texObj->BorderColor.f );
       break;
 
    case GL_TEXTURE_BASE_LEVEL:
    case GL_TEXTURE_MAX_LEVEL:
    case GL_TEXTURE_MIN_LOD:
    case GL_TEXTURE_MAX_LOD:
-
-      /* This isn't the most efficient solution but there doesn't appear to
-       * be a nice alternative.  Since there's no LOD clamping,
-       * we just have to rely on loading the right subset of mipmap levels
-       * to simulate a clamped LOD.
-       */
-      if (t->mt) {
-         radeon_miptree_unreference(t->mt);
-	 t->mt = 0;
-	 t->validated = GL_FALSE;
-      }
+      t->validated = GL_FALSE;
       break;
 
    default:
@@ -388,10 +378,8 @@ static void radeonDeleteTexture( GLcontext *ctx,
      }
    }
 
-   if (t->mt) {
-      radeon_miptree_unreference(t->mt);
-      t->mt = 0;
-   }
+   radeon_miptree_unreference(&t->mt);
+
    /* Free mipmap images and the texture object itself */
    _mesa_delete_texture_object(ctx, texObj);
 }
@@ -440,7 +428,7 @@ radeonNewTextureObject( GLcontext *ctx, GLuint name, GLenum target )
    radeonSetTexWrap( t, t->base.WrapS, t->base.WrapT );
    radeonSetTexMaxAnisotropy( t, t->base.MaxAnisotropy );
    radeonSetTexFilter( t, t->base.MinFilter, t->base.MagFilter );
-   radeonSetTexBorderColor( t, t->base.BorderColor );
+   radeonSetTexBorderColor( t, t->base.BorderColor.f );
    return &t->base;
 }
 
diff --git a/src/mesa/drivers/dri/radeon/radeon_texstate.c b/src/mesa/drivers/dri/radeon/radeon_texstate.c
index 429977a8bc..84ddcfd4fd 100644
--- a/src/mesa/drivers/dri/radeon/radeon_texstate.c
+++ b/src/mesa/drivers/dri/radeon/radeon_texstate.c
@@ -672,24 +672,13 @@ void radeonSetTexBuffer2(__DRIcontext *pDRICtx, GLint target, GLint glx_texture_
     	    return;
     	}
 
-	radeon_update_renderbuffers(pDRICtx, dPriv);
-	/* back & depth buffer are useless free them right away */
-	rb = (void*)rfb->base.Attachment[BUFFER_DEPTH].Renderbuffer;
-	if (rb && rb->bo) {
-		radeon_bo_unref(rb->bo);
-        rb->bo = NULL;
-	}
-	rb = (void*)rfb->base.Attachment[BUFFER_BACK_LEFT].Renderbuffer;
-	if (rb && rb->bo) {
-		radeon_bo_unref(rb->bo);
-		rb->bo = NULL;
-	}
+	radeon_update_renderbuffers(pDRICtx, dPriv, GL_TRUE);
 	rb = rfb->color_rb[0];
 	if (rb->bo == NULL) {
 		/* Failed to BO for the buffer */
 		return;
 	}
-	
+
 	_mesa_lock_texture(radeon->glCtx, texObj);
 	if (t->bo) {
 		radeon_bo_unref(t->bo);
@@ -699,14 +688,10 @@ void radeonSetTexBuffer2(__DRIcontext *pDRICtx, GLint target, GLint glx_texture_
 		radeon_bo_unref(rImage->bo);
 		rImage->bo = NULL;
 	}
-	if (t->mt) {
-		radeon_miptree_unreference(t->mt);
-		t->mt = NULL;
-	}
-	if (rImage->mt) {
-		radeon_miptree_unreference(rImage->mt);
-		rImage->mt = NULL;
-	}
+
+	radeon_miptree_unreference(&t->mt);
+	radeon_miptree_unreference(&rImage->mt);
+
 	_mesa_init_teximage_fields(radeon->glCtx, target, texImage,
 				   rb->base.Width, rb->base.Height, 1, 0, rb->cpp);
 	texImage->RowStride = rb->pitch / rb->cpp;
@@ -718,8 +703,6 @@ void radeonSetTexBuffer2(__DRIcontext *pDRICtx, GLint target, GLint glx_texture_
 	t->tile_bits = 0;
 	t->image_override = GL_TRUE;
 	t->override_offset = 0;
-	t->pp_txpitch &= (1 << 13) -1;
-	pitch_val = rb->pitch;
 	switch (rb->cpp) {
 	case 4:
 		if (glx_texture_format == GLX_TEXTURE_FORMAT_RGB_EXT)
@@ -738,12 +721,17 @@ void radeonSetTexBuffer2(__DRIcontext *pDRICtx, GLint target, GLint glx_texture_
 		t->pp_txfilter |= tx_table[MESA_FORMAT_RGB565].filter;
 		break;
 	}
-        t->pp_txsize = ((rb->base.Width - 1) << RADEON_TEX_USIZE_SHIFT)
-		   | ((rb->base.Height - 1) << RADEON_TEX_VSIZE_SHIFT);
-        t->pp_txformat |= RADEON_TXFORMAT_NON_POWER2;
-	t->pp_txpitch = pitch_val;
-        t->pp_txpitch -= 32;
 
+	t->pp_txpitch &= (1 << 13) -1;
+	pitch_val = rb->pitch;
+
+        t->pp_txsize = ((rb->base.Width - 1) << RADEON_TEX_USIZE_SHIFT)
+		| ((rb->base.Height - 1) << RADEON_TEX_VSIZE_SHIFT);
+	if (target == GL_TEXTURE_RECTANGLE_NV) {
+		t->pp_txformat |= RADEON_TXFORMAT_NON_POWER2;
+		t->pp_txpitch = pitch_val;
+		t->pp_txpitch -= 32;
+	}
 	t->validated = GL_TRUE;
 	_mesa_unlock_texture(radeon->glCtx, texObj);
 	return;
@@ -1021,7 +1009,7 @@ static GLboolean setup_hardware_state(r100ContextPtr rmesa, radeonTexObj *t, int
 	return GL_TRUE;
    }
 
-   firstImage = t->base.Image[0][t->mt->firstLevel];   
+   firstImage = t->base.Image[0][t->minLod];
 
    if (firstImage->Border > 0) {
       fprintf(stderr, "%s: border\n", __FUNCTION__);
@@ -1049,9 +1037,9 @@ static GLboolean setup_hardware_state(r100ContextPtr rmesa, radeonTexObj *t, int
 	 return GL_FALSE;
       }
    }
-   
+
    t->pp_txfilter &= ~RADEON_MAX_MIP_LEVEL_MASK;
-   t->pp_txfilter |= (t->mt->lastLevel - t->mt->firstLevel) << RADEON_MAX_MIP_LEVEL_SHIFT;
+   t->pp_txfilter |= (t->maxLod - t->minLod) << RADEON_MAX_MIP_LEVEL_SHIFT;
 	
    t->pp_txformat &= ~(RADEON_TXFORMAT_WIDTH_MASK |
 		       RADEON_TXFORMAT_HEIGHT_MASK |
@@ -1060,9 +1048,9 @@ static GLboolean setup_hardware_state(r100ContextPtr rmesa, radeonTexObj *t, int
 		       RADEON_TXFORMAT_F5_HEIGHT_MASK);
    t->pp_txformat |= ((log2Width << RADEON_TXFORMAT_WIDTH_SHIFT) |
 		      (log2Height << RADEON_TXFORMAT_HEIGHT_SHIFT));
-   
+
    t->tile_bits = 0;
-   
+
    if (t->base.Target == GL_TEXTURE_CUBE_MAP) {
       ASSERT(log2Width == log2Height);
       t->pp_txformat |= ((log2Width << RADEON_TXFORMAT_F5_WIDTH_SHIFT) |
diff --git a/src/mesa/drivers/dri/radeon/radeon_texture.c b/src/mesa/drivers/dri/radeon/radeon_texture.c
index baa99b752b..03178116c1 100644
--- a/src/mesa/drivers/dri/radeon/radeon_texture.c
+++ b/src/mesa/drivers/dri/radeon/radeon_texture.c
@@ -1,4 +1,5 @@
 /*
+ * Copyright (C) 2009 Maciej Cencora.
  * Copyright (C) 2008 Nicolai Haehnle.
  * Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
  *
@@ -46,7 +47,7 @@
 #include "radeon_mipmap_tree.h"
 
 
-static void copy_rows(void* dst, GLuint dststride, const void* src, GLuint srcstride,
+void copy_rows(void* dst, GLuint dststride, const void* src, GLuint srcstride,
 	GLuint numrows, GLuint rowsize)
 {
 	assert(rowsize <= dststride);
@@ -81,8 +82,7 @@ void radeonFreeTexImageData(GLcontext *ctx, struct gl_texture_image *timage)
 	radeon_texture_image* image = get_radeon_texture_image(timage);
 
 	if (image->mt) {
-		radeon_miptree_unreference(image->mt);
-		image->mt = 0;
+		radeon_miptree_unreference(&image->mt);
 		assert(!image->base.Data);
 	} else {
 		_mesa_free_texture_image_data(ctx, timage);
@@ -108,7 +108,7 @@ static void teximage_set_map_data(radeon_texture_image *image)
 	lvl = &image->mt->levels[image->mtlevel];
 
 	image->base.Data = image->mt->bo->ptr + lvl->faces[image->mtface].offset;
-	image->base.RowStride = lvl->rowstride / image->mt->bpp;
+	image->base.RowStride = lvl->rowstride / _mesa_get_format_bytes(image->base.TexFormat);
 }
 
 
@@ -174,7 +174,7 @@ void radeonMapTexture(GLcontext *ctx, struct gl_texture_object *texObj)
 
 	radeon_bo_map(t->mt->bo, GL_FALSE);
 	for(face = 0; face < t->mt->faces; ++face) {
-		for(level = t->mt->firstLevel; level <= t->mt->lastLevel; ++level)
+		for(level = t->minLod; level <= t->maxLod; ++level)
 			teximage_set_map_data(get_radeon_texture_image(texObj->Image[face][level]));
 	}
 }
@@ -191,7 +191,7 @@ void radeonUnmapTexture(GLcontext *ctx, struct gl_texture_object *texObj)
 	  return;
 
 	for(face = 0; face < t->mt->faces; ++face) {
-		for(level = t->mt->firstLevel; level <= t->mt->lastLevel; ++level)
+		for(level = t->minLod; level <= t->maxLod; ++level)
 			texObj->Image[face][level]->Data = 0;
 	}
 	radeon_bo_unmap(t->mt->bo);
@@ -240,8 +240,7 @@ static void radeon_generate_mipmap(GLcontext *ctx, GLenum target,
 			image->mtlevel = i;
 			image->mtface = face;
 
-			radeon_miptree_unreference(image->mt);
-			image->mt = NULL;
+			radeon_miptree_unreference(&image->mt);
 		}
 	}
 	
@@ -473,6 +472,19 @@ gl_format radeonChooseTextureFormat(GLcontext * ctx,
 	case GL_RGBA32F_ARB:
 		return MESA_FORMAT_RGBA_FLOAT32;
 
+#ifdef RADEON_R300
+	case GL_DEPTH_COMPONENT:
+	case GL_DEPTH_COMPONENT16:
+		return MESA_FORMAT_Z16;
+	case GL_DEPTH_COMPONENT24:
+	case GL_DEPTH_COMPONENT32:
+	case GL_DEPTH_STENCIL_EXT:
+	case GL_DEPTH24_STENCIL8_EXT:
+		if (rmesa->radeonScreen->chip_family >= CHIP_FAMILY_RV515)
+			return MESA_FORMAT_S8_Z24;
+		else
+			return MESA_FORMAT_Z16;
+#else
 	case GL_DEPTH_COMPONENT:
 	case GL_DEPTH_COMPONENT16:
 	case GL_DEPTH_COMPONENT24:
@@ -480,6 +492,7 @@ gl_format radeonChooseTextureFormat(GLcontext * ctx,
 	case GL_DEPTH_STENCIL_EXT:
 	case GL_DEPTH24_STENCIL8_EXT:
 		return MESA_FORMAT_S8_Z24;
+#endif
 
 	/* EXT_texture_sRGB */
 	case GL_SRGB:
@@ -510,6 +523,173 @@ gl_format radeonChooseTextureFormat(GLcontext * ctx,
 	return MESA_FORMAT_NONE;		/* never get here */
 }
 
+/** Check if given image is valid within current texture object.
+ */
+static int image_matches_texture_obj(struct gl_texture_object *texObj,
+	struct gl_texture_image *texImage,
+	unsigned level)
+{
+	const struct gl_texture_image *baseImage = texObj->Image[0][texObj->BaseLevel];
+
+	if (!baseImage)
+		return 0;
+
+	if (level < texObj->BaseLevel || level > texObj->MaxLevel)
+		return 0;
+
+	const unsigned levelDiff = level - texObj->BaseLevel;
+	const unsigned refWidth = MAX2(baseImage->Width >> levelDiff, 1);
+	const unsigned refHeight = MAX2(baseImage->Height >> levelDiff, 1);
+	const unsigned refDepth = MAX2(baseImage->Depth >> levelDiff, 1);
+
+	return (texImage->Width == refWidth &&
+			texImage->Height == refHeight &&
+			texImage->Depth == refDepth);
+}
+
+static void teximage_assign_miptree(radeonContextPtr rmesa,
+	struct gl_texture_object *texObj,
+	struct gl_texture_image *texImage,
+	unsigned face,
+	unsigned level)
+{
+	radeonTexObj *t = radeon_tex_obj(texObj);
+	radeon_texture_image* image = get_radeon_texture_image(texImage);
+
+	/* Since miptree holds only images for levels <BaseLevel..MaxLevel>
+	 * don't allocate the miptree if the teximage won't fit.
+	 */
+	if (!image_matches_texture_obj(texObj, texImage, level))
+		return;
+
+	/* Try using current miptree, or create new if there isn't any */
+	if (!t->mt || !radeon_miptree_matches_image(t->mt, texImage, face, level)) {
+		radeon_miptree_unreference(&t->mt);
+		radeon_try_alloc_miptree(rmesa, t);
+		if (RADEON_DEBUG & RADEON_TEXTURE) {
+			fprintf(stderr, "%s: texObj %p, texImage %p, face %d, level %d, "
+				"texObj miptree doesn't match, allocated new miptree %p\n",
+				__FUNCTION__, texObj, texImage, face, level, t->mt);
+		}
+	}
+
+	/* Miptree allocation may have failed,
+	 * when there was no image for baselevel specified */
+	if (t->mt) {
+		image->mtface = face;
+		image->mtlevel = level;
+		radeon_miptree_reference(t->mt, &image->mt);
+	}
+}
+
+static GLuint * allocate_image_offsets(GLcontext *ctx,
+	unsigned alignedWidth,
+	unsigned height,
+	unsigned depth)
+{
+	int i;
+	GLuint *offsets;
+
+	offsets = _mesa_malloc(depth * sizeof(GLuint)) ;
+	if (!offsets) {
+		_mesa_error(ctx, GL_OUT_OF_MEMORY, "glTex[Sub]Image");
+		return NULL;
+	}
+
+	for (i = 0; i < depth; ++i) {
+		offsets[i] = alignedWidth * height * i;
+	}
+
+	return offsets;
+}
+
+/**
+ * Update a subregion of the given texture image.
+ */
+static void radeon_store_teximage(GLcontext* ctx, int dims,
+		GLint xoffset, GLint yoffset, GLint zoffset,
+		GLsizei width, GLsizei height, GLsizei depth,
+		GLsizei imageSize,
+		GLenum format, GLenum type,
+		const GLvoid * pixels,
+		const struct gl_pixelstore_attrib *packing,
+		struct gl_texture_object *texObj,
+		struct gl_texture_image *texImage,
+		int compressed)
+{
+	radeonTexObj *t = radeon_tex_obj(texObj);
+	radeon_texture_image* image = get_radeon_texture_image(texImage);
+
+	GLuint dstRowStride;
+	GLuint *dstImageOffsets;
+
+	if (image->mt) {
+		dstRowStride = image->mt->levels[image->mtlevel].rowstride;
+	} else if (t->bo) {
+		/* TFP case */
+		/* TODO */
+		assert(0);
+	} else {
+		dstRowStride = _mesa_format_row_stride(texImage->TexFormat, texImage->Width);
+	}
+
+	assert(dstRowStride);
+
+	if (dims == 3) {
+		unsigned alignedWidth = dstRowStride/_mesa_get_format_bytes(texImage->TexFormat);
+		dstImageOffsets = allocate_image_offsets(ctx, alignedWidth, texImage->Height, texImage->Depth);
+		if (!dstImageOffsets) {
+			return;
+		}
+	} else {
+		dstImageOffsets = texImage->ImageOffsets;
+	}
+
+	radeon_teximage_map(image, GL_TRUE);
+
+	if (compressed) {
+		uint32_t srcRowStride, bytesPerRow, rows, block_width, block_height;
+		GLubyte *img_start;
+
+		_mesa_get_format_block_size(texImage->TexFormat, &block_width, &block_height);
+
+		if (!image->mt) {
+			dstRowStride = _mesa_format_row_stride(texImage->TexFormat, texImage->Width);
+			img_start = _mesa_compressed_image_address(xoffset, yoffset, 0,
+									texImage->TexFormat,
+									texImage->Width, texImage->Data);
+		}
+		else {
+			uint32_t offset;
+			offset = dstRowStride / _mesa_get_format_bytes(texImage->TexFormat) * yoffset / block_height + xoffset / block_width;
+			offset *= _mesa_get_format_bytes(texImage->TexFormat);
+			img_start = texImage->Data + offset;
+		}
+		srcRowStride = _mesa_format_row_stride(texImage->TexFormat, width);
+		bytesPerRow = srcRowStride;
+		rows = (height + block_height - 1) / block_height;
+
+		copy_rows(img_start, dstRowStride, pixels, srcRowStride, rows, bytesPerRow);
+	}
+	else {
+		if (!_mesa_texstore(ctx, dims, texImage->_BaseFormat,
+					texImage->TexFormat, texImage->Data,
+					xoffset, yoffset, zoffset,
+					dstRowStride,
+					dstImageOffsets,
+					width, height, depth,
+					format, type, pixels, packing)) {
+			_mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexSubImage");
+		}
+	}
+
+	if (dims == 3) {
+		_mesa_free(dstImageOffsets);
+	}
+
+	radeon_teximage_unmap(image);
+}
+
 /**
  * All glTexImage calls go through this function.
  */
@@ -528,13 +708,22 @@ static void radeon_teximage(
 	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
 	radeonTexObj* t = radeon_tex_obj(texObj);
 	radeon_texture_image* image = get_radeon_texture_image(texImage);
-	GLuint dstRowStride;
 	GLint postConvWidth = width;
 	GLint postConvHeight = height;
-	GLuint texelBytes;
 	GLuint face = radeon_face_for_target(target);
 
-	radeon_firevertices(rmesa);
+	{
+		struct radeon_bo *bo;
+		bo = !image->mt ? image->bo : image->mt->bo;
+		if (bo && radeon_bo_is_referenced_by_cs(bo, rmesa->cmdbuf.cs)) {
+			radeon_firevertices(rmesa);
+		}
+	}
+
+	if (RADEON_DEBUG & RADEON_TEXTURE) {
+		fprintf(stderr, "radeon_teximage%dd: texObj %p, texImage %p, face %d, level %d\n",
+				dims, texObj, texImage, face, level);
+	}
 
 	t->validated = GL_FALSE;
 
@@ -543,53 +732,35 @@ static void radeon_teximage(
 						  &postConvHeight);
 	}
 
-	if (_mesa_is_format_compressed(texImage->TexFormat)) {
-		texelBytes = 0;
-	} else {
-		texelBytes = _mesa_get_format_bytes(texImage->TexFormat);
+	if (!_mesa_is_format_compressed(texImage->TexFormat)) {
+		GLuint texelBytes = _mesa_get_format_bytes(texImage->TexFormat);
 		/* Minimum pitch of 32 bytes */
 		if (postConvWidth * texelBytes < 32) {
-		  postConvWidth = 32 / texelBytes;
-		  texImage->RowStride = postConvWidth;
+			postConvWidth = 32 / texelBytes;
+			texImage->RowStride = postConvWidth;
 		}
-		if (!image->mt) {      
+		if (!image->mt) {
 			assert(texImage->RowStride == postConvWidth);
 		}
 	}
 
-	/* Allocate memory for image */
-	radeonFreeTexImageData(ctx, texImage); /* Mesa core only clears texImage->Data but not image->mt */
-
-	if (t->mt &&
-	    t->mt->firstLevel == level &&
-	    t->mt->lastLevel == level &&
-	    t->mt->target != GL_TEXTURE_CUBE_MAP_ARB &&
-	    !radeon_miptree_matches_image(t->mt, texImage, face, level)) {
-	  radeon_miptree_unreference(t->mt);
-	  t->mt = NULL;
-	}
-
-	if (!t->mt)
-		radeon_try_alloc_miptree(rmesa, t, image, face, level);
-	if (t->mt && radeon_miptree_matches_image(t->mt, texImage, face, level)) {
-		radeon_mipmap_level *lvl;
-		image->mt = t->mt;
-		image->mtlevel = level - t->mt->firstLevel;
-		image->mtface = face;
-		radeon_miptree_reference(t->mt);
-		lvl = &image->mt->levels[image->mtlevel];
-		dstRowStride = lvl->rowstride;
-	} else {
-		int size;
-		if (_mesa_is_format_compressed(texImage->TexFormat)) {
-			size = _mesa_format_image_size(texImage->TexFormat,
-						       texImage->Width,
-						       texImage->Height,
-						       texImage->Depth);
-		} else {
-			size = texImage->Width * texImage->Height * texImage->Depth * _mesa_get_format_bytes(texImage->TexFormat);
+	/* Mesa core only clears texImage->Data but not image->mt */
+	radeonFreeTexImageData(ctx, texImage);
+
+	if (!t->bo) {
+		teximage_assign_miptree(rmesa, texObj, texImage, face, level);
+		if (!image->mt) {
+			int size = _mesa_format_image_size(texImage->TexFormat,
+								texImage->Width,
+								texImage->Height,
+								texImage->Depth);
+			texImage->Data = _mesa_alloc_texmemory(size);
+			if (RADEON_DEBUG & RADEON_TEXTURE) {
+				fprintf(stderr, "radeon_teximage%dd: texObj %p, texImage %p, "
+					" no miptree assigned, using local memory %p\n",
+					dims, texObj, texImage, texImage->Data);
+			}
 		}
-		texImage->Data = _mesa_alloc_texmemory(size);
 	}
 
 	/* Upload texture image; note that the spec allows pixels to be NULL */
@@ -603,65 +774,16 @@ static void radeon_teximage(
 	}
 
 	if (pixels) {
-		radeon_teximage_map(image, GL_TRUE);
-		if (compressed) {
-			if (image->mt) {
-				uint32_t srcRowStride, bytesPerRow, rows;
-				srcRowStride = _mesa_format_row_stride(texImage->TexFormat, width);
-				bytesPerRow = srcRowStride;
-				rows = (height + 3) / 4;
-				copy_rows(texImage->Data, image->mt->levels[level].rowstride,
-					  pixels, srcRowStride, rows, bytesPerRow);
-			} else {
-				memcpy(texImage->Data, pixels, imageSize);
-			}
-		} else {
-			GLuint dstRowStride;
-			GLuint *dstImageOffsets;
-
-			if (image->mt) {
-				radeon_mipmap_level *lvl = &image->mt->levels[image->mtlevel];
-				dstRowStride = lvl->rowstride;
-			} else {
-				dstRowStride = texImage->Width * _mesa_get_format_bytes(texImage->TexFormat);
-			}
-
-			if (dims == 3) {
-				int i;
-
-				dstImageOffsets = _mesa_malloc(depth * sizeof(GLuint)) ;
-				if (!dstImageOffsets)
-					_mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexImage");
-
-				for (i = 0; i < depth; ++i) {
-					dstImageOffsets[i] = dstRowStride/_mesa_get_format_bytes(texImage->TexFormat) * height * i;
-				}
-			} else {
-				dstImageOffsets = texImage->ImageOffsets;
-			}
-
-			if (!_mesa_texstore(ctx, dims,
-					    texImage->_BaseFormat,
-					    texImage->TexFormat,
-					    texImage->Data, 0, 0, 0, /* dstX/Y/Zoffset */
-					    dstRowStride,
-					    dstImageOffsets,
-					    width, height, depth,
-					    format, type, pixels, packing)) {
-				_mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexImage");
-			}
-
-			if (dims == 3)
-				_mesa_free(dstImageOffsets);
-		}
+		radeon_store_teximage(ctx, dims,
+			0, 0, 0,
+			width, height, depth,
+			imageSize, format, type,
+			pixels, packing,
+			texObj, texImage,
+			compressed);
 	}
 
 	_mesa_unmap_teximage_pbo(ctx, packing);
-
-	if (pixels)
-	  radeon_teximage_unmap(image);
-
-
 }
 
 void radeonTexImage1D(GLcontext * ctx, GLenum target, GLint level,
@@ -714,7 +836,7 @@ void radeonTexImage3D(GLcontext * ctx, GLenum target, GLint level,
 }
 
 /**
- * Update a subregion of the given texture image.
+ * All glTexSubImage calls go through this function.
  */
 static void radeon_texsubimage(GLcontext* ctx, int dims, GLenum target, int level,
 		GLint xoffset, GLint yoffset, GLint zoffset,
@@ -731,66 +853,39 @@ static void radeon_texsubimage(GLcontext* ctx, int dims, GLenum target, int leve
 	radeonTexObj* t = radeon_tex_obj(texObj);
 	radeon_texture_image* image = get_radeon_texture_image(texImage);
 
-	radeon_firevertices(rmesa);
+	{
+		struct radeon_bo *bo;
+		bo = !image->mt ? image->bo : image->mt->bo;
+		if (bo && radeon_bo_is_referenced_by_cs(bo, rmesa->cmdbuf.cs)) {
+			radeon_firevertices(rmesa);
+		}
+	}
+
+	if (RADEON_DEBUG & RADEON_TEXTURE) {
+		fprintf(stderr, "radeon_texsubimage%dd: texObj %p, texImage %p, face %d, level %d\n",
+				dims, texObj, texImage, radeon_face_for_target(target), level);
+	}
 
 	t->validated = GL_FALSE;
 	if (compressed) {
 		pixels = _mesa_validate_pbo_compressed_teximage(
-			ctx, imageSize, pixels, packing, "glCompressedTexImage");
+			ctx, imageSize, pixels, packing, "glCompressedTexSubImage");
 	} else {
 		pixels = _mesa_validate_pbo_teximage(ctx, dims,
-			width, height, depth, format, type, pixels, packing, "glTexSubImage1D");
+			width, height, depth, format, type, pixels, packing, "glTexSubImage");
 	}
 
 	if (pixels) {
-		GLint dstRowStride;
-		radeon_teximage_map(image, GL_TRUE);
-
-		if (image->mt) {
-			radeon_mipmap_level *lvl = &image->mt->levels[image->mtlevel];
-			dstRowStride = lvl->rowstride;
-		} else {
-			dstRowStride = texImage->RowStride * _mesa_get_format_bytes(texImage->TexFormat);
-		}
-
-		if (compressed) {
-			uint32_t srcRowStride, bytesPerRow, rows;
-			GLubyte *img_start;
-			if (!image->mt) {
-				dstRowStride = _mesa_format_row_stride(texImage->TexFormat, texImage->Width);
-				img_start = _mesa_compressed_image_address(xoffset, yoffset, 0,
-									   texImage->TexFormat,
-									   texImage->Width, texImage->Data);
-			}
-			else {
-				uint32_t blocks_x = dstRowStride / (image->mt->bpp * 4);
-				img_start = texImage->Data + image->mt->bpp * 4 * (blocks_x * (yoffset / 4) + xoffset / 4);
-			}
-			srcRowStride = _mesa_format_row_stride(texImage->TexFormat, width);
-			bytesPerRow = srcRowStride;
-			rows = (height + 3) / 4;
-
-			copy_rows(img_start, dstRowStride,  pixels, srcRowStride, rows,  bytesPerRow);
-			
-		}
-		else {
-			if (!_mesa_texstore(ctx, dims, texImage->_BaseFormat,
-					    texImage->TexFormat, texImage->Data,
-					    xoffset, yoffset, zoffset,
-					    dstRowStride,
-					    texImage->ImageOffsets,
-					    width, height, depth,
-					    format, type, pixels, packing)) {
-				_mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexSubImage");
-			}
-		}
+		radeon_store_teximage(ctx, dims,
+			xoffset, yoffset, zoffset,
+			width, height, depth,
+			imageSize, format, type,
+			pixels, packing,
+			texObj, texImage,
+			compressed);
 	}
 
-	radeon_teximage_unmap(image);
-
 	_mesa_unmap_teximage_pbo(ctx, packing);
-
-
 }
 
 void radeonTexSubImage1D(GLcontext * ctx, GLenum target, GLint level,
@@ -846,143 +941,6 @@ void radeonTexSubImage3D(GLcontext * ctx, GLenum target, GLint level,
 		format, type, pixels, packing, texObj, texImage, 0);
 }
 
-
-
-/**
- * Ensure that the given image is stored in the given miptree from now on.
- */
-static void migrate_image_to_miptree(radeon_mipmap_tree *mt, radeon_texture_image *image, int face, int level)
-{
-	radeon_mipmap_level *dstlvl = &mt->levels[level - mt->firstLevel];
-	unsigned char *dest;
-
-	assert(image->mt != mt);
-	assert(dstlvl->width == image->base.Width);
-	assert(dstlvl->height == image->base.Height);
-	assert(dstlvl->depth == image->base.Depth);
-
-
-	radeon_bo_map(mt->bo, GL_TRUE);
-	dest = mt->bo->ptr + dstlvl->faces[face].offset;
-
-	if (image->mt) {
-		/* Format etc. should match, so we really just need a memcpy().
-		 * In fact, that memcpy() could be done by the hardware in many
-		 * cases, provided that we have a proper memory manager.
-		 */
-		radeon_mipmap_level *srclvl = &image->mt->levels[image->mtlevel-image->mt->firstLevel];
-
-		assert(srclvl->size == dstlvl->size);
-		assert(srclvl->rowstride == dstlvl->rowstride);
-
-		radeon_bo_map(image->mt->bo, GL_FALSE);
-
-		memcpy(dest,
-			image->mt->bo->ptr + srclvl->faces[face].offset,
-			dstlvl->size);
-		radeon_bo_unmap(image->mt->bo);
-
-		radeon_miptree_unreference(image->mt);
-	} else {
-		uint32_t srcrowstride;
-		uint32_t height;
-		/* need to confirm this value is correct */
-		if (mt->compressed) {
-			height = (image->base.Height + 3) / 4;
-			srcrowstride = _mesa_format_row_stride(image->base.TexFormat, image->base.Width);
-		} else {
-			height = image->base.Height * image->base.Depth;
-			srcrowstride = image->base.Width * _mesa_get_format_bytes(image->base.TexFormat);
-		}
-
-//		if (mt->tilebits)
-//			WARN_ONCE("%s: tiling not supported yet", __FUNCTION__);
-
-		copy_rows(dest, dstlvl->rowstride, image->base.Data, srcrowstride,
-			  height, srcrowstride);
-
-		_mesa_free_texmemory(image->base.Data);
-		image->base.Data = 0;
-	}
-
-	radeon_bo_unmap(mt->bo);
-
-	image->mt = mt;
-	image->mtface = face;
-	image->mtlevel = level;
-	radeon_miptree_reference(image->mt);
-}
-
-int radeon_validate_texture_miptree(GLcontext * ctx, struct gl_texture_object *texObj)
-{
-	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-	radeonTexObj *t = radeon_tex_obj(texObj);
-	radeon_texture_image *baseimage = get_radeon_texture_image(texObj->Image[0][texObj->BaseLevel]);
-	int face, level;
-
-	if (t->validated || t->image_override)
-		return GL_TRUE;
-
-	if (RADEON_DEBUG & RADEON_TEXTURE)
-		fprintf(stderr, "%s: Validating texture %p now\n", __FUNCTION__, texObj);
-
-	if (baseimage->base.Border > 0)
-		return GL_FALSE;
-
-	/* Ensure a matching miptree exists.
-	 *
-	 * Differing mipmap trees can result when the app uses TexImage to
-	 * change texture dimensions.
-	 *
-	 * Prefer to use base image's miptree if it
-	 * exists, since that most likely contains more valid data (remember
-	 * that the base level is usually significantly larger than the rest
-	 * of the miptree, so cubemaps are the only possible exception).
-	 */
-	if (baseimage->mt &&
-	    baseimage->mt != t->mt &&
-	    radeon_miptree_matches_texture(baseimage->mt, &t->base)) {
-		radeon_miptree_unreference(t->mt);
-		t->mt = baseimage->mt;
-		radeon_miptree_reference(t->mt);
-	} else if (t->mt && !radeon_miptree_matches_texture(t->mt, &t->base)) {
-		radeon_miptree_unreference(t->mt);
-		t->mt = 0;
-	}
-
-	if (!t->mt) {
-		if (RADEON_DEBUG & RADEON_TEXTURE)
-			fprintf(stderr, " Allocate new miptree\n");
-		radeon_try_alloc_miptree(rmesa, t, baseimage, 0, texObj->BaseLevel);
-		if (!t->mt) {
-			_mesa_problem(ctx, "radeon_validate_texture failed to alloc miptree");
-			return GL_FALSE;
-		}
-	}
-
-	/* Ensure all images are stored in the single main miptree */
-	for(face = 0; face < t->mt->faces; ++face) {
-		for(level = t->mt->firstLevel; level <= t->mt->lastLevel; ++level) {
-			radeon_texture_image *image = get_radeon_texture_image(texObj->Image[face][level]);
-			if (RADEON_DEBUG & RADEON_TEXTURE)
-				fprintf(stderr, " face %i, level %i... %p vs %p ", face, level, t->mt, image->mt);
-			if (t->mt == image->mt || (!image->mt && !image->base.Data)) {
-				if (RADEON_DEBUG & RADEON_TEXTURE)
-					fprintf(stderr, "OK\n");
-
-				continue;
-			}
-
-			if (RADEON_DEBUG & RADEON_TEXTURE)
-				fprintf(stderr, "migrating\n");
-			migrate_image_to_miptree(t->mt, image, face, level);
-		}
-	}
-
-	return GL_TRUE;
-}
-
-
 /**
  * Need to map texture image into memory before copying image data,
  * then unmap it.
diff --git a/src/mesa/drivers/dri/radeon/radeon_texture.h b/src/mesa/drivers/dri/radeon/radeon_texture.h
index 8995546d77..906daf12d0 100644
--- a/src/mesa/drivers/dri/radeon/radeon_texture.h
+++ b/src/mesa/drivers/dri/radeon/radeon_texture.h
@@ -33,7 +33,8 @@
 
 #include "main/formats.h"
 
-
+void copy_rows(void* dst, GLuint dststride, const void* src, GLuint srcstride,
+	GLuint numrows, GLuint rowsize);
 struct gl_texture_image *radeonNewTextureImage(GLcontext *ctx);
 void radeonFreeTexImageData(GLcontext *ctx, struct gl_texture_image *timage);
 
diff --git a/src/mesa/drivers/dri/savage/savage_init.h b/src/mesa/drivers/dri/savage/savage_init.h
index abb8440fc4..bfd3077d70 100644
--- a/src/mesa/drivers/dri/savage/savage_init.h
+++ b/src/mesa/drivers/dri/savage/savage_init.h
@@ -66,7 +66,7 @@ typedef struct {
    unsigned int logTextureGranularity[SAVAGE_NR_TEX_HEAPS];
    drmAddress texVirtual[SAVAGE_NR_TEX_HEAPS];
   
-   __DRIscreenPrivate *driScrnPriv;
+   __DRIscreen *driScrnPriv;
 
    savageRegion aperture;
    savageRegion agpTextures;
diff --git a/src/mesa/drivers/dri/savage/savage_xmesa.c b/src/mesa/drivers/dri/savage/savage_xmesa.c
index d307b81e8e..8e879ca41c 100644
--- a/src/mesa/drivers/dri/savage/savage_xmesa.c
+++ b/src/mesa/drivers/dri/savage/savage_xmesa.c
@@ -168,7 +168,7 @@ PUBLIC const __DRIextension *savageScreenExtensions[] = {
 };
 
 static GLboolean
-savageInitDriver(__DRIscreenPrivate *sPriv)
+savageInitDriver(__DRIscreen *sPriv)
 {
   savageScreenPrivate *savageScreen;
   SAVAGEDRIPtr         gDRIPriv = (SAVAGEDRIPtr)sPriv->pDevPriv;
@@ -272,7 +272,7 @@ savageInitDriver(__DRIscreenPrivate *sPriv)
 /* Accessed by dlsym from dri_mesa_init.c
  */
 static void
-savageDestroyScreen(__DRIscreenPrivate *sPriv)
+savageDestroyScreen(__DRIscreen *sPriv)
 {
    savageScreenPrivate *savageScreen = (savageScreenPrivate *)sPriv->private;
 
@@ -288,12 +288,12 @@ savageDestroyScreen(__DRIscreenPrivate *sPriv)
 
 static GLboolean
 savageCreateContext( const __GLcontextModes *mesaVis,
-		     __DRIcontextPrivate *driContextPriv,
+		     __DRIcontext *driContextPriv,
 		     void *sharedContextPrivate )
 {
    GLcontext *ctx, *shareCtx;
    savageContextPtr imesa;
-   __DRIscreenPrivate *sPriv = driContextPriv->driScreenPriv;
+   __DRIscreen *sPriv = driContextPriv->driScreenPriv;
    struct dd_function_table functions;
    savageScreenPrivate *savageScreen = (savageScreenPrivate *)sPriv->private;
    drm_savage_sarea_t *saPriv=(drm_savage_sarea_t *)(((char*)sPriv->pSAREA)+
@@ -546,7 +546,7 @@ savageCreateContext( const __GLcontextModes *mesaVis,
 }
 
 static void
-savageDestroyContext(__DRIcontextPrivate *driContextPriv)
+savageDestroyContext(__DRIcontext *driContextPriv)
 {
    savageContextPtr imesa = (savageContextPtr) driContextPriv->driverPrivate;
    GLuint i;
@@ -580,8 +580,8 @@ savageDestroyContext(__DRIcontextPrivate *driContextPriv)
 
 
 static GLboolean
-savageCreateBuffer( __DRIscreenPrivate *driScrnPriv,
-		    __DRIdrawablePrivate *driDrawPriv,
+savageCreateBuffer( __DRIscreen *driScrnPriv,
+		    __DRIdrawable *driDrawPriv,
 		    const __GLcontextModes *mesaVis,
 		    GLboolean isPixmap)
 {
@@ -675,13 +675,13 @@ savageCreateBuffer( __DRIscreenPrivate *driScrnPriv,
 }
 
 static void
-savageDestroyBuffer(__DRIdrawablePrivate *driDrawPriv)
+savageDestroyBuffer(__DRIdrawable *driDrawPriv)
 {
    _mesa_reference_framebuffer((GLframebuffer **)(&(driDrawPriv->driverPrivate)), NULL);
 }
 
 #if 0
-void XMesaSwapBuffers(__DRIdrawablePrivate *driDrawPriv)
+void XMesaSwapBuffers(__DRIdrawable *driDrawPriv)
 {
    /* XXX should do swap according to the buffer, not the context! */
    savageContextPtr imesa = savageCtx; 
@@ -694,7 +694,7 @@ void XMesaSwapBuffers(__DRIdrawablePrivate *driDrawPriv)
 
 void savageXMesaSetClipRects(savageContextPtr imesa)
 {
-   __DRIdrawablePrivate *dPriv = imesa->driDrawable;
+   __DRIdrawable *dPriv = imesa->driDrawable;
 
    if ((dPriv->numBackClipRects == 0)
        || (imesa->glCtx->DrawBuffer->_ColorDrawBufferIndexes[0] == BUFFER_FRONT_LEFT)) {
@@ -715,8 +715,8 @@ void savageXMesaSetClipRects(savageContextPtr imesa)
 
 static void savageXMesaWindowMoved( savageContextPtr imesa ) 
 {
-   __DRIdrawablePrivate *const drawable = imesa->driDrawable;
-   __DRIdrawablePrivate *const readable = imesa->driReadable;
+   __DRIdrawable *const drawable = imesa->driDrawable;
+   __DRIdrawable *const readable = imesa->driReadable;
 
    if (0)
       fprintf(stderr, "savageXMesaWindowMoved\n\n");
@@ -731,7 +731,7 @@ static void savageXMesaWindowMoved( savageContextPtr imesa )
 
 
 static GLboolean
-savageUnbindContext(__DRIcontextPrivate *driContextPriv)
+savageUnbindContext(__DRIcontext *driContextPriv)
 {
    savageContextPtr savage = (savageContextPtr) driContextPriv->driverPrivate;
    if (savage)
@@ -742,7 +742,7 @@ savageUnbindContext(__DRIcontextPrivate *driContextPriv)
 
 #if 0
 static GLboolean
-savageOpenFullScreen(__DRIcontextPrivate *driContextPriv)
+savageOpenFullScreen(__DRIcontext *driContextPriv)
 {
     
   
@@ -761,7 +761,7 @@ savageOpenFullScreen(__DRIcontextPrivate *driContextPriv)
 }
 
 static GLboolean
-savageCloseFullScreen(__DRIcontextPrivate *driContextPriv)
+savageCloseFullScreen(__DRIcontext *driContextPriv)
 {
     
     if (driContextPriv) {
@@ -777,9 +777,9 @@ savageCloseFullScreen(__DRIcontextPrivate *driContextPriv)
 #endif
 
 static GLboolean
-savageMakeCurrent(__DRIcontextPrivate *driContextPriv,
-		  __DRIdrawablePrivate *driDrawPriv,
-		  __DRIdrawablePrivate *driReadPriv)
+savageMakeCurrent(__DRIcontext *driContextPriv,
+		  __DRIdrawable *driDrawPriv,
+		  __DRIdrawable *driReadPriv)
 {
    if (driContextPriv) {
       savageContextPtr imesa
@@ -816,9 +816,9 @@ savageMakeCurrent(__DRIcontextPrivate *driContextPriv,
 
 void savageGetLock( savageContextPtr imesa, GLuint flags ) 
 {
-   __DRIdrawablePrivate *const drawable = imesa->driDrawable;
-   __DRIdrawablePrivate *const readable = imesa->driReadable;
-   __DRIscreenPrivate *sPriv = imesa->driScreen;
+   __DRIdrawable *const drawable = imesa->driDrawable;
+   __DRIdrawable *const readable = imesa->driReadable;
+   __DRIscreen *sPriv = imesa->driScreen;
    drm_savage_sarea_t *sarea = imesa->sarea;
    int me = imesa->hHWContext;
    int stamp = drawable->lastStamp; 
@@ -883,7 +883,7 @@ void savageGetLock( savageContextPtr imesa, GLuint flags )
 }
 
 static const  __DRIconfig **
-savageFillInModes( __DRIscreenPrivate *psp,
+savageFillInModes( __DRIscreen *psp,
 		   unsigned pixel_bits, unsigned depth_bits,
 		   unsigned stencil_bits, GLboolean have_back_buffer )
 {
@@ -967,7 +967,7 @@ savageFillInModes( __DRIscreenPrivate *psp,
  * \return the __GLcontextModes supported by this driver
  */
 static const __DRIconfig **
-savageInitScreen(__DRIscreenPrivate *psp)
+savageInitScreen(__DRIscreen *psp)
 {
    static const __DRIversion ddx_expected = { 2, 0, 0 };
    static const __DRIversion dri_expected = { 4, 0, 0 };
@@ -1001,3 +1001,10 @@ const struct __DriverAPIRec driDriverAPI = {
    savageMakeCurrent,
    savageUnbindContext
 };
+
+/* This is the table of extensions that the loader will dlsym() for. */
+PUBLIC const __DRIextension *__driDriverExtensions[] = {
+    &driCoreExtension.base,
+    &driLegacyExtension.base,
+    NULL
+};
diff --git a/src/mesa/drivers/dri/savage/savagecontext.h b/src/mesa/drivers/dri/savage/savagecontext.h
index 53a37db1cb..ba1e6e1e1a 100644
--- a/src/mesa/drivers/dri/savage/savagecontext.h
+++ b/src/mesa/drivers/dri/savage/savagecontext.h
@@ -271,10 +271,10 @@ struct savage_context_t {
     drm_hw_lock_t *driHwLock;
     GLuint driFd;
 
-    __DRIdrawablePrivate *driDrawable;
-    __DRIdrawablePrivate *driReadable;
+    __DRIdrawable *driDrawable;
+    __DRIdrawable *driReadable;
 
-    __DRIscreenPrivate *driScreen;
+    __DRIscreen *driScreen;
     savageScreenPrivate *savageScreen; 
     drm_savage_sarea_t *sarea;
 
diff --git a/src/mesa/drivers/dri/savage/savageioctl.c b/src/mesa/drivers/dri/savage/savageioctl.c
index 948ed18419..d0b64e801a 100644
--- a/src/mesa/drivers/dri/savage/savageioctl.c
+++ b/src/mesa/drivers/dri/savage/savageioctl.c
@@ -337,6 +337,8 @@ static void savageDDClear( GLcontext *ctx, GLbitfield mask )
    GLint ch = ctx->DrawBuffer->_Ymax - cy;
 
    /* XXX FIX ME: the cx,cy,cw,ch vars are currently ignored! */
+   (void) ch;
+   (void) cw;
 
    if (SAVAGE_DEBUG & DEBUG_VERBOSE_MSG)
        fprintf (stderr, "%s\n", __FUNCTION__);
@@ -358,15 +360,15 @@ static void savageDDClear( GLcontext *ctx, GLbitfield mask )
    depthMask = 0;
    switch (imesa->savageScreen->cpp) {
    case 2:
-       colorMask = PACK_COLOR_565(ctx->Color.ColorMask[0],
-				  ctx->Color.ColorMask[1],
-				  ctx->Color.ColorMask[2]);
+       colorMask = PACK_COLOR_565(ctx->Color.ColorMask[0][0],
+				  ctx->Color.ColorMask[0][1],
+				  ctx->Color.ColorMask[0][2]);
        break;
    case 4:
-       colorMask = PACK_COLOR_8888(ctx->Color.ColorMask[3],
-				   ctx->Color.ColorMask[2],
-				   ctx->Color.ColorMask[1],
-				   ctx->Color.ColorMask[0]);
+       colorMask = PACK_COLOR_8888(ctx->Color.ColorMask[0][3],
+				   ctx->Color.ColorMask[0][2],
+				   ctx->Color.ColorMask[0][1],
+				   ctx->Color.ColorMask[0][0]);
        break;
    }
 
@@ -431,7 +433,7 @@ static void savageDDClear( GLcontext *ctx, GLbitfield mask )
 /*
  * Copy the back buffer to the front buffer. 
  */
-void savageSwapBuffers( __DRIdrawablePrivate *dPriv )
+void savageSwapBuffers( __DRIdrawable *dPriv )
 {
    savageContextPtr imesa;
 
@@ -535,7 +537,7 @@ void savageFlushVertices( savageContextPtr imesa )
 
 void savageFlushCmdBufLocked( savageContextPtr imesa, GLboolean discard )
 {
-    __DRIdrawablePrivate *dPriv = imesa->driDrawable;
+    __DRIdrawable *dPriv = imesa->driDrawable;
 
     if (!imesa->dmaVtxBuf.total)
 	discard = GL_FALSE;
diff --git a/src/mesa/drivers/dri/savage/savageioctl.h b/src/mesa/drivers/dri/savage/savageioctl.h
index 639605cc51..e7e80816c1 100644
--- a/src/mesa/drivers/dri/savage/savageioctl.h
+++ b/src/mesa/drivers/dri/savage/savageioctl.h
@@ -39,7 +39,7 @@ void savageFlushCmdBuf( savageContextPtr imesa, GLboolean discard );
 
 void savageDDInitIoctlFuncs( GLcontext *ctx );
 
-void savageSwapBuffers( __DRIdrawablePrivate *dPriv );
+void savageSwapBuffers( __DRIdrawable *dPriv );
 
 #define WAIT_IDLE_EMPTY(imesa) do { \
     if (SAVAGE_DEBUG & DEBUG_VERBOSE_MSG) \
diff --git a/src/mesa/drivers/dri/savage/savagerender.c b/src/mesa/drivers/dri/savage/savagerender.c
index 32c74f9467..8221edf387 100644
--- a/src/mesa/drivers/dri/savage/savagerender.c
+++ b/src/mesa/drivers/dri/savage/savagerender.c
@@ -252,13 +252,13 @@ static GLboolean run_texnorm_stage( GLcontext *ctx,
          const GLboolean normalizeS = (texObj->WrapS == GL_REPEAT);
          const GLboolean normalizeT = (reallyEnabled & TEXTURE_2D_BIT) &&
             (texObj->WrapT == GL_REPEAT);
-         const GLfloat *in = (GLfloat *)VB->TexCoordPtr[i]->data;
-         const GLint instride = VB->TexCoordPtr[i]->stride;
+         const GLfloat *in = (GLfloat *)VB->AttribPtr[_TNL_ATTRIB_TEX0 + i]->data;
+         const GLint instride = VB->AttribPtr[_TNL_ATTRIB_TEX0 + i]->stride;
          GLfloat (*out)[4] = store->texcoord[i].data;
          GLint j;
 
          if (!ctx->Texture.Unit[i]._ReallyEnabled ||
-             VB->TexCoordPtr[i]->size == 4)
+             VB->AttribPtr[_TNL_ATTRIB_TEX0 + i]->size == 4)
             /* Never try to normalize homogenous tex coords! */
             continue;
 
@@ -297,7 +297,7 @@ static GLboolean run_texnorm_stage( GLcontext *ctx,
          }
 
          if (normalizeS || normalizeT)
-            VB->AttribPtr[VERT_ATTRIB_TEX0+i] = VB->TexCoordPtr[i] = &store->texcoord[i];
+            VB->AttribPtr[_TNL_ATTRIB_TEX0 + i] = &store->texcoord[i];
       }
    }
 
diff --git a/src/mesa/drivers/dri/savage/savagespan.c b/src/mesa/drivers/dri/savage/savagespan.c
index 3bb6fbcc63..792e166d9c 100644
--- a/src/mesa/drivers/dri/savage/savagespan.c
+++ b/src/mesa/drivers/dri/savage/savagespan.c
@@ -34,7 +34,7 @@
 
 #define LOCAL_VARS						\
    driRenderbuffer *drb = (driRenderbuffer *) rb;		\
-   __DRIdrawablePrivate *const dPriv = drb->dPriv;		\
+   __DRIdrawable *const dPriv = drb->dPriv;		\
    GLuint cpp   = drb->cpp;					\
    GLuint pitch = drb->pitch;					\
    GLuint height = dPriv->h;					\
@@ -44,7 +44,7 @@
 
 #define LOCAL_DEPTH_VARS					\
    driRenderbuffer *drb = (driRenderbuffer *) rb;		\
-   __DRIdrawablePrivate *const dPriv = drb->dPriv;		\
+   __DRIdrawable *const dPriv = drb->dPriv;		\
    GLuint zpp   = drb->cpp;					\
    GLuint pitch = drb->pitch;					\
    GLuint height = dPriv->h;					\
diff --git a/src/mesa/drivers/dri/savage/savagetex.c b/src/mesa/drivers/dri/savage/savagetex.c
index 6c97bb6c70..97598f599e 100644
--- a/src/mesa/drivers/dri/savage/savagetex.c
+++ b/src/mesa/drivers/dri/savage/savagetex.c
@@ -507,7 +507,7 @@ savageAllocTexObj( struct gl_texture_object *texObj )
 
       savageSetTexWrapping(t,texObj->WrapS,texObj->WrapT);
       savageSetTexFilter(t,texObj->MinFilter,texObj->MagFilter);
-      savageSetTexBorderColor(t,texObj->BorderColor);
+      savageSetTexBorderColor(t,texObj->BorderColor.f);
    }
 
    return t;
@@ -2044,7 +2044,7 @@ static void savageTexParameter( GLcontext *ctx, GLenum target,
       break;
   
    case GL_TEXTURE_BORDER_COLOR:
-      savageSetTexBorderColor(t,tObj->BorderColor);
+      savageSetTexBorderColor(t,tObj->BorderColor.f);
       break;
 
    default:
diff --git a/src/mesa/drivers/dri/savage/savagetris.c b/src/mesa/drivers/dri/savage/savagetris.c
index c04763b40e..9a92541ef7 100644
--- a/src/mesa/drivers/dri/savage/savagetris.c
+++ b/src/mesa/drivers/dri/savage/savagetris.c
@@ -435,7 +435,8 @@ do {								\
 
 #define LOCAL_VARS(n)						\
    savageContextPtr imesa = SAVAGE_CONTEXT(ctx);		\
-   GLuint color[n], spec[n];					\
+   GLuint color[n] = { 0 };					\
+   GLuint spec[n] = { 0 };					\
    GLuint coloroffset =						\
       ((imesa->skip & SAVAGE_SKIP_W) ? 3 : 4);			\
    GLboolean specoffset =					\
@@ -879,13 +880,13 @@ static GLboolean savageCheckPTexHack( GLcontext *ctx )
 
    RENDERINPUTS_COPY( index_bitset, tnl->render_inputs_bitset );
 
-   if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_TEX0 ) && VB->TexCoordPtr[0]->size == 4) {
+   if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_TEX0 ) && VB->AttribPtr[_TNL_ATTRIB_TEX0]->size == 4) {
       if (!RENDERINPUTS_TEST_RANGE( index_bitset, _TNL_ATTRIB_TEX1, _TNL_LAST_TEX ))
 	 return GL_TRUE; /* apply ptex hack */
       else
 	 FALLBACK(ctx, SAVAGE_FALLBACK_PROJ_TEXTURE, GL_TRUE);
    }
-   if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_TEX1 ) && VB->TexCoordPtr[1]->size == 4)
+   if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_TEX1 ) && VB->AttribPtr[_TNL_ATTRIB_TEX1]->size == 4)
       FALLBACK(ctx, SAVAGE_FALLBACK_PROJ_TEXTURE, GL_TRUE);
 
    return GL_FALSE; /* don't apply ptex hack */
@@ -976,13 +977,13 @@ static INLINE GLuint savageChooseVertexFormat_s3d( GLcontext *ctx )
    if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_TEX0 )) {
       if (imesa->ptexHack)
 	 EMIT_ATTR( _TNL_ATTRIB_TEX0, EMIT_3F_XYW, SAVAGE_EMIT_STQ0, SAVAGE_SKIP_ST0);
-      else if (VB->TexCoordPtr[0]->size == 4)
+      else if (VB->AttribPtr[_TNL_ATTRIB_TEX0]->size == 4)
 	 assert (0); /* should be caught by savageCheckPTexHack */
-      else if (VB->TexCoordPtr[0]->size >= 2)
+      else if (VB->AttribPtr[_TNL_ATTRIB_TEX0]->size >= 2)
 	 /* The chromium menu emits some 3D tex coords even though no
 	  * 3D texture is enabled. Ignore the 3rd coordinate. */
 	 EMIT_ATTR( _TNL_ATTRIB_TEX0, EMIT_2F, SAVAGE_EMIT_ST0, SAVAGE_SKIP_ST0 );
-      else if (VB->TexCoordPtr[0]->size == 1) {
+      else if (VB->AttribPtr[_TNL_ATTRIB_TEX0]->size == 1) {
 	 EMIT_ATTR( _TNL_ATTRIB_TEX0, EMIT_1F, SAVAGE_EMIT_S0, SAVAGE_SKIP_S0 );
 	 EMIT_PAD( 4 );
       } else
@@ -1025,9 +1026,9 @@ static INLINE GLuint savageChooseVertexFormat_s4( GLcontext *ctx )
    if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_TEX0 )) {
       if (imesa->ptexHack)
 	 NEED_ATTR( SAVAGE_EMIT_STQ0, SAVAGE_SKIP_ST0);
-      else if (VB->TexCoordPtr[0]->size == 4)
+      else if (VB->AttribPtr[_TNL_ATTRIB_TEX0]->size == 4)
 	 assert (0); /* should be caught by savageCheckPTexHack */
-      else if (VB->TexCoordPtr[0]->size >= 2)
+      else if (VB->AttribPtr[_TNL_ATTRIB_TEX0]->size >= 2)
 	 /* The chromium menu emits some 3D tex coords even though no
 	  * 3D texture is enabled. Ignore the 3rd coordinate. */
 	 NEED_ATTR( SAVAGE_EMIT_ST0, SAVAGE_SKIP_ST0 );
@@ -1035,10 +1036,10 @@ static INLINE GLuint savageChooseVertexFormat_s4( GLcontext *ctx )
 	 NEED_ATTR( SAVAGE_EMIT_S0, SAVAGE_SKIP_S0 );
    }
    if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_TEX1 )) {
-      if (VB->TexCoordPtr[1]->size == 4)
+      if (VB->AttribPtr[_TNL_ATTRIB_TEX1]->size == 4)
 	 /* projective textures are not supported by the hardware */
 	 assert (0); /* should be caught by savageCheckPTexHack */
-      else if (VB->TexCoordPtr[1]->size >= 2)
+      else if (VB->AttribPtr[_TNL_ATTRIB_TEX1]->size >= 2)
 	 NEED_ATTR( SAVAGE_EMIT_ST1, SAVAGE_SKIP_ST1 );
       else
 	 NEED_ATTR( SAVAGE_EMIT_S1, SAVAGE_SKIP_S1 );
diff --git a/src/mesa/drivers/dri/sis/sis_clear.c b/src/mesa/drivers/dri/sis/sis_clear.c
index 323383da62..d358ef62dc 100644
--- a/src/mesa/drivers/dri/sis/sis_clear.c
+++ b/src/mesa/drivers/dri/sis/sis_clear.c
@@ -393,7 +393,6 @@ sis_clear_z_stencil_buffer( GLcontext * ctx, GLbitfield mask,
 			    GLint x, GLint y, GLint width, GLint height )
 {
    sisContextPtr smesa = SIS_CONTEXT(ctx);
-   int cmd;
 
    mWait3DCmdQueue (8);
    MMIO(REG_SRC_PITCH, (smesa->zFormat == SiS_ZFORMAT_Z16) ?
diff --git a/src/mesa/drivers/dri/sis/sis_context.c b/src/mesa/drivers/dri/sis/sis_context.c
index f501e7ad2e..0944f4d8b4 100644
--- a/src/mesa/drivers/dri/sis/sis_context.c
+++ b/src/mesa/drivers/dri/sis/sis_context.c
@@ -83,6 +83,7 @@ static struct dri_extension card_extensions[] =
     { NULL,                                NULL }
 };
 
+#if 0
 static struct dri_extension card_extensions_6326[] =
 {
     /*{ "GL_ARB_texture_border_clamp",       NULL },*/
@@ -90,6 +91,7 @@ static struct dri_extension card_extensions_6326[] =
     /*{ "GL_MESA_ycbcr_texture",             NULL },*/
     { NULL,                                NULL }
 };
+#endif
 
 static const struct dri_debug_control debug_control[] =
 {
@@ -160,11 +162,11 @@ void sisReAllocateBuffers(GLcontext *ctx, GLframebuffer *drawbuffer,
 
 GLboolean
 sisCreateContext( const __GLcontextModes *glVisual,
-		  __DRIcontextPrivate *driContextPriv,
+		  __DRIcontext *driContextPriv,
                   void *sharedContextPrivate )
 {
    GLcontext *ctx, *shareCtx;
-   __DRIscreenPrivate *sPriv = driContextPriv->driScreenPriv;
+   __DRIscreen *sPriv = driContextPriv->driScreenPriv;
    sisContextPtr smesa;
    sisScreenPtr sisScreen;
    int i;
@@ -337,7 +339,7 @@ sisCreateContext( const __GLcontextModes *glVisual,
 }
 
 void
-sisDestroyContext ( __DRIcontextPrivate *driContextPriv )
+sisDestroyContext ( __DRIcontext *driContextPriv )
 {
    sisContextPtr smesa = (sisContextPtr)driContextPriv->driverPrivate;
 
@@ -365,9 +367,9 @@ sisDestroyContext ( __DRIcontextPrivate *driContextPriv )
 }
 
 GLboolean
-sisMakeCurrent( __DRIcontextPrivate *driContextPriv,
-                __DRIdrawablePrivate *driDrawPriv,
-                __DRIdrawablePrivate *driReadPriv )
+sisMakeCurrent( __DRIcontext *driContextPriv,
+                __DRIdrawable *driDrawPriv,
+                __DRIdrawable *driReadPriv )
 {
    if ( driContextPriv ) {
       GET_CURRENT_CONTEXT(ctx);
@@ -396,7 +398,7 @@ sisMakeCurrent( __DRIcontextPrivate *driContextPriv,
 }
 
 GLboolean
-sisUnbindContext( __DRIcontextPrivate *driContextPriv )
+sisUnbindContext( __DRIcontext *driContextPriv )
 {
    return GL_TRUE;
 }
diff --git a/src/mesa/drivers/dri/sis/sis_context.h b/src/mesa/drivers/dri/sis/sis_context.h
index bc53cb5efa..4179ee081a 100644
--- a/src/mesa/drivers/dri/sis/sis_context.h
+++ b/src/mesa/drivers/dri/sis/sis_context.h
@@ -359,9 +359,9 @@ struct sis_context
 
   /* Mirrors of some DRI state
    */
-  __DRIcontextPrivate	*driContext;	/* DRI context */
-  __DRIscreenPrivate	*driScreen;	/* DRI screen */
-  __DRIdrawablePrivate	*driDrawable;	/* DRI drawable bound to this ctx */
+  __DRIcontext	*driContext;	/* DRI context */
+  __DRIscreen	*driScreen;	/* DRI screen */
+  __DRIdrawable	*driDrawable;	/* DRI drawable bound to this ctx */
 
   unsigned int lastStamp;	        /* mirror driDrawable->lastStamp */
 
@@ -439,18 +439,18 @@ enum _sis_verbose {
 };
 
 extern GLboolean sisCreateContext( const __GLcontextModes *glVisual,
-				   __DRIcontextPrivate *driContextPriv,
+				   __DRIcontext *driContextPriv,
                                    void *sharedContextPrivate );
-extern void sisDestroyContext( __DRIcontextPrivate * );
+extern void sisDestroyContext( __DRIcontext * );
 
 void sisReAllocateBuffers(GLcontext *ctx, GLframebuffer *drawbuffer,
                           GLuint width, GLuint height);
 
-extern GLboolean sisMakeCurrent( __DRIcontextPrivate *driContextPriv,
-                                  __DRIdrawablePrivate *driDrawPriv,
-                                  __DRIdrawablePrivate *driReadPriv );
+extern GLboolean sisMakeCurrent( __DRIcontext *driContextPriv,
+                                  __DRIdrawable *driDrawPriv,
+                                  __DRIdrawable *driReadPriv );
 
-extern GLboolean sisUnbindContext( __DRIcontextPrivate *driContextPriv );
+extern GLboolean sisUnbindContext( __DRIcontext *driContextPriv );
 
 void WaitEngIdle (sisContextPtr smesa);
 void Wait2DEngIdle (sisContextPtr smesa);
diff --git a/src/mesa/drivers/dri/sis/sis_lock.c b/src/mesa/drivers/dri/sis/sis_lock.c
index 806110cad4..b8ff4e31e2 100644
--- a/src/mesa/drivers/dri/sis/sis_lock.c
+++ b/src/mesa/drivers/dri/sis/sis_lock.c
@@ -46,8 +46,8 @@ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 void
 sisGetLock( sisContextPtr smesa, GLuint flags )
 {
-   __DRIdrawablePrivate *dPriv = smesa->driDrawable;
-   __DRIscreenPrivate *sPriv = smesa->driScreen;
+   __DRIdrawable *dPriv = smesa->driDrawable;
+   __DRIscreen *sPriv = smesa->driScreen;
    SISSAREAPrivPtr sarea = smesa->sarea;
 
    drmGetLock( smesa->driFd, smesa->hHWContext, flags );
diff --git a/src/mesa/drivers/dri/sis/sis_screen.c b/src/mesa/drivers/dri/sis/sis_screen.c
index fec9158236..d38b93ec9b 100644
--- a/src/mesa/drivers/dri/sis/sis_screen.c
+++ b/src/mesa/drivers/dri/sis/sis_screen.c
@@ -65,7 +65,7 @@ static const GLuint __driNConfigOptions = 3;
 extern const struct dri_extension card_extensions[];
 
 static const __DRIconfig **
-sisFillInModes(__DRIscreenPrivate *psp, int bpp)
+sisFillInModes(__DRIscreen *psp, int bpp)
 {
    __DRIconfig **configs;
    unsigned depth_buffer_factor;
@@ -117,7 +117,7 @@ sisFillInModes(__DRIscreenPrivate *psp, int bpp)
 /* Create the device specific screen private data struct.
  */
 static sisScreenPtr
-sisCreateScreen( __DRIscreenPrivate *sPriv )
+sisCreateScreen( __DRIscreen *sPriv )
 {
    sisScreenPtr sisScreen;
    SISDRIPtr sisDRIPriv = (SISDRIPtr)sPriv->pDevPriv;
@@ -172,7 +172,7 @@ sisCreateScreen( __DRIscreenPrivate *sPriv )
 /* Destroy the device specific screen private data struct.
  */
 static void
-sisDestroyScreen( __DRIscreenPrivate *sPriv )
+sisDestroyScreen( __DRIscreen *sPriv )
 {
    sisScreenPtr sisScreen = (sisScreenPtr)sPriv->private;
 
@@ -192,8 +192,8 @@ sisDestroyScreen( __DRIscreenPrivate *sPriv )
  * data.
  */
 static GLboolean
-sisCreateBuffer( __DRIscreenPrivate *driScrnPriv,
-                 __DRIdrawablePrivate *driDrawPriv,
+sisCreateBuffer( __DRIscreen *driScrnPriv,
+                 __DRIdrawable *driDrawPriv,
                  const __GLcontextModes *mesaVis,
                  GLboolean isPixmap )
 {
@@ -219,12 +219,12 @@ sisCreateBuffer( __DRIscreenPrivate *driScrnPriv,
 
 
 static void
-sisDestroyBuffer(__DRIdrawablePrivate *driDrawPriv)
+sisDestroyBuffer(__DRIdrawable *driDrawPriv)
 {
    _mesa_reference_framebuffer((GLframebuffer **)(&(driDrawPriv->driverPrivate)), NULL);
 }
 
-static void sisCopyBuffer( __DRIdrawablePrivate *dPriv )
+static void sisCopyBuffer( __DRIdrawable *dPriv )
 {
    sisContextPtr smesa = (sisContextPtr)dPriv->driContextPriv->driverPrivate;
    int i;
@@ -259,7 +259,7 @@ static void sisCopyBuffer( __DRIdrawablePrivate *dPriv )
 
 /* Copy the back color buffer to the front color buffer */
 static void
-sisSwapBuffers(__DRIdrawablePrivate *dPriv)
+sisSwapBuffers(__DRIdrawable *dPriv)
 {
    if (dPriv->driContextPriv && dPriv->driContextPriv->driverPrivate) {
          sisContextPtr smesa = (sisContextPtr) dPriv->driContextPriv->driverPrivate;
@@ -284,7 +284,7 @@ sisSwapBuffers(__DRIdrawablePrivate *dPriv)
  * \return the __GLcontextModes supported by this driver
  */
 static const __DRIconfig **
-sisInitScreen(__DRIscreenPrivate *psp)
+sisInitScreen(__DRIscreen *psp)
 {
    static const __DRIversion ddx_expected = {0, 8, 0};
    static const __DRIversion dri_expected = {4, 0, 0};
@@ -325,3 +325,10 @@ const struct __DriverAPIRec driDriverAPI = {
    .SwapBuffersMSC  = NULL
 
 };
+
+/* This is the table of extensions that the loader will dlsym() for. */
+PUBLIC const __DRIextension *__driDriverExtensions[] = {
+    &driCoreExtension.base,
+    &driLegacyExtension.base,
+    NULL
+};
diff --git a/src/mesa/drivers/dri/sis/sis_screen.h b/src/mesa/drivers/dri/sis/sis_screen.h
index 07c29cfa09..8009fecc31 100644
--- a/src/mesa/drivers/dri/sis/sis_screen.h
+++ b/src/mesa/drivers/dri/sis/sis_screen.h
@@ -50,7 +50,7 @@ typedef struct {
    int cpp;
    unsigned int screenX, screenY;
 
-   __DRIscreenPrivate *driScreen;
+   __DRIscreen *driScreen;
    unsigned int sarea_priv_offset;
 
    /* Configuration cache with default values for all contexts */
diff --git a/src/mesa/drivers/dri/sis/sis_span.c b/src/mesa/drivers/dri/sis/sis_span.c
index cfbb51007d..008b00160e 100644
--- a/src/mesa/drivers/dri/sis/sis_span.c
+++ b/src/mesa/drivers/dri/sis/sis_span.c
@@ -42,7 +42,7 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 #define LOCAL_VARS							\
    sisContextPtr smesa = SIS_CONTEXT(ctx);				\
-   __DRIdrawablePrivate *dPriv = smesa->driDrawable;			\
+   __DRIdrawable *dPriv = smesa->driDrawable;			\
    struct sis_renderbuffer *srb = (struct sis_renderbuffer *) rb;	\
    GLuint pitch = srb->pitch;						\
    char *buf = srb->map;						\
@@ -52,7 +52,7 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 #define LOCAL_DEPTH_VARS						\
    sisContextPtr smesa = SIS_CONTEXT(ctx);				\
-   __DRIdrawablePrivate *dPriv = smesa->driDrawable;			\
+   __DRIdrawable *dPriv = smesa->driDrawable;			\
    struct sis_renderbuffer *srb = (struct sis_renderbuffer *) rb;	\
    char *buf = srb->map;
 
diff --git a/src/mesa/drivers/dri/sis/sis_texstate.c b/src/mesa/drivers/dri/sis/sis_texstate.c
index a507173b21..4c22a10cf7 100644
--- a/src/mesa/drivers/dri/sis/sis_texstate.c
+++ b/src/mesa/drivers/dri/sis/sis_texstate.c
@@ -457,10 +457,10 @@ sis_set_texobj_parm( GLcontext *ctx, struct gl_texture_object *texObj,
 
    {
       GLubyte c[4];
-      CLAMPED_FLOAT_TO_UBYTE(c[0], texObj->BorderColor[0]);
-      CLAMPED_FLOAT_TO_UBYTE(c[1], texObj->BorderColor[1]);
-      CLAMPED_FLOAT_TO_UBYTE(c[2], texObj->BorderColor[2]);
-      CLAMPED_FLOAT_TO_UBYTE(c[3], texObj->BorderColor[3]);
+      CLAMPED_FLOAT_TO_UBYTE(c[0], texObj->BorderColor.f[0]);
+      CLAMPED_FLOAT_TO_UBYTE(c[1], texObj->BorderColor.f[1]);
+      CLAMPED_FLOAT_TO_UBYTE(c[2], texObj->BorderColor.f[2]);
+      CLAMPED_FLOAT_TO_UBYTE(c[3], texObj->BorderColor.f[3]);
 
       current->texture[hw_unit].hwTextureBorderColor = 
          PACK_COLOR_8888(c[3], c[0], c[1], c[2]);
diff --git a/src/mesa/drivers/dri/sis/sis_tris.c b/src/mesa/drivers/dri/sis/sis_tris.c
index 76d12d07b3..4690274c3c 100644
--- a/src/mesa/drivers/dri/sis/sis_tris.c
+++ b/src/mesa/drivers/dri/sis/sis_tris.c
@@ -430,7 +430,8 @@ do {								\
 
 #define LOCAL_VARS(n)						\
    sisContextPtr smesa = SIS_CONTEXT(ctx);			\
-   GLuint color[n], spec[n];					\
+   GLuint color[n] = { 0 };					\
+   GLuint spec[n] = { 0 };					\
    GLuint coloroffset = smesa->coloroffset;			\
    GLuint specoffset = smesa->specoffset;			\
    (void) color; (void) spec; (void) coloroffset; (void) specoffset;
@@ -903,14 +904,14 @@ static void sisRenderStart( GLcontext *ctx )
 
    /* projective textures are not supported by the hardware */
    if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_TEX0 )) {
-      if (VB->TexCoordPtr[0]->size > 2)
+      if (VB->AttribPtr[_TNL_ATTRIB_TEX0]->size > 2)
 	 tex_fallback = GL_TRUE;
       EMIT_ATTR(_TNL_ATTRIB_TEX0, EMIT_2F);
       AGPParseSet |= SiS_PS_HAS_UV0;
    }
    /* Will only hit tex1 on SiS300 */
    if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_TEX1 )) {
-      if (VB->TexCoordPtr[1]->size > 2)
+      if (VB->AttribPtr[_TNL_ATTRIB_TEX1]->size > 2)
 	 tex_fallback = GL_TRUE;
       EMIT_ATTR(_TNL_ATTRIB_TEX1, EMIT_2F);
       AGPParseSet |= SiS_PS_HAS_UV1;
diff --git a/src/mesa/drivers/dri/swrast/swrast_span.c b/src/mesa/drivers/dri/swrast/swrast_span.c
index 2d3c25dcbe..f8e503463f 100644
--- a/src/mesa/drivers/dri/swrast/swrast_span.c
+++ b/src/mesa/drivers/dri/swrast/swrast_span.c
@@ -63,56 +63,42 @@ static const GLubyte kernel[16] = {
 
 /* 32-bit BGRA */
 #define STORE_PIXEL_A8R8G8B8(DST, X, Y, VALUE) \
-   DST[3] = VALUE[ACOMP]; \
-   DST[2] = VALUE[RCOMP]; \
-   DST[1] = VALUE[GCOMP]; \
-   DST[0] = VALUE[BCOMP]
+   *DST = (GLuint)VALUE[ACOMP] << 24 | VALUE[RCOMP] << 16 | VALUE[GCOMP] << 8 | VALUE[BCOMP]
 #define STORE_PIXEL_RGB_A8R8G8B8(DST, X, Y, VALUE) \
-   DST[3] = 0xff; \
-   DST[2] = VALUE[RCOMP]; \
-   DST[1] = VALUE[GCOMP]; \
-   DST[0] = VALUE[BCOMP]
+   *DST = 0xffu << 24 | VALUE[RCOMP] << 16 | VALUE[GCOMP] << 8 | VALUE[BCOMP]
 #define FETCH_PIXEL_A8R8G8B8(DST, SRC) \
-   DST[ACOMP] = SRC[3]; \
-   DST[RCOMP] = SRC[2]; \
-   DST[GCOMP] = SRC[1]; \
-   DST[BCOMP] = SRC[0]
+   DST[ACOMP] = *SRC >> 24;            \
+   DST[RCOMP] = (*SRC >> 16) & 0xff;   \
+   DST[GCOMP] = (*SRC >> 8) & 0xff;    \
+   DST[BCOMP] = *SRC & 0xff
 
 
 /* 32-bit BGRX */
 #define STORE_PIXEL_X8R8G8B8(DST, X, Y, VALUE) \
-   DST[3] = 0xff; \
-   DST[2] = VALUE[RCOMP]; \
-   DST[1] = VALUE[GCOMP]; \
-   DST[0] = VALUE[BCOMP]
+   *DST = 0xffu << 24 | VALUE[RCOMP] << 16 | VALUE[GCOMP] << 8 | VALUE[BCOMP]
 #define STORE_PIXEL_RGB_X8R8G8B8(DST, X, Y, VALUE) \
-   DST[3] = 0xff; \
-   DST[2] = VALUE[RCOMP]; \
-   DST[1] = VALUE[GCOMP]; \
-   DST[0] = VALUE[BCOMP]
+   *DST = 0xffu << 24 | VALUE[RCOMP] << 16 | VALUE[GCOMP] << 8 | VALUE[BCOMP]
 #define FETCH_PIXEL_X8R8G8B8(DST, SRC) \
-   DST[ACOMP] = 0xff; \
-   DST[RCOMP] = SRC[2]; \
-   DST[GCOMP] = SRC[1]; \
-   DST[BCOMP] = SRC[0]
+   DST[ACOMP] = 0xff;                  \
+   DST[RCOMP] = (*SRC >> 16) & 0xff;   \
+   DST[GCOMP] = (*SRC >> 8) & 0xff;    \
+   DST[BCOMP] = *SRC & 0xff
 
 
 /* 16-bit BGR */
 #define STORE_PIXEL_R5G6B5(DST, X, Y, VALUE) \
    do { \
    int d = DITHER_COMP(X, Y) >> 6; \
-   GLushort *p = (GLushort *)DST; \
-   *p = ( ((DITHER_CLAMP((VALUE[RCOMP]) + d) & 0xf8) << 8) | \
-	  ((DITHER_CLAMP((VALUE[GCOMP]) + d) & 0xfc) << 3) | \
-	  ((DITHER_CLAMP((VALUE[BCOMP]) + d) & 0xf8) >> 3) ); \
+   *DST = ( ((DITHER_CLAMP((VALUE[RCOMP]) + d) & 0xf8) << 8) | \
+            ((DITHER_CLAMP((VALUE[GCOMP]) + d) & 0xfc) << 3) | \
+            ((DITHER_CLAMP((VALUE[BCOMP]) + d) & 0xf8) >> 3) ); \
    } while(0)
 #define FETCH_PIXEL_R5G6B5(DST, SRC) \
    do { \
-   GLushort p = *(GLushort *)SRC; \
    DST[ACOMP] = 0xff; \
-   DST[RCOMP] = ((p >> 8) & 0xf8) * 255 / 0xf8; \
-   DST[GCOMP] = ((p >> 3) & 0xfc) * 255 / 0xfc; \
-   DST[BCOMP] = ((p << 3) & 0xf8) * 255 / 0xf8; \
+   DST[RCOMP] = ((*SRC >> 8) & 0xf8) * 255 / 0xf8; \
+   DST[GCOMP] = ((*SRC >> 3) & 0xfc) * 255 / 0xfc; \
+   DST[BCOMP] = ((*SRC << 3) & 0xf8) * 255 / 0xf8; \
    } while(0)
 
 
@@ -145,8 +131,8 @@ static const GLubyte kernel[16] = {
 #define SPAN_VARS \
    struct swrast_renderbuffer *xrb = swrast_renderbuffer(rb);
 #define INIT_PIXEL_PTR(P, X, Y) \
-   GLubyte *P = (GLubyte *)xrb->Base.Data + YFLIP(xrb, Y) * xrb->pitch + (X) * 4;
-#define INC_PIXEL_PTR(P) P += 4
+   GLuint *P = (GLuint *)xrb->Base.Data + YFLIP(xrb, Y) * xrb->pitch / 4 + (X)
+#define INC_PIXEL_PTR(P) P++
 #define STORE_PIXEL(DST, X, Y, VALUE) \
    STORE_PIXEL_A8R8G8B8(DST, X, Y, VALUE)
 #define STORE_PIXEL_RGB(DST, X, Y, VALUE) \
@@ -163,8 +149,8 @@ static const GLubyte kernel[16] = {
 #define SPAN_VARS \
    struct swrast_renderbuffer *xrb = swrast_renderbuffer(rb);
 #define INIT_PIXEL_PTR(P, X, Y) \
-   GLubyte *P = (GLubyte *)xrb->Base.Data + YFLIP(xrb, Y) * xrb->pitch + (X) * 4;
-#define INC_PIXEL_PTR(P) P += 4
+   GLuint *P = (GLuint *)xrb->Base.Data + YFLIP(xrb, Y) * xrb->pitch / 4 + (X);
+#define INC_PIXEL_PTR(P) P++
 #define STORE_PIXEL(DST, X, Y, VALUE) \
    STORE_PIXEL_X8R8G8B8(DST, X, Y, VALUE)
 #define STORE_PIXEL_RGB(DST, X, Y, VALUE) \
@@ -181,8 +167,8 @@ static const GLubyte kernel[16] = {
 #define SPAN_VARS \
    struct swrast_renderbuffer *xrb = swrast_renderbuffer(rb);
 #define INIT_PIXEL_PTR(P, X, Y) \
-   GLubyte *P = (GLubyte *)xrb->Base.Data + YFLIP(xrb, Y) * xrb->pitch + (X) * 2;
-#define INC_PIXEL_PTR(P) P += 2
+   GLushort *P = (GLushort *)xrb->Base.Data + YFLIP(xrb, Y) * xrb->pitch / 2 + (X);
+#define INC_PIXEL_PTR(P) P++
 #define STORE_PIXEL(DST, X, Y, VALUE) \
    STORE_PIXEL_R5G6B5(DST, X, Y, VALUE)
 #define FETCH_PIXEL(DST, SRC) \
@@ -234,8 +220,8 @@ static const GLubyte kernel[16] = {
 #define SPAN_VARS \
    struct swrast_renderbuffer *xrb = swrast_renderbuffer(rb);
 #define INIT_PIXEL_PTR(P, X, Y) \
-   GLubyte *P = (GLubyte *)row;
-#define INC_PIXEL_PTR(P) P += 4
+   GLuint *P = (GLuint *)row;
+#define INC_PIXEL_PTR(P) P++
 #define STORE_PIXEL(DST, X, Y, VALUE) \
    STORE_PIXEL_A8R8G8B8(DST, X, Y, VALUE)
 #define STORE_PIXEL_RGB(DST, X, Y, VALUE) \
@@ -252,8 +238,8 @@ static const GLubyte kernel[16] = {
 #define SPAN_VARS \
    struct swrast_renderbuffer *xrb = swrast_renderbuffer(rb);
 #define INIT_PIXEL_PTR(P, X, Y) \
-   GLubyte *P = (GLubyte *)row;
-#define INC_PIXEL_PTR(P) P += 4
+   GLuint *P = (GLuint *)row;
+#define INC_PIXEL_PTR(P) P++
 #define STORE_PIXEL(DST, X, Y, VALUE) \
    STORE_PIXEL_X8R8G8B8(DST, X, Y, VALUE)
 #define STORE_PIXEL_RGB(DST, X, Y, VALUE) \
@@ -270,7 +256,7 @@ static const GLubyte kernel[16] = {
 #define SPAN_VARS \
    struct swrast_renderbuffer *xrb = swrast_renderbuffer(rb);
 #define INIT_PIXEL_PTR(P, X, Y) \
-   GLubyte *P = (GLubyte *)row;
-#define INC_PIXEL_PTR(P) P += 2
+   GLushort *P = (GLushort *)row;
+#define INC_PIXEL_PTR(P) P++
 #define STORE_PIXEL(DST, X, Y, VALUE) \
    STORE_PIXEL_R5G6B5(DST, X, Y, VALUE)
diff --git a/src/mesa/drivers/dri/tdfx/tdfx_context.c b/src/mesa/drivers/dri/tdfx/tdfx_context.c
index e742d414a5..edb1875f76 100644
--- a/src/mesa/drivers/dri/tdfx/tdfx_context.c
+++ b/src/mesa/drivers/dri/tdfx/tdfx_context.c
@@ -165,12 +165,12 @@ static const struct dri_debug_control debug_control[] =
 };
 
 GLboolean tdfxCreateContext( const __GLcontextModes *mesaVis,
-			     __DRIcontextPrivate *driContextPriv,
+			     __DRIcontext *driContextPriv,
                              void *sharedContextPrivate )
 {
    tdfxContextPtr fxMesa;
    GLcontext *ctx, *shareCtx;
-   __DRIscreenPrivate *sPriv = driContextPriv->driScreenPriv;
+   __DRIscreen *sPriv = driContextPriv->driScreenPriv;
    tdfxScreenPrivate *fxScreen = (tdfxScreenPrivate *) sPriv->private;
    TDFXSAREAPriv *saPriv = (TDFXSAREAPriv *) ((char *) sPriv->pSAREA +
 					      sizeof(drm_sarea_t));
@@ -441,7 +441,7 @@ static GLboolean tdfxInitVertexFormats( tdfxContextPtr fxMesa )
  * Initialize the state in an tdfxContextPtr struct.
  */
 static GLboolean
-tdfxInitContext( __DRIdrawablePrivate *driDrawPriv, tdfxContextPtr fxMesa )
+tdfxInitContext( __DRIdrawable *driDrawPriv, tdfxContextPtr fxMesa )
 {
    /* KW: Would be nice to make one of these a member of the other.
     */
@@ -563,7 +563,7 @@ tdfxInitContext( __DRIdrawablePrivate *driDrawPriv, tdfxContextPtr fxMesa )
 
 
 void
-tdfxDestroyContext( __DRIcontextPrivate *driContextPriv )
+tdfxDestroyContext( __DRIcontext *driContextPriv )
 {
    tdfxContextPtr fxMesa = (tdfxContextPtr) driContextPriv->driverPrivate;
 
@@ -607,7 +607,7 @@ tdfxDestroyContext( __DRIcontextPrivate *driContextPriv )
 
 
 GLboolean
-tdfxUnbindContext( __DRIcontextPrivate *driContextPriv )
+tdfxUnbindContext( __DRIcontext *driContextPriv )
 {
    GET_CURRENT_CONTEXT(ctx);
    tdfxContextPtr fxMesa = TDFX_CONTEXT(ctx);
@@ -626,9 +626,9 @@ tdfxUnbindContext( __DRIcontextPrivate *driContextPriv )
 
 
 GLboolean
-tdfxMakeCurrent( __DRIcontextPrivate *driContextPriv,
-                 __DRIdrawablePrivate *driDrawPriv,
-                 __DRIdrawablePrivate *driReadPriv )
+tdfxMakeCurrent( __DRIcontext *driContextPriv,
+                 __DRIdrawable *driDrawPriv,
+                 __DRIdrawable *driReadPriv )
 {
    if ( TDFX_DEBUG & DEBUG_VERBOSE_DRI ) {
       fprintf( stderr, "%s( %p )\n", __FUNCTION__, (void *)driContextPriv );
diff --git a/src/mesa/drivers/dri/tdfx/tdfx_context.h b/src/mesa/drivers/dri/tdfx/tdfx_context.h
index 3bcb545119..6e25cac301 100644
--- a/src/mesa/drivers/dri/tdfx/tdfx_context.h
+++ b/src/mesa/drivers/dri/tdfx/tdfx_context.h
@@ -892,18 +892,18 @@ struct tdfx_context {
    char rendererString[100];
 
    /* stuff added for DRI */
-   __DRIscreenPrivate *driScreen;
-   __DRIcontextPrivate *driContext;
+   __DRIscreen *driScreen;
+   __DRIcontext *driContext;
 
    /**
     * DRI drawable bound to this context for drawing.
     */
-   __DRIdrawablePrivate	*driDrawable;
+   __DRIdrawable	*driDrawable;
 
    /**
     * DRI drawable bound to this context for reading.
     */
-   __DRIdrawablePrivate	*driReadable;
+   __DRIdrawable	*driReadable;
 
    drm_context_t hHWContext;
    drm_hw_lock_t *driHwLock;
@@ -938,19 +938,19 @@ struct tdfx_context {
 
 extern GLboolean
 tdfxCreateContext( const __GLcontextModes *mesaVis,
-                   __DRIcontextPrivate *driContextPriv,
+                   __DRIcontext *driContextPriv,
                    void *sharedContextPrivate );
 
 extern void
-tdfxDestroyContext( __DRIcontextPrivate *driContextPriv );
+tdfxDestroyContext( __DRIcontext *driContextPriv );
 
 extern GLboolean
-tdfxUnbindContext( __DRIcontextPrivate *driContextPriv );
+tdfxUnbindContext( __DRIcontext *driContextPriv );
 
 extern GLboolean
-tdfxMakeCurrent( __DRIcontextPrivate *driContextPriv,
-                 __DRIdrawablePrivate *driDrawPriv,
-                 __DRIdrawablePrivate *driReadPriv );
+tdfxMakeCurrent( __DRIcontext *driContextPriv,
+                 __DRIdrawable *driDrawPriv,
+                 __DRIdrawable *driReadPriv );
 
 extern GLboolean
 tdfxInitGlide( tdfxContextPtr tmesa );
diff --git a/src/mesa/drivers/dri/tdfx/tdfx_dd.c b/src/mesa/drivers/dri/tdfx/tdfx_dd.c
index 8472df607a..ed8a331549 100644
--- a/src/mesa/drivers/dri/tdfx/tdfx_dd.c
+++ b/src/mesa/drivers/dri/tdfx/tdfx_dd.c
@@ -91,7 +91,7 @@ static const GLubyte *tdfxDDGetString( GLcontext *ctx, GLenum name )
       else {
 	 /* unexpected result: replace spaces with hyphens */
 	 int i;
-	 for (i = 0; hardware[i] && (i < sizeof(hardware)); i++) {
+	 for (i = 0; i < sizeof(hardware) && hardware[i]; i++) {
 	    if (hardware[i] == ' ' || hardware[i] == '\t') {
 	       hardware[i] = '-';
 	    }
diff --git a/src/mesa/drivers/dri/tdfx/tdfx_lock.c b/src/mesa/drivers/dri/tdfx/tdfx_lock.c
index 17cdc51ee1..4f84240104 100644
--- a/src/mesa/drivers/dri/tdfx/tdfx_lock.c
+++ b/src/mesa/drivers/dri/tdfx/tdfx_lock.c
@@ -45,10 +45,10 @@
 
 void tdfxGetLock( tdfxContextPtr fxMesa )
 {
-    __DRIcontextPrivate *cPriv = fxMesa->driContext;
-    __DRIdrawablePrivate *const drawable = cPriv->driDrawablePriv;
-    __DRIdrawablePrivate *const readable = cPriv->driReadablePriv;
-    __DRIscreenPrivate *sPriv = drawable->driScreenPriv;
+    __DRIcontext *cPriv = fxMesa->driContext;
+    __DRIdrawable *const drawable = cPriv->driDrawablePriv;
+    __DRIdrawable *const readable = cPriv->driReadablePriv;
+    __DRIscreen *sPriv = drawable->driScreenPriv;
     TDFXSAREAPriv *saPriv = (TDFXSAREAPriv *) (((char *) sPriv->pSAREA) +
 					fxMesa->fxScreen->sarea_priv_offset);
     unsigned int stamp = drawable->lastStamp;
diff --git a/src/mesa/drivers/dri/tdfx/tdfx_pixels.c b/src/mesa/drivers/dri/tdfx/tdfx_pixels.c
index 18729d5ae0..65f0464f8a 100644
--- a/src/mesa/drivers/dri/tdfx/tdfx_pixels.c
+++ b/src/mesa/drivers/dri/tdfx/tdfx_pixels.c
@@ -495,7 +495,7 @@ tdfx_readpixels_R5G6B5(GLcontext * ctx, GLint x, GLint y,
    {
       tdfxContextPtr fxMesa = TDFX_CONTEXT(ctx);
       GrLfbInfo_t info;
-      __DRIdrawablePrivate *const readable = fxMesa->driReadable;
+      __DRIdrawable *const readable = fxMesa->driReadable;
       const GLint winX = readable->x;
       const GLint winY = readable->y + readable->h - 1;
       const GLint scrX = winX + x;
@@ -553,7 +553,7 @@ tdfx_readpixels_R8G8B8A8(GLcontext * ctx, GLint x, GLint y,
    {
       tdfxContextPtr fxMesa = TDFX_CONTEXT(ctx);
       GrLfbInfo_t info;
-      __DRIdrawablePrivate *const readable = fxMesa->driReadable;
+      __DRIdrawable *const readable = fxMesa->driReadable;
       const GLint winX = readable->x;
       const GLint winY = readable->y + readable->h - 1;
       const GLint scrX = winX + x;
@@ -611,10 +611,10 @@ tdfx_drawpixels_R8G8B8A8(GLcontext * ctx, GLint x, GLint y,
        ctx->Fog.Enabled ||
        ctx->Scissor.Enabled ||
        ctx->Stencil._Enabled ||
-       !ctx->Color.ColorMask[0] ||
-       !ctx->Color.ColorMask[1] ||
-       !ctx->Color.ColorMask[2] ||
-       !ctx->Color.ColorMask[3] ||
+       !ctx->Color.ColorMask[0][0] ||
+       !ctx->Color.ColorMask[0][1] ||
+       !ctx->Color.ColorMask[0][2] ||
+       !ctx->Color.ColorMask[0][3] ||
        ctx->Color.ColorLogicOpEnabled ||
        ctx->Texture._EnabledUnits ||
        fxMesa->Fallback)       
diff --git a/src/mesa/drivers/dri/tdfx/tdfx_render.c b/src/mesa/drivers/dri/tdfx/tdfx_render.c
index 2cd8e12d95..979bcd4514 100644
--- a/src/mesa/drivers/dri/tdfx/tdfx_render.c
+++ b/src/mesa/drivers/dri/tdfx/tdfx_render.c
@@ -76,8 +76,8 @@ static void tdfxClear( GLcontext *ctx, GLbitfield mask )
 
    if (fxMesa->glCtx->Visual.redBits != 8) {
       /* can only do color masking if running in 24/32bpp on Napalm */
-      if (ctx->Color.ColorMask[RCOMP] != ctx->Color.ColorMask[GCOMP] ||
-          ctx->Color.ColorMask[GCOMP] != ctx->Color.ColorMask[BCOMP]) {
+      if (ctx->Color.ColorMask[0][RCOMP] != ctx->Color.ColorMask[0][GCOMP] ||
+          ctx->Color.ColorMask[0][GCOMP] != ctx->Color.ColorMask[0][BCOMP]) {
          softwareMask |= (mask & (BUFFER_BIT_FRONT_LEFT | BUFFER_BIT_BACK_LEFT));
          mask &= ~(BUFFER_BIT_FRONT_LEFT | BUFFER_BIT_BACK_LEFT);
       }
@@ -556,7 +556,7 @@ static void uploadTextureImages( tdfxContextPtr fxMesa )
  */
 void tdfxUploadClipping( tdfxContextPtr fxMesa )
 {
-   __DRIdrawablePrivate *dPriv = fxMesa->driDrawable;
+   __DRIdrawable *dPriv = fxMesa->driDrawable;
 
    assert(dPriv);
 
@@ -721,7 +721,7 @@ void tdfxEmitHwStateLocked( tdfxContextPtr fxMesa )
 	 fxMesa->Glide.grColorMask( fxMesa->Color.ColorMask[RCOMP] ||
                                     fxMesa->Color.ColorMask[GCOMP] ||
                                     fxMesa->Color.ColorMask[BCOMP],
-                                    /*fxMesa->Color.ColorMask[ACOMP]*/GL_FALSE/*[dBorca] no-no*/ );
+                                    /*fxMesa->Color.ColorMask[0][ACOMP]*/GL_FALSE/*[dBorca] no-no*/ );
       }
       fxMesa->dirty &= ~TDFX_UPLOAD_COLOR_MASK;
    }
diff --git a/src/mesa/drivers/dri/tdfx/tdfx_screen.c b/src/mesa/drivers/dri/tdfx/tdfx_screen.c
index 2eb0024d40..4422b5dec4 100644
--- a/src/mesa/drivers/dri/tdfx/tdfx_screen.c
+++ b/src/mesa/drivers/dri/tdfx/tdfx_screen.c
@@ -70,7 +70,7 @@ static const __DRIextension *tdfxExtensions[] = {
 static const GLuint __driNConfigOptions = 1;
 
 static GLboolean
-tdfxCreateScreen( __DRIscreenPrivate *sPriv )
+tdfxCreateScreen( __DRIscreen *sPriv )
 {
    tdfxScreenPrivate *fxScreen;
    TDFXDRIPtr fxDRIPriv = (TDFXDRIPtr) sPriv->pDevPriv;
@@ -121,7 +121,7 @@ tdfxCreateScreen( __DRIscreenPrivate *sPriv )
 
 
 static void
-tdfxDestroyScreen( __DRIscreenPrivate *sPriv )
+tdfxDestroyScreen( __DRIscreen *sPriv )
 {
    tdfxScreenPrivate *fxScreen = (tdfxScreenPrivate *) sPriv->private;
 
@@ -139,7 +139,7 @@ tdfxDestroyScreen( __DRIscreenPrivate *sPriv )
 
 
 static GLboolean
-tdfxInitDriver( __DRIscreenPrivate *sPriv )
+tdfxInitDriver( __DRIscreen *sPriv )
 {
    if ( TDFX_DEBUG & DEBUG_VERBOSE_DRI ) {
       fprintf( stderr, "%s( %p )\n", __FUNCTION__, (void *)sPriv );
@@ -155,8 +155,8 @@ tdfxInitDriver( __DRIscreenPrivate *sPriv )
 
 
 static GLboolean
-tdfxCreateBuffer( __DRIscreenPrivate *driScrnPriv,
-                  __DRIdrawablePrivate *driDrawPriv,
+tdfxCreateBuffer( __DRIscreen *driScrnPriv,
+                  __DRIdrawable *driDrawPriv,
                   const __GLcontextModes *mesaVis,
                   GLboolean isPixmap )
 {
@@ -227,14 +227,14 @@ tdfxCreateBuffer( __DRIscreenPrivate *driScrnPriv,
 
 
 static void
-tdfxDestroyBuffer(__DRIdrawablePrivate *driDrawPriv)
+tdfxDestroyBuffer(__DRIdrawable *driDrawPriv)
 {
    _mesa_reference_framebuffer((GLframebuffer **)(&(driDrawPriv->driverPrivate)), NULL);
 }
 
 
 static void
-tdfxSwapBuffers( __DRIdrawablePrivate *driDrawPriv )
+tdfxSwapBuffers( __DRIdrawable *driDrawPriv )
 
 {
    GET_CURRENT_CONTEXT(ctx);
@@ -253,7 +253,7 @@ tdfxSwapBuffers( __DRIdrawablePrivate *driDrawPriv )
     * we have to do a glFinish (per the GLX spec).
     */
    if ( ctx ) {
-      __DRIdrawablePrivate *curDrawPriv;
+      __DRIdrawable *curDrawPriv;
       fxMesa = TDFX_CONTEXT(ctx);
       curDrawPriv = fxMesa->driContext->driDrawablePriv;
 
@@ -341,7 +341,7 @@ tdfxSwapBuffers( __DRIdrawablePrivate *driDrawPriv )
 }
 
 static const __DRIconfig **
-tdfxFillInModes(__DRIscreenPrivate *psp,
+tdfxFillInModes(__DRIscreen *psp,
 		unsigned pixel_bits,
 		unsigned depth_bits,
 		unsigned stencil_bits,
@@ -440,3 +440,10 @@ const struct __DriverAPIRec driDriverAPI = {
    .WaitForSBC      = NULL,
    .SwapBuffersMSC  = NULL
 };
+
+/* This is the table of extensions that the loader will dlsym() for. */
+PUBLIC const __DRIextension *__driDriverExtensions[] = {
+    &driCoreExtension.base,
+    &driLegacyExtension.base,
+    NULL
+};
diff --git a/src/mesa/drivers/dri/tdfx/tdfx_screen.h b/src/mesa/drivers/dri/tdfx/tdfx_screen.h
index 5a68898b36..6aa42e8667 100644
--- a/src/mesa/drivers/dri/tdfx/tdfx_screen.h
+++ b/src/mesa/drivers/dri/tdfx/tdfx_screen.h
@@ -61,7 +61,7 @@ typedef struct {
    int textureOffset;
    int textureSize;
 
-   __DRIscreenPrivate *driScrnPriv;
+   __DRIscreen *driScrnPriv;
    unsigned int sarea_priv_offset;
 
    /* Configuration cache with default values for all contexts */
diff --git a/src/mesa/drivers/dri/tdfx/tdfx_span.c b/src/mesa/drivers/dri/tdfx/tdfx_span.c
index 6b38fa5a01..a17bcd952a 100644
--- a/src/mesa/drivers/dri/tdfx/tdfx_span.c
+++ b/src/mesa/drivers/dri/tdfx/tdfx_span.c
@@ -47,7 +47,7 @@
 
 #define LOCAL_VARS							\
    driRenderbuffer *drb = (driRenderbuffer *) rb;			\
-   __DRIdrawablePrivate *const dPriv = drb->dPriv;			\
+   __DRIdrawable *const dPriv = drb->dPriv;			\
    GLuint pitch = drb->backBuffer ? info.strideInBytes			\
      : (drb->pitch * drb->cpp);						\
    const GLuint bottom = dPriv->h - 1;					\
@@ -104,7 +104,7 @@
 
 #define HW_READ_CLIPLOOP()						\
       do {								\
-         const __DRIdrawablePrivate *dPriv = fxMesa->driDrawable;	\
+         const __DRIdrawable *dPriv = fxMesa->driDrawable;	\
          drm_clip_rect_t *rect = dPriv->pClipRects;			\
          int _nc = dPriv->numClipRects;					\
          while (_nc--) {						\
diff --git a/src/mesa/drivers/dri/tdfx/tdfx_state.c b/src/mesa/drivers/dri/tdfx/tdfx_state.c
index cf2712720f..cdb61a0ce0 100644
--- a/src/mesa/drivers/dri/tdfx/tdfx_state.c
+++ b/src/mesa/drivers/dri/tdfx/tdfx_state.c
@@ -621,7 +621,7 @@ static int intersect_rect( drm_clip_rect_t *out,
 void tdfxUpdateClipping( GLcontext *ctx )
 {
    tdfxContextPtr fxMesa = TDFX_CONTEXT(ctx);
-   __DRIdrawablePrivate *dPriv = fxMesa->driDrawable;
+   __DRIdrawable *dPriv = fxMesa->driDrawable;
 
    if ( TDFX_DEBUG & DEBUG_VERBOSE_API ) {
       fprintf( stderr, "%s()\n", __FUNCTION__ );
diff --git a/src/mesa/drivers/dri/tdfx/tdfx_tex.c b/src/mesa/drivers/dri/tdfx/tdfx_tex.c
index 0aa09e733b..e31ae97b02 100644
--- a/src/mesa/drivers/dri/tdfx/tdfx_tex.c
+++ b/src/mesa/drivers/dri/tdfx/tdfx_tex.c
@@ -1572,7 +1572,7 @@ tdfxCompressedTexImage2D (GLcontext *ctx, GLenum target,
     tdfxTexInfo *ti;
     tdfxMipMapLevel *mml;
     gl_format mesaFormat;
-    GLuint compressedSize;
+    GLuint compressedSize = 0;
 
     if (TDFX_DEBUG & DEBUG_VERBOSE_DRI) {
         fprintf(stderr, "tdfxCompressedTexImage2D: id=%d int 0x%x  %dx%d\n",
diff --git a/src/mesa/drivers/dri/tdfx/tdfx_texstate.c b/src/mesa/drivers/dri/tdfx/tdfx_texstate.c
index bbd2c8cfee..3f737878ed 100644
--- a/src/mesa/drivers/dri/tdfx/tdfx_texstate.c
+++ b/src/mesa/drivers/dri/tdfx/tdfx_texstate.c
@@ -1314,7 +1314,7 @@ SetupDoubleTexEnvVoodoo3(GLcontext *ctx, int tmu0,
       fxMesa->TexCombine[0].InvertRGB = FXFALSE;
       fxMesa->TexCombine[0].InvertAlpha = FXFALSE;
 
-      if ((baseFormat0 == GL_RGB) && (baseFormat0 == GL_LUMINANCE)) {
+      if ((baseFormat0 == GL_RGB) || (baseFormat0 == GL_LUMINANCE)) {
          fxMesa->AlphaCombine.Function = GR_COMBINE_FUNCTION_LOCAL;
          fxMesa->AlphaCombine.Factor = GR_COMBINE_FACTOR_NONE;
          fxMesa->AlphaCombine.Local = locala;
diff --git a/src/mesa/drivers/dri/tdfx/tdfx_vb.c b/src/mesa/drivers/dri/tdfx/tdfx_vb.c
index 4928802232..c200ba3255 100644
--- a/src/mesa/drivers/dri/tdfx/tdfx_vb.c
+++ b/src/mesa/drivers/dri/tdfx/tdfx_vb.c
@@ -69,11 +69,11 @@ static void interp_extras( GLcontext *ctx,
 
    /*fprintf(stderr, "%s\n", __FUNCTION__);*/
 
-   if (VB->ColorPtr[1]) {
+   if (VB->BackfaceColorPtr) {
       INTERP_4F( t,
-		    GET_COLOR(VB->ColorPtr[1], dst),
-		    GET_COLOR(VB->ColorPtr[1], out),
-		    GET_COLOR(VB->ColorPtr[1], in) );
+		 GET_COLOR(VB->BackfaceColorPtr, dst),
+		 GET_COLOR(VB->BackfaceColorPtr, out),
+		 GET_COLOR(VB->BackfaceColorPtr, in) );
    }
 
    if (VB->EdgeFlag) {
@@ -88,9 +88,9 @@ static void copy_pv_extras( GLcontext *ctx, GLuint dst, GLuint src )
 {
    struct vertex_buffer *VB = &TNL_CONTEXT(ctx)->vb;
 
-   if (VB->ColorPtr[1]) {
-	 COPY_4FV( GET_COLOR(VB->ColorPtr[1], dst), 
-		     GET_COLOR(VB->ColorPtr[1], src) );
+   if (VB->BackfaceColorPtr) {
+      COPY_4FV( GET_COLOR(VB->BackfaceColorPtr, dst),
+		GET_COLOR(VB->BackfaceColorPtr, src) );
    }
 
    setup_tab[TDFX_CONTEXT(ctx)->SetupIndex].copy_pv(ctx, dst, src);
diff --git a/src/mesa/drivers/dri/tdfx/tdfx_vbtmp.h b/src/mesa/drivers/dri/tdfx/tdfx_vbtmp.h
index 9b780761f4..19baf7d0d2 100644
--- a/src/mesa/drivers/dri/tdfx/tdfx_vbtmp.h
+++ b/src/mesa/drivers/dri/tdfx/tdfx_vbtmp.h
@@ -58,32 +58,32 @@ static void TAG(emit)( GLcontext *ctx,
 /*     fprintf(stderr, "%s\n", __FUNCTION__); */
 
    if (IND & TDFX_TEX0_BIT) {
-      tc0_stride = VB->TexCoordPtr[tmu0_source]->stride;
-      tc0 = VB->TexCoordPtr[tmu0_source]->data;
+      tc0_stride = VB->AttribPtr[_TNL_ATTRIB_TEX0 + tmu0_source]->stride;
+      tc0 = VB->AttribPtr[_TNL_ATTRIB_TEX0 + tmu0_source]->data;
       u0scale = fxMesa->sScale0;
       v0scale = fxMesa->tScale0;
       if (IND & TDFX_PTEX_BIT)
-	 tc0_size = VB->TexCoordPtr[tmu0_source]->size;
+	 tc0_size = VB->AttribPtr[_TNL_ATTRIB_TEX0 + tmu0_source]->size;
    }
 
    if (IND & TDFX_TEX1_BIT) {
-      tc1 = VB->TexCoordPtr[tmu1_source]->data;
-      tc1_stride = VB->TexCoordPtr[tmu1_source]->stride;
+      tc1 = VB->AttribPtr[_TNL_ATTRIB_TEX0 + tmu1_source]->data;
+      tc1_stride = VB->AttribPtr[_TNL_ATTRIB_TEX0 + tmu1_source]->stride;
       u1scale = fxMesa->sScale1;
       v1scale = fxMesa->tScale1;
       if (IND & TDFX_PTEX_BIT)
-	 tc1_size = VB->TexCoordPtr[tmu1_source]->size;
+	 tc1_size = VB->AttribPtr[_TNL_ATTRIB_TEX0 + tmu1_source]->size;
    }
    
    if (IND & TDFX_RGBA_BIT) {
-      col = VB->ColorPtr[0]->data;
-      col_stride = VB->ColorPtr[0]->stride;
-      col_size = VB->ColorPtr[0]->size;
+      col = VB->AttribPtr[_TNL_ATTRIB_COLOR0]->data;
+      col_stride = VB->AttribPtr[_TNL_ATTRIB_COLOR0]->stride;
+      col_size = VB->AttribPtr[_TNL_ATTRIB_COLOR0]->size;
    }
    
    if (IND & TDFX_FOGC_BIT) {
-      fog = VB->FogCoordPtr->data;
-      fog_stride = VB->FogCoordPtr->stride;
+      fog = VB->AttribPtr[_TNL_ATTRIB_FOG]->data;
+      fog_stride = VB->AttribPtr[_TNL_ATTRIB_FOG]->stride;
    }
 
    {
@@ -168,14 +168,14 @@ static GLboolean TAG(check_tex_sizes)( GLcontext *ctx )
       struct vertex_buffer *VB = &TNL_CONTEXT(ctx)->vb;
 
       if (IND & TDFX_TEX1_BIT) {
-	 if (VB->TexCoordPtr[0] == 0)
-	    VB->TexCoordPtr[0] = VB->TexCoordPtr[1];
+	 if (VB->AttribPtr[_TNL_ATTRIB_TEX0] == 0)
+	    VB->AttribPtr[_TNL_ATTRIB_TEX0] = VB->AttribPtr[_TNL_ATTRIB_TEX1];
 	 
-	 if (VB->TexCoordPtr[1]->size == 4)
+	 if (VB->AttribPtr[_TNL_ATTRIB_TEX1]->size == 4)
 	    return GL_FALSE;
       }
 
-      if (VB->TexCoordPtr[0]->size == 4)
+      if (VB->AttribPtr[_TNL_ATTRIB_TEX0]->size == 4)
 	 return GL_FALSE;
    }
 
diff --git a/src/mesa/drivers/dri/unichrome/via_context.c b/src/mesa/drivers/dri/unichrome/via_context.c
index 0524becf3e..d17a160271 100644
--- a/src/mesa/drivers/dri/unichrome/via_context.c
+++ b/src/mesa/drivers/dri/unichrome/via_context.c
@@ -148,7 +148,7 @@ viaRenderbufferStorage(GLcontext *ctx, struct gl_renderbuffer *rb,
 
 static void
 viaInitRenderbuffer(struct via_renderbuffer *vrb, GLenum format,
-		    __DRIdrawablePrivate *dPriv)
+		    __DRIdrawable *dPriv)
 {
    const GLuint name = 0;
    struct gl_renderbuffer *rb = & vrb->Base;
@@ -207,7 +207,7 @@ viaInitRenderbuffer(struct via_renderbuffer *vrb, GLenum format,
 static GLboolean
 calculate_buffer_parameters(struct via_context *vmesa,
 			    struct gl_framebuffer *fb,
-			    __DRIdrawablePrivate *dPriv)
+			    __DRIdrawable *dPriv)
 {
    const unsigned shift = vmesa->viaScreen->bitsPerPixel / 16;
    const unsigned extra = 32;
@@ -460,12 +460,12 @@ FreeBuffer(struct via_context *vmesa)
 
 GLboolean
 viaCreateContext(const __GLcontextModes *visual,
-                 __DRIcontextPrivate *driContextPriv,
+                 __DRIcontext *driContextPriv,
                  void *sharedContextPrivate)
 {
     GLcontext *ctx, *shareCtx;
     struct via_context *vmesa;
-    __DRIscreenPrivate *sPriv = driContextPriv->driScreenPriv;
+    __DRIscreen *sPriv = driContextPriv->driScreenPriv;
     viaScreenPrivate *viaScreen = (viaScreenPrivate *)sPriv->private;
     drm_via_sarea_t *saPriv = (drm_via_sarea_t *)
         (((GLubyte *)sPriv->pSAREA) + viaScreen->sareaPrivOffset);
@@ -679,7 +679,7 @@ viaCreateContext(const __GLcontextModes *visual,
 }
 
 void
-viaDestroyContext(__DRIcontextPrivate *driContextPriv)
+viaDestroyContext(__DRIcontext *driContextPriv)
 {
     GET_CURRENT_CONTEXT(ctx);
     struct via_context *vmesa =
@@ -729,8 +729,8 @@ viaDestroyContext(__DRIcontextPrivate *driContextPriv)
 
 void viaXMesaWindowMoved(struct via_context *vmesa)
 {
-   __DRIdrawablePrivate *const drawable = vmesa->driDrawable;
-   __DRIdrawablePrivate *const readable = vmesa->driReadable;
+   __DRIdrawable *const drawable = vmesa->driDrawable;
+   __DRIdrawable *const readable = vmesa->driReadable;
    struct via_renderbuffer * draw_buffer;
    struct via_renderbuffer * read_buffer;
    GLuint bytePerPixel = vmesa->viaScreen->bitsPerPixel >> 3;
@@ -813,15 +813,15 @@ void viaXMesaWindowMoved(struct via_context *vmesa)
 }
 
 GLboolean
-viaUnbindContext(__DRIcontextPrivate *driContextPriv)
+viaUnbindContext(__DRIcontext *driContextPriv)
 {
     return GL_TRUE;
 }
 
 GLboolean
-viaMakeCurrent(__DRIcontextPrivate *driContextPriv,
-               __DRIdrawablePrivate *driDrawPriv,
-               __DRIdrawablePrivate *driReadPriv)
+viaMakeCurrent(__DRIcontext *driContextPriv,
+               __DRIdrawable *driDrawPriv,
+               __DRIdrawable *driReadPriv)
 {
     if (VIA_DEBUG & DEBUG_DRI) {
 	fprintf(stderr, "driContextPriv = %016lx\n", (unsigned long)driContextPriv);
@@ -897,8 +897,8 @@ viaMakeCurrent(__DRIcontextPrivate *driContextPriv,
 
 void viaGetLock(struct via_context *vmesa, GLuint flags)
 {
-    __DRIdrawablePrivate *dPriv = vmesa->driDrawable;
-    __DRIscreenPrivate *sPriv = vmesa->driScreen;
+    __DRIdrawable *dPriv = vmesa->driDrawable;
+    __DRIscreen *sPriv = vmesa->driScreen;
 
     drmGetLock(vmesa->driFd, vmesa->hHWContext, flags);
 
@@ -928,9 +928,9 @@ void viaGetLock(struct via_context *vmesa, GLuint flags)
 
 
 void
-viaSwapBuffers(__DRIdrawablePrivate *drawablePrivate)
+viaSwapBuffers(__DRIdrawable *drawablePrivate)
 {
-    __DRIdrawablePrivate *dPriv = (__DRIdrawablePrivate *)drawablePrivate;
+    __DRIdrawable *dPriv = (__DRIdrawable *)drawablePrivate;
 
     if (dPriv && 
 	dPriv->driContextPriv && 
diff --git a/src/mesa/drivers/dri/unichrome/via_context.h b/src/mesa/drivers/dri/unichrome/via_context.h
index 4cc9e475c2..4e1ab3a6ca 100644
--- a/src/mesa/drivers/dri/unichrome/via_context.h
+++ b/src/mesa/drivers/dri/unichrome/via_context.h
@@ -105,7 +105,7 @@ struct via_renderbuffer {
    int drawW;                  
    int drawH;    
 
-   __DRIdrawablePrivate *dPriv;
+   __DRIdrawable *dPriv;
 };
 
 
@@ -294,14 +294,14 @@ struct via_context {
    /**
     * DRI drawable bound to this context for drawing.
     */
-   __DRIdrawablePrivate	*driDrawable;
+   __DRIdrawable	*driDrawable;
 
    /**
     * DRI drawable bound to this context for reading.
     */
-   __DRIdrawablePrivate	*driReadable;
+   __DRIdrawable	*driReadable;
 
-   __DRIscreenPrivate *driScreen;
+   __DRIscreen *driScreen;
    viaScreenPrivate *viaScreen;
    drm_via_sarea_t *sarea;
    volatile GLuint* regMMIOBase;
diff --git a/src/mesa/drivers/dri/unichrome/via_ioctl.c b/src/mesa/drivers/dri/unichrome/via_ioctl.c
index b34c133600..8d4edfa305 100644
--- a/src/mesa/drivers/dri/unichrome/via_ioctl.c
+++ b/src/mesa/drivers/dri/unichrome/via_ioctl.c
@@ -205,7 +205,7 @@ static void viaFillBuffer(struct via_context *vmesa,
 static void viaClear(GLcontext *ctx, GLbitfield mask)
 {
    struct via_context *vmesa = VIA_CONTEXT(ctx);
-   __DRIdrawablePrivate *dPriv = vmesa->driDrawable;
+   __DRIdrawable *dPriv = vmesa->driDrawable;
    struct via_renderbuffer *const vrb = 
      (struct via_renderbuffer *) dPriv->driverPrivate;
    int flag = 0;
@@ -507,12 +507,12 @@ void viaWaitIdleLocked( struct via_context *vmesa, GLboolean light )
  * except that WAIT_IDLE() will spin the CPU polling, while this is
  * IRQ driven.
  */
-static void viaWaitIdleVBlank(  __DRIdrawablePrivate *dPriv, 
+static void viaWaitIdleVBlank(  __DRIdrawable *dPriv, 
 			       struct via_context *vmesa,
 			       GLuint value )
 {
    GLboolean missed_target;
-   __DRIscreenPrivate *psp = dPriv->driScreenPriv;
+   __DRIscreen *psp = dPriv->driScreenPriv;
 
    VIA_FLUSH_DMA(vmesa); 
 
@@ -591,11 +591,11 @@ void viaResetPageFlippingLocked(struct via_context *vmesa)
 /*
  * Copy the back buffer to the front buffer. 
  */
-void viaCopyBuffer(__DRIdrawablePrivate *dPriv)
+void viaCopyBuffer(__DRIdrawable *dPriv)
 {
    struct via_context *vmesa = 
       (struct via_context *)dPriv->driContextPriv->driverPrivate;
-   __DRIscreenPrivate *psp = dPriv->driScreenPriv;
+   __DRIscreen *psp = dPriv->driScreenPriv;
 
    if (VIA_DEBUG & DEBUG_IOCTL)
       fprintf(stderr, 
@@ -635,12 +635,12 @@ void viaCopyBuffer(__DRIdrawablePrivate *dPriv)
 }
 
 
-void viaPageFlip(__DRIdrawablePrivate *dPriv)
+void viaPageFlip(__DRIdrawable *dPriv)
 {
     struct via_context *vmesa = 
        (struct via_context *)dPriv->driContextPriv->driverPrivate;
     struct via_renderbuffer buffer_tmp;
-    __DRIscreenPrivate *psp = dPriv->driScreenPriv;
+    __DRIscreen *psp = dPriv->driScreenPriv;
 
     VIA_FLUSH_DMA(vmesa);
    if (dPriv->vblFlags == VBLANK_FLAG_SYNC &&
@@ -885,7 +885,6 @@ void viaFlushDmaLocked(struct via_context *vmesa, GLuint flags)
    }
    else if (vmesa->numClipRects) {
       drm_clip_rect_t *pbox = vmesa->pClipRects;
-      __DRIdrawablePrivate *dPriv = vmesa->driDrawable;
 
       for (i = 0; i < vmesa->numClipRects; i++) {
 	 drm_clip_rect_t b;
diff --git a/src/mesa/drivers/dri/unichrome/via_ioctl.h b/src/mesa/drivers/dri/unichrome/via_ioctl.h
index 14a833a97d..c6b32cf085 100644
--- a/src/mesa/drivers/dri/unichrome/via_ioctl.h
+++ b/src/mesa/drivers/dri/unichrome/via_ioctl.h
@@ -33,8 +33,8 @@ void viaFlushDma(struct via_context *vmesa);
 void viaFlushDmaLocked(struct via_context *vmesa, GLuint flags);
 
 void viaInitIoctlFuncs(GLcontext *ctx);
-void viaCopyBuffer(__DRIdrawablePrivate *dpriv);
-void viaPageFlip(__DRIdrawablePrivate *dpriv);
+void viaCopyBuffer(__DRIdrawable *dpriv);
+void viaPageFlip(__DRIdrawable *dpriv);
 void viaCheckDma(struct via_context *vmesa, GLuint bytes);
 void viaResetPageFlippingLocked(struct via_context *vmesa);
 void viaWaitIdle(struct via_context *vmesa, GLboolean light);
diff --git a/src/mesa/drivers/dri/unichrome/via_screen.c b/src/mesa/drivers/dri/unichrome/via_screen.c
index e0bf58ca9a..2cfb98317d 100644
--- a/src/mesa/drivers/dri/unichrome/via_screen.c
+++ b/src/mesa/drivers/dri/unichrome/via_screen.c
@@ -90,7 +90,7 @@ static void via_free_empty_buffers( drmBufMapPtr bufs )
 
 
 static GLboolean
-viaInitDriver(__DRIscreenPrivate *sPriv)
+viaInitDriver(__DRIscreen *sPriv)
 {
     viaScreenPrivate *viaScreen;
     VIADRIPtr gDRIPriv = (VIADRIPtr)sPriv->pDevPriv;
@@ -184,7 +184,7 @@ viaInitDriver(__DRIscreenPrivate *sPriv)
 }
 
 static void
-viaDestroyScreen(__DRIscreenPrivate *sPriv)
+viaDestroyScreen(__DRIscreen *sPriv)
 {
     viaScreenPrivate *viaScreen = (viaScreenPrivate *)sPriv->private;
     VIADRIPtr gDRIPriv = (VIADRIPtr)sPriv->pDevPriv;
@@ -203,8 +203,8 @@ viaDestroyScreen(__DRIscreenPrivate *sPriv)
 
 
 static GLboolean
-viaCreateBuffer(__DRIscreenPrivate *driScrnPriv,
-                __DRIdrawablePrivate *driDrawPriv,
+viaCreateBuffer(__DRIscreen *driScrnPriv,
+                __DRIdrawable *driDrawPriv,
                 const __GLcontextModes *mesaVis,
                 GLboolean isPixmap)
 {
@@ -314,13 +314,13 @@ viaCreateBuffer(__DRIscreenPrivate *driScrnPriv,
 
 
 static void
-viaDestroyBuffer(__DRIdrawablePrivate *driDrawPriv)
+viaDestroyBuffer(__DRIdrawable *driDrawPriv)
 {
    _mesa_reference_framebuffer((GLframebuffer **)(&(driDrawPriv->driverPrivate)), NULL);
 }
 
 static const __DRIconfig **
-viaFillInModes( __DRIscreenPrivate *psp,
+viaFillInModes( __DRIscreen *psp,
 		unsigned pixel_bits, GLboolean have_back_buffer )
 {
     __DRIconfig **configs;
@@ -377,7 +377,7 @@ viaFillInModes( __DRIscreenPrivate *psp,
  * \return the __GLcontextModes supported by this driver
  */
 static const __DRIconfig **
-viaInitScreen(__DRIscreenPrivate *psp)
+viaInitScreen(__DRIscreen *psp)
 {
    static const __DRIversion ddx_expected = { VIA_DRIDDX_VERSION_MAJOR,
                                               VIA_DRIDDX_VERSION_MINOR,
@@ -405,7 +405,7 @@ viaInitScreen(__DRIscreenPrivate *psp)
  * Get information about previous buffer swaps.
  */
 static int
-getSwapInfo( __DRIdrawablePrivate *dPriv, __DRIswapInfo * sInfo )
+getSwapInfo( __DRIdrawable *dPriv, __DRIswapInfo * sInfo )
 {
    struct via_context *vmesa;
 
@@ -443,3 +443,10 @@ const struct __DriverAPIRec driDriverAPI = {
    .WaitForSBC      = NULL,
    .SwapBuffersMSC  = NULL
 };
+
+/* NULL-terminated extension table that the loader looks up with dlsym(). */
+PUBLIC const __DRIextension *__driDriverExtensions[] = {
+    &driCoreExtension.base,
+    &driLegacyExtension.base,
+    NULL
+};
diff --git a/src/mesa/drivers/dri/unichrome/via_screen.h b/src/mesa/drivers/dri/unichrome/via_screen.h
index c3ef722ff0..aa662e01c0 100644
--- a/src/mesa/drivers/dri/unichrome/via_screen.h
+++ b/src/mesa/drivers/dri/unichrome/via_screen.h
@@ -61,7 +61,7 @@ typedef struct {
     drmAddress agpLinearStart;
     GLuint agpBase;
 
-    __DRIscreenPrivate *driScrnPriv;
+    __DRIscreen *driScrnPriv;
     drmBufMapPtr bufs;
     unsigned int sareaPrivOffset;
     /*=* John Sheng [2003.12.9] Tuxracer & VQ *=*/
@@ -77,21 +77,21 @@ typedef struct {
 
 extern GLboolean
 viaCreateContext(const __GLcontextModes *mesaVis,
-                 __DRIcontextPrivate *driContextPriv,
+                 __DRIcontext *driContextPriv,
                  void *sharedContextPrivate);
 
 extern void
-viaDestroyContext(__DRIcontextPrivate *driContextPriv);
+viaDestroyContext(__DRIcontext *driContextPriv);
 
 extern GLboolean
-viaUnbindContext(__DRIcontextPrivate *driContextPriv);
+viaUnbindContext(__DRIcontext *driContextPriv);
 
 extern GLboolean
-viaMakeCurrent(__DRIcontextPrivate *driContextPriv,
-               __DRIdrawablePrivate *driDrawPriv,
-               __DRIdrawablePrivate *driReadPriv);
+viaMakeCurrent(__DRIcontext *driContextPriv,
+               __DRIdrawable *driDrawPriv,
+               __DRIdrawable *driReadPriv);
 
 extern void
-viaSwapBuffers(__DRIdrawablePrivate *drawablePrivate);
+viaSwapBuffers(__DRIdrawable *drawablePrivate);
 
 #endif
diff --git a/src/mesa/drivers/dri/unichrome/via_span.c b/src/mesa/drivers/dri/unichrome/via_span.c
index e847164cd0..fa3cbf7a79 100644
--- a/src/mesa/drivers/dri/unichrome/via_span.c
+++ b/src/mesa/drivers/dri/unichrome/via_span.c
@@ -43,7 +43,7 @@
 #undef LOCAL_VARS
 #define LOCAL_VARS                                                   	\
     struct via_renderbuffer *vrb = (struct via_renderbuffer *) rb;   	\
-    __DRIdrawablePrivate *dPriv = vrb->dPriv;                           \
+    __DRIdrawable *dPriv = vrb->dPriv;                           \
     GLuint pitch = vrb->pitch;                                          \
     GLuint height = dPriv->h;                                        	\
     GLint p = 0;							\
@@ -80,7 +80,7 @@
  */
 #define LOCAL_DEPTH_VARS                                            \
     struct via_renderbuffer *vrb = (struct via_renderbuffer *) rb;  \
-    __DRIdrawablePrivate *dPriv = vrb->dPriv;                       \
+    __DRIdrawable *dPriv = vrb->dPriv;                       \
     GLuint depth_pitch = vrb->pitch;                                \
     GLuint height = dPriv->h;                                       \
     char *buf = (char *)(vrb->map)
diff --git a/src/mesa/drivers/dri/unichrome/via_state.c b/src/mesa/drivers/dri/unichrome/via_state.c
index 840e4e42da..e6e5526d34 100644
--- a/src/mesa/drivers/dri/unichrome/via_state.c
+++ b/src/mesa/drivers/dri/unichrome/via_state.c
@@ -476,7 +476,7 @@ void viaEmitState(struct via_context *vmesa)
     */
    if (ctx->Polygon.StippleFlag) {
       GLuint *stipple = &ctx->PolygonStipple[0];
-      __DRIdrawablePrivate *dPriv = vmesa->driDrawable;
+      __DRIdrawable *dPriv = vmesa->driDrawable;
       struct via_renderbuffer *const vrb = 
 	(struct via_renderbuffer *) dPriv->driverPrivate;
       GLint i;
@@ -722,7 +722,7 @@ static void viaColorMask(GLcontext *ctx,
 void viaCalcViewport(GLcontext *ctx)
 {
     struct via_context *vmesa = VIA_CONTEXT(ctx);
-    __DRIdrawablePrivate *dPriv = vmesa->driDrawable;
+    __DRIdrawable *dPriv = vmesa->driDrawable;
     struct via_renderbuffer *const vrb = 
       (struct via_renderbuffer *) dPriv->driverPrivate;
     const GLfloat *v = ctx->Viewport._WindowMap.m;
@@ -891,10 +891,10 @@ static GLboolean viaChooseTextureState(GLcontext *ctx)
             if (texObj->Image[0][texObj->BaseLevel]->Border > 0) {
 	       vmesa->regHTXnTB[0] |= (HC_HTXnTB_TBC_S | HC_HTXnTB_TBC_T);
 	       vmesa->regHTXnTBC[0] = 
-		  PACK_COLOR_888(FLOAT_TO_UBYTE(texObj->BorderColor[0]),
-				 FLOAT_TO_UBYTE(texObj->BorderColor[1]),
-				 FLOAT_TO_UBYTE(texObj->BorderColor[2]));
-	       vmesa->regHTXnTRAH[0] = FLOAT_TO_UBYTE(texObj->BorderColor[3]);
+		  PACK_COLOR_888(FLOAT_TO_UBYTE(texObj->BorderColor.f[0]),
+				 FLOAT_TO_UBYTE(texObj->BorderColor.f[1]),
+				 FLOAT_TO_UBYTE(texObj->BorderColor.f[2]));
+	       vmesa->regHTXnTRAH[0] = FLOAT_TO_UBYTE(texObj->BorderColor.f[3]);
             }
 
 	    if (texUnit0->LodBias != 0.0f) {
@@ -924,10 +924,10 @@ static GLboolean viaChooseTextureState(GLcontext *ctx)
             if (texObj->Image[0][texObj->BaseLevel]->Border > 0) {
 	       vmesa->regHTXnTB[1] |= (HC_HTXnTB_TBC_S | HC_HTXnTB_TBC_T);
 	       vmesa->regHTXnTBC[1] = 
-		  PACK_COLOR_888(FLOAT_TO_UBYTE(texObj->BorderColor[0]),
-				 FLOAT_TO_UBYTE(texObj->BorderColor[1]),
-				 FLOAT_TO_UBYTE(texObj->BorderColor[2]));
-	       vmesa->regHTXnTRAH[1] = FLOAT_TO_UBYTE(texObj->BorderColor[3]);
+		  PACK_COLOR_888(FLOAT_TO_UBYTE(texObj->BorderColor.f[0]),
+				 FLOAT_TO_UBYTE(texObj->BorderColor.f[1]),
+				 FLOAT_TO_UBYTE(texObj->BorderColor.f[2]));
+	       vmesa->regHTXnTRAH[1] = FLOAT_TO_UBYTE(texObj->BorderColor.f[3]);
             }
 
 
@@ -1238,12 +1238,12 @@ static void viaChooseColorState(GLcontext *ctx)
     else
         vmesa->regHROP = HC_HROP_P;
 
-    vmesa->regHFBBMSKL = PACK_COLOR_888(ctx->Color.ColorMask[0],
-					ctx->Color.ColorMask[1],
-					ctx->Color.ColorMask[2]);
-    vmesa->regHROP |= ctx->Color.ColorMask[3];
+    vmesa->regHFBBMSKL = PACK_COLOR_888(ctx->Color.ColorMask[0][0],
+					ctx->Color.ColorMask[0][1],
+					ctx->Color.ColorMask[0][2]);
+    vmesa->regHROP |= ctx->Color.ColorMask[0][3];
 
-    if (ctx->Color.ColorMask[3])
+    if (ctx->Color.ColorMask[0][3])
         vmesa->regEnable |= HC_HenAW_MASK;
     else
         vmesa->regEnable &= ~HC_HenAW_MASK;
diff --git a/src/mesa/drivers/dri/unichrome/via_tris.c b/src/mesa/drivers/dri/unichrome/via_tris.c
index 79e67620c9..01359d51ea 100644
--- a/src/mesa/drivers/dri/unichrome/via_tris.c
+++ b/src/mesa/drivers/dri/unichrome/via_tris.c
@@ -330,7 +330,8 @@ do {							\
 
 #define LOCAL_VARS(n)                                                   \
     struct via_context *vmesa = VIA_CONTEXT(ctx);                             \
-    GLuint color[n], spec[n];                                           \
+    GLuint color[n] = { 0 };                                          \
+    GLuint spec[n] = { 0 };                                           \
     GLuint coloroffset = vmesa->coloroffset;              \
     GLuint specoffset = vmesa->specoffset;                       \
     (void)color; (void)spec; (void)coloroffset; (void)specoffset;
@@ -832,13 +833,13 @@ static GLboolean viaCheckPTexHack( GLcontext *ctx )
 
    RENDERINPUTS_COPY( index_bitset, tnl->render_inputs_bitset );
 
-   if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_TEX0 ) && VB->TexCoordPtr[0]->size == 4) {
+   if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_TEX0 ) && VB->AttribPtr[_TNL_ATTRIB_TEX0]->size == 4) {
       if (!RENDERINPUTS_TEST_RANGE( index_bitset, _TNL_ATTRIB_TEX1, _TNL_LAST_TEX ))
 	 ptexHack = GL_TRUE; 
       else
 	 fallback = GL_TRUE;
    }
-   if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_TEX1 ) && VB->TexCoordPtr[1]->size == 4)
+   if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_TEX1 ) && VB->AttribPtr[_TNL_ATTRIB_TEX1]->size == 4)
       fallback = GL_TRUE;
 
    FALLBACK(VIA_CONTEXT(ctx), VIA_FALLBACK_PROJ_TEXTURE, fallback);